Diffstat (limited to 'sys/cddl/contrib/opensolaris/uts')
345 files changed, 259830 insertions, 0 deletions
diff --git a/sys/cddl/contrib/opensolaris/uts/aarch64/dtrace/fasttrap_isa.c b/sys/cddl/contrib/opensolaris/uts/aarch64/dtrace/fasttrap_isa.c new file mode 100644 index 000000000000..f5377a895d6c --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/aarch64/dtrace/fasttrap_isa.c @@ -0,0 +1,29 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * XXX: Placeholder for AArch64 fasttrap code + */ diff --git a/sys/cddl/contrib/opensolaris/uts/aarch64/sys/fasttrap_isa.h b/sys/cddl/contrib/opensolaris/uts/aarch64/sys/fasttrap_isa.h new file mode 100644 index 000000000000..d85426edb417 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/aarch64/sys/fasttrap_isa.h @@ -0,0 +1,46 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _FASTTRAP_ISA_H +#define _FASTTRAP_ISA_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef uint32_t fasttrap_instr_t; + +/* XXX: Place for AArch64 fasttrap headers */ + +#ifdef __cplusplus +} +#endif + +#endif /* _FASTTRAP_ISA_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/arm/dtrace/fasttrap_isa.c b/sys/cddl/contrib/opensolaris/uts/arm/dtrace/fasttrap_isa.c new file mode 100644 index 000000000000..18e3837b35b6 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/arm/dtrace/fasttrap_isa.c @@ -0,0 +1,30 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + +/* + * XXX: Placeholder for ARM fasttrap code + */ diff --git a/sys/cddl/contrib/opensolaris/uts/arm/sys/fasttrap_isa.h b/sys/cddl/contrib/opensolaris/uts/arm/sys/fasttrap_isa.h new file mode 100644 index 000000000000..10361cbed8de --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/arm/sys/fasttrap_isa.h @@ -0,0 +1,94 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _FASTTRAP_ISA_H +#define _FASTTRAP_ISA_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * This is our reserved trap instruction: ta 0x38 + */ +#define FASTTRAP_INSTR 0x91d02038 + +#define FASTTRAP_SUNWDTRACE_SIZE 128 + +typedef uint32_t fasttrap_instr_t; + +typedef struct fasttrap_machtp { + fasttrap_instr_t ftmt_instr; /* original instruction */ + uintptr_t ftmt_dest; /* destination of DCTI */ + uint8_t ftmt_type; /* emulation type */ + uint8_t ftmt_flags; /* emulation flags */ + uint8_t ftmt_cc; /* which cc to look at */ + uint8_t ftmt_code; /* branch condition */ +} fasttrap_machtp_t; + +#define ftt_instr ftt_mtp.ftmt_instr +#define ftt_dest ftt_mtp.ftmt_dest +#define ftt_type ftt_mtp.ftmt_type +#define ftt_flags ftt_mtp.ftmt_flags +#define ftt_cc ftt_mtp.ftmt_cc +#define ftt_code ftt_mtp.ftmt_code + +#define FASTTRAP_T_COMMON 0x00 /* common case -- no emulation */ +#define FASTTRAP_T_CCR 0x01 /* integer condition code branch */ +#define FASTTRAP_T_FCC 0x02 /* floating-point branch */ +#define FASTTRAP_T_REG 0x03 /* register predicated branch */ +#define FASTTRAP_T_ALWAYS 0x04 /* branch always */ +#define FASTTRAP_T_CALL 0x05 /* call instruction */ +#define FASTTRAP_T_JMPL 0x06 /* jmpl instruction */ +#define FASTTRAP_T_RDPC 0x07 /* rdpc instruction */ +#define FASTTRAP_T_RETURN 0x08 /* return instruction */ + +/* + * For performance rather than correctness. 
+ */ +#define FASTTRAP_T_SAVE 0x10 /* save instruction (func entry only) */ +#define FASTTRAP_T_RESTORE 0x11 /* restore instruction */ +#define FASTTRAP_T_OR 0x12 /* mov instruction */ +#define FASTTRAP_T_SETHI 0x13 /* sethi instruction (includes nop) */ + +#define FASTTRAP_F_ANNUL 0x01 /* branch is annulled */ +#define FASTTRAP_F_RETMAYBE 0x02 /* not definitely a return site */ + +#define FASTTRAP_AFRAMES 3 +#define FASTTRAP_RETURN_AFRAMES 4 +#define FASTTRAP_ENTRY_AFRAMES 3 +#define FASTTRAP_OFFSET_AFRAMES 3 + + +#ifdef __cplusplus +} +#endif + +#endif /* _FASTTRAP_ISA_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/Makefile.files b/sys/cddl/contrib/opensolaris/uts/common/Makefile.files new file mode 100644 index 000000000000..a38e1bcebd22 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/Makefile.files @@ -0,0 +1,183 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2012 Nexenta Systems, Inc. All rights reserved. +# Copyright (c) 2012 Joyent, Inc. All rights reserved. +# Copyright (c) 2011, 2014 by Delphix. All rights reserved. +# Copyright (c) 2013 by Saso Kiselkov. All rights reserved. +# +# +# This Makefile defines all file modules for the directory uts/common +# and its children. These are the source files which may be considered +# common to all SunOS systems.
+ +LUA_OBJS += \ + ldo.o \ + lvm.o \ + lbitlib.o \ + lopcodes.o \ + lstring.o \ + ltable.o \ + ltm.o \ + lcorolib.o \ + lauxlib.o \ + ldebug.o \ + lstate.o \ + lgc.o \ + lmem.o \ + lctype.o \ + lfunc.o \ + ldump.o \ + lundump.o \ + lstrlib.o \ + ltablib.o \ + lapi.o \ + lobject.o \ + lbaselib.o \ + lcompat.o \ + lzio.o \ + lcode.o \ + llex.o \ + lparser.o + +ZFS_COMMON_OBJS += \ + abd.o \ + aggsum.o \ + arc.o \ + bplist.o \ + blkptr.o \ + bpobj.o \ + bptree.o \ + bqueue.o \ + cityhash.o \ + dbuf.o \ + dbuf_stats.o \ + ddt.o \ + ddt_zap.o \ + dmu.o \ + dmu_diff.o \ + dmu_send.o \ + dmu_object.o \ + dmu_objset.o \ + dmu_traverse.o \ + dmu_tx.o \ + dnode.o \ + dnode_sync.o \ + dsl_bookmark.o \ + dsl_dir.o \ + dsl_dataset.o \ + dsl_deadlist.o \ + dsl_destroy.o \ + dsl_pool.o \ + dsl_synctask.o \ + dsl_userhold.o \ + dmu_zfetch.o \ + dsl_deleg.o \ + dsl_prop.o \ + dsl_scan.o \ + zfeature.o \ + gzip.o \ + lz4.o \ + lzjb.o \ + metaslab.o \ + multilist.o \ + range_tree.o \ + refcount.o \ + rrwlock.o \ + sa.o \ + sha256.o \ + skein_zfs.o \ + spa.o \ + spa_checkpoint.o \ + spa_config.o \ + spa_errlog.o \ + spa_history.o \ + spa_misc.o \ + space_map.o \ + space_reftree.o \ + txg.o \ + uberblock.o \ + unique.o \ + vdev.o \ + vdev_cache.o \ + vdev_file.o \ + vdev_indirect.o \ + vdev_indirect_births.o \ + vdev_indirect_mapping.o \ + vdev_initialize.o \ + vdev_label.o \ + vdev_mirror.o \ + vdev_missing.o \ + vdev_queue.o \ + vdev_raidz.o \ + vdev_removal.o \ + vdev_root.o \ + zap.o \ + zap_leaf.o \ + zap_micro.o \ + zcp.o \ + zcp_get.o \ + zcp_global.o \ + zcp_iter.o \ + zcp_synctask.o \ + zfs_byteswap.o \ + zfs_debug.o \ + zfs_fm.o \ + zfs_fuid.o \ + zfs_sa.o \ + zfs_znode.o \ + zil.o \ + zio.o \ + zio_checksum.o \ + zio_compress.o \ + zio_inject.o \ + zle.o \ + zrlock.o \ + zthr.o + +ZFS_SHARED_OBJS += \ + zfeature_common.o \ + zfs_comutil.o \ + zfs_deleg.o \ + zfs_fletcher.o \ + zfs_namecheck.o \ + zfs_prop.o \ + zpool_prop.o \ + zprop_common.o + +ZFS_OBJS += \ + $(ZFS_COMMON_OBJS) \ + $(ZFS_SHARED_OBJS) \ + zfs_acl.o \ + zfs_ctldir.o \ + zfs_dir.o \ + zfs_ioctl.o \ + zfs_ioctl_compat.o \ + zfs_log.o \ + zfs_onexit.o \ + zfs_replay.o \ + zfs_rlock.o \ + zfs_vfsops.o \ + zfs_vnops.o \ + zvol.o diff --git a/sys/cddl/contrib/opensolaris/uts/common/ctf/ctf_mod.c b/sys/cddl/contrib/opensolaris/uts/common/ctf/ctf_mod.c new file mode 100644 index 000000000000..b34cf400cd2c --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/ctf/ctf_mod.c @@ -0,0 +1,177 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2003 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/sysmacros.h> +#include <sys/modctl.h> +#include <sys/debug.h> +#include <sys/mman.h> +#include <sys/kobj.h> +#include <ctf_impl.h> + +int ctf_leave_compressed = 0; + +static struct modlmisc modlmisc = { + &mod_miscops, "Compact C Type Format routines" +}; + +static struct modlinkage modlinkage = { + MODREV_1, &modlmisc, NULL +}; + +int +_init(void) +{ + return (mod_install(&modlinkage)); +} + +int +_info(struct modinfo *mip) +{ + return (mod_info(&modlinkage, mip)); +} + +int +_fini(void) +{ + return (mod_remove(&modlinkage)); +} + +/*ARGSUSED*/ +void * +ctf_zopen(int *errp) +{ + return ((void *)1); /* zmod is always loaded because we depend on it */ +} + +/*ARGSUSED*/ +const void * +ctf_sect_mmap(ctf_sect_t *sp, int fd) +{ + return (MAP_FAILED); /* we don't support this in the kernel */ +} + +/*ARGSUSED*/ +void +ctf_sect_munmap(const ctf_sect_t *sp) +{ + /* we don't support this in the kernel */ +} + +/*ARGSUSED*/ +ctf_file_t * +ctf_fdopen(int fd, int *errp) +{ + return (ctf_set_open_errno(errp, ENOTSUP)); +} + +/*ARGSUSED*/ +ctf_file_t * +ctf_open(const char *filename, int *errp) +{ + return (ctf_set_open_errno(errp, ENOTSUP)); +} + +/*ARGSUSED*/ +int +ctf_write(ctf_file_t *fp, int fd) +{ + return (ctf_set_errno(fp, ENOTSUP)); +} + +int +ctf_version(int version) +{ + ASSERT(version > 0 && version <= CTF_VERSION); + + if (version > 0) + _libctf_version = MIN(CTF_VERSION, version); + + return (_libctf_version); +} + +/*ARGSUSED*/ +ctf_file_t * +ctf_modopen(struct module *mp, int *error) +{ + ctf_sect_t ctfsect, symsect, strsect; + ctf_file_t *fp = NULL; + int err; + + if (error == NULL) + error = &err; + + ctfsect.cts_name = ".SUNW_ctf"; + ctfsect.cts_type = SHT_PROGBITS; + ctfsect.cts_flags = SHF_ALLOC; + ctfsect.cts_data = mp->ctfdata; + ctfsect.cts_size = mp->ctfsize; + ctfsect.cts_entsize = 1; + ctfsect.cts_offset = 0; + + symsect.cts_name = ".symtab"; + symsect.cts_type = SHT_SYMTAB; + symsect.cts_flags = 0; + symsect.cts_data = mp->symtbl; + symsect.cts_size = mp->symhdr->sh_size; +#ifdef _LP64 + symsect.cts_entsize = sizeof (Elf64_Sym); +#else + symsect.cts_entsize = sizeof (Elf32_Sym); +#endif + symsect.cts_offset = 0; + + strsect.cts_name = ".strtab"; + strsect.cts_type = SHT_STRTAB; + strsect.cts_flags = 0; + strsect.cts_data = mp->strings; + strsect.cts_size = mp->strhdr->sh_size; + strsect.cts_entsize = 1; + strsect.cts_offset = 0; + + ASSERT(MUTEX_HELD(&mod_lock)); + + if ((fp = ctf_bufopen(&ctfsect, &symsect, &strsect, error)) == NULL) + return (NULL); + + if (!ctf_leave_compressed && (caddr_t)fp->ctf_base != mp->ctfdata) { + /* + * We must have just uncompressed the CTF data. To avoid + * others having to pay the (substantial) cost of decompressing + * the data, we're going to substitute the uncompressed version + * for the compressed version. Note that this implies that the + * first CTF consumer will induce memory impact on the system + * (but in the name of performance of future CTF consumers).
+ */ + kobj_set_ctf(mp, (caddr_t)fp->ctf_base, fp->ctf_size); + fp->ctf_data.cts_data = fp->ctf_base; + fp->ctf_data.cts_size = fp->ctf_size; + } + + return (fp); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/ctf/ctf_subr.c b/sys/cddl/contrib/opensolaris/uts/common/ctf/ctf_subr.c new file mode 100644 index 000000000000..cd0a828628d4 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/ctf/ctf_subr.c @@ -0,0 +1,96 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <ctf_impl.h> +#include <sys/kobj.h> +#include <sys/kobj_impl.h> + +/* + * This module is used both during the normal operation of the kernel (i.e. + * after kmem has been initialized) and during boot (before unix`_start has + * been called). kobj_alloc is able to tell the difference between the two + * cases, and as such must be used instead of kmem_alloc. + */ + +void * +ctf_data_alloc(size_t size) +{ + void *buf = kobj_alloc(size, KM_NOWAIT|KM_SCRATCH); + + if (buf == NULL) + return (MAP_FAILED); + + return (buf); +} + +void +ctf_data_free(void *buf, size_t size) +{ + kobj_free(buf, size); +} + +/*ARGSUSED*/ +void +ctf_data_protect(void *buf, size_t size) +{ + /* we don't support this operation in the kernel */ +} + +void * +ctf_alloc(size_t size) +{ + return (kobj_alloc(size, KM_NOWAIT|KM_TMP)); +} + +/*ARGSUSED*/ +void +ctf_free(void *buf, size_t size) +{ + kobj_free(buf, size); +} + +/*ARGSUSED*/ +const char * +ctf_strerror(int err) +{ + return (NULL); /* we don't support this operation in the kernel */ +} + +/*PRINTFLIKE1*/ +void +ctf_dprintf(const char *format, ...) +{ + if (_libctf_debug) { + va_list alist; + + va_start(alist, format); + (void) printf("ctf DEBUG: "); + (void) vprintf(format, alist); + va_end(alist); + } +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c b/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c new file mode 100644 index 000000000000..cab8c334d082 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c @@ -0,0 +1,18424 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + * + * $FreeBSD$ + */ + +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2016, Joyent, Inc. All rights reserved. + * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + */ + +/* + * DTrace - Dynamic Tracing for Solaris + * + * This is the implementation of the Solaris Dynamic Tracing framework + * (DTrace). The user-visible interface to DTrace is described at length in + * the "Solaris Dynamic Tracing Guide". The interfaces between the libdtrace + * library, the in-kernel DTrace framework, and the DTrace providers are + * described in the block comments in the <sys/dtrace.h> header file. The + * internal architecture of DTrace is described in the block comments in the + * <sys/dtrace_impl.h> header file. The comments contained within the DTrace + * implementation very much assume mastery of all of these sources; if one has + * an unanswered question about the implementation, one should consult them + * first. + * + * The functions here are ordered roughly as follows: + * + * - Probe context functions + * - Probe hashing functions + * - Non-probe context utility functions + * - Matching functions + * - Provider-to-Framework API functions + * - Probe management functions + * - DIF object functions + * - Format functions + * - Predicate functions + * - ECB functions + * - Buffer functions + * - Enabling functions + * - DOF functions + * - Anonymous enabling functions + * - Consumer state functions + * - Helper functions + * - Hook functions + * - Driver cookbook functions + * + * Each group of functions begins with a block comment labelled the "DTrace + * [Group] Functions", allowing one to find each block by searching forward + * on capital-f functions. 
+ */ +#include <sys/errno.h> +#ifndef illumos +#include <sys/time.h> +#endif +#include <sys/stat.h> +#include <sys/modctl.h> +#include <sys/conf.h> +#include <sys/systm.h> +#ifdef illumos +#include <sys/ddi.h> +#include <sys/sunddi.h> +#endif +#include <sys/cpuvar.h> +#include <sys/kmem.h> +#ifdef illumos +#include <sys/strsubr.h> +#endif +#include <sys/sysmacros.h> +#include <sys/dtrace_impl.h> +#include <sys/atomic.h> +#include <sys/cmn_err.h> +#ifdef illumos +#include <sys/mutex_impl.h> +#include <sys/rwlock_impl.h> +#endif +#include <sys/ctf_api.h> +#ifdef illumos +#include <sys/panic.h> +#include <sys/priv_impl.h> +#endif +#include <sys/policy.h> +#ifdef illumos +#include <sys/cred_impl.h> +#include <sys/procfs_isa.h> +#endif +#include <sys/taskq.h> +#ifdef illumos +#include <sys/mkdev.h> +#include <sys/kdi.h> +#endif +#include <sys/zone.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include "strtolctype.h" + +/* FreeBSD includes: */ +#ifndef illumos +#include <sys/callout.h> +#include <sys/ctype.h> +#include <sys/eventhandler.h> +#include <sys/limits.h> +#include <sys/linker.h> +#include <sys/kdb.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/ptrace.h> +#include <sys/random.h> +#include <sys/rwlock.h> +#include <sys/sx.h> +#include <sys/sysctl.h> + +#include <sys/dtrace_bsd.h> + +#include <netinet/in.h> + +#include "dtrace_cddl.h" +#include "dtrace_debug.c" +#endif + +#include "dtrace_xoroshiro128_plus.h" + +/* + * DTrace Tunable Variables + * + * The following variables may be tuned by adding a line to /etc/system that + * includes both the name of the DTrace module ("dtrace") and the name of the + * variable. For example: + * + * set dtrace:dtrace_destructive_disallow = 1 + * + * In general, the only variables that one should be tuning this way are those + * that affect system-wide DTrace behavior, and for which the default behavior + * is undesirable. Most of these variables are tunable on a per-consumer + * basis using DTrace options, and need not be tuned on a system-wide basis. + * When tuning these variables, avoid pathological values; while some attempt + * is made to verify the integrity of these variables, they are not considered + * part of the supported interface to DTrace, and they are therefore not + * checked comprehensively. Further, these variables should not be tuned + * dynamically via "mdb -kw" or other means; they should only be tuned via + * /etc/system. 
+ */ +int dtrace_destructive_disallow = 0; +#ifndef illumos +/* Positive logic version of dtrace_destructive_disallow for loader tunable */ +int dtrace_allow_destructive = 1; +#endif +dtrace_optval_t dtrace_nonroot_maxsize = (16 * 1024 * 1024); +size_t dtrace_difo_maxsize = (256 * 1024); +dtrace_optval_t dtrace_dof_maxsize = (8 * 1024 * 1024); +size_t dtrace_statvar_maxsize = (16 * 1024); +size_t dtrace_actions_max = (16 * 1024); +size_t dtrace_retain_max = 1024; +dtrace_optval_t dtrace_helper_actions_max = 128; +dtrace_optval_t dtrace_helper_providers_max = 32; +dtrace_optval_t dtrace_dstate_defsize = (1 * 1024 * 1024); +size_t dtrace_strsize_default = 256; +dtrace_optval_t dtrace_cleanrate_default = 9900990; /* 101 hz */ +dtrace_optval_t dtrace_cleanrate_min = 200000; /* 5000 hz */ +dtrace_optval_t dtrace_cleanrate_max = (uint64_t)60 * NANOSEC; /* 1/minute */ +dtrace_optval_t dtrace_aggrate_default = NANOSEC; /* 1 hz */ +dtrace_optval_t dtrace_statusrate_default = NANOSEC; /* 1 hz */ +dtrace_optval_t dtrace_statusrate_max = (hrtime_t)10 * NANOSEC; /* 6/minute */ +dtrace_optval_t dtrace_switchrate_default = NANOSEC; /* 1 hz */ +dtrace_optval_t dtrace_nspec_default = 1; +dtrace_optval_t dtrace_specsize_default = 32 * 1024; +dtrace_optval_t dtrace_stackframes_default = 20; +dtrace_optval_t dtrace_ustackframes_default = 20; +dtrace_optval_t dtrace_jstackframes_default = 50; +dtrace_optval_t dtrace_jstackstrsize_default = 512; +int dtrace_msgdsize_max = 128; +hrtime_t dtrace_chill_max = MSEC2NSEC(500); /* 500 ms */ +hrtime_t dtrace_chill_interval = NANOSEC; /* 1000 ms */ +int dtrace_devdepth_max = 32; +int dtrace_err_verbose; +hrtime_t dtrace_deadman_interval = NANOSEC; +hrtime_t dtrace_deadman_timeout = (hrtime_t)10 * NANOSEC; +hrtime_t dtrace_deadman_user = (hrtime_t)30 * NANOSEC; +hrtime_t dtrace_unregister_defunct_reap = (hrtime_t)60 * NANOSEC; +#ifndef illumos +int dtrace_memstr_max = 4096; +#endif + +/* + * DTrace External Variables + * + * As dtrace(7D) is a kernel module, any DTrace variables are obviously + * available to DTrace consumers via the backtick (`) syntax. One of these, + * dtrace_zero, is made deliberately so: it is provided as a source of + * well-known, zero-filled memory. While this variable is not documented, + * it is used by some translators as an implementation detail. + */ +const char dtrace_zero[256] = { 0 }; /* zero-filled memory */ + +/* + * DTrace Internal Variables + */ +#ifdef illumos +static dev_info_t *dtrace_devi; /* device info */ +#endif +#ifdef illumos +static vmem_t *dtrace_arena; /* probe ID arena */ +static vmem_t *dtrace_minor; /* minor number arena */ +#else +static taskq_t *dtrace_taskq; /* task queue */ +static struct unrhdr *dtrace_arena; /* Probe ID number. 
*/ +#endif +static dtrace_probe_t **dtrace_probes; /* array of all probes */ +static int dtrace_nprobes; /* number of probes */ +static dtrace_provider_t *dtrace_provider; /* provider list */ +static dtrace_meta_t *dtrace_meta_pid; /* user-land meta provider */ +static int dtrace_opens; /* number of opens */ +static int dtrace_helpers; /* number of helpers */ +static int dtrace_getf; /* number of unpriv getf()s */ +#ifdef illumos +static void *dtrace_softstate; /* softstate pointer */ +#endif +static dtrace_hash_t *dtrace_bymod; /* probes hashed by module */ +static dtrace_hash_t *dtrace_byfunc; /* probes hashed by function */ +static dtrace_hash_t *dtrace_byname; /* probes hashed by name */ +static dtrace_toxrange_t *dtrace_toxrange; /* toxic range array */ +static int dtrace_toxranges; /* number of toxic ranges */ +static int dtrace_toxranges_max; /* size of toxic range array */ +static dtrace_anon_t dtrace_anon; /* anonymous enabling */ +static kmem_cache_t *dtrace_state_cache; /* cache for dynamic state */ +static uint64_t dtrace_vtime_references; /* number of vtimestamp refs */ +static kthread_t *dtrace_panicked; /* panicking thread */ +static dtrace_ecb_t *dtrace_ecb_create_cache; /* cached created ECB */ +static dtrace_genid_t dtrace_probegen; /* current probe generation */ +static dtrace_helpers_t *dtrace_deferred_pid; /* deferred helper list */ +static dtrace_enabling_t *dtrace_retained; /* list of retained enablings */ +static dtrace_genid_t dtrace_retained_gen; /* current retained enab gen */ +static dtrace_dynvar_t dtrace_dynhash_sink; /* end of dynamic hash chains */ +static int dtrace_dynvar_failclean; /* dynvars failed to clean */ +#ifndef illumos +static struct mtx dtrace_unr_mtx; +MTX_SYSINIT(dtrace_unr_mtx, &dtrace_unr_mtx, "Unique resource identifier", MTX_DEF); +static eventhandler_tag dtrace_kld_load_tag; +static eventhandler_tag dtrace_kld_unload_try_tag; +#endif + +/* + * DTrace Locking + * DTrace is protected by three (relatively coarse-grained) locks: + * + * (1) dtrace_lock is required to manipulate essentially any DTrace state, + * including enabling state, probes, ECBs, consumer state, helper state, + * etc. Importantly, dtrace_lock is _not_ required when in probe context; + * probe context is lock-free -- synchronization is handled via the + * dtrace_sync() cross call mechanism. + * + * (2) dtrace_provider_lock is required when manipulating provider state, or + * when provider state must be held constant. + * + * (3) dtrace_meta_lock is required when manipulating meta provider state, or + * when meta provider state must be held constant. + * + * The lock ordering between these three locks is dtrace_meta_lock before + * dtrace_provider_lock before dtrace_lock. (In particular, there are + * several places where dtrace_provider_lock is held by the framework as it + * calls into the providers -- which then call back into the framework, + * grabbing dtrace_lock.) + * + * There are two other locks in the mix: mod_lock and cpu_lock. With respect + * to dtrace_provider_lock and dtrace_lock, cpu_lock continues its historical + * role as a coarse-grained lock; it is acquired before both of these locks. + * With respect to dtrace_meta_lock, its behavior is stranger: cpu_lock must + * be acquired _between_ dtrace_meta_lock and any other DTrace locks. + * mod_lock is similar with respect to dtrace_provider_lock in that it must be + * acquired _between_ dtrace_provider_lock and dtrace_lock. 
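A kernel-context sketch of the full acquisition order the rules above prescribe. The function is hypothetical and appears nowhere in this commit; it simply strings together the documented ordering (dtrace_meta_lock, then cpu_lock, then dtrace_provider_lock, then mod_lock, then dtrace_lock) using the usual illumos mutex_enter()/mutex_exit() API:

static void
dtrace_lock_order_sketch(void)
{
	mutex_enter(&dtrace_meta_lock);		/* outermost DTrace lock */
	mutex_enter(&cpu_lock);			/* between meta and the rest */
	mutex_enter(&dtrace_provider_lock);
	mutex_enter(&mod_lock);			/* between provider and dtrace_lock */
	mutex_enter(&dtrace_lock);		/* innermost */

	/* ... manipulate framework state here ... */

	mutex_exit(&dtrace_lock);
	mutex_exit(&mod_lock);
	mutex_exit(&dtrace_provider_lock);
	mutex_exit(&cpu_lock);
	mutex_exit(&dtrace_meta_lock);
}

Releasing in reverse order is merely conventional; the constraint itself only governs acquisition order.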
+ */ +static kmutex_t dtrace_lock; /* probe state lock */ +static kmutex_t dtrace_provider_lock; /* provider state lock */ +static kmutex_t dtrace_meta_lock; /* meta-provider state lock */ + +#ifndef illumos +/* XXX FreeBSD hacks. */ +#define cr_suid cr_svuid +#define cr_sgid cr_svgid +#define ipaddr_t in_addr_t +#define mod_modname pathname +#define vuprintf vprintf +#define ttoproc(_a) ((_a)->td_proc) +#define crgetzoneid(_a) 0 +#define SNOCD 0 +#define CPU_ON_INTR(_a) 0 + +#define PRIV_EFFECTIVE (1 << 0) +#define PRIV_DTRACE_KERNEL (1 << 1) +#define PRIV_DTRACE_PROC (1 << 2) +#define PRIV_DTRACE_USER (1 << 3) +#define PRIV_PROC_OWNER (1 << 4) +#define PRIV_PROC_ZONE (1 << 5) +#define PRIV_ALL ~0 + +SYSCTL_DECL(_debug_dtrace); +SYSCTL_DECL(_kern_dtrace); +#endif + +#ifdef illumos +#define curcpu CPU->cpu_id +#endif + + +/* + * DTrace Provider Variables + * + * These are the variables relating to DTrace as a provider (that is, the + * provider of the BEGIN, END, and ERROR probes). + */ +static dtrace_pattr_t dtrace_provider_attr = { +{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON }, +{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN }, +{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN }, +{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON }, +{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON }, +}; + +static void +dtrace_nullop(void) +{} + +static dtrace_pops_t dtrace_provider_ops = { + .dtps_provide = (void (*)(void *, dtrace_probedesc_t *))dtrace_nullop, + .dtps_provide_module = (void (*)(void *, modctl_t *))dtrace_nullop, + .dtps_enable = (void (*)(void *, dtrace_id_t, void *))dtrace_nullop, + .dtps_disable = (void (*)(void *, dtrace_id_t, void *))dtrace_nullop, + .dtps_suspend = (void (*)(void *, dtrace_id_t, void *))dtrace_nullop, + .dtps_resume = (void (*)(void *, dtrace_id_t, void *))dtrace_nullop, + .dtps_getargdesc = NULL, + .dtps_getargval = NULL, + .dtps_usermode = NULL, + .dtps_destroy = (void (*)(void *, dtrace_id_t, void *))dtrace_nullop, +}; + +static dtrace_id_t dtrace_probeid_begin; /* special BEGIN probe */ +static dtrace_id_t dtrace_probeid_end; /* special END probe */ +dtrace_id_t dtrace_probeid_error; /* special ERROR probe */ + +/* + * DTrace Helper Tracing Variables + * + * These variables should be set dynamically to enable helper tracing. The + * only variables that should be set are dtrace_helptrace_enable (which should + * be set to a non-zero value to allocate helper tracing buffers on the next + * open of /dev/dtrace) and dtrace_helptrace_disable (which should be set to a + * non-zero value to deallocate helper tracing buffers on the next close of + * /dev/dtrace). When (and only when) helper tracing is disabled, the + * buffer size may also be set via dtrace_helptrace_bufsize. + */ +int dtrace_helptrace_enable = 0; +int dtrace_helptrace_disable = 0; +int dtrace_helptrace_bufsize = 16 * 1024 * 1024; +uint32_t dtrace_helptrace_nlocals; +static dtrace_helptrace_t *dtrace_helptrace_buffer; +static uint32_t dtrace_helptrace_next = 0; +static int dtrace_helptrace_wrapped = 0; + +/* + * DTrace Error Hashing + * + * On DEBUG kernels, DTrace will track the errors that it has seen in a hash + * table. This is very useful for checking coverage of tests that are + * expected to induce DIF or DOF processing errors, and may be useful for + * debugging problems in the DIF code generator or in DOF generation.
The + * error hash may be examined with the ::dtrace_errhash MDB dcmd. + */ +#ifdef DEBUG +static dtrace_errhash_t dtrace_errhash[DTRACE_ERRHASHSZ]; +static const char *dtrace_errlast; +static kthread_t *dtrace_errthread; +static kmutex_t dtrace_errlock; +#endif + +/* + * DTrace Macros and Constants + * + * These are various macros that are useful in various spots in the + * implementation, along with a few random constants that have no meaning + * outside of the implementation. There is no real structure to this cpp + * mishmash -- but is there ever? + */ +#define DTRACE_HASHSTR(hash, probe) \ + dtrace_hash_str(*((char **)((uintptr_t)(probe) + (hash)->dth_stroffs))) + +#define DTRACE_HASHNEXT(hash, probe) \ + (dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_nextoffs) + +#define DTRACE_HASHPREV(hash, probe) \ + (dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_prevoffs) + +#define DTRACE_HASHEQ(hash, lhs, rhs) \ + (strcmp(*((char **)((uintptr_t)(lhs) + (hash)->dth_stroffs)), \ + *((char **)((uintptr_t)(rhs) + (hash)->dth_stroffs))) == 0) + +#define DTRACE_AGGHASHSIZE_SLEW 17 + +#define DTRACE_V4MAPPED_OFFSET (sizeof (uint32_t) * 3) + +/* + * The key for a thread-local variable consists of the lower 61 bits of the + * t_did, plus the 3 bits of the highest active interrupt above LOCK_LEVEL. + * We add DIF_VARIABLE_MAX to t_did to assure that the thread key is never + * equal to a variable identifier. This is necessary (but not sufficient) to + * assure that global associative arrays never collide with thread-local + * variables. To guarantee that they cannot collide, we must also define the + * order for keying dynamic variables. That order is: + * + * [ key0 ] ... [ keyn ] [ variable-key ] [ tls-key ] + * + * Because the variable-key and the tls-key are in orthogonal spaces, there is + * no way for a global variable key signature to match a thread-local key + * signature. + */ +#ifdef illumos +#define DTRACE_TLS_THRKEY(where) { \ + uint_t intr = 0; \ + uint_t actv = CPU->cpu_intr_actv >> (LOCK_LEVEL + 1); \ + for (; actv; actv >>= 1) \ + intr++; \ + ASSERT(intr < (1 << 3)); \ + (where) = ((curthread->t_did + DIF_VARIABLE_MAX) & \ + (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \ +} +#else +#define DTRACE_TLS_THRKEY(where) { \ + solaris_cpu_t *_c = &solaris_cpu[curcpu]; \ + uint_t intr = 0; \ + uint_t actv = _c->cpu_intr_actv; \ + for (; actv; actv >>= 1) \ + intr++; \ + ASSERT(intr < (1 << 3)); \ + (where) = ((curthread->td_tid + DIF_VARIABLE_MAX) & \ + (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \ +} +#endif + +#define DT_BSWAP_8(x) ((x) & 0xff) +#define DT_BSWAP_16(x) ((DT_BSWAP_8(x) << 8) | DT_BSWAP_8((x) >> 8)) +#define DT_BSWAP_32(x) ((DT_BSWAP_16(x) << 16) | DT_BSWAP_16((x) >> 16)) +#define DT_BSWAP_64(x) ((DT_BSWAP_32(x) << 32) | DT_BSWAP_32((x) >> 32)) + +#define DT_MASK_LO 0x00000000FFFFFFFFULL + +#define DTRACE_STORE(type, tomax, offset, what) \ + *((type *)((uintptr_t)(tomax) + (uintptr_t)offset)) = (type)(what); + +#ifndef __x86 +#define DTRACE_ALIGNCHECK(addr, size, flags) \ + if (addr & (size - 1)) { \ + *flags |= CPU_DTRACE_BADALIGN; \ + cpu_core[curcpu].cpuc_dtrace_illval = addr; \ + return (0); \ + } +#else +#define DTRACE_ALIGNCHECK(addr, size, flags) +#endif + +/* + * Test whether a range of memory starting at testaddr of size testsz falls + * within the range of memory described by addr, sz. We take care to avoid + * problems with overflow and underflow of the unsigned quantities, and + * disallow all negative sizes. Ranges of size 0 are allowed. 
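The overflow concern described above is easy to reproduce in userland. A standalone sketch (hypothetical, not part of this diff) contrasting a naive bounds test with the subtraction form that DTRACE_INRANGE below encodes:

#include <stdint.h>
#include <stdio.h>

static int
naive_inrange(uintptr_t testaddr, size_t testsz, uintptr_t base, size_t sz)
{
	/* testaddr + testsz can wrap past zero and falsely "pass" */
	return (testaddr >= base && testaddr + testsz <= base + sz);
}

static int
safe_inrange(uintptr_t testaddr, size_t testsz, uintptr_t base, size_t sz)
{
	/* same shape as DTRACE_INRANGE: unsigned subtraction plus an
	 * explicit wrap test on the end of the tested range */
	return (testaddr - base < sz &&
	    testaddr + testsz - base <= sz &&
	    testaddr + testsz >= testaddr);
}

int
main(void)
{
	uintptr_t base = 0x1000;
	size_t sz = 0x100;
	uintptr_t evil = UINTPTR_MAX - 8;	/* a 16-byte range here wraps */

	printf("naive: %d\n", naive_inrange(evil, 16, base, sz));	/* 1 (wrong) */
	printf("safe:  %d\n", safe_inrange(evil, 16, base, sz));	/* 0 */
	return (0);
}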
+ */ +#define DTRACE_INRANGE(testaddr, testsz, baseaddr, basesz) \ + ((testaddr) - (uintptr_t)(baseaddr) < (basesz) && \ + (testaddr) + (testsz) - (uintptr_t)(baseaddr) <= (basesz) && \ + (testaddr) + (testsz) >= (testaddr)) + +#define DTRACE_RANGE_REMAIN(remp, addr, baseaddr, basesz) \ +do { \ + if ((remp) != NULL) { \ + *(remp) = (uintptr_t)(baseaddr) + (basesz) - (addr); \ + } \ +_NOTE(CONSTCOND) } while (0) + + +/* + * Test whether alloc_sz bytes will fit in the scratch region. We isolate + * alloc_sz on the righthand side of the comparison in order to avoid overflow + * or underflow in the comparison with it. This is simpler than the INRANGE + * check above, because we know that the dtms_scratch_ptr is valid in the + * range. Allocations of size zero are allowed. + */ +#define DTRACE_INSCRATCH(mstate, alloc_sz) \ + ((mstate)->dtms_scratch_base + (mstate)->dtms_scratch_size - \ + (mstate)->dtms_scratch_ptr >= (alloc_sz)) + +#define DTRACE_LOADFUNC(bits) \ +/*CSTYLED*/ \ +uint##bits##_t \ +dtrace_load##bits(uintptr_t addr) \ +{ \ + size_t size = bits / NBBY; \ + /*CSTYLED*/ \ + uint##bits##_t rval; \ + int i; \ + volatile uint16_t *flags = (volatile uint16_t *) \ + &cpu_core[curcpu].cpuc_dtrace_flags; \ + \ + DTRACE_ALIGNCHECK(addr, size, flags); \ + \ + for (i = 0; i < dtrace_toxranges; i++) { \ + if (addr >= dtrace_toxrange[i].dtt_limit) \ + continue; \ + \ + if (addr + size <= dtrace_toxrange[i].dtt_base) \ + continue; \ + \ + /* \ + * This address falls within a toxic region; return 0. \ + */ \ + *flags |= CPU_DTRACE_BADADDR; \ + cpu_core[curcpu].cpuc_dtrace_illval = addr; \ + return (0); \ + } \ + \ + *flags |= CPU_DTRACE_NOFAULT; \ + /*CSTYLED*/ \ + rval = *((volatile uint##bits##_t *)addr); \ + *flags &= ~CPU_DTRACE_NOFAULT; \ + \ + return (!(*flags & CPU_DTRACE_FAULT) ? rval : 0); \ +} + +#ifdef _LP64 +#define dtrace_loadptr dtrace_load64 +#else +#define dtrace_loadptr dtrace_load32 +#endif + +#define DTRACE_DYNHASH_FREE 0 +#define DTRACE_DYNHASH_SINK 1 +#define DTRACE_DYNHASH_VALID 2 + +#define DTRACE_MATCH_NEXT 0 +#define DTRACE_MATCH_DONE 1 +#define DTRACE_ANCHORED(probe) ((probe)->dtpr_func[0] != '\0') +#define DTRACE_STATE_ALIGN 64 + +#define DTRACE_FLAGS2FLT(flags) \ + (((flags) & CPU_DTRACE_BADADDR) ? DTRACEFLT_BADADDR : \ + ((flags) & CPU_DTRACE_ILLOP) ? DTRACEFLT_ILLOP : \ + ((flags) & CPU_DTRACE_DIVZERO) ? DTRACEFLT_DIVZERO : \ + ((flags) & CPU_DTRACE_KPRIV) ? DTRACEFLT_KPRIV : \ + ((flags) & CPU_DTRACE_UPRIV) ? DTRACEFLT_UPRIV : \ + ((flags) & CPU_DTRACE_TUPOFLOW) ? DTRACEFLT_TUPOFLOW : \ + ((flags) & CPU_DTRACE_BADALIGN) ? DTRACEFLT_BADALIGN : \ + ((flags) & CPU_DTRACE_NOSCRATCH) ? DTRACEFLT_NOSCRATCH : \ + ((flags) & CPU_DTRACE_BADSTACK) ? 
DTRACEFLT_BADSTACK : \ + DTRACEFLT_UNKNOWN) + +#define DTRACEACT_ISSTRING(act) \ + ((act)->dta_kind == DTRACEACT_DIFEXPR && \ + (act)->dta_difo->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING) + +/* Function prototype definitions: */ +static size_t dtrace_strlen(const char *, size_t); +static dtrace_probe_t *dtrace_probe_lookup_id(dtrace_id_t id); +static void dtrace_enabling_provide(dtrace_provider_t *); +static int dtrace_enabling_match(dtrace_enabling_t *, int *); +static void dtrace_enabling_matchall(void); +static void dtrace_enabling_reap(void); +static dtrace_state_t *dtrace_anon_grab(void); +static uint64_t dtrace_helper(int, dtrace_mstate_t *, + dtrace_state_t *, uint64_t, uint64_t); +static dtrace_helpers_t *dtrace_helpers_create(proc_t *); +static void dtrace_buffer_drop(dtrace_buffer_t *); +static int dtrace_buffer_consumed(dtrace_buffer_t *, hrtime_t when); +static intptr_t dtrace_buffer_reserve(dtrace_buffer_t *, size_t, size_t, + dtrace_state_t *, dtrace_mstate_t *); +static int dtrace_state_option(dtrace_state_t *, dtrace_optid_t, + dtrace_optval_t); +static int dtrace_ecb_create_enable(dtrace_probe_t *, void *); +static void dtrace_helper_provider_destroy(dtrace_helper_provider_t *); +uint16_t dtrace_load16(uintptr_t); +uint32_t dtrace_load32(uintptr_t); +uint64_t dtrace_load64(uintptr_t); +uint8_t dtrace_load8(uintptr_t); +void dtrace_dynvar_clean(dtrace_dstate_t *); +dtrace_dynvar_t *dtrace_dynvar(dtrace_dstate_t *, uint_t, dtrace_key_t *, + size_t, dtrace_dynvar_op_t, dtrace_mstate_t *, dtrace_vstate_t *); +uintptr_t dtrace_dif_varstr(uintptr_t, dtrace_state_t *, dtrace_mstate_t *); +static int dtrace_priv_proc(dtrace_state_t *); +static void dtrace_getf_barrier(void); +static int dtrace_canload_remains(uint64_t, size_t, size_t *, + dtrace_mstate_t *, dtrace_vstate_t *); +static int dtrace_canstore_remains(uint64_t, size_t, size_t *, + dtrace_mstate_t *, dtrace_vstate_t *); + +/* + * DTrace Probe Context Functions + * + * These functions are called from probe context. Because probe context is + * any context in which C may be called, arbitrary locks may be held, + * interrupts may be disabled, we may be in arbitrary dispatched state, etc. + * As a result, functions called from probe context may only call other DTrace + * support functions -- they may not interact at all with the system at large. + * (Note that the ASSERT macro is made probe-context safe by redefining it in + * terms of dtrace_assfail(), a probe-context safe function.) If arbitrary + * loads are to be performed from probe context, they _must_ be in terms of + * the safe dtrace_load*() variants. + * + * Some functions in this block are not actually called from probe context; + * for these functions, there will be a comment above the function reading + * "Note: not called from probe context." + */ +void +dtrace_panic(const char *format, ...) +{ + va_list alist; + + va_start(alist, format); +#ifdef __FreeBSD__ + vpanic(format, alist); +#else + dtrace_vpanic(format, alist); +#endif + va_end(alist); +} + +int +dtrace_assfail(const char *a, const char *f, int l) +{ + dtrace_panic("assertion failed: %s, file: %s, line: %d", a, f, l); + + /* + * We just need something here that even the most clever compiler + * cannot optimize away. + */ + return (a[(uintptr_t)f]); +} + +/* + * Atomically increment a specified error counter from probe context. + */ +static void +dtrace_error(uint32_t *counter) +{ + /* + * Most counters stored to in probe context are per-CPU counters.
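A userland analogue (hypothetical, using C11 atomics in place of the kernel's dtrace_cas32()) of the wrap-avoiding increment that this function implements:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static void
error_bump(_Atomic uint32_t *counter)
{
	uint32_t oval, nval;

	do {
		oval = atomic_load(counter);
		if ((nval = oval + 1) == 0)
			nval = 1;	/* wrapped: pin to 1 so the count stays nonzero */
	} while (!atomic_compare_exchange_weak(counter, &oval, nval));
}

int
main(void)
{
	_Atomic uint32_t errs = UINT32_MAX;	/* force the wrap case */

	error_bump(&errs);
	printf("%u\n", (unsigned)atomic_load(&errs));	/* prints 1, not 0 */
	return (0);
}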
+ * However, there are some error conditions that are sufficiently + * arcane that they don't merit per-CPU storage. If these counters + * are incremented concurrently on different CPUs, scalability will be + * adversely affected -- but we don't expect them to be white-hot in a + * correctly constructed enabling... + */ + uint32_t oval, nval; + + do { + oval = *counter; + + if ((nval = oval + 1) == 0) { + /* + * If the counter would wrap, set it to 1 -- assuring + * that the counter is never zero when we have seen + * errors. (The counter must be 32-bits because we + * aren't guaranteed a 64-bit compare&swap operation.) + * To save this code both the infamy of being fingered + * by a priggish news story and the indignity of being + * the target of a neo-puritan witch trial, we're + * carefully avoiding any colorful description of the + * likelihood of this condition -- but suffice it to + * say that it is only slightly more likely than the + * overflow of predicate cache IDs, as discussed in + * dtrace_predicate_create(). + */ + nval = 1; + } + } while (dtrace_cas32(counter, oval, nval) != oval); +} + +/* + * Use the DTRACE_LOADFUNC macro to define functions for each of loading a + * uint8_t, a uint16_t, a uint32_t and a uint64_t. + */ +/* BEGIN CSTYLED */ +DTRACE_LOADFUNC(8) +DTRACE_LOADFUNC(16) +DTRACE_LOADFUNC(32) +DTRACE_LOADFUNC(64) +/* END CSTYLED */ + +static int +dtrace_inscratch(uintptr_t dest, size_t size, dtrace_mstate_t *mstate) +{ + if (dest < mstate->dtms_scratch_base) + return (0); + + if (dest + size < dest) + return (0); + + if (dest + size > mstate->dtms_scratch_ptr) + return (0); + + return (1); +} + +static int +dtrace_canstore_statvar(uint64_t addr, size_t sz, size_t *remain, + dtrace_statvar_t **svars, int nsvars) +{ + int i; + size_t maxglobalsize, maxlocalsize; + + if (nsvars == 0) + return (0); + + maxglobalsize = dtrace_statvar_maxsize + sizeof (uint64_t); + maxlocalsize = maxglobalsize * NCPU; + + for (i = 0; i < nsvars; i++) { + dtrace_statvar_t *svar = svars[i]; + uint8_t scope; + size_t size; + + if (svar == NULL || (size = svar->dtsv_size) == 0) + continue; + + scope = svar->dtsv_var.dtdv_scope; + + /* + * We verify that our size is valid in the spirit of providing + * defense in depth: we want to prevent attackers from using + * DTrace to escalate an orthogonal kernel heap corruption bug + * into the ability to store to arbitrary locations in memory. + */ + VERIFY((scope == DIFV_SCOPE_GLOBAL && size <= maxglobalsize) || + (scope == DIFV_SCOPE_LOCAL && size <= maxlocalsize)); + + if (DTRACE_INRANGE(addr, sz, svar->dtsv_data, + svar->dtsv_size)) { + DTRACE_RANGE_REMAIN(remain, addr, svar->dtsv_data, + svar->dtsv_size); + return (1); + } + } + + return (0); +} + +/* + * Check to see if the address is within a memory region to which a store may + * be issued. This includes the DTrace scratch areas, and any DTrace variable + * region. The caller of dtrace_canstore() is responsible for performing any + * alignment checks that are needed before stores are actually executed. + */ +static int +dtrace_canstore(uint64_t addr, size_t sz, dtrace_mstate_t *mstate, + dtrace_vstate_t *vstate) +{ + return (dtrace_canstore_remains(addr, sz, NULL, mstate, vstate)); +} + +/* + * Implementation of dtrace_canstore which communicates the upper bound of the + * allowed memory region. 
+ */ +static int +dtrace_canstore_remains(uint64_t addr, size_t sz, size_t *remain, + dtrace_mstate_t *mstate, dtrace_vstate_t *vstate) +{ + /* + * First, check to see if the address is in scratch space... + */ + if (DTRACE_INRANGE(addr, sz, mstate->dtms_scratch_base, + mstate->dtms_scratch_size)) { + DTRACE_RANGE_REMAIN(remain, addr, mstate->dtms_scratch_base, + mstate->dtms_scratch_size); + return (1); + } + + /* + * Now check to see if it's a dynamic variable. This check will pick + * up both thread-local variables and any global dynamically-allocated + * variables. + */ + if (DTRACE_INRANGE(addr, sz, vstate->dtvs_dynvars.dtds_base, + vstate->dtvs_dynvars.dtds_size)) { + dtrace_dstate_t *dstate = &vstate->dtvs_dynvars; + uintptr_t base = (uintptr_t)dstate->dtds_base + + (dstate->dtds_hashsize * sizeof (dtrace_dynhash_t)); + uintptr_t chunkoffs; + dtrace_dynvar_t *dvar; + + /* + * Before we assume that we can store here, we need to make + * sure that it isn't in our metadata -- storing to our + * dynamic variable metadata would corrupt our state. For + * the range to not include any dynamic variable metadata, + * it must: + * + * (1) Start above the hash table that is at the base of + * the dynamic variable space + * + * (2) Have a starting chunk offset that is beyond the + * dtrace_dynvar_t that is at the base of every chunk + * + * (3) Not span a chunk boundary + * + * (4) Not be in the tuple space of a dynamic variable + * + */ + if (addr < base) + return (0); + + chunkoffs = (addr - base) % dstate->dtds_chunksize; + + if (chunkoffs < sizeof (dtrace_dynvar_t)) + return (0); + + if (chunkoffs + sz > dstate->dtds_chunksize) + return (0); + + dvar = (dtrace_dynvar_t *)((uintptr_t)addr - chunkoffs); + + if (dvar->dtdv_hashval == DTRACE_DYNHASH_FREE) + return (0); + + if (chunkoffs < sizeof (dtrace_dynvar_t) + + ((dvar->dtdv_tuple.dtt_nkeys - 1) * sizeof (dtrace_key_t))) + return (0); + + DTRACE_RANGE_REMAIN(remain, addr, dvar, dstate->dtds_chunksize); + return (1); + } + + /* + * Finally, check the static local and global variables. These checks + * take the longest, so we perform them last. + */ + if (dtrace_canstore_statvar(addr, sz, remain, + vstate->dtvs_locals, vstate->dtvs_nlocals)) + return (1); + + if (dtrace_canstore_statvar(addr, sz, remain, + vstate->dtvs_globals, vstate->dtvs_nglobals)) + return (1); + + return (0); +} + + +/* + * Convenience routine to check to see if the address is within a memory + * region in which a load may be issued given the user's privilege level; + * if not, it sets the appropriate error flags and loads 'addr' into the + * illegal value slot. + * + * DTrace subroutines (DIF_SUBR_*) should use this helper to implement + * appropriate memory access protection. + */ +static int +dtrace_canload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate, + dtrace_vstate_t *vstate) +{ + return (dtrace_canload_remains(addr, sz, NULL, mstate, vstate)); +} + +/* + * Implementation of dtrace_canload which communicates the upper bound of the + * allowed memory region. + */ +static int +dtrace_canload_remains(uint64_t addr, size_t sz, size_t *remain, + dtrace_mstate_t *mstate, dtrace_vstate_t *vstate) +{ + volatile uintptr_t *illval = &cpu_core[curcpu].cpuc_dtrace_illval; + file_t *fp; + + /* + * If we hold the privilege to read from kernel memory, then + * everything is readable. + */ + if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) { + DTRACE_RANGE_REMAIN(remain, addr, addr, sz); + return (1); + } + + /* + * You can obviously read that which you can store.
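A standalone sketch (hypothetical names throughout) of the chunk-layout tests enumerated above: a store must clear the per-chunk dtrace_dynvar_t header and must not span a chunk boundary:

#include <stdint.h>
#include <stdio.h>

struct chunk_hdr { uint64_t hashval; };	/* stand-in for dtrace_dynvar_t */

static int
store_ok(uintptr_t addr, size_t sz, uintptr_t base, size_t chunksize)
{
	size_t chunkoffs;

	if (addr < base)			/* below the variable space */
		return (0);
	chunkoffs = (addr - base) % chunksize;
	if (chunkoffs < sizeof (struct chunk_hdr))
		return (0);			/* inside the per-chunk header */
	if (chunkoffs + sz > chunksize)
		return (0);			/* spans a chunk boundary */
	return (1);
}

int
main(void)
{
	printf("%d\n", store_ok(0x1008, 8, 0x1000, 64));	/* 1: clear of header */
	printf("%d\n", store_ok(0x1000, 8, 0x1000, 64));	/* 0: hits the header */
	printf("%d\n", store_ok(0x1038, 16, 0x1000, 64));	/* 0: spans boundary */
	return (0);
}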
+ */ + if (dtrace_canstore_remains(addr, sz, remain, mstate, vstate)) + return (1); + + /* + * We're allowed to read from our own string table. + */ + if (DTRACE_INRANGE(addr, sz, mstate->dtms_difo->dtdo_strtab, + mstate->dtms_difo->dtdo_strlen)) { + DTRACE_RANGE_REMAIN(remain, addr, + mstate->dtms_difo->dtdo_strtab, + mstate->dtms_difo->dtdo_strlen); + return (1); + } + + if (vstate->dtvs_state != NULL && + dtrace_priv_proc(vstate->dtvs_state)) { + proc_t *p; + + /* + * When we have privileges to the current process, there are + * several context-related kernel structures that are safe to + * read, even absent the privilege to read from kernel memory. + * These reads are safe because these structures contain only + * state that (1) we're permitted to read, (2) is harmless or + * (3) contains pointers to additional kernel state that we're + * not permitted to read (and as such, do not present an + * opportunity for privilege escalation). Finally (and + * critically), because of the nature of their relation with + * the current thread context, the memory associated with these + * structures cannot change over the duration of probe context, + * and it is therefore impossible for this memory to be + * deallocated and reallocated as something else while it's + * being operated upon. + */ + if (DTRACE_INRANGE(addr, sz, curthread, sizeof (kthread_t))) { + DTRACE_RANGE_REMAIN(remain, addr, curthread, + sizeof (kthread_t)); + return (1); + } + + if ((p = curthread->t_procp) != NULL && DTRACE_INRANGE(addr, + sz, curthread->t_procp, sizeof (proc_t))) { + DTRACE_RANGE_REMAIN(remain, addr, curthread->t_procp, + sizeof (proc_t)); + return (1); + } + + if (curthread->t_cred != NULL && DTRACE_INRANGE(addr, sz, + curthread->t_cred, sizeof (cred_t))) { + DTRACE_RANGE_REMAIN(remain, addr, curthread->t_cred, + sizeof (cred_t)); + return (1); + } + +#ifdef illumos + if (p != NULL && p->p_pidp != NULL && DTRACE_INRANGE(addr, sz, + &(p->p_pidp->pid_id), sizeof (pid_t))) { + DTRACE_RANGE_REMAIN(remain, addr, &(p->p_pidp->pid_id), + sizeof (pid_t)); + return (1); + } + + if (curthread->t_cpu != NULL && DTRACE_INRANGE(addr, sz, + curthread->t_cpu, offsetof(cpu_t, cpu_pause_thread))) { + DTRACE_RANGE_REMAIN(remain, addr, curthread->t_cpu, + offsetof(cpu_t, cpu_pause_thread)); + return (1); + } +#endif + } + + if ((fp = mstate->dtms_getf) != NULL) { + uintptr_t psz = sizeof (void *); + vnode_t *vp; + vnodeops_t *op; + + /* + * When getf() returns a file_t, the enabling is implicitly + * granted the (transient) right to read the returned file_t + * as well as the v_path and v_op->vnop_name of the underlying + * vnode. These accesses are allowed after a successful + * getf() because the members that they refer to cannot change + * once set -- and the barrier logic in the kernel's closef() + * path assures that the file_t and its referenced vnode_t + * cannot themselves be stale (that is, it is impossible for + * either dtms_getf itself or its f_vnode member to reference + * freed memory).
+ */ + if (DTRACE_INRANGE(addr, sz, fp, sizeof (file_t))) { + DTRACE_RANGE_REMAIN(remain, addr, fp, sizeof (file_t)); + return (1); + } + + if ((vp = fp->f_vnode) != NULL) { + size_t slen; +#ifdef illumos + if (DTRACE_INRANGE(addr, sz, &vp->v_path, psz)) { + DTRACE_RANGE_REMAIN(remain, addr, &vp->v_path, + psz); + return (1); + } + slen = strlen(vp->v_path) + 1; + if (DTRACE_INRANGE(addr, sz, vp->v_path, slen)) { + DTRACE_RANGE_REMAIN(remain, addr, vp->v_path, + slen); + return (1); + } +#endif + + if (DTRACE_INRANGE(addr, sz, &vp->v_op, psz)) { + DTRACE_RANGE_REMAIN(remain, addr, &vp->v_op, + psz); + return (1); + } + +#ifdef illumos + if ((op = vp->v_op) != NULL && + DTRACE_INRANGE(addr, sz, &op->vnop_name, psz)) { + DTRACE_RANGE_REMAIN(remain, addr, + &op->vnop_name, psz); + return (1); + } + + if (op != NULL && op->vnop_name != NULL && + DTRACE_INRANGE(addr, sz, op->vnop_name, + (slen = strlen(op->vnop_name) + 1))) { + DTRACE_RANGE_REMAIN(remain, addr, + op->vnop_name, slen); + return (1); + } +#endif + } + } + + DTRACE_CPUFLAG_SET(CPU_DTRACE_KPRIV); + *illval = addr; + return (0); +} + +/* + * Convenience routine to check to see if a given string is within a memory + * region in which a load may be issued given the user's privilege level; + * this exists so that we don't need to issue unnecessary dtrace_strlen() + * calls in the event that the user has all privileges. + */ +static int +dtrace_strcanload(uint64_t addr, size_t sz, size_t *remain, + dtrace_mstate_t *mstate, dtrace_vstate_t *vstate) +{ + size_t rsize; + + /* + * If we hold the privilege to read from kernel memory, then + * everything is readable. + */ + if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) { + DTRACE_RANGE_REMAIN(remain, addr, addr, sz); + return (1); + } + + /* + * Even if the caller is uninterested in querying the remaining valid + * range, it is required to ensure that the access is allowed. + */ + if (remain == NULL) { + remain = &rsize; + } + if (dtrace_canload_remains(addr, 0, remain, mstate, vstate)) { + size_t strsz; + /* + * Perform the strlen after determining the length of the + * memory region which is accessible. This prevents timing + * information from being used to find NULs in memory which is + * not accessible to the caller. + */ + strsz = 1 + dtrace_strlen((char *)(uintptr_t)addr, + MIN(sz, *remain)); + if (strsz <= *remain) { + return (1); + } + } + + return (0); +} + +/* + * Convenience routine to check to see if a given variable is within a memory + * region in which a load may be issued given the user's privilege level. + */ +static int +dtrace_vcanload(void *src, dtrace_diftype_t *type, size_t *remain, + dtrace_mstate_t *mstate, dtrace_vstate_t *vstate) +{ + size_t sz; + ASSERT(type->dtdt_flags & DIF_TF_BYREF); + + /* + * Calculate the max size before performing any checks since even + * DTRACE_ACCESS_KERNEL-credentialed callers expect that this function + * return the max length via 'remain'. + */ + if (type->dtdt_kind == DIF_TYPE_STRING) { + dtrace_state_t *state = vstate->dtvs_state; + + if (state != NULL) { + sz = state->dts_options[DTRACEOPT_STRSIZE]; + } else { + /* + * In helper context, we have a NULL state; fall back + * to using the system-wide default for the string size + * in this case. + */ + sz = dtrace_strsize_default; + } + } else { + sz = type->dtdt_size; + } + + /* + * If we hold the privilege to read from kernel memory, then + * everything is readable. 
+ */ + if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) { + DTRACE_RANGE_REMAIN(remain, (uintptr_t)src, src, sz); + return (1); + } + + if (type->dtdt_kind == DIF_TYPE_STRING) { + return (dtrace_strcanload((uintptr_t)src, sz, remain, mstate, + vstate)); + } + return (dtrace_canload_remains((uintptr_t)src, sz, remain, mstate, + vstate)); +} + +/* + * Convert a string to a signed integer using safe loads. + * + * NOTE: This function uses various macros from strtolctype.h to manipulate + * digit values, etc -- these have all been checked to ensure they make + * no additional function calls. + */ +static int64_t +dtrace_strtoll(char *input, int base, size_t limit) +{ + uintptr_t pos = (uintptr_t)input; + int64_t val = 0; + int x; + boolean_t neg = B_FALSE; + char c, cc, ccc; + uintptr_t end = pos + limit; + + /* + * Consume any whitespace preceding digits. + */ + while ((c = dtrace_load8(pos)) == ' ' || c == '\t') + pos++; + + /* + * Handle an explicit sign if one is present. + */ + if (c == '-' || c == '+') { + if (c == '-') + neg = B_TRUE; + c = dtrace_load8(++pos); + } + + /* + * Check for an explicit hexadecimal prefix ("0x" or "0X") and skip it + * if present. + */ + if (base == 16 && c == '0' && ((cc = dtrace_load8(pos + 1)) == 'x' || + cc == 'X') && isxdigit(ccc = dtrace_load8(pos + 2))) { + pos += 2; + c = ccc; + } + + /* + * Read in contiguous digits until the first non-digit character. + */ + for (; pos < end && c != '\0' && lisalnum(c) && (x = DIGIT(c)) < base; + c = dtrace_load8(++pos)) + val = val * base + x; + + return (neg ? -val : val); +} + +/* + * Compare two strings using safe loads. + */ +static int +dtrace_strncmp(char *s1, char *s2, size_t limit) +{ + uint8_t c1, c2; + volatile uint16_t *flags; + + if (s1 == s2 || limit == 0) + return (0); + + flags = (volatile uint16_t *)&cpu_core[curcpu].cpuc_dtrace_flags; + + do { + if (s1 == NULL) { + c1 = '\0'; + } else { + c1 = dtrace_load8((uintptr_t)s1++); + } + + if (s2 == NULL) { + c2 = '\0'; + } else { + c2 = dtrace_load8((uintptr_t)s2++); + } + + if (c1 != c2) + return (c1 - c2); + } while (--limit && c1 != '\0' && !(*flags & CPU_DTRACE_FAULT)); + + return (0); +} + +/* + * Compute strlen(s) for a string using safe memory accesses. The additional + * len parameter is used to specify a maximum length to ensure completion. + */ +static size_t +dtrace_strlen(const char *s, size_t lim) +{ + uint_t len; + + for (len = 0; len != lim; len++) { + if (dtrace_load8((uintptr_t)s++) == '\0') + break; + } + + return (len); +} + +/* + * Check if an address falls within a toxic region. + */ +static int +dtrace_istoxic(uintptr_t kaddr, size_t size) +{ + uintptr_t taddr, tsize; + int i; + + for (i = 0; i < dtrace_toxranges; i++) { + taddr = dtrace_toxrange[i].dtt_base; + tsize = dtrace_toxrange[i].dtt_limit - taddr; + + if (kaddr - taddr < tsize) { + DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR); + cpu_core[curcpu].cpuc_dtrace_illval = kaddr; + return (1); + } + + if (taddr - kaddr < size) { + DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR); + cpu_core[curcpu].cpuc_dtrace_illval = taddr; + return (1); + } + } + + return (0); +} + +/* + * Copy src to dst using safe memory accesses. The src is assumed to be unsafe + * memory specified by the DIF program. The dst is assumed to be safe memory + * that we can store to directly because it is managed by DTrace. As with + * standard bcopy, overlapping copies are handled properly. 
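A small userland demonstration (not part of this diff) of why dtrace_bcopy() below must choose its copy direction: a forward byte copy smears an overlapping shift-right, while a backward copy (here via memmove) preserves it:

#include <stdio.h>
#include <string.h>

static void
copy_forward(char *dst, const char *src, size_t len)
{
	while (len-- != 0)
		*dst++ = *src++;
}

int
main(void)
{
	char buf[] = "abcdef";

	copy_forward(buf + 1, buf, 5);		/* overlapping shift right */
	printf("forward: %s\n", buf);		/* "aaaaaa": the copy smeared */

	strcpy(buf, "abcdef");
	memmove(buf + 1, buf, 5);		/* copies backward when dst > src */
	printf("memmove: %s\n", buf);		/* "aabcde": preserved */
	return (0);
}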
+ */ +static void +dtrace_bcopy(const void *src, void *dst, size_t len) +{ + if (len != 0) { + uint8_t *s1 = dst; + const uint8_t *s2 = src; + + if (s1 <= s2) { + do { + *s1++ = dtrace_load8((uintptr_t)s2++); + } while (--len != 0); + } else { + s2 += len; + s1 += len; + + do { + *--s1 = dtrace_load8((uintptr_t)--s2); + } while (--len != 0); + } + } +} + +/* + * Copy src to dst using safe memory accesses, up to either the specified + * length, or the point that a nul byte is encountered. The src is assumed to + * be unsafe memory specified by the DIF program. The dst is assumed to be + * safe memory that we can store to directly because it is managed by DTrace. + * Unlike dtrace_bcopy(), overlapping regions are not handled. + */ +static void +dtrace_strcpy(const void *src, void *dst, size_t len) +{ + if (len != 0) { + uint8_t *s1 = dst, c; + const uint8_t *s2 = src; + + do { + *s1++ = c = dtrace_load8((uintptr_t)s2++); + } while (--len != 0 && c != '\0'); + } +} + +/* + * Copy src to dst, deriving the size and type from the specified (BYREF) + * variable type. The src is assumed to be unsafe memory specified by the DIF + * program. The dst is assumed to be DTrace variable memory that is of the + * specified type; we assume that we can store to directly. + */ +static void +dtrace_vcopy(void *src, void *dst, dtrace_diftype_t *type, size_t limit) +{ + ASSERT(type->dtdt_flags & DIF_TF_BYREF); + + if (type->dtdt_kind == DIF_TYPE_STRING) { + dtrace_strcpy(src, dst, MIN(type->dtdt_size, limit)); + } else { + dtrace_bcopy(src, dst, MIN(type->dtdt_size, limit)); + } +} + +/* + * Compare s1 to s2 using safe memory accesses. The s1 data is assumed to be + * unsafe memory specified by the DIF program. The s2 data is assumed to be + * safe memory that we can access directly because it is managed by DTrace. + */ +static int +dtrace_bcmp(const void *s1, const void *s2, size_t len) +{ + volatile uint16_t *flags; + + flags = (volatile uint16_t *)&cpu_core[curcpu].cpuc_dtrace_flags; + + if (s1 == s2) + return (0); + + if (s1 == NULL || s2 == NULL) + return (1); + + if (s1 != s2 && len != 0) { + const uint8_t *ps1 = s1; + const uint8_t *ps2 = s2; + + do { + if (dtrace_load8((uintptr_t)ps1++) != *ps2++) + return (1); + } while (--len != 0 && !(*flags & CPU_DTRACE_FAULT)); + } + return (0); +} + +/* + * Zero the specified region using a simple byte-by-byte loop. Note that this + * is for safe DTrace-managed memory only. + */ +static void +dtrace_bzero(void *dst, size_t len) +{ + uchar_t *cp; + + for (cp = dst; len != 0; len--) + *cp++ = 0; +} + +static void +dtrace_add_128(uint64_t *addend1, uint64_t *addend2, uint64_t *sum) +{ + uint64_t result[2]; + + result[0] = addend1[0] + addend2[0]; + result[1] = addend1[1] + addend2[1] + + (result[0] < addend1[0] || result[0] < addend2[0] ? 1 : 0); + + sum[0] = result[0]; + sum[1] = result[1]; +} + +/* + * Shift the 128-bit value in a by b. If b is positive, shift left. + * If b is negative, shift right. 
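+ *
+ * Here a[0] holds the low and a[1] the high 64 bits.  As a worked
+ * example, right-shifting the value 2^64 (a[0] = 0, a[1] = 1) by 32
+ * (that is, b = -32) yields a[0] = 2^32, a[1] = 0.
+ * dtrace_multiply_128() below depends on this routine (with b = 32) to
+ * position its 32-bit cross terms.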
+ */ +static void +dtrace_shift_128(uint64_t *a, int b) +{ + uint64_t mask; + + if (b == 0) + return; + + if (b < 0) { + b = -b; + if (b >= 64) { + a[0] = a[1] >> (b - 64); + a[1] = 0; + } else { + a[0] >>= b; + mask = 1LL << (64 - b); + mask -= 1; + a[0] |= ((a[1] & mask) << (64 - b)); + a[1] >>= b; + } + } else { + if (b >= 64) { + a[1] = a[0] << (b - 64); + a[0] = 0; + } else { + a[1] <<= b; + mask = a[0] >> (64 - b); + a[1] |= mask; + a[0] <<= b; + } + } +} + +/* + * The basic idea is to break the 2 64-bit values into 4 32-bit values, + * use native multiplication on those, and then re-combine into the + * resulting 128-bit value. + * + * (hi1 << 32 + lo1) * (hi2 << 32 + lo2) = + * hi1 * hi2 << 64 + + * hi1 * lo2 << 32 + + * hi2 * lo1 << 32 + + * lo1 * lo2 + */ +static void +dtrace_multiply_128(uint64_t factor1, uint64_t factor2, uint64_t *product) +{ + uint64_t hi1, hi2, lo1, lo2; + uint64_t tmp[2]; + + hi1 = factor1 >> 32; + hi2 = factor2 >> 32; + + lo1 = factor1 & DT_MASK_LO; + lo2 = factor2 & DT_MASK_LO; + + product[0] = lo1 * lo2; + product[1] = hi1 * hi2; + + tmp[0] = hi1 * lo2; + tmp[1] = 0; + dtrace_shift_128(tmp, 32); + dtrace_add_128(product, tmp, product); + + tmp[0] = hi2 * lo1; + tmp[1] = 0; + dtrace_shift_128(tmp, 32); + dtrace_add_128(product, tmp, product); +} + +/* + * This privilege check should be used by actions and subroutines to + * verify that the user credentials of the process that enabled the + * invoking ECB match the target credentials + */ +static int +dtrace_priv_proc_common_user(dtrace_state_t *state) +{ + cred_t *cr, *s_cr = state->dts_cred.dcr_cred; + + /* + * We should always have a non-NULL state cred here, since if cred + * is null (anonymous tracing), we fast-path bypass this routine. + */ + ASSERT(s_cr != NULL); + + if ((cr = CRED()) != NULL && + s_cr->cr_uid == cr->cr_uid && + s_cr->cr_uid == cr->cr_ruid && + s_cr->cr_uid == cr->cr_suid && + s_cr->cr_gid == cr->cr_gid && + s_cr->cr_gid == cr->cr_rgid && + s_cr->cr_gid == cr->cr_sgid) + return (1); + + return (0); +} + +/* + * This privilege check should be used by actions and subroutines to + * verify that the zone of the process that enabled the invoking ECB + * matches the target credentials + */ +static int +dtrace_priv_proc_common_zone(dtrace_state_t *state) +{ +#ifdef illumos + cred_t *cr, *s_cr = state->dts_cred.dcr_cred; + + /* + * We should always have a non-NULL state cred here, since if cred + * is null (anonymous tracing), we fast-path bypass this routine. + */ + ASSERT(s_cr != NULL); + + if ((cr = CRED()) != NULL && s_cr->cr_zone == cr->cr_zone) + return (1); + + return (0); +#else + return (1); +#endif +} + +/* + * This privilege check should be used by actions and subroutines to + * verify that the process has not setuid or changed credentials. 
+ */ +static int +dtrace_priv_proc_common_nocd(void) +{ + proc_t *proc; + + if ((proc = ttoproc(curthread)) != NULL && + !(proc->p_flag & SNOCD)) + return (1); + + return (0); +} + +static int +dtrace_priv_proc_destructive(dtrace_state_t *state) +{ + int action = state->dts_cred.dcr_action; + + if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE) == 0) && + dtrace_priv_proc_common_zone(state) == 0) + goto bad; + + if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER) == 0) && + dtrace_priv_proc_common_user(state) == 0) + goto bad; + + if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG) == 0) && + dtrace_priv_proc_common_nocd() == 0) + goto bad; + + return (1); + +bad: + cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV; + + return (0); +} + +static int +dtrace_priv_proc_control(dtrace_state_t *state) +{ + if (state->dts_cred.dcr_action & DTRACE_CRA_PROC_CONTROL) + return (1); + + if (dtrace_priv_proc_common_zone(state) && + dtrace_priv_proc_common_user(state) && + dtrace_priv_proc_common_nocd()) + return (1); + + cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV; + + return (0); +} + +static int +dtrace_priv_proc(dtrace_state_t *state) +{ + if (state->dts_cred.dcr_action & DTRACE_CRA_PROC) + return (1); + + cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV; + + return (0); +} + +static int +dtrace_priv_kernel(dtrace_state_t *state) +{ + if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL) + return (1); + + cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV; + + return (0); +} + +static int +dtrace_priv_kernel_destructive(dtrace_state_t *state) +{ + if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL_DESTRUCTIVE) + return (1); + + cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV; + + return (0); +} + +/* + * Determine if the dte_cond of the specified ECB allows for processing of + * the current probe to continue. Note that this routine may allow continued + * processing, but with access(es) stripped from the mstate's dtms_access + * field. + */ +static int +dtrace_priv_probe(dtrace_state_t *state, dtrace_mstate_t *mstate, + dtrace_ecb_t *ecb) +{ + dtrace_probe_t *probe = ecb->dte_probe; + dtrace_provider_t *prov = probe->dtpr_provider; + dtrace_pops_t *pops = &prov->dtpv_pops; + int mode = DTRACE_MODE_NOPRIV_DROP; + + ASSERT(ecb->dte_cond); + +#ifdef illumos + if (pops->dtps_mode != NULL) { + mode = pops->dtps_mode(prov->dtpv_arg, + probe->dtpr_id, probe->dtpr_arg); + + ASSERT((mode & DTRACE_MODE_USER) || + (mode & DTRACE_MODE_KERNEL)); + ASSERT((mode & DTRACE_MODE_NOPRIV_RESTRICT) || + (mode & DTRACE_MODE_NOPRIV_DROP)); + } + + /* + * If the dte_cond bits indicate that this consumer is only allowed to + * see user-mode firings of this probe, call the provider's dtps_mode() + * entry point to check that the probe was fired while in a user + * context. If that's not the case, use the policy specified by the + * provider to determine if we drop the probe or merely restrict + * operation. + */ + if (ecb->dte_cond & DTRACE_COND_USERMODE) { + ASSERT(mode != DTRACE_MODE_NOPRIV_DROP); + + if (!(mode & DTRACE_MODE_USER)) { + if (mode & DTRACE_MODE_NOPRIV_DROP) + return (0); + + mstate->dtms_access &= ~DTRACE_ACCESS_ARGS; + } + } +#endif + + /* + * This is more subtle than it looks. We have to be absolutely certain + * that CRED() isn't going to change out from under us so it's only + * legit to examine that structure if we're in constrained situations. 
+	 * Currently, the only times we'll do this check is if a non-super-user
+	 * has enabled the profile or syscall providers -- providers that
+	 * allow visibility of all processes.  For the profile case, the check
+	 * above will ensure that we're examining a user context.
+	 */
+	if (ecb->dte_cond & DTRACE_COND_OWNER) {
+		cred_t *cr;
+		cred_t *s_cr = state->dts_cred.dcr_cred;
+		proc_t *proc;
+
+		ASSERT(s_cr != NULL);
+
+		if ((cr = CRED()) == NULL ||
+		    s_cr->cr_uid != cr->cr_uid ||
+		    s_cr->cr_uid != cr->cr_ruid ||
+		    s_cr->cr_uid != cr->cr_suid ||
+		    s_cr->cr_gid != cr->cr_gid ||
+		    s_cr->cr_gid != cr->cr_rgid ||
+		    s_cr->cr_gid != cr->cr_sgid ||
+		    (proc = ttoproc(curthread)) == NULL ||
+		    (proc->p_flag & SNOCD)) {
+			if (mode & DTRACE_MODE_NOPRIV_DROP)
+				return (0);
+
+#ifdef illumos
+			mstate->dtms_access &= ~DTRACE_ACCESS_PROC;
+#endif
+		}
+	}
+
+#ifdef illumos
+	/*
+	 * If our dte_cond is set to DTRACE_COND_ZONEOWNER and we are not
+	 * in our zone, check to see if our mode policy is to restrict rather
+	 * than to drop; if to restrict, strip away both DTRACE_ACCESS_PROC
+	 * and DTRACE_ACCESS_ARGS.
+	 */
+	if (ecb->dte_cond & DTRACE_COND_ZONEOWNER) {
+		cred_t *cr;
+		cred_t *s_cr = state->dts_cred.dcr_cred;
+
+		ASSERT(s_cr != NULL);
+
+		if ((cr = CRED()) == NULL ||
+		    s_cr->cr_zone->zone_id != cr->cr_zone->zone_id) {
+			if (mode & DTRACE_MODE_NOPRIV_DROP)
+				return (0);
+
+			mstate->dtms_access &=
+			    ~(DTRACE_ACCESS_PROC | DTRACE_ACCESS_ARGS);
+		}
+	}
+#endif
+
+	return (1);
+}
+
+/*
+ * Note: not called from probe context.  This function is called
+ * asynchronously (and at a regular interval) from outside of probe context to
+ * clean the dirty dynamic variable lists on all CPUs.  Dynamic variable
+ * cleaning is explained in detail in <sys/dtrace_impl.h>.
+ */
+void
+dtrace_dynvar_clean(dtrace_dstate_t *dstate)
+{
+	dtrace_dynvar_t *dirty;
+	dtrace_dstate_percpu_t *dcpu;
+	dtrace_dynvar_t **rinsep;
+	int i, j, work = 0;
+
+	for (i = 0; i < NCPU; i++) {
+		dcpu = &dstate->dtds_percpu[i];
+		rinsep = &dcpu->dtdsc_rinsing;
+
+		/*
+		 * If the dirty list is NULL, there is no dirty work to do.
+		 */
+		if (dcpu->dtdsc_dirty == NULL)
+			continue;
+
+		if (dcpu->dtdsc_rinsing != NULL) {
+			/*
+			 * If the rinsing list is non-NULL, then it is because
+			 * this CPU was selected to accept another CPU's
+			 * dirty list -- and since that time, dirty buffers
+			 * have accumulated.  This is a highly unlikely
+			 * condition, but we choose to ignore the dirty
+			 * buffers -- they'll be picked up by a future cleanse.
+			 */
+			continue;
+		}
+
+		if (dcpu->dtdsc_clean != NULL) {
+			/*
+			 * If the clean list is non-NULL, then we're in a
+			 * situation where a CPU has done deallocations (we
+			 * have a non-NULL dirty list) but no allocations (we
+			 * also have a non-NULL clean list).  We can't simply
+			 * move the dirty list into the clean list on this
+			 * CPU, yet we also don't want to allow this condition
+			 * to persist, lest a short clean list prevent a
+			 * massive dirty list from being cleaned (which in
+			 * turn could lead to otherwise avoidable dynamic
+			 * drops).  To deal with this, we look for some CPU
+			 * with a NULL clean list, NULL dirty list, and NULL
+			 * rinsing list -- and then we borrow this CPU to
+			 * rinse our dirty list.
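+			 *
+			 * To keep the larger picture in view, a chunk
+			 * cycles through the per-CPU lists roughly as:
+			 *
+			 *	free --(allocation)--> hash chain
+			 *	hash chain --(deallocation)--> dirty
+			 *	dirty --(this cleaner)--> rinsing
+			 *	rinsing --(dtrace_sync())--> clean
+			 *	clean --(the allocator)--> free
+			 *
+			 * The borrowing here only determines _whose_
+			 * rinsing slot is used for the middle step.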
+ */ + for (j = 0; j < NCPU; j++) { + dtrace_dstate_percpu_t *rinser; + + rinser = &dstate->dtds_percpu[j]; + + if (rinser->dtdsc_rinsing != NULL) + continue; + + if (rinser->dtdsc_dirty != NULL) + continue; + + if (rinser->dtdsc_clean != NULL) + continue; + + rinsep = &rinser->dtdsc_rinsing; + break; + } + + if (j == NCPU) { + /* + * We were unable to find another CPU that + * could accept this dirty list -- we are + * therefore unable to clean it now. + */ + dtrace_dynvar_failclean++; + continue; + } + } + + work = 1; + + /* + * Atomically move the dirty list aside. + */ + do { + dirty = dcpu->dtdsc_dirty; + + /* + * Before we zap the dirty list, set the rinsing list. + * (This allows for a potential assertion in + * dtrace_dynvar(): if a free dynamic variable appears + * on a hash chain, either the dirty list or the + * rinsing list for some CPU must be non-NULL.) + */ + *rinsep = dirty; + dtrace_membar_producer(); + } while (dtrace_casptr(&dcpu->dtdsc_dirty, + dirty, NULL) != dirty); + } + + if (!work) { + /* + * We have no work to do; we can simply return. + */ + return; + } + + dtrace_sync(); + + for (i = 0; i < NCPU; i++) { + dcpu = &dstate->dtds_percpu[i]; + + if (dcpu->dtdsc_rinsing == NULL) + continue; + + /* + * We are now guaranteed that no hash chain contains a pointer + * into this dirty list; we can make it clean. + */ + ASSERT(dcpu->dtdsc_clean == NULL); + dcpu->dtdsc_clean = dcpu->dtdsc_rinsing; + dcpu->dtdsc_rinsing = NULL; + } + + /* + * Before we actually set the state to be DTRACE_DSTATE_CLEAN, make + * sure that all CPUs have seen all of the dtdsc_clean pointers. + * This prevents a race whereby a CPU incorrectly decides that + * the state should be something other than DTRACE_DSTATE_CLEAN + * after dtrace_dynvar_clean() has completed. + */ + dtrace_sync(); + + dstate->dtds_state = DTRACE_DSTATE_CLEAN; +} + +/* + * Depending on the value of the op parameter, this function looks-up, + * allocates or deallocates an arbitrarily-keyed dynamic variable. If an + * allocation is requested, this function will return a pointer to a + * dtrace_dynvar_t corresponding to the allocated variable -- or NULL if no + * variable can be allocated. If NULL is returned, the appropriate counter + * will be incremented. + */ +dtrace_dynvar_t * +dtrace_dynvar(dtrace_dstate_t *dstate, uint_t nkeys, + dtrace_key_t *key, size_t dsize, dtrace_dynvar_op_t op, + dtrace_mstate_t *mstate, dtrace_vstate_t *vstate) +{ + uint64_t hashval = DTRACE_DYNHASH_VALID; + dtrace_dynhash_t *hash = dstate->dtds_hash; + dtrace_dynvar_t *free, *new_free, *next, *dvar, *start, *prev = NULL; + processorid_t me = curcpu, cpu = me; + dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[me]; + size_t bucket, ksize; + size_t chunksize = dstate->dtds_chunksize; + uintptr_t kdata, lock, nstate; + uint_t i; + + ASSERT(nkeys != 0); + + /* + * Hash the key. As with aggregations, we use Jenkins' "One-at-a-time" + * algorithm. For the by-value portions, we perform the algorithm in + * 16-bit chunks (as opposed to 8-bit chunks). This speeds things up a + * bit, and seems to have only a minute effect on distribution. For + * the by-reference data, we perform "One-at-a-time" iterating (safely) + * over each referenced byte. It's painful to do this, but it's much + * better than pathological hash distribution. The efficacy of the + * hashing algorithm (and a comparison with other algorithms) may be + * found by running the ::dtrace_dynstat MDB dcmd. 
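+	 *
+	 * For reference, one round of "One-at-a-time" over a datum d is
+	 *
+	 *	hashval += d;
+	 *	hashval += (hashval << 10);
+	 *	hashval ^= (hashval >> 6);
+	 *
+	 * followed, once all input has been consumed, by the final mixing
+	 * steps (<< 3, >> 11, << 15) applied below.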
+ */ + for (i = 0; i < nkeys; i++) { + if (key[i].dttk_size == 0) { + uint64_t val = key[i].dttk_value; + + hashval += (val >> 48) & 0xffff; + hashval += (hashval << 10); + hashval ^= (hashval >> 6); + + hashval += (val >> 32) & 0xffff; + hashval += (hashval << 10); + hashval ^= (hashval >> 6); + + hashval += (val >> 16) & 0xffff; + hashval += (hashval << 10); + hashval ^= (hashval >> 6); + + hashval += val & 0xffff; + hashval += (hashval << 10); + hashval ^= (hashval >> 6); + } else { + /* + * This is incredibly painful, but it beats the hell + * out of the alternative. + */ + uint64_t j, size = key[i].dttk_size; + uintptr_t base = (uintptr_t)key[i].dttk_value; + + if (!dtrace_canload(base, size, mstate, vstate)) + break; + + for (j = 0; j < size; j++) { + hashval += dtrace_load8(base + j); + hashval += (hashval << 10); + hashval ^= (hashval >> 6); + } + } + } + + if (DTRACE_CPUFLAG_ISSET(CPU_DTRACE_FAULT)) + return (NULL); + + hashval += (hashval << 3); + hashval ^= (hashval >> 11); + hashval += (hashval << 15); + + /* + * There is a remote chance (ideally, 1 in 2^31) that our hashval + * comes out to be one of our two sentinel hash values. If this + * actually happens, we set the hashval to be a value known to be a + * non-sentinel value. + */ + if (hashval == DTRACE_DYNHASH_FREE || hashval == DTRACE_DYNHASH_SINK) + hashval = DTRACE_DYNHASH_VALID; + + /* + * Yes, it's painful to do a divide here. If the cycle count becomes + * important here, tricks can be pulled to reduce it. (However, it's + * critical that hash collisions be kept to an absolute minimum; + * they're much more painful than a divide.) It's better to have a + * solution that generates few collisions and still keeps things + * relatively simple. + */ + bucket = hashval % dstate->dtds_hashsize; + + if (op == DTRACE_DYNVAR_DEALLOC) { + volatile uintptr_t *lockp = &hash[bucket].dtdh_lock; + + for (;;) { + while ((lock = *lockp) & 1) + continue; + + if (dtrace_casptr((volatile void *)lockp, + (volatile void *)lock, (volatile void *)(lock + 1)) == (void *)lock) + break; + } + + dtrace_membar_producer(); + } + +top: + prev = NULL; + lock = hash[bucket].dtdh_lock; + + dtrace_membar_consumer(); + + start = hash[bucket].dtdh_chain; + ASSERT(start != NULL && (start->dtdv_hashval == DTRACE_DYNHASH_SINK || + start->dtdv_hashval != DTRACE_DYNHASH_FREE || + op != DTRACE_DYNVAR_DEALLOC)); + + for (dvar = start; dvar != NULL; dvar = dvar->dtdv_next) { + dtrace_tuple_t *dtuple = &dvar->dtdv_tuple; + dtrace_key_t *dkey = &dtuple->dtt_key[0]; + + if (dvar->dtdv_hashval != hashval) { + if (dvar->dtdv_hashval == DTRACE_DYNHASH_SINK) { + /* + * We've reached the sink, and therefore the + * end of the hash chain; we can kick out of + * the loop knowing that we have seen a valid + * snapshot of state. + */ + ASSERT(dvar->dtdv_next == NULL); + ASSERT(dvar == &dtrace_dynhash_sink); + break; + } + + if (dvar->dtdv_hashval == DTRACE_DYNHASH_FREE) { + /* + * We've gone off the rails: somewhere along + * the line, one of the members of this hash + * chain was deleted. Note that we could also + * detect this by simply letting this loop run + * to completion, as we would eventually hit + * the end of the dirty list. However, we + * want to avoid running the length of the + * dirty list unnecessarily (it might be quite + * long), so we catch this as early as + * possible by detecting the hash marker. In + * this case, we simply set dvar to NULL and + * break; the conditional after the loop will + * send us back to top. 
+ */ + dvar = NULL; + break; + } + + goto next; + } + + if (dtuple->dtt_nkeys != nkeys) + goto next; + + for (i = 0; i < nkeys; i++, dkey++) { + if (dkey->dttk_size != key[i].dttk_size) + goto next; /* size or type mismatch */ + + if (dkey->dttk_size != 0) { + if (dtrace_bcmp( + (void *)(uintptr_t)key[i].dttk_value, + (void *)(uintptr_t)dkey->dttk_value, + dkey->dttk_size)) + goto next; + } else { + if (dkey->dttk_value != key[i].dttk_value) + goto next; + } + } + + if (op != DTRACE_DYNVAR_DEALLOC) + return (dvar); + + ASSERT(dvar->dtdv_next == NULL || + dvar->dtdv_next->dtdv_hashval != DTRACE_DYNHASH_FREE); + + if (prev != NULL) { + ASSERT(hash[bucket].dtdh_chain != dvar); + ASSERT(start != dvar); + ASSERT(prev->dtdv_next == dvar); + prev->dtdv_next = dvar->dtdv_next; + } else { + if (dtrace_casptr(&hash[bucket].dtdh_chain, + start, dvar->dtdv_next) != start) { + /* + * We have failed to atomically swing the + * hash table head pointer, presumably because + * of a conflicting allocation on another CPU. + * We need to reread the hash chain and try + * again. + */ + goto top; + } + } + + dtrace_membar_producer(); + + /* + * Now set the hash value to indicate that it's free. + */ + ASSERT(hash[bucket].dtdh_chain != dvar); + dvar->dtdv_hashval = DTRACE_DYNHASH_FREE; + + dtrace_membar_producer(); + + /* + * Set the next pointer to point at the dirty list, and + * atomically swing the dirty pointer to the newly freed dvar. + */ + do { + next = dcpu->dtdsc_dirty; + dvar->dtdv_next = next; + } while (dtrace_casptr(&dcpu->dtdsc_dirty, next, dvar) != next); + + /* + * Finally, unlock this hash bucket. + */ + ASSERT(hash[bucket].dtdh_lock == lock); + ASSERT(lock & 1); + hash[bucket].dtdh_lock++; + + return (NULL); +next: + prev = dvar; + continue; + } + + if (dvar == NULL) { + /* + * If dvar is NULL, it is because we went off the rails: + * one of the elements that we traversed in the hash chain + * was deleted while we were traversing it. In this case, + * we assert that we aren't doing a dealloc (deallocs lock + * the hash bucket to prevent themselves from racing with + * one another), and retry the hash chain traversal. + */ + ASSERT(op != DTRACE_DYNVAR_DEALLOC); + goto top; + } + + if (op != DTRACE_DYNVAR_ALLOC) { + /* + * If we are not to allocate a new variable, we want to + * return NULL now. Before we return, check that the value + * of the lock word hasn't changed. If it has, we may have + * seen an inconsistent snapshot. + */ + if (op == DTRACE_DYNVAR_NOALLOC) { + if (hash[bucket].dtdh_lock != lock) + goto top; + } else { + ASSERT(op == DTRACE_DYNVAR_DEALLOC); + ASSERT(hash[bucket].dtdh_lock == lock); + ASSERT(lock & 1); + hash[bucket].dtdh_lock++; + } + + return (NULL); + } + + /* + * We need to allocate a new dynamic variable. The size we need is the + * size of dtrace_dynvar plus the size of nkeys dtrace_key_t's plus the + * size of any auxiliary key data (rounded up to 8-byte alignment) plus + * the size of any referred-to data (dsize). We then round the final + * size up to the chunksize for allocation. + */ + for (ksize = 0, i = 0; i < nkeys; i++) + ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t)); + + /* + * This should be pretty much impossible, but could happen if, say, + * strange DIF specified the tuple. Ideally, this should be an + * assertion and not an error condition -- but that requires that the + * chunksize calculation in dtrace_difo_chunksize() be absolutely + * bullet-proof. (That is, it must not be able to be fooled by + * malicious DIF.) 
Given the lack of backwards branches in DIF, + * solving this would presumably not amount to solving the Halting + * Problem -- but it still seems awfully hard. + */ + if (sizeof (dtrace_dynvar_t) + sizeof (dtrace_key_t) * (nkeys - 1) + + ksize + dsize > chunksize) { + dcpu->dtdsc_drops++; + return (NULL); + } + + nstate = DTRACE_DSTATE_EMPTY; + + do { +retry: + free = dcpu->dtdsc_free; + + if (free == NULL) { + dtrace_dynvar_t *clean = dcpu->dtdsc_clean; + void *rval; + + if (clean == NULL) { + /* + * We're out of dynamic variable space on + * this CPU. Unless we have tried all CPUs, + * we'll try to allocate from a different + * CPU. + */ + switch (dstate->dtds_state) { + case DTRACE_DSTATE_CLEAN: { + void *sp = &dstate->dtds_state; + + if (++cpu >= NCPU) + cpu = 0; + + if (dcpu->dtdsc_dirty != NULL && + nstate == DTRACE_DSTATE_EMPTY) + nstate = DTRACE_DSTATE_DIRTY; + + if (dcpu->dtdsc_rinsing != NULL) + nstate = DTRACE_DSTATE_RINSING; + + dcpu = &dstate->dtds_percpu[cpu]; + + if (cpu != me) + goto retry; + + (void) dtrace_cas32(sp, + DTRACE_DSTATE_CLEAN, nstate); + + /* + * To increment the correct bean + * counter, take another lap. + */ + goto retry; + } + + case DTRACE_DSTATE_DIRTY: + dcpu->dtdsc_dirty_drops++; + break; + + case DTRACE_DSTATE_RINSING: + dcpu->dtdsc_rinsing_drops++; + break; + + case DTRACE_DSTATE_EMPTY: + dcpu->dtdsc_drops++; + break; + } + + DTRACE_CPUFLAG_SET(CPU_DTRACE_DROP); + return (NULL); + } + + /* + * The clean list appears to be non-empty. We want to + * move the clean list to the free list; we start by + * moving the clean pointer aside. + */ + if (dtrace_casptr(&dcpu->dtdsc_clean, + clean, NULL) != clean) { + /* + * We are in one of two situations: + * + * (a) The clean list was switched to the + * free list by another CPU. + * + * (b) The clean list was added to by the + * cleansing cyclic. + * + * In either of these situations, we can + * just reattempt the free list allocation. + */ + goto retry; + } + + ASSERT(clean->dtdv_hashval == DTRACE_DYNHASH_FREE); + + /* + * Now we'll move the clean list to our free list. + * It's impossible for this to fail: the only way + * the free list can be updated is through this + * code path, and only one CPU can own the clean list. + * Thus, it would only be possible for this to fail if + * this code were racing with dtrace_dynvar_clean(). + * (That is, if dtrace_dynvar_clean() updated the clean + * list, and we ended up racing to update the free + * list.) This race is prevented by the dtrace_sync() + * in dtrace_dynvar_clean() -- which flushes the + * owners of the clean lists out before resetting + * the clean lists. + */ + dcpu = &dstate->dtds_percpu[me]; + rval = dtrace_casptr(&dcpu->dtdsc_free, NULL, clean); + ASSERT(rval == NULL); + goto retry; + } + + dvar = free; + new_free = dvar->dtdv_next; + } while (dtrace_casptr(&dcpu->dtdsc_free, free, new_free) != free); + + /* + * We have now allocated a new chunk. We copy the tuple keys into the + * tuple array and copy any referenced key data into the data space + * following the tuple array. As we do this, we relocate dttk_value + * in the final tuple to point to the key data address in the chunk. 
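+	 *
+	 * The resulting chunk layout is thus:
+	 *
+	 *	[ dynvar hdr + dtt_key[nkeys] ][ by-ref key data ][ data ]
+	 *	^ dvar                         ^ kdata            ^ dtdv_data
+	 *
+	 * where the by-ref key data occupies ksize bytes and the variable
+	 * data the remaining dsize bytes of the chunk.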
+ */ + kdata = (uintptr_t)&dvar->dtdv_tuple.dtt_key[nkeys]; + dvar->dtdv_data = (void *)(kdata + ksize); + dvar->dtdv_tuple.dtt_nkeys = nkeys; + + for (i = 0; i < nkeys; i++) { + dtrace_key_t *dkey = &dvar->dtdv_tuple.dtt_key[i]; + size_t kesize = key[i].dttk_size; + + if (kesize != 0) { + dtrace_bcopy( + (const void *)(uintptr_t)key[i].dttk_value, + (void *)kdata, kesize); + dkey->dttk_value = kdata; + kdata += P2ROUNDUP(kesize, sizeof (uint64_t)); + } else { + dkey->dttk_value = key[i].dttk_value; + } + + dkey->dttk_size = kesize; + } + + ASSERT(dvar->dtdv_hashval == DTRACE_DYNHASH_FREE); + dvar->dtdv_hashval = hashval; + dvar->dtdv_next = start; + + if (dtrace_casptr(&hash[bucket].dtdh_chain, start, dvar) == start) + return (dvar); + + /* + * The cas has failed. Either another CPU is adding an element to + * this hash chain, or another CPU is deleting an element from this + * hash chain. The simplest way to deal with both of these cases + * (though not necessarily the most efficient) is to free our + * allocated block and re-attempt it all. Note that the free is + * to the dirty list and _not_ to the free list. This is to prevent + * races with allocators, above. + */ + dvar->dtdv_hashval = DTRACE_DYNHASH_FREE; + + dtrace_membar_producer(); + + do { + free = dcpu->dtdsc_dirty; + dvar->dtdv_next = free; + } while (dtrace_casptr(&dcpu->dtdsc_dirty, free, dvar) != free); + + goto top; +} + +/*ARGSUSED*/ +static void +dtrace_aggregate_min(uint64_t *oval, uint64_t nval, uint64_t arg) +{ + if ((int64_t)nval < (int64_t)*oval) + *oval = nval; +} + +/*ARGSUSED*/ +static void +dtrace_aggregate_max(uint64_t *oval, uint64_t nval, uint64_t arg) +{ + if ((int64_t)nval > (int64_t)*oval) + *oval = nval; +} + +static void +dtrace_aggregate_quantize(uint64_t *quanta, uint64_t nval, uint64_t incr) +{ + int i, zero = DTRACE_QUANTIZE_ZEROBUCKET; + int64_t val = (int64_t)nval; + + if (val < 0) { + for (i = 0; i < zero; i++) { + if (val <= DTRACE_QUANTIZE_BUCKETVAL(i)) { + quanta[i] += incr; + return; + } + } + } else { + for (i = zero + 1; i < DTRACE_QUANTIZE_NBUCKETS; i++) { + if (val < DTRACE_QUANTIZE_BUCKETVAL(i)) { + quanta[i - 1] += incr; + return; + } + } + + quanta[DTRACE_QUANTIZE_NBUCKETS - 1] += incr; + return; + } + + ASSERT(0); +} + +static void +dtrace_aggregate_lquantize(uint64_t *lquanta, uint64_t nval, uint64_t incr) +{ + uint64_t arg = *lquanta++; + int32_t base = DTRACE_LQUANTIZE_BASE(arg); + uint16_t step = DTRACE_LQUANTIZE_STEP(arg); + uint16_t levels = DTRACE_LQUANTIZE_LEVELS(arg); + int32_t val = (int32_t)nval, level; + + ASSERT(step != 0); + ASSERT(levels != 0); + + if (val < base) { + /* + * This is an underflow. + */ + lquanta[0] += incr; + return; + } + + level = (val - base) / step; + + if (level < levels) { + lquanta[level + 1] += incr; + return; + } + + /* + * This is an overflow. + */ + lquanta[levels + 1] += incr; +} + +static int +dtrace_aggregate_llquantize_bucket(uint16_t factor, uint16_t low, + uint16_t high, uint16_t nsteps, int64_t value) +{ + int64_t this = 1, last, next; + int base = 1, order; + + ASSERT(factor <= nsteps); + ASSERT(nsteps % factor == 0); + + for (order = 0; order < low; order++) + this *= factor; + + /* + * If our value is less than our factor taken to the power of the + * low order of magnitude, it goes into the zeroth bucket. + */ + if (value < (last = this)) + return (0); + + for (this *= factor; order <= high; order++) { + int nbuckets = this > nsteps ? 
nsteps : this; + + if ((next = this * factor) < this) { + /* + * We should not generally get log/linear quantizations + * with a high magnitude that allows 64-bits to + * overflow, but we nonetheless protect against this + * by explicitly checking for overflow, and clamping + * our value accordingly. + */ + value = this - 1; + } + + if (value < this) { + /* + * If our value lies within this order of magnitude, + * determine its position by taking the offset within + * the order of magnitude, dividing by the bucket + * width, and adding to our (accumulated) base. + */ + return (base + (value - last) / (this / nbuckets)); + } + + base += nbuckets - (nbuckets / factor); + last = this; + this = next; + } + + /* + * Our value is greater than or equal to our factor taken to the + * power of one plus the high magnitude -- return the top bucket. + */ + return (base); +} + +static void +dtrace_aggregate_llquantize(uint64_t *llquanta, uint64_t nval, uint64_t incr) +{ + uint64_t arg = *llquanta++; + uint16_t factor = DTRACE_LLQUANTIZE_FACTOR(arg); + uint16_t low = DTRACE_LLQUANTIZE_LOW(arg); + uint16_t high = DTRACE_LLQUANTIZE_HIGH(arg); + uint16_t nsteps = DTRACE_LLQUANTIZE_NSTEP(arg); + + llquanta[dtrace_aggregate_llquantize_bucket(factor, + low, high, nsteps, nval)] += incr; +} + +/*ARGSUSED*/ +static void +dtrace_aggregate_avg(uint64_t *data, uint64_t nval, uint64_t arg) +{ + data[0]++; + data[1] += nval; +} + +/*ARGSUSED*/ +static void +dtrace_aggregate_stddev(uint64_t *data, uint64_t nval, uint64_t arg) +{ + int64_t snval = (int64_t)nval; + uint64_t tmp[2]; + + data[0]++; + data[1] += nval; + + /* + * What we want to say here is: + * + * data[2] += nval * nval; + * + * But given that nval is 64-bit, we could easily overflow, so + * we do this as 128-bit arithmetic. + */ + if (snval < 0) + snval = -snval; + + dtrace_multiply_128((uint64_t)snval, (uint64_t)snval, tmp); + dtrace_add_128(data + 2, tmp, data + 2); +} + +/*ARGSUSED*/ +static void +dtrace_aggregate_count(uint64_t *oval, uint64_t nval, uint64_t arg) +{ + *oval = *oval + 1; +} + +/*ARGSUSED*/ +static void +dtrace_aggregate_sum(uint64_t *oval, uint64_t nval, uint64_t arg) +{ + *oval += nval; +} + +/* + * Aggregate given the tuple in the principal data buffer, and the aggregating + * action denoted by the specified dtrace_aggregation_t. The aggregation + * buffer is specified as the buf parameter. This routine does not return + * failure; if there is no space in the aggregation buffer, the data will be + * dropped, and a corresponding counter incremented. + */ +static void +dtrace_aggregate(dtrace_aggregation_t *agg, dtrace_buffer_t *dbuf, + intptr_t offset, dtrace_buffer_t *buf, uint64_t expr, uint64_t arg) +{ + dtrace_recdesc_t *rec = &agg->dtag_action.dta_rec; + uint32_t i, ndx, size, fsize; + uint32_t align = sizeof (uint64_t) - 1; + dtrace_aggbuffer_t *agb; + dtrace_aggkey_t *key; + uint32_t hashval = 0, limit, isstr; + caddr_t tomax, data, kdata; + dtrace_actkind_t action; + dtrace_action_t *act; + uintptr_t offs; + + if (buf == NULL) + return; + + if (!agg->dtag_hasarg) { + /* + * Currently, only quantize() and lquantize() take additional + * arguments, and they have the same semantics: an increment + * value that defaults to 1 when not present. If additional + * aggregating actions take arguments, the setting of the + * default argument value will presumably have to become more + * sophisticated... 
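+		 *
+		 * (In D terms: "@ = quantize(x);" aggregates with an
+		 * increment of 1 per record, while "@ = quantize(x, 10);"
+		 * supplies 10 as the argument consumed here.)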
+		 */
+		arg = 1;
+	}
+
+	action = agg->dtag_action.dta_kind - DTRACEACT_AGGREGATION;
+	size = rec->dtrd_offset - agg->dtag_base;
+	fsize = size + rec->dtrd_size;
+
+	ASSERT(dbuf->dtb_tomax != NULL);
+	data = dbuf->dtb_tomax + offset + agg->dtag_base;
+
+	if ((tomax = buf->dtb_tomax) == NULL) {
+		dtrace_buffer_drop(buf);
+		return;
+	}
+
+	/*
+	 * The metastructure is always at the bottom of the buffer.
+	 */
+	agb = (dtrace_aggbuffer_t *)(tomax + buf->dtb_size -
+	    sizeof (dtrace_aggbuffer_t));
+
+	if (buf->dtb_offset == 0) {
+		/*
+		 * We just kludge up approximately 1/8th of the size to be
+		 * buckets.  If this guess ends up being routinely
+		 * off-the-mark, we may need to dynamically readjust this
+		 * based on past performance.
+		 */
+		uintptr_t hashsize = (buf->dtb_size >> 3) / sizeof (uintptr_t);
+
+		if ((uintptr_t)agb - hashsize * sizeof (dtrace_aggkey_t *) <
+		    (uintptr_t)tomax || hashsize == 0) {
+			/*
+			 * We've been given a ludicrously small buffer;
+			 * increment our drop count and leave.
+			 */
+			dtrace_buffer_drop(buf);
+			return;
+		}
+
+		/*
+		 * And now, a pathetic attempt to try to get an odd (or
+		 * perchance, a prime) hash size for better hash distribution.
+		 */
+		if (hashsize > (DTRACE_AGGHASHSIZE_SLEW << 3))
+			hashsize -= DTRACE_AGGHASHSIZE_SLEW;
+
+		agb->dtagb_hashsize = hashsize;
+		agb->dtagb_hash = (dtrace_aggkey_t **)((uintptr_t)agb -
+		    agb->dtagb_hashsize * sizeof (dtrace_aggkey_t *));
+		agb->dtagb_free = (uintptr_t)agb->dtagb_hash;
+
+		for (i = 0; i < agb->dtagb_hashsize; i++)
+			agb->dtagb_hash[i] = NULL;
+	}
+
+	ASSERT(agg->dtag_first != NULL);
+	ASSERT(agg->dtag_first->dta_intuple);
+
+	/*
+	 * Calculate the hash value based on the key.  Note that we _don't_
+	 * include the aggid in the hashing (but we will store it as part of
+	 * the key).  The hashing algorithm is Bob Jenkins' "One-at-a-time"
+	 * algorithm: a simple, quick algorithm that has no known funnels, and
+	 * gets good distribution in practice.  The efficacy of the hashing
+	 * algorithm (and a comparison with other algorithms) may be found by
+	 * running the ::dtrace_aggstat MDB dcmd.
+	 */
+	for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
+		i = act->dta_rec.dtrd_offset - agg->dtag_base;
+		limit = i + act->dta_rec.dtrd_size;
+		ASSERT(limit <= size);
+		isstr = DTRACEACT_ISSTRING(act);
+
+		for (; i < limit; i++) {
+			hashval += data[i];
+			hashval += (hashval << 10);
+			hashval ^= (hashval >> 6);
+
+			if (isstr && data[i] == '\0')
+				break;
+		}
+	}
+
+	hashval += (hashval << 3);
+	hashval ^= (hashval >> 11);
+	hashval += (hashval << 15);
+
+	/*
+	 * Yes, the divide here is expensive -- but it's generally the least
+	 * of the performance issues given the amount of data that we iterate
+	 * over to compute hash values, compare data, etc.
+ */ + ndx = hashval % agb->dtagb_hashsize; + + for (key = agb->dtagb_hash[ndx]; key != NULL; key = key->dtak_next) { + ASSERT((caddr_t)key >= tomax); + ASSERT((caddr_t)key < tomax + buf->dtb_size); + + if (hashval != key->dtak_hashval || key->dtak_size != size) + continue; + + kdata = key->dtak_data; + ASSERT(kdata >= tomax && kdata < tomax + buf->dtb_size); + + for (act = agg->dtag_first; act->dta_intuple; + act = act->dta_next) { + i = act->dta_rec.dtrd_offset - agg->dtag_base; + limit = i + act->dta_rec.dtrd_size; + ASSERT(limit <= size); + isstr = DTRACEACT_ISSTRING(act); + + for (; i < limit; i++) { + if (kdata[i] != data[i]) + goto next; + + if (isstr && data[i] == '\0') + break; + } + } + + if (action != key->dtak_action) { + /* + * We are aggregating on the same value in the same + * aggregation with two different aggregating actions. + * (This should have been picked up in the compiler, + * so we may be dealing with errant or devious DIF.) + * This is an error condition; we indicate as much, + * and return. + */ + DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP); + return; + } + + /* + * This is a hit: we need to apply the aggregator to + * the value at this key. + */ + agg->dtag_aggregate((uint64_t *)(kdata + size), expr, arg); + return; +next: + continue; + } + + /* + * We didn't find it. We need to allocate some zero-filled space, + * link it into the hash table appropriately, and apply the aggregator + * to the (zero-filled) value. + */ + offs = buf->dtb_offset; + while (offs & (align - 1)) + offs += sizeof (uint32_t); + + /* + * If we don't have enough room to both allocate a new key _and_ + * its associated data, increment the drop count and return. + */ + if ((uintptr_t)tomax + offs + fsize > + agb->dtagb_free - sizeof (dtrace_aggkey_t)) { + dtrace_buffer_drop(buf); + return; + } + + /*CONSTCOND*/ + ASSERT(!(sizeof (dtrace_aggkey_t) & (sizeof (uintptr_t) - 1))); + key = (dtrace_aggkey_t *)(agb->dtagb_free - sizeof (dtrace_aggkey_t)); + agb->dtagb_free -= sizeof (dtrace_aggkey_t); + + key->dtak_data = kdata = tomax + offs; + buf->dtb_offset = offs + fsize; + + /* + * Now copy the data across. + */ + *((dtrace_aggid_t *)kdata) = agg->dtag_id; + + for (i = sizeof (dtrace_aggid_t); i < size; i++) + kdata[i] = data[i]; + + /* + * Because strings are not zeroed out by default, we need to iterate + * looking for actions that store strings, and we need to explicitly + * pad these strings out with zeroes. + */ + for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) { + int nul; + + if (!DTRACEACT_ISSTRING(act)) + continue; + + i = act->dta_rec.dtrd_offset - agg->dtag_base; + limit = i + act->dta_rec.dtrd_size; + ASSERT(limit <= size); + + for (nul = 0; i < limit; i++) { + if (nul) { + kdata[i] = '\0'; + continue; + } + + if (data[i] != '\0') + continue; + + nul = 1; + } + } + + for (i = size; i < fsize; i++) + kdata[i] = 0; + + key->dtak_hashval = hashval; + key->dtak_size = size; + key->dtak_action = action; + key->dtak_next = agb->dtagb_hash[ndx]; + agb->dtagb_hash[ndx] = key; + + /* + * Finally, apply the aggregator. + */ + *((uint64_t *)(key->dtak_data + size)) = agg->dtag_initial; + agg->dtag_aggregate((uint64_t *)(key->dtak_data + size), expr, arg); +} + +/* + * Given consumer state, this routine finds a speculation in the INACTIVE + * state and transitions it into the ACTIVE state. If there is no speculation + * in the INACTIVE state, 0 is returned. In this case, no error counter is + * incremented -- it is up to the caller to take appropriate action. 
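+ *
+ * For orientation, the transitions implemented by this routine and by
+ * the commit, discard and buffer routines below are, schematically:
+ *
+ *	INACTIVE --speculation()--> ACTIVE --speculate()--> ACTIVEONE
+ *	ACTIVEONE --speculate() on another CPU--> ACTIVEMANY
+ *	active states --commit()--> COMMITTING/COMMITTINGMANY --> INACTIVE
+ *	active states --discard()--> DISCARDING --> INACTIVE
+ *
+ * where COMMITTINGMANY and DISCARDING rely on the asynchronous cleaner
+ * to make the final transition back to INACTIVE.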
+ */ +static int +dtrace_speculation(dtrace_state_t *state) +{ + int i = 0; + dtrace_speculation_state_t current; + uint32_t *stat = &state->dts_speculations_unavail, count; + + while (i < state->dts_nspeculations) { + dtrace_speculation_t *spec = &state->dts_speculations[i]; + + current = spec->dtsp_state; + + if (current != DTRACESPEC_INACTIVE) { + if (current == DTRACESPEC_COMMITTINGMANY || + current == DTRACESPEC_COMMITTING || + current == DTRACESPEC_DISCARDING) + stat = &state->dts_speculations_busy; + i++; + continue; + } + + if (dtrace_cas32((uint32_t *)&spec->dtsp_state, + current, DTRACESPEC_ACTIVE) == current) + return (i + 1); + } + + /* + * We couldn't find a speculation. If we found as much as a single + * busy speculation buffer, we'll attribute this failure as "busy" + * instead of "unavail". + */ + do { + count = *stat; + } while (dtrace_cas32(stat, count, count + 1) != count); + + return (0); +} + +/* + * This routine commits an active speculation. If the specified speculation + * is not in a valid state to perform a commit(), this routine will silently do + * nothing. The state of the specified speculation is transitioned according + * to the state transition diagram outlined in <sys/dtrace_impl.h> + */ +static void +dtrace_speculation_commit(dtrace_state_t *state, processorid_t cpu, + dtrace_specid_t which) +{ + dtrace_speculation_t *spec; + dtrace_buffer_t *src, *dest; + uintptr_t daddr, saddr, dlimit, slimit; + dtrace_speculation_state_t current, new = 0; + intptr_t offs; + uint64_t timestamp; + + if (which == 0) + return; + + if (which > state->dts_nspeculations) { + cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP; + return; + } + + spec = &state->dts_speculations[which - 1]; + src = &spec->dtsp_buffer[cpu]; + dest = &state->dts_buffer[cpu]; + + do { + current = spec->dtsp_state; + + if (current == DTRACESPEC_COMMITTINGMANY) + break; + + switch (current) { + case DTRACESPEC_INACTIVE: + case DTRACESPEC_DISCARDING: + return; + + case DTRACESPEC_COMMITTING: + /* + * This is only possible if we are (a) commit()'ing + * without having done a prior speculate() on this CPU + * and (b) racing with another commit() on a different + * CPU. There's nothing to do -- we just assert that + * our offset is 0. + */ + ASSERT(src->dtb_offset == 0); + return; + + case DTRACESPEC_ACTIVE: + new = DTRACESPEC_COMMITTING; + break; + + case DTRACESPEC_ACTIVEONE: + /* + * This speculation is active on one CPU. If our + * buffer offset is non-zero, we know that the one CPU + * must be us. Otherwise, we are committing on a + * different CPU from the speculate(), and we must + * rely on being asynchronously cleaned. + */ + if (src->dtb_offset != 0) { + new = DTRACESPEC_COMMITTING; + break; + } + /*FALLTHROUGH*/ + + case DTRACESPEC_ACTIVEMANY: + new = DTRACESPEC_COMMITTINGMANY; + break; + + default: + ASSERT(0); + } + } while (dtrace_cas32((uint32_t *)&spec->dtsp_state, + current, new) != current); + + /* + * We have set the state to indicate that we are committing this + * speculation. Now reserve the necessary space in the destination + * buffer. + */ + if ((offs = dtrace_buffer_reserve(dest, src->dtb_offset, + sizeof (uint64_t), state, NULL)) < 0) { + dtrace_buffer_drop(dest); + goto out; + } + + /* + * We have sufficient space to copy the speculative buffer into the + * primary buffer. First, modify the speculative buffer, filling + * in the timestamp of all entries with the current time. 
The data
+	 * must have the commit() time rather than the time it was traced,
+	 * so that all entries in the primary buffer are in timestamp order.
+	 */
+	timestamp = dtrace_gethrtime();
+	saddr = (uintptr_t)src->dtb_tomax;
+	slimit = saddr + src->dtb_offset;
+	while (saddr < slimit) {
+		size_t size;
+		dtrace_rechdr_t *dtrh = (dtrace_rechdr_t *)saddr;
+
+		if (dtrh->dtrh_epid == DTRACE_EPIDNONE) {
+			saddr += sizeof (dtrace_epid_t);
+			continue;
+		}
+		ASSERT3U(dtrh->dtrh_epid, <=, state->dts_necbs);
+		size = state->dts_ecbs[dtrh->dtrh_epid - 1]->dte_size;
+
+		ASSERT3U(saddr + size, <=, slimit);
+		ASSERT3U(size, >=, sizeof (dtrace_rechdr_t));
+		ASSERT3U(DTRACE_RECORD_LOAD_TIMESTAMP(dtrh), ==, UINT64_MAX);
+
+		DTRACE_RECORD_STORE_TIMESTAMP(dtrh, timestamp);
+
+		saddr += size;
+	}
+
+	/*
+	 * Copy the buffer across.  (Note that this is a
+	 * highly suboptimal bcopy(); in the unlikely event that this becomes
+	 * a serious performance issue, a high-performance DTrace-specific
+	 * bcopy() should obviously be invented.)
+	 */
+	daddr = (uintptr_t)dest->dtb_tomax + offs;
+	dlimit = daddr + src->dtb_offset;
+	saddr = (uintptr_t)src->dtb_tomax;
+
+	/*
+	 * First, the aligned portion.
+	 */
+	while (dlimit - daddr >= sizeof (uint64_t)) {
+		*((uint64_t *)daddr) = *((uint64_t *)saddr);
+
+		daddr += sizeof (uint64_t);
+		saddr += sizeof (uint64_t);
+	}
+
+	/*
+	 * Now any left-over bit...
+	 */
+	while (dlimit - daddr)
+		*((uint8_t *)daddr++) = *((uint8_t *)saddr++);
+
+	/*
+	 * Finally, commit the reserved space in the destination buffer.
+	 */
+	dest->dtb_offset = offs + src->dtb_offset;
+
+out:
+	/*
+	 * If we're lucky enough to be the only active CPU on this speculation
+	 * buffer, we can just set the state back to DTRACESPEC_INACTIVE.
+	 */
+	if (current == DTRACESPEC_ACTIVE ||
+	    (current == DTRACESPEC_ACTIVEONE && new == DTRACESPEC_COMMITTING)) {
+		uint32_t rval = dtrace_cas32((uint32_t *)&spec->dtsp_state,
+		    DTRACESPEC_COMMITTING, DTRACESPEC_INACTIVE);
+
+		ASSERT(rval == DTRACESPEC_COMMITTING);
+	}
+
+	src->dtb_offset = 0;
+	src->dtb_xamot_drops += src->dtb_drops;
+	src->dtb_drops = 0;
+}
+
+/*
+ * This routine discards an active speculation.  If the specified speculation
+ * is not in a valid state to perform a discard(), this routine will silently
+ * do nothing.  The state of the specified speculation is transitioned
+ * according to the state transition diagram outlined in <sys/dtrace_impl.h>
+ */
+static void
+dtrace_speculation_discard(dtrace_state_t *state, processorid_t cpu,
+    dtrace_specid_t which)
+{
+	dtrace_speculation_t *spec;
+	dtrace_speculation_state_t current, new = 0;
+	dtrace_buffer_t *buf;
+
+	if (which == 0)
+		return;
+
+	if (which > state->dts_nspeculations) {
+		cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
+		return;
+	}
+
+	spec = &state->dts_speculations[which - 1];
+	buf = &spec->dtsp_buffer[cpu];
+
+	do {
+		current = spec->dtsp_state;
+
+		switch (current) {
+		case DTRACESPEC_INACTIVE:
+		case DTRACESPEC_COMMITTINGMANY:
+		case DTRACESPEC_COMMITTING:
+		case DTRACESPEC_DISCARDING:
+			return;
+
+		case DTRACESPEC_ACTIVE:
+		case DTRACESPEC_ACTIVEMANY:
+			new = DTRACESPEC_DISCARDING;
+			break;
+
+		case DTRACESPEC_ACTIVEONE:
+			if (buf->dtb_offset != 0) {
+				new = DTRACESPEC_INACTIVE;
+			} else {
+				new = DTRACESPEC_DISCARDING;
+			}
+			break;
+
+		default:
+			ASSERT(0);
+		}
+	} while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
+	    current, new) != current);
+
+	buf->dtb_offset = 0;
+	buf->dtb_drops = 0;
+}
+
+/*
+ * Note: not called from probe context.  This function is called
+ * asynchronously from cross call context to clean any speculations that are
+ * in the COMMITTINGMANY or DISCARDING states.  These speculations may not be
+ * transitioned back to the INACTIVE state until all CPUs have cleaned the
+ * speculation.
+ */
+static void
+dtrace_speculation_clean_here(dtrace_state_t *state)
+{
+	dtrace_icookie_t cookie;
+	processorid_t cpu = curcpu;
+	dtrace_buffer_t *dest = &state->dts_buffer[cpu];
+	dtrace_specid_t i;
+
+	cookie = dtrace_interrupt_disable();
+
+	if (dest->dtb_tomax == NULL) {
+		dtrace_interrupt_enable(cookie);
+		return;
+	}
+
+	for (i = 0; i < state->dts_nspeculations; i++) {
+		dtrace_speculation_t *spec = &state->dts_speculations[i];
+		dtrace_buffer_t *src = &spec->dtsp_buffer[cpu];
+
+		if (src->dtb_tomax == NULL)
+			continue;
+
+		if (spec->dtsp_state == DTRACESPEC_DISCARDING) {
+			src->dtb_offset = 0;
+			continue;
+		}
+
+		if (spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
+			continue;
+
+		if (src->dtb_offset == 0)
+			continue;
+
+		dtrace_speculation_commit(state, cpu, i + 1);
+	}
+
+	dtrace_interrupt_enable(cookie);
+}
+
+/*
+ * Note: not called from probe context.  This function is called
+ * asynchronously (and at a regular interval) to clean any speculations that
+ * are in the COMMITTINGMANY or DISCARDING states.  If it discovers that there
+ * is work to be done, it cross calls all CPUs to perform that work;
+ * COMMITTINGMANY and DISCARDING speculations may not be transitioned back to
+ * the INACTIVE state until they have been cleaned by all CPUs.
+ */
+static void
+dtrace_speculation_clean(dtrace_state_t *state)
+{
+	int work = 0, rv;
+	dtrace_specid_t i;
+
+	for (i = 0; i < state->dts_nspeculations; i++) {
+		dtrace_speculation_t *spec = &state->dts_speculations[i];
+
+		ASSERT(!spec->dtsp_cleaning);
+
+		if (spec->dtsp_state != DTRACESPEC_DISCARDING &&
+		    spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
+			continue;
+
+		work++;
+		spec->dtsp_cleaning = 1;
+	}
+
+	if (!work)
+		return;
+
+	dtrace_xcall(DTRACE_CPUALL,
+	    (dtrace_xcall_t)dtrace_speculation_clean_here, state);
+
+	/*
+	 * We now know that all CPUs have committed or discarded their
+	 * speculation buffers, as appropriate.  We can now set the state
+	 * to inactive.
+	 */
+	for (i = 0; i < state->dts_nspeculations; i++) {
+		dtrace_speculation_t *spec = &state->dts_speculations[i];
+		dtrace_speculation_state_t current, new;
+
+		if (!spec->dtsp_cleaning)
+			continue;
+
+		current = spec->dtsp_state;
+		ASSERT(current == DTRACESPEC_DISCARDING ||
+		    current == DTRACESPEC_COMMITTINGMANY);
+
+		new = DTRACESPEC_INACTIVE;
+
+		rv = dtrace_cas32((uint32_t *)&spec->dtsp_state, current, new);
+		ASSERT(rv == current);
+		spec->dtsp_cleaning = 0;
+	}
+}
+
+/*
+ * Called as part of a speculate() to get the speculative buffer associated
+ * with a given speculation.  Returns NULL if the specified speculation is not
+ * in an ACTIVE state.  If the speculation is in the ACTIVEONE state -- and
+ * the active CPU is not the specified CPU -- the speculation will be
+ * atomically transitioned into the ACTIVEMANY state.
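+ *
+ * (In D terms, this is the path taken on behalf of clauses such as
+ *
+ *	self->spec = speculation();
+ *	...
+ *	speculate(self->spec);
+ *
+ * with each speculating CPU obtaining its own buffer here.)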
+ */
+static dtrace_buffer_t *
+dtrace_speculation_buffer(dtrace_state_t *state, processorid_t cpuid,
+    dtrace_specid_t which)
+{
+	dtrace_speculation_t *spec;
+	dtrace_speculation_state_t current, new = 0;
+	dtrace_buffer_t *buf;
+
+	if (which == 0)
+		return (NULL);
+
+	if (which > state->dts_nspeculations) {
+		cpu_core[cpuid].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
+		return (NULL);
+	}
+
+	spec = &state->dts_speculations[which - 1];
+	buf = &spec->dtsp_buffer[cpuid];
+
+	do {
+		current = spec->dtsp_state;
+
+		switch (current) {
+		case DTRACESPEC_INACTIVE:
+		case DTRACESPEC_COMMITTINGMANY:
+		case DTRACESPEC_DISCARDING:
+			return (NULL);
+
+		case DTRACESPEC_COMMITTING:
+			ASSERT(buf->dtb_offset == 0);
+			return (NULL);
+
+		case DTRACESPEC_ACTIVEONE:
+			/*
+			 * This speculation is currently active on one CPU.
+			 * Check the offset in the buffer; if it's non-zero,
+			 * that CPU must be us (and we leave the state alone).
+			 * If it's zero, assume that we're starting on a new
+			 * CPU -- and change the state to indicate that the
+			 * speculation is active on more than one CPU.
+			 */
+			if (buf->dtb_offset != 0)
+				return (buf);
+
+			new = DTRACESPEC_ACTIVEMANY;
+			break;
+
+		case DTRACESPEC_ACTIVEMANY:
+			return (buf);
+
+		case DTRACESPEC_ACTIVE:
+			new = DTRACESPEC_ACTIVEONE;
+			break;
+
+		default:
+			ASSERT(0);
+		}
+	} while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
+	    current, new) != current);
+
+	ASSERT(new == DTRACESPEC_ACTIVEONE || new == DTRACESPEC_ACTIVEMANY);
+	return (buf);
+}
+
+/*
+ * Return a string.  In the event that the user lacks the privilege to access
+ * arbitrary kernel memory, we copy the string out to scratch memory so that we
+ * don't fail access checking.
+ *
+ * dtrace_dif_variable() uses this routine as a helper for various
+ * builtin values such as 'execname' and 'probefunc.'
+ */
+uintptr_t
+dtrace_dif_varstr(uintptr_t addr, dtrace_state_t *state,
+    dtrace_mstate_t *mstate)
+{
+	uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
+	uintptr_t ret;
+	size_t strsz;
+
+	/*
+	 * The easy case: this probe is allowed to read all of memory, so
+	 * we can just return this as a vanilla pointer.
+	 */
+	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
+		return (addr);
+
+	/*
+	 * This is the tougher case: we copy the string in question from
+	 * kernel memory into scratch memory and return it that way: this
+	 * ensures that we won't trip up when access checking tests the
+	 * BYREF return value.
+	 */
+	strsz = dtrace_strlen((char *)addr, size) + 1;
+
+	if (mstate->dtms_scratch_ptr + strsz >
+	    mstate->dtms_scratch_base + mstate->dtms_scratch_size) {
+		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
+		return (0);
+	}
+
+	dtrace_strcpy((const void *)addr, (void *)mstate->dtms_scratch_ptr,
+	    strsz);
+	ret = mstate->dtms_scratch_ptr;
+	mstate->dtms_scratch_ptr += strsz;
+	return (ret);
+}
+
+/*
+ * Return a string from a memory address which is known to have one or
+ * more concatenated, individually zero-terminated, sub-strings.
+ * In the event that the user lacks the privilege to access
+ * arbitrary kernel memory, we copy the string out to scratch memory so that we
+ * don't fail access checking.
+ *
+ * dtrace_dif_variable() uses this routine as a helper for various
+ * builtin values such as 'execargs'.
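+ *
+ * For example, an argument vector stored as "ls\0-lh\0/tmp\0" is
+ * returned as the single string "ls -lh /tmp".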
+ */ +static uintptr_t +dtrace_dif_varstrz(uintptr_t addr, size_t strsz, dtrace_state_t *state, + dtrace_mstate_t *mstate) +{ + char *p; + size_t i; + uintptr_t ret; + + if (mstate->dtms_scratch_ptr + strsz > + mstate->dtms_scratch_base + mstate->dtms_scratch_size) { + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH); + return (0); + } + + dtrace_bcopy((const void *)addr, (void *)mstate->dtms_scratch_ptr, + strsz); + + /* Replace sub-string termination characters with a space. */ + for (p = (char *) mstate->dtms_scratch_ptr, i = 0; i < strsz - 1; + p++, i++) + if (*p == '\0') + *p = ' '; + + ret = mstate->dtms_scratch_ptr; + mstate->dtms_scratch_ptr += strsz; + return (ret); +} + +/* + * This function implements the DIF emulator's variable lookups. The emulator + * passes a reserved variable identifier and optional built-in array index. + */ +static uint64_t +dtrace_dif_variable(dtrace_mstate_t *mstate, dtrace_state_t *state, uint64_t v, + uint64_t ndx) +{ + /* + * If we're accessing one of the uncached arguments, we'll turn this + * into a reference in the args array. + */ + if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9) { + ndx = v - DIF_VAR_ARG0; + v = DIF_VAR_ARGS; + } + + switch (v) { + case DIF_VAR_ARGS: + ASSERT(mstate->dtms_present & DTRACE_MSTATE_ARGS); + if (ndx >= sizeof (mstate->dtms_arg) / + sizeof (mstate->dtms_arg[0])) { + int aframes = mstate->dtms_probe->dtpr_aframes + 2; + dtrace_provider_t *pv; + uint64_t val; + + pv = mstate->dtms_probe->dtpr_provider; + if (pv->dtpv_pops.dtps_getargval != NULL) + val = pv->dtpv_pops.dtps_getargval(pv->dtpv_arg, + mstate->dtms_probe->dtpr_id, + mstate->dtms_probe->dtpr_arg, ndx, aframes); + else + val = dtrace_getarg(ndx, aframes); + + /* + * This is regrettably required to keep the compiler + * from tail-optimizing the call to dtrace_getarg(). + * The condition always evaluates to true, but the + * compiler has no way of figuring that out a priori. + * (None of this would be necessary if the compiler + * could be relied upon to _always_ tail-optimize + * the call to dtrace_getarg() -- but it can't.) 
+ */ + if (mstate->dtms_probe != NULL) + return (val); + + ASSERT(0); + } + + return (mstate->dtms_arg[ndx]); + +#ifdef illumos + case DIF_VAR_UREGS: { + klwp_t *lwp; + + if (!dtrace_priv_proc(state)) + return (0); + + if ((lwp = curthread->t_lwp) == NULL) { + DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR); + cpu_core[curcpu].cpuc_dtrace_illval = NULL; + return (0); + } + + return (dtrace_getreg(lwp->lwp_regs, ndx)); + return (0); + } +#else + case DIF_VAR_UREGS: { + struct trapframe *tframe; + + if (!dtrace_priv_proc(state)) + return (0); + + if ((tframe = curthread->td_frame) == NULL) { + DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR); + cpu_core[curcpu].cpuc_dtrace_illval = 0; + return (0); + } + + return (dtrace_getreg(tframe, ndx)); + } +#endif + + case DIF_VAR_CURTHREAD: + if (!dtrace_priv_proc(state)) + return (0); + return ((uint64_t)(uintptr_t)curthread); + + case DIF_VAR_TIMESTAMP: + if (!(mstate->dtms_present & DTRACE_MSTATE_TIMESTAMP)) { + mstate->dtms_timestamp = dtrace_gethrtime(); + mstate->dtms_present |= DTRACE_MSTATE_TIMESTAMP; + } + return (mstate->dtms_timestamp); + + case DIF_VAR_VTIMESTAMP: + ASSERT(dtrace_vtime_references != 0); + return (curthread->t_dtrace_vtime); + + case DIF_VAR_WALLTIMESTAMP: + if (!(mstate->dtms_present & DTRACE_MSTATE_WALLTIMESTAMP)) { + mstate->dtms_walltimestamp = dtrace_gethrestime(); + mstate->dtms_present |= DTRACE_MSTATE_WALLTIMESTAMP; + } + return (mstate->dtms_walltimestamp); + +#ifdef illumos + case DIF_VAR_IPL: + if (!dtrace_priv_kernel(state)) + return (0); + if (!(mstate->dtms_present & DTRACE_MSTATE_IPL)) { + mstate->dtms_ipl = dtrace_getipl(); + mstate->dtms_present |= DTRACE_MSTATE_IPL; + } + return (mstate->dtms_ipl); +#endif + + case DIF_VAR_EPID: + ASSERT(mstate->dtms_present & DTRACE_MSTATE_EPID); + return (mstate->dtms_epid); + + case DIF_VAR_ID: + ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE); + return (mstate->dtms_probe->dtpr_id); + + case DIF_VAR_STACKDEPTH: + if (!dtrace_priv_kernel(state)) + return (0); + if (!(mstate->dtms_present & DTRACE_MSTATE_STACKDEPTH)) { + int aframes = mstate->dtms_probe->dtpr_aframes + 2; + + mstate->dtms_stackdepth = dtrace_getstackdepth(aframes); + mstate->dtms_present |= DTRACE_MSTATE_STACKDEPTH; + } + return (mstate->dtms_stackdepth); + + case DIF_VAR_USTACKDEPTH: + if (!dtrace_priv_proc(state)) + return (0); + if (!(mstate->dtms_present & DTRACE_MSTATE_USTACKDEPTH)) { + /* + * See comment in DIF_VAR_PID. + */ + if (DTRACE_ANCHORED(mstate->dtms_probe) && + CPU_ON_INTR(CPU)) { + mstate->dtms_ustackdepth = 0; + } else { + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); + mstate->dtms_ustackdepth = + dtrace_getustackdepth(); + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); + } + mstate->dtms_present |= DTRACE_MSTATE_USTACKDEPTH; + } + return (mstate->dtms_ustackdepth); + + case DIF_VAR_CALLER: + if (!dtrace_priv_kernel(state)) + return (0); + if (!(mstate->dtms_present & DTRACE_MSTATE_CALLER)) { + int aframes = mstate->dtms_probe->dtpr_aframes + 2; + + if (!DTRACE_ANCHORED(mstate->dtms_probe)) { + /* + * If this is an unanchored probe, we are + * required to go through the slow path: + * dtrace_caller() only guarantees correct + * results for anchored probes. + */ + pc_t caller[2] = {0, 0}; + + dtrace_getpcstack(caller, 2, aframes, + (uint32_t *)(uintptr_t)mstate->dtms_arg[0]); + mstate->dtms_caller = caller[1]; + } else if ((mstate->dtms_caller = + dtrace_caller(aframes)) == -1) { + /* + * We have failed to do this the quick way; + * we must resort to the slower approach of + * calling dtrace_getpcstack(). 
+				 */
+				pc_t caller = 0;
+
+				dtrace_getpcstack(&caller, 1, aframes, NULL);
+				mstate->dtms_caller = caller;
+			}
+
+			mstate->dtms_present |= DTRACE_MSTATE_CALLER;
+		}
+		return (mstate->dtms_caller);
+
+	case DIF_VAR_UCALLER:
+		if (!dtrace_priv_proc(state))
+			return (0);
+
+		if (!(mstate->dtms_present & DTRACE_MSTATE_UCALLER)) {
+			uint64_t ustack[3];
+
+			/*
+			 * dtrace_getupcstack() fills in the first uint64_t
+			 * with the current PID. The second uint64_t will
+			 * be the program counter at user-level. The third
+			 * uint64_t will contain the caller, which is what
+			 * we're after.
+			 */
+			ustack[2] = 0;
+			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
+			dtrace_getupcstack(ustack, 3);
+			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
+			mstate->dtms_ucaller = ustack[2];
+			mstate->dtms_present |= DTRACE_MSTATE_UCALLER;
+		}
+
+		return (mstate->dtms_ucaller);
+
+	case DIF_VAR_PROBEPROV:
+		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
+		return (dtrace_dif_varstr(
+		    (uintptr_t)mstate->dtms_probe->dtpr_provider->dtpv_name,
+		    state, mstate));
+
+	case DIF_VAR_PROBEMOD:
+		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
+		return (dtrace_dif_varstr(
+		    (uintptr_t)mstate->dtms_probe->dtpr_mod,
+		    state, mstate));
+
+	case DIF_VAR_PROBEFUNC:
+		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
+		return (dtrace_dif_varstr(
+		    (uintptr_t)mstate->dtms_probe->dtpr_func,
+		    state, mstate));
+
+	case DIF_VAR_PROBENAME:
+		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
+		return (dtrace_dif_varstr(
+		    (uintptr_t)mstate->dtms_probe->dtpr_name,
+		    state, mstate));
+
+	case DIF_VAR_PID:
+		if (!dtrace_priv_proc(state))
+			return (0);
+
+#ifdef illumos
+		/*
+		 * Note that we are assuming that an unanchored probe is
+		 * always due to a high-level interrupt. (And we're assuming
+		 * that there is only a single high level interrupt.)
+		 */
+		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
+			return (pid0.pid_id);
+
+		/*
+		 * It is always safe to dereference one's own t_procp pointer:
+		 * it always points to a valid, allocated proc structure.
+		 * Further, it is always safe to dereference the p_pidp member
+		 * of one's own proc structure. (These are truisms because
+		 * threads and processes don't clean up their own state --
+		 * they leave that task to whomever reaps them.)
+		 */
+		return ((uint64_t)curthread->t_procp->p_pidp->pid_id);
+#else
+		return ((uint64_t)curproc->p_pid);
+#endif
+
+	case DIF_VAR_PPID:
+		if (!dtrace_priv_proc(state))
+			return (0);
+
+#ifdef illumos
+		/*
+		 * See comment in DIF_VAR_PID.
+		 */
+		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
+			return (pid0.pid_id);
+
+		/*
+		 * It is always safe to dereference one's own t_procp pointer:
+		 * it always points to a valid, allocated proc structure.
+		 * (This is true because threads don't clean up their own
+		 * state -- they leave that task to whomever reaps them.)
+		 */
+		return ((uint64_t)curthread->t_procp->p_ppid);
+#else
+		if (curproc->p_pid == proc0.p_pid)
+			return (curproc->p_pid);
+		else
+			return (curproc->p_pptr->p_pid);
+#endif
+
+	case DIF_VAR_TID:
+#ifdef illumos
+		/*
+		 * See comment in DIF_VAR_PID.
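+		 * (In short: if an anchored probe fires while we are on
+		 * the interrupt stack, the interrupted thread is arbitrary,
+		 * so a tid of 0 is reported rather than a misleading one.)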
+ */ + if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU)) + return (0); +#endif + + return ((uint64_t)curthread->t_tid); + + case DIF_VAR_EXECARGS: { + struct pargs *p_args = curthread->td_proc->p_args; + + if (p_args == NULL) + return(0); + + return (dtrace_dif_varstrz( + (uintptr_t) p_args->ar_args, p_args->ar_length, state, mstate)); + } + + case DIF_VAR_EXECNAME: +#ifdef illumos + if (!dtrace_priv_proc(state)) + return (0); + + /* + * See comment in DIF_VAR_PID. + */ + if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU)) + return ((uint64_t)(uintptr_t)p0.p_user.u_comm); + + /* + * It is always safe to dereference one's own t_procp pointer: + * it always points to a valid, allocated proc structure. + * (This is true because threads don't clean up their own + * state -- they leave that task to whomever reaps them.) + */ + return (dtrace_dif_varstr( + (uintptr_t)curthread->t_procp->p_user.u_comm, + state, mstate)); +#else + return (dtrace_dif_varstr( + (uintptr_t) curthread->td_proc->p_comm, state, mstate)); +#endif + + case DIF_VAR_ZONENAME: +#ifdef illumos + if (!dtrace_priv_proc(state)) + return (0); + + /* + * See comment in DIF_VAR_PID. + */ + if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU)) + return ((uint64_t)(uintptr_t)p0.p_zone->zone_name); + + /* + * It is always safe to dereference one's own t_procp pointer: + * it always points to a valid, allocated proc structure. + * (This is true because threads don't clean up their own + * state -- they leave that task to whomever reaps them.) + */ + return (dtrace_dif_varstr( + (uintptr_t)curthread->t_procp->p_zone->zone_name, + state, mstate)); +#elif defined(__FreeBSD__) + /* + * On FreeBSD, we introduce compatibility to zonename by falling through + * into jailname. + */ + case DIF_VAR_JAILNAME: + if (!dtrace_priv_kernel(state)) + return (0); + + return (dtrace_dif_varstr( + (uintptr_t)curthread->td_ucred->cr_prison->pr_name, + state, mstate)); + + case DIF_VAR_JID: + if (!dtrace_priv_kernel(state)) + return (0); + + return ((uint64_t)curthread->td_ucred->cr_prison->pr_id); +#else + return (0); +#endif + + case DIF_VAR_UID: + if (!dtrace_priv_proc(state)) + return (0); + +#ifdef illumos + /* + * See comment in DIF_VAR_PID. + */ + if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU)) + return ((uint64_t)p0.p_cred->cr_uid); + + /* + * It is always safe to dereference one's own t_procp pointer: + * it always points to a valid, allocated proc structure. + * (This is true because threads don't clean up their own + * state -- they leave that task to whomever reaps them.) + * + * Additionally, it is safe to dereference one's own process + * credential, since this is never NULL after process birth. + */ + return ((uint64_t)curthread->t_procp->p_cred->cr_uid); +#else + return ((uint64_t)curthread->td_ucred->cr_uid); +#endif + + case DIF_VAR_GID: + if (!dtrace_priv_proc(state)) + return (0); + +#ifdef illumos + /* + * See comment in DIF_VAR_PID. + */ + if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU)) + return ((uint64_t)p0.p_cred->cr_gid); + + /* + * It is always safe to dereference one's own t_procp pointer: + * it always points to a valid, allocated proc structure. + * (This is true because threads don't clean up their own + * state -- they leave that task to whomever reaps them.) + * + * Additionally, it is safe to dereference one's own process + * credential, since this is never NULL after process birth. 
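+		 * (The FreeBSD case below gets away with less ceremony:
+		 * td_ucred is the thread's cached credential reference,
+		 * and it remains valid for as long as the thread runs.)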
+ */ + return ((uint64_t)curthread->t_procp->p_cred->cr_gid); +#else + return ((uint64_t)curthread->td_ucred->cr_gid); +#endif + + case DIF_VAR_ERRNO: { +#ifdef illumos + klwp_t *lwp; + if (!dtrace_priv_proc(state)) + return (0); + + /* + * See comment in DIF_VAR_PID. + */ + if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU)) + return (0); + + /* + * It is always safe to dereference one's own t_lwp pointer in + * the event that this pointer is non-NULL. (This is true + * because threads and lwps don't clean up their own state -- + * they leave that task to whomever reaps them.) + */ + if ((lwp = curthread->t_lwp) == NULL) + return (0); + + return ((uint64_t)lwp->lwp_errno); +#else + return (curthread->td_errno); +#endif + } +#ifndef illumos + case DIF_VAR_CPU: { + return curcpu; + } +#endif + default: + DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP); + return (0); + } +} + + +typedef enum dtrace_json_state { + DTRACE_JSON_REST = 1, + DTRACE_JSON_OBJECT, + DTRACE_JSON_STRING, + DTRACE_JSON_STRING_ESCAPE, + DTRACE_JSON_STRING_ESCAPE_UNICODE, + DTRACE_JSON_COLON, + DTRACE_JSON_COMMA, + DTRACE_JSON_VALUE, + DTRACE_JSON_IDENTIFIER, + DTRACE_JSON_NUMBER, + DTRACE_JSON_NUMBER_FRAC, + DTRACE_JSON_NUMBER_EXP, + DTRACE_JSON_COLLECT_OBJECT +} dtrace_json_state_t; + +/* + * This function possesses just enough knowledge about JSON to extract a single + * value from a JSON string and store it in the scratch buffer. It is able + * to extract nested object values, and members of arrays by index. + * + * elemlist is a list of JSON keys, stored as packed NUL-terminated strings, to + * be looked up as we descend into the object tree. e.g. + * + * foo[0].bar.baz[32] --> "foo" NUL "0" NUL "bar" NUL "baz" NUL "32" NUL + * with nelems = 5. + * + * The run time of this function must be bounded above by strsize to limit the + * amount of work done in probe context. As such, it is implemented as a + * simple state machine, reading one character at a time using safe loads + * until we find the requested element, hit a parsing error or run off the + * end of the object or string. + * + * As there is no way for a subroutine to return an error without interrupting + * clause execution, we simply return NULL in the event of a missing key or any + * other error condition. Each NULL return in this function is commented with + * the error condition it represents -- parsing or otherwise. + * + * The set of states for the state machine closely matches the JSON + * specification (http://json.org/). Briefly: + * + * DTRACE_JSON_REST: + * Skip whitespace until we find either a top-level Object, moving + * to DTRACE_JSON_OBJECT; or an Array, moving to DTRACE_JSON_VALUE. + * + * DTRACE_JSON_OBJECT: + * Locate the next key String in an Object. Sets a flag to denote + * the next String as a key string and moves to DTRACE_JSON_STRING. + * + * DTRACE_JSON_COLON: + * Skip whitespace until we find the colon that separates key Strings + * from their values. Once found, move to DTRACE_JSON_VALUE. + * + * DTRACE_JSON_VALUE: + * Detects the type of the next value (String, Number, Identifier, Object + * or Array) and routes to the states that process that type. Here we also + * deal with the element selector list if we are requested to traverse down + * into the object tree. + * + * DTRACE_JSON_COMMA: + * Skip whitespace until we find the comma that separates key-value pairs + * in Objects (returning to DTRACE_JSON_OBJECT) or values in Arrays + * (similarly DTRACE_JSON_VALUE). 
All following literal value processing + * states return to this state at the end of their value, unless otherwise + * noted. + * + * DTRACE_JSON_NUMBER, DTRACE_JSON_NUMBER_FRAC, DTRACE_JSON_NUMBER_EXP: + * Processes a Number literal from the JSON, including any exponent + * component that may be present. Numbers are returned as strings, which + * may be passed to strtoll() if an integer is required. + * + * DTRACE_JSON_IDENTIFIER: + * Processes a "true", "false" or "null" literal in the JSON. + * + * DTRACE_JSON_STRING, DTRACE_JSON_STRING_ESCAPE, + * DTRACE_JSON_STRING_ESCAPE_UNICODE: + * Processes a String literal from the JSON, whether the String denotes + * a key, a value or part of a larger Object. Handles all escape sequences + * present in the specification, including four-digit unicode characters, + * but merely includes the escape sequence without converting it to the + * actual escaped character. If the String is flagged as a key, we + * move to DTRACE_JSON_COLON rather than DTRACE_JSON_COMMA. + * + * DTRACE_JSON_COLLECT_OBJECT: + * This state collects an entire Object (or Array), correctly handling + * embedded strings. If the full element selector list matches this nested + * object, we return the Object in full as a string. If not, we use this + * state to skip to the next value at this level and continue processing. + * + * NOTE: This function uses various macros from strtolctype.h to manipulate + * digit values, etc -- these have all been checked to ensure they make + * no additional function calls. + */ +static char * +dtrace_json(uint64_t size, uintptr_t json, char *elemlist, int nelems, + char *dest) +{ + dtrace_json_state_t state = DTRACE_JSON_REST; + int64_t array_elem = INT64_MIN; + int64_t array_pos = 0; + uint8_t escape_unicount = 0; + boolean_t string_is_key = B_FALSE; + boolean_t collect_object = B_FALSE; + boolean_t found_key = B_FALSE; + boolean_t in_array = B_FALSE; + uint32_t braces = 0, brackets = 0; + char *elem = elemlist; + char *dd = dest; + uintptr_t cur; + + for (cur = json; cur < json + size; cur++) { + char cc = dtrace_load8(cur); + if (cc == '\0') + return (NULL); + + switch (state) { + case DTRACE_JSON_REST: + if (isspace(cc)) + break; + + if (cc == '{') { + state = DTRACE_JSON_OBJECT; + break; + } + + if (cc == '[') { + in_array = B_TRUE; + array_pos = 0; + array_elem = dtrace_strtoll(elem, 10, size); + found_key = array_elem == 0 ? B_TRUE : B_FALSE; + state = DTRACE_JSON_VALUE; + break; + } + + /* + * ERROR: expected to find a top-level object or array. + */ + return (NULL); + case DTRACE_JSON_OBJECT: + if (isspace(cc)) + break; + + if (cc == '"') { + state = DTRACE_JSON_STRING; + string_is_key = B_TRUE; + break; + } + + /* + * ERROR: either the object did not start with a key + * string, or we've run off the end of the object + * without finding the requested key. + */ + return (NULL); + case DTRACE_JSON_STRING: + if (cc == '\\') { + *dd++ = '\\'; + state = DTRACE_JSON_STRING_ESCAPE; + break; + } + + if (cc == '"') { + if (collect_object) { + /* + * We don't reset the dest here, as + * the string is part of a larger + * object being collected. + */ + *dd++ = cc; + collect_object = B_FALSE; + state = DTRACE_JSON_COLLECT_OBJECT; + break; + } + *dd = '\0'; + dd = dest; /* reset string buffer */ + if (string_is_key) { + if (dtrace_strncmp(dest, elem, + size) == 0) + found_key = B_TRUE; + } else if (found_key) { + if (nelems > 1) { + /* + * We expected an object, not + * this string. 
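+						 * e.g. a selector
+						 * list "a" NUL "b"
+						 * (nelems == 2) hits
+						 * this case on the
+						 * string value in
+						 * {"a": "x"}.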
+						 */
+						return (NULL);
+					}
+					return (dest);
+				}
+				state = string_is_key ? DTRACE_JSON_COLON :
+				    DTRACE_JSON_COMMA;
+				string_is_key = B_FALSE;
+				break;
+			}
+
+			*dd++ = cc;
+			break;
+		case DTRACE_JSON_STRING_ESCAPE:
+			*dd++ = cc;
+			if (cc == 'u') {
+				escape_unicount = 0;
+				state = DTRACE_JSON_STRING_ESCAPE_UNICODE;
+			} else {
+				state = DTRACE_JSON_STRING;
+			}
+			break;
+		case DTRACE_JSON_STRING_ESCAPE_UNICODE:
+			if (!isxdigit(cc)) {
+				/*
+				 * ERROR: invalid unicode escape, expected
+				 * four valid hexadecimal digits.
+				 */
+				return (NULL);
+			}
+
+			*dd++ = cc;
+			if (++escape_unicount == 4)
+				state = DTRACE_JSON_STRING;
+			break;
+		case DTRACE_JSON_COLON:
+			if (isspace(cc))
+				break;
+
+			if (cc == ':') {
+				state = DTRACE_JSON_VALUE;
+				break;
+			}
+
+			/*
+			 * ERROR: expected a colon.
+			 */
+			return (NULL);
+		case DTRACE_JSON_COMMA:
+			if (isspace(cc))
+				break;
+
+			if (cc == ',') {
+				if (in_array) {
+					state = DTRACE_JSON_VALUE;
+					if (++array_pos == array_elem)
+						found_key = B_TRUE;
+				} else {
+					state = DTRACE_JSON_OBJECT;
+				}
+				break;
+			}
+
+			/*
+			 * ERROR: either we hit an unexpected character, or
+			 * we reached the end of the object or array without
+			 * finding the requested key.
+			 */
+			return (NULL);
+		case DTRACE_JSON_IDENTIFIER:
+			if (islower(cc)) {
+				*dd++ = cc;
+				break;
+			}
+
+			*dd = '\0';
+			dd = dest; /* reset string buffer */
+
+			if (dtrace_strncmp(dest, "true", 5) == 0 ||
+			    dtrace_strncmp(dest, "false", 6) == 0 ||
+			    dtrace_strncmp(dest, "null", 5) == 0) {
+				if (found_key) {
+					if (nelems > 1) {
+						/*
+						 * ERROR: We expected an object,
+						 * not this identifier.
+						 */
+						return (NULL);
+					}
+					return (dest);
+				} else {
+					cur--;
+					state = DTRACE_JSON_COMMA;
+					break;
+				}
+			}
+
+			/*
+			 * ERROR: we did not recognise the identifier as one
+			 * of those in the JSON specification.
+			 */
+			return (NULL);
+		case DTRACE_JSON_NUMBER:
+			if (cc == '.') {
+				*dd++ = cc;
+				state = DTRACE_JSON_NUMBER_FRAC;
+				break;
+			}
+
+			if (cc == 'x' || cc == 'X') {
+				/*
+				 * ERROR: specification explicitly excludes
+				 * hexadecimal or octal numbers.
+				 */
+				return (NULL);
+			}
+
+			/* FALLTHRU */
+		case DTRACE_JSON_NUMBER_FRAC:
+			if (cc == 'e' || cc == 'E') {
+				*dd++ = cc;
+				state = DTRACE_JSON_NUMBER_EXP;
+				break;
+			}
+
+			if (cc == '+' || cc == '-') {
+				/*
+				 * ERROR: expect sign as part of exponent only.
+				 */
+				return (NULL);
+			}
+			/* FALLTHRU */
+		case DTRACE_JSON_NUMBER_EXP:
+			if (isdigit(cc) || cc == '+' || cc == '-') {
+				*dd++ = cc;
+				break;
+			}
+
+			*dd = '\0';
+			dd = dest; /* reset string buffer */
+			if (found_key) {
+				if (nelems > 1) {
+					/*
+					 * ERROR: We expected an object, not
+					 * this number.
+					 */
+					return (NULL);
+				}
+				return (dest);
+			}
+
+			cur--;
+			state = DTRACE_JSON_COMMA;
+			break;
+		case DTRACE_JSON_VALUE:
+			if (isspace(cc))
+				break;
+
+			if (cc == '{' || cc == '[') {
+				if (nelems > 1 && found_key) {
+					in_array = cc == '[' ? B_TRUE : B_FALSE;
+					/*
+					 * If our element selector directs us
+					 * to descend into this nested object,
+					 * then move to the next selector
+					 * element in the list and restart the
+					 * state machine.
+					 */
+					while (*elem != '\0')
+						elem++;
+					elem++; /* skip the inter-element NUL */
+					nelems--;
+					dd = dest;
+					if (in_array) {
+						state = DTRACE_JSON_VALUE;
+						array_pos = 0;
+						array_elem = dtrace_strtoll(
+						    elem, 10, size);
+						found_key = array_elem == 0 ?
+						    B_TRUE : B_FALSE;
+					} else {
+						found_key = B_FALSE;
+						state = DTRACE_JSON_OBJECT;
+					}
+					break;
+				}
+
+				/*
+				 * Otherwise, we wish to either skip this
+				 * nested object or return it in full.
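+				 * For example, with the selector "a" against
+				 * {"a": {"b": 1}}, the entire nested object
+				 * is collected and returned verbatim as the
+				 * string "{"b": 1}"; on a non-matching key,
+				 * COLLECT_OBJECT merely consumes the value so
+				 * that we can resume at the following comma.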
+ */ + if (cc == '[') + brackets = 1; + else + braces = 1; + *dd++ = cc; + state = DTRACE_JSON_COLLECT_OBJECT; + break; + } + + if (cc == '"') { + state = DTRACE_JSON_STRING; + break; + } + + if (islower(cc)) { + /* + * Here we deal with true, false and null. + */ + *dd++ = cc; + state = DTRACE_JSON_IDENTIFIER; + break; + } + + if (cc == '-' || isdigit(cc)) { + *dd++ = cc; + state = DTRACE_JSON_NUMBER; + break; + } + + /* + * ERROR: unexpected character at start of value. + */ + return (NULL); + case DTRACE_JSON_COLLECT_OBJECT: + if (cc == '\0') + /* + * ERROR: unexpected end of input. + */ + return (NULL); + + *dd++ = cc; + if (cc == '"') { + collect_object = B_TRUE; + state = DTRACE_JSON_STRING; + break; + } + + if (cc == ']') { + if (brackets-- == 0) { + /* + * ERROR: unbalanced brackets. + */ + return (NULL); + } + } else if (cc == '}') { + if (braces-- == 0) { + /* + * ERROR: unbalanced braces. + */ + return (NULL); + } + } else if (cc == '{') { + braces++; + } else if (cc == '[') { + brackets++; + } + + if (brackets == 0 && braces == 0) { + if (found_key) { + *dd = '\0'; + return (dest); + } + dd = dest; /* reset string buffer */ + state = DTRACE_JSON_COMMA; + } + break; + } + } + return (NULL); +} + +/* + * Emulate the execution of DTrace ID subroutines invoked by the call opcode. + * Notice that we don't bother validating the proper number of arguments or + * their types in the tuple stack. This isn't needed because all argument + * interpretation is safe because of our load safety -- the worst that can + * happen is that a bogus program can obtain bogus results. + */ +static void +dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs, + dtrace_key_t *tupregs, int nargs, + dtrace_mstate_t *mstate, dtrace_state_t *state) +{ + volatile uint16_t *flags = &cpu_core[curcpu].cpuc_dtrace_flags; + volatile uintptr_t *illval = &cpu_core[curcpu].cpuc_dtrace_illval; + dtrace_vstate_t *vstate = &state->dts_vstate; + +#ifdef illumos + union { + mutex_impl_t mi; + uint64_t mx; + } m; + + union { + krwlock_t ri; + uintptr_t rw; + } r; +#else + struct thread *lowner; + union { + struct lock_object *li; + uintptr_t lx; + } l; +#endif + + switch (subr) { + case DIF_SUBR_RAND: + regs[rd] = dtrace_xoroshiro128_plus_next( + state->dts_rstate[curcpu]); + break; + +#ifdef illumos + case DIF_SUBR_MUTEX_OWNED: + if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t), + mstate, vstate)) { + regs[rd] = 0; + break; + } + + m.mx = dtrace_load64(tupregs[0].dttk_value); + if (MUTEX_TYPE_ADAPTIVE(&m.mi)) + regs[rd] = MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER; + else + regs[rd] = LOCK_HELD(&m.mi.m_spin.m_spinlock); + break; + + case DIF_SUBR_MUTEX_OWNER: + if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t), + mstate, vstate)) { + regs[rd] = 0; + break; + } + + m.mx = dtrace_load64(tupregs[0].dttk_value); + if (MUTEX_TYPE_ADAPTIVE(&m.mi) && + MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER) + regs[rd] = (uintptr_t)MUTEX_OWNER(&m.mi); + else + regs[rd] = 0; + break; + + case DIF_SUBR_MUTEX_TYPE_ADAPTIVE: + if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t), + mstate, vstate)) { + regs[rd] = 0; + break; + } + + m.mx = dtrace_load64(tupregs[0].dttk_value); + regs[rd] = MUTEX_TYPE_ADAPTIVE(&m.mi); + break; + + case DIF_SUBR_MUTEX_TYPE_SPIN: + if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t), + mstate, vstate)) { + regs[rd] = 0; + break; + } + + m.mx = dtrace_load64(tupregs[0].dttk_value); + regs[rd] = MUTEX_TYPE_SPIN(&m.mi); + break; + + case DIF_SUBR_RW_READ_HELD: { + uintptr_t tmp; + + if 
(!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t), + mstate, vstate)) { + regs[rd] = 0; + break; + } + + r.rw = dtrace_loadptr(tupregs[0].dttk_value); + regs[rd] = _RW_READ_HELD(&r.ri, tmp); + break; + } + + case DIF_SUBR_RW_WRITE_HELD: + if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t), + mstate, vstate)) { + regs[rd] = 0; + break; + } + + r.rw = dtrace_loadptr(tupregs[0].dttk_value); + regs[rd] = _RW_WRITE_HELD(&r.ri); + break; + + case DIF_SUBR_RW_ISWRITER: + if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t), + mstate, vstate)) { + regs[rd] = 0; + break; + } + + r.rw = dtrace_loadptr(tupregs[0].dttk_value); + regs[rd] = _RW_ISWRITER(&r.ri); + break; + +#else /* !illumos */ + case DIF_SUBR_MUTEX_OWNED: + if (!dtrace_canload(tupregs[0].dttk_value, + sizeof (struct lock_object), mstate, vstate)) { + regs[rd] = 0; + break; + } + l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value); + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); + regs[rd] = LOCK_CLASS(l.li)->lc_owner(l.li, &lowner); + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); + break; + + case DIF_SUBR_MUTEX_OWNER: + if (!dtrace_canload(tupregs[0].dttk_value, + sizeof (struct lock_object), mstate, vstate)) { + regs[rd] = 0; + break; + } + l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value); + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); + LOCK_CLASS(l.li)->lc_owner(l.li, &lowner); + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); + regs[rd] = (uintptr_t)lowner; + break; + + case DIF_SUBR_MUTEX_TYPE_ADAPTIVE: + if (!dtrace_canload(tupregs[0].dttk_value, sizeof (struct mtx), + mstate, vstate)) { + regs[rd] = 0; + break; + } + l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value); + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); + regs[rd] = (LOCK_CLASS(l.li)->lc_flags & LC_SLEEPLOCK) != 0; + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); + break; + + case DIF_SUBR_MUTEX_TYPE_SPIN: + if (!dtrace_canload(tupregs[0].dttk_value, sizeof (struct mtx), + mstate, vstate)) { + regs[rd] = 0; + break; + } + l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value); + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); + regs[rd] = (LOCK_CLASS(l.li)->lc_flags & LC_SPINLOCK) != 0; + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); + break; + + case DIF_SUBR_RW_READ_HELD: + case DIF_SUBR_SX_SHARED_HELD: + if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t), + mstate, vstate)) { + regs[rd] = 0; + break; + } + l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value); + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); + regs[rd] = LOCK_CLASS(l.li)->lc_owner(l.li, &lowner) && + lowner == NULL; + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); + break; + + case DIF_SUBR_RW_WRITE_HELD: + case DIF_SUBR_SX_EXCLUSIVE_HELD: + if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t), + mstate, vstate)) { + regs[rd] = 0; + break; + } + l.lx = dtrace_loadptr(tupregs[0].dttk_value); + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); + regs[rd] = LOCK_CLASS(l.li)->lc_owner(l.li, &lowner) && + lowner != NULL; + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); + break; + + case DIF_SUBR_RW_ISWRITER: + case DIF_SUBR_SX_ISEXCLUSIVE: + if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t), + mstate, vstate)) { + regs[rd] = 0; + break; + } + l.lx = dtrace_loadptr(tupregs[0].dttk_value); + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); + LOCK_CLASS(l.li)->lc_owner(l.li, &lowner); + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); + regs[rd] = (lowner == curthread); + break; +#endif /* illumos */ + + case DIF_SUBR_BCOPY: { + /* + * We need to be sure that the destination is in the scratch + * region -- 
no other region is allowed. + */ + uintptr_t src = tupregs[0].dttk_value; + uintptr_t dest = tupregs[1].dttk_value; + size_t size = tupregs[2].dttk_value; + + if (!dtrace_inscratch(dest, size, mstate)) { + *flags |= CPU_DTRACE_BADADDR; + *illval = regs[rd]; + break; + } + + if (!dtrace_canload(src, size, mstate, vstate)) { + regs[rd] = 0; + break; + } + + dtrace_bcopy((void *)src, (void *)dest, size); + break; + } + + case DIF_SUBR_ALLOCA: + case DIF_SUBR_COPYIN: { + uintptr_t dest = P2ROUNDUP(mstate->dtms_scratch_ptr, 8); + uint64_t size = + tupregs[subr == DIF_SUBR_ALLOCA ? 0 : 1].dttk_value; + size_t scratch_size = (dest - mstate->dtms_scratch_ptr) + size; + + /* + * This action doesn't require any credential checks since + * probes will not activate in user contexts to which the + * enabling user does not have permissions. + */ + + /* + * Rounding up the user allocation size could have overflowed + * a large, bogus allocation (like -1ULL) to 0. + */ + if (scratch_size < size || + !DTRACE_INSCRATCH(mstate, scratch_size)) { + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH); + regs[rd] = 0; + break; + } + + if (subr == DIF_SUBR_COPYIN) { + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); + dtrace_copyin(tupregs[0].dttk_value, dest, size, flags); + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); + } + + mstate->dtms_scratch_ptr += scratch_size; + regs[rd] = dest; + break; + } + + case DIF_SUBR_COPYINTO: { + uint64_t size = tupregs[1].dttk_value; + uintptr_t dest = tupregs[2].dttk_value; + + /* + * This action doesn't require any credential checks since + * probes will not activate in user contexts to which the + * enabling user does not have permissions. + */ + if (!dtrace_inscratch(dest, size, mstate)) { + *flags |= CPU_DTRACE_BADADDR; + *illval = regs[rd]; + break; + } + + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); + dtrace_copyin(tupregs[0].dttk_value, dest, size, flags); + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); + break; + } + + case DIF_SUBR_COPYINSTR: { + uintptr_t dest = mstate->dtms_scratch_ptr; + uint64_t size = state->dts_options[DTRACEOPT_STRSIZE]; + + if (nargs > 1 && tupregs[1].dttk_value < size) + size = tupregs[1].dttk_value + 1; + + /* + * This action doesn't require any credential checks since + * probes will not activate in user contexts to which the + * enabling user does not have permissions. + */ + if (!DTRACE_INSCRATCH(mstate, size)) { + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH); + regs[rd] = 0; + break; + } + + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); + dtrace_copyinstr(tupregs[0].dttk_value, dest, size, flags); + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); + + ((char *)dest)[size - 1] = '\0'; + mstate->dtms_scratch_ptr += size; + regs[rd] = dest; + break; + } + +#ifdef illumos + case DIF_SUBR_MSGSIZE: + case DIF_SUBR_MSGDSIZE: { + uintptr_t baddr = tupregs[0].dttk_value, daddr; + uintptr_t wptr, rptr; + size_t count = 0; + int cont = 0; + + while (baddr != 0 && !(*flags & CPU_DTRACE_FAULT)) { + + if (!dtrace_canload(baddr, sizeof (mblk_t), mstate, + vstate)) { + regs[rd] = 0; + break; + } + + wptr = dtrace_loadptr(baddr + + offsetof(mblk_t, b_wptr)); + + rptr = dtrace_loadptr(baddr + + offsetof(mblk_t, b_rptr)); + + if (wptr < rptr) { + *flags |= CPU_DTRACE_BADADDR; + *illval = tupregs[0].dttk_value; + break; + } + + daddr = dtrace_loadptr(baddr + + offsetof(mblk_t, b_datap)); + + baddr = dtrace_loadptr(baddr + + offsetof(mblk_t, b_cont)); + + /* + * We want to prevent against denial-of-service here, + * so we're only going to search the list for + * dtrace_msgdsize_max mblks. 
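+			 * (Probe context must never loop without bound:
+			 * a corrupt -- or maliciously circular -- b_cont
+			 * chain could otherwise wedge this CPU, so once
+			 * the limit is exceeded we abort the operation
+			 * with CPU_DTRACE_ILLOP below.)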
+ */ + if (cont++ > dtrace_msgdsize_max) { + *flags |= CPU_DTRACE_ILLOP; + break; + } + + if (subr == DIF_SUBR_MSGDSIZE) { + if (dtrace_load8(daddr + + offsetof(dblk_t, db_type)) != M_DATA) + continue; + } + + count += wptr - rptr; + } + + if (!(*flags & CPU_DTRACE_FAULT)) + regs[rd] = count; + + break; + } +#endif + + case DIF_SUBR_PROGENYOF: { + pid_t pid = tupregs[0].dttk_value; + proc_t *p; + int rval = 0; + + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); + + for (p = curthread->t_procp; p != NULL; p = p->p_parent) { +#ifdef illumos + if (p->p_pidp->pid_id == pid) { +#else + if (p->p_pid == pid) { +#endif + rval = 1; + break; + } + } + + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); + + regs[rd] = rval; + break; + } + + case DIF_SUBR_SPECULATION: + regs[rd] = dtrace_speculation(state); + break; + + case DIF_SUBR_COPYOUT: { + uintptr_t kaddr = tupregs[0].dttk_value; + uintptr_t uaddr = tupregs[1].dttk_value; + uint64_t size = tupregs[2].dttk_value; + + if (!dtrace_destructive_disallow && + dtrace_priv_proc_control(state) && + !dtrace_istoxic(kaddr, size) && + dtrace_canload(kaddr, size, mstate, vstate)) { + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); + dtrace_copyout(kaddr, uaddr, size, flags); + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); + } + break; + } + + case DIF_SUBR_COPYOUTSTR: { + uintptr_t kaddr = tupregs[0].dttk_value; + uintptr_t uaddr = tupregs[1].dttk_value; + uint64_t size = tupregs[2].dttk_value; + size_t lim; + + if (!dtrace_destructive_disallow && + dtrace_priv_proc_control(state) && + !dtrace_istoxic(kaddr, size) && + dtrace_strcanload(kaddr, size, &lim, mstate, vstate)) { + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); + dtrace_copyoutstr(kaddr, uaddr, lim, flags); + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); + } + break; + } + + case DIF_SUBR_STRLEN: { + size_t size = state->dts_options[DTRACEOPT_STRSIZE]; + uintptr_t addr = (uintptr_t)tupregs[0].dttk_value; + size_t lim; + + if (!dtrace_strcanload(addr, size, &lim, mstate, vstate)) { + regs[rd] = 0; + break; + } + + regs[rd] = dtrace_strlen((char *)addr, lim); + break; + } + + case DIF_SUBR_STRCHR: + case DIF_SUBR_STRRCHR: { + /* + * We're going to iterate over the string looking for the + * specified character. We will iterate until we have reached + * the string length or we have found the character. If this + * is DIF_SUBR_STRRCHR, we will look for the last occurrence + * of the specified character instead of the first. + */ + uintptr_t addr = tupregs[0].dttk_value; + uintptr_t addr_limit; + uint64_t size = state->dts_options[DTRACEOPT_STRSIZE]; + size_t lim; + char c, target = (char)tupregs[1].dttk_value; + + if (!dtrace_strcanload(addr, size, &lim, mstate, vstate)) { + regs[rd] = 0; + break; + } + addr_limit = addr + lim; + + for (regs[rd] = 0; addr < addr_limit; addr++) { + if ((c = dtrace_load8(addr)) == target) { + regs[rd] = addr; + + if (subr == DIF_SUBR_STRCHR) + break; + } + + if (c == '\0') + break; + } + break; + } + + case DIF_SUBR_STRSTR: + case DIF_SUBR_INDEX: + case DIF_SUBR_RINDEX: { + /* + * We're going to iterate over the string looking for the + * specified string. We will iterate until we have reached + * the string length or we have found the string. (Yes, this + * is done in the most naive way possible -- but considering + * that the string we're searching for is likely to be + * relatively short, the complexity of Rabin-Karp or similar + * hardly seems merited.) 
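+		 * The naive search costs O(len * sublen) in the worst
+		 * case; e.g. index("dtrace", "race") tries just three
+		 * candidate positions before matching at (zero-based)
+		 * index 2.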
+ */ + char *addr = (char *)(uintptr_t)tupregs[0].dttk_value; + char *substr = (char *)(uintptr_t)tupregs[1].dttk_value; + uint64_t size = state->dts_options[DTRACEOPT_STRSIZE]; + size_t len = dtrace_strlen(addr, size); + size_t sublen = dtrace_strlen(substr, size); + char *limit = addr + len, *orig = addr; + int notfound = subr == DIF_SUBR_STRSTR ? 0 : -1; + int inc = 1; + + regs[rd] = notfound; + + if (!dtrace_canload((uintptr_t)addr, len + 1, mstate, vstate)) { + regs[rd] = 0; + break; + } + + if (!dtrace_canload((uintptr_t)substr, sublen + 1, mstate, + vstate)) { + regs[rd] = 0; + break; + } + + /* + * strstr() and index()/rindex() have similar semantics if + * both strings are the empty string: strstr() returns a + * pointer to the (empty) string, and index() and rindex() + * both return index 0 (regardless of any position argument). + */ + if (sublen == 0 && len == 0) { + if (subr == DIF_SUBR_STRSTR) + regs[rd] = (uintptr_t)addr; + else + regs[rd] = 0; + break; + } + + if (subr != DIF_SUBR_STRSTR) { + if (subr == DIF_SUBR_RINDEX) { + limit = orig - 1; + addr += len; + inc = -1; + } + + /* + * Both index() and rindex() take an optional position + * argument that denotes the starting position. + */ + if (nargs == 3) { + int64_t pos = (int64_t)tupregs[2].dttk_value; + + /* + * If the position argument to index() is + * negative, Perl implicitly clamps it at + * zero. This semantic is a little surprising + * given the special meaning of negative + * positions to similar Perl functions like + * substr(), but it appears to reflect a + * notion that index() can start from a + * negative index and increment its way up to + * the string. Given this notion, Perl's + * rindex() is at least self-consistent in + * that it implicitly clamps positions greater + * than the string length to be the string + * length. Where Perl completely loses + * coherence, however, is when the specified + * substring is the empty string (""). In + * this case, even if the position is + * negative, rindex() returns 0 -- and even if + * the position is greater than the length, + * index() returns the string length. These + * semantics violate the notion that index() + * should never return a value less than the + * specified position and that rindex() should + * never return a value greater than the + * specified position. (One assumes that + * these semantics are artifacts of Perl's + * implementation and not the results of + * deliberate design -- it beggars belief that + * even Larry Wall could desire such oddness.) + * While in the abstract one would wish for + * consistent position semantics across + * substr(), index() and rindex() -- or at the + * very least self-consistent position + * semantics for index() and rindex() -- we + * instead opt to keep with the extant Perl + * semantics, in all their broken glory. (Do + * we have more desire to maintain Perl's + * semantics than Perl does? Probably.) + */ + if (subr == DIF_SUBR_RINDEX) { + if (pos < 0) { + if (sublen == 0) + regs[rd] = 0; + break; + } + + if (pos > len) + pos = len; + } else { + if (pos < 0) + pos = 0; + + if (pos >= len) { + if (sublen == 0) + regs[rd] = len; + break; + } + } + + addr = orig + pos; + } + } + + for (regs[rd] = notfound; addr != limit; addr += inc) { + if (dtrace_strncmp(addr, substr, sublen) == 0) { + if (subr != DIF_SUBR_STRSTR) { + /* + * As D index() and rindex() are + * modeled on Perl (and not on awk), + * we return a zero-based (and not a + * one-based) index. 
(For you Perl + * weenies: no, we're not going to add + * $[ -- and shouldn't you be at a con + * or something?) + */ + regs[rd] = (uintptr_t)(addr - orig); + break; + } + + ASSERT(subr == DIF_SUBR_STRSTR); + regs[rd] = (uintptr_t)addr; + break; + } + } + + break; + } + + case DIF_SUBR_STRTOK: { + uintptr_t addr = tupregs[0].dttk_value; + uintptr_t tokaddr = tupregs[1].dttk_value; + uint64_t size = state->dts_options[DTRACEOPT_STRSIZE]; + uintptr_t limit, toklimit; + size_t clim; + uint8_t c = 0, tokmap[32]; /* 256 / 8 */ + char *dest = (char *)mstate->dtms_scratch_ptr; + int i; + + /* + * Check both the token buffer and (later) the input buffer, + * since both could be non-scratch addresses. + */ + if (!dtrace_strcanload(tokaddr, size, &clim, mstate, vstate)) { + regs[rd] = 0; + break; + } + toklimit = tokaddr + clim; + + if (!DTRACE_INSCRATCH(mstate, size)) { + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH); + regs[rd] = 0; + break; + } + + if (addr == 0) { + /* + * If the address specified is NULL, we use our saved + * strtok pointer from the mstate. Note that this + * means that the saved strtok pointer is _only_ + * valid within multiple enablings of the same probe -- + * it behaves like an implicit clause-local variable. + */ + addr = mstate->dtms_strtok; + limit = mstate->dtms_strtok_limit; + } else { + /* + * If the user-specified address is non-NULL we must + * access check it. This is the only time we have + * a chance to do so, since this address may reside + * in the string table of this clause-- future calls + * (when we fetch addr from mstate->dtms_strtok) + * would fail this access check. + */ + if (!dtrace_strcanload(addr, size, &clim, mstate, + vstate)) { + regs[rd] = 0; + break; + } + limit = addr + clim; + } + + /* + * First, zero the token map, and then process the token + * string -- setting a bit in the map for every character + * found in the token string. + */ + for (i = 0; i < sizeof (tokmap); i++) + tokmap[i] = 0; + + for (; tokaddr < toklimit; tokaddr++) { + if ((c = dtrace_load8(tokaddr)) == '\0') + break; + + ASSERT((c >> 3) < sizeof (tokmap)); + tokmap[c >> 3] |= (1 << (c & 0x7)); + } + + for (; addr < limit; addr++) { + /* + * We're looking for a character that is _not_ + * contained in the token string. + */ + if ((c = dtrace_load8(addr)) == '\0') + break; + + if (!(tokmap[c >> 3] & (1 << (c & 0x7)))) + break; + } + + if (c == '\0') { + /* + * We reached the end of the string without finding + * any character that was not in the token string. + * We return NULL in this case, and we set the saved + * address to NULL as well. + */ + regs[rd] = 0; + mstate->dtms_strtok = 0; + mstate->dtms_strtok_limit = 0; + break; + } + + /* + * From here on, we're copying into the destination string. 
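+		 * (tokmap is a 256-bit set; for a token character c,
+		 * bit (c & 0x7) of tokmap[c >> 3] is set. With the
+		 * token string "/", for instance, '/' is 0x2f, so bit 7
+		 * of tokmap[5] is set and the copy below stops at the
+		 * first '/' or at the terminating NUL.)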
+ */ + for (i = 0; addr < limit && i < size - 1; addr++) { + if ((c = dtrace_load8(addr)) == '\0') + break; + + if (tokmap[c >> 3] & (1 << (c & 0x7))) + break; + + ASSERT(i < size); + dest[i++] = c; + } + + ASSERT(i < size); + dest[i] = '\0'; + regs[rd] = (uintptr_t)dest; + mstate->dtms_scratch_ptr += size; + mstate->dtms_strtok = addr; + mstate->dtms_strtok_limit = limit; + break; + } + + case DIF_SUBR_SUBSTR: { + uintptr_t s = tupregs[0].dttk_value; + uint64_t size = state->dts_options[DTRACEOPT_STRSIZE]; + char *d = (char *)mstate->dtms_scratch_ptr; + int64_t index = (int64_t)tupregs[1].dttk_value; + int64_t remaining = (int64_t)tupregs[2].dttk_value; + size_t len = dtrace_strlen((char *)s, size); + int64_t i; + + if (!dtrace_canload(s, len + 1, mstate, vstate)) { + regs[rd] = 0; + break; + } + + if (!DTRACE_INSCRATCH(mstate, size)) { + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH); + regs[rd] = 0; + break; + } + + if (nargs <= 2) + remaining = (int64_t)size; + + if (index < 0) { + index += len; + + if (index < 0 && index + remaining > 0) { + remaining += index; + index = 0; + } + } + + if (index >= len || index < 0) { + remaining = 0; + } else if (remaining < 0) { + remaining += len - index; + } else if (index + remaining > size) { + remaining = size - index; + } + + for (i = 0; i < remaining; i++) { + if ((d[i] = dtrace_load8(s + index + i)) == '\0') + break; + } + + d[i] = '\0'; + + mstate->dtms_scratch_ptr += size; + regs[rd] = (uintptr_t)d; + break; + } + + case DIF_SUBR_JSON: { + uint64_t size = state->dts_options[DTRACEOPT_STRSIZE]; + uintptr_t json = tupregs[0].dttk_value; + size_t jsonlen = dtrace_strlen((char *)json, size); + uintptr_t elem = tupregs[1].dttk_value; + size_t elemlen = dtrace_strlen((char *)elem, size); + + char *dest = (char *)mstate->dtms_scratch_ptr; + char *elemlist = (char *)mstate->dtms_scratch_ptr + jsonlen + 1; + char *ee = elemlist; + int nelems = 1; + uintptr_t cur; + + if (!dtrace_canload(json, jsonlen + 1, mstate, vstate) || + !dtrace_canload(elem, elemlen + 1, mstate, vstate)) { + regs[rd] = 0; + break; + } + + if (!DTRACE_INSCRATCH(mstate, jsonlen + 1 + elemlen + 1)) { + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH); + regs[rd] = 0; + break; + } + + /* + * Read the element selector and split it up into a packed list + * of strings. + */ + for (cur = elem; cur < elem + elemlen; cur++) { + char cc = dtrace_load8(cur); + + if (cur == elem && cc == '[') { + /* + * If the first element selector key is + * actually an array index then ignore the + * bracket. + */ + continue; + } + + if (cc == ']') + continue; + + if (cc == '.' 
|| cc == '[') { + nelems++; + cc = '\0'; + } + + *ee++ = cc; + } + *ee++ = '\0'; + + if ((regs[rd] = (uintptr_t)dtrace_json(size, json, elemlist, + nelems, dest)) != 0) + mstate->dtms_scratch_ptr += jsonlen + 1; + break; + } + + case DIF_SUBR_TOUPPER: + case DIF_SUBR_TOLOWER: { + uintptr_t s = tupregs[0].dttk_value; + uint64_t size = state->dts_options[DTRACEOPT_STRSIZE]; + char *dest = (char *)mstate->dtms_scratch_ptr, c; + size_t len = dtrace_strlen((char *)s, size); + char lower, upper, convert; + int64_t i; + + if (subr == DIF_SUBR_TOUPPER) { + lower = 'a'; + upper = 'z'; + convert = 'A'; + } else { + lower = 'A'; + upper = 'Z'; + convert = 'a'; + } + + if (!dtrace_canload(s, len + 1, mstate, vstate)) { + regs[rd] = 0; + break; + } + + if (!DTRACE_INSCRATCH(mstate, size)) { + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH); + regs[rd] = 0; + break; + } + + for (i = 0; i < size - 1; i++) { + if ((c = dtrace_load8(s + i)) == '\0') + break; + + if (c >= lower && c <= upper) + c = convert + (c - lower); + + dest[i] = c; + } + + ASSERT(i < size); + dest[i] = '\0'; + regs[rd] = (uintptr_t)dest; + mstate->dtms_scratch_ptr += size; + break; + } + +#ifdef illumos + case DIF_SUBR_GETMAJOR: +#ifdef _LP64 + regs[rd] = (tupregs[0].dttk_value >> NBITSMINOR64) & MAXMAJ64; +#else + regs[rd] = (tupregs[0].dttk_value >> NBITSMINOR) & MAXMAJ; +#endif + break; + + case DIF_SUBR_GETMINOR: +#ifdef _LP64 + regs[rd] = tupregs[0].dttk_value & MAXMIN64; +#else + regs[rd] = tupregs[0].dttk_value & MAXMIN; +#endif + break; + + case DIF_SUBR_DDI_PATHNAME: { + /* + * This one is a galactic mess. We are going to roughly + * emulate ddi_pathname(), but it's made more complicated + * by the fact that we (a) want to include the minor name and + * (b) must proceed iteratively instead of recursively. + */ + uintptr_t dest = mstate->dtms_scratch_ptr; + uint64_t size = state->dts_options[DTRACEOPT_STRSIZE]; + char *start = (char *)dest, *end = start + size - 1; + uintptr_t daddr = tupregs[0].dttk_value; + int64_t minor = (int64_t)tupregs[1].dttk_value; + char *s; + int i, len, depth = 0; + + /* + * Due to all the pointer jumping we do and context we must + * rely upon, we just mandate that the user must have kernel + * read privileges to use this routine. + */ + if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) == 0) { + *flags |= CPU_DTRACE_KPRIV; + *illval = daddr; + regs[rd] = 0; + } + + if (!DTRACE_INSCRATCH(mstate, size)) { + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH); + regs[rd] = 0; + break; + } + + *end = '\0'; + + /* + * We want to have a name for the minor. In order to do this, + * we need to walk the minor list from the devinfo. We want + * to be sure that we don't infinitely walk a circular list, + * so we check for circularity by sending a scout pointer + * ahead two elements for every element that we iterate over; + * if the list is circular, these will ultimately point to the + * same element. You may recognize this little trick as the + * answer to a stupid interview question -- one that always + * seems to be asked by those who had to have it laboriously + * explained to them, and who can't even concisely describe + * the conditions under which one would be forced to resort to + * this technique. Needless to say, those conditions are + * found here -- and probably only here. Is this the only use + * of this infamous trick in shipping, production code? If it + * isn't, it probably should be... 
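+		 * (For the record, the technique is two-pointer cycle
+		 * detection: the scout advances two elements for every
+		 * one that we advance, so on a circular list the two
+		 * pointers must eventually collide -- at which point we
+		 * abort with CPU_DTRACE_ILLOP instead of spinning
+		 * forever in probe context.)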
+ */ + if (minor != -1) { + uintptr_t maddr = dtrace_loadptr(daddr + + offsetof(struct dev_info, devi_minor)); + + uintptr_t next = offsetof(struct ddi_minor_data, next); + uintptr_t name = offsetof(struct ddi_minor_data, + d_minor) + offsetof(struct ddi_minor, name); + uintptr_t dev = offsetof(struct ddi_minor_data, + d_minor) + offsetof(struct ddi_minor, dev); + uintptr_t scout; + + if (maddr != NULL) + scout = dtrace_loadptr(maddr + next); + + while (maddr != NULL && !(*flags & CPU_DTRACE_FAULT)) { + uint64_t m; +#ifdef _LP64 + m = dtrace_load64(maddr + dev) & MAXMIN64; +#else + m = dtrace_load32(maddr + dev) & MAXMIN; +#endif + if (m != minor) { + maddr = dtrace_loadptr(maddr + next); + + if (scout == NULL) + continue; + + scout = dtrace_loadptr(scout + next); + + if (scout == NULL) + continue; + + scout = dtrace_loadptr(scout + next); + + if (scout == NULL) + continue; + + if (scout == maddr) { + *flags |= CPU_DTRACE_ILLOP; + break; + } + + continue; + } + + /* + * We have the minor data. Now we need to + * copy the minor's name into the end of the + * pathname. + */ + s = (char *)dtrace_loadptr(maddr + name); + len = dtrace_strlen(s, size); + + if (*flags & CPU_DTRACE_FAULT) + break; + + if (len != 0) { + if ((end -= (len + 1)) < start) + break; + + *end = ':'; + } + + for (i = 1; i <= len; i++) + end[i] = dtrace_load8((uintptr_t)s++); + break; + } + } + + while (daddr != NULL && !(*flags & CPU_DTRACE_FAULT)) { + ddi_node_state_t devi_state; + + devi_state = dtrace_load32(daddr + + offsetof(struct dev_info, devi_node_state)); + + if (*flags & CPU_DTRACE_FAULT) + break; + + if (devi_state >= DS_INITIALIZED) { + s = (char *)dtrace_loadptr(daddr + + offsetof(struct dev_info, devi_addr)); + len = dtrace_strlen(s, size); + + if (*flags & CPU_DTRACE_FAULT) + break; + + if (len != 0) { + if ((end -= (len + 1)) < start) + break; + + *end = '@'; + } + + for (i = 1; i <= len; i++) + end[i] = dtrace_load8((uintptr_t)s++); + } + + /* + * Now for the node name... + */ + s = (char *)dtrace_loadptr(daddr + + offsetof(struct dev_info, devi_node_name)); + + daddr = dtrace_loadptr(daddr + + offsetof(struct dev_info, devi_parent)); + + /* + * If our parent is NULL (that is, if we're the root + * node), we're going to use the special path + * "devices". + */ + if (daddr == 0) + s = "devices"; + + len = dtrace_strlen(s, size); + if (*flags & CPU_DTRACE_FAULT) + break; + + if ((end -= (len + 1)) < start) + break; + + for (i = 1; i <= len; i++) + end[i] = dtrace_load8((uintptr_t)s++); + *end = '/'; + + if (depth++ > dtrace_devdepth_max) { + *flags |= CPU_DTRACE_ILLOP; + break; + } + } + + if (end < start) + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH); + + if (daddr == 0) { + regs[rd] = (uintptr_t)end; + mstate->dtms_scratch_ptr += size; + } + + break; + } +#endif + + case DIF_SUBR_STRJOIN: { + char *d = (char *)mstate->dtms_scratch_ptr; + uint64_t size = state->dts_options[DTRACEOPT_STRSIZE]; + uintptr_t s1 = tupregs[0].dttk_value; + uintptr_t s2 = tupregs[1].dttk_value; + int i = 0, j = 0; + size_t lim1, lim2; + char c; + + if (!dtrace_strcanload(s1, size, &lim1, mstate, vstate) || + !dtrace_strcanload(s2, size, &lim2, mstate, vstate)) { + regs[rd] = 0; + break; + } + + if (!DTRACE_INSCRATCH(mstate, size)) { + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH); + regs[rd] = 0; + break; + } + + for (;;) { + if (i >= size) { + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH); + regs[rd] = 0; + break; + } + c = (i >= lim1) ? 
'\0' : dtrace_load8(s1++); + if ((d[i++] = c) == '\0') { + i--; + break; + } + } + + for (;;) { + if (i >= size) { + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH); + regs[rd] = 0; + break; + } + + c = (j++ >= lim2) ? '\0' : dtrace_load8(s2++); + if ((d[i++] = c) == '\0') + break; + } + + if (i < size) { + mstate->dtms_scratch_ptr += i; + regs[rd] = (uintptr_t)d; + } + + break; + } + + case DIF_SUBR_STRTOLL: { + uintptr_t s = tupregs[0].dttk_value; + uint64_t size = state->dts_options[DTRACEOPT_STRSIZE]; + size_t lim; + int base = 10; + + if (nargs > 1) { + if ((base = tupregs[1].dttk_value) <= 1 || + base > ('z' - 'a' + 1) + ('9' - '0' + 1)) { + *flags |= CPU_DTRACE_ILLOP; + break; + } + } + + if (!dtrace_strcanload(s, size, &lim, mstate, vstate)) { + regs[rd] = INT64_MIN; + break; + } + + regs[rd] = dtrace_strtoll((char *)s, base, lim); + break; + } + + case DIF_SUBR_LLTOSTR: { + int64_t i = (int64_t)tupregs[0].dttk_value; + uint64_t val, digit; + uint64_t size = 65; /* enough room for 2^64 in binary */ + char *end = (char *)mstate->dtms_scratch_ptr + size - 1; + int base = 10; + + if (nargs > 1) { + if ((base = tupregs[1].dttk_value) <= 1 || + base > ('z' - 'a' + 1) + ('9' - '0' + 1)) { + *flags |= CPU_DTRACE_ILLOP; + break; + } + } + + val = (base == 10 && i < 0) ? i * -1 : i; + + if (!DTRACE_INSCRATCH(mstate, size)) { + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH); + regs[rd] = 0; + break; + } + + for (*end-- = '\0'; val; val /= base) { + if ((digit = val % base) <= '9' - '0') { + *end-- = '0' + digit; + } else { + *end-- = 'a' + (digit - ('9' - '0') - 1); + } + } + + if (i == 0 && base == 16) + *end-- = '0'; + + if (base == 16) + *end-- = 'x'; + + if (i == 0 || base == 8 || base == 16) + *end-- = '0'; + + if (i < 0 && base == 10) + *end-- = '-'; + + regs[rd] = (uintptr_t)end + 1; + mstate->dtms_scratch_ptr += size; + break; + } + + case DIF_SUBR_HTONS: + case DIF_SUBR_NTOHS: +#if BYTE_ORDER == BIG_ENDIAN + regs[rd] = (uint16_t)tupregs[0].dttk_value; +#else + regs[rd] = DT_BSWAP_16((uint16_t)tupregs[0].dttk_value); +#endif + break; + + + case DIF_SUBR_HTONL: + case DIF_SUBR_NTOHL: +#if BYTE_ORDER == BIG_ENDIAN + regs[rd] = (uint32_t)tupregs[0].dttk_value; +#else + regs[rd] = DT_BSWAP_32((uint32_t)tupregs[0].dttk_value); +#endif + break; + + + case DIF_SUBR_HTONLL: + case DIF_SUBR_NTOHLL: +#if BYTE_ORDER == BIG_ENDIAN + regs[rd] = (uint64_t)tupregs[0].dttk_value; +#else + regs[rd] = DT_BSWAP_64((uint64_t)tupregs[0].dttk_value); +#endif + break; + + + case DIF_SUBR_DIRNAME: + case DIF_SUBR_BASENAME: { + char *dest = (char *)mstate->dtms_scratch_ptr; + uint64_t size = state->dts_options[DTRACEOPT_STRSIZE]; + uintptr_t src = tupregs[0].dttk_value; + int i, j, len = dtrace_strlen((char *)src, size); + int lastbase = -1, firstbase = -1, lastdir = -1; + int start, end; + + if (!dtrace_canload(src, len + 1, mstate, vstate)) { + regs[rd] = 0; + break; + } + + if (!DTRACE_INSCRATCH(mstate, size)) { + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH); + regs[rd] = 0; + break; + } + + /* + * The basename and dirname for a zero-length string is + * defined to be "." + */ + if (len == 0) { + len = 1; + src = (uintptr_t)"."; + } + + /* + * Start from the back of the string, moving back toward the + * front until we see a character that isn't a slash. That + * character is the last character in the basename. 
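+		 * For example, given src = "/foo/bar//", the two trailing
+		 * slashes are skipped and lastbase indexes the final 'r';
+		 * "bar" is the basename and "/foo" the dirname.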
+ */ + for (i = len - 1; i >= 0; i--) { + if (dtrace_load8(src + i) != '/') + break; + } + + if (i >= 0) + lastbase = i; + + /* + * Starting from the last character in the basename, move + * towards the front until we find a slash. The character + * that we processed immediately before that is the first + * character in the basename. + */ + for (; i >= 0; i--) { + if (dtrace_load8(src + i) == '/') + break; + } + + if (i >= 0) + firstbase = i + 1; + + /* + * Now keep going until we find a non-slash character. That + * character is the last character in the dirname. + */ + for (; i >= 0; i--) { + if (dtrace_load8(src + i) != '/') + break; + } + + if (i >= 0) + lastdir = i; + + ASSERT(!(lastbase == -1 && firstbase != -1)); + ASSERT(!(firstbase == -1 && lastdir != -1)); + + if (lastbase == -1) { + /* + * We didn't find a non-slash character. We know that + * the length is non-zero, so the whole string must be + * slashes. In either the dirname or the basename + * case, we return '/'. + */ + ASSERT(firstbase == -1); + firstbase = lastbase = lastdir = 0; + } + + if (firstbase == -1) { + /* + * The entire string consists only of a basename + * component. If we're looking for dirname, we need + * to change our string to be just "."; if we're + * looking for a basename, we'll just set the first + * character of the basename to be 0. + */ + if (subr == DIF_SUBR_DIRNAME) { + ASSERT(lastdir == -1); + src = (uintptr_t)"."; + lastdir = 0; + } else { + firstbase = 0; + } + } + + if (subr == DIF_SUBR_DIRNAME) { + if (lastdir == -1) { + /* + * We know that we have a slash in the name -- + * or lastdir would be set to 0, above. And + * because lastdir is -1, we know that this + * slash must be the first character. (That + * is, the full string must be of the form + * "/basename".) In this case, the last + * character of the directory name is 0. + */ + lastdir = 0; + } + + start = 0; + end = lastdir; + } else { + ASSERT(subr == DIF_SUBR_BASENAME); + ASSERT(firstbase != -1 && lastbase != -1); + start = firstbase; + end = lastbase; + } + + for (i = start, j = 0; i <= end && j < size - 1; i++, j++) + dest[j] = dtrace_load8(src + i); + + dest[j] = '\0'; + regs[rd] = (uintptr_t)dest; + mstate->dtms_scratch_ptr += size; + break; + } + + case DIF_SUBR_GETF: { + uintptr_t fd = tupregs[0].dttk_value; + struct filedesc *fdp; + file_t *fp; + + if (!dtrace_priv_proc(state)) { + regs[rd] = 0; + break; + } + fdp = curproc->p_fd; + FILEDESC_SLOCK(fdp); + fp = fget_locked(fdp, fd); + mstate->dtms_getf = fp; + regs[rd] = (uintptr_t)fp; + FILEDESC_SUNLOCK(fdp); + break; + } + + case DIF_SUBR_CLEANPATH: { + char *dest = (char *)mstate->dtms_scratch_ptr, c; + uint64_t size = state->dts_options[DTRACEOPT_STRSIZE]; + uintptr_t src = tupregs[0].dttk_value; + size_t lim; + int i = 0, j = 0; +#ifdef illumos + zone_t *z; +#endif + + if (!dtrace_strcanload(src, size, &lim, mstate, vstate)) { + regs[rd] = 0; + break; + } + + if (!DTRACE_INSCRATCH(mstate, size)) { + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH); + regs[rd] = 0; + break; + } + + /* + * Move forward, loading each character. + */ + do { + c = (i >= lim) ? '\0' : dtrace_load8(src + i++); +next: + if (j + 5 >= size) /* 5 = strlen("/..c\0") */ + break; + + if (c != '/') { + dest[j++] = c; + continue; + } + + c = (i >= lim) ? '\0' : dtrace_load8(src + i++); + + if (c == '/') { + /* + * We have two slashes -- we can just advance + * to the next character. + */ + goto next; + } + + if (c != '.') { + /* + * This is not "." and it's not ".." 
-- we can + * just store the "/" and this character and + * drive on. + */ + dest[j++] = '/'; + dest[j++] = c; + continue; + } + + c = (i >= lim) ? '\0' : dtrace_load8(src + i++); + + if (c == '/') { + /* + * This is a "/./" component. We're not going + * to store anything in the destination buffer; + * we're just going to go to the next component. + */ + goto next; + } + + if (c != '.') { + /* + * This is not ".." -- we can just store the + * "/." and this character and continue + * processing. + */ + dest[j++] = '/'; + dest[j++] = '.'; + dest[j++] = c; + continue; + } + + c = (i >= lim) ? '\0' : dtrace_load8(src + i++); + + if (c != '/' && c != '\0') { + /* + * This is not ".." -- it's "..[mumble]". + * We'll store the "/.." and this character + * and continue processing. + */ + dest[j++] = '/'; + dest[j++] = '.'; + dest[j++] = '.'; + dest[j++] = c; + continue; + } + + /* + * This is "/../" or "/..\0". We need to back up + * our destination pointer until we find a "/". + */ + i--; + while (j != 0 && dest[--j] != '/') + continue; + + if (c == '\0') + dest[++j] = '/'; + } while (c != '\0'); + + dest[j] = '\0'; + +#ifdef illumos + if (mstate->dtms_getf != NULL && + !(mstate->dtms_access & DTRACE_ACCESS_KERNEL) && + (z = state->dts_cred.dcr_cred->cr_zone) != kcred->cr_zone) { + /* + * If we've done a getf() as a part of this ECB and we + * don't have kernel access (and we're not in the global + * zone), check if the path we cleaned up begins with + * the zone's root path, and trim it off if so. Note + * that this is an output cleanliness issue, not a + * security issue: knowing one's zone root path does + * not enable privilege escalation. + */ + if (strstr(dest, z->zone_rootpath) == dest) + dest += strlen(z->zone_rootpath) - 1; + } +#endif + + regs[rd] = (uintptr_t)dest; + mstate->dtms_scratch_ptr += size; + break; + } + + case DIF_SUBR_INET_NTOA: + case DIF_SUBR_INET_NTOA6: + case DIF_SUBR_INET_NTOP: { + size_t size; + int af, argi, i; + char *base, *end; + + if (subr == DIF_SUBR_INET_NTOP) { + af = (int)tupregs[0].dttk_value; + argi = 1; + } else { + af = subr == DIF_SUBR_INET_NTOA ? AF_INET: AF_INET6; + argi = 0; + } + + if (af == AF_INET) { + ipaddr_t ip4; + uint8_t *ptr8, val; + + if (!dtrace_canload(tupregs[argi].dttk_value, + sizeof (ipaddr_t), mstate, vstate)) { + regs[rd] = 0; + break; + } + + /* + * Safely load the IPv4 address. + */ + ip4 = dtrace_load32(tupregs[argi].dttk_value); + + /* + * Check an IPv4 string will fit in scratch. + */ + size = INET_ADDRSTRLEN; + if (!DTRACE_INSCRATCH(mstate, size)) { + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH); + regs[rd] = 0; + break; + } + base = (char *)mstate->dtms_scratch_ptr; + end = (char *)mstate->dtms_scratch_ptr + size - 1; + + /* + * Stringify as a dotted decimal quad. + */ + *end-- = '\0'; + ptr8 = (uint8_t *)&ip4; + for (i = 3; i >= 0; i--) { + val = ptr8[i]; + + if (val == 0) { + *end-- = '0'; + } else { + for (; val; val /= 10) { + *end-- = '0' + (val % 10); + } + } + + if (i > 0) + *end-- = '.'; + } + ASSERT(end + 1 >= base); + + } else if (af == AF_INET6) { + struct in6_addr ip6; + int firstzero, tryzero, numzero, v6end; + uint16_t val; + const char digits[] = "0123456789abcdef"; + + /* + * Stringify using RFC 1884 convention 2 - 16 bit + * hexadecimal values with a zero-run compression. + * Lower case hexadecimal digits are used. + * eg, fe80::214:4fff:fe0b:76c8. + * The IPv4 embedded form is returned for inet_ntop, + * just the IPv4 string is returned for inet_ntoa6. 
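+			 * For example, a v4-mapped address is rendered
+			 * as "::ffff:192.0.2.1" by inet_ntop(AF_INET6,
+			 * ...), while inet_ntoa6() of the same address
+			 * yields just "192.0.2.1".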
+ */
+
+ if (!dtrace_canload(tupregs[argi].dttk_value,
+ sizeof (struct in6_addr), mstate, vstate)) {
+ regs[rd] = 0;
+ break;
+ }
+
+ /*
+ * Safely load the IPv6 address.
+ */
+ dtrace_bcopy(
+ (void *)(uintptr_t)tupregs[argi].dttk_value,
+ (void *)(uintptr_t)&ip6, sizeof (struct in6_addr));
+
+ /*
+ * Check an IPv6 string will fit in scratch.
+ */
+ size = INET6_ADDRSTRLEN;
+ if (!DTRACE_INSCRATCH(mstate, size)) {
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
+ regs[rd] = 0;
+ break;
+ }
+ base = (char *)mstate->dtms_scratch_ptr;
+ end = (char *)mstate->dtms_scratch_ptr + size - 1;
+ *end-- = '\0';
+
+ /*
+ * Find the longest run of 16 bit zero values
+ * for the single allowed zero compression - "::".
+ */
+ firstzero = -1;
+ tryzero = -1;
+ numzero = 1;
+ for (i = 0; i < sizeof (struct in6_addr); i++) {
+#ifdef illumos
+ if (ip6._S6_un._S6_u8[i] == 0 &&
+#else
+ if (ip6.__u6_addr.__u6_addr8[i] == 0 &&
+#endif
+ tryzero == -1 && i % 2 == 0) {
+ tryzero = i;
+ continue;
+ }
+
+ if (tryzero != -1 &&
+#ifdef illumos
+ (ip6._S6_un._S6_u8[i] != 0 ||
+#else
+ (ip6.__u6_addr.__u6_addr8[i] != 0 ||
+#endif
+ i == sizeof (struct in6_addr) - 1)) {
+
+ if (i - tryzero <= numzero) {
+ tryzero = -1;
+ continue;
+ }
+
+ firstzero = tryzero;
+ numzero = i - i % 2 - tryzero;
+ tryzero = -1;
+
+#ifdef illumos
+ if (ip6._S6_un._S6_u8[i] == 0 &&
+#else
+ if (ip6.__u6_addr.__u6_addr8[i] == 0 &&
+#endif
+ i == sizeof (struct in6_addr) - 1)
+ numzero += 2;
+ }
+ }
+ ASSERT(firstzero + numzero <= sizeof (struct in6_addr));
+
+ /*
+ * Check for an IPv4 embedded address.
+ */
+ v6end = sizeof (struct in6_addr) - 2;
+ if (IN6_IS_ADDR_V4MAPPED(&ip6) ||
+ IN6_IS_ADDR_V4COMPAT(&ip6)) {
+ for (i = sizeof (struct in6_addr) - 1;
+ i >= DTRACE_V4MAPPED_OFFSET; i--) {
+ ASSERT(end >= base);
+
+#ifdef illumos
+ val = ip6._S6_un._S6_u8[i];
+#else
+ val = ip6.__u6_addr.__u6_addr8[i];
+#endif
+
+ if (val == 0) {
+ *end-- = '0';
+ } else {
+ for (; val; val /= 10) {
+ *end-- = '0' + val % 10;
+ }
+ }
+
+ if (i > DTRACE_V4MAPPED_OFFSET)
+ *end-- = '.';
+ }
+
+ if (subr == DIF_SUBR_INET_NTOA6)
+ goto inetout;
+
+ /*
+ * Set v6end to skip the IPv4 address that
+ * we have already stringified.
+ */
+ v6end = 10;
+ }
+
+ /*
+ * Build the IPv6 string by working through the
+ * address in reverse.
+ */
+ for (i = v6end; i >= 0; i -= 2) {
+ ASSERT(end >= base);
+
+ if (i == firstzero + numzero - 2) {
+ *end-- = ':';
+ *end-- = ':';
+ i -= numzero - 2;
+ continue;
+ }
+
+ if (i < 14 && i != firstzero - 2)
+ *end-- = ':';
+
+#ifdef illumos
+ val = (ip6._S6_un._S6_u8[i] << 8) +
+ ip6._S6_un._S6_u8[i + 1];
+#else
+ val = (ip6.__u6_addr.__u6_addr8[i] << 8) +
+ ip6.__u6_addr.__u6_addr8[i + 1];
+#endif
+
+ if (val == 0) {
+ *end-- = '0';
+ } else {
+ for (; val; val /= 16) {
+ *end-- = digits[val % 16];
+ }
+ }
+ }
+ ASSERT(end + 1 >= base);
+
+ } else {
+ /*
+ * The user didn't use AF_INET or AF_INET6.
+ */ + DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP); + regs[rd] = 0; + break; + } + +inetout: regs[rd] = (uintptr_t)end + 1; + mstate->dtms_scratch_ptr += size; + break; + } + + case DIF_SUBR_MEMREF: { + uintptr_t size = 2 * sizeof(uintptr_t); + uintptr_t *memref = (uintptr_t *) P2ROUNDUP(mstate->dtms_scratch_ptr, sizeof(uintptr_t)); + size_t scratch_size = ((uintptr_t) memref - mstate->dtms_scratch_ptr) + size; + + /* address and length */ + memref[0] = tupregs[0].dttk_value; + memref[1] = tupregs[1].dttk_value; + + regs[rd] = (uintptr_t) memref; + mstate->dtms_scratch_ptr += scratch_size; + break; + } + +#ifndef illumos + case DIF_SUBR_MEMSTR: { + char *str = (char *)mstate->dtms_scratch_ptr; + uintptr_t mem = tupregs[0].dttk_value; + char c = tupregs[1].dttk_value; + size_t size = tupregs[2].dttk_value; + uint8_t n; + int i; + + regs[rd] = 0; + + if (size == 0) + break; + + if (!dtrace_canload(mem, size - 1, mstate, vstate)) + break; + + if (!DTRACE_INSCRATCH(mstate, size)) { + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH); + break; + } + + if (dtrace_memstr_max != 0 && size > dtrace_memstr_max) { + *flags |= CPU_DTRACE_ILLOP; + break; + } + + for (i = 0; i < size - 1; i++) { + n = dtrace_load8(mem++); + str[i] = (n == 0) ? c : n; + } + str[size - 1] = 0; + + regs[rd] = (uintptr_t)str; + mstate->dtms_scratch_ptr += size; + break; + } +#endif + } +} + +/* + * Emulate the execution of DTrace IR instructions specified by the given + * DIF object. This function is deliberately void of assertions as all of + * the necessary checks are handled by a call to dtrace_difo_validate(). + */ +static uint64_t +dtrace_dif_emulate(dtrace_difo_t *difo, dtrace_mstate_t *mstate, + dtrace_vstate_t *vstate, dtrace_state_t *state) +{ + const dif_instr_t *text = difo->dtdo_buf; + const uint_t textlen = difo->dtdo_len; + const char *strtab = difo->dtdo_strtab; + const uint64_t *inttab = difo->dtdo_inttab; + + uint64_t rval = 0; + dtrace_statvar_t *svar; + dtrace_dstate_t *dstate = &vstate->dtvs_dynvars; + dtrace_difv_t *v; + volatile uint16_t *flags = &cpu_core[curcpu].cpuc_dtrace_flags; + volatile uintptr_t *illval = &cpu_core[curcpu].cpuc_dtrace_illval; + + dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */ + uint64_t regs[DIF_DIR_NREGS]; + uint64_t *tmp; + + uint8_t cc_n = 0, cc_z = 0, cc_v = 0, cc_c = 0; + int64_t cc_r; + uint_t pc = 0, id, opc = 0; + uint8_t ttop = 0; + dif_instr_t instr; + uint_t r1, r2, rd; + + /* + * We stash the current DIF object into the machine state: we need it + * for subsequent access checking. 
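+ * (The loop below is then a straightforward fetch-decode-execute
+ * interpreter: DIF_INSTR_OP() decodes the opcode, r1, r2 and rd name
+ * the operand registers, and %r0 is hardwired to zero.)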
+ */ + mstate->dtms_difo = difo; + + regs[DIF_REG_R0] = 0; /* %r0 is fixed at zero */ + + while (pc < textlen && !(*flags & CPU_DTRACE_FAULT)) { + opc = pc; + + instr = text[pc++]; + r1 = DIF_INSTR_R1(instr); + r2 = DIF_INSTR_R2(instr); + rd = DIF_INSTR_RD(instr); + + switch (DIF_INSTR_OP(instr)) { + case DIF_OP_OR: + regs[rd] = regs[r1] | regs[r2]; + break; + case DIF_OP_XOR: + regs[rd] = regs[r1] ^ regs[r2]; + break; + case DIF_OP_AND: + regs[rd] = regs[r1] & regs[r2]; + break; + case DIF_OP_SLL: + regs[rd] = regs[r1] << regs[r2]; + break; + case DIF_OP_SRL: + regs[rd] = regs[r1] >> regs[r2]; + break; + case DIF_OP_SUB: + regs[rd] = regs[r1] - regs[r2]; + break; + case DIF_OP_ADD: + regs[rd] = regs[r1] + regs[r2]; + break; + case DIF_OP_MUL: + regs[rd] = regs[r1] * regs[r2]; + break; + case DIF_OP_SDIV: + if (regs[r2] == 0) { + regs[rd] = 0; + *flags |= CPU_DTRACE_DIVZERO; + } else { + regs[rd] = (int64_t)regs[r1] / + (int64_t)regs[r2]; + } + break; + + case DIF_OP_UDIV: + if (regs[r2] == 0) { + regs[rd] = 0; + *flags |= CPU_DTRACE_DIVZERO; + } else { + regs[rd] = regs[r1] / regs[r2]; + } + break; + + case DIF_OP_SREM: + if (regs[r2] == 0) { + regs[rd] = 0; + *flags |= CPU_DTRACE_DIVZERO; + } else { + regs[rd] = (int64_t)regs[r1] % + (int64_t)regs[r2]; + } + break; + + case DIF_OP_UREM: + if (regs[r2] == 0) { + regs[rd] = 0; + *flags |= CPU_DTRACE_DIVZERO; + } else { + regs[rd] = regs[r1] % regs[r2]; + } + break; + + case DIF_OP_NOT: + regs[rd] = ~regs[r1]; + break; + case DIF_OP_MOV: + regs[rd] = regs[r1]; + break; + case DIF_OP_CMP: + cc_r = regs[r1] - regs[r2]; + cc_n = cc_r < 0; + cc_z = cc_r == 0; + cc_v = 0; + cc_c = regs[r1] < regs[r2]; + break; + case DIF_OP_TST: + cc_n = cc_v = cc_c = 0; + cc_z = regs[r1] == 0; + break; + case DIF_OP_BA: + pc = DIF_INSTR_LABEL(instr); + break; + case DIF_OP_BE: + if (cc_z) + pc = DIF_INSTR_LABEL(instr); + break; + case DIF_OP_BNE: + if (cc_z == 0) + pc = DIF_INSTR_LABEL(instr); + break; + case DIF_OP_BG: + if ((cc_z | (cc_n ^ cc_v)) == 0) + pc = DIF_INSTR_LABEL(instr); + break; + case DIF_OP_BGU: + if ((cc_c | cc_z) == 0) + pc = DIF_INSTR_LABEL(instr); + break; + case DIF_OP_BGE: + if ((cc_n ^ cc_v) == 0) + pc = DIF_INSTR_LABEL(instr); + break; + case DIF_OP_BGEU: + if (cc_c == 0) + pc = DIF_INSTR_LABEL(instr); + break; + case DIF_OP_BL: + if (cc_n ^ cc_v) + pc = DIF_INSTR_LABEL(instr); + break; + case DIF_OP_BLU: + if (cc_c) + pc = DIF_INSTR_LABEL(instr); + break; + case DIF_OP_BLE: + if (cc_z | (cc_n ^ cc_v)) + pc = DIF_INSTR_LABEL(instr); + break; + case DIF_OP_BLEU: + if (cc_c | cc_z) + pc = DIF_INSTR_LABEL(instr); + break; + case DIF_OP_RLDSB: + if (!dtrace_canload(regs[r1], 1, mstate, vstate)) + break; + /*FALLTHROUGH*/ + case DIF_OP_LDSB: + regs[rd] = (int8_t)dtrace_load8(regs[r1]); + break; + case DIF_OP_RLDSH: + if (!dtrace_canload(regs[r1], 2, mstate, vstate)) + break; + /*FALLTHROUGH*/ + case DIF_OP_LDSH: + regs[rd] = (int16_t)dtrace_load16(regs[r1]); + break; + case DIF_OP_RLDSW: + if (!dtrace_canload(regs[r1], 4, mstate, vstate)) + break; + /*FALLTHROUGH*/ + case DIF_OP_LDSW: + regs[rd] = (int32_t)dtrace_load32(regs[r1]); + break; + case DIF_OP_RLDUB: + if (!dtrace_canload(regs[r1], 1, mstate, vstate)) + break; + /*FALLTHROUGH*/ + case DIF_OP_LDUB: + regs[rd] = dtrace_load8(regs[r1]); + break; + case DIF_OP_RLDUH: + if (!dtrace_canload(regs[r1], 2, mstate, vstate)) + break; + /*FALLTHROUGH*/ + case DIF_OP_LDUH: + regs[rd] = dtrace_load16(regs[r1]); + break; + case DIF_OP_RLDUW: + if (!dtrace_canload(regs[r1], 4, mstate, vstate)) + 
break; + /*FALLTHROUGH*/ + case DIF_OP_LDUW: + regs[rd] = dtrace_load32(regs[r1]); + break; + case DIF_OP_RLDX: + if (!dtrace_canload(regs[r1], 8, mstate, vstate)) + break; + /*FALLTHROUGH*/ + case DIF_OP_LDX: + regs[rd] = dtrace_load64(regs[r1]); + break; + case DIF_OP_ULDSB: + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); + regs[rd] = (int8_t) + dtrace_fuword8((void *)(uintptr_t)regs[r1]); + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); + break; + case DIF_OP_ULDSH: + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); + regs[rd] = (int16_t) + dtrace_fuword16((void *)(uintptr_t)regs[r1]); + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); + break; + case DIF_OP_ULDSW: + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); + regs[rd] = (int32_t) + dtrace_fuword32((void *)(uintptr_t)regs[r1]); + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); + break; + case DIF_OP_ULDUB: + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); + regs[rd] = + dtrace_fuword8((void *)(uintptr_t)regs[r1]); + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); + break; + case DIF_OP_ULDUH: + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); + regs[rd] = + dtrace_fuword16((void *)(uintptr_t)regs[r1]); + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); + break; + case DIF_OP_ULDUW: + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); + regs[rd] = + dtrace_fuword32((void *)(uintptr_t)regs[r1]); + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); + break; + case DIF_OP_ULDX: + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); + regs[rd] = + dtrace_fuword64((void *)(uintptr_t)regs[r1]); + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); + break; + case DIF_OP_RET: + rval = regs[rd]; + pc = textlen; + break; + case DIF_OP_NOP: + break; + case DIF_OP_SETX: + regs[rd] = inttab[DIF_INSTR_INTEGER(instr)]; + break; + case DIF_OP_SETS: + regs[rd] = (uint64_t)(uintptr_t) + (strtab + DIF_INSTR_STRING(instr)); + break; + case DIF_OP_SCMP: { + size_t sz = state->dts_options[DTRACEOPT_STRSIZE]; + uintptr_t s1 = regs[r1]; + uintptr_t s2 = regs[r2]; + size_t lim1, lim2; + + if (s1 != 0 && + !dtrace_strcanload(s1, sz, &lim1, mstate, vstate)) + break; + if (s2 != 0 && + !dtrace_strcanload(s2, sz, &lim2, mstate, vstate)) + break; + + cc_r = dtrace_strncmp((char *)s1, (char *)s2, + MIN(lim1, lim2)); + + cc_n = cc_r < 0; + cc_z = cc_r == 0; + cc_v = cc_c = 0; + break; + } + case DIF_OP_LDGA: + regs[rd] = dtrace_dif_variable(mstate, state, + r1, regs[r2]); + break; + case DIF_OP_LDGS: + id = DIF_INSTR_VAR(instr); + + if (id >= DIF_VAR_OTHER_UBASE) { + uintptr_t a; + + id -= DIF_VAR_OTHER_UBASE; + svar = vstate->dtvs_globals[id]; + ASSERT(svar != NULL); + v = &svar->dtsv_var; + + if (!(v->dtdv_type.dtdt_flags & DIF_TF_BYREF)) { + regs[rd] = svar->dtsv_data; + break; + } + + a = (uintptr_t)svar->dtsv_data; + + if (*(uint8_t *)a == UINT8_MAX) { + /* + * If the 0th byte is set to UINT8_MAX + * then this is to be treated as a + * reference to a NULL variable. 
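+ * (By-reference statics are laid out as this
+ * NULL-flag byte, padded to a uint64_t, followed
+ * by the data itself; the matching store is in
+ * DIF_OP_STGS below.)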
+ */ + regs[rd] = 0; + } else { + regs[rd] = a + sizeof (uint64_t); + } + + break; + } + + regs[rd] = dtrace_dif_variable(mstate, state, id, 0); + break; + + case DIF_OP_STGS: + id = DIF_INSTR_VAR(instr); + + ASSERT(id >= DIF_VAR_OTHER_UBASE); + id -= DIF_VAR_OTHER_UBASE; + + VERIFY(id < vstate->dtvs_nglobals); + svar = vstate->dtvs_globals[id]; + ASSERT(svar != NULL); + v = &svar->dtsv_var; + + if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) { + uintptr_t a = (uintptr_t)svar->dtsv_data; + size_t lim; + + ASSERT(a != 0); + ASSERT(svar->dtsv_size != 0); + + if (regs[rd] == 0) { + *(uint8_t *)a = UINT8_MAX; + break; + } else { + *(uint8_t *)a = 0; + a += sizeof (uint64_t); + } + if (!dtrace_vcanload( + (void *)(uintptr_t)regs[rd], &v->dtdv_type, + &lim, mstate, vstate)) + break; + + dtrace_vcopy((void *)(uintptr_t)regs[rd], + (void *)a, &v->dtdv_type, lim); + break; + } + + svar->dtsv_data = regs[rd]; + break; + + case DIF_OP_LDTA: + /* + * There are no DTrace built-in thread-local arrays at + * present. This opcode is saved for future work. + */ + *flags |= CPU_DTRACE_ILLOP; + regs[rd] = 0; + break; + + case DIF_OP_LDLS: + id = DIF_INSTR_VAR(instr); + + if (id < DIF_VAR_OTHER_UBASE) { + /* + * For now, this has no meaning. + */ + regs[rd] = 0; + break; + } + + id -= DIF_VAR_OTHER_UBASE; + + ASSERT(id < vstate->dtvs_nlocals); + ASSERT(vstate->dtvs_locals != NULL); + + svar = vstate->dtvs_locals[id]; + ASSERT(svar != NULL); + v = &svar->dtsv_var; + + if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) { + uintptr_t a = (uintptr_t)svar->dtsv_data; + size_t sz = v->dtdv_type.dtdt_size; + size_t lim; + + sz += sizeof (uint64_t); + ASSERT(svar->dtsv_size == NCPU * sz); + a += curcpu * sz; + + if (*(uint8_t *)a == UINT8_MAX) { + /* + * If the 0th byte is set to UINT8_MAX + * then this is to be treated as a + * reference to a NULL variable. 
+ */ + regs[rd] = 0; + } else { + regs[rd] = a + sizeof (uint64_t); + } + + break; + } + + ASSERT(svar->dtsv_size == NCPU * sizeof (uint64_t)); + tmp = (uint64_t *)(uintptr_t)svar->dtsv_data; + regs[rd] = tmp[curcpu]; + break; + + case DIF_OP_STLS: + id = DIF_INSTR_VAR(instr); + + ASSERT(id >= DIF_VAR_OTHER_UBASE); + id -= DIF_VAR_OTHER_UBASE; + VERIFY(id < vstate->dtvs_nlocals); + + ASSERT(vstate->dtvs_locals != NULL); + svar = vstate->dtvs_locals[id]; + ASSERT(svar != NULL); + v = &svar->dtsv_var; + + if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) { + uintptr_t a = (uintptr_t)svar->dtsv_data; + size_t sz = v->dtdv_type.dtdt_size; + size_t lim; + + sz += sizeof (uint64_t); + ASSERT(svar->dtsv_size == NCPU * sz); + a += curcpu * sz; + + if (regs[rd] == 0) { + *(uint8_t *)a = UINT8_MAX; + break; + } else { + *(uint8_t *)a = 0; + a += sizeof (uint64_t); + } + + if (!dtrace_vcanload( + (void *)(uintptr_t)regs[rd], &v->dtdv_type, + &lim, mstate, vstate)) + break; + + dtrace_vcopy((void *)(uintptr_t)regs[rd], + (void *)a, &v->dtdv_type, lim); + break; + } + + ASSERT(svar->dtsv_size == NCPU * sizeof (uint64_t)); + tmp = (uint64_t *)(uintptr_t)svar->dtsv_data; + tmp[curcpu] = regs[rd]; + break; + + case DIF_OP_LDTS: { + dtrace_dynvar_t *dvar; + dtrace_key_t *key; + + id = DIF_INSTR_VAR(instr); + ASSERT(id >= DIF_VAR_OTHER_UBASE); + id -= DIF_VAR_OTHER_UBASE; + v = &vstate->dtvs_tlocals[id]; + + key = &tupregs[DIF_DTR_NREGS]; + key[0].dttk_value = (uint64_t)id; + key[0].dttk_size = 0; + DTRACE_TLS_THRKEY(key[1].dttk_value); + key[1].dttk_size = 0; + + dvar = dtrace_dynvar(dstate, 2, key, + sizeof (uint64_t), DTRACE_DYNVAR_NOALLOC, + mstate, vstate); + + if (dvar == NULL) { + regs[rd] = 0; + break; + } + + if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) { + regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data; + } else { + regs[rd] = *((uint64_t *)dvar->dtdv_data); + } + + break; + } + + case DIF_OP_STTS: { + dtrace_dynvar_t *dvar; + dtrace_key_t *key; + + id = DIF_INSTR_VAR(instr); + ASSERT(id >= DIF_VAR_OTHER_UBASE); + id -= DIF_VAR_OTHER_UBASE; + VERIFY(id < vstate->dtvs_ntlocals); + + key = &tupregs[DIF_DTR_NREGS]; + key[0].dttk_value = (uint64_t)id; + key[0].dttk_size = 0; + DTRACE_TLS_THRKEY(key[1].dttk_value); + key[1].dttk_size = 0; + v = &vstate->dtvs_tlocals[id]; + + dvar = dtrace_dynvar(dstate, 2, key, + v->dtdv_type.dtdt_size > sizeof (uint64_t) ? + v->dtdv_type.dtdt_size : sizeof (uint64_t), + regs[rd] ? DTRACE_DYNVAR_ALLOC : + DTRACE_DYNVAR_DEALLOC, mstate, vstate); + + /* + * Given that we're storing to thread-local data, + * we need to flush our predicate cache. + */ + curthread->t_predcache = 0; + + if (dvar == NULL) + break; + + if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) { + size_t lim; + + if (!dtrace_vcanload( + (void *)(uintptr_t)regs[rd], + &v->dtdv_type, &lim, mstate, vstate)) + break; + + dtrace_vcopy((void *)(uintptr_t)regs[rd], + dvar->dtdv_data, &v->dtdv_type, lim); + } else { + *((uint64_t *)dvar->dtdv_data) = regs[rd]; + } + + break; + } + + case DIF_OP_SRA: + regs[rd] = (int64_t)regs[r1] >> regs[r2]; + break; + + case DIF_OP_CALL: + dtrace_dif_subr(DIF_INSTR_SUBR(instr), rd, + regs, tupregs, ttop, mstate, state); + break; + + case DIF_OP_PUSHTR: + if (ttop == DIF_DTR_NREGS) { + *flags |= CPU_DTRACE_TUPOFLOW; + break; + } + + if (r1 == DIF_TYPE_STRING) { + /* + * If this is a string type and the size is 0, + * we'll use the system-wide default string + * size. 
Note that we are _not_ looking at + * the value of the DTRACEOPT_STRSIZE option; + * had this been set, we would expect to have + * a non-zero size value in the "pushtr". + */ + tupregs[ttop].dttk_size = + dtrace_strlen((char *)(uintptr_t)regs[rd], + regs[r2] ? regs[r2] : + dtrace_strsize_default) + 1; + } else { + if (regs[r2] > LONG_MAX) { + *flags |= CPU_DTRACE_ILLOP; + break; + } + + tupregs[ttop].dttk_size = regs[r2]; + } + + tupregs[ttop++].dttk_value = regs[rd]; + break; + + case DIF_OP_PUSHTV: + if (ttop == DIF_DTR_NREGS) { + *flags |= CPU_DTRACE_TUPOFLOW; + break; + } + + tupregs[ttop].dttk_value = regs[rd]; + tupregs[ttop++].dttk_size = 0; + break; + + case DIF_OP_POPTS: + if (ttop != 0) + ttop--; + break; + + case DIF_OP_FLUSHTS: + ttop = 0; + break; + + case DIF_OP_LDGAA: + case DIF_OP_LDTAA: { + dtrace_dynvar_t *dvar; + dtrace_key_t *key = tupregs; + uint_t nkeys = ttop; + + id = DIF_INSTR_VAR(instr); + ASSERT(id >= DIF_VAR_OTHER_UBASE); + id -= DIF_VAR_OTHER_UBASE; + + key[nkeys].dttk_value = (uint64_t)id; + key[nkeys++].dttk_size = 0; + + if (DIF_INSTR_OP(instr) == DIF_OP_LDTAA) { + DTRACE_TLS_THRKEY(key[nkeys].dttk_value); + key[nkeys++].dttk_size = 0; + VERIFY(id < vstate->dtvs_ntlocals); + v = &vstate->dtvs_tlocals[id]; + } else { + VERIFY(id < vstate->dtvs_nglobals); + v = &vstate->dtvs_globals[id]->dtsv_var; + } + + dvar = dtrace_dynvar(dstate, nkeys, key, + v->dtdv_type.dtdt_size > sizeof (uint64_t) ? + v->dtdv_type.dtdt_size : sizeof (uint64_t), + DTRACE_DYNVAR_NOALLOC, mstate, vstate); + + if (dvar == NULL) { + regs[rd] = 0; + break; + } + + if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) { + regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data; + } else { + regs[rd] = *((uint64_t *)dvar->dtdv_data); + } + + break; + } + + case DIF_OP_STGAA: + case DIF_OP_STTAA: { + dtrace_dynvar_t *dvar; + dtrace_key_t *key = tupregs; + uint_t nkeys = ttop; + + id = DIF_INSTR_VAR(instr); + ASSERT(id >= DIF_VAR_OTHER_UBASE); + id -= DIF_VAR_OTHER_UBASE; + + key[nkeys].dttk_value = (uint64_t)id; + key[nkeys++].dttk_size = 0; + + if (DIF_INSTR_OP(instr) == DIF_OP_STTAA) { + DTRACE_TLS_THRKEY(key[nkeys].dttk_value); + key[nkeys++].dttk_size = 0; + VERIFY(id < vstate->dtvs_ntlocals); + v = &vstate->dtvs_tlocals[id]; + } else { + VERIFY(id < vstate->dtvs_nglobals); + v = &vstate->dtvs_globals[id]->dtsv_var; + } + + dvar = dtrace_dynvar(dstate, nkeys, key, + v->dtdv_type.dtdt_size > sizeof (uint64_t) ? + v->dtdv_type.dtdt_size : sizeof (uint64_t), + regs[rd] ? DTRACE_DYNVAR_ALLOC : + DTRACE_DYNVAR_DEALLOC, mstate, vstate); + + if (dvar == NULL) + break; + + if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) { + size_t lim; + + if (!dtrace_vcanload( + (void *)(uintptr_t)regs[rd], &v->dtdv_type, + &lim, mstate, vstate)) + break; + + dtrace_vcopy((void *)(uintptr_t)regs[rd], + dvar->dtdv_data, &v->dtdv_type, lim); + } else { + *((uint64_t *)dvar->dtdv_data) = regs[rd]; + } + + break; + } + + case DIF_OP_ALLOCS: { + uintptr_t ptr = P2ROUNDUP(mstate->dtms_scratch_ptr, 8); + size_t size = ptr - mstate->dtms_scratch_ptr + regs[r1]; + + /* + * Rounding up the user allocation size could have + * overflowed large, bogus allocations (like -1ULL) to + * 0. 
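+ * For example, one byte of alignment padding plus
+ * a request of (uint64_t)-1 yields size == 0, which
+ * the size < regs[r1] test below rejects.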
+ */ + if (size < regs[r1] || + !DTRACE_INSCRATCH(mstate, size)) { + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH); + regs[rd] = 0; + break; + } + + dtrace_bzero((void *) mstate->dtms_scratch_ptr, size); + mstate->dtms_scratch_ptr += size; + regs[rd] = ptr; + break; + } + + case DIF_OP_COPYS: + if (!dtrace_canstore(regs[rd], regs[r2], + mstate, vstate)) { + *flags |= CPU_DTRACE_BADADDR; + *illval = regs[rd]; + break; + } + + if (!dtrace_canload(regs[r1], regs[r2], mstate, vstate)) + break; + + dtrace_bcopy((void *)(uintptr_t)regs[r1], + (void *)(uintptr_t)regs[rd], (size_t)regs[r2]); + break; + + case DIF_OP_STB: + if (!dtrace_canstore(regs[rd], 1, mstate, vstate)) { + *flags |= CPU_DTRACE_BADADDR; + *illval = regs[rd]; + break; + } + *((uint8_t *)(uintptr_t)regs[rd]) = (uint8_t)regs[r1]; + break; + + case DIF_OP_STH: + if (!dtrace_canstore(regs[rd], 2, mstate, vstate)) { + *flags |= CPU_DTRACE_BADADDR; + *illval = regs[rd]; + break; + } + if (regs[rd] & 1) { + *flags |= CPU_DTRACE_BADALIGN; + *illval = regs[rd]; + break; + } + *((uint16_t *)(uintptr_t)regs[rd]) = (uint16_t)regs[r1]; + break; + + case DIF_OP_STW: + if (!dtrace_canstore(regs[rd], 4, mstate, vstate)) { + *flags |= CPU_DTRACE_BADADDR; + *illval = regs[rd]; + break; + } + if (regs[rd] & 3) { + *flags |= CPU_DTRACE_BADALIGN; + *illval = regs[rd]; + break; + } + *((uint32_t *)(uintptr_t)regs[rd]) = (uint32_t)regs[r1]; + break; + + case DIF_OP_STX: + if (!dtrace_canstore(regs[rd], 8, mstate, vstate)) { + *flags |= CPU_DTRACE_BADADDR; + *illval = regs[rd]; + break; + } + if (regs[rd] & 7) { + *flags |= CPU_DTRACE_BADALIGN; + *illval = regs[rd]; + break; + } + *((uint64_t *)(uintptr_t)regs[rd]) = regs[r1]; + break; + } + } + + if (!(*flags & CPU_DTRACE_FAULT)) + return (rval); + + mstate->dtms_fltoffs = opc * sizeof (dif_instr_t); + mstate->dtms_present |= DTRACE_MSTATE_FLTOFFS; + + return (0); +} + +static void +dtrace_action_breakpoint(dtrace_ecb_t *ecb) +{ + dtrace_probe_t *probe = ecb->dte_probe; + dtrace_provider_t *prov = probe->dtpr_provider; + char c[DTRACE_FULLNAMELEN + 80], *str; + char *msg = "dtrace: breakpoint action at probe "; + char *ecbmsg = " (ecb "; + uintptr_t mask = (0xf << (sizeof (uintptr_t) * NBBY / 4)); + uintptr_t val = (uintptr_t)ecb; + int shift = (sizeof (uintptr_t) * NBBY) - 4, i = 0; + + if (dtrace_destructive_disallow) + return; + + /* + * It's impossible to be taking action on the NULL probe. + */ + ASSERT(probe != NULL); + + /* + * This is a poor man's (destitute man's?) sprintf(): we want to + * print the provider name, module name, function name and name of + * the probe, along with the hex address of the ECB with the breakpoint + * action -- all of which we must place in the character buffer by + * hand. 
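+ * The result reads something like (with a made-up probe and ECB
+ * address): "dtrace: breakpoint action at probe
+ * fbt:kernel:vn_open:entry (ecb fffff80012345678)".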
+ */ + while (*msg != '\0') + c[i++] = *msg++; + + for (str = prov->dtpv_name; *str != '\0'; str++) + c[i++] = *str; + c[i++] = ':'; + + for (str = probe->dtpr_mod; *str != '\0'; str++) + c[i++] = *str; + c[i++] = ':'; + + for (str = probe->dtpr_func; *str != '\0'; str++) + c[i++] = *str; + c[i++] = ':'; + + for (str = probe->dtpr_name; *str != '\0'; str++) + c[i++] = *str; + + while (*ecbmsg != '\0') + c[i++] = *ecbmsg++; + + while (shift >= 0) { + mask = (uintptr_t)0xf << shift; + + if (val >= ((uintptr_t)1 << shift)) + c[i++] = "0123456789abcdef"[(val & mask) >> shift]; + shift -= 4; + } + + c[i++] = ')'; + c[i] = '\0'; + +#ifdef illumos + debug_enter(c); +#else + kdb_enter(KDB_WHY_DTRACE, "breakpoint action"); +#endif +} + +static void +dtrace_action_panic(dtrace_ecb_t *ecb) +{ + dtrace_probe_t *probe = ecb->dte_probe; + + /* + * It's impossible to be taking action on the NULL probe. + */ + ASSERT(probe != NULL); + + if (dtrace_destructive_disallow) + return; + + if (dtrace_panicked != NULL) + return; + + if (dtrace_casptr(&dtrace_panicked, NULL, curthread) != NULL) + return; + + /* + * We won the right to panic. (We want to be sure that only one + * thread calls panic() from dtrace_probe(), and that panic() is + * called exactly once.) + */ + dtrace_panic("dtrace: panic action at probe %s:%s:%s:%s (ecb %p)", + probe->dtpr_provider->dtpv_name, probe->dtpr_mod, + probe->dtpr_func, probe->dtpr_name, (void *)ecb); +} + +static void +dtrace_action_raise(uint64_t sig) +{ + if (dtrace_destructive_disallow) + return; + + if (sig >= NSIG) { + DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP); + return; + } + +#ifdef illumos + /* + * raise() has a queue depth of 1 -- we ignore all subsequent + * invocations of the raise() action. + */ + if (curthread->t_dtrace_sig == 0) + curthread->t_dtrace_sig = (uint8_t)sig; + + curthread->t_sig_check = 1; + aston(curthread); +#else + struct proc *p = curproc; + PROC_LOCK(p); + kern_psignal(p, sig); + PROC_UNLOCK(p); +#endif +} + +static void +dtrace_action_stop(void) +{ + if (dtrace_destructive_disallow) + return; + +#ifdef illumos + if (!curthread->t_dtrace_stop) { + curthread->t_dtrace_stop = 1; + curthread->t_sig_check = 1; + aston(curthread); + } +#else + struct proc *p = curproc; + PROC_LOCK(p); + kern_psignal(p, SIGSTOP); + PROC_UNLOCK(p); +#endif +} + +static void +dtrace_action_chill(dtrace_mstate_t *mstate, hrtime_t val) +{ + hrtime_t now; + volatile uint16_t *flags; +#ifdef illumos + cpu_t *cpu = CPU; +#else + cpu_t *cpu = &solaris_cpu[curcpu]; +#endif + + if (dtrace_destructive_disallow) + return; + + flags = (volatile uint16_t *)&cpu_core[curcpu].cpuc_dtrace_flags; + + now = dtrace_gethrtime(); + + if (now - cpu->cpu_dtrace_chillmark > dtrace_chill_interval) { + /* + * We need to advance the mark to the current time. + */ + cpu->cpu_dtrace_chillmark = now; + cpu->cpu_dtrace_chilled = 0; + } + + /* + * Now check to see if the requested chill time would take us over + * the maximum amount of time allowed in the chill interval. (Or + * worse, if the calculation itself induces overflow.) + */ + if (cpu->cpu_dtrace_chilled + val > dtrace_chill_max || + cpu->cpu_dtrace_chilled + val < cpu->cpu_dtrace_chilled) { + *flags |= CPU_DTRACE_ILLOP; + return; + } + + while (dtrace_gethrtime() - now < val) + continue; + + /* + * Normally, we assure that the value of the variable "timestamp" does + * not change within an ECB. The presence of chill() represents an + * exception to this rule, however. 
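+ * Clearing DTRACE_MSTATE_TIMESTAMP below forces the next read of
+ * "timestamp" in this ECB to fetch a fresh value after the spin.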
+ */ + mstate->dtms_present &= ~DTRACE_MSTATE_TIMESTAMP; + cpu->cpu_dtrace_chilled += val; +} + +static void +dtrace_action_ustack(dtrace_mstate_t *mstate, dtrace_state_t *state, + uint64_t *buf, uint64_t arg) +{ + int nframes = DTRACE_USTACK_NFRAMES(arg); + int strsize = DTRACE_USTACK_STRSIZE(arg); + uint64_t *pcs = &buf[1], *fps; + char *str = (char *)&pcs[nframes]; + int size, offs = 0, i, j; + size_t rem; + uintptr_t old = mstate->dtms_scratch_ptr, saved; + uint16_t *flags = &cpu_core[curcpu].cpuc_dtrace_flags; + char *sym; + + /* + * Should be taking a faster path if string space has not been + * allocated. + */ + ASSERT(strsize != 0); + + /* + * We will first allocate some temporary space for the frame pointers. + */ + fps = (uint64_t *)P2ROUNDUP(mstate->dtms_scratch_ptr, 8); + size = (uintptr_t)fps - mstate->dtms_scratch_ptr + + (nframes * sizeof (uint64_t)); + + if (!DTRACE_INSCRATCH(mstate, size)) { + /* + * Not enough room for our frame pointers -- need to indicate + * that we ran out of scratch space. + */ + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH); + return; + } + + mstate->dtms_scratch_ptr += size; + saved = mstate->dtms_scratch_ptr; + + /* + * Now get a stack with both program counters and frame pointers. + */ + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); + dtrace_getufpstack(buf, fps, nframes + 1); + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); + + /* + * If that faulted, we're cooked. + */ + if (*flags & CPU_DTRACE_FAULT) + goto out; + + /* + * Now we want to walk up the stack, calling the USTACK helper. For + * each iteration, we restore the scratch pointer. + */ + for (i = 0; i < nframes; i++) { + mstate->dtms_scratch_ptr = saved; + + if (offs >= strsize) + break; + + sym = (char *)(uintptr_t)dtrace_helper( + DTRACE_HELPER_ACTION_USTACK, + mstate, state, pcs[i], fps[i]); + + /* + * If we faulted while running the helper, we're going to + * clear the fault and null out the corresponding string. + */ + if (*flags & CPU_DTRACE_FAULT) { + *flags &= ~CPU_DTRACE_FAULT; + str[offs++] = '\0'; + continue; + } + + if (sym == NULL) { + str[offs++] = '\0'; + continue; + } + + if (!dtrace_strcanload((uintptr_t)sym, strsize, &rem, mstate, + &(state->dts_vstate))) { + str[offs++] = '\0'; + continue; + } + + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); + + /* + * Now copy in the string that the helper returned to us. + */ + for (j = 0; offs + j < strsize && j < rem; j++) { + if ((str[offs + j] = sym[j]) == '\0') + break; + } + + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); + + offs += j + 1; + } + + if (offs >= strsize) { + /* + * If we didn't have room for all of the strings, we don't + * abort processing -- this needn't be a fatal error -- but we + * still want to increment a counter (dts_stkstroverflows) to + * allow this condition to be warned about. (If this is from + * a jstack() action, it is easily tuned via jstackstrsize.) + */ + dtrace_error(&state->dts_stkstroverflows); + } + + while (offs < strsize) + str[offs++] = '\0'; + +out: + mstate->dtms_scratch_ptr = old; +} + +static void +dtrace_store_by_ref(dtrace_difo_t *dp, caddr_t tomax, size_t size, + size_t *valoffsp, uint64_t *valp, uint64_t end, int intuple, int dtkind) +{ + volatile uint16_t *flags; + uint64_t val = *valp; + size_t valoffs = *valoffsp; + + flags = (volatile uint16_t *)&cpu_core[curcpu].cpuc_dtrace_flags; + ASSERT(dtkind == DIF_TF_BYREF || dtkind == DIF_TF_BYUREF); + + /* + * If this is a string, we're going to only load until we find the zero + * byte -- after which we'll store zero bytes. 
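+ * For example, storing the string "ab" into an 8-byte record emits
+ * 'a', 'b' and six trailing zero bytes; within a tuple we instead
+ * stop after storing the first zero byte.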
+ */ + if (dp->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING) { + char c = '\0' + 1; + size_t s; + + for (s = 0; s < size; s++) { + if (c != '\0' && dtkind == DIF_TF_BYREF) { + c = dtrace_load8(val++); + } else if (c != '\0' && dtkind == DIF_TF_BYUREF) { + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); + c = dtrace_fuword8((void *)(uintptr_t)val++); + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); + if (*flags & CPU_DTRACE_FAULT) + break; + } + + DTRACE_STORE(uint8_t, tomax, valoffs++, c); + + if (c == '\0' && intuple) + break; + } + } else { + uint8_t c; + while (valoffs < end) { + if (dtkind == DIF_TF_BYREF) { + c = dtrace_load8(val++); + } else if (dtkind == DIF_TF_BYUREF) { + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); + c = dtrace_fuword8((void *)(uintptr_t)val++); + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); + if (*flags & CPU_DTRACE_FAULT) + break; + } + + DTRACE_STORE(uint8_t, tomax, + valoffs++, c); + } + } + + *valp = val; + *valoffsp = valoffs; +} + +/* + * Disables interrupts and sets the per-thread inprobe flag. When DEBUG is + * defined, we also assert that we are not recursing unless the probe ID is an + * error probe. + */ +static dtrace_icookie_t +dtrace_probe_enter(dtrace_id_t id) +{ + dtrace_icookie_t cookie; + + cookie = dtrace_interrupt_disable(); + + /* + * Unless this is an ERROR probe, we are not allowed to recurse in + * dtrace_probe(). Recursing into DTrace probe usually means that a + * function is instrumented that should not have been instrumented or + * that the ordering guarantee of the records will be violated, + * resulting in unexpected output. If there is an exception to this + * assertion, a new case should be added. + */ + ASSERT(curthread->t_dtrace_inprobe == 0 || + id == dtrace_probeid_error); + curthread->t_dtrace_inprobe = 1; + + return (cookie); +} + +/* + * Clears the per-thread inprobe flag and enables interrupts. + */ +static void +dtrace_probe_exit(dtrace_icookie_t cookie) +{ + + curthread->t_dtrace_inprobe = 0; + dtrace_interrupt_enable(cookie); +} + +/* + * If you're looking for the epicenter of DTrace, you just found it. This + * is the function called by the provider to fire a probe -- from which all + * subsequent probe-context DTrace activity emanates. + */ +void +dtrace_probe(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1, + uintptr_t arg2, uintptr_t arg3, uintptr_t arg4) +{ + processorid_t cpuid; + dtrace_icookie_t cookie; + dtrace_probe_t *probe; + dtrace_mstate_t mstate; + dtrace_ecb_t *ecb; + dtrace_action_t *act; + intptr_t offs; + size_t size; + int vtime, onintr; + volatile uint16_t *flags; + hrtime_t now; + + if (panicstr != NULL) + return; + +#ifdef illumos + /* + * Kick out immediately if this CPU is still being born (in which case + * curthread will be set to -1) or the current thread can't allow + * probes in its current context. + */ + if (((uintptr_t)curthread & 1) || (curthread->t_flag & T_DONTDTRACE)) + return; +#endif + + cookie = dtrace_probe_enter(id); + probe = dtrace_probes[id - 1]; + cpuid = curcpu; + onintr = CPU_ON_INTR(CPU); + + if (!onintr && probe->dtpr_predcache != DTRACE_CACHEIDNONE && + probe->dtpr_predcache == curthread->t_predcache) { + /* + * We have hit in the predicate cache; we know that + * this predicate would evaluate to be false. + */ + dtrace_probe_exit(cookie); + return; + } + +#ifdef illumos + if (panic_quiesce) { +#else + if (panicstr != NULL) { +#endif + /* + * We don't trace anything if we're panicking. 
+ */ + dtrace_probe_exit(cookie); + return; + } + + now = mstate.dtms_timestamp = dtrace_gethrtime(); + mstate.dtms_present = DTRACE_MSTATE_TIMESTAMP; + vtime = dtrace_vtime_references != 0; + + if (vtime && curthread->t_dtrace_start) + curthread->t_dtrace_vtime += now - curthread->t_dtrace_start; + + mstate.dtms_difo = NULL; + mstate.dtms_probe = probe; + mstate.dtms_strtok = 0; + mstate.dtms_arg[0] = arg0; + mstate.dtms_arg[1] = arg1; + mstate.dtms_arg[2] = arg2; + mstate.dtms_arg[3] = arg3; + mstate.dtms_arg[4] = arg4; + + flags = (volatile uint16_t *)&cpu_core[cpuid].cpuc_dtrace_flags; + + for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) { + dtrace_predicate_t *pred = ecb->dte_predicate; + dtrace_state_t *state = ecb->dte_state; + dtrace_buffer_t *buf = &state->dts_buffer[cpuid]; + dtrace_buffer_t *aggbuf = &state->dts_aggbuffer[cpuid]; + dtrace_vstate_t *vstate = &state->dts_vstate; + dtrace_provider_t *prov = probe->dtpr_provider; + uint64_t tracememsize = 0; + int committed = 0; + caddr_t tomax; + + /* + * A little subtlety with the following (seemingly innocuous) + * declaration of the automatic 'val': by looking at the + * code, you might think that it could be declared in the + * action processing loop, below. (That is, it's only used in + * the action processing loop.) However, it must be declared + * out of that scope because in the case of DIF expression + * arguments to aggregating actions, one iteration of the + * action loop will use the last iteration's value. + */ + uint64_t val = 0; + + mstate.dtms_present = DTRACE_MSTATE_ARGS | DTRACE_MSTATE_PROBE; + mstate.dtms_getf = NULL; + + *flags &= ~CPU_DTRACE_ERROR; + + if (prov == dtrace_provider) { + /* + * If dtrace itself is the provider of this probe, + * we're only going to continue processing the ECB if + * arg0 (the dtrace_state_t) is equal to the ECB's + * creating state. (This prevents disjoint consumers + * from seeing one another's metaprobes.) + */ + if (arg0 != (uint64_t)(uintptr_t)state) + continue; + } + + if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE) { + /* + * We're not currently active. If our provider isn't + * the dtrace pseudo provider, we're not interested. + */ + if (prov != dtrace_provider) + continue; + + /* + * Now we must further check if we are in the BEGIN + * probe. If we are, we will only continue processing + * if we're still in WARMUP -- if one BEGIN enabling + * has invoked the exit() action, we don't want to + * evaluate subsequent BEGIN enablings. + */ + if (probe->dtpr_id == dtrace_probeid_begin && + state->dts_activity != DTRACE_ACTIVITY_WARMUP) { + ASSERT(state->dts_activity == + DTRACE_ACTIVITY_DRAINING); + continue; + } + } + + if (ecb->dte_cond) { + /* + * If the dte_cond bits indicate that this + * consumer is only allowed to see user-mode firings + * of this probe, call the provider's dtps_usermode() + * entry point to check that the probe was fired + * while in a user context. Skip this ECB if that's + * not the case. + */ + if ((ecb->dte_cond & DTRACE_COND_USERMODE) && + prov->dtpv_pops.dtps_usermode(prov->dtpv_arg, + probe->dtpr_id, probe->dtpr_arg) == 0) + continue; + +#ifdef illumos + /* + * This is more subtle than it looks. We have to be + * absolutely certain that CRED() isn't going to + * change out from under us so it's only legit to + * examine that structure if we're in constrained + * situations. 
Currently, the only time we'll do this
+ * check is if a non-super-user has enabled the
+ * profile or syscall providers -- providers that
+ * allow visibility of all processes. For the
+ * profile case, the check above will ensure that
+ * we're examining a user context.
+ */
+ if (ecb->dte_cond & DTRACE_COND_OWNER) {
+ cred_t *cr;
+ cred_t *s_cr =
+ ecb->dte_state->dts_cred.dcr_cred;
+ proc_t *proc;
+
+ ASSERT(s_cr != NULL);
+
+ if ((cr = CRED()) == NULL ||
+ s_cr->cr_uid != cr->cr_uid ||
+ s_cr->cr_uid != cr->cr_ruid ||
+ s_cr->cr_uid != cr->cr_suid ||
+ s_cr->cr_gid != cr->cr_gid ||
+ s_cr->cr_gid != cr->cr_rgid ||
+ s_cr->cr_gid != cr->cr_sgid ||
+ (proc = ttoproc(curthread)) == NULL ||
+ (proc->p_flag & SNOCD))
+ continue;
+ }
+
+ if (ecb->dte_cond & DTRACE_COND_ZONEOWNER) {
+ cred_t *cr;
+ cred_t *s_cr =
+ ecb->dte_state->dts_cred.dcr_cred;
+
+ ASSERT(s_cr != NULL);
+
+ if ((cr = CRED()) == NULL ||
+ s_cr->cr_zone->zone_id !=
+ cr->cr_zone->zone_id)
+ continue;
+ }
+#endif
+ }
+
+ if (now - state->dts_alive > dtrace_deadman_timeout) {
+ /*
+ * We seem to be dead. Unless we (a) have kernel
+ * destructive permissions (b) have explicitly enabled
+ * destructive actions and (c) destructive actions have
+ * not been disabled, we're going to transition into
+ * the KILLED state, from which no further processing
+ * on this state will be performed.
+ */
+ if (!dtrace_priv_kernel_destructive(state) ||
+ !state->dts_cred.dcr_destructive ||
+ dtrace_destructive_disallow) {
+ void *activity = &state->dts_activity;
+ dtrace_activity_t current;
+
+ do {
+ current = state->dts_activity;
+ } while (dtrace_cas32(activity, current,
+ DTRACE_ACTIVITY_KILLED) != current);
+
+ continue;
+ }
+ }
+
+ if ((offs = dtrace_buffer_reserve(buf, ecb->dte_needed,
+ ecb->dte_alignment, state, &mstate)) < 0)
+ continue;
+
+ tomax = buf->dtb_tomax;
+ ASSERT(tomax != NULL);
+
+ if (ecb->dte_size != 0) {
+ dtrace_rechdr_t dtrh;
+ if (!(mstate.dtms_present & DTRACE_MSTATE_TIMESTAMP)) {
+ mstate.dtms_timestamp = dtrace_gethrtime();
+ mstate.dtms_present |= DTRACE_MSTATE_TIMESTAMP;
+ }
+ ASSERT3U(ecb->dte_size, >=, sizeof (dtrace_rechdr_t));
+ dtrh.dtrh_epid = ecb->dte_epid;
+ DTRACE_RECORD_STORE_TIMESTAMP(&dtrh,
+ mstate.dtms_timestamp);
+ *((dtrace_rechdr_t *)(tomax + offs)) = dtrh;
+ }
+
+ mstate.dtms_epid = ecb->dte_epid;
+ mstate.dtms_present |= DTRACE_MSTATE_EPID;
+
+ if (state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)
+ mstate.dtms_access = DTRACE_ACCESS_KERNEL;
+ else
+ mstate.dtms_access = 0;
+
+ if (pred != NULL) {
+ dtrace_difo_t *dp = pred->dtp_difo;
+ uint64_t rval;
+
+ rval = dtrace_dif_emulate(dp, &mstate, vstate, state);
+
+ if (!(*flags & CPU_DTRACE_ERROR) && !rval) {
+ dtrace_cacheid_t cid = probe->dtpr_predcache;
+
+ if (cid != DTRACE_CACHEIDNONE && !onintr) {
+ /*
+ * Update the predicate cache...
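+ * so that the next firing of this probe by this
+ * thread can be dismissed by the dtpr_predcache
+ * check at the top of this function without
+ * re-emulating the predicate DIF.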
+ */ + ASSERT(cid == pred->dtp_cacheid); + curthread->t_predcache = cid; + } + + continue; + } + } + + for (act = ecb->dte_action; !(*flags & CPU_DTRACE_ERROR) && + act != NULL; act = act->dta_next) { + size_t valoffs; + dtrace_difo_t *dp; + dtrace_recdesc_t *rec = &act->dta_rec; + + size = rec->dtrd_size; + valoffs = offs + rec->dtrd_offset; + + if (DTRACEACT_ISAGG(act->dta_kind)) { + uint64_t v = 0xbad; + dtrace_aggregation_t *agg; + + agg = (dtrace_aggregation_t *)act; + + if ((dp = act->dta_difo) != NULL) + v = dtrace_dif_emulate(dp, + &mstate, vstate, state); + + if (*flags & CPU_DTRACE_ERROR) + continue; + + /* + * Note that we always pass the expression + * value from the previous iteration of the + * action loop. This value will only be used + * if there is an expression argument to the + * aggregating action, denoted by the + * dtag_hasarg field. + */ + dtrace_aggregate(agg, buf, + offs, aggbuf, v, val); + continue; + } + + switch (act->dta_kind) { + case DTRACEACT_STOP: + if (dtrace_priv_proc_destructive(state)) + dtrace_action_stop(); + continue; + + case DTRACEACT_BREAKPOINT: + if (dtrace_priv_kernel_destructive(state)) + dtrace_action_breakpoint(ecb); + continue; + + case DTRACEACT_PANIC: + if (dtrace_priv_kernel_destructive(state)) + dtrace_action_panic(ecb); + continue; + + case DTRACEACT_STACK: + if (!dtrace_priv_kernel(state)) + continue; + + dtrace_getpcstack((pc_t *)(tomax + valoffs), + size / sizeof (pc_t), probe->dtpr_aframes, + DTRACE_ANCHORED(probe) ? NULL : + (uint32_t *)arg0); + continue; + + case DTRACEACT_JSTACK: + case DTRACEACT_USTACK: + if (!dtrace_priv_proc(state)) + continue; + + /* + * See comment in DIF_VAR_PID. + */ + if (DTRACE_ANCHORED(mstate.dtms_probe) && + CPU_ON_INTR(CPU)) { + int depth = DTRACE_USTACK_NFRAMES( + rec->dtrd_arg) + 1; + + dtrace_bzero((void *)(tomax + valoffs), + DTRACE_USTACK_STRSIZE(rec->dtrd_arg) + + depth * sizeof (uint64_t)); + + continue; + } + + if (DTRACE_USTACK_STRSIZE(rec->dtrd_arg) != 0 && + curproc->p_dtrace_helpers != NULL) { + /* + * This is the slow path -- we have + * allocated string space, and we're + * getting the stack of a process that + * has helpers. Call into a separate + * routine to perform this processing. + */ + dtrace_action_ustack(&mstate, state, + (uint64_t *)(tomax + valoffs), + rec->dtrd_arg); + continue; + } + + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); + dtrace_getupcstack((uint64_t *) + (tomax + valoffs), + DTRACE_USTACK_NFRAMES(rec->dtrd_arg) + 1); + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); + continue; + + default: + break; + } + + dp = act->dta_difo; + ASSERT(dp != NULL); + + val = dtrace_dif_emulate(dp, &mstate, vstate, state); + + if (*flags & CPU_DTRACE_ERROR) + continue; + + switch (act->dta_kind) { + case DTRACEACT_SPECULATE: { + dtrace_rechdr_t *dtrh; + + ASSERT(buf == &state->dts_buffer[cpuid]); + buf = dtrace_speculation_buffer(state, + cpuid, val); + + if (buf == NULL) { + *flags |= CPU_DTRACE_DROP; + continue; + } + + offs = dtrace_buffer_reserve(buf, + ecb->dte_needed, ecb->dte_alignment, + state, NULL); + + if (offs < 0) { + *flags |= CPU_DTRACE_DROP; + continue; + } + + tomax = buf->dtb_tomax; + ASSERT(tomax != NULL); + + if (ecb->dte_size == 0) + continue; + + ASSERT3U(ecb->dte_size, >=, + sizeof (dtrace_rechdr_t)); + dtrh = ((void *)(tomax + offs)); + dtrh->dtrh_epid = ecb->dte_epid; + /* + * When the speculation is committed, all of + * the records in the speculative buffer will + * have their timestamps set to the commit + * time. 
Until then, it is set to a sentinel
+ * value, for debuggability.
+ */
+ DTRACE_RECORD_STORE_TIMESTAMP(dtrh, UINT64_MAX);
+ continue;
+ }
+
+ case DTRACEACT_PRINTM: {
+ /* The DIF returns a 'memref'. */
+ uintptr_t *memref = (uintptr_t *)(uintptr_t) val;
+
+ /* Get the size from the memref. */
+ size = memref[1];
+
+ /*
+ * Check if the size exceeds the allocated
+ * buffer size.
+ */
+ if (size + sizeof(uintptr_t) > dp->dtdo_rtype.dtdt_size) {
+ /* Flag a drop! */
+ *flags |= CPU_DTRACE_DROP;
+ continue;
+ }
+
+ /* Store the size in the buffer first. */
+ DTRACE_STORE(uintptr_t, tomax,
+ valoffs, size);
+
+ /*
+ * Offset the buffer address to the start
+ * of the data.
+ */
+ valoffs += sizeof(uintptr_t);
+
+ /*
+ * Reset to the memory address rather than
+ * the memref array, then let the BYREF
+ * code below do the work to store the
+ * memory data in the buffer.
+ */
+ val = memref[0];
+ break;
+ }
+
+ case DTRACEACT_CHILL:
+ if (dtrace_priv_kernel_destructive(state))
+ dtrace_action_chill(&mstate, val);
+ continue;
+
+ case DTRACEACT_RAISE:
+ if (dtrace_priv_proc_destructive(state))
+ dtrace_action_raise(val);
+ continue;
+
+ case DTRACEACT_COMMIT:
+ ASSERT(!committed);
+
+ /*
+ * We need to commit our buffer state.
+ */
+ if (ecb->dte_size)
+ buf->dtb_offset = offs + ecb->dte_size;
+ buf = &state->dts_buffer[cpuid];
+ dtrace_speculation_commit(state, cpuid, val);
+ committed = 1;
+ continue;
+
+ case DTRACEACT_DISCARD:
+ dtrace_speculation_discard(state, cpuid, val);
+ continue;
+
+ case DTRACEACT_DIFEXPR:
+ case DTRACEACT_LIBACT:
+ case DTRACEACT_PRINTF:
+ case DTRACEACT_PRINTA:
+ case DTRACEACT_SYSTEM:
+ case DTRACEACT_FREOPEN:
+ case DTRACEACT_TRACEMEM:
+ break;
+
+ case DTRACEACT_TRACEMEM_DYNSIZE:
+ tracememsize = val;
+ break;
+
+ case DTRACEACT_SYM:
+ case DTRACEACT_MOD:
+ if (!dtrace_priv_kernel(state))
+ continue;
+ break;
+
+ case DTRACEACT_USYM:
+ case DTRACEACT_UMOD:
+ case DTRACEACT_UADDR: {
+#ifdef illumos
+ struct pid *pid = curthread->t_procp->p_pidp;
+#endif
+
+ if (!dtrace_priv_proc(state))
+ continue;
+
+ DTRACE_STORE(uint64_t, tomax,
+#ifdef illumos
+ valoffs, (uint64_t)pid->pid_id);
+#else
+ valoffs, (uint64_t) curproc->p_pid);
+#endif
+ DTRACE_STORE(uint64_t, tomax,
+ valoffs + sizeof (uint64_t), val);
+
+ continue;
+ }
+
+ case DTRACEACT_EXIT: {
+ /*
+ * For the exit action, we are going to attempt
+ * to atomically set our activity to be
+ * draining. If this fails (either because
+ * another CPU has beat us to the exit action,
+ * or because our current activity is something
+ * other than ACTIVE or WARMUP), we will
+ * continue. This assures that the exit action
+ * can be successfully recorded at most once
+ * when we're in the ACTIVE state. If we're
+ * encountering the exit() action while in
+ * COOLDOWN, however, we want to honor the new
+ * status code. (We know that we're the only
+ * thread in COOLDOWN, so there is no race.)
+ */ + void *activity = &state->dts_activity; + dtrace_activity_t current = state->dts_activity; + + if (current == DTRACE_ACTIVITY_COOLDOWN) + break; + + if (current != DTRACE_ACTIVITY_WARMUP) + current = DTRACE_ACTIVITY_ACTIVE; + + if (dtrace_cas32(activity, current, + DTRACE_ACTIVITY_DRAINING) != current) { + *flags |= CPU_DTRACE_DROP; + continue; + } + + break; + } + + default: + ASSERT(0); + } + + if (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF || + dp->dtdo_rtype.dtdt_flags & DIF_TF_BYUREF) { + uintptr_t end = valoffs + size; + + if (tracememsize != 0 && + valoffs + tracememsize < end) { + end = valoffs + tracememsize; + tracememsize = 0; + } + + if (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF && + !dtrace_vcanload((void *)(uintptr_t)val, + &dp->dtdo_rtype, NULL, &mstate, vstate)) + continue; + + dtrace_store_by_ref(dp, tomax, size, &valoffs, + &val, end, act->dta_intuple, + dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF ? + DIF_TF_BYREF: DIF_TF_BYUREF); + continue; + } + + switch (size) { + case 0: + break; + + case sizeof (uint8_t): + DTRACE_STORE(uint8_t, tomax, valoffs, val); + break; + case sizeof (uint16_t): + DTRACE_STORE(uint16_t, tomax, valoffs, val); + break; + case sizeof (uint32_t): + DTRACE_STORE(uint32_t, tomax, valoffs, val); + break; + case sizeof (uint64_t): + DTRACE_STORE(uint64_t, tomax, valoffs, val); + break; + default: + /* + * Any other size should have been returned by + * reference, not by value. + */ + ASSERT(0); + break; + } + } + + if (*flags & CPU_DTRACE_DROP) + continue; + + if (*flags & CPU_DTRACE_FAULT) { + int ndx; + dtrace_action_t *err; + + buf->dtb_errors++; + + if (probe->dtpr_id == dtrace_probeid_error) { + /* + * There's nothing we can do -- we had an + * error on the error probe. We bump an + * error counter to at least indicate that + * this condition happened. + */ + dtrace_error(&state->dts_dblerrors); + continue; + } + + if (vtime) { + /* + * Before recursing on dtrace_probe(), we + * need to explicitly clear out our start + * time to prevent it from being accumulated + * into t_dtrace_vtime. + */ + curthread->t_dtrace_start = 0; + } + + /* + * Iterate over the actions to figure out which action + * we were processing when we experienced the error. + * Note that act points _past_ the faulting action; if + * act is ecb->dte_action, the fault was in the + * predicate, if it's ecb->dte_action->dta_next it's + * in action #1, and so on. + */ + for (err = ecb->dte_action, ndx = 0; + err != act; err = err->dta_next, ndx++) + continue; + + dtrace_probe_error(state, ecb->dte_epid, ndx, + (mstate.dtms_present & DTRACE_MSTATE_FLTOFFS) ? + mstate.dtms_fltoffs : -1, DTRACE_FLAGS2FLT(*flags), + cpu_core[cpuid].cpuc_dtrace_illval); + + continue; + } + + if (!committed) + buf->dtb_offset = offs + ecb->dte_size; + } + + if (vtime) + curthread->t_dtrace_start = dtrace_gethrtime(); + + dtrace_probe_exit(cookie); +} + +/* + * DTrace Probe Hashing Functions + * + * The functions in this section (and indeed, the functions in remaining + * sections) are not _called_ from probe context. (Any exceptions to this are + * marked with a "Note:".) Rather, they are called from elsewhere in the + * DTrace framework to look-up probes in, add probes to and remove probes from + * the DTrace probe hashes. (Each probe is hashed by each element of the + * probe tuple -- allowing for fast lookups, regardless of what was + * specified.) 
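+ * For example, a (hypothetical) probe fbt:kernel:vn_open:entry is
+ * reachable through dtrace_bymod ("kernel"), dtrace_byfunc ("vn_open")
+ * and dtrace_byname ("entry") alike, so a description that pins down
+ * any one element of the tuple avoids a linear walk over
+ * dtrace_probes[].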
+ */ +static uint_t +dtrace_hash_str(const char *p) +{ + unsigned int g; + uint_t hval = 0; + + while (*p) { + hval = (hval << 4) + *p++; + if ((g = (hval & 0xf0000000)) != 0) + hval ^= g >> 24; + hval &= ~g; + } + return (hval); +} + +static dtrace_hash_t * +dtrace_hash_create(uintptr_t stroffs, uintptr_t nextoffs, uintptr_t prevoffs) +{ + dtrace_hash_t *hash = kmem_zalloc(sizeof (dtrace_hash_t), KM_SLEEP); + + hash->dth_stroffs = stroffs; + hash->dth_nextoffs = nextoffs; + hash->dth_prevoffs = prevoffs; + + hash->dth_size = 1; + hash->dth_mask = hash->dth_size - 1; + + hash->dth_tab = kmem_zalloc(hash->dth_size * + sizeof (dtrace_hashbucket_t *), KM_SLEEP); + + return (hash); +} + +static void +dtrace_hash_destroy(dtrace_hash_t *hash) +{ +#ifdef DEBUG + int i; + + for (i = 0; i < hash->dth_size; i++) + ASSERT(hash->dth_tab[i] == NULL); +#endif + + kmem_free(hash->dth_tab, + hash->dth_size * sizeof (dtrace_hashbucket_t *)); + kmem_free(hash, sizeof (dtrace_hash_t)); +} + +static void +dtrace_hash_resize(dtrace_hash_t *hash) +{ + int size = hash->dth_size, i, ndx; + int new_size = hash->dth_size << 1; + int new_mask = new_size - 1; + dtrace_hashbucket_t **new_tab, *bucket, *next; + + ASSERT((new_size & new_mask) == 0); + + new_tab = kmem_zalloc(new_size * sizeof (void *), KM_SLEEP); + + for (i = 0; i < size; i++) { + for (bucket = hash->dth_tab[i]; bucket != NULL; bucket = next) { + dtrace_probe_t *probe = bucket->dthb_chain; + + ASSERT(probe != NULL); + ndx = DTRACE_HASHSTR(hash, probe) & new_mask; + + next = bucket->dthb_next; + bucket->dthb_next = new_tab[ndx]; + new_tab[ndx] = bucket; + } + } + + kmem_free(hash->dth_tab, hash->dth_size * sizeof (void *)); + hash->dth_tab = new_tab; + hash->dth_size = new_size; + hash->dth_mask = new_mask; +} + +static void +dtrace_hash_add(dtrace_hash_t *hash, dtrace_probe_t *new) +{ + int hashval = DTRACE_HASHSTR(hash, new); + int ndx = hashval & hash->dth_mask; + dtrace_hashbucket_t *bucket = hash->dth_tab[ndx]; + dtrace_probe_t **nextp, **prevp; + + for (; bucket != NULL; bucket = bucket->dthb_next) { + if (DTRACE_HASHEQ(hash, bucket->dthb_chain, new)) + goto add; + } + + if ((hash->dth_nbuckets >> 1) > hash->dth_size) { + dtrace_hash_resize(hash); + dtrace_hash_add(hash, new); + return; + } + + bucket = kmem_zalloc(sizeof (dtrace_hashbucket_t), KM_SLEEP); + bucket->dthb_next = hash->dth_tab[ndx]; + hash->dth_tab[ndx] = bucket; + hash->dth_nbuckets++; + +add: + nextp = DTRACE_HASHNEXT(hash, new); + ASSERT(*nextp == NULL && *(DTRACE_HASHPREV(hash, new)) == NULL); + *nextp = bucket->dthb_chain; + + if (bucket->dthb_chain != NULL) { + prevp = DTRACE_HASHPREV(hash, bucket->dthb_chain); + ASSERT(*prevp == NULL); + *prevp = new; + } + + bucket->dthb_chain = new; + bucket->dthb_len++; +} + +static dtrace_probe_t * +dtrace_hash_lookup(dtrace_hash_t *hash, dtrace_probe_t *template) +{ + int hashval = DTRACE_HASHSTR(hash, template); + int ndx = hashval & hash->dth_mask; + dtrace_hashbucket_t *bucket = hash->dth_tab[ndx]; + + for (; bucket != NULL; bucket = bucket->dthb_next) { + if (DTRACE_HASHEQ(hash, bucket->dthb_chain, template)) + return (bucket->dthb_chain); + } + + return (NULL); +} + +static int +dtrace_hash_collisions(dtrace_hash_t *hash, dtrace_probe_t *template) +{ + int hashval = DTRACE_HASHSTR(hash, template); + int ndx = hashval & hash->dth_mask; + dtrace_hashbucket_t *bucket = hash->dth_tab[ndx]; + + for (; bucket != NULL; bucket = bucket->dthb_next) { + if (DTRACE_HASHEQ(hash, bucket->dthb_chain, template)) + return (bucket->dthb_len); + } + + 
return (0); +} + +static void +dtrace_hash_remove(dtrace_hash_t *hash, dtrace_probe_t *probe) +{ + int ndx = DTRACE_HASHSTR(hash, probe) & hash->dth_mask; + dtrace_hashbucket_t *bucket = hash->dth_tab[ndx]; + + dtrace_probe_t **prevp = DTRACE_HASHPREV(hash, probe); + dtrace_probe_t **nextp = DTRACE_HASHNEXT(hash, probe); + + /* + * Find the bucket that we're removing this probe from. + */ + for (; bucket != NULL; bucket = bucket->dthb_next) { + if (DTRACE_HASHEQ(hash, bucket->dthb_chain, probe)) + break; + } + + ASSERT(bucket != NULL); + + if (*prevp == NULL) { + if (*nextp == NULL) { + /* + * The removed probe was the only probe on this + * bucket; we need to remove the bucket. + */ + dtrace_hashbucket_t *b = hash->dth_tab[ndx]; + + ASSERT(bucket->dthb_chain == probe); + ASSERT(b != NULL); + + if (b == bucket) { + hash->dth_tab[ndx] = bucket->dthb_next; + } else { + while (b->dthb_next != bucket) + b = b->dthb_next; + b->dthb_next = bucket->dthb_next; + } + + ASSERT(hash->dth_nbuckets > 0); + hash->dth_nbuckets--; + kmem_free(bucket, sizeof (dtrace_hashbucket_t)); + return; + } + + bucket->dthb_chain = *nextp; + } else { + *(DTRACE_HASHNEXT(hash, *prevp)) = *nextp; + } + + if (*nextp != NULL) + *(DTRACE_HASHPREV(hash, *nextp)) = *prevp; +} + +/* + * DTrace Utility Functions + * + * These are random utility functions that are _not_ called from probe context. + */ +static int +dtrace_badattr(const dtrace_attribute_t *a) +{ + return (a->dtat_name > DTRACE_STABILITY_MAX || + a->dtat_data > DTRACE_STABILITY_MAX || + a->dtat_class > DTRACE_CLASS_MAX); +} + +/* + * Return a duplicate copy of a string. If the specified string is NULL, + * this function returns a zero-length string. + */ +static char * +dtrace_strdup(const char *str) +{ + char *new = kmem_zalloc((str != NULL ? strlen(str) : 0) + 1, KM_SLEEP); + + if (str != NULL) + (void) strcpy(new, str); + + return (new); +} + +#define DTRACE_ISALPHA(c) \ + (((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z')) + +static int +dtrace_badname(const char *s) +{ + char c; + + if (s == NULL || (c = *s++) == '\0') + return (0); + + if (!DTRACE_ISALPHA(c) && c != '-' && c != '_' && c != '.') + return (1); + + while ((c = *s++) != '\0') { + if (!DTRACE_ISALPHA(c) && (c < '0' || c > '9') && + c != '-' && c != '_' && c != '.' && c != '`') + return (1); + } + + return (0); +} + +static void +dtrace_cred2priv(cred_t *cr, uint32_t *privp, uid_t *uidp, zoneid_t *zoneidp) +{ + uint32_t priv; + +#ifdef illumos + if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) { + /* + * For DTRACE_PRIV_ALL, the uid and zoneid don't matter. 
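+ * (dtrace_match_priv() tests priv != DTRACE_PRIV_ALL before it
+ * ever consults the uid or zoneid.)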
+ */ + priv = DTRACE_PRIV_ALL; + } else { + *uidp = crgetuid(cr); + *zoneidp = crgetzoneid(cr); + + priv = 0; + if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE)) + priv |= DTRACE_PRIV_KERNEL | DTRACE_PRIV_USER; + else if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE)) + priv |= DTRACE_PRIV_USER; + if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) + priv |= DTRACE_PRIV_PROC; + if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE)) + priv |= DTRACE_PRIV_OWNER; + if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE)) + priv |= DTRACE_PRIV_ZONEOWNER; + } +#else + priv = DTRACE_PRIV_ALL; +#endif + + *privp = priv; +} + +#ifdef DTRACE_ERRDEBUG +static void +dtrace_errdebug(const char *str) +{ + int hval = dtrace_hash_str(str) % DTRACE_ERRHASHSZ; + int occupied = 0; + + mutex_enter(&dtrace_errlock); + dtrace_errlast = str; + dtrace_errthread = curthread; + + while (occupied++ < DTRACE_ERRHASHSZ) { + if (dtrace_errhash[hval].dter_msg == str) { + dtrace_errhash[hval].dter_count++; + goto out; + } + + if (dtrace_errhash[hval].dter_msg != NULL) { + hval = (hval + 1) % DTRACE_ERRHASHSZ; + continue; + } + + dtrace_errhash[hval].dter_msg = str; + dtrace_errhash[hval].dter_count = 1; + goto out; + } + + panic("dtrace: undersized error hash"); +out: + mutex_exit(&dtrace_errlock); +} +#endif + +/* + * DTrace Matching Functions + * + * These functions are used to match groups of probes, given some elements of + * a probe tuple, or some globbed expressions for elements of a probe tuple. + */ +static int +dtrace_match_priv(const dtrace_probe_t *prp, uint32_t priv, uid_t uid, + zoneid_t zoneid) +{ + if (priv != DTRACE_PRIV_ALL) { + uint32_t ppriv = prp->dtpr_provider->dtpv_priv.dtpp_flags; + uint32_t match = priv & ppriv; + + /* + * No PRIV_DTRACE_* privileges... + */ + if ((priv & (DTRACE_PRIV_PROC | DTRACE_PRIV_USER | + DTRACE_PRIV_KERNEL)) == 0) + return (0); + + /* + * No matching bits, but there were bits to match... + */ + if (match == 0 && ppriv != 0) + return (0); + + /* + * Need to have permissions to the process, but don't... + */ + if (((ppriv & ~match) & DTRACE_PRIV_OWNER) != 0 && + uid != prp->dtpr_provider->dtpv_priv.dtpp_uid) { + return (0); + } + + /* + * Need to be in the same zone unless we possess the + * privilege to examine all zones. + */ + if (((ppriv & ~match) & DTRACE_PRIV_ZONEOWNER) != 0 && + zoneid != prp->dtpr_provider->dtpv_priv.dtpp_zoneid) { + return (0); + } + } + + return (1); +} + +/* + * dtrace_match_probe compares a dtrace_probe_t to a pre-compiled key, which + * consists of input pattern strings and an ops-vector to evaluate them. + * This function returns >0 for match, 0 for no match, and <0 for error. + */ +static int +dtrace_match_probe(const dtrace_probe_t *prp, const dtrace_probekey_t *pkp, + uint32_t priv, uid_t uid, zoneid_t zoneid) +{ + dtrace_provider_t *pvp = prp->dtpr_provider; + int rv; + + if (pvp->dtpv_defunct) + return (0); + + if ((rv = pkp->dtpk_pmatch(pvp->dtpv_name, pkp->dtpk_prov, 0)) <= 0) + return (rv); + + if ((rv = pkp->dtpk_mmatch(prp->dtpr_mod, pkp->dtpk_mod, 0)) <= 0) + return (rv); + + if ((rv = pkp->dtpk_fmatch(prp->dtpr_func, pkp->dtpk_func, 0)) <= 0) + return (rv); + + if ((rv = pkp->dtpk_nmatch(prp->dtpr_name, pkp->dtpk_name, 0)) <= 0) + return (rv); + + if (dtrace_match_priv(prp, priv, uid, zoneid) == 0) + return (0); + + return (rv); +} + +/* + * dtrace_match_glob() is a safe kernel implementation of the gmatch(3GEN) + * interface for matching a glob pattern 'p' to an input string 's'. 
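+ * For example, dtrace_match_glob("vn_open", "vn_*", 0) and
+ * dtrace_match_glob("read", "re??", 0) both return 1, while
+ * dtrace_match_glob("write", "vn_*", 0) returns 0.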
Unlike + * libc's version, the kernel version only applies to 8-bit ASCII strings. + * In addition, all of the recursion cases except for '*' matching have been + * unwound. For '*', we still implement recursive evaluation, but a depth + * counter is maintained and matching is aborted if we recurse too deep. + * The function returns 0 if no match, >0 if match, and <0 if recursion error. + */ +static int +dtrace_match_glob(const char *s, const char *p, int depth) +{ + const char *olds; + char s1, c; + int gs; + + if (depth > DTRACE_PROBEKEY_MAXDEPTH) + return (-1); + + if (s == NULL) + s = ""; /* treat NULL as empty string */ + +top: + olds = s; + s1 = *s++; + + if (p == NULL) + return (0); + + if ((c = *p++) == '\0') + return (s1 == '\0'); + + switch (c) { + case '[': { + int ok = 0, notflag = 0; + char lc = '\0'; + + if (s1 == '\0') + return (0); + + if (*p == '!') { + notflag = 1; + p++; + } + + if ((c = *p++) == '\0') + return (0); + + do { + if (c == '-' && lc != '\0' && *p != ']') { + if ((c = *p++) == '\0') + return (0); + if (c == '\\' && (c = *p++) == '\0') + return (0); + + if (notflag) { + if (s1 < lc || s1 > c) + ok++; + else + return (0); + } else if (lc <= s1 && s1 <= c) + ok++; + + } else if (c == '\\' && (c = *p++) == '\0') + return (0); + + lc = c; /* save left-hand 'c' for next iteration */ + + if (notflag) { + if (s1 != c) + ok++; + else + return (0); + } else if (s1 == c) + ok++; + + if ((c = *p++) == '\0') + return (0); + + } while (c != ']'); + + if (ok) + goto top; + + return (0); + } + + case '\\': + if ((c = *p++) == '\0') + return (0); + /*FALLTHRU*/ + + default: + if (c != s1) + return (0); + /*FALLTHRU*/ + + case '?': + if (s1 != '\0') + goto top; + return (0); + + case '*': + while (*p == '*') + p++; /* consecutive *'s are identical to a single one */ + + if (*p == '\0') + return (1); + + for (s = olds; *s != '\0'; s++) { + if ((gs = dtrace_match_glob(s, p, depth + 1)) != 0) + return (gs); + } + + return (0); + } +} + +/*ARGSUSED*/ +static int +dtrace_match_string(const char *s, const char *p, int depth) +{ + return (s != NULL && strcmp(s, p) == 0); +} + +/*ARGSUSED*/ +static int +dtrace_match_nul(const char *s, const char *p, int depth) +{ + return (1); /* always match the empty pattern */ +} + +/*ARGSUSED*/ +static int +dtrace_match_nonzero(const char *s, const char *p, int depth) +{ + return (s != NULL && s[0] != '\0'); +} + +static int +dtrace_match(const dtrace_probekey_t *pkp, uint32_t priv, uid_t uid, + zoneid_t zoneid, int (*matched)(dtrace_probe_t *, void *), void *arg) +{ + dtrace_probe_t template, *probe; + dtrace_hash_t *hash = NULL; + int len, best = INT_MAX, nmatched = 0; + dtrace_id_t i; + + ASSERT(MUTEX_HELD(&dtrace_lock)); + + /* + * If the probe ID is specified in the key, just lookup by ID and + * invoke the match callback once if a matching probe is found. + */ + if (pkp->dtpk_id != DTRACE_IDNONE) { + if ((probe = dtrace_probe_lookup_id(pkp->dtpk_id)) != NULL && + dtrace_match_probe(probe, pkp, priv, uid, zoneid) > 0) { + (void) (*matched)(probe, arg); + nmatched++; + } + return (nmatched); + } + + template.dtpr_mod = (char *)pkp->dtpk_mod; + template.dtpr_func = (char *)pkp->dtpk_func; + template.dtpr_name = (char *)pkp->dtpk_name; + + /* + * We want to find the most distinct of the module name, function + * name, and name. So for each one that is not a glob pattern or + * empty string, we perform a lookup in the corresponding hash and + * use the hash table with the fewest collisions to do our search. 
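+	 * (For example, for the description syscall::read:entry the module
+	 * component is empty -- so dtpk_mmatch is dtrace_match_nul, not
+	 * dtrace_match_string -- while "read" and "entry" are plain strings;
+	 * whichever of dtrace_byfunc and dtrace_byname has the shorter chain
+	 * for those values is the table that is searched.)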
+ */ + if (pkp->dtpk_mmatch == &dtrace_match_string && + (len = dtrace_hash_collisions(dtrace_bymod, &template)) < best) { + best = len; + hash = dtrace_bymod; + } + + if (pkp->dtpk_fmatch == &dtrace_match_string && + (len = dtrace_hash_collisions(dtrace_byfunc, &template)) < best) { + best = len; + hash = dtrace_byfunc; + } + + if (pkp->dtpk_nmatch == &dtrace_match_string && + (len = dtrace_hash_collisions(dtrace_byname, &template)) < best) { + best = len; + hash = dtrace_byname; + } + + /* + * If we did not select a hash table, iterate over every probe and + * invoke our callback for each one that matches our input probe key. + */ + if (hash == NULL) { + for (i = 0; i < dtrace_nprobes; i++) { + if ((probe = dtrace_probes[i]) == NULL || + dtrace_match_probe(probe, pkp, priv, uid, + zoneid) <= 0) + continue; + + nmatched++; + + if ((*matched)(probe, arg) != DTRACE_MATCH_NEXT) + break; + } + + return (nmatched); + } + + /* + * If we selected a hash table, iterate over each probe of the same key + * name and invoke the callback for every probe that matches the other + * attributes of our input probe key. + */ + for (probe = dtrace_hash_lookup(hash, &template); probe != NULL; + probe = *(DTRACE_HASHNEXT(hash, probe))) { + + if (dtrace_match_probe(probe, pkp, priv, uid, zoneid) <= 0) + continue; + + nmatched++; + + if ((*matched)(probe, arg) != DTRACE_MATCH_NEXT) + break; + } + + return (nmatched); +} + +/* + * Return the function pointer dtrace_probecmp() should use to compare the + * specified pattern with a string. For NULL or empty patterns, we select + * dtrace_match_nul(). For glob pattern strings, we use dtrace_match_glob(). + * For non-empty non-glob strings, we use dtrace_match_string(). + */ +static dtrace_probekey_f * +dtrace_probekey_func(const char *p) +{ + char c; + + if (p == NULL || *p == '\0') + return (&dtrace_match_nul); + + while ((c = *p++) != '\0') { + if (c == '[' || c == '?' || c == '*' || c == '\\') + return (&dtrace_match_glob); + } + + return (&dtrace_match_string); +} + +/* + * Build a probe comparison key for use with dtrace_match_probe() from the + * given probe description. By convention, a null key only matches anchored + * probes: if each field is the empty string, reset dtpk_fmatch to + * dtrace_match_nonzero(). + */ +static void +dtrace_probekey(dtrace_probedesc_t *pdp, dtrace_probekey_t *pkp) +{ + pkp->dtpk_prov = pdp->dtpd_provider; + pkp->dtpk_pmatch = dtrace_probekey_func(pdp->dtpd_provider); + + pkp->dtpk_mod = pdp->dtpd_mod; + pkp->dtpk_mmatch = dtrace_probekey_func(pdp->dtpd_mod); + + pkp->dtpk_func = pdp->dtpd_func; + pkp->dtpk_fmatch = dtrace_probekey_func(pdp->dtpd_func); + + pkp->dtpk_name = pdp->dtpd_name; + pkp->dtpk_nmatch = dtrace_probekey_func(pdp->dtpd_name); + + pkp->dtpk_id = pdp->dtpd_id; + + if (pkp->dtpk_id == DTRACE_IDNONE && + pkp->dtpk_pmatch == &dtrace_match_nul && + pkp->dtpk_mmatch == &dtrace_match_nul && + pkp->dtpk_fmatch == &dtrace_match_nul && + pkp->dtpk_nmatch == &dtrace_match_nul) + pkp->dtpk_fmatch = &dtrace_match_nonzero; +} + +/* + * DTrace Provider-to-Framework API Functions + * + * These functions implement much of the Provider-to-Framework API, as + * described in <sys/dtrace.h>. The parts of the API not in this section are + * the functions in the API for probe management (found below), and + * dtrace_probe() itself (found above). + */ + +/* + * Register the calling provider with the DTrace framework. This should + * generally be called by DTrace providers in their attach(9E) entry point. 
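+ *
+ * A minimal caller might look like this sketch (the "example" provider,
+ * its example_attr/example_pops tables and the callbacks they reference
+ * are hypothetical, not defined in this file):
+ *
+ *	static dtrace_provider_id_t example_id;
+ *
+ *	if (dtrace_register("example", &example_attr, DTRACE_PRIV_KERNEL,
+ *	    NULL, &example_pops, NULL, &example_id) != 0)
+ *		return (DDI_FAILURE);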
+ */ +int +dtrace_register(const char *name, const dtrace_pattr_t *pap, uint32_t priv, + cred_t *cr, const dtrace_pops_t *pops, void *arg, dtrace_provider_id_t *idp) +{ + dtrace_provider_t *provider; + + if (name == NULL || pap == NULL || pops == NULL || idp == NULL) { + cmn_err(CE_WARN, "failed to register provider '%s': invalid " + "arguments", name ? name : "<NULL>"); + return (EINVAL); + } + + if (name[0] == '\0' || dtrace_badname(name)) { + cmn_err(CE_WARN, "failed to register provider '%s': invalid " + "provider name", name); + return (EINVAL); + } + + if ((pops->dtps_provide == NULL && pops->dtps_provide_module == NULL) || + pops->dtps_enable == NULL || pops->dtps_disable == NULL || + pops->dtps_destroy == NULL || + ((pops->dtps_resume == NULL) != (pops->dtps_suspend == NULL))) { + cmn_err(CE_WARN, "failed to register provider '%s': invalid " + "provider ops", name); + return (EINVAL); + } + + if (dtrace_badattr(&pap->dtpa_provider) || + dtrace_badattr(&pap->dtpa_mod) || + dtrace_badattr(&pap->dtpa_func) || + dtrace_badattr(&pap->dtpa_name) || + dtrace_badattr(&pap->dtpa_args)) { + cmn_err(CE_WARN, "failed to register provider '%s': invalid " + "provider attributes", name); + return (EINVAL); + } + + if (priv & ~DTRACE_PRIV_ALL) { + cmn_err(CE_WARN, "failed to register provider '%s': invalid " + "privilege attributes", name); + return (EINVAL); + } + + if ((priv & DTRACE_PRIV_KERNEL) && + (priv & (DTRACE_PRIV_USER | DTRACE_PRIV_OWNER)) && + pops->dtps_usermode == NULL) { + cmn_err(CE_WARN, "failed to register provider '%s': need " + "dtps_usermode() op for given privilege attributes", name); + return (EINVAL); + } + + provider = kmem_zalloc(sizeof (dtrace_provider_t), KM_SLEEP); + provider->dtpv_name = kmem_alloc(strlen(name) + 1, KM_SLEEP); + (void) strcpy(provider->dtpv_name, name); + + provider->dtpv_attr = *pap; + provider->dtpv_priv.dtpp_flags = priv; + if (cr != NULL) { + provider->dtpv_priv.dtpp_uid = crgetuid(cr); + provider->dtpv_priv.dtpp_zoneid = crgetzoneid(cr); + } + provider->dtpv_pops = *pops; + + if (pops->dtps_provide == NULL) { + ASSERT(pops->dtps_provide_module != NULL); + provider->dtpv_pops.dtps_provide = + (void (*)(void *, dtrace_probedesc_t *))dtrace_nullop; + } + + if (pops->dtps_provide_module == NULL) { + ASSERT(pops->dtps_provide != NULL); + provider->dtpv_pops.dtps_provide_module = + (void (*)(void *, modctl_t *))dtrace_nullop; + } + + if (pops->dtps_suspend == NULL) { + ASSERT(pops->dtps_resume == NULL); + provider->dtpv_pops.dtps_suspend = + (void (*)(void *, dtrace_id_t, void *))dtrace_nullop; + provider->dtpv_pops.dtps_resume = + (void (*)(void *, dtrace_id_t, void *))dtrace_nullop; + } + + provider->dtpv_arg = arg; + *idp = (dtrace_provider_id_t)provider; + + if (pops == &dtrace_provider_ops) { + ASSERT(MUTEX_HELD(&dtrace_provider_lock)); + ASSERT(MUTEX_HELD(&dtrace_lock)); + ASSERT(dtrace_anon.dta_enabling == NULL); + + /* + * We make sure that the DTrace provider is at the head of + * the provider chain. + */ + provider->dtpv_next = dtrace_provider; + dtrace_provider = provider; + return (0); + } + + mutex_enter(&dtrace_provider_lock); + mutex_enter(&dtrace_lock); + + /* + * If there is at least one provider registered, we'll add this + * provider after the first provider. 
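+	 * (For example, an existing chain of dtrace -> A -> B becomes
+	 * dtrace -> new -> A -> B.)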
+ */ + if (dtrace_provider != NULL) { + provider->dtpv_next = dtrace_provider->dtpv_next; + dtrace_provider->dtpv_next = provider; + } else { + dtrace_provider = provider; + } + + if (dtrace_retained != NULL) { + dtrace_enabling_provide(provider); + + /* + * Now we need to call dtrace_enabling_matchall() -- which + * will acquire cpu_lock and dtrace_lock. We therefore need + * to drop all of our locks before calling into it... + */ + mutex_exit(&dtrace_lock); + mutex_exit(&dtrace_provider_lock); + dtrace_enabling_matchall(); + + return (0); + } + + mutex_exit(&dtrace_lock); + mutex_exit(&dtrace_provider_lock); + + return (0); +} + +/* + * Unregister the specified provider from the DTrace framework. This should + * generally be called by DTrace providers in their detach(9E) entry point. + */ +int +dtrace_unregister(dtrace_provider_id_t id) +{ + dtrace_provider_t *old = (dtrace_provider_t *)id; + dtrace_provider_t *prev = NULL; + int i, self = 0, noreap = 0; + dtrace_probe_t *probe, *first = NULL; + + if (old->dtpv_pops.dtps_enable == + (void (*)(void *, dtrace_id_t, void *))dtrace_nullop) { + /* + * If DTrace itself is the provider, we're called with locks + * already held. + */ + ASSERT(old == dtrace_provider); +#ifdef illumos + ASSERT(dtrace_devi != NULL); +#endif + ASSERT(MUTEX_HELD(&dtrace_provider_lock)); + ASSERT(MUTEX_HELD(&dtrace_lock)); + self = 1; + + if (dtrace_provider->dtpv_next != NULL) { + /* + * There's another provider here; return failure. + */ + return (EBUSY); + } + } else { + mutex_enter(&dtrace_provider_lock); +#ifdef illumos + mutex_enter(&mod_lock); +#endif + mutex_enter(&dtrace_lock); + } + + /* + * If anyone has /dev/dtrace open, or if there are anonymous enabled + * probes, we refuse to let providers slither away, unless this + * provider has already been explicitly invalidated. + */ + if (!old->dtpv_defunct && + (dtrace_opens || (dtrace_anon.dta_state != NULL && + dtrace_anon.dta_state->dts_necbs > 0))) { + if (!self) { + mutex_exit(&dtrace_lock); +#ifdef illumos + mutex_exit(&mod_lock); +#endif + mutex_exit(&dtrace_provider_lock); + } + return (EBUSY); + } + + /* + * Attempt to destroy the probes associated with this provider. + */ + for (i = 0; i < dtrace_nprobes; i++) { + if ((probe = dtrace_probes[i]) == NULL) + continue; + + if (probe->dtpr_provider != old) + continue; + + if (probe->dtpr_ecb == NULL) + continue; + + /* + * If we are trying to unregister a defunct provider, and the + * provider was made defunct within the interval dictated by + * dtrace_unregister_defunct_reap, we'll (asynchronously) + * attempt to reap our enablings. To denote that the provider + * should reattempt to unregister itself at some point in the + * future, we will return a differentiable error code (EAGAIN + * instead of EBUSY) in this case. + */ + if (dtrace_gethrtime() - old->dtpv_defunct > + dtrace_unregister_defunct_reap) + noreap = 1; + + if (!self) { + mutex_exit(&dtrace_lock); +#ifdef illumos + mutex_exit(&mod_lock); +#endif + mutex_exit(&dtrace_provider_lock); + } + + if (noreap) + return (EBUSY); + + (void) taskq_dispatch(dtrace_taskq, + (task_func_t *)dtrace_enabling_reap, NULL, TQ_SLEEP); + + return (EAGAIN); + } + + /* + * All of the probes for this provider are disabled; we can safely + * remove all of them from their hash chains and from the probe array. 
+ */ + for (i = 0; i < dtrace_nprobes; i++) { + if ((probe = dtrace_probes[i]) == NULL) + continue; + + if (probe->dtpr_provider != old) + continue; + + dtrace_probes[i] = NULL; + + dtrace_hash_remove(dtrace_bymod, probe); + dtrace_hash_remove(dtrace_byfunc, probe); + dtrace_hash_remove(dtrace_byname, probe); + + if (first == NULL) { + first = probe; + probe->dtpr_nextmod = NULL; + } else { + probe->dtpr_nextmod = first; + first = probe; + } + } + + /* + * The provider's probes have been removed from the hash chains and + * from the probe array. Now issue a dtrace_sync() to be sure that + * everyone has cleared out from any probe array processing. + */ + dtrace_sync(); + + for (probe = first; probe != NULL; probe = first) { + first = probe->dtpr_nextmod; + + old->dtpv_pops.dtps_destroy(old->dtpv_arg, probe->dtpr_id, + probe->dtpr_arg); + kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1); + kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1); + kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1); +#ifdef illumos + vmem_free(dtrace_arena, (void *)(uintptr_t)(probe->dtpr_id), 1); +#else + free_unr(dtrace_arena, probe->dtpr_id); +#endif + kmem_free(probe, sizeof (dtrace_probe_t)); + } + + if ((prev = dtrace_provider) == old) { +#ifdef illumos + ASSERT(self || dtrace_devi == NULL); + ASSERT(old->dtpv_next == NULL || dtrace_devi == NULL); +#endif + dtrace_provider = old->dtpv_next; + } else { + while (prev != NULL && prev->dtpv_next != old) + prev = prev->dtpv_next; + + if (prev == NULL) { + panic("attempt to unregister non-existent " + "dtrace provider %p\n", (void *)id); + } + + prev->dtpv_next = old->dtpv_next; + } + + if (!self) { + mutex_exit(&dtrace_lock); +#ifdef illumos + mutex_exit(&mod_lock); +#endif + mutex_exit(&dtrace_provider_lock); + } + + kmem_free(old->dtpv_name, strlen(old->dtpv_name) + 1); + kmem_free(old, sizeof (dtrace_provider_t)); + + return (0); +} + +/* + * Invalidate the specified provider. All subsequent probe lookups for the + * specified provider will fail, but its probes will not be removed. + */ +void +dtrace_invalidate(dtrace_provider_id_t id) +{ + dtrace_provider_t *pvp = (dtrace_provider_t *)id; + + ASSERT(pvp->dtpv_pops.dtps_enable != + (void (*)(void *, dtrace_id_t, void *))dtrace_nullop); + + mutex_enter(&dtrace_provider_lock); + mutex_enter(&dtrace_lock); + + pvp->dtpv_defunct = dtrace_gethrtime(); + + mutex_exit(&dtrace_lock); + mutex_exit(&dtrace_provider_lock); +} + +/* + * Indicate whether or not DTrace has attached. + */ +int +dtrace_attached(void) +{ + /* + * dtrace_provider will be non-NULL iff the DTrace driver has + * attached. (It's non-NULL because DTrace is always itself a + * provider.) + */ + return (dtrace_provider != NULL); +} + +/* + * Remove all the unenabled probes for the given provider. This function is + * not unlike dtrace_unregister(), except that it doesn't remove the provider + * -- just as many of its associated probes as it can. + */ +int +dtrace_condense(dtrace_provider_id_t id) +{ + dtrace_provider_t *prov = (dtrace_provider_t *)id; + int i; + dtrace_probe_t *probe; + + /* + * Make sure this isn't the dtrace provider itself. + */ + ASSERT(prov->dtpv_pops.dtps_enable != + (void (*)(void *, dtrace_id_t, void *))dtrace_nullop); + + mutex_enter(&dtrace_provider_lock); + mutex_enter(&dtrace_lock); + + /* + * Attempt to destroy the probes associated with this provider. 
+ */ + for (i = 0; i < dtrace_nprobes; i++) { + if ((probe = dtrace_probes[i]) == NULL) + continue; + + if (probe->dtpr_provider != prov) + continue; + + if (probe->dtpr_ecb != NULL) + continue; + + dtrace_probes[i] = NULL; + + dtrace_hash_remove(dtrace_bymod, probe); + dtrace_hash_remove(dtrace_byfunc, probe); + dtrace_hash_remove(dtrace_byname, probe); + + prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, i + 1, + probe->dtpr_arg); + kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1); + kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1); + kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1); + kmem_free(probe, sizeof (dtrace_probe_t)); +#ifdef illumos + vmem_free(dtrace_arena, (void *)((uintptr_t)i + 1), 1); +#else + free_unr(dtrace_arena, i + 1); +#endif + } + + mutex_exit(&dtrace_lock); + mutex_exit(&dtrace_provider_lock); + + return (0); +} + +/* + * DTrace Probe Management Functions + * + * The functions in this section perform the DTrace probe management, + * including functions to create probes, look-up probes, and call into the + * providers to request that probes be provided. Some of these functions are + * in the Provider-to-Framework API; these functions can be identified by the + * fact that they are not declared "static". + */ + +/* + * Create a probe with the specified module name, function name, and name. + */ +dtrace_id_t +dtrace_probe_create(dtrace_provider_id_t prov, const char *mod, + const char *func, const char *name, int aframes, void *arg) +{ + dtrace_probe_t *probe, **probes; + dtrace_provider_t *provider = (dtrace_provider_t *)prov; + dtrace_id_t id; + + if (provider == dtrace_provider) { + ASSERT(MUTEX_HELD(&dtrace_lock)); + } else { + mutex_enter(&dtrace_lock); + } + +#ifdef illumos + id = (dtrace_id_t)(uintptr_t)vmem_alloc(dtrace_arena, 1, + VM_BESTFIT | VM_SLEEP); +#else + id = alloc_unr(dtrace_arena); +#endif + probe = kmem_zalloc(sizeof (dtrace_probe_t), KM_SLEEP); + + probe->dtpr_id = id; + probe->dtpr_gen = dtrace_probegen++; + probe->dtpr_mod = dtrace_strdup(mod); + probe->dtpr_func = dtrace_strdup(func); + probe->dtpr_name = dtrace_strdup(name); + probe->dtpr_arg = arg; + probe->dtpr_aframes = aframes; + probe->dtpr_provider = provider; + + dtrace_hash_add(dtrace_bymod, probe); + dtrace_hash_add(dtrace_byfunc, probe); + dtrace_hash_add(dtrace_byname, probe); + + if (id - 1 >= dtrace_nprobes) { + size_t osize = dtrace_nprobes * sizeof (dtrace_probe_t *); + size_t nsize = osize << 1; + + if (nsize == 0) { + ASSERT(osize == 0); + ASSERT(dtrace_probes == NULL); + nsize = sizeof (dtrace_probe_t *); + } + + probes = kmem_zalloc(nsize, KM_SLEEP); + + if (dtrace_probes == NULL) { + ASSERT(osize == 0); + dtrace_probes = probes; + dtrace_nprobes = 1; + } else { + dtrace_probe_t **oprobes = dtrace_probes; + + bcopy(oprobes, probes, osize); + dtrace_membar_producer(); + dtrace_probes = probes; + + dtrace_sync(); + + /* + * All CPUs are now seeing the new probes array; we can + * safely free the old array. 
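+			 * (dtrace_membar_producer() above orders the
+			 * copied contents before the store of the new
+			 * pointer, and dtrace_sync() waits until no CPU
+			 * can still be dereferencing oprobes.)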
+			 */
+			kmem_free(oprobes, osize);
+			dtrace_nprobes <<= 1;
+		}
+
+		ASSERT(id - 1 < dtrace_nprobes);
+	}
+
+	ASSERT(dtrace_probes[id - 1] == NULL);
+	dtrace_probes[id - 1] = probe;
+
+	if (provider != dtrace_provider)
+		mutex_exit(&dtrace_lock);
+
+	return (id);
+}
+
+static dtrace_probe_t *
+dtrace_probe_lookup_id(dtrace_id_t id)
+{
+	ASSERT(MUTEX_HELD(&dtrace_lock));
+
+	if (id == 0 || id > dtrace_nprobes)
+		return (NULL);
+
+	return (dtrace_probes[id - 1]);
+}
+
+static int
+dtrace_probe_lookup_match(dtrace_probe_t *probe, void *arg)
+{
+	*((dtrace_id_t *)arg) = probe->dtpr_id;
+
+	return (DTRACE_MATCH_DONE);
+}
+
+/*
+ * Look up a probe based on provider and one or more of module name, function
+ * name and probe name.
+ */
+dtrace_id_t
+dtrace_probe_lookup(dtrace_provider_id_t prid, char *mod,
+    char *func, char *name)
+{
+	dtrace_probekey_t pkey;
+	dtrace_id_t id;
+	int match;
+
+	pkey.dtpk_prov = ((dtrace_provider_t *)prid)->dtpv_name;
+	pkey.dtpk_pmatch = &dtrace_match_string;
+	pkey.dtpk_mod = mod;
+	pkey.dtpk_mmatch = mod ? &dtrace_match_string : &dtrace_match_nul;
+	pkey.dtpk_func = func;
+	pkey.dtpk_fmatch = func ? &dtrace_match_string : &dtrace_match_nul;
+	pkey.dtpk_name = name;
+	pkey.dtpk_nmatch = name ? &dtrace_match_string : &dtrace_match_nul;
+	pkey.dtpk_id = DTRACE_IDNONE;
+
+	mutex_enter(&dtrace_lock);
+	match = dtrace_match(&pkey, DTRACE_PRIV_ALL, 0, 0,
+	    dtrace_probe_lookup_match, &id);
+	mutex_exit(&dtrace_lock);
+
+	ASSERT(match == 1 || match == 0);
+	return (match ? id : 0);
+}
+
+/*
+ * Returns the probe argument associated with the specified probe.
+ */
+void *
+dtrace_probe_arg(dtrace_provider_id_t id, dtrace_id_t pid)
+{
+	dtrace_probe_t *probe;
+	void *rval = NULL;
+
+	mutex_enter(&dtrace_lock);
+
+	if ((probe = dtrace_probe_lookup_id(pid)) != NULL &&
+	    probe->dtpr_provider == (dtrace_provider_t *)id)
+		rval = probe->dtpr_arg;
+
+	mutex_exit(&dtrace_lock);
+
+	return (rval);
+}
+
+/*
+ * Copy a probe into a probe description.
+ */
+static void
+dtrace_probe_description(const dtrace_probe_t *prp, dtrace_probedesc_t *pdp)
+{
+	bzero(pdp, sizeof (dtrace_probedesc_t));
+	pdp->dtpd_id = prp->dtpr_id;
+
+	(void) strncpy(pdp->dtpd_provider,
+	    prp->dtpr_provider->dtpv_name, DTRACE_PROVNAMELEN - 1);
+
+	(void) strncpy(pdp->dtpd_mod, prp->dtpr_mod, DTRACE_MODNAMELEN - 1);
+	(void) strncpy(pdp->dtpd_func, prp->dtpr_func, DTRACE_FUNCNAMELEN - 1);
+	(void) strncpy(pdp->dtpd_name, prp->dtpr_name, DTRACE_NAMELEN - 1);
+}
+
+/*
+ * Called to indicate that a probe -- or probes -- should be provided by a
+ * specified provider.  If the specified description is NULL, the provider will
+ * be told to provide all of its probes.  (This is done whenever a new
+ * consumer comes along, or whenever a retained enabling is to be matched.)  If
+ * the specified description is non-NULL, the provider is given the
+ * opportunity to dynamically provide the specified probe, allowing providers
+ * to support the creation of probes on-the-fly.  (So-called _autocreated_
+ * probes.)  If the provider is NULL, the operations will be applied to all
+ * providers; if the provider is non-NULL the operations will only be applied
+ * to the specified provider.  The dtrace_provider_lock must be held, and the
+ * dtrace_lock must _not_ be held -- the provider's dtps_provide() operation
+ * will need to grab the dtrace_lock when it reenters the framework through
+ * dtrace_probe_lookup(), dtrace_probe_create(), etc.
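+ * (For example, a pid provider can use the non-NULL-description case to
+ * create probes such as pid123:::entry on demand, rather than
+ * instrumenting every function in the target process up front.)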
+ */ +static void +dtrace_probe_provide(dtrace_probedesc_t *desc, dtrace_provider_t *prv) +{ +#ifdef illumos + modctl_t *ctl; +#endif + int all = 0; + + ASSERT(MUTEX_HELD(&dtrace_provider_lock)); + + if (prv == NULL) { + all = 1; + prv = dtrace_provider; + } + + do { + /* + * First, call the blanket provide operation. + */ + prv->dtpv_pops.dtps_provide(prv->dtpv_arg, desc); + +#ifdef illumos + /* + * Now call the per-module provide operation. We will grab + * mod_lock to prevent the list from being modified. Note + * that this also prevents the mod_busy bits from changing. + * (mod_busy can only be changed with mod_lock held.) + */ + mutex_enter(&mod_lock); + + ctl = &modules; + do { + if (ctl->mod_busy || ctl->mod_mp == NULL) + continue; + + prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl); + + } while ((ctl = ctl->mod_next) != &modules); + + mutex_exit(&mod_lock); +#endif + } while (all && (prv = prv->dtpv_next) != NULL); +} + +#ifdef illumos +/* + * Iterate over each probe, and call the Framework-to-Provider API function + * denoted by offs. + */ +static void +dtrace_probe_foreach(uintptr_t offs) +{ + dtrace_provider_t *prov; + void (*func)(void *, dtrace_id_t, void *); + dtrace_probe_t *probe; + dtrace_icookie_t cookie; + int i; + + /* + * We disable interrupts to walk through the probe array. This is + * safe -- the dtrace_sync() in dtrace_unregister() assures that we + * won't see stale data. + */ + cookie = dtrace_interrupt_disable(); + + for (i = 0; i < dtrace_nprobes; i++) { + if ((probe = dtrace_probes[i]) == NULL) + continue; + + if (probe->dtpr_ecb == NULL) { + /* + * This probe isn't enabled -- don't call the function. + */ + continue; + } + + prov = probe->dtpr_provider; + func = *((void(**)(void *, dtrace_id_t, void *)) + ((uintptr_t)&prov->dtpv_pops + offs)); + + func(prov->dtpv_arg, i + 1, probe->dtpr_arg); + } + + dtrace_interrupt_enable(cookie); +} +#endif + +static int +dtrace_probe_enable(dtrace_probedesc_t *desc, dtrace_enabling_t *enab) +{ + dtrace_probekey_t pkey; + uint32_t priv; + uid_t uid; + zoneid_t zoneid; + + ASSERT(MUTEX_HELD(&dtrace_lock)); + dtrace_ecb_create_cache = NULL; + + if (desc == NULL) { + /* + * If we're passed a NULL description, we're being asked to + * create an ECB with a NULL probe. 
+ */ + (void) dtrace_ecb_create_enable(NULL, enab); + return (0); + } + + dtrace_probekey(desc, &pkey); + dtrace_cred2priv(enab->dten_vstate->dtvs_state->dts_cred.dcr_cred, + &priv, &uid, &zoneid); + + return (dtrace_match(&pkey, priv, uid, zoneid, dtrace_ecb_create_enable, + enab)); +} + +/* + * DTrace Helper Provider Functions + */ +static void +dtrace_dofattr2attr(dtrace_attribute_t *attr, const dof_attr_t dofattr) +{ + attr->dtat_name = DOF_ATTR_NAME(dofattr); + attr->dtat_data = DOF_ATTR_DATA(dofattr); + attr->dtat_class = DOF_ATTR_CLASS(dofattr); +} + +static void +dtrace_dofprov2hprov(dtrace_helper_provdesc_t *hprov, + const dof_provider_t *dofprov, char *strtab) +{ + hprov->dthpv_provname = strtab + dofprov->dofpv_name; + dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_provider, + dofprov->dofpv_provattr); + dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_mod, + dofprov->dofpv_modattr); + dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_func, + dofprov->dofpv_funcattr); + dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_name, + dofprov->dofpv_nameattr); + dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_args, + dofprov->dofpv_argsattr); +} + +static void +dtrace_helper_provide_one(dof_helper_t *dhp, dof_sec_t *sec, pid_t pid) +{ + uintptr_t daddr = (uintptr_t)dhp->dofhp_dof; + dof_hdr_t *dof = (dof_hdr_t *)daddr; + dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec; + dof_provider_t *provider; + dof_probe_t *probe; + uint32_t *off, *enoff; + uint8_t *arg; + char *strtab; + uint_t i, nprobes; + dtrace_helper_provdesc_t dhpv; + dtrace_helper_probedesc_t dhpb; + dtrace_meta_t *meta = dtrace_meta_pid; + dtrace_mops_t *mops = &meta->dtm_mops; + void *parg; + + provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset); + str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff + + provider->dofpv_strtab * dof->dofh_secsize); + prb_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff + + provider->dofpv_probes * dof->dofh_secsize); + arg_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff + + provider->dofpv_prargs * dof->dofh_secsize); + off_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff + + provider->dofpv_proffs * dof->dofh_secsize); + + strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset); + off = (uint32_t *)(uintptr_t)(daddr + off_sec->dofs_offset); + arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset); + enoff = NULL; + + /* + * See dtrace_helper_provider_validate(). + */ + if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 && + provider->dofpv_prenoffs != DOF_SECT_NONE) { + enoff_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff + + provider->dofpv_prenoffs * dof->dofh_secsize); + enoff = (uint32_t *)(uintptr_t)(daddr + enoff_sec->dofs_offset); + } + + nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize; + + /* + * Create the provider. + */ + dtrace_dofprov2hprov(&dhpv, provider, strtab); + + if ((parg = mops->dtms_provide_pid(meta->dtm_arg, &dhpv, pid)) == NULL) + return; + + meta->dtm_count++; + + /* + * Create the probes. + */ + for (i = 0; i < nprobes; i++) { + probe = (dof_probe_t *)(uintptr_t)(daddr + + prb_sec->dofs_offset + i * prb_sec->dofs_entsize); + + /* See the check in dtrace_helper_provider_validate(). 
*/ + if (strlen(strtab + probe->dofpr_func) >= DTRACE_FUNCNAMELEN) + continue; + + dhpb.dthpb_mod = dhp->dofhp_mod; + dhpb.dthpb_func = strtab + probe->dofpr_func; + dhpb.dthpb_name = strtab + probe->dofpr_name; + dhpb.dthpb_base = probe->dofpr_addr; + dhpb.dthpb_offs = off + probe->dofpr_offidx; + dhpb.dthpb_noffs = probe->dofpr_noffs; + if (enoff != NULL) { + dhpb.dthpb_enoffs = enoff + probe->dofpr_enoffidx; + dhpb.dthpb_nenoffs = probe->dofpr_nenoffs; + } else { + dhpb.dthpb_enoffs = NULL; + dhpb.dthpb_nenoffs = 0; + } + dhpb.dthpb_args = arg + probe->dofpr_argidx; + dhpb.dthpb_nargc = probe->dofpr_nargc; + dhpb.dthpb_xargc = probe->dofpr_xargc; + dhpb.dthpb_ntypes = strtab + probe->dofpr_nargv; + dhpb.dthpb_xtypes = strtab + probe->dofpr_xargv; + + mops->dtms_create_probe(meta->dtm_arg, parg, &dhpb); + } +} + +static void +dtrace_helper_provide(dof_helper_t *dhp, pid_t pid) +{ + uintptr_t daddr = (uintptr_t)dhp->dofhp_dof; + dof_hdr_t *dof = (dof_hdr_t *)daddr; + int i; + + ASSERT(MUTEX_HELD(&dtrace_meta_lock)); + + for (i = 0; i < dof->dofh_secnum; i++) { + dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr + + dof->dofh_secoff + i * dof->dofh_secsize); + + if (sec->dofs_type != DOF_SECT_PROVIDER) + continue; + + dtrace_helper_provide_one(dhp, sec, pid); + } + + /* + * We may have just created probes, so we must now rematch against + * any retained enablings. Note that this call will acquire both + * cpu_lock and dtrace_lock; the fact that we are holding + * dtrace_meta_lock now is what defines the ordering with respect to + * these three locks. + */ + dtrace_enabling_matchall(); +} + +static void +dtrace_helper_provider_remove_one(dof_helper_t *dhp, dof_sec_t *sec, pid_t pid) +{ + uintptr_t daddr = (uintptr_t)dhp->dofhp_dof; + dof_hdr_t *dof = (dof_hdr_t *)daddr; + dof_sec_t *str_sec; + dof_provider_t *provider; + char *strtab; + dtrace_helper_provdesc_t dhpv; + dtrace_meta_t *meta = dtrace_meta_pid; + dtrace_mops_t *mops = &meta->dtm_mops; + + provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset); + str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff + + provider->dofpv_strtab * dof->dofh_secsize); + + strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset); + + /* + * Create the provider. + */ + dtrace_dofprov2hprov(&dhpv, provider, strtab); + + mops->dtms_remove_pid(meta->dtm_arg, &dhpv, pid); + + meta->dtm_count--; +} + +static void +dtrace_helper_provider_remove(dof_helper_t *dhp, pid_t pid) +{ + uintptr_t daddr = (uintptr_t)dhp->dofhp_dof; + dof_hdr_t *dof = (dof_hdr_t *)daddr; + int i; + + ASSERT(MUTEX_HELD(&dtrace_meta_lock)); + + for (i = 0; i < dof->dofh_secnum; i++) { + dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr + + dof->dofh_secoff + i * dof->dofh_secsize); + + if (sec->dofs_type != DOF_SECT_PROVIDER) + continue; + + dtrace_helper_provider_remove_one(dhp, sec, pid); + } +} + +/* + * DTrace Meta Provider-to-Framework API Functions + * + * These functions implement the Meta Provider-to-Framework API, as described + * in <sys/dtrace.h>. + */ +int +dtrace_meta_register(const char *name, const dtrace_mops_t *mops, void *arg, + dtrace_meta_provider_id_t *idp) +{ + dtrace_meta_t *meta; + dtrace_helpers_t *help, *next; + int i; + + *idp = DTRACE_METAPROVNONE; + + /* + * We strictly don't need the name, but we hold onto it for + * debuggability. All hail error queues! 
+	 */
+	if (name == NULL) {
+		cmn_err(CE_WARN, "failed to register meta-provider: "
+		    "invalid name");
+		return (EINVAL);
+	}
+
+	if (mops == NULL ||
+	    mops->dtms_create_probe == NULL ||
+	    mops->dtms_provide_pid == NULL ||
+	    mops->dtms_remove_pid == NULL) {
+		cmn_err(CE_WARN, "failed to register meta-provider %s: "
+		    "invalid ops", name);
+		return (EINVAL);
+	}
+
+	meta = kmem_zalloc(sizeof (dtrace_meta_t), KM_SLEEP);
+	meta->dtm_mops = *mops;
+	meta->dtm_name = kmem_alloc(strlen(name) + 1, KM_SLEEP);
+	(void) strcpy(meta->dtm_name, name);
+	meta->dtm_arg = arg;
+
+	mutex_enter(&dtrace_meta_lock);
+	mutex_enter(&dtrace_lock);
+
+	if (dtrace_meta_pid != NULL) {
+		mutex_exit(&dtrace_lock);
+		mutex_exit(&dtrace_meta_lock);
+		cmn_err(CE_WARN, "failed to register meta-provider %s: "
+		    "user-land meta-provider exists", name);
+		kmem_free(meta->dtm_name, strlen(meta->dtm_name) + 1);
+		kmem_free(meta, sizeof (dtrace_meta_t));
+		return (EINVAL);
+	}
+
+	dtrace_meta_pid = meta;
+	*idp = (dtrace_meta_provider_id_t)meta;
+
+	/*
+	 * If there are providers and probes ready to go, pass them
+	 * off to the new meta provider now.
+	 */
+
+	help = dtrace_deferred_pid;
+	dtrace_deferred_pid = NULL;
+
+	mutex_exit(&dtrace_lock);
+
+	while (help != NULL) {
+		for (i = 0; i < help->dthps_nprovs; i++) {
+			dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,
+			    help->dthps_pid);
+		}
+
+		next = help->dthps_next;
+		help->dthps_next = NULL;
+		help->dthps_prev = NULL;
+		help->dthps_deferred = 0;
+		help = next;
+	}
+
+	mutex_exit(&dtrace_meta_lock);
+
+	return (0);
+}
+
+int
+dtrace_meta_unregister(dtrace_meta_provider_id_t id)
+{
+	dtrace_meta_t **pp, *old = (dtrace_meta_t *)id;
+
+	mutex_enter(&dtrace_meta_lock);
+	mutex_enter(&dtrace_lock);
+
+	if (old == dtrace_meta_pid) {
+		pp = &dtrace_meta_pid;
+	} else {
+		panic("attempt to unregister non-existent "
+		    "dtrace meta-provider %p\n", (void *)old);
+	}
+
+	if (old->dtm_count != 0) {
+		mutex_exit(&dtrace_lock);
+		mutex_exit(&dtrace_meta_lock);
+		return (EBUSY);
+	}
+
+	*pp = NULL;
+
+	mutex_exit(&dtrace_lock);
+	mutex_exit(&dtrace_meta_lock);
+
+	kmem_free(old->dtm_name, strlen(old->dtm_name) + 1);
+	kmem_free(old, sizeof (dtrace_meta_t));
+
+	return (0);
+}
+
+
+/*
+ * DTrace DIF Object Functions
+ */
+static int
+dtrace_difo_err(uint_t pc, const char *format, ...)
+{
+	if (dtrace_err_verbose) {
+		va_list alist;
+
+		(void) uprintf("dtrace DIF object error: [%u]: ", pc);
+		va_start(alist, format);
+		(void) vuprintf(format, alist);
+		va_end(alist);
+	}
+
+#ifdef DTRACE_ERRDEBUG
+	dtrace_errdebug(format);
+#endif
+	return (1);
+}
+
+/*
+ * Validate a DTrace DIF object by checking the IR instructions.  The following
+ * rules are currently enforced by dtrace_difo_validate():
+ *
+ * 1. Each instruction must have a valid opcode
+ * 2. Each register, string, variable, or subroutine reference must be valid
+ * 3. No instruction can modify register %r0 (must be zero)
+ * 4. All instruction reserved bits must be set to zero
+ * 5. The last instruction must be a "ret" instruction
+ * 6. All branch targets must reference a valid instruction _after_ the branch
+ */
+static int
+dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs,
+    cred_t *cr)
+{
+	int err = 0, i;
+	int (*efunc)(uint_t pc, const char *, ...)
= dtrace_difo_err; + int kcheckload; + uint_t pc; + int maxglobal = -1, maxlocal = -1, maxtlocal = -1; + + kcheckload = cr == NULL || + (vstate->dtvs_state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) == 0; + + dp->dtdo_destructive = 0; + + for (pc = 0; pc < dp->dtdo_len && err == 0; pc++) { + dif_instr_t instr = dp->dtdo_buf[pc]; + + uint_t r1 = DIF_INSTR_R1(instr); + uint_t r2 = DIF_INSTR_R2(instr); + uint_t rd = DIF_INSTR_RD(instr); + uint_t rs = DIF_INSTR_RS(instr); + uint_t label = DIF_INSTR_LABEL(instr); + uint_t v = DIF_INSTR_VAR(instr); + uint_t subr = DIF_INSTR_SUBR(instr); + uint_t type = DIF_INSTR_TYPE(instr); + uint_t op = DIF_INSTR_OP(instr); + + switch (op) { + case DIF_OP_OR: + case DIF_OP_XOR: + case DIF_OP_AND: + case DIF_OP_SLL: + case DIF_OP_SRL: + case DIF_OP_SRA: + case DIF_OP_SUB: + case DIF_OP_ADD: + case DIF_OP_MUL: + case DIF_OP_SDIV: + case DIF_OP_UDIV: + case DIF_OP_SREM: + case DIF_OP_UREM: + case DIF_OP_COPYS: + if (r1 >= nregs) + err += efunc(pc, "invalid register %u\n", r1); + if (r2 >= nregs) + err += efunc(pc, "invalid register %u\n", r2); + if (rd >= nregs) + err += efunc(pc, "invalid register %u\n", rd); + if (rd == 0) + err += efunc(pc, "cannot write to %r0\n"); + break; + case DIF_OP_NOT: + case DIF_OP_MOV: + case DIF_OP_ALLOCS: + if (r1 >= nregs) + err += efunc(pc, "invalid register %u\n", r1); + if (r2 != 0) + err += efunc(pc, "non-zero reserved bits\n"); + if (rd >= nregs) + err += efunc(pc, "invalid register %u\n", rd); + if (rd == 0) + err += efunc(pc, "cannot write to %r0\n"); + break; + case DIF_OP_LDSB: + case DIF_OP_LDSH: + case DIF_OP_LDSW: + case DIF_OP_LDUB: + case DIF_OP_LDUH: + case DIF_OP_LDUW: + case DIF_OP_LDX: + if (r1 >= nregs) + err += efunc(pc, "invalid register %u\n", r1); + if (r2 != 0) + err += efunc(pc, "non-zero reserved bits\n"); + if (rd >= nregs) + err += efunc(pc, "invalid register %u\n", rd); + if (rd == 0) + err += efunc(pc, "cannot write to %r0\n"); + if (kcheckload) + dp->dtdo_buf[pc] = DIF_INSTR_LOAD(op + + DIF_OP_RLDSB - DIF_OP_LDSB, r1, rd); + break; + case DIF_OP_RLDSB: + case DIF_OP_RLDSH: + case DIF_OP_RLDSW: + case DIF_OP_RLDUB: + case DIF_OP_RLDUH: + case DIF_OP_RLDUW: + case DIF_OP_RLDX: + if (r1 >= nregs) + err += efunc(pc, "invalid register %u\n", r1); + if (r2 != 0) + err += efunc(pc, "non-zero reserved bits\n"); + if (rd >= nregs) + err += efunc(pc, "invalid register %u\n", rd); + if (rd == 0) + err += efunc(pc, "cannot write to %r0\n"); + break; + case DIF_OP_ULDSB: + case DIF_OP_ULDSH: + case DIF_OP_ULDSW: + case DIF_OP_ULDUB: + case DIF_OP_ULDUH: + case DIF_OP_ULDUW: + case DIF_OP_ULDX: + if (r1 >= nregs) + err += efunc(pc, "invalid register %u\n", r1); + if (r2 != 0) + err += efunc(pc, "non-zero reserved bits\n"); + if (rd >= nregs) + err += efunc(pc, "invalid register %u\n", rd); + if (rd == 0) + err += efunc(pc, "cannot write to %r0\n"); + break; + case DIF_OP_STB: + case DIF_OP_STH: + case DIF_OP_STW: + case DIF_OP_STX: + if (r1 >= nregs) + err += efunc(pc, "invalid register %u\n", r1); + if (r2 != 0) + err += efunc(pc, "non-zero reserved bits\n"); + if (rd >= nregs) + err += efunc(pc, "invalid register %u\n", rd); + if (rd == 0) + err += efunc(pc, "cannot write to 0 address\n"); + break; + case DIF_OP_CMP: + case DIF_OP_SCMP: + if (r1 >= nregs) + err += efunc(pc, "invalid register %u\n", r1); + if (r2 >= nregs) + err += efunc(pc, "invalid register %u\n", r2); + if (rd != 0) + err += efunc(pc, "non-zero reserved bits\n"); + break; + case DIF_OP_TST: + if (r1 >= nregs) + err += efunc(pc, "invalid 
register %u\n", r1);
+			if (r2 != 0 || rd != 0)
+				err += efunc(pc, "non-zero reserved bits\n");
+			break;
+		case DIF_OP_BA:
+		case DIF_OP_BE:
+		case DIF_OP_BNE:
+		case DIF_OP_BG:
+		case DIF_OP_BGU:
+		case DIF_OP_BGE:
+		case DIF_OP_BGEU:
+		case DIF_OP_BL:
+		case DIF_OP_BLU:
+		case DIF_OP_BLE:
+		case DIF_OP_BLEU:
+			if (label >= dp->dtdo_len) {
+				err += efunc(pc, "invalid branch target %u\n",
+				    label);
+			}
+			if (label <= pc) {
+				err += efunc(pc, "backward branch to %u\n",
+				    label);
+			}
+			break;
+		case DIF_OP_RET:
+			if (r1 != 0 || r2 != 0)
+				err += efunc(pc, "non-zero reserved bits\n");
+			if (rd >= nregs)
+				err += efunc(pc, "invalid register %u\n", rd);
+			break;
+		case DIF_OP_NOP:
+		case DIF_OP_POPTS:
+		case DIF_OP_FLUSHTS:
+			if (r1 != 0 || r2 != 0 || rd != 0)
+				err += efunc(pc, "non-zero reserved bits\n");
+			break;
+		case DIF_OP_SETX:
+			if (DIF_INSTR_INTEGER(instr) >= dp->dtdo_intlen) {
+				err += efunc(pc, "invalid integer ref %u\n",
+				    DIF_INSTR_INTEGER(instr));
+			}
+			if (rd >= nregs)
+				err += efunc(pc, "invalid register %u\n", rd);
+			if (rd == 0)
+				err += efunc(pc, "cannot write to %r0\n");
+			break;
+		case DIF_OP_SETS:
+			if (DIF_INSTR_STRING(instr) >= dp->dtdo_strlen) {
+				err += efunc(pc, "invalid string ref %u\n",
+				    DIF_INSTR_STRING(instr));
+			}
+			if (rd >= nregs)
+				err += efunc(pc, "invalid register %u\n", rd);
+			if (rd == 0)
+				err += efunc(pc, "cannot write to %r0\n");
+			break;
+		case DIF_OP_LDGA:
+		case DIF_OP_LDTA:
+			if (r1 > DIF_VAR_ARRAY_MAX)
+				err += efunc(pc, "invalid array %u\n", r1);
+			if (r2 >= nregs)
+				err += efunc(pc, "invalid register %u\n", r2);
+			if (rd >= nregs)
+				err += efunc(pc, "invalid register %u\n", rd);
+			if (rd == 0)
+				err += efunc(pc, "cannot write to %r0\n");
+			break;
+		case DIF_OP_LDGS:
+		case DIF_OP_LDTS:
+		case DIF_OP_LDLS:
+		case DIF_OP_LDGAA:
+		case DIF_OP_LDTAA:
+			if (v < DIF_VAR_OTHER_MIN || v > DIF_VAR_OTHER_MAX)
+				err += efunc(pc, "invalid variable %u\n", v);
+			if (rd >= nregs)
+				err += efunc(pc, "invalid register %u\n", rd);
+			if (rd == 0)
+				err += efunc(pc, "cannot write to %r0\n");
+			break;
+		case DIF_OP_STGS:
+		case DIF_OP_STTS:
+		case DIF_OP_STLS:
+		case DIF_OP_STGAA:
+		case DIF_OP_STTAA:
+			if (v < DIF_VAR_OTHER_UBASE || v > DIF_VAR_OTHER_MAX)
+				err += efunc(pc, "invalid variable %u\n", v);
+			if (rs >= nregs)
+				err += efunc(pc, "invalid register %u\n", rs);
+			break;
+		case DIF_OP_CALL:
+			if (subr > DIF_SUBR_MAX)
+				err += efunc(pc, "invalid subr %u\n", subr);
+			if (rd >= nregs)
+				err += efunc(pc, "invalid register %u\n", rd);
+			if (rd == 0)
+				err += efunc(pc, "cannot write to %r0\n");
+
+			if (subr == DIF_SUBR_COPYOUT ||
+			    subr == DIF_SUBR_COPYOUTSTR) {
+				dp->dtdo_destructive = 1;
+			}
+
+			if (subr == DIF_SUBR_GETF) {
+				/*
+				 * If we have a getf() we need to record that
+				 * in our state.  Note that our state can be
+				 * NULL if this is a helper -- but in that
+				 * case, the call to getf() is itself illegal,
+				 * and will be caught (slightly later) when
+				 * the helper is validated.
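+				 * (dtrace_difo_validate_helper() makes this
+				 * concrete: DIF_SUBR_GETF is absent from its
+				 * subroutine whitelist, so a helper calling
+				 * getf() fails there with "invalid subr".)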
+ */ + if (vstate->dtvs_state != NULL) + vstate->dtvs_state->dts_getf++; + } + + break; + case DIF_OP_PUSHTR: + if (type != DIF_TYPE_STRING && type != DIF_TYPE_CTF) + err += efunc(pc, "invalid ref type %u\n", type); + if (r2 >= nregs) + err += efunc(pc, "invalid register %u\n", r2); + if (rs >= nregs) + err += efunc(pc, "invalid register %u\n", rs); + break; + case DIF_OP_PUSHTV: + if (type != DIF_TYPE_CTF) + err += efunc(pc, "invalid val type %u\n", type); + if (r2 >= nregs) + err += efunc(pc, "invalid register %u\n", r2); + if (rs >= nregs) + err += efunc(pc, "invalid register %u\n", rs); + break; + default: + err += efunc(pc, "invalid opcode %u\n", + DIF_INSTR_OP(instr)); + } + } + + if (dp->dtdo_len != 0 && + DIF_INSTR_OP(dp->dtdo_buf[dp->dtdo_len - 1]) != DIF_OP_RET) { + err += efunc(dp->dtdo_len - 1, + "expected 'ret' as last DIF instruction\n"); + } + + if (!(dp->dtdo_rtype.dtdt_flags & (DIF_TF_BYREF | DIF_TF_BYUREF))) { + /* + * If we're not returning by reference, the size must be either + * 0 or the size of one of the base types. + */ + switch (dp->dtdo_rtype.dtdt_size) { + case 0: + case sizeof (uint8_t): + case sizeof (uint16_t): + case sizeof (uint32_t): + case sizeof (uint64_t): + break; + + default: + err += efunc(dp->dtdo_len - 1, "bad return size\n"); + } + } + + for (i = 0; i < dp->dtdo_varlen && err == 0; i++) { + dtrace_difv_t *v = &dp->dtdo_vartab[i], *existing = NULL; + dtrace_diftype_t *vt, *et; + uint_t id, ndx; + + if (v->dtdv_scope != DIFV_SCOPE_GLOBAL && + v->dtdv_scope != DIFV_SCOPE_THREAD && + v->dtdv_scope != DIFV_SCOPE_LOCAL) { + err += efunc(i, "unrecognized variable scope %d\n", + v->dtdv_scope); + break; + } + + if (v->dtdv_kind != DIFV_KIND_ARRAY && + v->dtdv_kind != DIFV_KIND_SCALAR) { + err += efunc(i, "unrecognized variable type %d\n", + v->dtdv_kind); + break; + } + + if ((id = v->dtdv_id) > DIF_VARIABLE_MAX) { + err += efunc(i, "%d exceeds variable id limit\n", id); + break; + } + + if (id < DIF_VAR_OTHER_UBASE) + continue; + + /* + * For user-defined variables, we need to check that this + * definition is identical to any previous definition that we + * encountered. 
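+		 * (For example, DIF that declares variable id N as a by-ref
+		 * string in one DIFO and as a scalar integer in another is
+		 * rejected by the kind/flags checks below.)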
+		 */
+		ndx = id - DIF_VAR_OTHER_UBASE;
+
+		switch (v->dtdv_scope) {
+		case DIFV_SCOPE_GLOBAL:
+			if (maxglobal == -1 || ndx > maxglobal)
+				maxglobal = ndx;
+
+			if (ndx < vstate->dtvs_nglobals) {
+				dtrace_statvar_t *svar;
+
+				if ((svar = vstate->dtvs_globals[ndx]) != NULL)
+					existing = &svar->dtsv_var;
+			}
+
+			break;
+
+		case DIFV_SCOPE_THREAD:
+			if (maxtlocal == -1 || ndx > maxtlocal)
+				maxtlocal = ndx;
+
+			if (ndx < vstate->dtvs_ntlocals)
+				existing = &vstate->dtvs_tlocals[ndx];
+			break;
+
+		case DIFV_SCOPE_LOCAL:
+			if (maxlocal == -1 || ndx > maxlocal)
+				maxlocal = ndx;
+
+			if (ndx < vstate->dtvs_nlocals) {
+				dtrace_statvar_t *svar;
+
+				if ((svar = vstate->dtvs_locals[ndx]) != NULL)
+					existing = &svar->dtsv_var;
+			}
+
+			break;
+		}
+
+		vt = &v->dtdv_type;
+
+		if (vt->dtdt_flags & DIF_TF_BYREF) {
+			if (vt->dtdt_size == 0) {
+				err += efunc(i, "zero-sized variable\n");
+				break;
+			}
+
+			if ((v->dtdv_scope == DIFV_SCOPE_GLOBAL ||
+			    v->dtdv_scope == DIFV_SCOPE_LOCAL) &&
+			    vt->dtdt_size > dtrace_statvar_maxsize) {
+				err += efunc(i, "oversized by-ref static\n");
+				break;
+			}
+		}
+
+		if (existing == NULL || existing->dtdv_id == 0)
+			continue;
+
+		ASSERT(existing->dtdv_id == v->dtdv_id);
+		ASSERT(existing->dtdv_scope == v->dtdv_scope);
+
+		if (existing->dtdv_kind != v->dtdv_kind)
+			err += efunc(i, "%d changed variable kind\n", id);
+
+		et = &existing->dtdv_type;
+
+		if (vt->dtdt_flags != et->dtdt_flags) {
+			err += efunc(i, "%d changed variable type flags\n", id);
+			break;
+		}
+
+		if (vt->dtdt_size != 0 && vt->dtdt_size != et->dtdt_size) {
+			err += efunc(i, "%d changed variable type size\n", id);
+			break;
+		}
+	}
+
+	for (pc = 0; pc < dp->dtdo_len && err == 0; pc++) {
+		dif_instr_t instr = dp->dtdo_buf[pc];
+
+		uint_t v = DIF_INSTR_VAR(instr);
+		uint_t op = DIF_INSTR_OP(instr);
+
+		switch (op) {
+		case DIF_OP_LDGS:
+		case DIF_OP_LDGAA:
+		case DIF_OP_STGS:
+		case DIF_OP_STGAA:
+			if (v > DIF_VAR_OTHER_UBASE + maxglobal)
+				err += efunc(pc, "invalid variable %u\n", v);
+			break;
+		case DIF_OP_LDTS:
+		case DIF_OP_LDTAA:
+		case DIF_OP_STTS:
+		case DIF_OP_STTAA:
+			if (v > DIF_VAR_OTHER_UBASE + maxtlocal)
+				err += efunc(pc, "invalid variable %u\n", v);
+			break;
+		case DIF_OP_LDLS:
+		case DIF_OP_STLS:
+			if (v > DIF_VAR_OTHER_UBASE + maxlocal)
+				err += efunc(pc, "invalid variable %u\n", v);
+			break;
+		default:
+			break;
+		}
+	}
+
+	return (err);
+}
+
+/*
+ * Validate a DTrace DIF object that is to be used as a helper.  Helpers
+ * are much more constrained than normal DIFOs.  Specifically, they may
+ * not:
+ *
+ * 1. Make calls to subroutines other than copyin(), copyinstr() or
+ *    miscellaneous string routines
+ * 2. Access DTrace variables other than the args[] array, and the
+ *    curthread, pid, ppid, tid, execname, zonename, uid and gid variables.
+ * 3. Have thread-local variables.
+ * 4. Have dynamic variables.
+ */
+static int
+dtrace_difo_validate_helper(dtrace_difo_t *dp)
+{
+	int (*efunc)(uint_t pc, const char *, ...)
= dtrace_difo_err; + int err = 0; + uint_t pc; + + for (pc = 0; pc < dp->dtdo_len; pc++) { + dif_instr_t instr = dp->dtdo_buf[pc]; + + uint_t v = DIF_INSTR_VAR(instr); + uint_t subr = DIF_INSTR_SUBR(instr); + uint_t op = DIF_INSTR_OP(instr); + + switch (op) { + case DIF_OP_OR: + case DIF_OP_XOR: + case DIF_OP_AND: + case DIF_OP_SLL: + case DIF_OP_SRL: + case DIF_OP_SRA: + case DIF_OP_SUB: + case DIF_OP_ADD: + case DIF_OP_MUL: + case DIF_OP_SDIV: + case DIF_OP_UDIV: + case DIF_OP_SREM: + case DIF_OP_UREM: + case DIF_OP_COPYS: + case DIF_OP_NOT: + case DIF_OP_MOV: + case DIF_OP_RLDSB: + case DIF_OP_RLDSH: + case DIF_OP_RLDSW: + case DIF_OP_RLDUB: + case DIF_OP_RLDUH: + case DIF_OP_RLDUW: + case DIF_OP_RLDX: + case DIF_OP_ULDSB: + case DIF_OP_ULDSH: + case DIF_OP_ULDSW: + case DIF_OP_ULDUB: + case DIF_OP_ULDUH: + case DIF_OP_ULDUW: + case DIF_OP_ULDX: + case DIF_OP_STB: + case DIF_OP_STH: + case DIF_OP_STW: + case DIF_OP_STX: + case DIF_OP_ALLOCS: + case DIF_OP_CMP: + case DIF_OP_SCMP: + case DIF_OP_TST: + case DIF_OP_BA: + case DIF_OP_BE: + case DIF_OP_BNE: + case DIF_OP_BG: + case DIF_OP_BGU: + case DIF_OP_BGE: + case DIF_OP_BGEU: + case DIF_OP_BL: + case DIF_OP_BLU: + case DIF_OP_BLE: + case DIF_OP_BLEU: + case DIF_OP_RET: + case DIF_OP_NOP: + case DIF_OP_POPTS: + case DIF_OP_FLUSHTS: + case DIF_OP_SETX: + case DIF_OP_SETS: + case DIF_OP_LDGA: + case DIF_OP_LDLS: + case DIF_OP_STGS: + case DIF_OP_STLS: + case DIF_OP_PUSHTR: + case DIF_OP_PUSHTV: + break; + + case DIF_OP_LDGS: + if (v >= DIF_VAR_OTHER_UBASE) + break; + + if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9) + break; + + if (v == DIF_VAR_CURTHREAD || v == DIF_VAR_PID || + v == DIF_VAR_PPID || v == DIF_VAR_TID || + v == DIF_VAR_EXECARGS || + v == DIF_VAR_EXECNAME || v == DIF_VAR_ZONENAME || + v == DIF_VAR_UID || v == DIF_VAR_GID) + break; + + err += efunc(pc, "illegal variable %u\n", v); + break; + + case DIF_OP_LDTA: + case DIF_OP_LDTS: + case DIF_OP_LDGAA: + case DIF_OP_LDTAA: + err += efunc(pc, "illegal dynamic variable load\n"); + break; + + case DIF_OP_STTS: + case DIF_OP_STGAA: + case DIF_OP_STTAA: + err += efunc(pc, "illegal dynamic variable store\n"); + break; + + case DIF_OP_CALL: + if (subr == DIF_SUBR_ALLOCA || + subr == DIF_SUBR_BCOPY || + subr == DIF_SUBR_COPYIN || + subr == DIF_SUBR_COPYINTO || + subr == DIF_SUBR_COPYINSTR || + subr == DIF_SUBR_INDEX || + subr == DIF_SUBR_INET_NTOA || + subr == DIF_SUBR_INET_NTOA6 || + subr == DIF_SUBR_INET_NTOP || + subr == DIF_SUBR_JSON || + subr == DIF_SUBR_LLTOSTR || + subr == DIF_SUBR_STRTOLL || + subr == DIF_SUBR_RINDEX || + subr == DIF_SUBR_STRCHR || + subr == DIF_SUBR_STRJOIN || + subr == DIF_SUBR_STRRCHR || + subr == DIF_SUBR_STRSTR || + subr == DIF_SUBR_HTONS || + subr == DIF_SUBR_HTONL || + subr == DIF_SUBR_HTONLL || + subr == DIF_SUBR_NTOHS || + subr == DIF_SUBR_NTOHL || + subr == DIF_SUBR_NTOHLL || + subr == DIF_SUBR_MEMREF) + break; +#ifdef __FreeBSD__ + if (subr == DIF_SUBR_MEMSTR) + break; +#endif + + err += efunc(pc, "invalid subr %u\n", subr); + break; + + default: + err += efunc(pc, "invalid opcode %u\n", + DIF_INSTR_OP(instr)); + } + } + + return (err); +} + +/* + * Returns 1 if the expression in the DIF object can be cached on a per-thread + * basis; 0 if not. 
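+ * (For example, a predicate such as /pid == 1234/ refers only to
+ * DIF_VAR_PID and so remains cacheable, while one that loads from memory
+ * or stores to a thread-local variable does not.)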
+ */ +static int +dtrace_difo_cacheable(dtrace_difo_t *dp) +{ + int i; + + if (dp == NULL) + return (0); + + for (i = 0; i < dp->dtdo_varlen; i++) { + dtrace_difv_t *v = &dp->dtdo_vartab[i]; + + if (v->dtdv_scope != DIFV_SCOPE_GLOBAL) + continue; + + switch (v->dtdv_id) { + case DIF_VAR_CURTHREAD: + case DIF_VAR_PID: + case DIF_VAR_TID: + case DIF_VAR_EXECARGS: + case DIF_VAR_EXECNAME: + case DIF_VAR_ZONENAME: + break; + + default: + return (0); + } + } + + /* + * This DIF object may be cacheable. Now we need to look for any + * array loading instructions, any memory loading instructions, or + * any stores to thread-local variables. + */ + for (i = 0; i < dp->dtdo_len; i++) { + uint_t op = DIF_INSTR_OP(dp->dtdo_buf[i]); + + if ((op >= DIF_OP_LDSB && op <= DIF_OP_LDX) || + (op >= DIF_OP_ULDSB && op <= DIF_OP_ULDX) || + (op >= DIF_OP_RLDSB && op <= DIF_OP_RLDX) || + op == DIF_OP_LDGA || op == DIF_OP_STTS) + return (0); + } + + return (1); +} + +static void +dtrace_difo_hold(dtrace_difo_t *dp) +{ + int i; + + ASSERT(MUTEX_HELD(&dtrace_lock)); + + dp->dtdo_refcnt++; + ASSERT(dp->dtdo_refcnt != 0); + + /* + * We need to check this DIF object for references to the variable + * DIF_VAR_VTIMESTAMP. + */ + for (i = 0; i < dp->dtdo_varlen; i++) { + dtrace_difv_t *v = &dp->dtdo_vartab[i]; + + if (v->dtdv_id != DIF_VAR_VTIMESTAMP) + continue; + + if (dtrace_vtime_references++ == 0) + dtrace_vtime_enable(); + } +} + +/* + * This routine calculates the dynamic variable chunksize for a given DIF + * object. The calculation is not fool-proof, and can probably be tricked by + * malicious DIF -- but it works for all compiler-generated DIF. Because this + * calculation is likely imperfect, dtrace_dynvar() is able to gracefully fail + * if a dynamic variable size exceeds the chunksize. + */ +static void +dtrace_difo_chunksize(dtrace_difo_t *dp, dtrace_vstate_t *vstate) +{ + uint64_t sval = 0; + dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */ + const dif_instr_t *text = dp->dtdo_buf; + uint_t pc, srd = 0; + uint_t ttop = 0; + size_t size, ksize; + uint_t id, i; + + for (pc = 0; pc < dp->dtdo_len; pc++) { + dif_instr_t instr = text[pc]; + uint_t op = DIF_INSTR_OP(instr); + uint_t rd = DIF_INSTR_RD(instr); + uint_t r1 = DIF_INSTR_R1(instr); + uint_t nkeys = 0; + uchar_t scope = 0; + + dtrace_key_t *key = tupregs; + + switch (op) { + case DIF_OP_SETX: + sval = dp->dtdo_inttab[DIF_INSTR_INTEGER(instr)]; + srd = rd; + continue; + + case DIF_OP_STTS: + key = &tupregs[DIF_DTR_NREGS]; + key[0].dttk_size = 0; + key[1].dttk_size = 0; + nkeys = 2; + scope = DIFV_SCOPE_THREAD; + break; + + case DIF_OP_STGAA: + case DIF_OP_STTAA: + nkeys = ttop; + + if (DIF_INSTR_OP(instr) == DIF_OP_STTAA) + key[nkeys++].dttk_size = 0; + + key[nkeys++].dttk_size = 0; + + if (op == DIF_OP_STTAA) { + scope = DIFV_SCOPE_THREAD; + } else { + scope = DIFV_SCOPE_GLOBAL; + } + + break; + + case DIF_OP_PUSHTR: + if (ttop == DIF_DTR_NREGS) + return; + + if ((srd == 0 || sval == 0) && r1 == DIF_TYPE_STRING) { + /* + * If the register for the size of the "pushtr" + * is %r0 (or the value is 0) and the type is + * a string, we'll use the system-wide default + * string size. 
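+				 * (So a key string whose length cannot be
+				 * derived statically is budgeted at
+				 * dtrace_strsize_default bytes when the
+				 * chunk size is computed.)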
+ */ + tupregs[ttop++].dttk_size = + dtrace_strsize_default; + } else { + if (srd == 0) + return; + + if (sval > LONG_MAX) + return; + + tupregs[ttop++].dttk_size = sval; + } + + break; + + case DIF_OP_PUSHTV: + if (ttop == DIF_DTR_NREGS) + return; + + tupregs[ttop++].dttk_size = 0; + break; + + case DIF_OP_FLUSHTS: + ttop = 0; + break; + + case DIF_OP_POPTS: + if (ttop != 0) + ttop--; + break; + } + + sval = 0; + srd = 0; + + if (nkeys == 0) + continue; + + /* + * We have a dynamic variable allocation; calculate its size. + */ + for (ksize = 0, i = 0; i < nkeys; i++) + ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t)); + + size = sizeof (dtrace_dynvar_t); + size += sizeof (dtrace_key_t) * (nkeys - 1); + size += ksize; + + /* + * Now we need to determine the size of the stored data. + */ + id = DIF_INSTR_VAR(instr); + + for (i = 0; i < dp->dtdo_varlen; i++) { + dtrace_difv_t *v = &dp->dtdo_vartab[i]; + + if (v->dtdv_id == id && v->dtdv_scope == scope) { + size += v->dtdv_type.dtdt_size; + break; + } + } + + if (i == dp->dtdo_varlen) + return; + + /* + * We have the size. If this is larger than the chunk size + * for our dynamic variable state, reset the chunk size. + */ + size = P2ROUNDUP(size, sizeof (uint64_t)); + + /* + * Before setting the chunk size, check that we're not going + * to set it to a negative value... + */ + if (size > LONG_MAX) + return; + + /* + * ...and make certain that we didn't badly overflow. + */ + if (size < ksize || size < sizeof (dtrace_dynvar_t)) + return; + + if (size > vstate->dtvs_dynvars.dtds_chunksize) + vstate->dtvs_dynvars.dtds_chunksize = size; + } +} + +static void +dtrace_difo_init(dtrace_difo_t *dp, dtrace_vstate_t *vstate) +{ + int i, oldsvars, osz, nsz, otlocals, ntlocals; + uint_t id; + + ASSERT(MUTEX_HELD(&dtrace_lock)); + ASSERT(dp->dtdo_buf != NULL && dp->dtdo_len != 0); + + for (i = 0; i < dp->dtdo_varlen; i++) { + dtrace_difv_t *v = &dp->dtdo_vartab[i]; + dtrace_statvar_t *svar, ***svarp = NULL; + size_t dsize = 0; + uint8_t scope = v->dtdv_scope; + int *np = NULL; + + if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE) + continue; + + id -= DIF_VAR_OTHER_UBASE; + + switch (scope) { + case DIFV_SCOPE_THREAD: + while (id >= (otlocals = vstate->dtvs_ntlocals)) { + dtrace_difv_t *tlocals; + + if ((ntlocals = (otlocals << 1)) == 0) + ntlocals = 1; + + osz = otlocals * sizeof (dtrace_difv_t); + nsz = ntlocals * sizeof (dtrace_difv_t); + + tlocals = kmem_zalloc(nsz, KM_SLEEP); + + if (osz != 0) { + bcopy(vstate->dtvs_tlocals, + tlocals, osz); + kmem_free(vstate->dtvs_tlocals, osz); + } + + vstate->dtvs_tlocals = tlocals; + vstate->dtvs_ntlocals = ntlocals; + } + + vstate->dtvs_tlocals[id] = *v; + continue; + + case DIFV_SCOPE_LOCAL: + np = &vstate->dtvs_nlocals; + svarp = &vstate->dtvs_locals; + + if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) + dsize = NCPU * (v->dtdv_type.dtdt_size + + sizeof (uint64_t)); + else + dsize = NCPU * sizeof (uint64_t); + + break; + + case DIFV_SCOPE_GLOBAL: + np = &vstate->dtvs_nglobals; + svarp = &vstate->dtvs_globals; + + if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) + dsize = v->dtdv_type.dtdt_size + + sizeof (uint64_t); + + break; + + default: + ASSERT(0); + } + + while (id >= (oldsvars = *np)) { + dtrace_statvar_t **statics; + int newsvars, oldsize, newsize; + + if ((newsvars = (oldsvars << 1)) == 0) + newsvars = 1; + + oldsize = oldsvars * sizeof (dtrace_statvar_t *); + newsize = newsvars * sizeof (dtrace_statvar_t *); + + statics = kmem_zalloc(newsize, KM_SLEEP); + + if (oldsize != 0) { + bcopy(*svarp, statics, 
oldsize); + kmem_free(*svarp, oldsize); + } + + *svarp = statics; + *np = newsvars; + } + + if ((svar = (*svarp)[id]) == NULL) { + svar = kmem_zalloc(sizeof (dtrace_statvar_t), KM_SLEEP); + svar->dtsv_var = *v; + + if ((svar->dtsv_size = dsize) != 0) { + svar->dtsv_data = (uint64_t)(uintptr_t) + kmem_zalloc(dsize, KM_SLEEP); + } + + (*svarp)[id] = svar; + } + + svar->dtsv_refcnt++; + } + + dtrace_difo_chunksize(dp, vstate); + dtrace_difo_hold(dp); +} + +static dtrace_difo_t * +dtrace_difo_duplicate(dtrace_difo_t *dp, dtrace_vstate_t *vstate) +{ + dtrace_difo_t *new; + size_t sz; + + ASSERT(dp->dtdo_buf != NULL); + ASSERT(dp->dtdo_refcnt != 0); + + new = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP); + + ASSERT(dp->dtdo_buf != NULL); + sz = dp->dtdo_len * sizeof (dif_instr_t); + new->dtdo_buf = kmem_alloc(sz, KM_SLEEP); + bcopy(dp->dtdo_buf, new->dtdo_buf, sz); + new->dtdo_len = dp->dtdo_len; + + if (dp->dtdo_strtab != NULL) { + ASSERT(dp->dtdo_strlen != 0); + new->dtdo_strtab = kmem_alloc(dp->dtdo_strlen, KM_SLEEP); + bcopy(dp->dtdo_strtab, new->dtdo_strtab, dp->dtdo_strlen); + new->dtdo_strlen = dp->dtdo_strlen; + } + + if (dp->dtdo_inttab != NULL) { + ASSERT(dp->dtdo_intlen != 0); + sz = dp->dtdo_intlen * sizeof (uint64_t); + new->dtdo_inttab = kmem_alloc(sz, KM_SLEEP); + bcopy(dp->dtdo_inttab, new->dtdo_inttab, sz); + new->dtdo_intlen = dp->dtdo_intlen; + } + + if (dp->dtdo_vartab != NULL) { + ASSERT(dp->dtdo_varlen != 0); + sz = dp->dtdo_varlen * sizeof (dtrace_difv_t); + new->dtdo_vartab = kmem_alloc(sz, KM_SLEEP); + bcopy(dp->dtdo_vartab, new->dtdo_vartab, sz); + new->dtdo_varlen = dp->dtdo_varlen; + } + + dtrace_difo_init(new, vstate); + return (new); +} + +static void +dtrace_difo_destroy(dtrace_difo_t *dp, dtrace_vstate_t *vstate) +{ + int i; + + ASSERT(dp->dtdo_refcnt == 0); + + for (i = 0; i < dp->dtdo_varlen; i++) { + dtrace_difv_t *v = &dp->dtdo_vartab[i]; + dtrace_statvar_t *svar, **svarp = NULL; + uint_t id; + uint8_t scope = v->dtdv_scope; + int *np = NULL; + + switch (scope) { + case DIFV_SCOPE_THREAD: + continue; + + case DIFV_SCOPE_LOCAL: + np = &vstate->dtvs_nlocals; + svarp = vstate->dtvs_locals; + break; + + case DIFV_SCOPE_GLOBAL: + np = &vstate->dtvs_nglobals; + svarp = vstate->dtvs_globals; + break; + + default: + ASSERT(0); + } + + if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE) + continue; + + id -= DIF_VAR_OTHER_UBASE; + ASSERT(id < *np); + + svar = svarp[id]; + ASSERT(svar != NULL); + ASSERT(svar->dtsv_refcnt > 0); + + if (--svar->dtsv_refcnt > 0) + continue; + + if (svar->dtsv_size != 0) { + ASSERT(svar->dtsv_data != 0); + kmem_free((void *)(uintptr_t)svar->dtsv_data, + svar->dtsv_size); + } + + kmem_free(svar, sizeof (dtrace_statvar_t)); + svarp[id] = NULL; + } + + if (dp->dtdo_buf != NULL) + kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t)); + if (dp->dtdo_inttab != NULL) + kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t)); + if (dp->dtdo_strtab != NULL) + kmem_free(dp->dtdo_strtab, dp->dtdo_strlen); + if (dp->dtdo_vartab != NULL) + kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t)); + + kmem_free(dp, sizeof (dtrace_difo_t)); +} + +static void +dtrace_difo_release(dtrace_difo_t *dp, dtrace_vstate_t *vstate) +{ + int i; + + ASSERT(MUTEX_HELD(&dtrace_lock)); + ASSERT(dp->dtdo_refcnt != 0); + + for (i = 0; i < dp->dtdo_varlen; i++) { + dtrace_difv_t *v = &dp->dtdo_vartab[i]; + + if (v->dtdv_id != DIF_VAR_VTIMESTAMP) + continue; + + ASSERT(dtrace_vtime_references > 0); + if (--dtrace_vtime_references == 0) + 
dtrace_vtime_disable(); + } + + if (--dp->dtdo_refcnt == 0) + dtrace_difo_destroy(dp, vstate); +} + +/* + * DTrace Format Functions + */ +static uint16_t +dtrace_format_add(dtrace_state_t *state, char *str) +{ + char *fmt, **new; + uint16_t ndx, len = strlen(str) + 1; + + fmt = kmem_zalloc(len, KM_SLEEP); + bcopy(str, fmt, len); + + for (ndx = 0; ndx < state->dts_nformats; ndx++) { + if (state->dts_formats[ndx] == NULL) { + state->dts_formats[ndx] = fmt; + return (ndx + 1); + } + } + + if (state->dts_nformats == USHRT_MAX) { + /* + * This is only likely if a denial-of-service attack is being + * attempted. As such, it's okay to fail silently here. + */ + kmem_free(fmt, len); + return (0); + } + + /* + * For simplicity, we always resize the formats array to be exactly the + * number of formats. + */ + ndx = state->dts_nformats++; + new = kmem_alloc((ndx + 1) * sizeof (char *), KM_SLEEP); + + if (state->dts_formats != NULL) { + ASSERT(ndx != 0); + bcopy(state->dts_formats, new, ndx * sizeof (char *)); + kmem_free(state->dts_formats, ndx * sizeof (char *)); + } + + state->dts_formats = new; + state->dts_formats[ndx] = fmt; + + return (ndx + 1); +} + +static void +dtrace_format_remove(dtrace_state_t *state, uint16_t format) +{ + char *fmt; + + ASSERT(state->dts_formats != NULL); + ASSERT(format <= state->dts_nformats); + ASSERT(state->dts_formats[format - 1] != NULL); + + fmt = state->dts_formats[format - 1]; + kmem_free(fmt, strlen(fmt) + 1); + state->dts_formats[format - 1] = NULL; +} + +static void +dtrace_format_destroy(dtrace_state_t *state) +{ + int i; + + if (state->dts_nformats == 0) { + ASSERT(state->dts_formats == NULL); + return; + } + + ASSERT(state->dts_formats != NULL); + + for (i = 0; i < state->dts_nformats; i++) { + char *fmt = state->dts_formats[i]; + + if (fmt == NULL) + continue; + + kmem_free(fmt, strlen(fmt) + 1); + } + + kmem_free(state->dts_formats, state->dts_nformats * sizeof (char *)); + state->dts_nformats = 0; + state->dts_formats = NULL; +} + +/* + * DTrace Predicate Functions + */ +static dtrace_predicate_t * +dtrace_predicate_create(dtrace_difo_t *dp) +{ + dtrace_predicate_t *pred; + + ASSERT(MUTEX_HELD(&dtrace_lock)); + ASSERT(dp->dtdo_refcnt != 0); + + pred = kmem_zalloc(sizeof (dtrace_predicate_t), KM_SLEEP); + pred->dtp_difo = dp; + pred->dtp_refcnt = 1; + + if (!dtrace_difo_cacheable(dp)) + return (pred); + + if (dtrace_predcache_id == DTRACE_CACHEIDNONE) { + /* + * This is only theoretically possible -- we have had 2^32 + * cacheable predicates on this machine. We cannot allow any + * more predicates to become cacheable: as unlikely as it is, + * there may be a thread caching a (now stale) predicate cache + * ID. 
(N.B.: the temptation is being successfully resisted to + * have this cmn_err() "Holy shit -- we executed this code!") + */ + return (pred); + } + + pred->dtp_cacheid = dtrace_predcache_id++; + + return (pred); +} + +static void +dtrace_predicate_hold(dtrace_predicate_t *pred) +{ + ASSERT(MUTEX_HELD(&dtrace_lock)); + ASSERT(pred->dtp_difo != NULL && pred->dtp_difo->dtdo_refcnt != 0); + ASSERT(pred->dtp_refcnt > 0); + + pred->dtp_refcnt++; +} + +static void +dtrace_predicate_release(dtrace_predicate_t *pred, dtrace_vstate_t *vstate) +{ + dtrace_difo_t *dp = pred->dtp_difo; + + ASSERT(MUTEX_HELD(&dtrace_lock)); + ASSERT(dp != NULL && dp->dtdo_refcnt != 0); + ASSERT(pred->dtp_refcnt > 0); + + if (--pred->dtp_refcnt == 0) { + dtrace_difo_release(pred->dtp_difo, vstate); + kmem_free(pred, sizeof (dtrace_predicate_t)); + } +} + +/* + * DTrace Action Description Functions + */ +static dtrace_actdesc_t * +dtrace_actdesc_create(dtrace_actkind_t kind, uint32_t ntuple, + uint64_t uarg, uint64_t arg) +{ + dtrace_actdesc_t *act; + +#ifdef illumos + ASSERT(!DTRACEACT_ISPRINTFLIKE(kind) || (arg != NULL && + arg >= KERNELBASE) || (arg == NULL && kind == DTRACEACT_PRINTA)); +#endif + + act = kmem_zalloc(sizeof (dtrace_actdesc_t), KM_SLEEP); + act->dtad_kind = kind; + act->dtad_ntuple = ntuple; + act->dtad_uarg = uarg; + act->dtad_arg = arg; + act->dtad_refcnt = 1; + + return (act); +} + +static void +dtrace_actdesc_hold(dtrace_actdesc_t *act) +{ + ASSERT(act->dtad_refcnt >= 1); + act->dtad_refcnt++; +} + +static void +dtrace_actdesc_release(dtrace_actdesc_t *act, dtrace_vstate_t *vstate) +{ + dtrace_actkind_t kind = act->dtad_kind; + dtrace_difo_t *dp; + + ASSERT(act->dtad_refcnt >= 1); + + if (--act->dtad_refcnt != 0) + return; + + if ((dp = act->dtad_difo) != NULL) + dtrace_difo_release(dp, vstate); + + if (DTRACEACT_ISPRINTFLIKE(kind)) { + char *str = (char *)(uintptr_t)act->dtad_arg; + +#ifdef illumos + ASSERT((str != NULL && (uintptr_t)str >= KERNELBASE) || + (str == NULL && act->dtad_kind == DTRACEACT_PRINTA)); +#endif + + if (str != NULL) + kmem_free(str, strlen(str) + 1); + } + + kmem_free(act, sizeof (dtrace_actdesc_t)); +} + +/* + * DTrace ECB Functions + */ +static dtrace_ecb_t * +dtrace_ecb_add(dtrace_state_t *state, dtrace_probe_t *probe) +{ + dtrace_ecb_t *ecb; + dtrace_epid_t epid; + + ASSERT(MUTEX_HELD(&dtrace_lock)); + + ecb = kmem_zalloc(sizeof (dtrace_ecb_t), KM_SLEEP); + ecb->dte_predicate = NULL; + ecb->dte_probe = probe; + + /* + * The default size is the size of the default action: recording + * the header. + */ + ecb->dte_size = ecb->dte_needed = sizeof (dtrace_rechdr_t); + ecb->dte_alignment = sizeof (dtrace_epid_t); + + epid = state->dts_epid++; + + if (epid - 1 >= state->dts_necbs) { + dtrace_ecb_t **oecbs = state->dts_ecbs, **ecbs; + int necbs = state->dts_necbs << 1; + + ASSERT(epid == state->dts_necbs + 1); + + if (necbs == 0) { + ASSERT(oecbs == NULL); + necbs = 1; + } + + ecbs = kmem_zalloc(necbs * sizeof (*ecbs), KM_SLEEP); + + if (oecbs != NULL) + bcopy(oecbs, ecbs, state->dts_necbs * sizeof (*ecbs)); + + dtrace_membar_producer(); + state->dts_ecbs = ecbs; + + if (oecbs != NULL) { + /* + * If this state is active, we must dtrace_sync() + * before we can free the old dts_ecbs array: we're + * coming in hot, and there may be active ring + * buffer processing (which indexes into the dts_ecbs + * array) on another CPU. 
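+ * (dtrace_sync() does not return until every CPU
+ * has been observed outside of probe context, so
+ * once it returns no CPU can still hold a pointer
+ * into the old array.)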
+ */ + if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) + dtrace_sync(); + + kmem_free(oecbs, state->dts_necbs * sizeof (*ecbs)); + } + + dtrace_membar_producer(); + state->dts_necbs = necbs; + } + + ecb->dte_state = state; + + ASSERT(state->dts_ecbs[epid - 1] == NULL); + dtrace_membar_producer(); + state->dts_ecbs[(ecb->dte_epid = epid) - 1] = ecb; + + return (ecb); +} + +static void +dtrace_ecb_enable(dtrace_ecb_t *ecb) +{ + dtrace_probe_t *probe = ecb->dte_probe; + + ASSERT(MUTEX_HELD(&cpu_lock)); + ASSERT(MUTEX_HELD(&dtrace_lock)); + ASSERT(ecb->dte_next == NULL); + + if (probe == NULL) { + /* + * This is the NULL probe -- there's nothing to do. + */ + return; + } + + if (probe->dtpr_ecb == NULL) { + dtrace_provider_t *prov = probe->dtpr_provider; + + /* + * We're the first ECB on this probe. + */ + probe->dtpr_ecb = probe->dtpr_ecb_last = ecb; + + if (ecb->dte_predicate != NULL) + probe->dtpr_predcache = ecb->dte_predicate->dtp_cacheid; + + prov->dtpv_pops.dtps_enable(prov->dtpv_arg, + probe->dtpr_id, probe->dtpr_arg); + } else { + /* + * This probe is already active. Swing the last pointer to + * point to the new ECB, and issue a dtrace_sync() to assure + * that all CPUs have seen the change. + */ + ASSERT(probe->dtpr_ecb_last != NULL); + probe->dtpr_ecb_last->dte_next = ecb; + probe->dtpr_ecb_last = ecb; + probe->dtpr_predcache = 0; + + dtrace_sync(); + } +} + +static int +dtrace_ecb_resize(dtrace_ecb_t *ecb) +{ + dtrace_action_t *act; + uint32_t curneeded = UINT32_MAX; + uint32_t aggbase = UINT32_MAX; + + /* + * If we record anything, we always record the dtrace_rechdr_t. (And + * we always record it first.) + */ + ecb->dte_size = sizeof (dtrace_rechdr_t); + ecb->dte_alignment = sizeof (dtrace_epid_t); + + for (act = ecb->dte_action; act != NULL; act = act->dta_next) { + dtrace_recdesc_t *rec = &act->dta_rec; + ASSERT(rec->dtrd_size > 0 || rec->dtrd_alignment == 1); + + ecb->dte_alignment = MAX(ecb->dte_alignment, + rec->dtrd_alignment); + + if (DTRACEACT_ISAGG(act->dta_kind)) { + dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act; + + ASSERT(rec->dtrd_size != 0); + ASSERT(agg->dtag_first != NULL); + ASSERT(act->dta_prev->dta_intuple); + ASSERT(aggbase != UINT32_MAX); + ASSERT(curneeded != UINT32_MAX); + + agg->dtag_base = aggbase; + + curneeded = P2ROUNDUP(curneeded, rec->dtrd_alignment); + rec->dtrd_offset = curneeded; + if (curneeded + rec->dtrd_size < curneeded) + return (EINVAL); + curneeded += rec->dtrd_size; + ecb->dte_needed = MAX(ecb->dte_needed, curneeded); + + aggbase = UINT32_MAX; + curneeded = UINT32_MAX; + } else if (act->dta_intuple) { + if (curneeded == UINT32_MAX) { + /* + * This is the first record in a tuple. Align + * curneeded to be at offset 4 in an 8-byte + * aligned block. 
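+ * (P2PHASEUP() below rounds dte_size up to the
+ * next value congruent to sizeof (dtrace_aggid_t)
+ * modulo sizeof (uint64_t) -- e.g. 16 becomes
+ * 20 -- which leaves aggbase itself 8-byte
+ * aligned, as asserted below.)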
+ */ + ASSERT(act->dta_prev == NULL || + !act->dta_prev->dta_intuple); + ASSERT3U(aggbase, ==, UINT32_MAX); + curneeded = P2PHASEUP(ecb->dte_size, + sizeof (uint64_t), sizeof (dtrace_aggid_t)); + + aggbase = curneeded - sizeof (dtrace_aggid_t); + ASSERT(IS_P2ALIGNED(aggbase, + sizeof (uint64_t))); + } + curneeded = P2ROUNDUP(curneeded, rec->dtrd_alignment); + rec->dtrd_offset = curneeded; + if (curneeded + rec->dtrd_size < curneeded) + return (EINVAL); + curneeded += rec->dtrd_size; + } else { + /* tuples must be followed by an aggregation */ + ASSERT(act->dta_prev == NULL || + !act->dta_prev->dta_intuple); + + ecb->dte_size = P2ROUNDUP(ecb->dte_size, + rec->dtrd_alignment); + rec->dtrd_offset = ecb->dte_size; + if (ecb->dte_size + rec->dtrd_size < ecb->dte_size) + return (EINVAL); + ecb->dte_size += rec->dtrd_size; + ecb->dte_needed = MAX(ecb->dte_needed, ecb->dte_size); + } + } + + if ((act = ecb->dte_action) != NULL && + !(act->dta_kind == DTRACEACT_SPECULATE && act->dta_next == NULL) && + ecb->dte_size == sizeof (dtrace_rechdr_t)) { + /* + * If the size is still sizeof (dtrace_rechdr_t), then all + * actions store no data; set the size to 0. + */ + ecb->dte_size = 0; + } + + ecb->dte_size = P2ROUNDUP(ecb->dte_size, sizeof (dtrace_epid_t)); + ecb->dte_needed = P2ROUNDUP(ecb->dte_needed, (sizeof (dtrace_epid_t))); + ecb->dte_state->dts_needed = MAX(ecb->dte_state->dts_needed, + ecb->dte_needed); + return (0); +} + +static dtrace_action_t * +dtrace_ecb_aggregation_create(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc) +{ + dtrace_aggregation_t *agg; + size_t size = sizeof (uint64_t); + int ntuple = desc->dtad_ntuple; + dtrace_action_t *act; + dtrace_recdesc_t *frec; + dtrace_aggid_t aggid; + dtrace_state_t *state = ecb->dte_state; + + agg = kmem_zalloc(sizeof (dtrace_aggregation_t), KM_SLEEP); + agg->dtag_ecb = ecb; + + ASSERT(DTRACEACT_ISAGG(desc->dtad_kind)); + + switch (desc->dtad_kind) { + case DTRACEAGG_MIN: + agg->dtag_initial = INT64_MAX; + agg->dtag_aggregate = dtrace_aggregate_min; + break; + + case DTRACEAGG_MAX: + agg->dtag_initial = INT64_MIN; + agg->dtag_aggregate = dtrace_aggregate_max; + break; + + case DTRACEAGG_COUNT: + agg->dtag_aggregate = dtrace_aggregate_count; + break; + + case DTRACEAGG_QUANTIZE: + agg->dtag_aggregate = dtrace_aggregate_quantize; + size = (((sizeof (uint64_t) * NBBY) - 1) * 2 + 1) * + sizeof (uint64_t); + break; + + case DTRACEAGG_LQUANTIZE: { + uint16_t step = DTRACE_LQUANTIZE_STEP(desc->dtad_arg); + uint16_t levels = DTRACE_LQUANTIZE_LEVELS(desc->dtad_arg); + + agg->dtag_initial = desc->dtad_arg; + agg->dtag_aggregate = dtrace_aggregate_lquantize; + + if (step == 0 || levels == 0) + goto err; + + size = levels * sizeof (uint64_t) + 3 * sizeof (uint64_t); + break; + } + + case DTRACEAGG_LLQUANTIZE: { + uint16_t factor = DTRACE_LLQUANTIZE_FACTOR(desc->dtad_arg); + uint16_t low = DTRACE_LLQUANTIZE_LOW(desc->dtad_arg); + uint16_t high = DTRACE_LLQUANTIZE_HIGH(desc->dtad_arg); + uint16_t nsteps = DTRACE_LLQUANTIZE_NSTEP(desc->dtad_arg); + int64_t v; + + agg->dtag_initial = desc->dtad_arg; + agg->dtag_aggregate = dtrace_aggregate_llquantize; + + if (factor < 2 || low >= high || nsteps < factor) + goto err; + + /* + * Now check that the number of steps evenly divides a power + * of the factor. (This assures both integer bucket size and + * linearity within each magnitude.) 
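+ * (For example, a factor of 10 with 20 steps
+ * passes -- 20 divides 100 and 10 divides 20 --
+ * while a factor of 10 with 30 steps is rejected,
+ * as 30 divides no power of 10.)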
+ */ + for (v = factor; v < nsteps; v *= factor) + continue; + + if ((v % nsteps) || (nsteps % factor)) + goto err; + + size = (dtrace_aggregate_llquantize_bucket(factor, + low, high, nsteps, INT64_MAX) + 2) * sizeof (uint64_t); + break; + } + + case DTRACEAGG_AVG: + agg->dtag_aggregate = dtrace_aggregate_avg; + size = sizeof (uint64_t) * 2; + break; + + case DTRACEAGG_STDDEV: + agg->dtag_aggregate = dtrace_aggregate_stddev; + size = sizeof (uint64_t) * 4; + break; + + case DTRACEAGG_SUM: + agg->dtag_aggregate = dtrace_aggregate_sum; + break; + + default: + goto err; + } + + agg->dtag_action.dta_rec.dtrd_size = size; + + if (ntuple == 0) + goto err; + + /* + * We must make sure that we have enough actions for the n-tuple. + */ + for (act = ecb->dte_action_last; act != NULL; act = act->dta_prev) { + if (DTRACEACT_ISAGG(act->dta_kind)) + break; + + if (--ntuple == 0) { + /* + * This is the action with which our n-tuple begins. + */ + agg->dtag_first = act; + goto success; + } + } + + /* + * This n-tuple is short by ntuple elements. Return failure. + */ + ASSERT(ntuple != 0); +err: + kmem_free(agg, sizeof (dtrace_aggregation_t)); + return (NULL); + +success: + /* + * If the last action in the tuple has a size of zero, it's actually + * an expression argument for the aggregating action. + */ + ASSERT(ecb->dte_action_last != NULL); + act = ecb->dte_action_last; + + if (act->dta_kind == DTRACEACT_DIFEXPR) { + ASSERT(act->dta_difo != NULL); + + if (act->dta_difo->dtdo_rtype.dtdt_size == 0) + agg->dtag_hasarg = 1; + } + + /* + * We need to allocate an id for this aggregation. + */ +#ifdef illumos + aggid = (dtrace_aggid_t)(uintptr_t)vmem_alloc(state->dts_aggid_arena, 1, + VM_BESTFIT | VM_SLEEP); +#else + aggid = alloc_unr(state->dts_aggid_arena); +#endif + + if (aggid - 1 >= state->dts_naggregations) { + dtrace_aggregation_t **oaggs = state->dts_aggregations; + dtrace_aggregation_t **aggs; + int naggs = state->dts_naggregations << 1; + int onaggs = state->dts_naggregations; + + ASSERT(aggid == state->dts_naggregations + 1); + + if (naggs == 0) { + ASSERT(oaggs == NULL); + naggs = 1; + } + + aggs = kmem_zalloc(naggs * sizeof (*aggs), KM_SLEEP); + + if (oaggs != NULL) { + bcopy(oaggs, aggs, onaggs * sizeof (*aggs)); + kmem_free(oaggs, onaggs * sizeof (*aggs)); + } + + state->dts_aggregations = aggs; + state->dts_naggregations = naggs; + } + + ASSERT(state->dts_aggregations[aggid - 1] == NULL); + state->dts_aggregations[(agg->dtag_id = aggid) - 1] = agg; + + frec = &agg->dtag_first->dta_rec; + if (frec->dtrd_alignment < sizeof (dtrace_aggid_t)) + frec->dtrd_alignment = sizeof (dtrace_aggid_t); + + for (act = agg->dtag_first; act != NULL; act = act->dta_next) { + ASSERT(!act->dta_intuple); + act->dta_intuple = 1; + } + + return (&agg->dtag_action); +} + +static void +dtrace_ecb_aggregation_destroy(dtrace_ecb_t *ecb, dtrace_action_t *act) +{ + dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act; + dtrace_state_t *state = ecb->dte_state; + dtrace_aggid_t aggid = agg->dtag_id; + + ASSERT(DTRACEACT_ISAGG(act->dta_kind)); +#ifdef illumos + vmem_free(state->dts_aggid_arena, (void *)(uintptr_t)aggid, 1); +#else + free_unr(state->dts_aggid_arena, aggid); +#endif + + ASSERT(state->dts_aggregations[aggid - 1] == agg); + state->dts_aggregations[aggid - 1] = NULL; + + kmem_free(agg, sizeof (dtrace_aggregation_t)); +} + +static int +dtrace_ecb_action_add(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc) +{ + dtrace_action_t *action, *last; + dtrace_difo_t *dp = desc->dtad_difo; + uint32_t size = 0, align = sizeof 
(uint8_t), mask; + uint16_t format = 0; + dtrace_recdesc_t *rec; + dtrace_state_t *state = ecb->dte_state; + dtrace_optval_t *opt = state->dts_options, nframes = 0, strsize; + uint64_t arg = desc->dtad_arg; + + ASSERT(MUTEX_HELD(&dtrace_lock)); + ASSERT(ecb->dte_action == NULL || ecb->dte_action->dta_refcnt == 1); + + if (DTRACEACT_ISAGG(desc->dtad_kind)) { + /* + * If this is an aggregating action, there must be neither + * a speculate nor a commit on the action chain. + */ + dtrace_action_t *act; + + for (act = ecb->dte_action; act != NULL; act = act->dta_next) { + if (act->dta_kind == DTRACEACT_COMMIT) + return (EINVAL); + + if (act->dta_kind == DTRACEACT_SPECULATE) + return (EINVAL); + } + + action = dtrace_ecb_aggregation_create(ecb, desc); + + if (action == NULL) + return (EINVAL); + } else { + if (DTRACEACT_ISDESTRUCTIVE(desc->dtad_kind) || + (desc->dtad_kind == DTRACEACT_DIFEXPR && + dp != NULL && dp->dtdo_destructive)) { + state->dts_destructive = 1; + } + + switch (desc->dtad_kind) { + case DTRACEACT_PRINTF: + case DTRACEACT_PRINTA: + case DTRACEACT_SYSTEM: + case DTRACEACT_FREOPEN: + case DTRACEACT_DIFEXPR: + /* + * We know that our arg is a string -- turn it into a + * format. + */ + if (arg == 0) { + ASSERT(desc->dtad_kind == DTRACEACT_PRINTA || + desc->dtad_kind == DTRACEACT_DIFEXPR); + format = 0; + } else { + ASSERT(arg != 0); +#ifdef illumos + ASSERT(arg > KERNELBASE); +#endif + format = dtrace_format_add(state, + (char *)(uintptr_t)arg); + } + + /*FALLTHROUGH*/ + case DTRACEACT_LIBACT: + case DTRACEACT_TRACEMEM: + case DTRACEACT_TRACEMEM_DYNSIZE: + if (dp == NULL) + return (EINVAL); + + if ((size = dp->dtdo_rtype.dtdt_size) != 0) + break; + + if (dp->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING) { + if (!(dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF)) + return (EINVAL); + + size = opt[DTRACEOPT_STRSIZE]; + } + + break; + + case DTRACEACT_STACK: + if ((nframes = arg) == 0) { + nframes = opt[DTRACEOPT_STACKFRAMES]; + ASSERT(nframes > 0); + arg = nframes; + } + + size = nframes * sizeof (pc_t); + break; + + case DTRACEACT_JSTACK: + if ((strsize = DTRACE_USTACK_STRSIZE(arg)) == 0) + strsize = opt[DTRACEOPT_JSTACKSTRSIZE]; + + if ((nframes = DTRACE_USTACK_NFRAMES(arg)) == 0) + nframes = opt[DTRACEOPT_JSTACKFRAMES]; + + arg = DTRACE_USTACK_ARG(nframes, strsize); + + /*FALLTHROUGH*/ + case DTRACEACT_USTACK: + if (desc->dtad_kind != DTRACEACT_JSTACK && + (nframes = DTRACE_USTACK_NFRAMES(arg)) == 0) { + strsize = DTRACE_USTACK_STRSIZE(arg); + nframes = opt[DTRACEOPT_USTACKFRAMES]; + ASSERT(nframes > 0); + arg = DTRACE_USTACK_ARG(nframes, strsize); + } + + /* + * Save a slot for the pid. + */ + size = (nframes + 1) * sizeof (uint64_t); + size += DTRACE_USTACK_STRSIZE(arg); + size = P2ROUNDUP(size, (uint32_t)(sizeof (uintptr_t))); + + break; + + case DTRACEACT_SYM: + case DTRACEACT_MOD: + if (dp == NULL || ((size = dp->dtdo_rtype.dtdt_size) != + sizeof (uint64_t)) || + (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF)) + return (EINVAL); + break; + + case DTRACEACT_USYM: + case DTRACEACT_UMOD: + case DTRACEACT_UADDR: + if (dp == NULL || + (dp->dtdo_rtype.dtdt_size != sizeof (uint64_t)) || + (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF)) + return (EINVAL); + + /* + * We have a slot for the pid, plus a slot for the + * argument. To keep things simple (aligned with + * bitness-neutral sizing), we store each as a 64-bit + * quantity. 
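+ * (When the probe fires, the first quantity holds
+ * the pid and the second holds the traced value.)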
+ */ + size = 2 * sizeof (uint64_t); + break; + + case DTRACEACT_STOP: + case DTRACEACT_BREAKPOINT: + case DTRACEACT_PANIC: + break; + + case DTRACEACT_CHILL: + case DTRACEACT_DISCARD: + case DTRACEACT_RAISE: + if (dp == NULL) + return (EINVAL); + break; + + case DTRACEACT_EXIT: + if (dp == NULL || + (size = dp->dtdo_rtype.dtdt_size) != sizeof (int) || + (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF)) + return (EINVAL); + break; + + case DTRACEACT_SPECULATE: + if (ecb->dte_size > sizeof (dtrace_rechdr_t)) + return (EINVAL); + + if (dp == NULL) + return (EINVAL); + + state->dts_speculates = 1; + break; + + case DTRACEACT_PRINTM: + size = dp->dtdo_rtype.dtdt_size; + break; + + case DTRACEACT_COMMIT: { + dtrace_action_t *act = ecb->dte_action; + + for (; act != NULL; act = act->dta_next) { + if (act->dta_kind == DTRACEACT_COMMIT) + return (EINVAL); + } + + if (dp == NULL) + return (EINVAL); + break; + } + + default: + return (EINVAL); + } + + if (size != 0 || desc->dtad_kind == DTRACEACT_SPECULATE) { + /* + * If this is a data-storing action or a speculate, + * we must be sure that there isn't a commit on the + * action chain. + */ + dtrace_action_t *act = ecb->dte_action; + + for (; act != NULL; act = act->dta_next) { + if (act->dta_kind == DTRACEACT_COMMIT) + return (EINVAL); + } + } + + action = kmem_zalloc(sizeof (dtrace_action_t), KM_SLEEP); + action->dta_rec.dtrd_size = size; + } + + action->dta_refcnt = 1; + rec = &action->dta_rec; + size = rec->dtrd_size; + + for (mask = sizeof (uint64_t) - 1; size != 0 && mask > 0; mask >>= 1) { + if (!(size & mask)) { + align = mask + 1; + break; + } + } + + action->dta_kind = desc->dtad_kind; + + if ((action->dta_difo = dp) != NULL) + dtrace_difo_hold(dp); + + rec->dtrd_action = action->dta_kind; + rec->dtrd_arg = arg; + rec->dtrd_uarg = desc->dtad_uarg; + rec->dtrd_alignment = (uint16_t)align; + rec->dtrd_format = format; + + if ((last = ecb->dte_action_last) != NULL) { + ASSERT(ecb->dte_action != NULL); + action->dta_prev = last; + last->dta_next = action; + } else { + ASSERT(ecb->dte_action == NULL); + ecb->dte_action = action; + } + + ecb->dte_action_last = action; + + return (0); +} + +static void +dtrace_ecb_action_remove(dtrace_ecb_t *ecb) +{ + dtrace_action_t *act = ecb->dte_action, *next; + dtrace_vstate_t *vstate = &ecb->dte_state->dts_vstate; + dtrace_difo_t *dp; + uint16_t format; + + if (act != NULL && act->dta_refcnt > 1) { + ASSERT(act->dta_next == NULL || act->dta_next->dta_refcnt == 1); + act->dta_refcnt--; + } else { + for (; act != NULL; act = next) { + next = act->dta_next; + ASSERT(next != NULL || act == ecb->dte_action_last); + ASSERT(act->dta_refcnt == 1); + + if ((format = act->dta_rec.dtrd_format) != 0) + dtrace_format_remove(ecb->dte_state, format); + + if ((dp = act->dta_difo) != NULL) + dtrace_difo_release(dp, vstate); + + if (DTRACEACT_ISAGG(act->dta_kind)) { + dtrace_ecb_aggregation_destroy(ecb, act); + } else { + kmem_free(act, sizeof (dtrace_action_t)); + } + } + } + + ecb->dte_action = NULL; + ecb->dte_action_last = NULL; + ecb->dte_size = 0; +} + +static void +dtrace_ecb_disable(dtrace_ecb_t *ecb) +{ + /* + * We disable the ECB by removing it from its probe. + */ + dtrace_ecb_t *pecb, *prev = NULL; + dtrace_probe_t *probe = ecb->dte_probe; + + ASSERT(MUTEX_HELD(&dtrace_lock)); + + if (probe == NULL) { + /* + * This is the NULL probe; there is nothing to disable. 
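+ * (ECBs with a NULL probe are created when an
+ * enabling is primed; see dtrace_enabling_prime(),
+ * below.)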
+ */ + return; + } + + for (pecb = probe->dtpr_ecb; pecb != NULL; pecb = pecb->dte_next) { + if (pecb == ecb) + break; + prev = pecb; + } + + ASSERT(pecb != NULL); + + if (prev == NULL) { + probe->dtpr_ecb = ecb->dte_next; + } else { + prev->dte_next = ecb->dte_next; + } + + if (ecb == probe->dtpr_ecb_last) { + ASSERT(ecb->dte_next == NULL); + probe->dtpr_ecb_last = prev; + } + + /* + * The ECB has been disconnected from the probe; now sync to assure + * that all CPUs have seen the change before returning. + */ + dtrace_sync(); + + if (probe->dtpr_ecb == NULL) { + /* + * That was the last ECB on the probe; clear the predicate + * cache ID for the probe, disable it and sync one more time + * to assure that we'll never hit it again. + */ + dtrace_provider_t *prov = probe->dtpr_provider; + + ASSERT(ecb->dte_next == NULL); + ASSERT(probe->dtpr_ecb_last == NULL); + probe->dtpr_predcache = DTRACE_CACHEIDNONE; + prov->dtpv_pops.dtps_disable(prov->dtpv_arg, + probe->dtpr_id, probe->dtpr_arg); + dtrace_sync(); + } else { + /* + * There is at least one ECB remaining on the probe. If there + * is _exactly_ one, set the probe's predicate cache ID to be + * the predicate cache ID of the remaining ECB. + */ + ASSERT(probe->dtpr_ecb_last != NULL); + ASSERT(probe->dtpr_predcache == DTRACE_CACHEIDNONE); + + if (probe->dtpr_ecb == probe->dtpr_ecb_last) { + dtrace_predicate_t *p = probe->dtpr_ecb->dte_predicate; + + ASSERT(probe->dtpr_ecb->dte_next == NULL); + + if (p != NULL) + probe->dtpr_predcache = p->dtp_cacheid; + } + + ecb->dte_next = NULL; + } +} + +static void +dtrace_ecb_destroy(dtrace_ecb_t *ecb) +{ + dtrace_state_t *state = ecb->dte_state; + dtrace_vstate_t *vstate = &state->dts_vstate; + dtrace_predicate_t *pred; + dtrace_epid_t epid = ecb->dte_epid; + + ASSERT(MUTEX_HELD(&dtrace_lock)); + ASSERT(ecb->dte_next == NULL); + ASSERT(ecb->dte_probe == NULL || ecb->dte_probe->dtpr_ecb != ecb); + + if ((pred = ecb->dte_predicate) != NULL) + dtrace_predicate_release(pred, vstate); + + dtrace_ecb_action_remove(ecb); + + ASSERT(state->dts_ecbs[epid - 1] == ecb); + state->dts_ecbs[epid - 1] = NULL; + + kmem_free(ecb, sizeof (dtrace_ecb_t)); +} + +static dtrace_ecb_t * +dtrace_ecb_create(dtrace_state_t *state, dtrace_probe_t *probe, + dtrace_enabling_t *enab) +{ + dtrace_ecb_t *ecb; + dtrace_predicate_t *pred; + dtrace_actdesc_t *act; + dtrace_provider_t *prov; + dtrace_ecbdesc_t *desc = enab->dten_current; + + ASSERT(MUTEX_HELD(&dtrace_lock)); + ASSERT(state != NULL); + + ecb = dtrace_ecb_add(state, probe); + ecb->dte_uarg = desc->dted_uarg; + + if ((pred = desc->dted_pred.dtpdd_predicate) != NULL) { + dtrace_predicate_hold(pred); + ecb->dte_predicate = pred; + } + + if (probe != NULL) { + /* + * If the provider shows more leg than the consumer is old + * enough to see, we need to enable the appropriate implicit + * predicate bits to prevent the ecb from activating at + * revealing times. + * + * Providers specifying DTRACE_PRIV_USER at register time + * are stating that they need the /proc-style privilege + * model to be enforced, and this is what DTRACE_COND_OWNER + * and DTRACE_COND_ZONEOWNER will then do at probe time. 
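+ *
+ * dtrace_probe() consults dte_cond at firing time and
+ * silently skips this ECB when the consumer's credentials
+ * do not satisfy the implied condition.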
+ */ + prov = probe->dtpr_provider; + if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLPROC) && + (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER)) + ecb->dte_cond |= DTRACE_COND_OWNER; + + if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLZONE) && + (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER)) + ecb->dte_cond |= DTRACE_COND_ZONEOWNER; + + /* + * If the provider shows us kernel innards and the user + * is lacking sufficient privilege, enable the + * DTRACE_COND_USERMODE implicit predicate. + */ + if (!(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) && + (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_KERNEL)) + ecb->dte_cond |= DTRACE_COND_USERMODE; + } + + if (dtrace_ecb_create_cache != NULL) { + /* + * If we have a cached ecb, we'll use its action list instead + * of creating our own (saving both time and space). + */ + dtrace_ecb_t *cached = dtrace_ecb_create_cache; + dtrace_action_t *act = cached->dte_action; + + if (act != NULL) { + ASSERT(act->dta_refcnt > 0); + act->dta_refcnt++; + ecb->dte_action = act; + ecb->dte_action_last = cached->dte_action_last; + ecb->dte_needed = cached->dte_needed; + ecb->dte_size = cached->dte_size; + ecb->dte_alignment = cached->dte_alignment; + } + + return (ecb); + } + + for (act = desc->dted_action; act != NULL; act = act->dtad_next) { + if ((enab->dten_error = dtrace_ecb_action_add(ecb, act)) != 0) { + dtrace_ecb_destroy(ecb); + return (NULL); + } + } + + if ((enab->dten_error = dtrace_ecb_resize(ecb)) != 0) { + dtrace_ecb_destroy(ecb); + return (NULL); + } + + return (dtrace_ecb_create_cache = ecb); +} + +static int +dtrace_ecb_create_enable(dtrace_probe_t *probe, void *arg) +{ + dtrace_ecb_t *ecb; + dtrace_enabling_t *enab = arg; + dtrace_state_t *state = enab->dten_vstate->dtvs_state; + + ASSERT(state != NULL); + + if (probe != NULL && probe->dtpr_gen < enab->dten_probegen) { + /* + * This probe was created in a generation for which this + * enabling has previously created ECBs; we don't want to + * enable it again, so just kick out. + */ + return (DTRACE_MATCH_NEXT); + } + + if ((ecb = dtrace_ecb_create(state, probe, enab)) == NULL) + return (DTRACE_MATCH_DONE); + + dtrace_ecb_enable(ecb); + return (DTRACE_MATCH_NEXT); +} + +static dtrace_ecb_t * +dtrace_epid2ecb(dtrace_state_t *state, dtrace_epid_t id) +{ + dtrace_ecb_t *ecb; + + ASSERT(MUTEX_HELD(&dtrace_lock)); + + if (id == 0 || id > state->dts_necbs) + return (NULL); + + ASSERT(state->dts_necbs > 0 && state->dts_ecbs != NULL); + ASSERT((ecb = state->dts_ecbs[id - 1]) == NULL || ecb->dte_epid == id); + + return (state->dts_ecbs[id - 1]); +} + +static dtrace_aggregation_t * +dtrace_aggid2agg(dtrace_state_t *state, dtrace_aggid_t id) +{ + dtrace_aggregation_t *agg; + + ASSERT(MUTEX_HELD(&dtrace_lock)); + + if (id == 0 || id > state->dts_naggregations) + return (NULL); + + ASSERT(state->dts_naggregations > 0 && state->dts_aggregations != NULL); + ASSERT((agg = state->dts_aggregations[id - 1]) == NULL || + agg->dtag_id == id); + + return (state->dts_aggregations[id - 1]); +} + +/* + * DTrace Buffer Functions + * + * The following functions manipulate DTrace buffers. Most of these functions + * are called in the context of establishing or processing consumer state; + * exceptions are explicitly noted. + */ + +/* + * Note: called from cross call context. This function switches the two + * buffers on a given CPU. 
The atomicity of this operation is assured by
+ * disabling interrupts while the actual switch takes place; the disabling of
+ * interrupts serializes the execution with any execution of dtrace_probe() on
+ * the same CPU.
+ */
+static void
+dtrace_buffer_switch(dtrace_buffer_t *buf)
+{
+ caddr_t tomax = buf->dtb_tomax;
+ caddr_t xamot = buf->dtb_xamot;
+ dtrace_icookie_t cookie;
+ hrtime_t now;
+
+ ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
+ ASSERT(!(buf->dtb_flags & DTRACEBUF_RING));
+
+ cookie = dtrace_interrupt_disable();
+ now = dtrace_gethrtime();
+ buf->dtb_tomax = xamot;
+ buf->dtb_xamot = tomax;
+ buf->dtb_xamot_drops = buf->dtb_drops;
+ buf->dtb_xamot_offset = buf->dtb_offset;
+ buf->dtb_xamot_errors = buf->dtb_errors;
+ buf->dtb_xamot_flags = buf->dtb_flags;
+ buf->dtb_offset = 0;
+ buf->dtb_drops = 0;
+ buf->dtb_errors = 0;
+ buf->dtb_flags &= ~(DTRACEBUF_ERROR | DTRACEBUF_DROPPED);
+ buf->dtb_interval = now - buf->dtb_switched;
+ buf->dtb_switched = now;
+ dtrace_interrupt_enable(cookie);
+}
+
+/*
+ * Note: called from cross call context. This function activates a buffer
+ * on a CPU. As with dtrace_buffer_switch(), the atomicity of the operation
+ * is guaranteed by the disabling of interrupts.
+ */
+static void
+dtrace_buffer_activate(dtrace_state_t *state)
+{
+ dtrace_buffer_t *buf;
+ dtrace_icookie_t cookie = dtrace_interrupt_disable();
+
+ buf = &state->dts_buffer[curcpu];
+
+ if (buf->dtb_tomax != NULL) {
+ /*
+ * We might like to assert that the buffer is marked inactive,
+ * but this isn't necessarily true: the buffer for the CPU
+ * that processes the BEGIN probe has its buffer activated
+ * manually. In this case, we take the (harmless) action
+ * of re-clearing the INACTIVE bit.
+ */
+ buf->dtb_flags &= ~DTRACEBUF_INACTIVE;
+ }
+
+ dtrace_interrupt_enable(cookie);
+}
+
+#ifdef __FreeBSD__
+/*
+ * Activate the specified per-CPU buffer. This is used instead of
+ * dtrace_buffer_activate() when APs have not yet started, i.e. when
+ * activating anonymous state.
+ */
+static void
+dtrace_buffer_activate_cpu(dtrace_state_t *state, int cpu)
+{
+
+ if (state->dts_buffer[cpu].dtb_tomax != NULL)
+ state->dts_buffer[cpu].dtb_flags &= ~DTRACEBUF_INACTIVE;
+}
+#endif
+
+static int
+dtrace_buffer_alloc(dtrace_buffer_t *bufs, size_t size, int flags,
+ processorid_t cpu, int *factor)
+{
+#ifdef illumos
+ cpu_t *cp;
+#endif
+ dtrace_buffer_t *buf;
+ int allocated = 0, desired = 0;
+
+#ifdef illumos
+ ASSERT(MUTEX_HELD(&cpu_lock));
+ ASSERT(MUTEX_HELD(&dtrace_lock));
+
+ *factor = 1;
+
+ if (size > dtrace_nonroot_maxsize &&
+ !PRIV_POLICY_CHOICE(CRED(), PRIV_ALL, B_FALSE))
+ return (EFBIG);
+
+ cp = cpu_list;
+
+ do {
+ if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
+ continue;
+
+ buf = &bufs[cp->cpu_id];
+
+ /*
+ * If there is already a buffer allocated for this CPU, it
+ * is only possible that this is a DR event. In this case,
+ * the buffer size must match our specified size.
+ */
+ if (buf->dtb_tomax != NULL) {
+ ASSERT(buf->dtb_size == size);
+ continue;
+ }
+
+ ASSERT(buf->dtb_xamot == NULL);
+
+ if ((buf->dtb_tomax = kmem_zalloc(size,
+ KM_NOSLEEP | KM_NORMALPRI)) == NULL)
+ goto err;
+
+ buf->dtb_size = size;
+ buf->dtb_flags = flags;
+ buf->dtb_offset = 0;
+ buf->dtb_drops = 0;
+
+ if (flags & DTRACEBUF_NOSWITCH)
+ continue;
+
+ if ((buf->dtb_xamot = kmem_zalloc(size,
+ KM_NOSLEEP | KM_NORMALPRI)) == NULL)
+ goto err;
+ } while ((cp = cp->cpu_next) != cpu_list);
+
+ return (0);
+
+err:
+ cp = cpu_list;
+
+ do {
+ if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
+ continue;
+
+ buf = &bufs[cp->cpu_id];
+ desired += 2;
+
+ if (buf->dtb_xamot != NULL) {
+ ASSERT(buf->dtb_tomax != NULL);
+ ASSERT(buf->dtb_size == size);
+ kmem_free(buf->dtb_xamot, size);
+ allocated++;
+ }
+
+ if (buf->dtb_tomax != NULL) {
+ ASSERT(buf->dtb_size == size);
+ kmem_free(buf->dtb_tomax, size);
+ allocated++;
+ }
+
+ buf->dtb_tomax = NULL;
+ buf->dtb_xamot = NULL;
+ buf->dtb_size = 0;
+ } while ((cp = cp->cpu_next) != cpu_list);
+#else
+ int i;
+
+ *factor = 1;
+#if defined(__aarch64__) || defined(__amd64__) || defined(__arm__) || \
+ defined(__mips__) || defined(__powerpc__) || defined(__riscv)
+ /*
+ * FreeBSD isn't good at limiting the amount of memory we
+ * ask to malloc, so let's place a limit here before trying
+ * to do something that might well end in tears at bedtime.
+ */
+ if (size > physmem * PAGE_SIZE / (128 * (mp_maxid + 1)))
+ return (ENOMEM);
+#endif
+
+ ASSERT(MUTEX_HELD(&dtrace_lock));
+ CPU_FOREACH(i) {
+ if (cpu != DTRACE_CPUALL && cpu != i)
+ continue;
+
+ buf = &bufs[i];
+
+ /*
+ * If there is already a buffer allocated for this CPU, it
+ * is only possible that this is a DR event. In this case,
+ * the buffer size must match our specified size.
+ */
+ if (buf->dtb_tomax != NULL) {
+ ASSERT(buf->dtb_size == size);
+ continue;
+ }
+
+ ASSERT(buf->dtb_xamot == NULL);
+
+ if ((buf->dtb_tomax = kmem_zalloc(size,
+ KM_NOSLEEP | KM_NORMALPRI)) == NULL)
+ goto err;
+
+ buf->dtb_size = size;
+ buf->dtb_flags = flags;
+ buf->dtb_offset = 0;
+ buf->dtb_drops = 0;
+
+ if (flags & DTRACEBUF_NOSWITCH)
+ continue;
+
+ if ((buf->dtb_xamot = kmem_zalloc(size,
+ KM_NOSLEEP | KM_NORMALPRI)) == NULL)
+ goto err;
+ }
+
+ return (0);
+
+err:
+ /*
+ * Error allocating memory, so free the buffers that were
+ * allocated before the failed allocation.
+ */
+ CPU_FOREACH(i) {
+ if (cpu != DTRACE_CPUALL && cpu != i)
+ continue;
+
+ buf = &bufs[i];
+ desired += 2;
+
+ if (buf->dtb_xamot != NULL) {
+ ASSERT(buf->dtb_tomax != NULL);
+ ASSERT(buf->dtb_size == size);
+ kmem_free(buf->dtb_xamot, size);
+ allocated++;
+ }
+
+ if (buf->dtb_tomax != NULL) {
+ ASSERT(buf->dtb_size == size);
+ kmem_free(buf->dtb_tomax, size);
+ allocated++;
+ }
+
+ buf->dtb_tomax = NULL;
+ buf->dtb_xamot = NULL;
+ buf->dtb_size = 0;
+
+ }
+#endif
+ *factor = desired / (allocated > 0 ? allocated : 1);
+
+ return (ENOMEM);
+}
+
+/*
+ * Note: called from probe context. This function just increments the drop
+ * count on a buffer. It has been made a function to allow for the
+ * possibility of understanding the source of mysterious drop counts. (A
+ * problem for which one may be particularly disappointed that DTrace cannot
+ * be used to understand DTrace.)
+ */
+static void
+dtrace_buffer_drop(dtrace_buffer_t *buf)
+{
+ buf->dtb_drops++;
+}
+
+/*
+ * Note: called from probe context. This function is called to reserve space
+ * in a buffer.
If mstate is non-NULL, sets the scratch base and size in the + * mstate. Returns the new offset in the buffer, or a negative value if an + * error has occurred. + */ +static intptr_t +dtrace_buffer_reserve(dtrace_buffer_t *buf, size_t needed, size_t align, + dtrace_state_t *state, dtrace_mstate_t *mstate) +{ + intptr_t offs = buf->dtb_offset, soffs; + intptr_t woffs; + caddr_t tomax; + size_t total; + + if (buf->dtb_flags & DTRACEBUF_INACTIVE) + return (-1); + + if ((tomax = buf->dtb_tomax) == NULL) { + dtrace_buffer_drop(buf); + return (-1); + } + + if (!(buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL))) { + while (offs & (align - 1)) { + /* + * Assert that our alignment is off by a number which + * is itself sizeof (uint32_t) aligned. + */ + ASSERT(!((align - (offs & (align - 1))) & + (sizeof (uint32_t) - 1))); + DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE); + offs += sizeof (uint32_t); + } + + if ((soffs = offs + needed) > buf->dtb_size) { + dtrace_buffer_drop(buf); + return (-1); + } + + if (mstate == NULL) + return (offs); + + mstate->dtms_scratch_base = (uintptr_t)tomax + soffs; + mstate->dtms_scratch_size = buf->dtb_size - soffs; + mstate->dtms_scratch_ptr = mstate->dtms_scratch_base; + + return (offs); + } + + if (buf->dtb_flags & DTRACEBUF_FILL) { + if (state->dts_activity != DTRACE_ACTIVITY_COOLDOWN && + (buf->dtb_flags & DTRACEBUF_FULL)) + return (-1); + goto out; + } + + total = needed + (offs & (align - 1)); + + /* + * For a ring buffer, life is quite a bit more complicated. Before + * we can store any padding, we need to adjust our wrapping offset. + * (If we've never before wrapped or we're not about to, no adjustment + * is required.) + */ + if ((buf->dtb_flags & DTRACEBUF_WRAPPED) || + offs + total > buf->dtb_size) { + woffs = buf->dtb_xamot_offset; + + if (offs + total > buf->dtb_size) { + /* + * We can't fit in the end of the buffer. First, a + * sanity check that we can fit in the buffer at all. + */ + if (total > buf->dtb_size) { + dtrace_buffer_drop(buf); + return (-1); + } + + /* + * We're going to be storing at the top of the buffer, + * so now we need to deal with the wrapped offset. We + * only reset our wrapped offset to 0 if it is + * currently greater than the current offset. If it + * is less than the current offset, it is because a + * previous allocation induced a wrap -- but the + * allocation didn't subsequently take the space due + * to an error or false predicate evaluation. In this + * case, we'll just leave the wrapped offset alone: if + * the wrapped offset hasn't been advanced far enough + * for this allocation, it will be adjusted in the + * lower loop. + */ + if (buf->dtb_flags & DTRACEBUF_WRAPPED) { + if (woffs >= offs) + woffs = 0; + } else { + woffs = 0; + } + + /* + * Now we know that we're going to be storing to the + * top of the buffer and that there is room for us + * there. We need to clear the buffer from the current + * offset to the end (there may be old gunk there). + */ + while (offs < buf->dtb_size) + tomax[offs++] = 0; + + /* + * We need to set our offset to zero. And because we + * are wrapping, we need to set the bit indicating as + * much. We can also adjust our needed space back + * down to the space required by the ECB -- we know + * that the top of the buffer is aligned. + */ + offs = 0; + total = needed; + buf->dtb_flags |= DTRACEBUF_WRAPPED; + } else { + /* + * There is room for us in the buffer, so we simply + * need to check the wrapped offset. 
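+ * (The wrapped offset marks the oldest record
+ * still live in the ring; the loop below retires
+ * records -- advancing woffs past them -- until
+ * our reservation fits.)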
+ */ + if (woffs < offs) { + /* + * The wrapped offset is less than the offset. + * This can happen if we allocated buffer space + * that induced a wrap, but then we didn't + * subsequently take the space due to an error + * or false predicate evaluation. This is + * okay; we know that _this_ allocation isn't + * going to induce a wrap. We still can't + * reset the wrapped offset to be zero, + * however: the space may have been trashed in + * the previous failed probe attempt. But at + * least the wrapped offset doesn't need to + * be adjusted at all... + */ + goto out; + } + } + + while (offs + total > woffs) { + dtrace_epid_t epid = *(uint32_t *)(tomax + woffs); + size_t size; + + if (epid == DTRACE_EPIDNONE) { + size = sizeof (uint32_t); + } else { + ASSERT3U(epid, <=, state->dts_necbs); + ASSERT(state->dts_ecbs[epid - 1] != NULL); + + size = state->dts_ecbs[epid - 1]->dte_size; + } + + ASSERT(woffs + size <= buf->dtb_size); + ASSERT(size != 0); + + if (woffs + size == buf->dtb_size) { + /* + * We've reached the end of the buffer; we want + * to set the wrapped offset to 0 and break + * out. However, if the offs is 0, then we're + * in a strange edge-condition: the amount of + * space that we want to reserve plus the size + * of the record that we're overwriting is + * greater than the size of the buffer. This + * is problematic because if we reserve the + * space but subsequently don't consume it (due + * to a failed predicate or error) the wrapped + * offset will be 0 -- yet the EPID at offset 0 + * will not be committed. This situation is + * relatively easy to deal with: if we're in + * this case, the buffer is indistinguishable + * from one that hasn't wrapped; we need only + * finish the job by clearing the wrapped bit, + * explicitly setting the offset to be 0, and + * zero'ing out the old data in the buffer. + */ + if (offs == 0) { + buf->dtb_flags &= ~DTRACEBUF_WRAPPED; + buf->dtb_offset = 0; + woffs = total; + + while (woffs < buf->dtb_size) + tomax[woffs++] = 0; + } + + woffs = 0; + break; + } + + woffs += size; + } + + /* + * We have a wrapped offset. It may be that the wrapped offset + * has become zero -- that's okay. + */ + buf->dtb_xamot_offset = woffs; + } + +out: + /* + * Now we can plow the buffer with any necessary padding. + */ + while (offs & (align - 1)) { + /* + * Assert that our alignment is off by a number which + * is itself sizeof (uint32_t) aligned. + */ + ASSERT(!((align - (offs & (align - 1))) & + (sizeof (uint32_t) - 1))); + DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE); + offs += sizeof (uint32_t); + } + + if (buf->dtb_flags & DTRACEBUF_FILL) { + if (offs + needed > buf->dtb_size - state->dts_reserve) { + buf->dtb_flags |= DTRACEBUF_FULL; + return (-1); + } + } + + if (mstate == NULL) + return (offs); + + /* + * For ring buffers and fill buffers, the scratch space is always + * the inactive buffer. + */ + mstate->dtms_scratch_base = (uintptr_t)buf->dtb_xamot; + mstate->dtms_scratch_size = buf->dtb_size; + mstate->dtms_scratch_ptr = mstate->dtms_scratch_base; + + return (offs); +} + +static void +dtrace_buffer_polish(dtrace_buffer_t *buf) +{ + ASSERT(buf->dtb_flags & DTRACEBUF_RING); + ASSERT(MUTEX_HELD(&dtrace_lock)); + + if (!(buf->dtb_flags & DTRACEBUF_WRAPPED)) + return; + + /* + * We need to polish the ring buffer. There are three cases: + * + * - The first (and presumably most common) is that there is no gap + * between the buffer offset and the wrapped offset. 
In this case, + * there is nothing in the buffer that isn't valid data; we can + * mark the buffer as polished and return. + * + * - The second (less common than the first but still more common + * than the third) is that there is a gap between the buffer offset + * and the wrapped offset, and the wrapped offset is larger than the + * buffer offset. This can happen because of an alignment issue, or + * can happen because of a call to dtrace_buffer_reserve() that + * didn't subsequently consume the buffer space. In this case, + * we need to zero the data from the buffer offset to the wrapped + * offset. + * + * - The third (and least common) is that there is a gap between the + * buffer offset and the wrapped offset, but the wrapped offset is + * _less_ than the buffer offset. This can only happen because a + * call to dtrace_buffer_reserve() induced a wrap, but the space + * was not subsequently consumed. In this case, we need to zero the + * space from the offset to the end of the buffer _and_ from the + * top of the buffer to the wrapped offset. + */ + if (buf->dtb_offset < buf->dtb_xamot_offset) { + bzero(buf->dtb_tomax + buf->dtb_offset, + buf->dtb_xamot_offset - buf->dtb_offset); + } + + if (buf->dtb_offset > buf->dtb_xamot_offset) { + bzero(buf->dtb_tomax + buf->dtb_offset, + buf->dtb_size - buf->dtb_offset); + bzero(buf->dtb_tomax, buf->dtb_xamot_offset); + } +} + +/* + * This routine determines if data generated at the specified time has likely + * been entirely consumed at user-level. This routine is called to determine + * if an ECB on a defunct probe (but for an active enabling) can be safely + * disabled and destroyed. + */ +static int +dtrace_buffer_consumed(dtrace_buffer_t *bufs, hrtime_t when) +{ + int i; + + for (i = 0; i < NCPU; i++) { + dtrace_buffer_t *buf = &bufs[i]; + + if (buf->dtb_size == 0) + continue; + + if (buf->dtb_flags & DTRACEBUF_RING) + return (0); + + if (!buf->dtb_switched && buf->dtb_offset != 0) + return (0); + + if (buf->dtb_switched - buf->dtb_interval < when) + return (0); + } + + return (1); +} + +static void +dtrace_buffer_free(dtrace_buffer_t *bufs) +{ + int i; + + for (i = 0; i < NCPU; i++) { + dtrace_buffer_t *buf = &bufs[i]; + + if (buf->dtb_tomax == NULL) { + ASSERT(buf->dtb_xamot == NULL); + ASSERT(buf->dtb_size == 0); + continue; + } + + if (buf->dtb_xamot != NULL) { + ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH)); + kmem_free(buf->dtb_xamot, buf->dtb_size); + } + + kmem_free(buf->dtb_tomax, buf->dtb_size); + buf->dtb_size = 0; + buf->dtb_tomax = NULL; + buf->dtb_xamot = NULL; + } +} + +/* + * DTrace Enabling Functions + */ +static dtrace_enabling_t * +dtrace_enabling_create(dtrace_vstate_t *vstate) +{ + dtrace_enabling_t *enab; + + enab = kmem_zalloc(sizeof (dtrace_enabling_t), KM_SLEEP); + enab->dten_vstate = vstate; + + return (enab); +} + +static void +dtrace_enabling_add(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb) +{ + dtrace_ecbdesc_t **ndesc; + size_t osize, nsize; + + /* + * We can't add to enablings after we've enabled them, or after we've + * retained them. 
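+ * (The dten_desc array below grows by doubling, so adding a
+ * description has constant amortized cost.)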
+ */ + ASSERT(enab->dten_probegen == 0); + ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL); + + if (enab->dten_ndesc < enab->dten_maxdesc) { + enab->dten_desc[enab->dten_ndesc++] = ecb; + return; + } + + osize = enab->dten_maxdesc * sizeof (dtrace_enabling_t *); + + if (enab->dten_maxdesc == 0) { + enab->dten_maxdesc = 1; + } else { + enab->dten_maxdesc <<= 1; + } + + ASSERT(enab->dten_ndesc < enab->dten_maxdesc); + + nsize = enab->dten_maxdesc * sizeof (dtrace_enabling_t *); + ndesc = kmem_zalloc(nsize, KM_SLEEP); + bcopy(enab->dten_desc, ndesc, osize); + if (enab->dten_desc != NULL) + kmem_free(enab->dten_desc, osize); + + enab->dten_desc = ndesc; + enab->dten_desc[enab->dten_ndesc++] = ecb; +} + +static void +dtrace_enabling_addlike(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb, + dtrace_probedesc_t *pd) +{ + dtrace_ecbdesc_t *new; + dtrace_predicate_t *pred; + dtrace_actdesc_t *act; + + /* + * We're going to create a new ECB description that matches the + * specified ECB in every way, but has the specified probe description. + */ + new = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP); + + if ((pred = ecb->dted_pred.dtpdd_predicate) != NULL) + dtrace_predicate_hold(pred); + + for (act = ecb->dted_action; act != NULL; act = act->dtad_next) + dtrace_actdesc_hold(act); + + new->dted_action = ecb->dted_action; + new->dted_pred = ecb->dted_pred; + new->dted_probe = *pd; + new->dted_uarg = ecb->dted_uarg; + + dtrace_enabling_add(enab, new); +} + +static void +dtrace_enabling_dump(dtrace_enabling_t *enab) +{ + int i; + + for (i = 0; i < enab->dten_ndesc; i++) { + dtrace_probedesc_t *desc = &enab->dten_desc[i]->dted_probe; + +#ifdef __FreeBSD__ + printf("dtrace: enabling probe %d (%s:%s:%s:%s)\n", i, + desc->dtpd_provider, desc->dtpd_mod, + desc->dtpd_func, desc->dtpd_name); +#else + cmn_err(CE_NOTE, "enabling probe %d (%s:%s:%s:%s)", i, + desc->dtpd_provider, desc->dtpd_mod, + desc->dtpd_func, desc->dtpd_name); +#endif + } +} + +static void +dtrace_enabling_destroy(dtrace_enabling_t *enab) +{ + int i; + dtrace_ecbdesc_t *ep; + dtrace_vstate_t *vstate = enab->dten_vstate; + + ASSERT(MUTEX_HELD(&dtrace_lock)); + + for (i = 0; i < enab->dten_ndesc; i++) { + dtrace_actdesc_t *act, *next; + dtrace_predicate_t *pred; + + ep = enab->dten_desc[i]; + + if ((pred = ep->dted_pred.dtpdd_predicate) != NULL) + dtrace_predicate_release(pred, vstate); + + for (act = ep->dted_action; act != NULL; act = next) { + next = act->dtad_next; + dtrace_actdesc_release(act, vstate); + } + + kmem_free(ep, sizeof (dtrace_ecbdesc_t)); + } + + if (enab->dten_desc != NULL) + kmem_free(enab->dten_desc, + enab->dten_maxdesc * sizeof (dtrace_enabling_t *)); + + /* + * If this was a retained enabling, decrement the dts_nretained count + * and take it off of the dtrace_retained list. 
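+ * (The dtrace_retained_gen bump below also tells
+ * dtrace_enabling_provide() to restart its walk of the retained
+ * list if it dropped dtrace_lock while we were destroying this
+ * enabling.)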
+ */ + if (enab->dten_prev != NULL || enab->dten_next != NULL || + dtrace_retained == enab) { + ASSERT(enab->dten_vstate->dtvs_state != NULL); + ASSERT(enab->dten_vstate->dtvs_state->dts_nretained > 0); + enab->dten_vstate->dtvs_state->dts_nretained--; + dtrace_retained_gen++; + } + + if (enab->dten_prev == NULL) { + if (dtrace_retained == enab) { + dtrace_retained = enab->dten_next; + + if (dtrace_retained != NULL) + dtrace_retained->dten_prev = NULL; + } + } else { + ASSERT(enab != dtrace_retained); + ASSERT(dtrace_retained != NULL); + enab->dten_prev->dten_next = enab->dten_next; + } + + if (enab->dten_next != NULL) { + ASSERT(dtrace_retained != NULL); + enab->dten_next->dten_prev = enab->dten_prev; + } + + kmem_free(enab, sizeof (dtrace_enabling_t)); +} + +static int +dtrace_enabling_retain(dtrace_enabling_t *enab) +{ + dtrace_state_t *state; + + ASSERT(MUTEX_HELD(&dtrace_lock)); + ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL); + ASSERT(enab->dten_vstate != NULL); + + state = enab->dten_vstate->dtvs_state; + ASSERT(state != NULL); + + /* + * We only allow each state to retain dtrace_retain_max enablings. + */ + if (state->dts_nretained >= dtrace_retain_max) + return (ENOSPC); + + state->dts_nretained++; + dtrace_retained_gen++; + + if (dtrace_retained == NULL) { + dtrace_retained = enab; + return (0); + } + + enab->dten_next = dtrace_retained; + dtrace_retained->dten_prev = enab; + dtrace_retained = enab; + + return (0); +} + +static int +dtrace_enabling_replicate(dtrace_state_t *state, dtrace_probedesc_t *match, + dtrace_probedesc_t *create) +{ + dtrace_enabling_t *new, *enab; + int found = 0, err = ENOENT; + + ASSERT(MUTEX_HELD(&dtrace_lock)); + ASSERT(strlen(match->dtpd_provider) < DTRACE_PROVNAMELEN); + ASSERT(strlen(match->dtpd_mod) < DTRACE_MODNAMELEN); + ASSERT(strlen(match->dtpd_func) < DTRACE_FUNCNAMELEN); + ASSERT(strlen(match->dtpd_name) < DTRACE_NAMELEN); + + new = dtrace_enabling_create(&state->dts_vstate); + + /* + * Iterate over all retained enablings, looking for enablings that + * match the specified state. + */ + for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) { + int i; + + /* + * dtvs_state can only be NULL for helper enablings -- and + * helper enablings can't be retained. + */ + ASSERT(enab->dten_vstate->dtvs_state != NULL); + + if (enab->dten_vstate->dtvs_state != state) + continue; + + /* + * Now iterate over each probe description; we're looking for + * an exact match to the specified probe description. + */ + for (i = 0; i < enab->dten_ndesc; i++) { + dtrace_ecbdesc_t *ep = enab->dten_desc[i]; + dtrace_probedesc_t *pd = &ep->dted_probe; + + if (strcmp(pd->dtpd_provider, match->dtpd_provider)) + continue; + + if (strcmp(pd->dtpd_mod, match->dtpd_mod)) + continue; + + if (strcmp(pd->dtpd_func, match->dtpd_func)) + continue; + + if (strcmp(pd->dtpd_name, match->dtpd_name)) + continue; + + /* + * We have a winning probe! Add it to our growing + * enabling. + */ + found = 1; + dtrace_enabling_addlike(new, ep, create); + } + } + + if (!found || (err = dtrace_enabling_retain(new)) != 0) { + dtrace_enabling_destroy(new); + return (err); + } + + return (0); +} + +static void +dtrace_enabling_retract(dtrace_state_t *state) +{ + dtrace_enabling_t *enab, *next; + + ASSERT(MUTEX_HELD(&dtrace_lock)); + + /* + * Iterate over all retained enablings, destroy the enablings retained + * for the specified state. 
+ */ + for (enab = dtrace_retained; enab != NULL; enab = next) { + next = enab->dten_next; + + /* + * dtvs_state can only be NULL for helper enablings -- and + * helper enablings can't be retained. + */ + ASSERT(enab->dten_vstate->dtvs_state != NULL); + + if (enab->dten_vstate->dtvs_state == state) { + ASSERT(state->dts_nretained > 0); + dtrace_enabling_destroy(enab); + } + } + + ASSERT(state->dts_nretained == 0); +} + +static int +dtrace_enabling_match(dtrace_enabling_t *enab, int *nmatched) +{ + int i = 0; + int matched = 0; + + ASSERT(MUTEX_HELD(&cpu_lock)); + ASSERT(MUTEX_HELD(&dtrace_lock)); + + for (i = 0; i < enab->dten_ndesc; i++) { + dtrace_ecbdesc_t *ep = enab->dten_desc[i]; + + enab->dten_current = ep; + enab->dten_error = 0; + + matched += dtrace_probe_enable(&ep->dted_probe, enab); + + if (enab->dten_error != 0) { + /* + * If we get an error half-way through enabling the + * probes, we kick out -- perhaps with some number of + * them enabled. Leaving enabled probes enabled may + * be slightly confusing for user-level, but we expect + * that no one will attempt to actually drive on in + * the face of such errors. If this is an anonymous + * enabling (indicated with a NULL nmatched pointer), + * we cmn_err() a message. We aren't expecting to + * get such an error -- such as it can exist at all, + * it would be a result of corrupted DOF in the driver + * properties. + */ + if (nmatched == NULL) { + cmn_err(CE_WARN, "dtrace_enabling_match() " + "error on %p: %d", (void *)ep, + enab->dten_error); + } + + return (enab->dten_error); + } + } + + enab->dten_probegen = dtrace_probegen; + if (nmatched != NULL) + *nmatched = matched; + + return (0); +} + +static void +dtrace_enabling_matchall(void) +{ + dtrace_enabling_t *enab; + + mutex_enter(&cpu_lock); + mutex_enter(&dtrace_lock); + + /* + * Iterate over all retained enablings to see if any probes match + * against them. We only perform this operation on enablings for which + * we have sufficient permissions by virtue of being in the global zone + * or in the same zone as the DTrace client. Because we can be called + * after dtrace_detach() has been called, we cannot assert that there + * are retained enablings. We can safely load from dtrace_retained, + * however: the taskq_destroy() at the end of dtrace_detach() will + * block pending our completion. + */ + for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) { +#ifdef illumos + cred_t *cr = enab->dten_vstate->dtvs_state->dts_cred.dcr_cred; + + if (INGLOBALZONE(curproc) || + cr != NULL && getzoneid() == crgetzoneid(cr)) +#endif + (void) dtrace_enabling_match(enab, NULL); + } + + mutex_exit(&dtrace_lock); + mutex_exit(&cpu_lock); +} + +/* + * If an enabling is to be enabled without having matched probes (that is, if + * dtrace_state_go() is to be called on the underlying dtrace_state_t), the + * enabling must be _primed_ by creating an ECB for every ECB description. + * This must be done to assure that we know the number of speculations, the + * number of aggregations, the minimum buffer size needed, etc. before we + * transition out of DTRACE_ACTIVITY_INACTIVE. To do this without actually + * enabling any probes, we create ECBs for every ECB decription, but with a + * NULL probe -- which is exactly what this function does. 
+ */ +static void +dtrace_enabling_prime(dtrace_state_t *state) +{ + dtrace_enabling_t *enab; + int i; + + for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) { + ASSERT(enab->dten_vstate->dtvs_state != NULL); + + if (enab->dten_vstate->dtvs_state != state) + continue; + + /* + * We don't want to prime an enabling more than once, lest + * we allow a malicious user to induce resource exhaustion. + * (The ECBs that result from priming an enabling aren't + * leaked -- but they also aren't deallocated until the + * consumer state is destroyed.) + */ + if (enab->dten_primed) + continue; + + for (i = 0; i < enab->dten_ndesc; i++) { + enab->dten_current = enab->dten_desc[i]; + (void) dtrace_probe_enable(NULL, enab); + } + + enab->dten_primed = 1; + } +} + +/* + * Called to indicate that probes should be provided due to retained + * enablings. This is implemented in terms of dtrace_probe_provide(), but it + * must take an initial lap through the enabling calling the dtps_provide() + * entry point explicitly to allow for autocreated probes. + */ +static void +dtrace_enabling_provide(dtrace_provider_t *prv) +{ + int i, all = 0; + dtrace_probedesc_t desc; + dtrace_genid_t gen; + + ASSERT(MUTEX_HELD(&dtrace_lock)); + ASSERT(MUTEX_HELD(&dtrace_provider_lock)); + + if (prv == NULL) { + all = 1; + prv = dtrace_provider; + } + + do { + dtrace_enabling_t *enab; + void *parg = prv->dtpv_arg; + +retry: + gen = dtrace_retained_gen; + for (enab = dtrace_retained; enab != NULL; + enab = enab->dten_next) { + for (i = 0; i < enab->dten_ndesc; i++) { + desc = enab->dten_desc[i]->dted_probe; + mutex_exit(&dtrace_lock); + prv->dtpv_pops.dtps_provide(parg, &desc); + mutex_enter(&dtrace_lock); + /* + * Process the retained enablings again if + * they have changed while we weren't holding + * dtrace_lock. + */ + if (gen != dtrace_retained_gen) + goto retry; + } + } + } while (all && (prv = prv->dtpv_next) != NULL); + + mutex_exit(&dtrace_lock); + dtrace_probe_provide(NULL, all ? NULL : prv); + mutex_enter(&dtrace_lock); +} + +/* + * Called to reap ECBs that are attached to probes from defunct providers. + */ +static void +dtrace_enabling_reap(void) +{ + dtrace_provider_t *prov; + dtrace_probe_t *probe; + dtrace_ecb_t *ecb; + hrtime_t when; + int i; + + mutex_enter(&cpu_lock); + mutex_enter(&dtrace_lock); + + for (i = 0; i < dtrace_nprobes; i++) { + if ((probe = dtrace_probes[i]) == NULL) + continue; + + if (probe->dtpr_ecb == NULL) + continue; + + prov = probe->dtpr_provider; + + if ((when = prov->dtpv_defunct) == 0) + continue; + + /* + * We have ECBs on a defunct provider: we want to reap these + * ECBs to allow the provider to unregister. The destruction + * of these ECBs must be done carefully: if we destroy the ECB + * and the consumer later wishes to consume an EPID that + * corresponds to the destroyed ECB (and if the EPID metadata + * has not been previously consumed), the consumer will abort + * processing on the unknown EPID. To reduce (but not, sadly, + * eliminate) the possibility of this, we will only destroy an + * ECB for a defunct provider if, for the state that + * corresponds to the ECB: + * + * (a) There is no speculative tracing (which can effectively + * cache an EPID for an arbitrary amount of time). + * + * (b) The principal buffers have been switched twice since the + * provider became defunct. + * + * (c) The aggregation buffers are of zero size or have been + * switched twice since the provider became defunct. 
+ * + * We use dts_speculates to determine (a) and call a function + * (dtrace_buffer_consumed()) to determine (b) and (c). Note + * that as soon as we've been unable to destroy one of the ECBs + * associated with the probe, we quit trying -- reaping is only + * fruitful in as much as we can destroy all ECBs associated + * with the defunct provider's probes. + */ + while ((ecb = probe->dtpr_ecb) != NULL) { + dtrace_state_t *state = ecb->dte_state; + dtrace_buffer_t *buf = state->dts_buffer; + dtrace_buffer_t *aggbuf = state->dts_aggbuffer; + + if (state->dts_speculates) + break; + + if (!dtrace_buffer_consumed(buf, when)) + break; + + if (!dtrace_buffer_consumed(aggbuf, when)) + break; + + dtrace_ecb_disable(ecb); + ASSERT(probe->dtpr_ecb != ecb); + dtrace_ecb_destroy(ecb); + } + } + + mutex_exit(&dtrace_lock); + mutex_exit(&cpu_lock); +} + +/* + * DTrace DOF Functions + */ +/*ARGSUSED*/ +static void +dtrace_dof_error(dof_hdr_t *dof, const char *str) +{ + if (dtrace_err_verbose) + cmn_err(CE_WARN, "failed to process DOF: %s", str); + +#ifdef DTRACE_ERRDEBUG + dtrace_errdebug(str); +#endif +} + +/* + * Create DOF out of a currently enabled state. Right now, we only create + * DOF containing the run-time options -- but this could be expanded to create + * complete DOF representing the enabled state. + */ +static dof_hdr_t * +dtrace_dof_create(dtrace_state_t *state) +{ + dof_hdr_t *dof; + dof_sec_t *sec; + dof_optdesc_t *opt; + int i, len = sizeof (dof_hdr_t) + + roundup(sizeof (dof_sec_t), sizeof (uint64_t)) + + sizeof (dof_optdesc_t) * DTRACEOPT_MAX; + + ASSERT(MUTEX_HELD(&dtrace_lock)); + + dof = kmem_zalloc(len, KM_SLEEP); + dof->dofh_ident[DOF_ID_MAG0] = DOF_MAG_MAG0; + dof->dofh_ident[DOF_ID_MAG1] = DOF_MAG_MAG1; + dof->dofh_ident[DOF_ID_MAG2] = DOF_MAG_MAG2; + dof->dofh_ident[DOF_ID_MAG3] = DOF_MAG_MAG3; + + dof->dofh_ident[DOF_ID_MODEL] = DOF_MODEL_NATIVE; + dof->dofh_ident[DOF_ID_ENCODING] = DOF_ENCODE_NATIVE; + dof->dofh_ident[DOF_ID_VERSION] = DOF_VERSION; + dof->dofh_ident[DOF_ID_DIFVERS] = DIF_VERSION; + dof->dofh_ident[DOF_ID_DIFIREG] = DIF_DIR_NREGS; + dof->dofh_ident[DOF_ID_DIFTREG] = DIF_DTR_NREGS; + + dof->dofh_flags = 0; + dof->dofh_hdrsize = sizeof (dof_hdr_t); + dof->dofh_secsize = sizeof (dof_sec_t); + dof->dofh_secnum = 1; /* only DOF_SECT_OPTDESC */ + dof->dofh_secoff = sizeof (dof_hdr_t); + dof->dofh_loadsz = len; + dof->dofh_filesz = len; + dof->dofh_pad = 0; + + /* + * Fill in the option section header... + */ + sec = (dof_sec_t *)((uintptr_t)dof + sizeof (dof_hdr_t)); + sec->dofs_type = DOF_SECT_OPTDESC; + sec->dofs_align = sizeof (uint64_t); + sec->dofs_flags = DOF_SECF_LOAD; + sec->dofs_entsize = sizeof (dof_optdesc_t); + + opt = (dof_optdesc_t *)((uintptr_t)sec + + roundup(sizeof (dof_sec_t), sizeof (uint64_t))); + + sec->dofs_offset = (uintptr_t)opt - (uintptr_t)dof; + sec->dofs_size = sizeof (dof_optdesc_t) * DTRACEOPT_MAX; + + for (i = 0; i < DTRACEOPT_MAX; i++) { + opt[i].dofo_option = i; + opt[i].dofo_strtab = DOF_SECIDX_NONE; + opt[i].dofo_value = state->dts_options[i]; + } + + return (dof); +} + +static dof_hdr_t * +dtrace_dof_copyin(uintptr_t uarg, int *errp) +{ + dof_hdr_t hdr, *dof; + + ASSERT(!MUTEX_HELD(&dtrace_lock)); + + /* + * First, we're going to copyin() the sizeof (dof_hdr_t). + */ + if (copyin((void *)uarg, &hdr, sizeof (hdr)) != 0) { + dtrace_dof_error(NULL, "failed to copyin DOF header"); + *errp = EFAULT; + return (NULL); + } + + /* + * Now we'll allocate the entire DOF and copy it in -- provided + * that the length isn't outrageous. 
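+ *
+ * Note that the header is fetched from userland twice: once here and
+ * again as part of the full copyin() below. The code therefore
+ * re-checks that the dofh_loadsz embedded in the full copy still
+ * matches the value the allocation was sized from; a concurrent
+ * user-level store could otherwise leave the two out of sync.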
+ */ + if (hdr.dofh_loadsz >= dtrace_dof_maxsize) { + dtrace_dof_error(&hdr, "load size exceeds maximum"); + *errp = E2BIG; + return (NULL); + } + + if (hdr.dofh_loadsz < sizeof (hdr)) { + dtrace_dof_error(&hdr, "invalid load size"); + *errp = EINVAL; + return (NULL); + } + + dof = kmem_alloc(hdr.dofh_loadsz, KM_SLEEP); + + if (copyin((void *)uarg, dof, hdr.dofh_loadsz) != 0 || + dof->dofh_loadsz != hdr.dofh_loadsz) { + kmem_free(dof, hdr.dofh_loadsz); + *errp = EFAULT; + return (NULL); + } + + return (dof); +} + +#ifdef __FreeBSD__ +static dof_hdr_t * +dtrace_dof_copyin_proc(struct proc *p, uintptr_t uarg, int *errp) +{ + dof_hdr_t hdr, *dof; + struct thread *td; + size_t loadsz; + + ASSERT(!MUTEX_HELD(&dtrace_lock)); + + td = curthread; + + /* + * First, we're going to copyin() the sizeof (dof_hdr_t). + */ + if (proc_readmem(td, p, uarg, &hdr, sizeof(hdr)) != sizeof(hdr)) { + dtrace_dof_error(NULL, "failed to copyin DOF header"); + *errp = EFAULT; + return (NULL); + } + + /* + * Now we'll allocate the entire DOF and copy it in -- provided + * that the length isn't outrageous. + */ + if (hdr.dofh_loadsz >= dtrace_dof_maxsize) { + dtrace_dof_error(&hdr, "load size exceeds maximum"); + *errp = E2BIG; + return (NULL); + } + loadsz = (size_t)hdr.dofh_loadsz; + + if (loadsz < sizeof (hdr)) { + dtrace_dof_error(&hdr, "invalid load size"); + *errp = EINVAL; + return (NULL); + } + + dof = kmem_alloc(loadsz, KM_SLEEP); + + if (proc_readmem(td, p, uarg, dof, loadsz) != loadsz || + dof->dofh_loadsz != loadsz) { + kmem_free(dof, hdr.dofh_loadsz); + *errp = EFAULT; + return (NULL); + } + + return (dof); +} + +static __inline uchar_t +dtrace_dof_char(char c) +{ + + switch (c) { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + return (c - '0'); + case 'A': + case 'B': + case 'C': + case 'D': + case 'E': + case 'F': + return (c - 'A' + 10); + case 'a': + case 'b': + case 'c': + case 'd': + case 'e': + case 'f': + return (c - 'a' + 10); + } + /* Should not reach here. */ + return (UCHAR_MAX); +} +#endif /* __FreeBSD__ */ + +static dof_hdr_t * +dtrace_dof_property(const char *name) +{ +#ifdef __FreeBSD__ + uint8_t *dofbuf; + u_char *data, *eol; + caddr_t doffile; + size_t bytes, len, i; + dof_hdr_t *dof; + u_char c1, c2; + + dof = NULL; + + doffile = preload_search_by_type("dtrace_dof"); + if (doffile == NULL) + return (NULL); + + data = preload_fetch_addr(doffile); + len = preload_fetch_size(doffile); + for (;;) { + /* Look for the end of the line. All lines end in a newline. */ + eol = memchr(data, '\n', len); + if (eol == NULL) + return (NULL); + + if (strncmp(name, data, strlen(name)) == 0) + break; + + eol++; /* skip past the newline */ + len -= eol - data; + data = eol; + } + + /* We've found the data corresponding to the specified key. */ + + data += strlen(name) + 1; /* skip past the '=' */ + len = eol - data; + if (len % 2 != 0) { + dtrace_dof_error(NULL, "invalid DOF encoding length"); + goto doferr; + } + bytes = len / 2; + if (bytes < sizeof(dof_hdr_t)) { + dtrace_dof_error(NULL, "truncated header"); + goto doferr; + } + + /* + * Each byte is represented by the two ASCII characters in its hex + * representation. 
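+ *
+ * For example, the byte 0x7f arrives as the two characters '7' and
+ * 'f'; the decode loop below recovers it as
+ * dtrace_dof_char('7') * 16 + dtrace_dof_char('f') == 7 * 16 + 15.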
+ */
+ dofbuf = malloc(bytes, M_SOLARIS, M_WAITOK);
+ dof = (dof_hdr_t *)dofbuf; /* set now so doferr frees the buffer */
+ for (i = 0; i < bytes; i++) {
+ c1 = dtrace_dof_char(data[i * 2]);
+ c2 = dtrace_dof_char(data[i * 2 + 1]);
+ if (c1 == UCHAR_MAX || c2 == UCHAR_MAX) {
+ dtrace_dof_error(NULL, "invalid hex char in DOF");
+ goto doferr;
+ }
+ dofbuf[i] = c1 * 16 + c2;
+ }
+
+ if (bytes < dof->dofh_loadsz) {
+ dtrace_dof_error(NULL, "truncated DOF");
+ goto doferr;
+ }
+
+ if (dof->dofh_loadsz >= dtrace_dof_maxsize) {
+ dtrace_dof_error(NULL, "oversized DOF");
+ goto doferr;
+ }
+
+ return (dof);
+
+doferr:
+ free(dof, M_SOLARIS);
+ return (NULL);
+#else /* __FreeBSD__ */
+ uchar_t *buf;
+ uint64_t loadsz;
+ unsigned int len, i;
+ dof_hdr_t *dof;
+
+ /*
+ * Unfortunately, arrays of values in .conf files are always (and
+ * only) interpreted to be integer arrays. We must read our DOF
+ * as an integer array, and then squeeze it into a byte array.
+ */
+ if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dtrace_devi, 0,
+ (char *)name, (int **)&buf, &len) != DDI_PROP_SUCCESS)
+ return (NULL);
+
+ for (i = 0; i < len; i++)
+ buf[i] = (uchar_t)(((int *)buf)[i]);
+
+ if (len < sizeof (dof_hdr_t)) {
+ ddi_prop_free(buf);
+ dtrace_dof_error(NULL, "truncated header");
+ return (NULL);
+ }
+
+ if (len < (loadsz = ((dof_hdr_t *)buf)->dofh_loadsz)) {
+ ddi_prop_free(buf);
+ dtrace_dof_error(NULL, "truncated DOF");
+ return (NULL);
+ }
+
+ if (loadsz >= dtrace_dof_maxsize) {
+ ddi_prop_free(buf);
+ dtrace_dof_error(NULL, "oversized DOF");
+ return (NULL);
+ }
+
+ dof = kmem_alloc(loadsz, KM_SLEEP);
+ bcopy(buf, dof, loadsz);
+ ddi_prop_free(buf);
+
+ return (dof);
+#endif /* !__FreeBSD__ */
+}
+
+static void
+dtrace_dof_destroy(dof_hdr_t *dof)
+{
+ kmem_free(dof, dof->dofh_loadsz);
+}
+
+/*
+ * Return the dof_sec_t pointer corresponding to a given section index. If the
+ * index is not valid, dtrace_dof_error() is called and NULL is returned. If
+ * a type other than DOF_SECT_NONE is specified, the header is checked against
+ * this type and NULL is returned if the types do not match.
+ */ +static dof_sec_t * +dtrace_dof_sect(dof_hdr_t *dof, uint32_t type, dof_secidx_t i) +{ + dof_sec_t *sec = (dof_sec_t *)(uintptr_t) + ((uintptr_t)dof + dof->dofh_secoff + i * dof->dofh_secsize); + + if (i >= dof->dofh_secnum) { + dtrace_dof_error(dof, "referenced section index is invalid"); + return (NULL); + } + + if (!(sec->dofs_flags & DOF_SECF_LOAD)) { + dtrace_dof_error(dof, "referenced section is not loadable"); + return (NULL); + } + + if (type != DOF_SECT_NONE && type != sec->dofs_type) { + dtrace_dof_error(dof, "referenced section is the wrong type"); + return (NULL); + } + + return (sec); +} + +static dtrace_probedesc_t * +dtrace_dof_probedesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_probedesc_t *desc) +{ + dof_probedesc_t *probe; + dof_sec_t *strtab; + uintptr_t daddr = (uintptr_t)dof; + uintptr_t str; + size_t size; + + if (sec->dofs_type != DOF_SECT_PROBEDESC) { + dtrace_dof_error(dof, "invalid probe section"); + return (NULL); + } + + if (sec->dofs_align != sizeof (dof_secidx_t)) { + dtrace_dof_error(dof, "bad alignment in probe description"); + return (NULL); + } + + if (sec->dofs_offset + sizeof (dof_probedesc_t) > dof->dofh_loadsz) { + dtrace_dof_error(dof, "truncated probe description"); + return (NULL); + } + + probe = (dof_probedesc_t *)(uintptr_t)(daddr + sec->dofs_offset); + strtab = dtrace_dof_sect(dof, DOF_SECT_STRTAB, probe->dofp_strtab); + + if (strtab == NULL) + return (NULL); + + str = daddr + strtab->dofs_offset; + size = strtab->dofs_size; + + if (probe->dofp_provider >= strtab->dofs_size) { + dtrace_dof_error(dof, "corrupt probe provider"); + return (NULL); + } + + (void) strncpy(desc->dtpd_provider, + (char *)(str + probe->dofp_provider), + MIN(DTRACE_PROVNAMELEN - 1, size - probe->dofp_provider)); + + if (probe->dofp_mod >= strtab->dofs_size) { + dtrace_dof_error(dof, "corrupt probe module"); + return (NULL); + } + + (void) strncpy(desc->dtpd_mod, (char *)(str + probe->dofp_mod), + MIN(DTRACE_MODNAMELEN - 1, size - probe->dofp_mod)); + + if (probe->dofp_func >= strtab->dofs_size) { + dtrace_dof_error(dof, "corrupt probe function"); + return (NULL); + } + + (void) strncpy(desc->dtpd_func, (char *)(str + probe->dofp_func), + MIN(DTRACE_FUNCNAMELEN - 1, size - probe->dofp_func)); + + if (probe->dofp_name >= strtab->dofs_size) { + dtrace_dof_error(dof, "corrupt probe name"); + return (NULL); + } + + (void) strncpy(desc->dtpd_name, (char *)(str + probe->dofp_name), + MIN(DTRACE_NAMELEN - 1, size - probe->dofp_name)); + + return (desc); +} + +static dtrace_difo_t * +dtrace_dof_difo(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate, + cred_t *cr) +{ + dtrace_difo_t *dp; + size_t ttl = 0; + dof_difohdr_t *dofd; + uintptr_t daddr = (uintptr_t)dof; + size_t max = dtrace_difo_maxsize; + int i, l, n; + + static const struct { + int section; + int bufoffs; + int lenoffs; + int entsize; + int align; + const char *msg; + } difo[] = { + { DOF_SECT_DIF, offsetof(dtrace_difo_t, dtdo_buf), + offsetof(dtrace_difo_t, dtdo_len), sizeof (dif_instr_t), + sizeof (dif_instr_t), "multiple DIF sections" }, + + { DOF_SECT_INTTAB, offsetof(dtrace_difo_t, dtdo_inttab), + offsetof(dtrace_difo_t, dtdo_intlen), sizeof (uint64_t), + sizeof (uint64_t), "multiple integer tables" }, + + { DOF_SECT_STRTAB, offsetof(dtrace_difo_t, dtdo_strtab), + offsetof(dtrace_difo_t, dtdo_strlen), 0, + sizeof (char), "multiple string tables" }, + + { DOF_SECT_VARTAB, offsetof(dtrace_difo_t, dtdo_vartab), + offsetof(dtrace_difo_t, dtdo_varlen), sizeof (dtrace_difv_t), + sizeof (uint_t), "multiple 
variable tables" }, + + { DOF_SECT_NONE, 0, 0, 0, 0, NULL } + }; + + if (sec->dofs_type != DOF_SECT_DIFOHDR) { + dtrace_dof_error(dof, "invalid DIFO header section"); + return (NULL); + } + + if (sec->dofs_align != sizeof (dof_secidx_t)) { + dtrace_dof_error(dof, "bad alignment in DIFO header"); + return (NULL); + } + + if (sec->dofs_size < sizeof (dof_difohdr_t) || + sec->dofs_size % sizeof (dof_secidx_t)) { + dtrace_dof_error(dof, "bad size in DIFO header"); + return (NULL); + } + + dofd = (dof_difohdr_t *)(uintptr_t)(daddr + sec->dofs_offset); + n = (sec->dofs_size - sizeof (*dofd)) / sizeof (dof_secidx_t) + 1; + + dp = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP); + dp->dtdo_rtype = dofd->dofd_rtype; + + for (l = 0; l < n; l++) { + dof_sec_t *subsec; + void **bufp; + uint32_t *lenp; + + if ((subsec = dtrace_dof_sect(dof, DOF_SECT_NONE, + dofd->dofd_links[l])) == NULL) + goto err; /* invalid section link */ + + if (ttl + subsec->dofs_size > max) { + dtrace_dof_error(dof, "exceeds maximum size"); + goto err; + } + + ttl += subsec->dofs_size; + + for (i = 0; difo[i].section != DOF_SECT_NONE; i++) { + if (subsec->dofs_type != difo[i].section) + continue; + + if (!(subsec->dofs_flags & DOF_SECF_LOAD)) { + dtrace_dof_error(dof, "section not loaded"); + goto err; + } + + if (subsec->dofs_align != difo[i].align) { + dtrace_dof_error(dof, "bad alignment"); + goto err; + } + + bufp = (void **)((uintptr_t)dp + difo[i].bufoffs); + lenp = (uint32_t *)((uintptr_t)dp + difo[i].lenoffs); + + if (*bufp != NULL) { + dtrace_dof_error(dof, difo[i].msg); + goto err; + } + + if (difo[i].entsize != subsec->dofs_entsize) { + dtrace_dof_error(dof, "entry size mismatch"); + goto err; + } + + if (subsec->dofs_entsize != 0 && + (subsec->dofs_size % subsec->dofs_entsize) != 0) { + dtrace_dof_error(dof, "corrupt entry size"); + goto err; + } + + *lenp = subsec->dofs_size; + *bufp = kmem_alloc(subsec->dofs_size, KM_SLEEP); + bcopy((char *)(uintptr_t)(daddr + subsec->dofs_offset), + *bufp, subsec->dofs_size); + + if (subsec->dofs_entsize != 0) + *lenp /= subsec->dofs_entsize; + + break; + } + + /* + * If we encounter a loadable DIFO sub-section that is not + * known to us, assume this is a broken program and fail. + */ + if (difo[i].section == DOF_SECT_NONE && + (subsec->dofs_flags & DOF_SECF_LOAD)) { + dtrace_dof_error(dof, "unrecognized DIFO subsection"); + goto err; + } + } + + if (dp->dtdo_buf == NULL) { + /* + * We can't have a DIF object without DIF text. + */ + dtrace_dof_error(dof, "missing DIF text"); + goto err; + } + + /* + * Before we validate the DIF object, run through the variable table + * looking for the strings -- if any of their size are under, we'll set + * their size to be the system-wide default string size. Note that + * this should _not_ happen if the "strsize" option has been set -- + * in this case, the compiler should have set the size to reflect the + * setting of the option. 
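+ *
+ * As an illustrative sketch: a D string variable compiled without an
+ * explicit "strsize" setting arrives here with dtdt_kind equal to
+ * DIF_TYPE_STRING and dtdt_size equal to 0, and the loop below
+ * rewrites its size to dtrace_strsize_default.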
+ */ + for (i = 0; i < dp->dtdo_varlen; i++) { + dtrace_difv_t *v = &dp->dtdo_vartab[i]; + dtrace_diftype_t *t = &v->dtdv_type; + + if (v->dtdv_id < DIF_VAR_OTHER_UBASE) + continue; + + if (t->dtdt_kind == DIF_TYPE_STRING && t->dtdt_size == 0) + t->dtdt_size = dtrace_strsize_default; + } + + if (dtrace_difo_validate(dp, vstate, DIF_DIR_NREGS, cr) != 0) + goto err; + + dtrace_difo_init(dp, vstate); + return (dp); + +err: + kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t)); + kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t)); + kmem_free(dp->dtdo_strtab, dp->dtdo_strlen); + kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t)); + + kmem_free(dp, sizeof (dtrace_difo_t)); + return (NULL); +} + +static dtrace_predicate_t * +dtrace_dof_predicate(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate, + cred_t *cr) +{ + dtrace_difo_t *dp; + + if ((dp = dtrace_dof_difo(dof, sec, vstate, cr)) == NULL) + return (NULL); + + return (dtrace_predicate_create(dp)); +} + +static dtrace_actdesc_t * +dtrace_dof_actdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate, + cred_t *cr) +{ + dtrace_actdesc_t *act, *first = NULL, *last = NULL, *next; + dof_actdesc_t *desc; + dof_sec_t *difosec; + size_t offs; + uintptr_t daddr = (uintptr_t)dof; + uint64_t arg; + dtrace_actkind_t kind; + + if (sec->dofs_type != DOF_SECT_ACTDESC) { + dtrace_dof_error(dof, "invalid action section"); + return (NULL); + } + + if (sec->dofs_offset + sizeof (dof_actdesc_t) > dof->dofh_loadsz) { + dtrace_dof_error(dof, "truncated action description"); + return (NULL); + } + + if (sec->dofs_align != sizeof (uint64_t)) { + dtrace_dof_error(dof, "bad alignment in action description"); + return (NULL); + } + + if (sec->dofs_size < sec->dofs_entsize) { + dtrace_dof_error(dof, "section entry size exceeds total size"); + return (NULL); + } + + if (sec->dofs_entsize != sizeof (dof_actdesc_t)) { + dtrace_dof_error(dof, "bad entry size in action description"); + return (NULL); + } + + if (sec->dofs_size / sec->dofs_entsize > dtrace_actions_max) { + dtrace_dof_error(dof, "actions exceed dtrace_actions_max"); + return (NULL); + } + + for (offs = 0; offs < sec->dofs_size; offs += sec->dofs_entsize) { + desc = (dof_actdesc_t *)(daddr + + (uintptr_t)sec->dofs_offset + offs); + kind = (dtrace_actkind_t)desc->dofa_kind; + + if ((DTRACEACT_ISPRINTFLIKE(kind) && + (kind != DTRACEACT_PRINTA || + desc->dofa_strtab != DOF_SECIDX_NONE)) || + (kind == DTRACEACT_DIFEXPR && + desc->dofa_strtab != DOF_SECIDX_NONE)) { + dof_sec_t *strtab; + char *str, *fmt; + uint64_t i; + + /* + * The argument to these actions is an index into the + * DOF string table. For printf()-like actions, this + * is the format string. For print(), this is the + * CTF type of the expression result. 
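+ *
+ * For example, for the action printf("ptr is %p", arg0), dofa_arg
+ * is the offset of the string "ptr is %p" within the referenced
+ * DOF_SECT_STRTAB section; the code below walks to its terminating
+ * NUL and copies it into fmt.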
+ */ + if ((strtab = dtrace_dof_sect(dof, + DOF_SECT_STRTAB, desc->dofa_strtab)) == NULL) + goto err; + + str = (char *)((uintptr_t)dof + + (uintptr_t)strtab->dofs_offset); + + for (i = desc->dofa_arg; i < strtab->dofs_size; i++) { + if (str[i] == '\0') + break; + } + + if (i >= strtab->dofs_size) { + dtrace_dof_error(dof, "bogus format string"); + goto err; + } + + if (i == desc->dofa_arg) { + dtrace_dof_error(dof, "empty format string"); + goto err; + } + + i -= desc->dofa_arg; + fmt = kmem_alloc(i + 1, KM_SLEEP); + bcopy(&str[desc->dofa_arg], fmt, i + 1); + arg = (uint64_t)(uintptr_t)fmt; + } else { + if (kind == DTRACEACT_PRINTA) { + ASSERT(desc->dofa_strtab == DOF_SECIDX_NONE); + arg = 0; + } else { + arg = desc->dofa_arg; + } + } + + act = dtrace_actdesc_create(kind, desc->dofa_ntuple, + desc->dofa_uarg, arg); + + if (last != NULL) { + last->dtad_next = act; + } else { + first = act; + } + + last = act; + + if (desc->dofa_difo == DOF_SECIDX_NONE) + continue; + + if ((difosec = dtrace_dof_sect(dof, + DOF_SECT_DIFOHDR, desc->dofa_difo)) == NULL) + goto err; + + act->dtad_difo = dtrace_dof_difo(dof, difosec, vstate, cr); + + if (act->dtad_difo == NULL) + goto err; + } + + ASSERT(first != NULL); + return (first); + +err: + for (act = first; act != NULL; act = next) { + next = act->dtad_next; + dtrace_actdesc_release(act, vstate); + } + + return (NULL); +} + +static dtrace_ecbdesc_t * +dtrace_dof_ecbdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate, + cred_t *cr) +{ + dtrace_ecbdesc_t *ep; + dof_ecbdesc_t *ecb; + dtrace_probedesc_t *desc; + dtrace_predicate_t *pred = NULL; + + if (sec->dofs_size < sizeof (dof_ecbdesc_t)) { + dtrace_dof_error(dof, "truncated ECB description"); + return (NULL); + } + + if (sec->dofs_align != sizeof (uint64_t)) { + dtrace_dof_error(dof, "bad alignment in ECB description"); + return (NULL); + } + + ecb = (dof_ecbdesc_t *)((uintptr_t)dof + (uintptr_t)sec->dofs_offset); + sec = dtrace_dof_sect(dof, DOF_SECT_PROBEDESC, ecb->dofe_probes); + + if (sec == NULL) + return (NULL); + + ep = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP); + ep->dted_uarg = ecb->dofe_uarg; + desc = &ep->dted_probe; + + if (dtrace_dof_probedesc(dof, sec, desc) == NULL) + goto err; + + if (ecb->dofe_pred != DOF_SECIDX_NONE) { + if ((sec = dtrace_dof_sect(dof, + DOF_SECT_DIFOHDR, ecb->dofe_pred)) == NULL) + goto err; + + if ((pred = dtrace_dof_predicate(dof, sec, vstate, cr)) == NULL) + goto err; + + ep->dted_pred.dtpdd_predicate = pred; + } + + if (ecb->dofe_actions != DOF_SECIDX_NONE) { + if ((sec = dtrace_dof_sect(dof, + DOF_SECT_ACTDESC, ecb->dofe_actions)) == NULL) + goto err; + + ep->dted_action = dtrace_dof_actdesc(dof, sec, vstate, cr); + + if (ep->dted_action == NULL) + goto err; + } + + return (ep); + +err: + if (pred != NULL) + dtrace_predicate_release(pred, vstate); + kmem_free(ep, sizeof (dtrace_ecbdesc_t)); + return (NULL); +} + +/* + * Apply the relocations from the specified 'sec' (a DOF_SECT_URELHDR) to the + * specified DOF. SETX relocations are computed using 'ubase', the base load + * address of the object containing the DOF, and DOFREL relocations are relative + * to the relocation offset within the DOF. 
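+ *
+ * Summarizing the arithmetic below: a SETX relocation rewrites the
+ * 64-bit word at dofr_offset in the target section to
+ * old value + ubase, while a DOFREL relocation rewrites it to
+ * old value + udaddr + dofs_offset + dofr_offset, i.e. the address
+ * of the word itself within the user mapping of the DOF.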
+ */ +static int +dtrace_dof_relocate(dof_hdr_t *dof, dof_sec_t *sec, uint64_t ubase, + uint64_t udaddr) +{ + uintptr_t daddr = (uintptr_t)dof; + uintptr_t ts_end; + dof_relohdr_t *dofr = + (dof_relohdr_t *)(uintptr_t)(daddr + sec->dofs_offset); + dof_sec_t *ss, *rs, *ts; + dof_relodesc_t *r; + uint_t i, n; + + if (sec->dofs_size < sizeof (dof_relohdr_t) || + sec->dofs_align != sizeof (dof_secidx_t)) { + dtrace_dof_error(dof, "invalid relocation header"); + return (-1); + } + + ss = dtrace_dof_sect(dof, DOF_SECT_STRTAB, dofr->dofr_strtab); + rs = dtrace_dof_sect(dof, DOF_SECT_RELTAB, dofr->dofr_relsec); + ts = dtrace_dof_sect(dof, DOF_SECT_NONE, dofr->dofr_tgtsec); + ts_end = (uintptr_t)ts + sizeof (dof_sec_t); + + if (ss == NULL || rs == NULL || ts == NULL) + return (-1); /* dtrace_dof_error() has been called already */ + + if (rs->dofs_entsize < sizeof (dof_relodesc_t) || + rs->dofs_align != sizeof (uint64_t)) { + dtrace_dof_error(dof, "invalid relocation section"); + return (-1); + } + + r = (dof_relodesc_t *)(uintptr_t)(daddr + rs->dofs_offset); + n = rs->dofs_size / rs->dofs_entsize; + + for (i = 0; i < n; i++) { + uintptr_t taddr = daddr + ts->dofs_offset + r->dofr_offset; + + switch (r->dofr_type) { + case DOF_RELO_NONE: + break; + case DOF_RELO_SETX: + case DOF_RELO_DOFREL: + if (r->dofr_offset >= ts->dofs_size || r->dofr_offset + + sizeof (uint64_t) > ts->dofs_size) { + dtrace_dof_error(dof, "bad relocation offset"); + return (-1); + } + + if (taddr >= (uintptr_t)ts && taddr < ts_end) { + dtrace_dof_error(dof, "bad relocation offset"); + return (-1); + } + + if (!IS_P2ALIGNED(taddr, sizeof (uint64_t))) { + dtrace_dof_error(dof, "misaligned setx relo"); + return (-1); + } + + if (r->dofr_type == DOF_RELO_SETX) + *(uint64_t *)taddr += ubase; + else + *(uint64_t *)taddr += + udaddr + ts->dofs_offset + r->dofr_offset; + break; + default: + dtrace_dof_error(dof, "invalid relocation type"); + return (-1); + } + + r = (dof_relodesc_t *)((uintptr_t)r + rs->dofs_entsize); + } + + return (0); +} + +/* + * The dof_hdr_t passed to dtrace_dof_slurp() should be a partially validated + * header: it should be at the front of a memory region that is at least + * sizeof (dof_hdr_t) in size -- and then at least dof_hdr.dofh_loadsz in + * size. It need not be validated in any other way. + */ +static int +dtrace_dof_slurp(dof_hdr_t *dof, dtrace_vstate_t *vstate, cred_t *cr, + dtrace_enabling_t **enabp, uint64_t ubase, uint64_t udaddr, int noprobes) +{ + uint64_t len = dof->dofh_loadsz, seclen; + uintptr_t daddr = (uintptr_t)dof; + dtrace_ecbdesc_t *ep; + dtrace_enabling_t *enab; + uint_t i; + + ASSERT(MUTEX_HELD(&dtrace_lock)); + ASSERT(dof->dofh_loadsz >= sizeof (dof_hdr_t)); + + /* + * Check the DOF header identification bytes. In addition to checking + * valid settings, we also verify that unused bits/bytes are zeroed so + * we can use them later without fear of regressing existing binaries. 
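+ *
+ * For reference, a well-formed native header starts with the magic
+ * bytes of DOF_MAG_STRING ("\177DOF"), followed by the model,
+ * encoding and version bytes checked individually below; the bytes
+ * from DOF_ID_PAD onward must all be zero.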
+ */ + if (bcmp(&dof->dofh_ident[DOF_ID_MAG0], + DOF_MAG_STRING, DOF_MAG_STRLEN) != 0) { + dtrace_dof_error(dof, "DOF magic string mismatch"); + return (-1); + } + + if (dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_ILP32 && + dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_LP64) { + dtrace_dof_error(dof, "DOF has invalid data model"); + return (-1); + } + + if (dof->dofh_ident[DOF_ID_ENCODING] != DOF_ENCODE_NATIVE) { + dtrace_dof_error(dof, "DOF encoding mismatch"); + return (-1); + } + + if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 && + dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_2) { + dtrace_dof_error(dof, "DOF version mismatch"); + return (-1); + } + + if (dof->dofh_ident[DOF_ID_DIFVERS] != DIF_VERSION_2) { + dtrace_dof_error(dof, "DOF uses unsupported instruction set"); + return (-1); + } + + if (dof->dofh_ident[DOF_ID_DIFIREG] > DIF_DIR_NREGS) { + dtrace_dof_error(dof, "DOF uses too many integer registers"); + return (-1); + } + + if (dof->dofh_ident[DOF_ID_DIFTREG] > DIF_DTR_NREGS) { + dtrace_dof_error(dof, "DOF uses too many tuple registers"); + return (-1); + } + + for (i = DOF_ID_PAD; i < DOF_ID_SIZE; i++) { + if (dof->dofh_ident[i] != 0) { + dtrace_dof_error(dof, "DOF has invalid ident byte set"); + return (-1); + } + } + + if (dof->dofh_flags & ~DOF_FL_VALID) { + dtrace_dof_error(dof, "DOF has invalid flag bits set"); + return (-1); + } + + if (dof->dofh_secsize == 0) { + dtrace_dof_error(dof, "zero section header size"); + return (-1); + } + + /* + * Check that the section headers don't exceed the amount of DOF + * data. Note that we cast the section size and number of sections + * to uint64_t's to prevent possible overflow in the multiplication. + */ + seclen = (uint64_t)dof->dofh_secnum * (uint64_t)dof->dofh_secsize; + + if (dof->dofh_secoff > len || seclen > len || + dof->dofh_secoff + seclen > len) { + dtrace_dof_error(dof, "truncated section headers"); + return (-1); + } + + if (!IS_P2ALIGNED(dof->dofh_secoff, sizeof (uint64_t))) { + dtrace_dof_error(dof, "misaligned section headers"); + return (-1); + } + + if (!IS_P2ALIGNED(dof->dofh_secsize, sizeof (uint64_t))) { + dtrace_dof_error(dof, "misaligned section size"); + return (-1); + } + + /* + * Take an initial pass through the section headers to be sure that + * the headers don't have stray offsets. If the 'noprobes' flag is + * set, do not permit sections relating to providers, probes, or args. 
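+ *
+ * (The bounds tests below are written to be overflow-safe: each of
+ * dofs_offset and dofs_size is compared against the load size on its
+ * own before their sum is, so a section cannot slip a wrapping
+ * offset/size pair past the check.)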
+ */ + for (i = 0; i < dof->dofh_secnum; i++) { + dof_sec_t *sec = (dof_sec_t *)(daddr + + (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize); + + if (noprobes) { + switch (sec->dofs_type) { + case DOF_SECT_PROVIDER: + case DOF_SECT_PROBES: + case DOF_SECT_PRARGS: + case DOF_SECT_PROFFS: + dtrace_dof_error(dof, "illegal sections " + "for enabling"); + return (-1); + } + } + + if (DOF_SEC_ISLOADABLE(sec->dofs_type) && + !(sec->dofs_flags & DOF_SECF_LOAD)) { + dtrace_dof_error(dof, "loadable section with load " + "flag unset"); + return (-1); + } + + if (!(sec->dofs_flags & DOF_SECF_LOAD)) + continue; /* just ignore non-loadable sections */ + + if (!ISP2(sec->dofs_align)) { + dtrace_dof_error(dof, "bad section alignment"); + return (-1); + } + + if (sec->dofs_offset & (sec->dofs_align - 1)) { + dtrace_dof_error(dof, "misaligned section"); + return (-1); + } + + if (sec->dofs_offset > len || sec->dofs_size > len || + sec->dofs_offset + sec->dofs_size > len) { + dtrace_dof_error(dof, "corrupt section header"); + return (-1); + } + + if (sec->dofs_type == DOF_SECT_STRTAB && *((char *)daddr + + sec->dofs_offset + sec->dofs_size - 1) != '\0') { + dtrace_dof_error(dof, "non-terminating string table"); + return (-1); + } + } + + /* + * Take a second pass through the sections and locate and perform any + * relocations that are present. We do this after the first pass to + * be sure that all sections have had their headers validated. + */ + for (i = 0; i < dof->dofh_secnum; i++) { + dof_sec_t *sec = (dof_sec_t *)(daddr + + (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize); + + if (!(sec->dofs_flags & DOF_SECF_LOAD)) + continue; /* skip sections that are not loadable */ + + switch (sec->dofs_type) { + case DOF_SECT_URELHDR: + if (dtrace_dof_relocate(dof, sec, ubase, udaddr) != 0) + return (-1); + break; + } + } + + if ((enab = *enabp) == NULL) + enab = *enabp = dtrace_enabling_create(vstate); + + for (i = 0; i < dof->dofh_secnum; i++) { + dof_sec_t *sec = (dof_sec_t *)(daddr + + (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize); + + if (sec->dofs_type != DOF_SECT_ECBDESC) + continue; + + if ((ep = dtrace_dof_ecbdesc(dof, sec, vstate, cr)) == NULL) { + dtrace_enabling_destroy(enab); + *enabp = NULL; + return (-1); + } + + dtrace_enabling_add(enab, ep); + } + + return (0); +} + +/* + * Process DOF for any options. This routine assumes that the DOF has been + * at least processed by dtrace_dof_slurp(). 
+ */ +static int +dtrace_dof_options(dof_hdr_t *dof, dtrace_state_t *state) +{ + int i, rval; + uint32_t entsize; + size_t offs; + dof_optdesc_t *desc; + + for (i = 0; i < dof->dofh_secnum; i++) { + dof_sec_t *sec = (dof_sec_t *)((uintptr_t)dof + + (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize); + + if (sec->dofs_type != DOF_SECT_OPTDESC) + continue; + + if (sec->dofs_align != sizeof (uint64_t)) { + dtrace_dof_error(dof, "bad alignment in " + "option description"); + return (EINVAL); + } + + if ((entsize = sec->dofs_entsize) == 0) { + dtrace_dof_error(dof, "zeroed option entry size"); + return (EINVAL); + } + + if (entsize < sizeof (dof_optdesc_t)) { + dtrace_dof_error(dof, "bad option entry size"); + return (EINVAL); + } + + for (offs = 0; offs < sec->dofs_size; offs += entsize) { + desc = (dof_optdesc_t *)((uintptr_t)dof + + (uintptr_t)sec->dofs_offset + offs); + + if (desc->dofo_strtab != DOF_SECIDX_NONE) { + dtrace_dof_error(dof, "non-zero option string"); + return (EINVAL); + } + + if (desc->dofo_value == DTRACEOPT_UNSET) { + dtrace_dof_error(dof, "unset option"); + return (EINVAL); + } + + if ((rval = dtrace_state_option(state, + desc->dofo_option, desc->dofo_value)) != 0) { + dtrace_dof_error(dof, "rejected option"); + return (rval); + } + } + } + + return (0); +} + +/* + * DTrace Consumer State Functions + */ +static int +dtrace_dstate_init(dtrace_dstate_t *dstate, size_t size) +{ + size_t hashsize, maxper, min, chunksize = dstate->dtds_chunksize; + void *base; + uintptr_t limit; + dtrace_dynvar_t *dvar, *next, *start; + int i; + + ASSERT(MUTEX_HELD(&dtrace_lock)); + ASSERT(dstate->dtds_base == NULL && dstate->dtds_percpu == NULL); + + bzero(dstate, sizeof (dtrace_dstate_t)); + + if ((dstate->dtds_chunksize = chunksize) == 0) + dstate->dtds_chunksize = DTRACE_DYNVAR_CHUNKSIZE; + + VERIFY(dstate->dtds_chunksize < LONG_MAX); + + if (size < (min = dstate->dtds_chunksize + sizeof (dtrace_dynhash_t))) + size = min; + + if ((base = kmem_zalloc(size, KM_NOSLEEP | KM_NORMALPRI)) == NULL) + return (ENOMEM); + + dstate->dtds_size = size; + dstate->dtds_base = base; + dstate->dtds_percpu = kmem_cache_alloc(dtrace_state_cache, KM_SLEEP); + bzero(dstate->dtds_percpu, NCPU * sizeof (dtrace_dstate_percpu_t)); + + hashsize = size / (dstate->dtds_chunksize + sizeof (dtrace_dynhash_t)); + + if (hashsize != 1 && (hashsize & 1)) + hashsize--; + + dstate->dtds_hashsize = hashsize; + dstate->dtds_hash = dstate->dtds_base; + + /* + * Set all of our hash buckets to point to the single sink, and (if + * it hasn't already been set), set the sink's hash value to be the + * sink sentinel value. The sink is needed for dynamic variable + * lookups to know that they have iterated over an entire, valid hash + * chain. + */ + for (i = 0; i < hashsize; i++) + dstate->dtds_hash[i].dtdh_chain = &dtrace_dynhash_sink; + + if (dtrace_dynhash_sink.dtdv_hashval != DTRACE_DYNHASH_SINK) + dtrace_dynhash_sink.dtdv_hashval = DTRACE_DYNHASH_SINK; + + /* + * Determine number of active CPUs. Divide free list evenly among + * active CPUs. 
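+ *
+ * Sketch of the layout this produces: hash buckets occupy the front
+ * of the region, and maxper below is the remainder divided by NCPU
+ * and rounded down to a chunksize multiple, so each CPU's free list
+ * covers a disjoint, chunk-aligned slice of the dynamic variable
+ * space.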
+ */ + start = (dtrace_dynvar_t *) + ((uintptr_t)base + hashsize * sizeof (dtrace_dynhash_t)); + limit = (uintptr_t)base + size; + + VERIFY((uintptr_t)start < limit); + VERIFY((uintptr_t)start >= (uintptr_t)base); + + maxper = (limit - (uintptr_t)start) / NCPU; + maxper = (maxper / dstate->dtds_chunksize) * dstate->dtds_chunksize; + +#ifndef illumos + CPU_FOREACH(i) { +#else + for (i = 0; i < NCPU; i++) { +#endif + dstate->dtds_percpu[i].dtdsc_free = dvar = start; + + /* + * If we don't even have enough chunks to make it once through + * NCPUs, we're just going to allocate everything to the first + * CPU. And if we're on the last CPU, we're going to allocate + * whatever is left over. In either case, we set the limit to + * be the limit of the dynamic variable space. + */ + if (maxper == 0 || i == NCPU - 1) { + limit = (uintptr_t)base + size; + start = NULL; + } else { + limit = (uintptr_t)start + maxper; + start = (dtrace_dynvar_t *)limit; + } + + VERIFY(limit <= (uintptr_t)base + size); + + for (;;) { + next = (dtrace_dynvar_t *)((uintptr_t)dvar + + dstate->dtds_chunksize); + + if ((uintptr_t)next + dstate->dtds_chunksize >= limit) + break; + + VERIFY((uintptr_t)dvar >= (uintptr_t)base && + (uintptr_t)dvar <= (uintptr_t)base + size); + dvar->dtdv_next = next; + dvar = next; + } + + if (maxper == 0) + break; + } + + return (0); +} + +static void +dtrace_dstate_fini(dtrace_dstate_t *dstate) +{ + ASSERT(MUTEX_HELD(&cpu_lock)); + + if (dstate->dtds_base == NULL) + return; + + kmem_free(dstate->dtds_base, dstate->dtds_size); + kmem_cache_free(dtrace_state_cache, dstate->dtds_percpu); +} + +static void +dtrace_vstate_fini(dtrace_vstate_t *vstate) +{ + /* + * Logical XOR, where are you? + */ + ASSERT((vstate->dtvs_nglobals == 0) ^ (vstate->dtvs_globals != NULL)); + + if (vstate->dtvs_nglobals > 0) { + kmem_free(vstate->dtvs_globals, vstate->dtvs_nglobals * + sizeof (dtrace_statvar_t *)); + } + + if (vstate->dtvs_ntlocals > 0) { + kmem_free(vstate->dtvs_tlocals, vstate->dtvs_ntlocals * + sizeof (dtrace_difv_t)); + } + + ASSERT((vstate->dtvs_nlocals == 0) ^ (vstate->dtvs_locals != NULL)); + + if (vstate->dtvs_nlocals > 0) { + kmem_free(vstate->dtvs_locals, vstate->dtvs_nlocals * + sizeof (dtrace_statvar_t *)); + } +} + +#ifdef illumos +static void +dtrace_state_clean(dtrace_state_t *state) +{ + if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE) + return; + + dtrace_dynvar_clean(&state->dts_vstate.dtvs_dynvars); + dtrace_speculation_clean(state); +} + +static void +dtrace_state_deadman(dtrace_state_t *state) +{ + hrtime_t now; + + dtrace_sync(); + + now = dtrace_gethrtime(); + + if (state != dtrace_anon.dta_state && + now - state->dts_laststatus >= dtrace_deadman_user) + return; + + /* + * We must be sure that dts_alive never appears to be less than the + * value upon entry to dtrace_state_deadman(), and because we lack a + * dtrace_cas64(), we cannot store to it atomically. We thus instead + * store INT64_MAX to it, followed by a memory barrier, followed by + * the new value. This assures that dts_alive never appears to be + * less than its true value, regardless of the order in which the + * stores to the underlying storage are issued. 
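+ *
+ * Schematically, a racing reader of dts_alive can therefore observe
+ * only:
+ *
+ *	old value -> INT64_MAX -> new value
+ *
+ * and never a partially-written intermediate that compares lower
+ * than the old value.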
+ */ + state->dts_alive = INT64_MAX; + dtrace_membar_producer(); + state->dts_alive = now; +} +#else /* !illumos */ +static void +dtrace_state_clean(void *arg) +{ + dtrace_state_t *state = arg; + dtrace_optval_t *opt = state->dts_options; + + if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE) + return; + + dtrace_dynvar_clean(&state->dts_vstate.dtvs_dynvars); + dtrace_speculation_clean(state); + + callout_reset(&state->dts_cleaner, hz * opt[DTRACEOPT_CLEANRATE] / NANOSEC, + dtrace_state_clean, state); +} + +static void +dtrace_state_deadman(void *arg) +{ + dtrace_state_t *state = arg; + hrtime_t now; + + dtrace_sync(); + + dtrace_debug_output(); + + now = dtrace_gethrtime(); + + if (state != dtrace_anon.dta_state && + now - state->dts_laststatus >= dtrace_deadman_user) + return; + + /* + * We must be sure that dts_alive never appears to be less than the + * value upon entry to dtrace_state_deadman(), and because we lack a + * dtrace_cas64(), we cannot store to it atomically. We thus instead + * store INT64_MAX to it, followed by a memory barrier, followed by + * the new value. This assures that dts_alive never appears to be + * less than its true value, regardless of the order in which the + * stores to the underlying storage are issued. + */ + state->dts_alive = INT64_MAX; + dtrace_membar_producer(); + state->dts_alive = now; + + callout_reset(&state->dts_deadman, hz * dtrace_deadman_interval / NANOSEC, + dtrace_state_deadman, state); +} +#endif /* illumos */ + +static dtrace_state_t * +#ifdef illumos +dtrace_state_create(dev_t *devp, cred_t *cr) +#else +dtrace_state_create(struct cdev *dev, struct ucred *cred __unused) +#endif +{ +#ifdef illumos + minor_t minor; + major_t major; +#else + cred_t *cr = NULL; + int m = 0; +#endif + char c[30]; + dtrace_state_t *state; + dtrace_optval_t *opt; + int bufsize = NCPU * sizeof (dtrace_buffer_t), i; + int cpu_it; + + ASSERT(MUTEX_HELD(&dtrace_lock)); + ASSERT(MUTEX_HELD(&cpu_lock)); + +#ifdef illumos + minor = (minor_t)(uintptr_t)vmem_alloc(dtrace_minor, 1, + VM_BESTFIT | VM_SLEEP); + + if (ddi_soft_state_zalloc(dtrace_softstate, minor) != DDI_SUCCESS) { + vmem_free(dtrace_minor, (void *)(uintptr_t)minor, 1); + return (NULL); + } + + state = ddi_get_soft_state(dtrace_softstate, minor); +#else + if (dev != NULL) { + cr = dev->si_cred; + m = dev2unit(dev); + } + + /* Allocate memory for the state. */ + state = kmem_zalloc(sizeof(dtrace_state_t), KM_SLEEP); +#endif + + state->dts_epid = DTRACE_EPIDNONE + 1; + + (void) snprintf(c, sizeof (c), "dtrace_aggid_%d", m); +#ifdef illumos + state->dts_aggid_arena = vmem_create(c, (void *)1, UINT32_MAX, 1, + NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER); + + if (devp != NULL) { + major = getemajor(*devp); + } else { + major = ddi_driver_major(dtrace_devi); + } + + state->dts_dev = makedevice(major, minor); + + if (devp != NULL) + *devp = state->dts_dev; +#else + state->dts_aggid_arena = new_unrhdr(1, INT_MAX, &dtrace_unr_mtx); + state->dts_dev = dev; +#endif + + /* + * We allocate NCPU buffers. On the one hand, this can be quite + * a bit of memory per instance (nearly 36K on a Starcat). On the + * other hand, it saves an additional memory reference in the probe + * path. + */ + state->dts_buffer = kmem_zalloc(bufsize, KM_SLEEP); + state->dts_aggbuffer = kmem_zalloc(bufsize, KM_SLEEP); + + /* + * Allocate and initialise the per-process per-CPU random state. + * SI_SUB_RANDOM < SI_SUB_DTRACE_ANON therefore entropy device is + * assumed to be seeded at this point (if from Fortuna seed file). 
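+ *
+ * (Sketch of the scheme below: CPU 0's 128-bit state is seeded from
+ * arc4random_buf(), and each subsequent CPU's state is the previous
+ * one advanced 2^64 steps by the jump function, so the per-CPU
+ * generators draw from non-overlapping subsequences without needing
+ * per-CPU entropy.)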
+ */ + arc4random_buf(&state->dts_rstate[0], 2 * sizeof(uint64_t)); + for (cpu_it = 1; cpu_it < NCPU; cpu_it++) { + /* + * Each CPU is assigned a 2^64 period, non-overlapping + * subsequence. + */ + dtrace_xoroshiro128_plus_jump(state->dts_rstate[cpu_it-1], + state->dts_rstate[cpu_it]); + } + +#ifdef illumos + state->dts_cleaner = CYCLIC_NONE; + state->dts_deadman = CYCLIC_NONE; +#else + callout_init(&state->dts_cleaner, 1); + callout_init(&state->dts_deadman, 1); +#endif + state->dts_vstate.dtvs_state = state; + + for (i = 0; i < DTRACEOPT_MAX; i++) + state->dts_options[i] = DTRACEOPT_UNSET; + + /* + * Set the default options. + */ + opt = state->dts_options; + opt[DTRACEOPT_BUFPOLICY] = DTRACEOPT_BUFPOLICY_SWITCH; + opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_AUTO; + opt[DTRACEOPT_NSPEC] = dtrace_nspec_default; + opt[DTRACEOPT_SPECSIZE] = dtrace_specsize_default; + opt[DTRACEOPT_CPU] = (dtrace_optval_t)DTRACE_CPUALL; + opt[DTRACEOPT_STRSIZE] = dtrace_strsize_default; + opt[DTRACEOPT_STACKFRAMES] = dtrace_stackframes_default; + opt[DTRACEOPT_USTACKFRAMES] = dtrace_ustackframes_default; + opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_default; + opt[DTRACEOPT_AGGRATE] = dtrace_aggrate_default; + opt[DTRACEOPT_SWITCHRATE] = dtrace_switchrate_default; + opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_default; + opt[DTRACEOPT_JSTACKFRAMES] = dtrace_jstackframes_default; + opt[DTRACEOPT_JSTACKSTRSIZE] = dtrace_jstackstrsize_default; + + state->dts_activity = DTRACE_ACTIVITY_INACTIVE; + + /* + * Depending on the user credentials, we set flag bits which alter probe + * visibility or the amount of destructiveness allowed. In the case of + * actual anonymous tracing, or the possession of all privileges, all of + * the normal checks are bypassed. + */ + if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) { + state->dts_cred.dcr_visible = DTRACE_CRV_ALL; + state->dts_cred.dcr_action = DTRACE_CRA_ALL; + } else { + /* + * Set up the credentials for this instantiation. We take a + * hold on the credential to prevent it from disappearing on + * us; this in turn prevents the zone_t referenced by this + * credential from disappearing. This means that we can + * examine the credential and the zone from probe context. + */ + crhold(cr); + state->dts_cred.dcr_cred = cr; + + /* + * CRA_PROC means "we have *some* privilege for dtrace" and + * unlocks the use of variables like pid, zonename, etc. + */ + if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE) || + PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) { + state->dts_cred.dcr_action |= DTRACE_CRA_PROC; + } + + /* + * dtrace_user allows use of syscall and profile providers. + * If the user also has proc_owner and/or proc_zone, we + * extend the scope to include additional visibility and + * destructive power. + */ + if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE)) { + if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE)) { + state->dts_cred.dcr_visible |= + DTRACE_CRV_ALLPROC; + + state->dts_cred.dcr_action |= + DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER; + } + + if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE)) { + state->dts_cred.dcr_visible |= + DTRACE_CRV_ALLZONE; + + state->dts_cred.dcr_action |= + DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE; + } + + /* + * If we have all privs in whatever zone this is, + * we can do destructive things to processes which + * have altered credentials. 
+ */
+#ifdef illumos
+ if (priv_isequalset(priv_getset(cr, PRIV_EFFECTIVE),
+ cr->cr_zone->zone_privset)) {
+ state->dts_cred.dcr_action |=
+ DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
+ }
+#endif
+ }
+
+ /*
+ * Holding the dtrace_kernel privilege also implies that
+ * the user has the dtrace_user privilege from a visibility
+ * perspective. But without further privileges, some
+ * destructive actions are not available.
+ */
+ if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE)) {
+ /*
+ * Make all probes in all zones visible. However,
+ * this doesn't mean that all actions become available
+ * to all zones.
+ */
+ state->dts_cred.dcr_visible |= DTRACE_CRV_KERNEL |
+ DTRACE_CRV_ALLPROC | DTRACE_CRV_ALLZONE;
+
+ state->dts_cred.dcr_action |= DTRACE_CRA_KERNEL |
+ DTRACE_CRA_PROC;
+ /*
+ * Holding proc_owner means that destructive actions
+ * for *this* zone are allowed.
+ */
+ if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
+ state->dts_cred.dcr_action |=
+ DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
+
+ /*
+ * Holding proc_zone means that destructive actions
+ * for this user/group ID in all zones are allowed.
+ */
+ if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
+ state->dts_cred.dcr_action |=
+ DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
+
+#ifdef illumos
+ /*
+ * If we have all privs in whatever zone this is,
+ * we can do destructive things to processes which
+ * have altered credentials.
+ */
+ if (priv_isequalset(priv_getset(cr, PRIV_EFFECTIVE),
+ cr->cr_zone->zone_privset)) {
+ state->dts_cred.dcr_action |=
+ DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
+ }
+#endif
+ }
+
+ /*
+ * Holding the dtrace_proc privilege gives control over fasttrap
+ * and pid providers. We need to grant wider destructive
+ * privileges in the event that the user has proc_owner and/or
+ * proc_zone.
+ */
+ if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
+ if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
+ state->dts_cred.dcr_action |=
+ DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
+
+ if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
+ state->dts_cred.dcr_action |=
+ DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
+ }
+ }
+
+ return (state);
+}
+
+static int
+dtrace_state_buffer(dtrace_state_t *state, dtrace_buffer_t *buf, int which)
+{
+ dtrace_optval_t *opt = state->dts_options, size;
+ processorid_t cpu = 0;
+ int flags = 0, rval, factor, divisor = 1;
+
+ ASSERT(MUTEX_HELD(&dtrace_lock));
+ ASSERT(MUTEX_HELD(&cpu_lock));
+ ASSERT(which < DTRACEOPT_MAX);
+ ASSERT(state->dts_activity == DTRACE_ACTIVITY_INACTIVE ||
+ (state == dtrace_anon.dta_state &&
+ state->dts_activity == DTRACE_ACTIVITY_ACTIVE));
+
+ if (opt[which] == DTRACEOPT_UNSET || opt[which] == 0)
+ return (0);
+
+ if (opt[DTRACEOPT_CPU] != DTRACEOPT_UNSET)
+ cpu = opt[DTRACEOPT_CPU];
+
+ if (which == DTRACEOPT_SPECSIZE)
+ flags |= DTRACEBUF_NOSWITCH;
+
+ if (which == DTRACEOPT_BUFSIZE) {
+ if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_RING)
+ flags |= DTRACEBUF_RING;
+
+ if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_FILL)
+ flags |= DTRACEBUF_FILL;
+
+ if (state != dtrace_anon.dta_state ||
+ state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
+ flags |= DTRACEBUF_INACTIVE;
+ }
+
+ for (size = opt[which]; size >= sizeof (uint64_t); size /= divisor) {
+ /*
+ * The size must be 8-byte aligned. If the size is not 8-byte
+ * aligned, drop it down by the difference.
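+ *
+ * Worked example (illustrative numbers): a request of 0x10007
+ * bytes is first trimmed to 0x10000; if the allocation then
+ * fails with ENOMEM and resizing is automatic, the loop retries
+ * with the size divided by the smallest power of two that is at
+ * least the allocator's reported factor (and at least 2).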
+ */
+ if (size & (sizeof (uint64_t) - 1))
+ size -= size & (sizeof (uint64_t) - 1);
+
+ if (size < state->dts_reserve) {
+ /*
+ * Buffers must always be large enough to accommodate
+ * their prereserved space. We return E2BIG instead
+ * of ENOMEM in this case to allow for user-level
+ * software to differentiate the cases.
+ */
+ return (E2BIG);
+ }
+
+ rval = dtrace_buffer_alloc(buf, size, flags, cpu, &factor);
+
+ if (rval != ENOMEM) {
+ opt[which] = size;
+ return (rval);
+ }
+
+ if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
+ return (rval);
+
+ for (divisor = 2; divisor < factor; divisor <<= 1)
+ continue;
+ }
+
+ return (ENOMEM);
+}
+
+static int
+dtrace_state_buffers(dtrace_state_t *state)
+{
+ dtrace_speculation_t *spec = state->dts_speculations;
+ int rval, i;
+
+ if ((rval = dtrace_state_buffer(state, state->dts_buffer,
+ DTRACEOPT_BUFSIZE)) != 0)
+ return (rval);
+
+ if ((rval = dtrace_state_buffer(state, state->dts_aggbuffer,
+ DTRACEOPT_AGGSIZE)) != 0)
+ return (rval);
+
+ for (i = 0; i < state->dts_nspeculations; i++) {
+ if ((rval = dtrace_state_buffer(state,
+ spec[i].dtsp_buffer, DTRACEOPT_SPECSIZE)) != 0)
+ return (rval);
+ }
+
+ return (0);
+}
+
+static void
+dtrace_state_prereserve(dtrace_state_t *state)
+{
+ dtrace_ecb_t *ecb;
+ dtrace_probe_t *probe;
+
+ state->dts_reserve = 0;
+
+ if (state->dts_options[DTRACEOPT_BUFPOLICY] != DTRACEOPT_BUFPOLICY_FILL)
+ return;
+
+ /*
+ * If our buffer policy is a "fill" buffer policy, we need to set the
+ * prereserved space to be the space required by the END probes.
+ */
+ probe = dtrace_probes[dtrace_probeid_end - 1];
+ ASSERT(probe != NULL);
+
+ for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {
+ if (ecb->dte_state != state)
+ continue;
+
+ state->dts_reserve += ecb->dte_needed + ecb->dte_alignment;
+ }
+}
+
+static int
+dtrace_state_go(dtrace_state_t *state, processorid_t *cpu)
+{
+ dtrace_optval_t *opt = state->dts_options, sz, nspec;
+ dtrace_speculation_t *spec;
+ dtrace_buffer_t *buf;
+#ifdef illumos
+ cyc_handler_t hdlr;
+ cyc_time_t when;
+#endif
+ int rval = 0, i, bufsize = NCPU * sizeof (dtrace_buffer_t);
+ dtrace_icookie_t cookie;
+
+ mutex_enter(&cpu_lock);
+ mutex_enter(&dtrace_lock);
+
+ if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
+ rval = EBUSY;
+ goto out;
+ }
+
+ /*
+ * Before we can perform any checks, we must prime all of the
+ * retained enablings that correspond to this state.
+ */
+ dtrace_enabling_prime(state);
+
+ if (state->dts_destructive && !state->dts_cred.dcr_destructive) {
+ rval = EACCES;
+ goto out;
+ }
+
+ dtrace_state_prereserve(state);
+
+ /*
+ * What we want to do now is try to allocate our speculations.
+ * We do not automatically resize the number of speculations; if
+ * this fails, we will fail the operation.
+ */ + nspec = opt[DTRACEOPT_NSPEC]; + ASSERT(nspec != DTRACEOPT_UNSET); + + if (nspec > INT_MAX) { + rval = ENOMEM; + goto out; + } + + spec = kmem_zalloc(nspec * sizeof (dtrace_speculation_t), + KM_NOSLEEP | KM_NORMALPRI); + + if (spec == NULL) { + rval = ENOMEM; + goto out; + } + + state->dts_speculations = spec; + state->dts_nspeculations = (int)nspec; + + for (i = 0; i < nspec; i++) { + if ((buf = kmem_zalloc(bufsize, + KM_NOSLEEP | KM_NORMALPRI)) == NULL) { + rval = ENOMEM; + goto err; + } + + spec[i].dtsp_buffer = buf; + } + + if (opt[DTRACEOPT_GRABANON] != DTRACEOPT_UNSET) { + if (dtrace_anon.dta_state == NULL) { + rval = ENOENT; + goto out; + } + + if (state->dts_necbs != 0) { + rval = EALREADY; + goto out; + } + + state->dts_anon = dtrace_anon_grab(); + ASSERT(state->dts_anon != NULL); + state = state->dts_anon; + + /* + * We want "grabanon" to be set in the grabbed state, so we'll + * copy that option value from the grabbing state into the + * grabbed state. + */ + state->dts_options[DTRACEOPT_GRABANON] = + opt[DTRACEOPT_GRABANON]; + + *cpu = dtrace_anon.dta_beganon; + + /* + * If the anonymous state is active (as it almost certainly + * is if the anonymous enabling ultimately matched anything), + * we don't allow any further option processing -- but we + * don't return failure. + */ + if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) + goto out; + } + + if (opt[DTRACEOPT_AGGSIZE] != DTRACEOPT_UNSET && + opt[DTRACEOPT_AGGSIZE] != 0) { + if (state->dts_aggregations == NULL) { + /* + * We're not going to create an aggregation buffer + * because we don't have any ECBs that contain + * aggregations -- set this option to 0. + */ + opt[DTRACEOPT_AGGSIZE] = 0; + } else { + /* + * If we have an aggregation buffer, we must also have + * a buffer to use as scratch. + */ + if (opt[DTRACEOPT_BUFSIZE] == DTRACEOPT_UNSET || + opt[DTRACEOPT_BUFSIZE] < state->dts_needed) { + opt[DTRACEOPT_BUFSIZE] = state->dts_needed; + } + } + } + + if (opt[DTRACEOPT_SPECSIZE] != DTRACEOPT_UNSET && + opt[DTRACEOPT_SPECSIZE] != 0) { + if (!state->dts_speculates) { + /* + * We're not going to create speculation buffers + * because we don't have any ECBs that actually + * speculate -- set the speculation size to 0. + */ + opt[DTRACEOPT_SPECSIZE] = 0; + } + } + + /* + * The bare minimum size for any buffer that we're actually going to + * do anything to is sizeof (uint64_t). + */ + sz = sizeof (uint64_t); + + if ((state->dts_needed != 0 && opt[DTRACEOPT_BUFSIZE] < sz) || + (state->dts_speculates && opt[DTRACEOPT_SPECSIZE] < sz) || + (state->dts_aggregations != NULL && opt[DTRACEOPT_AGGSIZE] < sz)) { + /* + * A buffer size has been explicitly set to 0 (or to a size + * that will be adjusted to 0) and we need the space -- we + * need to return failure. We return ENOSPC to differentiate + * it from failing to allocate a buffer due to failure to meet + * the reserve (for which we return E2BIG). 
+ */ + rval = ENOSPC; + goto out; + } + + if ((rval = dtrace_state_buffers(state)) != 0) + goto err; + + if ((sz = opt[DTRACEOPT_DYNVARSIZE]) == DTRACEOPT_UNSET) + sz = dtrace_dstate_defsize; + + do { + rval = dtrace_dstate_init(&state->dts_vstate.dtvs_dynvars, sz); + + if (rval == 0) + break; + + if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL) + goto err; + } while (sz >>= 1); + + opt[DTRACEOPT_DYNVARSIZE] = sz; + + if (rval != 0) + goto err; + + if (opt[DTRACEOPT_STATUSRATE] > dtrace_statusrate_max) + opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_max; + + if (opt[DTRACEOPT_CLEANRATE] == 0) + opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max; + + if (opt[DTRACEOPT_CLEANRATE] < dtrace_cleanrate_min) + opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_min; + + if (opt[DTRACEOPT_CLEANRATE] > dtrace_cleanrate_max) + opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max; + + state->dts_alive = state->dts_laststatus = dtrace_gethrtime(); +#ifdef illumos + hdlr.cyh_func = (cyc_func_t)dtrace_state_clean; + hdlr.cyh_arg = state; + hdlr.cyh_level = CY_LOW_LEVEL; + + when.cyt_when = 0; + when.cyt_interval = opt[DTRACEOPT_CLEANRATE]; + + state->dts_cleaner = cyclic_add(&hdlr, &when); + + hdlr.cyh_func = (cyc_func_t)dtrace_state_deadman; + hdlr.cyh_arg = state; + hdlr.cyh_level = CY_LOW_LEVEL; + + when.cyt_when = 0; + when.cyt_interval = dtrace_deadman_interval; + + state->dts_deadman = cyclic_add(&hdlr, &when); +#else + callout_reset(&state->dts_cleaner, hz * opt[DTRACEOPT_CLEANRATE] / NANOSEC, + dtrace_state_clean, state); + callout_reset(&state->dts_deadman, hz * dtrace_deadman_interval / NANOSEC, + dtrace_state_deadman, state); +#endif + + state->dts_activity = DTRACE_ACTIVITY_WARMUP; + +#ifdef illumos + if (state->dts_getf != 0 && + !(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)) { + /* + * We don't have kernel privs but we have at least one call + * to getf(); we need to bump our zone's count, and (if + * this is the first enabling to have an unprivileged call + * to getf()) we need to hook into closef(). + */ + state->dts_cred.dcr_cred->cr_zone->zone_dtrace_getf++; + + if (dtrace_getf++ == 0) { + ASSERT(dtrace_closef == NULL); + dtrace_closef = dtrace_getf_barrier; + } + } +#endif + + /* + * Now it's time to actually fire the BEGIN probe. We need to disable + * interrupts here both to record the CPU on which we fired the BEGIN + * probe (the data from this CPU will be processed first at user + * level) and to manually activate the buffer for this CPU. + */ + cookie = dtrace_interrupt_disable(); + *cpu = curcpu; + ASSERT(state->dts_buffer[*cpu].dtb_flags & DTRACEBUF_INACTIVE); + state->dts_buffer[*cpu].dtb_flags &= ~DTRACEBUF_INACTIVE; + + dtrace_probe(dtrace_probeid_begin, + (uint64_t)(uintptr_t)state, 0, 0, 0, 0); + dtrace_interrupt_enable(cookie); + /* + * We may have had an exit action from a BEGIN probe; only change our + * state to ACTIVE if we're still in WARMUP. + */ + ASSERT(state->dts_activity == DTRACE_ACTIVITY_WARMUP || + state->dts_activity == DTRACE_ACTIVITY_DRAINING); + + if (state->dts_activity == DTRACE_ACTIVITY_WARMUP) + state->dts_activity = DTRACE_ACTIVITY_ACTIVE; + +#ifdef __FreeBSD__ + /* + * We enable anonymous tracing before APs are started, so we must + * activate buffers using the current CPU. 
+ */ + if (state == dtrace_anon.dta_state) + for (int i = 0; i < NCPU; i++) + dtrace_buffer_activate_cpu(state, i); + else + dtrace_xcall(DTRACE_CPUALL, + (dtrace_xcall_t)dtrace_buffer_activate, state); +#else + /* + * Regardless of whether or not now we're in ACTIVE or DRAINING, we + * want each CPU to transition its principal buffer out of the + * INACTIVE state. Doing this assures that no CPU will suddenly begin + * processing an ECB halfway down a probe's ECB chain; all CPUs will + * atomically transition from processing none of a state's ECBs to + * processing all of them. + */ + dtrace_xcall(DTRACE_CPUALL, + (dtrace_xcall_t)dtrace_buffer_activate, state); +#endif + goto out; + +err: + dtrace_buffer_free(state->dts_buffer); + dtrace_buffer_free(state->dts_aggbuffer); + + if ((nspec = state->dts_nspeculations) == 0) { + ASSERT(state->dts_speculations == NULL); + goto out; + } + + spec = state->dts_speculations; + ASSERT(spec != NULL); + + for (i = 0; i < state->dts_nspeculations; i++) { + if ((buf = spec[i].dtsp_buffer) == NULL) + break; + + dtrace_buffer_free(buf); + kmem_free(buf, bufsize); + } + + kmem_free(spec, nspec * sizeof (dtrace_speculation_t)); + state->dts_nspeculations = 0; + state->dts_speculations = NULL; + +out: + mutex_exit(&dtrace_lock); + mutex_exit(&cpu_lock); + + return (rval); +} + +static int +dtrace_state_stop(dtrace_state_t *state, processorid_t *cpu) +{ + dtrace_icookie_t cookie; + + ASSERT(MUTEX_HELD(&dtrace_lock)); + + if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE && + state->dts_activity != DTRACE_ACTIVITY_DRAINING) + return (EINVAL); + + /* + * We'll set the activity to DTRACE_ACTIVITY_DRAINING, and issue a sync + * to be sure that every CPU has seen it. See below for the details + * on why this is done. + */ + state->dts_activity = DTRACE_ACTIVITY_DRAINING; + dtrace_sync(); + + /* + * By this point, it is impossible for any CPU to be still processing + * with DTRACE_ACTIVITY_ACTIVE. We can thus set our activity to + * DTRACE_ACTIVITY_COOLDOWN and know that we're not racing with any + * other CPU in dtrace_buffer_reserve(). This allows dtrace_probe() + * and callees to know that the activity is DTRACE_ACTIVITY_COOLDOWN + * iff we're in the END probe. + */ + state->dts_activity = DTRACE_ACTIVITY_COOLDOWN; + dtrace_sync(); + ASSERT(state->dts_activity == DTRACE_ACTIVITY_COOLDOWN); + + /* + * Finally, we can release the reserve and call the END probe. We + * disable interrupts across calling the END probe to allow us to + * return the CPU on which we actually called the END probe. This + * allows user-land to be sure that this CPU's principal buffer is + * processed last. + */ + state->dts_reserve = 0; + + cookie = dtrace_interrupt_disable(); + *cpu = curcpu; + dtrace_probe(dtrace_probeid_end, + (uint64_t)(uintptr_t)state, 0, 0, 0, 0); + dtrace_interrupt_enable(cookie); + + state->dts_activity = DTRACE_ACTIVITY_STOPPED; + dtrace_sync(); + +#ifdef illumos + if (state->dts_getf != 0 && + !(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)) { + /* + * We don't have kernel privs but we have at least one call + * to getf(); we need to lower our zone's count, and (if + * this is the last enabling to have an unprivileged call + * to getf()) we need to clear the closef() hook. 
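+		 *
+		 * The enabling path above performs the mirror-image bump; a
+		 * minimal sketch of this refcounted-hook pattern, with
+		 * hypothetical names:
+		 *
+		 *	static int hook_users;		// enablings needing the hook
+		 *	static void (*hook)(void);	// installed while in use
+		 *
+		 *	static void
+		 *	hook_acquire(void (*fn)(void))
+		 *	{
+		 *		if (hook_users++ == 0)
+		 *			hook = fn;	// first user installs it
+		 *	}
+		 *
+		 *	static void
+		 *	hook_release(void)
+		 *	{
+		 *		if (--hook_users == 0)
+		 *			hook = NULL;	// last user removes it
+		 *	}
+		 *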
+		 */
+		ASSERT(state->dts_cred.dcr_cred->cr_zone->zone_dtrace_getf > 0);
+		ASSERT(dtrace_closef == dtrace_getf_barrier);
+		ASSERT(dtrace_getf > 0);
+
+		state->dts_cred.dcr_cred->cr_zone->zone_dtrace_getf--;
+
+		if (--dtrace_getf == 0)
+			dtrace_closef = NULL;
+	}
+#endif
+
+	return (0);
+}
+
+static int
+dtrace_state_option(dtrace_state_t *state, dtrace_optid_t option,
+    dtrace_optval_t val)
+{
+	ASSERT(MUTEX_HELD(&dtrace_lock));
+
+	if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
+		return (EBUSY);
+
+	if (option >= DTRACEOPT_MAX)
+		return (EINVAL);
+
+	if (option != DTRACEOPT_CPU && val < 0)
+		return (EINVAL);
+
+	switch (option) {
+	case DTRACEOPT_DESTRUCTIVE:
+		if (dtrace_destructive_disallow)
+			return (EACCES);
+
+		state->dts_cred.dcr_destructive = 1;
+		break;
+
+	case DTRACEOPT_BUFSIZE:
+	case DTRACEOPT_DYNVARSIZE:
+	case DTRACEOPT_AGGSIZE:
+	case DTRACEOPT_SPECSIZE:
+	case DTRACEOPT_STRSIZE:
+		if (val < 0)
+			return (EINVAL);
+
+		if (val >= LONG_MAX) {
+			/*
+			 * If this is an otherwise negative value, set it to
+			 * the highest multiple of 128m less than LONG_MAX.
+			 * Technically, we're adjusting the size without
+			 * regard to the buffer resizing policy, but in fact,
+			 * this has no effect -- if we set the buffer size to
+			 * ~LONG_MAX and the buffer policy is ultimately set to
+			 * be "manual", the buffer allocation is guaranteed to
+			 * fail, if only because the allocation requires two
+			 * buffers. (We set the size to the highest
+			 * multiple of 128m because it ensures that the size
+			 * will remain a multiple of a megabyte when
+			 * repeatedly halved -- all the way down to 15m.)
+			 */
+			val = LONG_MAX - (1 << 27) + 1;
+		}
+	}
+
+	state->dts_options[option] = val;
+
+	return (0);
+}
+
+static void
+dtrace_state_destroy(dtrace_state_t *state)
+{
+	dtrace_ecb_t *ecb;
+	dtrace_vstate_t *vstate = &state->dts_vstate;
+#ifdef illumos
+	minor_t minor = getminor(state->dts_dev);
+#endif
+	int i, bufsize = NCPU * sizeof (dtrace_buffer_t);
+	dtrace_speculation_t *spec = state->dts_speculations;
+	int nspec = state->dts_nspeculations;
+	uint32_t match;
+
+	ASSERT(MUTEX_HELD(&dtrace_lock));
+	ASSERT(MUTEX_HELD(&cpu_lock));
+
+	/*
+	 * First, retract any retained enablings for this state.
+	 */
+	dtrace_enabling_retract(state);
+	ASSERT(state->dts_nretained == 0);
+
+	if (state->dts_activity == DTRACE_ACTIVITY_ACTIVE ||
+	    state->dts_activity == DTRACE_ACTIVITY_DRAINING) {
+		/*
+		 * We have managed to come into dtrace_state_destroy() on a
+		 * hot enabling -- almost certainly because of a disorderly
+		 * shutdown of a consumer. (That is, a consumer that is
+		 * exiting without having called dtrace_stop().) In this case,
+		 * we're going to set our activity to be KILLED, and then
+		 * issue a sync to be sure that everyone is out of probe
+		 * context before we start blowing away ECBs.
+		 */
+		state->dts_activity = DTRACE_ACTIVITY_KILLED;
+		dtrace_sync();
+	}
+
+	/*
+	 * Release the credential hold we took in dtrace_state_create().
+	 */
+	if (state->dts_cred.dcr_cred != NULL)
+		crfree(state->dts_cred.dcr_cred);
+
+	/*
+	 * Now we can safely disable and destroy any enabled probes. Because
+	 * any DTRACE_PRIV_KERNEL probes may actually be slowing our progress
+	 * (especially if they're all enabled), we take two passes through the
+	 * ECBs: in the first, we disable just DTRACE_PRIV_KERNEL probes, and
+	 * in the second we disable whatever is left over.
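+	 *
+	 * The shape of this two-pass teardown, reduced to a generic sketch
+	 * (hypothetical item_t and item_destroy(), for illustration only):
+	 *
+	 *	static void
+	 *	destroy_all(item_t **items, int nitems, uint32_t first)
+	 *	{
+	 *		uint32_t match;
+	 *		int i;
+	 *
+	 *		for (match = first; ; match = 0) {
+	 *			for (i = 0; i < nitems; i++) {
+	 *				if (items[i] == NULL)
+	 *					continue;	// gone already
+	 *				if (match && !(items[i]->flags & match))
+	 *					continue;	// pass two's work
+	 *				item_destroy(items[i]);
+	 *				items[i] = NULL;
+	 *			}
+	 *			if (!match)
+	 *				break;
+	 *		}
+	 *	}
+	 *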
+ */ + for (match = DTRACE_PRIV_KERNEL; ; match = 0) { + for (i = 0; i < state->dts_necbs; i++) { + if ((ecb = state->dts_ecbs[i]) == NULL) + continue; + + if (match && ecb->dte_probe != NULL) { + dtrace_probe_t *probe = ecb->dte_probe; + dtrace_provider_t *prov = probe->dtpr_provider; + + if (!(prov->dtpv_priv.dtpp_flags & match)) + continue; + } + + dtrace_ecb_disable(ecb); + dtrace_ecb_destroy(ecb); + } + + if (!match) + break; + } + + /* + * Before we free the buffers, perform one more sync to assure that + * every CPU is out of probe context. + */ + dtrace_sync(); + + dtrace_buffer_free(state->dts_buffer); + dtrace_buffer_free(state->dts_aggbuffer); + + for (i = 0; i < nspec; i++) + dtrace_buffer_free(spec[i].dtsp_buffer); + +#ifdef illumos + if (state->dts_cleaner != CYCLIC_NONE) + cyclic_remove(state->dts_cleaner); + + if (state->dts_deadman != CYCLIC_NONE) + cyclic_remove(state->dts_deadman); +#else + callout_stop(&state->dts_cleaner); + callout_drain(&state->dts_cleaner); + callout_stop(&state->dts_deadman); + callout_drain(&state->dts_deadman); +#endif + + dtrace_dstate_fini(&vstate->dtvs_dynvars); + dtrace_vstate_fini(vstate); + if (state->dts_ecbs != NULL) + kmem_free(state->dts_ecbs, state->dts_necbs * sizeof (dtrace_ecb_t *)); + + if (state->dts_aggregations != NULL) { +#ifdef DEBUG + for (i = 0; i < state->dts_naggregations; i++) + ASSERT(state->dts_aggregations[i] == NULL); +#endif + ASSERT(state->dts_naggregations > 0); + kmem_free(state->dts_aggregations, + state->dts_naggregations * sizeof (dtrace_aggregation_t *)); + } + + kmem_free(state->dts_buffer, bufsize); + kmem_free(state->dts_aggbuffer, bufsize); + + for (i = 0; i < nspec; i++) + kmem_free(spec[i].dtsp_buffer, bufsize); + + if (spec != NULL) + kmem_free(spec, nspec * sizeof (dtrace_speculation_t)); + + dtrace_format_destroy(state); + + if (state->dts_aggid_arena != NULL) { +#ifdef illumos + vmem_destroy(state->dts_aggid_arena); +#else + delete_unrhdr(state->dts_aggid_arena); +#endif + state->dts_aggid_arena = NULL; + } +#ifdef illumos + ddi_soft_state_free(dtrace_softstate, minor); + vmem_free(dtrace_minor, (void *)(uintptr_t)minor, 1); +#endif +} + +/* + * DTrace Anonymous Enabling Functions + */ +static dtrace_state_t * +dtrace_anon_grab(void) +{ + dtrace_state_t *state; + + ASSERT(MUTEX_HELD(&dtrace_lock)); + + if ((state = dtrace_anon.dta_state) == NULL) { + ASSERT(dtrace_anon.dta_enabling == NULL); + return (NULL); + } + + ASSERT(dtrace_anon.dta_enabling != NULL); + ASSERT(dtrace_retained != NULL); + + dtrace_enabling_destroy(dtrace_anon.dta_enabling); + dtrace_anon.dta_enabling = NULL; + dtrace_anon.dta_state = NULL; + + return (state); +} + +static void +dtrace_anon_property(void) +{ + int i, rv; + dtrace_state_t *state; + dof_hdr_t *dof; + char c[32]; /* enough for "dof-data-" + digits */ + + ASSERT(MUTEX_HELD(&dtrace_lock)); + ASSERT(MUTEX_HELD(&cpu_lock)); + + for (i = 0; ; i++) { + (void) snprintf(c, sizeof (c), "dof-data-%d", i); + + dtrace_err_verbose = 1; + + if ((dof = dtrace_dof_property(c)) == NULL) { + dtrace_err_verbose = 0; + break; + } + +#ifdef illumos + /* + * We want to create anonymous state, so we need to transition + * the kernel debugger to indicate that DTrace is active. If + * this fails (e.g. because the debugger has modified text in + * some way), we won't continue with the processing. 
+ */ + if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) { + cmn_err(CE_NOTE, "kernel debugger active; anonymous " + "enabling ignored."); + dtrace_dof_destroy(dof); + break; + } +#endif + + /* + * If we haven't allocated an anonymous state, we'll do so now. + */ + if ((state = dtrace_anon.dta_state) == NULL) { + state = dtrace_state_create(NULL, NULL); + dtrace_anon.dta_state = state; + + if (state == NULL) { + /* + * This basically shouldn't happen: the only + * failure mode from dtrace_state_create() is a + * failure of ddi_soft_state_zalloc() that + * itself should never happen. Still, the + * interface allows for a failure mode, and + * we want to fail as gracefully as possible: + * we'll emit an error message and cease + * processing anonymous state in this case. + */ + cmn_err(CE_WARN, "failed to create " + "anonymous state"); + dtrace_dof_destroy(dof); + break; + } + } + + rv = dtrace_dof_slurp(dof, &state->dts_vstate, CRED(), + &dtrace_anon.dta_enabling, 0, 0, B_TRUE); + + if (rv == 0) + rv = dtrace_dof_options(dof, state); + + dtrace_err_verbose = 0; + dtrace_dof_destroy(dof); + + if (rv != 0) { + /* + * This is malformed DOF; chuck any anonymous state + * that we created. + */ + ASSERT(dtrace_anon.dta_enabling == NULL); + dtrace_state_destroy(state); + dtrace_anon.dta_state = NULL; + break; + } + + ASSERT(dtrace_anon.dta_enabling != NULL); + } + + if (dtrace_anon.dta_enabling != NULL) { + int rval; + + /* + * dtrace_enabling_retain() can only fail because we are + * trying to retain more enablings than are allowed -- but + * we only have one anonymous enabling, and we are guaranteed + * to be allowed at least one retained enabling; we assert + * that dtrace_enabling_retain() returns success. + */ + rval = dtrace_enabling_retain(dtrace_anon.dta_enabling); + ASSERT(rval == 0); + + dtrace_enabling_dump(dtrace_anon.dta_enabling); + } +} + +/* + * DTrace Helper Functions + */ +static void +dtrace_helper_trace(dtrace_helper_action_t *helper, + dtrace_mstate_t *mstate, dtrace_vstate_t *vstate, int where) +{ + uint32_t size, next, nnext, i; + dtrace_helptrace_t *ent, *buffer; + uint16_t flags = cpu_core[curcpu].cpuc_dtrace_flags; + + if ((buffer = dtrace_helptrace_buffer) == NULL) + return; + + ASSERT(vstate->dtvs_nlocals <= dtrace_helptrace_nlocals); + + /* + * What would a tracing framework be without its own tracing + * framework? (Well, a hell of a lot simpler, for starters...) + */ + size = sizeof (dtrace_helptrace_t) + dtrace_helptrace_nlocals * + sizeof (uint64_t) - sizeof (uint64_t); + + /* + * Iterate until we can allocate a slot in the trace buffer. + */ + do { + next = dtrace_helptrace_next; + + if (next + size < dtrace_helptrace_bufsize) { + nnext = next + size; + } else { + nnext = size; + } + } while (dtrace_cas32(&dtrace_helptrace_next, next, nnext) != next); + + /* + * We have our slot; fill it in. + */ + if (nnext == size) { + dtrace_helptrace_wrapped++; + next = 0; + } + + ent = (dtrace_helptrace_t *)((uintptr_t)buffer + next); + ent->dtht_helper = helper; + ent->dtht_where = where; + ent->dtht_nlocals = vstate->dtvs_nlocals; + + ent->dtht_fltoffs = (mstate->dtms_present & DTRACE_MSTATE_FLTOFFS) ? 
+ mstate->dtms_fltoffs : -1; + ent->dtht_fault = DTRACE_FLAGS2FLT(flags); + ent->dtht_illval = cpu_core[curcpu].cpuc_dtrace_illval; + + for (i = 0; i < vstate->dtvs_nlocals; i++) { + dtrace_statvar_t *svar; + + if ((svar = vstate->dtvs_locals[i]) == NULL) + continue; + + ASSERT(svar->dtsv_size >= NCPU * sizeof (uint64_t)); + ent->dtht_locals[i] = + ((uint64_t *)(uintptr_t)svar->dtsv_data)[curcpu]; + } +} + +static uint64_t +dtrace_helper(int which, dtrace_mstate_t *mstate, + dtrace_state_t *state, uint64_t arg0, uint64_t arg1) +{ + uint16_t *flags = &cpu_core[curcpu].cpuc_dtrace_flags; + uint64_t sarg0 = mstate->dtms_arg[0]; + uint64_t sarg1 = mstate->dtms_arg[1]; + uint64_t rval = 0; + dtrace_helpers_t *helpers = curproc->p_dtrace_helpers; + dtrace_helper_action_t *helper; + dtrace_vstate_t *vstate; + dtrace_difo_t *pred; + int i, trace = dtrace_helptrace_buffer != NULL; + + ASSERT(which >= 0 && which < DTRACE_NHELPER_ACTIONS); + + if (helpers == NULL) + return (0); + + if ((helper = helpers->dthps_actions[which]) == NULL) + return (0); + + vstate = &helpers->dthps_vstate; + mstate->dtms_arg[0] = arg0; + mstate->dtms_arg[1] = arg1; + + /* + * Now iterate over each helper. If its predicate evaluates to 'true', + * we'll call the corresponding actions. Note that the below calls + * to dtrace_dif_emulate() may set faults in machine state. This is + * okay: our caller (the outer dtrace_dif_emulate()) will simply plow + * the stored DIF offset with its own (which is the desired behavior). + * Also, note the calls to dtrace_dif_emulate() may allocate scratch + * from machine state; this is okay, too. + */ + for (; helper != NULL; helper = helper->dtha_next) { + if ((pred = helper->dtha_predicate) != NULL) { + if (trace) + dtrace_helper_trace(helper, mstate, vstate, 0); + + if (!dtrace_dif_emulate(pred, mstate, vstate, state)) + goto next; + + if (*flags & CPU_DTRACE_FAULT) + goto err; + } + + for (i = 0; i < helper->dtha_nactions; i++) { + if (trace) + dtrace_helper_trace(helper, + mstate, vstate, i + 1); + + rval = dtrace_dif_emulate(helper->dtha_actions[i], + mstate, vstate, state); + + if (*flags & CPU_DTRACE_FAULT) + goto err; + } + +next: + if (trace) + dtrace_helper_trace(helper, mstate, vstate, + DTRACE_HELPTRACE_NEXT); + } + + if (trace) + dtrace_helper_trace(helper, mstate, vstate, + DTRACE_HELPTRACE_DONE); + + /* + * Restore the arg0 that we saved upon entry. + */ + mstate->dtms_arg[0] = sarg0; + mstate->dtms_arg[1] = sarg1; + + return (rval); + +err: + if (trace) + dtrace_helper_trace(helper, mstate, vstate, + DTRACE_HELPTRACE_ERR); + + /* + * Restore the arg0 that we saved upon entry. 
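+ *
+ * (Both saved arguments, dtms_arg[0] and dtms_arg[1], are put back
+ * here. As an aside, dtrace_helper_trace(), above, reserves its
+ * trace-buffer slots locklessly; the pattern in isolation, as a sketch
+ * with hypothetical names and FreeBSD's atomic_cmpset_32() standing in
+ * for dtrace_cas32():
+ *
+ *	static volatile uint32_t cursor;	// shared buffer offset
+ *
+ *	static uint32_t
+ *	reserve(uint32_t size, uint32_t bufsize)
+ *	{
+ *		uint32_t next, nnext;
+ *
+ *		do {
+ *			next = cursor;
+ *			// wrap to the start when the record will not fit
+ *			nnext = (next + size < bufsize) ? next + size : size;
+ *		} while (atomic_cmpset_32(&cursor, next, nnext) == 0);
+ *
+ *		return (nnext == size ? 0 : next);	// slot offset
+ *	}
+ *
+ * Losing the race simply retries with the updated cursor.)
+ *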
+	 */
+	mstate->dtms_arg[0] = sarg0;
+	mstate->dtms_arg[1] = sarg1;
+
+	return (0);
+}
+
+static void
+dtrace_helper_action_destroy(dtrace_helper_action_t *helper,
+    dtrace_vstate_t *vstate)
+{
+	int i;
+
+	if (helper->dtha_predicate != NULL)
+		dtrace_difo_release(helper->dtha_predicate, vstate);
+
+	for (i = 0; i < helper->dtha_nactions; i++) {
+		ASSERT(helper->dtha_actions[i] != NULL);
+		dtrace_difo_release(helper->dtha_actions[i], vstate);
+	}
+
+	kmem_free(helper->dtha_actions,
+	    helper->dtha_nactions * sizeof (dtrace_difo_t *));
+	kmem_free(helper, sizeof (dtrace_helper_action_t));
+}
+
+static int
+dtrace_helper_destroygen(dtrace_helpers_t *help, int gen)
+{
+	proc_t *p = curproc;
+	dtrace_vstate_t *vstate;
+	int i;
+
+	if (help == NULL)
+		help = p->p_dtrace_helpers;
+
+	ASSERT(MUTEX_HELD(&dtrace_lock));
+
+	if (help == NULL || gen > help->dthps_generation)
+		return (EINVAL);
+
+	vstate = &help->dthps_vstate;
+
+	for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
+		dtrace_helper_action_t *last = NULL, *h, *next;
+
+		for (h = help->dthps_actions[i]; h != NULL; h = next) {
+			next = h->dtha_next;
+
+			if (h->dtha_generation == gen) {
+				if (last != NULL) {
+					last->dtha_next = next;
+				} else {
+					help->dthps_actions[i] = next;
+				}
+
+				dtrace_helper_action_destroy(h, vstate);
+			} else {
+				last = h;
+			}
+		}
+	}
+
+	/*
+	 * Iterate until we've cleared out all helper providers with the
+	 * given generation number.
+	 */
+	for (;;) {
+		dtrace_helper_provider_t *prov;
+
+		/*
+		 * Look for a helper provider with the right generation. We
+		 * have to start back at the beginning of the list each time
+		 * because we drop dtrace_lock. It's unlikely that we'll make
+		 * more than two passes.
+		 */
+		for (i = 0; i < help->dthps_nprovs; i++) {
+			prov = help->dthps_provs[i];
+
+			if (prov->dthp_generation == gen)
+				break;
+		}
+
+		/*
+		 * If there were no matches, we're done.
+		 */
+		if (i == help->dthps_nprovs)
+			break;
+
+		/*
+		 * Move the last helper provider into this slot.
+		 */
+		help->dthps_nprovs--;
+		help->dthps_provs[i] = help->dthps_provs[help->dthps_nprovs];
+		help->dthps_provs[help->dthps_nprovs] = NULL;
+
+		mutex_exit(&dtrace_lock);
+
+		/*
+		 * If we have a meta provider, remove this helper provider.
+		 */
+		mutex_enter(&dtrace_meta_lock);
+		if (dtrace_meta_pid != NULL) {
+			ASSERT(dtrace_deferred_pid == NULL);
+			dtrace_helper_provider_remove(&prov->dthp_prov,
+			    p->p_pid);
+		}
+		mutex_exit(&dtrace_meta_lock);
+
+		dtrace_helper_provider_destroy(prov);
+
+		mutex_enter(&dtrace_lock);
+	}
+
+	return (0);
+}
+
+static int
+dtrace_helper_validate(dtrace_helper_action_t *helper)
+{
+	int err = 0, i;
+	dtrace_difo_t *dp;
+
+	if ((dp = helper->dtha_predicate) != NULL)
+		err += dtrace_difo_validate_helper(dp);
+
+	for (i = 0; i < helper->dtha_nactions; i++)
+		err += dtrace_difo_validate_helper(helper->dtha_actions[i]);
+
+	return (err == 0);
+}
+
+static int
+dtrace_helper_action_add(int which, dtrace_ecbdesc_t *ep,
+    dtrace_helpers_t *help)
+{
+	dtrace_helper_action_t *helper, *last;
+	dtrace_actdesc_t *act;
+	dtrace_vstate_t *vstate;
+	dtrace_predicate_t *pred;
+	int count = 0, nactions = 0, i;
+
+	if (which < 0 || which >= DTRACE_NHELPER_ACTIONS)
+		return (EINVAL);
+
+	last = help->dthps_actions[which];
+	vstate = &help->dthps_vstate;
+
+	for (count = 0; last != NULL; last = last->dtha_next) {
+		count++;
+		if (last->dtha_next == NULL)
+			break;
+	}
+
+	/*
+	 * If we already have dtrace_helper_actions_max helper actions for this
+	 * helper action type, we'll refuse to add a new one.
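+	 *
+	 * The count-then-append idiom used below, reduced to a sketch with
+	 * a hypothetical node_t (for illustration only):
+	 *
+	 *	static int
+	 *	append_capped(node_t **headp, node_t *n, int max)
+	 *	{
+	 *		node_t *last = NULL, *p;
+	 *		int count = 0;
+	 *
+	 *		for (p = *headp; p != NULL; p = p->next) {
+	 *			count++;	// walk to the tail, counting
+	 *			last = p;
+	 *		}
+	 *		if (count >= max)
+	 *			return (ENOSPC);
+	 *		if (last == NULL)
+	 *			*headp = n;	// empty list: new head
+	 *		else
+	 *			last->next = n;
+	 *		return (0);
+	 *	}
+	 *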
+	 */
+	if (count >= dtrace_helper_actions_max)
+		return (ENOSPC);
+
+	helper = kmem_zalloc(sizeof (dtrace_helper_action_t), KM_SLEEP);
+	helper->dtha_generation = help->dthps_generation;
+
+	if ((pred = ep->dted_pred.dtpdd_predicate) != NULL) {
+		ASSERT(pred->dtp_difo != NULL);
+		dtrace_difo_hold(pred->dtp_difo);
+		helper->dtha_predicate = pred->dtp_difo;
+	}
+
+	for (act = ep->dted_action; act != NULL; act = act->dtad_next) {
+		if (act->dtad_kind != DTRACEACT_DIFEXPR)
+			goto err;
+
+		if (act->dtad_difo == NULL)
+			goto err;
+
+		nactions++;
+	}
+
+	helper->dtha_actions = kmem_zalloc(sizeof (dtrace_difo_t *) *
+	    (helper->dtha_nactions = nactions), KM_SLEEP);
+
+	for (act = ep->dted_action, i = 0; act != NULL; act = act->dtad_next) {
+		dtrace_difo_hold(act->dtad_difo);
+		helper->dtha_actions[i++] = act->dtad_difo;
+	}
+
+	if (!dtrace_helper_validate(helper))
+		goto err;
+
+	if (last == NULL) {
+		help->dthps_actions[which] = helper;
+	} else {
+		last->dtha_next = helper;
+	}
+
+	if (vstate->dtvs_nlocals > dtrace_helptrace_nlocals) {
+		dtrace_helptrace_nlocals = vstate->dtvs_nlocals;
+		dtrace_helptrace_next = 0;
+	}
+
+	return (0);
+err:
+	dtrace_helper_action_destroy(helper, vstate);
+	return (EINVAL);
+}
+
+static void
+dtrace_helper_provider_register(proc_t *p, dtrace_helpers_t *help,
+    dof_helper_t *dofhp)
+{
+	ASSERT(MUTEX_NOT_HELD(&dtrace_lock));
+
+	mutex_enter(&dtrace_meta_lock);
+	mutex_enter(&dtrace_lock);
+
+	if (!dtrace_attached() || dtrace_meta_pid == NULL) {
+		/*
+		 * If the dtrace module is loaded but not attached, or if
+		 * there isn't a meta provider registered to deal with
+		 * these provider descriptions, we need to postpone creating
+		 * the actual providers until later.
+		 */
+
+		if (help->dthps_next == NULL && help->dthps_prev == NULL &&
+		    dtrace_deferred_pid != help) {
+			help->dthps_deferred = 1;
+			help->dthps_pid = p->p_pid;
+			help->dthps_next = dtrace_deferred_pid;
+			help->dthps_prev = NULL;
+			if (dtrace_deferred_pid != NULL)
+				dtrace_deferred_pid->dthps_prev = help;
+			dtrace_deferred_pid = help;
+		}
+
+		mutex_exit(&dtrace_lock);
+
+	} else if (dofhp != NULL) {
+		/*
+		 * If the dtrace module is loaded and we have a particular
+		 * helper provider description, pass that off to the
+		 * meta provider.
+		 */
+
+		mutex_exit(&dtrace_lock);
+
+		dtrace_helper_provide(dofhp, p->p_pid);
+
+	} else {
+		/*
+		 * Otherwise, just pass all the helper provider descriptions
+		 * off to the meta provider.
+		 */
+
+		int i;
+		mutex_exit(&dtrace_lock);
+
+		for (i = 0; i < help->dthps_nprovs; i++) {
+			dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,
+			    p->p_pid);
+		}
+	}
+
+	mutex_exit(&dtrace_meta_lock);
+}
+
+static int
+dtrace_helper_provider_add(dof_helper_t *dofhp, dtrace_helpers_t *help, int gen)
+{
+	dtrace_helper_provider_t *hprov, **tmp_provs;
+	uint_t tmp_maxprovs, i;
+
+	ASSERT(MUTEX_HELD(&dtrace_lock));
+	ASSERT(help != NULL);
+
+	/*
+	 * If we already have dtrace_helper_providers_max helper providers,
+	 * we'll refuse to add a new one.
+	 */
+	if (help->dthps_nprovs >= dtrace_helper_providers_max)
+		return (ENOSPC);
+
+	/*
+	 * Check to make sure this isn't a duplicate.
+	 */
+	for (i = 0; i < help->dthps_nprovs; i++) {
+		if (dofhp->dofhp_addr ==
+		    help->dthps_provs[i]->dthp_prov.dofhp_addr)
+			return (EALREADY);
+	}
+
+	hprov = kmem_zalloc(sizeof (dtrace_helper_provider_t), KM_SLEEP);
+	hprov->dthp_prov = *dofhp;
+	hprov->dthp_ref = 1;
+	hprov->dthp_generation = gen;
+
+	/*
+	 * Allocate a bigger table for helper providers if it's already full.
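+	 *
+	 * This is the usual capacity-doubling growth, which keeps the
+	 * amortized cost of an insertion constant; as a generic sketch
+	 * over a table of void * (hypothetical helper):
+	 *
+	 *	static void
+	 *	table_grow(void ***tabp, uint_t *maxp, uint_t cap)
+	 *	{
+	 *		uint_t omax = *maxp;
+	 *		uint_t nmax = (omax == 0) ? 2 : omax * 2;
+	 *		void **ntab;
+	 *
+	 *		if (nmax > cap)
+	 *			nmax = cap;	// respect the hard limit
+	 *		ntab = kmem_zalloc(nmax * sizeof (void *), KM_SLEEP);
+	 *		if (*tabp != NULL) {
+	 *			bcopy(*tabp, ntab, omax * sizeof (void *));
+	 *			kmem_free(*tabp, omax * sizeof (void *));
+	 *		}
+	 *		*tabp = ntab;
+	 *		*maxp = nmax;
+	 *	}
+	 *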
+ */ + if (help->dthps_maxprovs == help->dthps_nprovs) { + tmp_maxprovs = help->dthps_maxprovs; + tmp_provs = help->dthps_provs; + + if (help->dthps_maxprovs == 0) + help->dthps_maxprovs = 2; + else + help->dthps_maxprovs *= 2; + if (help->dthps_maxprovs > dtrace_helper_providers_max) + help->dthps_maxprovs = dtrace_helper_providers_max; + + ASSERT(tmp_maxprovs < help->dthps_maxprovs); + + help->dthps_provs = kmem_zalloc(help->dthps_maxprovs * + sizeof (dtrace_helper_provider_t *), KM_SLEEP); + + if (tmp_provs != NULL) { + bcopy(tmp_provs, help->dthps_provs, tmp_maxprovs * + sizeof (dtrace_helper_provider_t *)); + kmem_free(tmp_provs, tmp_maxprovs * + sizeof (dtrace_helper_provider_t *)); + } + } + + help->dthps_provs[help->dthps_nprovs] = hprov; + help->dthps_nprovs++; + + return (0); +} + +static void +dtrace_helper_provider_destroy(dtrace_helper_provider_t *hprov) +{ + mutex_enter(&dtrace_lock); + + if (--hprov->dthp_ref == 0) { + dof_hdr_t *dof; + mutex_exit(&dtrace_lock); + dof = (dof_hdr_t *)(uintptr_t)hprov->dthp_prov.dofhp_dof; + dtrace_dof_destroy(dof); + kmem_free(hprov, sizeof (dtrace_helper_provider_t)); + } else { + mutex_exit(&dtrace_lock); + } +} + +static int +dtrace_helper_provider_validate(dof_hdr_t *dof, dof_sec_t *sec) +{ + uintptr_t daddr = (uintptr_t)dof; + dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec; + dof_provider_t *provider; + dof_probe_t *probe; + uint8_t *arg; + char *strtab, *typestr; + dof_stridx_t typeidx; + size_t typesz; + uint_t nprobes, j, k; + + ASSERT(sec->dofs_type == DOF_SECT_PROVIDER); + + if (sec->dofs_offset & (sizeof (uint_t) - 1)) { + dtrace_dof_error(dof, "misaligned section offset"); + return (-1); + } + + /* + * The section needs to be large enough to contain the DOF provider + * structure appropriate for the given version. + */ + if (sec->dofs_size < + ((dof->dofh_ident[DOF_ID_VERSION] == DOF_VERSION_1) ? 
+ offsetof(dof_provider_t, dofpv_prenoffs) : + sizeof (dof_provider_t))) { + dtrace_dof_error(dof, "provider section too small"); + return (-1); + } + + provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset); + str_sec = dtrace_dof_sect(dof, DOF_SECT_STRTAB, provider->dofpv_strtab); + prb_sec = dtrace_dof_sect(dof, DOF_SECT_PROBES, provider->dofpv_probes); + arg_sec = dtrace_dof_sect(dof, DOF_SECT_PRARGS, provider->dofpv_prargs); + off_sec = dtrace_dof_sect(dof, DOF_SECT_PROFFS, provider->dofpv_proffs); + + if (str_sec == NULL || prb_sec == NULL || + arg_sec == NULL || off_sec == NULL) + return (-1); + + enoff_sec = NULL; + + if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 && + provider->dofpv_prenoffs != DOF_SECT_NONE && + (enoff_sec = dtrace_dof_sect(dof, DOF_SECT_PRENOFFS, + provider->dofpv_prenoffs)) == NULL) + return (-1); + + strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset); + + if (provider->dofpv_name >= str_sec->dofs_size || + strlen(strtab + provider->dofpv_name) >= DTRACE_PROVNAMELEN) { + dtrace_dof_error(dof, "invalid provider name"); + return (-1); + } + + if (prb_sec->dofs_entsize == 0 || + prb_sec->dofs_entsize > prb_sec->dofs_size) { + dtrace_dof_error(dof, "invalid entry size"); + return (-1); + } + + if (prb_sec->dofs_entsize & (sizeof (uintptr_t) - 1)) { + dtrace_dof_error(dof, "misaligned entry size"); + return (-1); + } + + if (off_sec->dofs_entsize != sizeof (uint32_t)) { + dtrace_dof_error(dof, "invalid entry size"); + return (-1); + } + + if (off_sec->dofs_offset & (sizeof (uint32_t) - 1)) { + dtrace_dof_error(dof, "misaligned section offset"); + return (-1); + } + + if (arg_sec->dofs_entsize != sizeof (uint8_t)) { + dtrace_dof_error(dof, "invalid entry size"); + return (-1); + } + + arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset); + + nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize; + + /* + * Take a pass through the probes to check for errors. + */ + for (j = 0; j < nprobes; j++) { + probe = (dof_probe_t *)(uintptr_t)(daddr + + prb_sec->dofs_offset + j * prb_sec->dofs_entsize); + + if (probe->dofpr_func >= str_sec->dofs_size) { + dtrace_dof_error(dof, "invalid function name"); + return (-1); + } + + if (strlen(strtab + probe->dofpr_func) >= DTRACE_FUNCNAMELEN) { + dtrace_dof_error(dof, "function name too long"); + /* + * Keep going if the function name is too long. + * Unlike provider and probe names, we cannot reasonably + * impose restrictions on function names, since they're + * a property of the code being instrumented. We will + * skip this probe in dtrace_helper_provide_one(). + */ + } + + if (probe->dofpr_name >= str_sec->dofs_size || + strlen(strtab + probe->dofpr_name) >= DTRACE_NAMELEN) { + dtrace_dof_error(dof, "invalid probe name"); + return (-1); + } + + /* + * The offset count must not wrap the index, and the offsets + * must also not overflow the section's data. + */ + if (probe->dofpr_offidx + probe->dofpr_noffs < + probe->dofpr_offidx || + (probe->dofpr_offidx + probe->dofpr_noffs) * + off_sec->dofs_entsize > off_sec->dofs_size) { + dtrace_dof_error(dof, "invalid probe offset"); + return (-1); + } + + if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1) { + /* + * If there's no is-enabled offset section, make sure + * there aren't any is-enabled offsets. Otherwise + * perform the same checks as for probe offsets + * (immediately above). 
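+ *
+ * (The bounds test used for both kinds of offsets first guards against
+ * index wraparound and only then scales by the entry size; in
+ * isolation, with a deliberately widened multiply as an extra
+ * assumption:
+ *
+ *	static int
+ *	range_ok(uint32_t idx, uint32_t n, uint32_t entsize, uint64_t secsz)
+ *	{
+ *		if (idx + n < idx)
+ *			return (0);	// index arithmetic wrapped
+ *		if ((uint64_t)(idx + n) * entsize > secsz)
+ *			return (0);	// extent exceeds the section
+ *		return (1);
+ *	}
+ *
+ * A wrapped index would otherwise pass a naive "end <= size" test.)
+ *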
+ */ + if (enoff_sec == NULL) { + if (probe->dofpr_enoffidx != 0 || + probe->dofpr_nenoffs != 0) { + dtrace_dof_error(dof, "is-enabled " + "offsets with null section"); + return (-1); + } + } else if (probe->dofpr_enoffidx + + probe->dofpr_nenoffs < probe->dofpr_enoffidx || + (probe->dofpr_enoffidx + probe->dofpr_nenoffs) * + enoff_sec->dofs_entsize > enoff_sec->dofs_size) { + dtrace_dof_error(dof, "invalid is-enabled " + "offset"); + return (-1); + } + + if (probe->dofpr_noffs + probe->dofpr_nenoffs == 0) { + dtrace_dof_error(dof, "zero probe and " + "is-enabled offsets"); + return (-1); + } + } else if (probe->dofpr_noffs == 0) { + dtrace_dof_error(dof, "zero probe offsets"); + return (-1); + } + + if (probe->dofpr_argidx + probe->dofpr_xargc < + probe->dofpr_argidx || + (probe->dofpr_argidx + probe->dofpr_xargc) * + arg_sec->dofs_entsize > arg_sec->dofs_size) { + dtrace_dof_error(dof, "invalid args"); + return (-1); + } + + typeidx = probe->dofpr_nargv; + typestr = strtab + probe->dofpr_nargv; + for (k = 0; k < probe->dofpr_nargc; k++) { + if (typeidx >= str_sec->dofs_size) { + dtrace_dof_error(dof, "bad " + "native argument type"); + return (-1); + } + + typesz = strlen(typestr) + 1; + if (typesz > DTRACE_ARGTYPELEN) { + dtrace_dof_error(dof, "native " + "argument type too long"); + return (-1); + } + typeidx += typesz; + typestr += typesz; + } + + typeidx = probe->dofpr_xargv; + typestr = strtab + probe->dofpr_xargv; + for (k = 0; k < probe->dofpr_xargc; k++) { + if (arg[probe->dofpr_argidx + k] > probe->dofpr_nargc) { + dtrace_dof_error(dof, "bad " + "native argument index"); + return (-1); + } + + if (typeidx >= str_sec->dofs_size) { + dtrace_dof_error(dof, "bad " + "translated argument type"); + return (-1); + } + + typesz = strlen(typestr) + 1; + if (typesz > DTRACE_ARGTYPELEN) { + dtrace_dof_error(dof, "translated argument " + "type too long"); + return (-1); + } + + typeidx += typesz; + typestr += typesz; + } + } + + return (0); +} + +static int +dtrace_helper_slurp(dof_hdr_t *dof, dof_helper_t *dhp, struct proc *p) +{ + dtrace_helpers_t *help; + dtrace_vstate_t *vstate; + dtrace_enabling_t *enab = NULL; + int i, gen, rv, nhelpers = 0, nprovs = 0, destroy = 1; + uintptr_t daddr = (uintptr_t)dof; + + ASSERT(MUTEX_HELD(&dtrace_lock)); + + if ((help = p->p_dtrace_helpers) == NULL) + help = dtrace_helpers_create(p); + + vstate = &help->dthps_vstate; + + if ((rv = dtrace_dof_slurp(dof, vstate, NULL, &enab, dhp->dofhp_addr, + dhp->dofhp_dof, B_FALSE)) != 0) { + dtrace_dof_destroy(dof); + return (rv); + } + + /* + * Look for helper providers and validate their descriptions. + */ + for (i = 0; i < dof->dofh_secnum; i++) { + dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr + + dof->dofh_secoff + i * dof->dofh_secsize); + + if (sec->dofs_type != DOF_SECT_PROVIDER) + continue; + + if (dtrace_helper_provider_validate(dof, sec) != 0) { + dtrace_enabling_destroy(enab); + dtrace_dof_destroy(dof); + return (-1); + } + + nprovs++; + } + + /* + * Now we need to walk through the ECB descriptions in the enabling. 
+	 */
+	for (i = 0; i < enab->dten_ndesc; i++) {
+		dtrace_ecbdesc_t *ep = enab->dten_desc[i];
+		dtrace_probedesc_t *desc = &ep->dted_probe;
+
+		if (strcmp(desc->dtpd_provider, "dtrace") != 0)
+			continue;
+
+		if (strcmp(desc->dtpd_mod, "helper") != 0)
+			continue;
+
+		if (strcmp(desc->dtpd_func, "ustack") != 0)
+			continue;
+
+		if ((rv = dtrace_helper_action_add(DTRACE_HELPER_ACTION_USTACK,
+		    ep, help)) != 0) {
+			/*
+			 * Adding this helper action failed -- we are now going
+			 * to rip out the entire generation and return failure.
+			 */
+			(void) dtrace_helper_destroygen(help,
+			    help->dthps_generation);
+			dtrace_enabling_destroy(enab);
+			dtrace_dof_destroy(dof);
+			return (-1);
+		}
+
+		nhelpers++;
+	}
+
+	if (nhelpers < enab->dten_ndesc)
+		dtrace_dof_error(dof, "unmatched helpers");
+
+	gen = help->dthps_generation++;
+	dtrace_enabling_destroy(enab);
+
+	if (nprovs > 0) {
+		/*
+		 * Now that this is in-kernel, we change the sense of the
+		 * members: dofhp_dof denotes the in-kernel copy of the DOF
+		 * and dofhp_addr denotes the address at user-level.
+		 */
+		dhp->dofhp_addr = dhp->dofhp_dof;
+		dhp->dofhp_dof = (uint64_t)(uintptr_t)dof;
+
+		if (dtrace_helper_provider_add(dhp, help, gen) == 0) {
+			mutex_exit(&dtrace_lock);
+			dtrace_helper_provider_register(p, help, dhp);
+			mutex_enter(&dtrace_lock);
+
+			destroy = 0;
+		}
+	}
+
+	if (destroy)
+		dtrace_dof_destroy(dof);
+
+	return (gen);
+}
+
+static dtrace_helpers_t *
+dtrace_helpers_create(proc_t *p)
+{
+	dtrace_helpers_t *help;
+
+	ASSERT(MUTEX_HELD(&dtrace_lock));
+	ASSERT(p->p_dtrace_helpers == NULL);
+
+	help = kmem_zalloc(sizeof (dtrace_helpers_t), KM_SLEEP);
+	help->dthps_actions = kmem_zalloc(sizeof (dtrace_helper_action_t *) *
+	    DTRACE_NHELPER_ACTIONS, KM_SLEEP);
+
+	p->p_dtrace_helpers = help;
+	dtrace_helpers++;
+
+	return (help);
+}
+
+#ifdef illumos
+static
+#endif
+void
+dtrace_helpers_destroy(proc_t *p)
+{
+	dtrace_helpers_t *help;
+	dtrace_vstate_t *vstate;
+#ifdef illumos
+	proc_t *p = curproc;
+#endif
+	int i;
+
+	mutex_enter(&dtrace_lock);
+
+	ASSERT(p->p_dtrace_helpers != NULL);
+	ASSERT(dtrace_helpers > 0);
+
+	help = p->p_dtrace_helpers;
+	vstate = &help->dthps_vstate;
+
+	/*
+	 * We're now going to lose the help from this process.
+	 */
+	p->p_dtrace_helpers = NULL;
+	dtrace_sync();
+
+	/*
+	 * Destroy the helper actions.
+	 */
+	for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
+		dtrace_helper_action_t *h, *next;
+
+		for (h = help->dthps_actions[i]; h != NULL; h = next) {
+			next = h->dtha_next;
+			dtrace_helper_action_destroy(h, vstate);
+		}
+	}
+
+	mutex_exit(&dtrace_lock);
+
+	/*
+	 * Destroy the helper providers.
+	 */
+	if (help->dthps_maxprovs > 0) {
+		mutex_enter(&dtrace_meta_lock);
+		if (dtrace_meta_pid != NULL) {
+			ASSERT(dtrace_deferred_pid == NULL);
+
+			for (i = 0; i < help->dthps_nprovs; i++) {
+				dtrace_helper_provider_remove(
+				    &help->dthps_provs[i]->dthp_prov, p->p_pid);
+			}
+		} else {
+			mutex_enter(&dtrace_lock);
+			ASSERT(help->dthps_deferred == 0 ||
+			    help->dthps_next != NULL ||
+			    help->dthps_prev != NULL ||
+			    help == dtrace_deferred_pid);
+
+			/*
+			 * Remove the helper from the deferred list.
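+			 *
+			 * (A sketch of the standard doubly-linked unlink,
+			 * with a hypothetical node_t:
+			 *
+			 *	if (n->next != NULL)
+			 *		n->next->prev = n->prev;
+			 *	if (n->prev != NULL)
+			 *		n->prev->next = n->next;
+			 *	if (*headp == n)
+			 *		*headp = n->next;
+			 *
+			 * The head, by construction, has no predecessor --
+			 * hence the ASSERT below.)
+			 *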
+			 */
+			if (help->dthps_next != NULL)
+				help->dthps_next->dthps_prev = help->dthps_prev;
+			if (help->dthps_prev != NULL)
+				help->dthps_prev->dthps_next = help->dthps_next;
+			if (dtrace_deferred_pid == help) {
+				dtrace_deferred_pid = help->dthps_next;
+				ASSERT(help->dthps_prev == NULL);
+			}
+
+			mutex_exit(&dtrace_lock);
+		}
+
+		mutex_exit(&dtrace_meta_lock);
+
+		for (i = 0; i < help->dthps_nprovs; i++) {
+			dtrace_helper_provider_destroy(help->dthps_provs[i]);
+		}
+
+		kmem_free(help->dthps_provs, help->dthps_maxprovs *
+		    sizeof (dtrace_helper_provider_t *));
+	}
+
+	mutex_enter(&dtrace_lock);
+
+	dtrace_vstate_fini(&help->dthps_vstate);
+	kmem_free(help->dthps_actions,
+	    sizeof (dtrace_helper_action_t *) * DTRACE_NHELPER_ACTIONS);
+	kmem_free(help, sizeof (dtrace_helpers_t));
+
+	--dtrace_helpers;
+	mutex_exit(&dtrace_lock);
+}
+
+#ifdef illumos
+static
+#endif
+void
+dtrace_helpers_duplicate(proc_t *from, proc_t *to)
+{
+	dtrace_helpers_t *help, *newhelp;
+	dtrace_helper_action_t *helper, *new, *last;
+	dtrace_difo_t *dp;
+	dtrace_vstate_t *vstate;
+	int i, j, sz, hasprovs = 0;
+
+	mutex_enter(&dtrace_lock);
+	ASSERT(from->p_dtrace_helpers != NULL);
+	ASSERT(dtrace_helpers > 0);
+
+	help = from->p_dtrace_helpers;
+	newhelp = dtrace_helpers_create(to);
+	ASSERT(to->p_dtrace_helpers != NULL);
+
+	newhelp->dthps_generation = help->dthps_generation;
+	vstate = &newhelp->dthps_vstate;
+
+	/*
+	 * Duplicate the helper actions.
+	 */
+	for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
+		if ((helper = help->dthps_actions[i]) == NULL)
+			continue;
+
+		for (last = NULL; helper != NULL; helper = helper->dtha_next) {
+			new = kmem_zalloc(sizeof (dtrace_helper_action_t),
+			    KM_SLEEP);
+			new->dtha_generation = helper->dtha_generation;
+
+			if ((dp = helper->dtha_predicate) != NULL) {
+				dp = dtrace_difo_duplicate(dp, vstate);
+				new->dtha_predicate = dp;
+			}
+
+			new->dtha_nactions = helper->dtha_nactions;
+			sz = sizeof (dtrace_difo_t *) * new->dtha_nactions;
+			new->dtha_actions = kmem_alloc(sz, KM_SLEEP);
+
+			for (j = 0; j < new->dtha_nactions; j++) {
+				dtrace_difo_t *dp = helper->dtha_actions[j];
+
+				ASSERT(dp != NULL);
+				dp = dtrace_difo_duplicate(dp, vstate);
+				new->dtha_actions[j] = dp;
+			}
+
+			if (last != NULL) {
+				last->dtha_next = new;
+			} else {
+				newhelp->dthps_actions[i] = new;
+			}
+
+			last = new;
+		}
+	}
+
+	/*
+	 * Duplicate the helper providers and register them with the
+	 * DTrace framework.
+	 */
+	if (help->dthps_nprovs > 0) {
+		newhelp->dthps_nprovs = help->dthps_nprovs;
+		newhelp->dthps_maxprovs = help->dthps_nprovs;
+		newhelp->dthps_provs = kmem_alloc(newhelp->dthps_nprovs *
+		    sizeof (dtrace_helper_provider_t *), KM_SLEEP);
+		for (i = 0; i < newhelp->dthps_nprovs; i++) {
+			newhelp->dthps_provs[i] = help->dthps_provs[i];
+			newhelp->dthps_provs[i]->dthp_ref++;
+		}
+
+		hasprovs = 1;
+	}
+
+	mutex_exit(&dtrace_lock);
+
+	if (hasprovs)
+		dtrace_helper_provider_register(to, newhelp, NULL);
+}
+
+/*
+ * DTrace Hook Functions
+ */
+static void
+dtrace_module_loaded(modctl_t *ctl)
+{
+	dtrace_provider_t *prv;
+
+	mutex_enter(&dtrace_provider_lock);
+#ifdef illumos
+	mutex_enter(&mod_lock);
+#endif
+
+#ifdef illumos
+	ASSERT(ctl->mod_busy);
+#endif
+
+	/*
+	 * We're going to call each provider's per-module provide operation
+	 * specifying only this module.
+ */ + for (prv = dtrace_provider; prv != NULL; prv = prv->dtpv_next) + prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl); + +#ifdef illumos + mutex_exit(&mod_lock); +#endif + mutex_exit(&dtrace_provider_lock); + + /* + * If we have any retained enablings, we need to match against them. + * Enabling probes requires that cpu_lock be held, and we cannot hold + * cpu_lock here -- it is legal for cpu_lock to be held when loading a + * module. (In particular, this happens when loading scheduling + * classes.) So if we have any retained enablings, we need to dispatch + * our task queue to do the match for us. + */ + mutex_enter(&dtrace_lock); + + if (dtrace_retained == NULL) { + mutex_exit(&dtrace_lock); + return; + } + + (void) taskq_dispatch(dtrace_taskq, + (task_func_t *)dtrace_enabling_matchall, NULL, TQ_SLEEP); + + mutex_exit(&dtrace_lock); + + /* + * And now, for a little heuristic sleaze: in general, we want to + * match modules as soon as they load. However, we cannot guarantee + * this, because it would lead us to the lock ordering violation + * outlined above. The common case, of course, is that cpu_lock is + * _not_ held -- so we delay here for a clock tick, hoping that that's + * long enough for the task queue to do its work. If it's not, it's + * not a serious problem -- it just means that the module that we + * just loaded may not be immediately instrumentable. + */ + delay(1); +} + +static void +#ifdef illumos +dtrace_module_unloaded(modctl_t *ctl) +#else +dtrace_module_unloaded(modctl_t *ctl, int *error) +#endif +{ + dtrace_probe_t template, *probe, *first, *next; + dtrace_provider_t *prov; +#ifndef illumos + char modname[DTRACE_MODNAMELEN]; + size_t len; +#endif + +#ifdef illumos + template.dtpr_mod = ctl->mod_modname; +#else + /* Handle the fact that ctl->filename may end in ".ko". */ + strlcpy(modname, ctl->filename, sizeof(modname)); + len = strlen(ctl->filename); + if (len > 3 && strcmp(modname + len - 3, ".ko") == 0) + modname[len - 3] = '\0'; + template.dtpr_mod = modname; +#endif + + mutex_enter(&dtrace_provider_lock); +#ifdef illumos + mutex_enter(&mod_lock); +#endif + mutex_enter(&dtrace_lock); + +#ifndef illumos + if (ctl->nenabled > 0) { + /* Don't allow unloads if a probe is enabled. */ + mutex_exit(&dtrace_provider_lock); + mutex_exit(&dtrace_lock); + *error = -1; + printf( + "kldunload: attempt to unload module that has DTrace probes enabled\n"); + return; + } +#endif + + if (dtrace_bymod == NULL) { + /* + * The DTrace module is loaded (obviously) but not attached; + * we don't have any work to do. + */ + mutex_exit(&dtrace_provider_lock); +#ifdef illumos + mutex_exit(&mod_lock); +#endif + mutex_exit(&dtrace_lock); + return; + } + + for (probe = first = dtrace_hash_lookup(dtrace_bymod, &template); + probe != NULL; probe = probe->dtpr_nextmod) { + if (probe->dtpr_ecb != NULL) { + mutex_exit(&dtrace_provider_lock); +#ifdef illumos + mutex_exit(&mod_lock); +#endif + mutex_exit(&dtrace_lock); + + /* + * This shouldn't _actually_ be possible -- we're + * unloading a module that has an enabled probe in it. + * (It's normally up to the provider to make sure that + * this can't happen.) However, because dtps_enable() + * doesn't have a failure mode, there can be an + * enable/unload race. Upshot: we don't want to + * assert, but we're not going to disable the + * probe, either. 
+ */ + if (dtrace_err_verbose) { +#ifdef illumos + cmn_err(CE_WARN, "unloaded module '%s' had " + "enabled probes", ctl->mod_modname); +#else + cmn_err(CE_WARN, "unloaded module '%s' had " + "enabled probes", modname); +#endif + } + + return; + } + } + + probe = first; + + for (first = NULL; probe != NULL; probe = next) { + ASSERT(dtrace_probes[probe->dtpr_id - 1] == probe); + + dtrace_probes[probe->dtpr_id - 1] = NULL; + + next = probe->dtpr_nextmod; + dtrace_hash_remove(dtrace_bymod, probe); + dtrace_hash_remove(dtrace_byfunc, probe); + dtrace_hash_remove(dtrace_byname, probe); + + if (first == NULL) { + first = probe; + probe->dtpr_nextmod = NULL; + } else { + probe->dtpr_nextmod = first; + first = probe; + } + } + + /* + * We've removed all of the module's probes from the hash chains and + * from the probe array. Now issue a dtrace_sync() to be sure that + * everyone has cleared out from any probe array processing. + */ + dtrace_sync(); + + for (probe = first; probe != NULL; probe = first) { + first = probe->dtpr_nextmod; + prov = probe->dtpr_provider; + prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, probe->dtpr_id, + probe->dtpr_arg); + kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1); + kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1); + kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1); +#ifdef illumos + vmem_free(dtrace_arena, (void *)(uintptr_t)probe->dtpr_id, 1); +#else + free_unr(dtrace_arena, probe->dtpr_id); +#endif + kmem_free(probe, sizeof (dtrace_probe_t)); + } + + mutex_exit(&dtrace_lock); +#ifdef illumos + mutex_exit(&mod_lock); +#endif + mutex_exit(&dtrace_provider_lock); +} + +#ifndef illumos +static void +dtrace_kld_load(void *arg __unused, linker_file_t lf) +{ + + dtrace_module_loaded(lf); +} + +static void +dtrace_kld_unload_try(void *arg __unused, linker_file_t lf, int *error) +{ + + if (*error != 0) + /* We already have an error, so don't do anything. */ + return; + dtrace_module_unloaded(lf, error); +} +#endif + +#ifdef illumos +static void +dtrace_suspend(void) +{ + dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_suspend)); +} + +static void +dtrace_resume(void) +{ + dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_resume)); +} +#endif + +static int +dtrace_cpu_setup(cpu_setup_t what, processorid_t cpu) +{ + ASSERT(MUTEX_HELD(&cpu_lock)); + mutex_enter(&dtrace_lock); + + switch (what) { + case CPU_CONFIG: { + dtrace_state_t *state; + dtrace_optval_t *opt, rs, c; + + /* + * For now, we only allocate a new buffer for anonymous state. + */ + if ((state = dtrace_anon.dta_state) == NULL) + break; + + if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE) + break; + + opt = state->dts_options; + c = opt[DTRACEOPT_CPU]; + + if (c != DTRACE_CPUALL && c != DTRACEOPT_UNSET && c != cpu) + break; + + /* + * Regardless of what the actual policy is, we're going to + * temporarily set our resize policy to be manual. We're + * also going to temporarily set our CPU option to denote + * the newly configured CPU. + */ + rs = opt[DTRACEOPT_BUFRESIZE]; + opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_MANUAL; + opt[DTRACEOPT_CPU] = (dtrace_optval_t)cpu; + + (void) dtrace_state_buffers(state); + + opt[DTRACEOPT_BUFRESIZE] = rs; + opt[DTRACEOPT_CPU] = c; + + break; + } + + case CPU_UNCONFIG: + /* + * We don't free the buffer in the CPU_UNCONFIG case. (The + * buffer will be freed when the consumer exits.) 
+ */ + break; + + default: + break; + } + + mutex_exit(&dtrace_lock); + return (0); +} + +#ifdef illumos +static void +dtrace_cpu_setup_initial(processorid_t cpu) +{ + (void) dtrace_cpu_setup(CPU_CONFIG, cpu); +} +#endif + +static void +dtrace_toxrange_add(uintptr_t base, uintptr_t limit) +{ + if (dtrace_toxranges >= dtrace_toxranges_max) { + int osize, nsize; + dtrace_toxrange_t *range; + + osize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t); + + if (osize == 0) { + ASSERT(dtrace_toxrange == NULL); + ASSERT(dtrace_toxranges_max == 0); + dtrace_toxranges_max = 1; + } else { + dtrace_toxranges_max <<= 1; + } + + nsize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t); + range = kmem_zalloc(nsize, KM_SLEEP); + + if (dtrace_toxrange != NULL) { + ASSERT(osize != 0); + bcopy(dtrace_toxrange, range, osize); + kmem_free(dtrace_toxrange, osize); + } + + dtrace_toxrange = range; + } + + ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_base == 0); + ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_limit == 0); + + dtrace_toxrange[dtrace_toxranges].dtt_base = base; + dtrace_toxrange[dtrace_toxranges].dtt_limit = limit; + dtrace_toxranges++; +} + +static void +dtrace_getf_barrier() +{ +#ifdef illumos + /* + * When we have unprivileged (that is, non-DTRACE_CRV_KERNEL) enablings + * that contain calls to getf(), this routine will be called on every + * closef() before either the underlying vnode is released or the + * file_t itself is freed. By the time we are here, it is essential + * that the file_t can no longer be accessed from a call to getf() + * in probe context -- that assures that a dtrace_sync() can be used + * to clear out any enablings referring to the old structures. + */ + if (curthread->t_procp->p_zone->zone_dtrace_getf != 0 || + kcred->cr_zone->zone_dtrace_getf != 0) + dtrace_sync(); +#endif +} + +/* + * DTrace Driver Cookbook Functions + */ +#ifdef illumos +/*ARGSUSED*/ +static int +dtrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) +{ + dtrace_provider_id_t id; + dtrace_state_t *state = NULL; + dtrace_enabling_t *enab; + + mutex_enter(&cpu_lock); + mutex_enter(&dtrace_provider_lock); + mutex_enter(&dtrace_lock); + + if (ddi_soft_state_init(&dtrace_softstate, + sizeof (dtrace_state_t), 0) != 0) { + cmn_err(CE_NOTE, "/dev/dtrace failed to initialize soft state"); + mutex_exit(&cpu_lock); + mutex_exit(&dtrace_provider_lock); + mutex_exit(&dtrace_lock); + return (DDI_FAILURE); + } + + if (ddi_create_minor_node(devi, DTRACEMNR_DTRACE, S_IFCHR, + DTRACEMNRN_DTRACE, DDI_PSEUDO, NULL) == DDI_FAILURE || + ddi_create_minor_node(devi, DTRACEMNR_HELPER, S_IFCHR, + DTRACEMNRN_HELPER, DDI_PSEUDO, NULL) == DDI_FAILURE) { + cmn_err(CE_NOTE, "/dev/dtrace couldn't create minor nodes"); + ddi_remove_minor_node(devi, NULL); + ddi_soft_state_fini(&dtrace_softstate); + mutex_exit(&cpu_lock); + mutex_exit(&dtrace_provider_lock); + mutex_exit(&dtrace_lock); + return (DDI_FAILURE); + } + + ddi_report_dev(devi); + dtrace_devi = devi; + + dtrace_modload = dtrace_module_loaded; + dtrace_modunload = dtrace_module_unloaded; + dtrace_cpu_init = dtrace_cpu_setup_initial; + dtrace_helpers_cleanup = dtrace_helpers_destroy; + dtrace_helpers_fork = dtrace_helpers_duplicate; + dtrace_cpustart_init = dtrace_suspend; + dtrace_cpustart_fini = dtrace_resume; + dtrace_debugger_init = dtrace_suspend; + dtrace_debugger_fini = dtrace_resume; + + register_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL); + + ASSERT(MUTEX_HELD(&cpu_lock)); + + dtrace_arena = vmem_create("dtrace", (void *)1, UINT32_MAX, 1, + NULL, NULL, 
NULL, 0, VM_SLEEP | VMC_IDENTIFIER); + dtrace_minor = vmem_create("dtrace_minor", (void *)DTRACEMNRN_CLONE, + UINT32_MAX - DTRACEMNRN_CLONE, 1, NULL, NULL, NULL, 0, + VM_SLEEP | VMC_IDENTIFIER); + dtrace_taskq = taskq_create("dtrace_taskq", 1, maxclsyspri, + 1, INT_MAX, 0); + + dtrace_state_cache = kmem_cache_create("dtrace_state_cache", + sizeof (dtrace_dstate_percpu_t) * NCPU, DTRACE_STATE_ALIGN, + NULL, NULL, NULL, NULL, NULL, 0); + + ASSERT(MUTEX_HELD(&cpu_lock)); + dtrace_bymod = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_mod), + offsetof(dtrace_probe_t, dtpr_nextmod), + offsetof(dtrace_probe_t, dtpr_prevmod)); + + dtrace_byfunc = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_func), + offsetof(dtrace_probe_t, dtpr_nextfunc), + offsetof(dtrace_probe_t, dtpr_prevfunc)); + + dtrace_byname = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_name), + offsetof(dtrace_probe_t, dtpr_nextname), + offsetof(dtrace_probe_t, dtpr_prevname)); + + if (dtrace_retain_max < 1) { + cmn_err(CE_WARN, "illegal value (%lu) for dtrace_retain_max; " + "setting to 1", dtrace_retain_max); + dtrace_retain_max = 1; + } + + /* + * Now discover our toxic ranges. + */ + dtrace_toxic_ranges(dtrace_toxrange_add); + + /* + * Before we register ourselves as a provider to our own framework, + * we would like to assert that dtrace_provider is NULL -- but that's + * not true if we were loaded as a dependency of a DTrace provider. + * Once we've registered, we can assert that dtrace_provider is our + * pseudo provider. + */ + (void) dtrace_register("dtrace", &dtrace_provider_attr, + DTRACE_PRIV_NONE, 0, &dtrace_provider_ops, NULL, &id); + + ASSERT(dtrace_provider != NULL); + ASSERT((dtrace_provider_id_t)dtrace_provider == id); + + dtrace_probeid_begin = dtrace_probe_create((dtrace_provider_id_t) + dtrace_provider, NULL, NULL, "BEGIN", 0, NULL); + dtrace_probeid_end = dtrace_probe_create((dtrace_provider_id_t) + dtrace_provider, NULL, NULL, "END", 0, NULL); + dtrace_probeid_error = dtrace_probe_create((dtrace_provider_id_t) + dtrace_provider, NULL, NULL, "ERROR", 1, NULL); + + dtrace_anon_property(); + mutex_exit(&cpu_lock); + + /* + * If there are already providers, we must ask them to provide their + * probes, and then match any anonymous enabling against them. Note + * that there should be no other retained enablings at this time: + * the only retained enablings at this time should be the anonymous + * enabling. + */ + if (dtrace_anon.dta_enabling != NULL) { + ASSERT(dtrace_retained == dtrace_anon.dta_enabling); + + dtrace_enabling_provide(NULL); + state = dtrace_anon.dta_state; + + /* + * We couldn't hold cpu_lock across the above call to + * dtrace_enabling_provide(), but we must hold it to actually + * enable the probes. We have to drop all of our locks, pick + * up cpu_lock, and regain our locks before matching the + * retained anonymous enabling. + */ + mutex_exit(&dtrace_lock); + mutex_exit(&dtrace_provider_lock); + + mutex_enter(&cpu_lock); + mutex_enter(&dtrace_provider_lock); + mutex_enter(&dtrace_lock); + + if ((enab = dtrace_anon.dta_enabling) != NULL) + (void) dtrace_enabling_match(enab, NULL); + + mutex_exit(&cpu_lock); + } + + mutex_exit(&dtrace_lock); + mutex_exit(&dtrace_provider_lock); + + if (state != NULL) { + /* + * If we created any anonymous state, set it going now. 
+		 */
+		(void) dtrace_state_go(state, &dtrace_anon.dta_beganon);
+	}
+
+	return (DDI_SUCCESS);
+}
+#endif /* illumos */
+
+#ifndef illumos
+static void dtrace_dtr(void *);
+#endif
+
+/*ARGSUSED*/
+static int
+#ifdef illumos
+dtrace_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
+#else
+dtrace_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
+#endif
+{
+	dtrace_state_t *state;
+	uint32_t priv;
+	uid_t uid;
+	zoneid_t zoneid;
+
+#ifdef illumos
+	if (getminor(*devp) == DTRACEMNRN_HELPER)
+		return (0);
+
+	/*
+	 * If this wasn't an open with the "helper" minor, then it must be
+	 * the "dtrace" minor.
+	 */
+	if (getminor(*devp) != DTRACEMNRN_DTRACE)
+		return (ENXIO);
+#else
+	cred_t *cred_p = NULL;
+	cred_p = dev->si_cred;
+
+	/*
+	 * If no DTRACE_PRIV_* bits are set in the credential, then the
+	 * caller lacks sufficient permission to do anything with DTrace.
+	 */
+	dtrace_cred2priv(cred_p, &priv, &uid, &zoneid);
+	if (priv == DTRACE_PRIV_NONE) {
+#endif
+
+		return (EACCES);
+	}
+
+	/*
+	 * Ask all providers to provide all their probes.
+	 */
+	mutex_enter(&dtrace_provider_lock);
+	dtrace_probe_provide(NULL, NULL);
+	mutex_exit(&dtrace_provider_lock);
+
+	mutex_enter(&cpu_lock);
+	mutex_enter(&dtrace_lock);
+	dtrace_opens++;
+	dtrace_membar_producer();
+
+#ifdef illumos
+	/*
+	 * If the kernel debugger is active (that is, if the kernel debugger
+	 * modified text in some way), we won't allow the open.
+	 */
+	if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) {
+		dtrace_opens--;
+		mutex_exit(&cpu_lock);
+		mutex_exit(&dtrace_lock);
+		return (EBUSY);
+	}
+
+	if (dtrace_helptrace_enable && dtrace_helptrace_buffer == NULL) {
+		/*
+		 * If DTrace helper tracing is enabled, we need to allocate the
+		 * trace buffer and initialize the values.
+		 */
+		dtrace_helptrace_buffer =
+		    kmem_zalloc(dtrace_helptrace_bufsize, KM_SLEEP);
+		dtrace_helptrace_next = 0;
+		dtrace_helptrace_wrapped = 0;
+		dtrace_helptrace_enable = 0;
+	}
+
+	state = dtrace_state_create(devp, cred_p);
+#else
+	state = dtrace_state_create(dev, NULL);
+	devfs_set_cdevpriv(state, dtrace_dtr);
+#endif
+
+	mutex_exit(&cpu_lock);
+
+	if (state == NULL) {
+#ifdef illumos
+		if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL)
+			(void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
+#else
+		--dtrace_opens;
+#endif
+		mutex_exit(&dtrace_lock);
+		return (EAGAIN);
+	}
+
+	mutex_exit(&dtrace_lock);
+
+	return (0);
+}
+
+/*ARGSUSED*/
+#ifdef illumos
+static int
+dtrace_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
+#else
+static void
+dtrace_dtr(void *data)
+#endif
+{
+#ifdef illumos
+	minor_t minor = getminor(dev);
+	dtrace_state_t *state;
+#endif
+	dtrace_helptrace_t *buf = NULL;
+
+#ifdef illumos
+	if (minor == DTRACEMNRN_HELPER)
+		return (0);
+
+	state = ddi_get_soft_state(dtrace_softstate, minor);
+#else
+	dtrace_state_t *state = data;
+#endif
+
+	mutex_enter(&cpu_lock);
+	mutex_enter(&dtrace_lock);
+
+#ifdef illumos
+	if (state->dts_anon)
+#else
+	if (state != NULL && state->dts_anon)
+#endif
+	{
+		/*
+		 * There is anonymous state. Destroy that first.
+		 */
+		ASSERT(dtrace_anon.dta_state == NULL);
+		dtrace_state_destroy(state->dts_anon);
+	}
+
+	if (dtrace_helptrace_disable) {
+		/*
+		 * If we have been told to disable helper tracing, set the
+		 * buffer to NULL before calling into dtrace_state_destroy();
+		 * we take advantage of its dtrace_sync() to know that no
+		 * CPU is in probe context with enabled helper tracing
+		 * after it returns.
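+		 *
+		 * The general unpublish-sync-free shape, as a sketch with a
+		 * hypothetical shared buffer and an explicit dtrace_sync():
+		 *
+		 *	buf = shared_buffer;
+		 *	shared_buffer = NULL;	// no new references
+		 *	dtrace_sync();		// drain existing references
+		 *	kmem_free(buf, bufsize);	// now safe
+		 *
+		 * Here dtrace_state_destroy()'s own dtrace_sync() plays the
+		 * middle role, so the free is deferred until it returns.
+		 *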
+ */ + buf = dtrace_helptrace_buffer; + dtrace_helptrace_buffer = NULL; + } + +#ifdef illumos + dtrace_state_destroy(state); +#else + if (state != NULL) { + dtrace_state_destroy(state); + kmem_free(state, 0); + } +#endif + ASSERT(dtrace_opens > 0); + +#ifdef illumos + /* + * Only relinquish control of the kernel debugger interface when there + * are no consumers and no anonymous enablings. + */ + if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL) + (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE); +#else + --dtrace_opens; +#endif + + if (buf != NULL) { + kmem_free(buf, dtrace_helptrace_bufsize); + dtrace_helptrace_disable = 0; + } + + mutex_exit(&dtrace_lock); + mutex_exit(&cpu_lock); + +#ifdef illumos + return (0); +#endif +} + +#ifdef illumos +/*ARGSUSED*/ +static int +dtrace_ioctl_helper(int cmd, intptr_t arg, int *rv) +{ + int rval; + dof_helper_t help, *dhp = NULL; + + switch (cmd) { + case DTRACEHIOC_ADDDOF: + if (copyin((void *)arg, &help, sizeof (help)) != 0) { + dtrace_dof_error(NULL, "failed to copyin DOF helper"); + return (EFAULT); + } + + dhp = &help; + arg = (intptr_t)help.dofhp_dof; + /*FALLTHROUGH*/ + + case DTRACEHIOC_ADD: { + dof_hdr_t *dof = dtrace_dof_copyin(arg, &rval); + + if (dof == NULL) + return (rval); + + mutex_enter(&dtrace_lock); + + /* + * dtrace_helper_slurp() takes responsibility for the dof -- + * it may free it now or it may save it and free it later. + */ + if ((rval = dtrace_helper_slurp(dof, dhp)) != -1) { + *rv = rval; + rval = 0; + } else { + rval = EINVAL; + } + + mutex_exit(&dtrace_lock); + return (rval); + } + + case DTRACEHIOC_REMOVE: { + mutex_enter(&dtrace_lock); + rval = dtrace_helper_destroygen(NULL, arg); + mutex_exit(&dtrace_lock); + + return (rval); + } + + default: + break; + } + + return (ENOTTY); +} + +/*ARGSUSED*/ +static int +dtrace_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv) +{ + minor_t minor = getminor(dev); + dtrace_state_t *state; + int rval; + + if (minor == DTRACEMNRN_HELPER) + return (dtrace_ioctl_helper(cmd, arg, rv)); + + state = ddi_get_soft_state(dtrace_softstate, minor); + + if (state->dts_anon) { + ASSERT(dtrace_anon.dta_state == NULL); + state = state->dts_anon; + } + + switch (cmd) { + case DTRACEIOC_PROVIDER: { + dtrace_providerdesc_t pvd; + dtrace_provider_t *pvp; + + if (copyin((void *)arg, &pvd, sizeof (pvd)) != 0) + return (EFAULT); + + pvd.dtvd_name[DTRACE_PROVNAMELEN - 1] = '\0'; + mutex_enter(&dtrace_provider_lock); + + for (pvp = dtrace_provider; pvp != NULL; pvp = pvp->dtpv_next) { + if (strcmp(pvp->dtpv_name, pvd.dtvd_name) == 0) + break; + } + + mutex_exit(&dtrace_provider_lock); + + if (pvp == NULL) + return (ESRCH); + + bcopy(&pvp->dtpv_priv, &pvd.dtvd_priv, sizeof (dtrace_ppriv_t)); + bcopy(&pvp->dtpv_attr, &pvd.dtvd_attr, sizeof (dtrace_pattr_t)); + + if (copyout(&pvd, (void *)arg, sizeof (pvd)) != 0) + return (EFAULT); + + return (0); + } + + case DTRACEIOC_EPROBE: { + dtrace_eprobedesc_t epdesc; + dtrace_ecb_t *ecb; + dtrace_action_t *act; + void *buf; + size_t size; + uintptr_t dest; + int nrecs; + + if (copyin((void *)arg, &epdesc, sizeof (epdesc)) != 0) + return (EFAULT); + + mutex_enter(&dtrace_lock); + + if ((ecb = dtrace_epid2ecb(state, epdesc.dtepd_epid)) == NULL) { + mutex_exit(&dtrace_lock); + return (EINVAL); + } + + if (ecb->dte_probe == NULL) { + mutex_exit(&dtrace_lock); + return (EINVAL); + } + + epdesc.dtepd_probeid = ecb->dte_probe->dtpr_id; + epdesc.dtepd_uarg = ecb->dte_uarg; + epdesc.dtepd_size = ecb->dte_size; + + nrecs = epdesc.dtepd_nrecs; + 
epdesc.dtepd_nrecs = 0; + for (act = ecb->dte_action; act != NULL; act = act->dta_next) { + if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple) + continue; + + epdesc.dtepd_nrecs++; + } + + /* + * Now that we have the size, we need to allocate a temporary + * buffer in which to store the complete description. We need + * the temporary buffer to be able to drop dtrace_lock() + * across the copyout(), below. + */ + size = sizeof (dtrace_eprobedesc_t) + + (epdesc.dtepd_nrecs * sizeof (dtrace_recdesc_t)); + + buf = kmem_alloc(size, KM_SLEEP); + dest = (uintptr_t)buf; + + bcopy(&epdesc, (void *)dest, sizeof (epdesc)); + dest += offsetof(dtrace_eprobedesc_t, dtepd_rec[0]); + + for (act = ecb->dte_action; act != NULL; act = act->dta_next) { + if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple) + continue; + + if (nrecs-- == 0) + break; + + bcopy(&act->dta_rec, (void *)dest, + sizeof (dtrace_recdesc_t)); + dest += sizeof (dtrace_recdesc_t); + } + + mutex_exit(&dtrace_lock); + + if (copyout(buf, (void *)arg, dest - (uintptr_t)buf) != 0) { + kmem_free(buf, size); + return (EFAULT); + } + + kmem_free(buf, size); + return (0); + } + + case DTRACEIOC_AGGDESC: { + dtrace_aggdesc_t aggdesc; + dtrace_action_t *act; + dtrace_aggregation_t *agg; + int nrecs; + uint32_t offs; + dtrace_recdesc_t *lrec; + void *buf; + size_t size; + uintptr_t dest; + + if (copyin((void *)arg, &aggdesc, sizeof (aggdesc)) != 0) + return (EFAULT); + + mutex_enter(&dtrace_lock); + + if ((agg = dtrace_aggid2agg(state, aggdesc.dtagd_id)) == NULL) { + mutex_exit(&dtrace_lock); + return (EINVAL); + } + + aggdesc.dtagd_epid = agg->dtag_ecb->dte_epid; + + nrecs = aggdesc.dtagd_nrecs; + aggdesc.dtagd_nrecs = 0; + + offs = agg->dtag_base; + lrec = &agg->dtag_action.dta_rec; + aggdesc.dtagd_size = lrec->dtrd_offset + lrec->dtrd_size - offs; + + for (act = agg->dtag_first; ; act = act->dta_next) { + ASSERT(act->dta_intuple || + DTRACEACT_ISAGG(act->dta_kind)); + + /* + * If this action has a record size of zero, it + * denotes an argument to the aggregating action. + * Because the presence of this record doesn't (or + * shouldn't) affect the way the data is interpreted, + * we don't copy it out to save user-level the + * confusion of dealing with a zero-length record. + */ + if (act->dta_rec.dtrd_size == 0) { + ASSERT(agg->dtag_hasarg); + continue; + } + + aggdesc.dtagd_nrecs++; + + if (act == &agg->dtag_action) + break; + } + + /* + * Now that we have the size, we need to allocate a temporary + * buffer in which to store the complete description. We need + * the temporary buffer to be able to drop dtrace_lock() + * across the copyout(), below. + */ + size = sizeof (dtrace_aggdesc_t) + + (aggdesc.dtagd_nrecs * sizeof (dtrace_recdesc_t)); + + buf = kmem_alloc(size, KM_SLEEP); + dest = (uintptr_t)buf; + + bcopy(&aggdesc, (void *)dest, sizeof (aggdesc)); + dest += offsetof(dtrace_aggdesc_t, dtagd_rec[0]); + + for (act = agg->dtag_first; ; act = act->dta_next) { + dtrace_recdesc_t rec = act->dta_rec; + + /* + * See the comment in the above loop for why we pass + * over zero-length records. 
+ */ + if (rec.dtrd_size == 0) { + ASSERT(agg->dtag_hasarg); + continue; + } + + if (nrecs-- == 0) + break; + + rec.dtrd_offset -= offs; + bcopy(&rec, (void *)dest, sizeof (rec)); + dest += sizeof (dtrace_recdesc_t); + + if (act == &agg->dtag_action) + break; + } + + mutex_exit(&dtrace_lock); + + if (copyout(buf, (void *)arg, dest - (uintptr_t)buf) != 0) { + kmem_free(buf, size); + return (EFAULT); + } + + kmem_free(buf, size); + return (0); + } + + case DTRACEIOC_ENABLE: { + dof_hdr_t *dof; + dtrace_enabling_t *enab = NULL; + dtrace_vstate_t *vstate; + int err = 0; + + *rv = 0; + + /* + * If a NULL argument has been passed, we take this as our + * cue to reevaluate our enablings. + */ + if (arg == NULL) { + dtrace_enabling_matchall(); + + return (0); + } + + if ((dof = dtrace_dof_copyin(arg, &rval)) == NULL) + return (rval); + + mutex_enter(&cpu_lock); + mutex_enter(&dtrace_lock); + vstate = &state->dts_vstate; + + if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) { + mutex_exit(&dtrace_lock); + mutex_exit(&cpu_lock); + dtrace_dof_destroy(dof); + return (EBUSY); + } + + if (dtrace_dof_slurp(dof, vstate, cr, &enab, 0, B_TRUE) != 0) { + mutex_exit(&dtrace_lock); + mutex_exit(&cpu_lock); + dtrace_dof_destroy(dof); + return (EINVAL); + } + + if ((rval = dtrace_dof_options(dof, state)) != 0) { + dtrace_enabling_destroy(enab); + mutex_exit(&dtrace_lock); + mutex_exit(&cpu_lock); + dtrace_dof_destroy(dof); + return (rval); + } + + if ((err = dtrace_enabling_match(enab, rv)) == 0) { + err = dtrace_enabling_retain(enab); + } else { + dtrace_enabling_destroy(enab); + } + + mutex_exit(&cpu_lock); + mutex_exit(&dtrace_lock); + dtrace_dof_destroy(dof); + + return (err); + } + + case DTRACEIOC_REPLICATE: { + dtrace_repldesc_t desc; + dtrace_probedesc_t *match = &desc.dtrpd_match; + dtrace_probedesc_t *create = &desc.dtrpd_create; + int err; + + if (copyin((void *)arg, &desc, sizeof (desc)) != 0) + return (EFAULT); + + match->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0'; + match->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0'; + match->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0'; + match->dtpd_name[DTRACE_NAMELEN - 1] = '\0'; + + create->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0'; + create->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0'; + create->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0'; + create->dtpd_name[DTRACE_NAMELEN - 1] = '\0'; + + mutex_enter(&dtrace_lock); + err = dtrace_enabling_replicate(state, match, create); + mutex_exit(&dtrace_lock); + + return (err); + } + + case DTRACEIOC_PROBEMATCH: + case DTRACEIOC_PROBES: { + dtrace_probe_t *probe = NULL; + dtrace_probedesc_t desc; + dtrace_probekey_t pkey; + dtrace_id_t i; + int m = 0; + uint32_t priv; + uid_t uid; + zoneid_t zoneid; + + if (copyin((void *)arg, &desc, sizeof (desc)) != 0) + return (EFAULT); + + desc.dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0'; + desc.dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0'; + desc.dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0'; + desc.dtpd_name[DTRACE_NAMELEN - 1] = '\0'; + + /* + * Before we attempt to match this probe, we want to give + * all providers the opportunity to provide it. 
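+ * Probe identifiers are 1-based (dtrace_probes[] is indexed by id - 1), + * so advancing dtpd_id past DTRACE_IDNONE below starts the search at + * the first probe.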
+ */ + if (desc.dtpd_id == DTRACE_IDNONE) { + mutex_enter(&dtrace_provider_lock); + dtrace_probe_provide(&desc, NULL); + mutex_exit(&dtrace_provider_lock); + desc.dtpd_id++; + } + + if (cmd == DTRACEIOC_PROBEMATCH) { + dtrace_probekey(&desc, &pkey); + pkey.dtpk_id = DTRACE_IDNONE; + } + + dtrace_cred2priv(cr, &priv, &uid, &zoneid); + + mutex_enter(&dtrace_lock); + + if (cmd == DTRACEIOC_PROBEMATCH) { + for (i = desc.dtpd_id; i <= dtrace_nprobes; i++) { + if ((probe = dtrace_probes[i - 1]) != NULL && + (m = dtrace_match_probe(probe, &pkey, + priv, uid, zoneid)) != 0) + break; + } + + if (m < 0) { + mutex_exit(&dtrace_lock); + return (EINVAL); + } + + } else { + for (i = desc.dtpd_id; i <= dtrace_nprobes; i++) { + if ((probe = dtrace_probes[i - 1]) != NULL && + dtrace_match_priv(probe, priv, uid, zoneid)) + break; + } + } + + if (probe == NULL) { + mutex_exit(&dtrace_lock); + return (ESRCH); + } + + dtrace_probe_description(probe, &desc); + mutex_exit(&dtrace_lock); + + if (copyout(&desc, (void *)arg, sizeof (desc)) != 0) + return (EFAULT); + + return (0); + } + + case DTRACEIOC_PROBEARG: { + dtrace_argdesc_t desc; + dtrace_probe_t *probe; + dtrace_provider_t *prov; + + if (copyin((void *)arg, &desc, sizeof (desc)) != 0) + return (EFAULT); + + if (desc.dtargd_id == DTRACE_IDNONE) + return (EINVAL); + + if (desc.dtargd_ndx == DTRACE_ARGNONE) + return (EINVAL); + + mutex_enter(&dtrace_provider_lock); + mutex_enter(&mod_lock); + mutex_enter(&dtrace_lock); + + if (desc.dtargd_id > dtrace_nprobes) { + mutex_exit(&dtrace_lock); + mutex_exit(&mod_lock); + mutex_exit(&dtrace_provider_lock); + return (EINVAL); + } + + if ((probe = dtrace_probes[desc.dtargd_id - 1]) == NULL) { + mutex_exit(&dtrace_lock); + mutex_exit(&mod_lock); + mutex_exit(&dtrace_provider_lock); + return (EINVAL); + } + + mutex_exit(&dtrace_lock); + + prov = probe->dtpr_provider; + + if (prov->dtpv_pops.dtps_getargdesc == NULL) { + /* + * There isn't any typed information for this probe. + * Set the argument number to DTRACE_ARGNONE. + */ + desc.dtargd_ndx = DTRACE_ARGNONE; + } else { + desc.dtargd_native[0] = '\0'; + desc.dtargd_xlate[0] = '\0'; + desc.dtargd_mapping = desc.dtargd_ndx; + + prov->dtpv_pops.dtps_getargdesc(prov->dtpv_arg, + probe->dtpr_id, probe->dtpr_arg, &desc); + } + + mutex_exit(&mod_lock); + mutex_exit(&dtrace_provider_lock); + + if (copyout(&desc, (void *)arg, sizeof (desc)) != 0) + return (EFAULT); + + return (0); + } + + case DTRACEIOC_GO: { + processorid_t cpuid; + rval = dtrace_state_go(state, &cpuid); + + if (rval != 0) + return (rval); + + if (copyout(&cpuid, (void *)arg, sizeof (cpuid)) != 0) + return (EFAULT); + + return (0); + } + + case DTRACEIOC_STOP: { + processorid_t cpuid; + + mutex_enter(&dtrace_lock); + rval = dtrace_state_stop(state, &cpuid); + mutex_exit(&dtrace_lock); + + if (rval != 0) + return (rval); + + if (copyout(&cpuid, (void *)arg, sizeof (cpuid)) != 0) + return (EFAULT); + + return (0); + } + + case DTRACEIOC_DOFGET: { + dof_hdr_t hdr, *dof; + uint64_t len; + + if (copyin((void *)arg, &hdr, sizeof (hdr)) != 0) + return (EFAULT); + + mutex_enter(&dtrace_lock); + dof = dtrace_dof_create(state); + mutex_exit(&dtrace_lock); + + len = MIN(hdr.dofh_loadsz, dof->dofh_loadsz); + rval = copyout(dof, (void *)arg, len); + dtrace_dof_destroy(dof); + + return (rval == 0 ? 
0 : EFAULT); + } + + case DTRACEIOC_AGGSNAP: + case DTRACEIOC_BUFSNAP: { + dtrace_bufdesc_t desc; + caddr_t cached; + dtrace_buffer_t *buf; + + if (copyin((void *)arg, &desc, sizeof (desc)) != 0) + return (EFAULT); + + if (desc.dtbd_cpu < 0 || desc.dtbd_cpu >= NCPU) + return (EINVAL); + + mutex_enter(&dtrace_lock); + + if (cmd == DTRACEIOC_BUFSNAP) { + buf = &state->dts_buffer[desc.dtbd_cpu]; + } else { + buf = &state->dts_aggbuffer[desc.dtbd_cpu]; + } + + if (buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL)) { + size_t sz = buf->dtb_offset; + + if (state->dts_activity != DTRACE_ACTIVITY_STOPPED) { + mutex_exit(&dtrace_lock); + return (EBUSY); + } + + /* + * If this buffer has already been consumed, we're + * going to indicate that there's nothing left here + * to consume. + */ + if (buf->dtb_flags & DTRACEBUF_CONSUMED) { + mutex_exit(&dtrace_lock); + + desc.dtbd_size = 0; + desc.dtbd_drops = 0; + desc.dtbd_errors = 0; + desc.dtbd_oldest = 0; + sz = sizeof (desc); + + if (copyout(&desc, (void *)arg, sz) != 0) + return (EFAULT); + + return (0); + } + + /* + * If this is a ring buffer that has wrapped, we want + * to copy the whole thing out. + */ + if (buf->dtb_flags & DTRACEBUF_WRAPPED) { + dtrace_buffer_polish(buf); + sz = buf->dtb_size; + } + + if (copyout(buf->dtb_tomax, desc.dtbd_data, sz) != 0) { + mutex_exit(&dtrace_lock); + return (EFAULT); + } + + desc.dtbd_size = sz; + desc.dtbd_drops = buf->dtb_drops; + desc.dtbd_errors = buf->dtb_errors; + desc.dtbd_oldest = buf->dtb_xamot_offset; + desc.dtbd_timestamp = dtrace_gethrtime(); + + mutex_exit(&dtrace_lock); + + if (copyout(&desc, (void *)arg, sizeof (desc)) != 0) + return (EFAULT); + + buf->dtb_flags |= DTRACEBUF_CONSUMED; + + return (0); + } + + if (buf->dtb_tomax == NULL) { + ASSERT(buf->dtb_xamot == NULL); + mutex_exit(&dtrace_lock); + return (ENOENT); + } + + cached = buf->dtb_tomax; + ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH)); + + dtrace_xcall(desc.dtbd_cpu, + (dtrace_xcall_t)dtrace_buffer_switch, buf); + + state->dts_errors += buf->dtb_xamot_errors; + + /* + * If the buffers did not actually switch, then the cross call + * did not take place -- presumably because the given CPU is + * not in the ready set. If this is the case, we'll return + * ENOENT. + */ + if (buf->dtb_tomax == cached) { + ASSERT(buf->dtb_xamot != cached); + mutex_exit(&dtrace_lock); + return (ENOENT); + } + + ASSERT(cached == buf->dtb_xamot); + + /* + * We have our snapshot; now copy it out. + */ + if (copyout(buf->dtb_xamot, desc.dtbd_data, + buf->dtb_xamot_offset) != 0) { + mutex_exit(&dtrace_lock); + return (EFAULT); + } + + desc.dtbd_size = buf->dtb_xamot_offset; + desc.dtbd_drops = buf->dtb_xamot_drops; + desc.dtbd_errors = buf->dtb_xamot_errors; + desc.dtbd_oldest = 0; + desc.dtbd_timestamp = buf->dtb_switched; + + mutex_exit(&dtrace_lock); + + /* + * Finally, copy out the buffer description. 
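+ * dtbd_size was set to dtb_xamot_offset, the amount of valid data in + * the buffer that was just switched out, so user-level knows how much + * of the snapshot to consume.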
+ */ + if (copyout(&desc, (void *)arg, sizeof (desc)) != 0) + return (EFAULT); + + return (0); + } + + case DTRACEIOC_CONF: { + dtrace_conf_t conf; + + bzero(&conf, sizeof (conf)); + conf.dtc_difversion = DIF_VERSION; + conf.dtc_difintregs = DIF_DIR_NREGS; + conf.dtc_diftupregs = DIF_DTR_NREGS; + conf.dtc_ctfmodel = CTF_MODEL_NATIVE; + + if (copyout(&conf, (void *)arg, sizeof (conf)) != 0) + return (EFAULT); + + return (0); + } + + case DTRACEIOC_STATUS: { + dtrace_status_t stat; + dtrace_dstate_t *dstate; + int i, j; + uint64_t nerrs; + + /* + * See the comment in dtrace_state_deadman() for the reason + * for setting dts_laststatus to INT64_MAX before setting + * it to the correct value. + */ + state->dts_laststatus = INT64_MAX; + dtrace_membar_producer(); + state->dts_laststatus = dtrace_gethrtime(); + + bzero(&stat, sizeof (stat)); + + mutex_enter(&dtrace_lock); + + if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE) { + mutex_exit(&dtrace_lock); + return (ENOENT); + } + + if (state->dts_activity == DTRACE_ACTIVITY_DRAINING) + stat.dtst_exiting = 1; + + nerrs = state->dts_errors; + dstate = &state->dts_vstate.dtvs_dynvars; + + for (i = 0; i < NCPU; i++) { + dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[i]; + + stat.dtst_dyndrops += dcpu->dtdsc_drops; + stat.dtst_dyndrops_dirty += dcpu->dtdsc_dirty_drops; + stat.dtst_dyndrops_rinsing += dcpu->dtdsc_rinsing_drops; + + if (state->dts_buffer[i].dtb_flags & DTRACEBUF_FULL) + stat.dtst_filled++; + + nerrs += state->dts_buffer[i].dtb_errors; + + for (j = 0; j < state->dts_nspeculations; j++) { + dtrace_speculation_t *spec; + dtrace_buffer_t *buf; + + spec = &state->dts_speculations[j]; + buf = &spec->dtsp_buffer[i]; + stat.dtst_specdrops += buf->dtb_xamot_drops; + } + } + + stat.dtst_specdrops_busy = state->dts_speculations_busy; + stat.dtst_specdrops_unavail = state->dts_speculations_unavail; + stat.dtst_stkstroverflows = state->dts_stkstroverflows; + stat.dtst_dblerrors = state->dts_dblerrors; + stat.dtst_killed = + (state->dts_activity == DTRACE_ACTIVITY_KILLED); + stat.dtst_errors = nerrs; + + mutex_exit(&dtrace_lock); + + if (copyout(&stat, (void *)arg, sizeof (stat)) != 0) + return (EFAULT); + + return (0); + } + + case DTRACEIOC_FORMAT: { + dtrace_fmtdesc_t fmt; + char *str; + int len; + + if (copyin((void *)arg, &fmt, sizeof (fmt)) != 0) + return (EFAULT); + + mutex_enter(&dtrace_lock); + + if (fmt.dtfd_format == 0 || + fmt.dtfd_format > state->dts_nformats) { + mutex_exit(&dtrace_lock); + return (EINVAL); + } + + /* + * Format strings are allocated contiguously and they are + * never freed; if a format index is less than the number + * of formats, we can assert that the format map is non-NULL + * and that the format for the specified index is non-NULL. 
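+ * If the consumer's buffer turns out to be too small, the required + * length is passed back in dtfd_length so that user-level can allocate + * a larger buffer and retry.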
+ */ + ASSERT(state->dts_formats != NULL); + str = state->dts_formats[fmt.dtfd_format - 1]; + ASSERT(str != NULL); + + len = strlen(str) + 1; + + if (len > fmt.dtfd_length) { + fmt.dtfd_length = len; + + if (copyout(&fmt, (void *)arg, sizeof (fmt)) != 0) { + mutex_exit(&dtrace_lock); + return (EINVAL); + } + } else { + if (copyout(str, fmt.dtfd_string, len) != 0) { + mutex_exit(&dtrace_lock); + return (EINVAL); + } + } + + mutex_exit(&dtrace_lock); + return (0); + } + + default: + break; + } + + return (ENOTTY); +} + +/*ARGSUSED*/ +static int +dtrace_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + dtrace_state_t *state; + + switch (cmd) { + case DDI_DETACH: + break; + + case DDI_SUSPEND: + return (DDI_SUCCESS); + + default: + return (DDI_FAILURE); + } + + mutex_enter(&cpu_lock); + mutex_enter(&dtrace_provider_lock); + mutex_enter(&dtrace_lock); + + ASSERT(dtrace_opens == 0); + + if (dtrace_helpers > 0) { + mutex_exit(&dtrace_provider_lock); + mutex_exit(&dtrace_lock); + mutex_exit(&cpu_lock); + return (DDI_FAILURE); + } + + if (dtrace_unregister((dtrace_provider_id_t)dtrace_provider) != 0) { + mutex_exit(&dtrace_provider_lock); + mutex_exit(&dtrace_lock); + mutex_exit(&cpu_lock); + return (DDI_FAILURE); + } + + dtrace_provider = NULL; + + if ((state = dtrace_anon_grab()) != NULL) { + /* + * If there were ECBs on this state, the provider should + * have not been allowed to detach; assert that there is + * none. + */ + ASSERT(state->dts_necbs == 0); + dtrace_state_destroy(state); + + /* + * If we're being detached with anonymous state, we need to + * indicate to the kernel debugger that DTrace is now inactive. + */ + (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE); + } + + bzero(&dtrace_anon, sizeof (dtrace_anon_t)); + unregister_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL); + dtrace_cpu_init = NULL; + dtrace_helpers_cleanup = NULL; + dtrace_helpers_fork = NULL; + dtrace_cpustart_init = NULL; + dtrace_cpustart_fini = NULL; + dtrace_debugger_init = NULL; + dtrace_debugger_fini = NULL; + dtrace_modload = NULL; + dtrace_modunload = NULL; + + ASSERT(dtrace_getf == 0); + ASSERT(dtrace_closef == NULL); + + mutex_exit(&cpu_lock); + + kmem_free(dtrace_probes, dtrace_nprobes * sizeof (dtrace_probe_t *)); + dtrace_probes = NULL; + dtrace_nprobes = 0; + + dtrace_hash_destroy(dtrace_bymod); + dtrace_hash_destroy(dtrace_byfunc); + dtrace_hash_destroy(dtrace_byname); + dtrace_bymod = NULL; + dtrace_byfunc = NULL; + dtrace_byname = NULL; + + kmem_cache_destroy(dtrace_state_cache); + vmem_destroy(dtrace_minor); + vmem_destroy(dtrace_arena); + + if (dtrace_toxrange != NULL) { + kmem_free(dtrace_toxrange, + dtrace_toxranges_max * sizeof (dtrace_toxrange_t)); + dtrace_toxrange = NULL; + dtrace_toxranges = 0; + dtrace_toxranges_max = 0; + } + + ddi_remove_minor_node(dtrace_devi, NULL); + dtrace_devi = NULL; + + ddi_soft_state_fini(&dtrace_softstate); + + ASSERT(dtrace_vtime_references == 0); + ASSERT(dtrace_opens == 0); + ASSERT(dtrace_retained == NULL); + + mutex_exit(&dtrace_lock); + mutex_exit(&dtrace_provider_lock); + + /* + * We don't destroy the task queue until after we have dropped our + * locks (taskq_destroy() may block on running tasks). To prevent + * attempting to do work after we have effectively detached but before + * the task queue has been destroyed, all tasks dispatched via the + * task queue must check that DTrace is still attached before + * performing any operation. 
+ */ + taskq_destroy(dtrace_taskq); + dtrace_taskq = NULL; + + return (DDI_SUCCESS); +} +#endif + +#ifdef illumos +/*ARGSUSED*/ +static int +dtrace_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) +{ + int error; + + switch (infocmd) { + case DDI_INFO_DEVT2DEVINFO: + *result = (void *)dtrace_devi; + error = DDI_SUCCESS; + break; + case DDI_INFO_DEVT2INSTANCE: + *result = (void *)0; + error = DDI_SUCCESS; + break; + default: + error = DDI_FAILURE; + } + return (error); +} +#endif + +#ifdef illumos +static struct cb_ops dtrace_cb_ops = { + dtrace_open, /* open */ + dtrace_close, /* close */ + nulldev, /* strategy */ + nulldev, /* print */ + nodev, /* dump */ + nodev, /* read */ + nodev, /* write */ + dtrace_ioctl, /* ioctl */ + nodev, /* devmap */ + nodev, /* mmap */ + nodev, /* segmap */ + nochpoll, /* poll */ + ddi_prop_op, /* cb_prop_op */ + 0, /* streamtab */ + D_NEW | D_MP /* Driver compatibility flag */ +}; + +static struct dev_ops dtrace_ops = { + DEVO_REV, /* devo_rev */ + 0, /* refcnt */ + dtrace_info, /* get_dev_info */ + nulldev, /* identify */ + nulldev, /* probe */ + dtrace_attach, /* attach */ + dtrace_detach, /* detach */ + nodev, /* reset */ + &dtrace_cb_ops, /* driver operations */ + NULL, /* bus operations */ + nodev /* dev power */ +}; + +static struct modldrv modldrv = { + &mod_driverops, /* module type (this is a pseudo driver) */ + "Dynamic Tracing", /* name of module */ + &dtrace_ops, /* driver ops */ +}; + +static struct modlinkage modlinkage = { + MODREV_1, + (void *)&modldrv, + NULL +}; + +int +_init(void) +{ + return (mod_install(&modlinkage)); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +int +_fini(void) +{ + return (mod_remove(&modlinkage)); +} +#else + +static d_ioctl_t dtrace_ioctl; +static d_ioctl_t dtrace_ioctl_helper; +static void dtrace_load(void *); +static int dtrace_unload(void); +static struct cdev *dtrace_dev; +static struct cdev *helper_dev; + +void dtrace_invop_init(void); +void dtrace_invop_uninit(void); + +static struct cdevsw dtrace_cdevsw = { + .d_version = D_VERSION, + .d_ioctl = dtrace_ioctl, + .d_open = dtrace_open, + .d_name = "dtrace", +}; + +static struct cdevsw helper_cdevsw = { + .d_version = D_VERSION, + .d_ioctl = dtrace_ioctl_helper, + .d_name = "helper", +}; + +#include <dtrace_anon.c> +#include <dtrace_ioctl.c> +#include <dtrace_load.c> +#include <dtrace_modevent.c> +#include <dtrace_sysctl.c> +#include <dtrace_unload.c> +#include <dtrace_vtime.c> +#include <dtrace_hacks.c> +#include <dtrace_isa.c> + +SYSINIT(dtrace_load, SI_SUB_DTRACE, SI_ORDER_FIRST, dtrace_load, NULL); +SYSUNINIT(dtrace_unload, SI_SUB_DTRACE, SI_ORDER_FIRST, dtrace_unload, NULL); +SYSINIT(dtrace_anon_init, SI_SUB_DTRACE_ANON, SI_ORDER_FIRST, dtrace_anon_init, NULL); + +DEV_MODULE(dtrace, dtrace_modevent, NULL); +MODULE_VERSION(dtrace, 1); +MODULE_DEPEND(dtrace, opensolaris, 1, 1, 1); +#endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace_xoroshiro128_plus.c b/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace_xoroshiro128_plus.c new file mode 100644 index 000000000000..fbc656a90e5c --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace_xoroshiro128_plus.c @@ -0,0 +1,89 @@ +/*- + * Copyright (c) 2016 (Graeme Jenkinson) + * All rights reserved. 
+ * + * This software was developed by BAE Systems, the University of Cambridge + * Computer Laboratory, and Memorial University under DARPA/AFRL contract + * FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent Computing + * (TC) research program. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ + +#include <sys/types.h> + +#include "dtrace_xoroshiro128_plus.h" + +static __inline uint64_t +rotl(const uint64_t x, int k) +{ + return (x << k) | (x >> (64 - k)); +} + +/* + * This is the jump function for the generator. It is equivalent to 2^64 calls + * to next(); it can be used to generate 2^64 non-overlapping subsequences for + * parallel computations. + */ +void +dtrace_xoroshiro128_plus_jump(uint64_t * const state, + uint64_t * const jump_state) +{ + static const uint64_t JUMP[] = { 0xbeac0467eba5facb, + 0xd86b048b86aa9922 }; + + uint64_t s0 = 0; + uint64_t s1 = 0; + int i = 0; + int b = 0; + for (i = 0; i < sizeof JUMP / sizeof *JUMP; i++) { + for (b = 0; b < 64; b++) { + if (JUMP[i] & 1ULL << b) { + s0 ^= state[0]; + s1 ^= state[1]; + } + dtrace_xoroshiro128_plus_next(state); + } + } + jump_state[0] = s0; + jump_state[1] = s1; +} + +/* + * xoroshiro128+ - XOR/rotate/shift/rotate + * xorshift.di.unimi.it + */ +uint64_t +dtrace_xoroshiro128_plus_next(uint64_t * const state) +{ + const uint64_t s0 = state[0]; + uint64_t s1 = state[1]; + uint64_t result; + result = s0 + s1; + + s1 ^= s0; + state[0] = rotl(s0, 55) ^ s1 ^ (s1 << 14); + state[1] = rotl(s1, 36); + + return result; +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace_xoroshiro128_plus.h b/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace_xoroshiro128_plus.h new file mode 100644 index 000000000000..73b17226428f --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace_xoroshiro128_plus.h @@ -0,0 +1,41 @@ +/*- + * Copyright (c) 2016 (Graeme Jenkinson) + * All rights reserved. + * + * This software was developed by BAE Systems, the University of Cambridge + * Computer Laboratory, and Memorial University under DARPA/AFRL contract + * FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent Computing + * (TC) research program. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ + +#ifndef _DTRACE_XOROSHIRO128_PLUS_H +#define _DTRACE_XOROSHIRO128_PLUS_H + +#include <sys/types.h> + +void dtrace_xoroshiro128_plus_jump(uint64_t * const, uint64_t * const); +uint64_t dtrace_xoroshiro128_plus_next(uint64_t * const); + +#endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/dtrace/fasttrap.c b/sys/cddl/contrib/opensolaris/uts/common/dtrace/fasttrap.c new file mode 100644 index 000000000000..3ac3becc28ba --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/dtrace/fasttrap.c @@ -0,0 +1,2678 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + * + * Portions Copyright 2010 The FreeBSD Foundation + * + * $FreeBSD$ + */ + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 2015, Joyent, Inc. All rights reserved. 
+ */ + +#include <sys/atomic.h> +#include <sys/errno.h> +#include <sys/stat.h> +#include <sys/modctl.h> +#include <sys/conf.h> +#include <sys/systm.h> +#ifdef illumos +#include <sys/ddi.h> +#endif +#include <sys/sunddi.h> +#include <sys/cpuvar.h> +#include <sys/kmem.h> +#ifdef illumos +#include <sys/strsubr.h> +#endif +#include <sys/fasttrap.h> +#include <sys/fasttrap_impl.h> +#include <sys/fasttrap_isa.h> +#include <sys/dtrace.h> +#include <sys/dtrace_impl.h> +#include <sys/sysmacros.h> +#include <sys/proc.h> +#include <sys/policy.h> +#ifdef illumos +#include <util/qsort.h> +#endif +#include <sys/mutex.h> +#include <sys/kernel.h> +#ifndef illumos +#include <sys/dtrace_bsd.h> +#include <sys/eventhandler.h> +#include <sys/rmlock.h> +#include <sys/sysent.h> +#include <sys/sysctl.h> +#include <sys/u8_textprep.h> +#include <sys/user.h> + +#include <vm/vm.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <vm/vm_param.h> + +#include <cddl/dev/dtrace/dtrace_cddl.h> +#endif + +/* + * User-Land Trap-Based Tracing + * ---------------------------- + * + * The fasttrap provider allows DTrace consumers to instrument any user-level + * instruction to gather data; this includes probes with semantic + * significance like entry and return as well as simple offsets into the + * function. While the specific techniques used are very ISA specific, the + * methodology is generalizable to any architecture. + * + * + * The General Methodology + * ----------------------- + * + * With the primary goal of tracing every user-land instruction and the + * limitation that we can't trust user space so don't want to rely on much + * information there, we begin by replacing the instructions we want to trace + * with trap instructions. Each instruction we overwrite is saved into a hash + * table keyed by process ID and pc address. When we enter the kernel due to + * this trap instruction, we need the effects of the replaced instruction to + * appear to have occurred before we proceed with the user thread's + * execution. + * + * Each user level thread is represented by a ulwp_t structure which is + * always easily accessible through a register. The most basic way to produce + * the effects of the instruction we replaced is to copy that instruction out + * to a bit of scratch space reserved in the user thread's ulwp_t structure + * (a sort of kernel-private thread local storage), set the PC to that + * scratch space and single step. When we reenter the kernel after single + * stepping the instruction we must then adjust the PC to point to what would + * normally be the next instruction. Of course, special care must be taken + * for branches and jumps, but these represent such a small fraction of any + * instruction set that writing the code to emulate these in the kernel is + * not too difficult. + * + * Return probes may require several tracepoints to trace every return site, + * and, conversely, each tracepoint may activate several probes (the entry + * and offset 0 probes, for example). To solve this multiplexing problem, + * tracepoints contain lists of probes to activate and probes contain lists + * of tracepoints to enable. If a probe is activated, it adds its ID to + * existing tracepoints or creates new ones as necessary. + * + * Most probes are activated _before_ the instruction is executed, but return + * probes are activated _after_ the effects of the last instruction of the + * function are visible.
Return probes must be fired _after_ we have + * single-stepped the instruction whereas all other probes are fired + * beforehand. + * + * + * Lock Ordering + * ------------- + * + * The lock ordering below -- both internally and with respect to the DTrace + * framework -- is a little tricky and bears some explanation. Each provider + * has a lock (ftp_mtx) that protects its members including reference counts + * for enabled probes (ftp_rcount), consumers actively creating probes + * (ftp_ccount) and USDT consumers (ftp_mcount); all three prevent a provider + * from being freed. A provider is looked up by taking the bucket lock for the + * provider hash table, and is returned with its lock held. The provider lock + * may be taken in functions invoked by the DTrace framework, but may not be + * held while calling functions in the DTrace framework. + * + * To ensure consistency over multiple calls to the DTrace framework, the + * creation lock (ftp_cmtx) should be held. Naturally, the creation lock may + * not be taken when holding the provider lock as that would create a cyclic + * lock ordering. In situations where one would naturally take the provider + * lock and then the creation lock, we instead up a reference count to prevent + * the provider from disappearing, drop the provider lock, and acquire the + * creation lock. + * + * Briefly: + * bucket lock before provider lock + * DTrace before provider lock + * creation lock before DTrace + * never hold the provider lock and creation lock simultaneously + */ + +static d_open_t fasttrap_open; +static d_ioctl_t fasttrap_ioctl; + +static struct cdevsw fasttrap_cdevsw = { + .d_version = D_VERSION, + .d_open = fasttrap_open, + .d_ioctl = fasttrap_ioctl, + .d_name = "fasttrap", +}; +static struct cdev *fasttrap_cdev; +static dtrace_meta_provider_id_t fasttrap_meta_id; + +static struct proc *fasttrap_cleanup_proc; +static struct mtx fasttrap_cleanup_mtx; +static uint_t fasttrap_cleanup_work, fasttrap_cleanup_drain, fasttrap_cleanup_cv; + +/* + * Generation count on modifications to the global tracepoint lookup table. + */ +static volatile uint64_t fasttrap_mod_gen; + +/* + * When the fasttrap provider is loaded, fasttrap_max is set to either + * FASTTRAP_MAX_DEFAULT, or the value for fasttrap-max-probes in the + * fasttrap.conf file (Illumos), or the value provided in the loader.conf (FreeBSD). + * Each time a probe is created, fasttrap_total is incremented by the number + * of tracepoints that may be associated with that probe; fasttrap_total is capped + * at fasttrap_max. + */ +#define FASTTRAP_MAX_DEFAULT 250000 +static uint32_t fasttrap_max = FASTTRAP_MAX_DEFAULT; +static uint32_t fasttrap_total; + +/* + * Copyright (c) 2011, Joyent, Inc. All rights reserved.
+ */ + +#define FASTTRAP_TPOINTS_DEFAULT_SIZE 0x4000 +#define FASTTRAP_PROVIDERS_DEFAULT_SIZE 0x100 +#define FASTTRAP_PROCS_DEFAULT_SIZE 0x100 + +#define FASTTRAP_PID_NAME "pid" + +fasttrap_hash_t fasttrap_tpoints; +static fasttrap_hash_t fasttrap_provs; +static fasttrap_hash_t fasttrap_procs; + +static uint64_t fasttrap_pid_count; /* pid ref count */ +static kmutex_t fasttrap_count_mtx; /* lock on ref count */ + +#define FASTTRAP_ENABLE_FAIL 1 +#define FASTTRAP_ENABLE_PARTIAL 2 + +static int fasttrap_tracepoint_enable(proc_t *, fasttrap_probe_t *, uint_t); +static void fasttrap_tracepoint_disable(proc_t *, fasttrap_probe_t *, uint_t); + +static fasttrap_provider_t *fasttrap_provider_lookup(pid_t, const char *, + const dtrace_pattr_t *); +static void fasttrap_provider_retire(pid_t, const char *, int); +static void fasttrap_provider_free(fasttrap_provider_t *); + +static fasttrap_proc_t *fasttrap_proc_lookup(pid_t); +static void fasttrap_proc_release(fasttrap_proc_t *); + +#ifndef illumos +static void fasttrap_thread_dtor(void *, struct thread *); +#endif + +#define FASTTRAP_PROVS_INDEX(pid, name) \ + ((fasttrap_hash_str(name) + (pid)) & fasttrap_provs.fth_mask) + +#define FASTTRAP_PROCS_INDEX(pid) ((pid) & fasttrap_procs.fth_mask) + +#ifndef illumos +struct rmlock fasttrap_tp_lock; +static eventhandler_tag fasttrap_thread_dtor_tag; +#endif + +static unsigned long tpoints_hash_size = FASTTRAP_TPOINTS_DEFAULT_SIZE; + +#ifdef __FreeBSD__ +SYSCTL_DECL(_kern_dtrace); +SYSCTL_NODE(_kern_dtrace, OID_AUTO, fasttrap, CTLFLAG_RD, 0, "DTrace fasttrap parameters"); +SYSCTL_UINT(_kern_dtrace_fasttrap, OID_AUTO, max_probes, CTLFLAG_RWTUN, &fasttrap_max, + FASTTRAP_MAX_DEFAULT, "Maximum number of fasttrap probes"); +SYSCTL_ULONG(_kern_dtrace_fasttrap, OID_AUTO, tpoints_hash_size, CTLFLAG_RDTUN, &tpoints_hash_size, + FASTTRAP_TPOINTS_DEFAULT_SIZE, "Size of the tracepoint hash table"); +#endif + +static int +fasttrap_highbit(ulong_t i) +{ + int h = 1; + + if (i == 0) + return (0); +#ifdef _LP64 + if (i & 0xffffffff00000000ul) { + h += 32; i >>= 32; + } +#endif + if (i & 0xffff0000) { + h += 16; i >>= 16; + } + if (i & 0xff00) { + h += 8; i >>= 8; + } + if (i & 0xf0) { + h += 4; i >>= 4; + } + if (i & 0xc) { + h += 2; i >>= 2; + } + if (i & 0x2) { + h += 1; + } + return (h); +} + +static uint_t +fasttrap_hash_str(const char *p) +{ + unsigned int g; + uint_t hval = 0; + + while (*p) { + hval = (hval << 4) + *p++; + if ((g = (hval & 0xf0000000)) != 0) + hval ^= g >> 24; + hval &= ~g; + } + return (hval); +} + +void +fasttrap_sigtrap(proc_t *p, kthread_t *t, uintptr_t pc) +{ + ksiginfo_t ksi; + + ksiginfo_init(&ksi); + ksi.ksi_signo = SIGTRAP; + ksi.ksi_code = TRAP_DTRACE; + ksi.ksi_addr = (caddr_t)pc; + PROC_LOCK(p); + (void)tdsendsignal(p, t, SIGTRAP, &ksi); + PROC_UNLOCK(p); +} + +#ifndef illumos +/* + * Obtain a chunk of scratch space in the address space of the target process. + */ +fasttrap_scrspace_t * +fasttrap_scraddr(struct thread *td, fasttrap_proc_t *fprc) +{ + fasttrap_scrblock_t *scrblk; + fasttrap_scrspace_t *scrspc; + struct proc *p; + vm_offset_t addr; + int error, i; + + scrspc = NULL; + if (td->t_dtrace_sscr != NULL) { + /* If the thread already has scratch space, we're done. */ + scrspc = (fasttrap_scrspace_t *)td->t_dtrace_sscr; + return (scrspc); + } + + p = td->td_proc; + + mutex_enter(&fprc->ftpc_mtx); + if (LIST_EMPTY(&fprc->ftpc_fscr)) { + /* + * No scratch space is available, so we'll map a new scratch + * space block into the traced process' address space. 
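+ * The block is carved into FASTTRAP_SCRBLOCK_SIZE / FASTTRAP_SCRSPACE_SIZE + * fixed-size chunks below, each of which can be handed out to a traced + * thread.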
+ */ + addr = 0; + error = vm_map_find(&p->p_vmspace->vm_map, NULL, 0, &addr, + FASTTRAP_SCRBLOCK_SIZE, 0, VMFS_ANY_SPACE, VM_PROT_ALL, + VM_PROT_ALL, 0); + if (error != KERN_SUCCESS) + goto done; + + scrblk = malloc(sizeof(*scrblk), M_SOLARIS, M_WAITOK); + scrblk->ftsb_addr = addr; + LIST_INSERT_HEAD(&fprc->ftpc_scrblks, scrblk, ftsb_next); + + /* + * Carve the block up into chunks and put them on the free list. + */ + for (i = 0; + i < FASTTRAP_SCRBLOCK_SIZE / FASTTRAP_SCRSPACE_SIZE; i++) { + scrspc = malloc(sizeof(*scrspc), M_SOLARIS, M_WAITOK); + scrspc->ftss_addr = addr + + i * FASTTRAP_SCRSPACE_SIZE; + LIST_INSERT_HEAD(&fprc->ftpc_fscr, scrspc, + ftss_next); + } + } + + /* + * Take the first scratch chunk off the free list, put it on the + * allocated list, and return its address. + */ + scrspc = LIST_FIRST(&fprc->ftpc_fscr); + LIST_REMOVE(scrspc, ftss_next); + LIST_INSERT_HEAD(&fprc->ftpc_ascr, scrspc, ftss_next); + + /* + * This scratch space is reserved for use by td until the thread exits. + */ + td->t_dtrace_sscr = scrspc; + +done: + mutex_exit(&fprc->ftpc_mtx); + + return (scrspc); +} + +/* + * Return any allocated per-thread scratch space chunks back to the process' + * free list. + */ +static void +fasttrap_thread_dtor(void *arg __unused, struct thread *td) +{ + fasttrap_bucket_t *bucket; + fasttrap_proc_t *fprc; + fasttrap_scrspace_t *scrspc; + pid_t pid; + + if (td->t_dtrace_sscr == NULL) + return; + + pid = td->td_proc->p_pid; + bucket = &fasttrap_procs.fth_table[FASTTRAP_PROCS_INDEX(pid)]; + fprc = NULL; + + /* Look up the fasttrap process handle for this process. */ + mutex_enter(&bucket->ftb_mtx); + for (fprc = bucket->ftb_data; fprc != NULL; fprc = fprc->ftpc_next) { + if (fprc->ftpc_pid == pid) { + mutex_enter(&fprc->ftpc_mtx); + mutex_exit(&bucket->ftb_mtx); + break; + } + } + if (fprc == NULL) { + mutex_exit(&bucket->ftb_mtx); + return; + } + + scrspc = (fasttrap_scrspace_t *)td->t_dtrace_sscr; + LIST_REMOVE(scrspc, ftss_next); + LIST_INSERT_HEAD(&fprc->ftpc_fscr, scrspc, ftss_next); + + mutex_exit(&fprc->ftpc_mtx); +} +#endif + +/* + * This function ensures that no threads are actively using the memory + * associated with probes that were formerly live. + */ +static void +fasttrap_mod_barrier(uint64_t gen) +{ + int i; + + if (gen < fasttrap_mod_gen) + return; + + fasttrap_mod_gen++; + +#ifdef illumos + CPU_FOREACH(i) { + mutex_enter(&fasttrap_cpuc_pid_lock[i]); + mutex_exit(&fasttrap_cpuc_pid_lock[i]); + } +#else + rm_wlock(&fasttrap_tp_lock); + rm_wunlock(&fasttrap_tp_lock); +#endif +} + +/* + * This function performs asynchronous cleanup of fasttrap providers. The + * Solaris implementation of this mechanism uses a timeout that's activated in + * fasttrap_pid_cleanup(), but this doesn't work in FreeBSD: one may sleep while + * holding the DTrace mutexes, but it is unsafe to sleep in a callout handler. + * Thus we use a dedicated process to perform the cleanup when requested. + */ +/*ARGSUSED*/ +static void +fasttrap_pid_cleanup_cb(void *data) +{ + fasttrap_provider_t **fpp, *fp; + fasttrap_bucket_t *bucket; + dtrace_provider_id_t provid; + int i, later = 0, rval; + + mtx_lock(&fasttrap_cleanup_mtx); + while (!fasttrap_cleanup_drain || later > 0) { + fasttrap_cleanup_work = 0; + mtx_unlock(&fasttrap_cleanup_mtx); + + later = 0; + + /* + * Iterate over all the providers trying to remove the marked + * ones. If a provider is marked but not retired, we just + * have to take a crack at removing it -- it's no big deal if + * we can't.
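+ * Providers that we fail to remove on this pass are retried roughly + * once per second; see the pause() call below.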
+ */ + for (i = 0; i < fasttrap_provs.fth_nent; i++) { + bucket = &fasttrap_provs.fth_table[i]; + mutex_enter(&bucket->ftb_mtx); + fpp = (fasttrap_provider_t **)&bucket->ftb_data; + + while ((fp = *fpp) != NULL) { + if (!fp->ftp_marked) { + fpp = &fp->ftp_next; + continue; + } + + mutex_enter(&fp->ftp_mtx); + + /* + * If this provider has consumers actively + * creating probes (ftp_ccount) or is a USDT + * provider (ftp_mcount), we can't unregister + * or even condense. + */ + if (fp->ftp_ccount != 0 || + fp->ftp_mcount != 0) { + mutex_exit(&fp->ftp_mtx); + fp->ftp_marked = 0; + continue; + } + + if (!fp->ftp_retired || fp->ftp_rcount != 0) + fp->ftp_marked = 0; + + mutex_exit(&fp->ftp_mtx); + + /* + * If we successfully unregister this + * provider we can remove it from the hash + * chain and free the memory. If our attempt + * to unregister fails and this is a retired + * provider, increment our flag to try again + * pretty soon. If we've consumed more than + * half of our total permitted number of + * probes call dtrace_condense() to try to + * clean out the unenabled probes. + */ + provid = fp->ftp_provid; + if ((rval = dtrace_unregister(provid)) != 0) { + if (fasttrap_total > fasttrap_max / 2) + (void) dtrace_condense(provid); + + if (rval == EAGAIN) + fp->ftp_marked = 1; + + later += fp->ftp_marked; + fpp = &fp->ftp_next; + } else { + *fpp = fp->ftp_next; + fasttrap_provider_free(fp); + } + } + mutex_exit(&bucket->ftb_mtx); + } + mtx_lock(&fasttrap_cleanup_mtx); + + /* + * If we were unable to retire a provider, try again after a + * second. This situation can occur in certain circumstances + * where providers cannot be unregistered even though they have + * no probes enabled because of an execution of dtrace -l or + * something similar. + */ + if (later > 0 || fasttrap_cleanup_work || + fasttrap_cleanup_drain) { + mtx_unlock(&fasttrap_cleanup_mtx); + pause("ftclean", hz); + mtx_lock(&fasttrap_cleanup_mtx); + } else + mtx_sleep(&fasttrap_cleanup_cv, &fasttrap_cleanup_mtx, + 0, "ftcl", 0); + } + + /* + * Wake up the thread in fasttrap_unload() now that we're done. + */ + wakeup(&fasttrap_cleanup_drain); + mtx_unlock(&fasttrap_cleanup_mtx); + + kthread_exit(); +} + +/* + * Activates the asynchronous cleanup mechanism. + */ +static void +fasttrap_pid_cleanup(void) +{ + + mtx_lock(&fasttrap_cleanup_mtx); + if (!fasttrap_cleanup_work) { + fasttrap_cleanup_work = 1; + wakeup(&fasttrap_cleanup_cv); + } + mtx_unlock(&fasttrap_cleanup_mtx); +} + +/* + * This is called from cfork() via dtrace_fasttrap_fork(). The child + * process's address space is (roughly) a copy of the parent process's so + * we have to remove all the instrumentation we had previously enabled in the + * parent. + */ +static void +fasttrap_fork(proc_t *p, proc_t *cp) +{ +#ifndef illumos + fasttrap_scrblock_t *scrblk; + fasttrap_proc_t *fprc = NULL; +#endif + pid_t ppid = p->p_pid; + int i; + + ASSERT(curproc == p); +#ifdef illumos + ASSERT(p->p_proc_flag & P_PR_LOCK); +#else + PROC_LOCK_ASSERT(p, MA_OWNED); +#endif +#ifdef illumos + ASSERT(p->p_dtrace_count > 0); +#else + /* + * This check is purposely here instead of in kern_fork.c because, + * for legal reasons, we cannot include the dtrace_cddl.h header + * inside kern_fork.c and insert an if-clause there. + */ + if (p->p_dtrace_count == 0 && p->p_dtrace_helpers == NULL) + return; +#endif + + ASSERT(cp->p_dtrace_count == 0); + + /* + * This would be simpler and faster if we maintained per-process + * hash tables of enabled tracepoints.
It could, however, potentially + * slow down execution of a tracepoint since we'd need to go + * through two levels of indirection. In the future, we should + * consider either maintaining per-process ancillary lists of + * enabled tracepoints or hanging a pointer to a per-process hash + * table of enabled tracepoints off the proc structure. + */ + + /* + * We don't have to worry about the child process disappearing + * because we're in fork(). + */ +#ifdef illumos + mtx_lock_spin(&cp->p_slock); + sprlock_proc(cp); + mtx_unlock_spin(&cp->p_slock); +#else + /* + * fasttrap_tracepoint_remove() expects the child process to be + * unlocked and the VM then expects curproc to be unlocked. + */ + _PHOLD(cp); + PROC_UNLOCK(cp); + PROC_UNLOCK(p); + if (p->p_dtrace_count == 0) + goto dup_helpers; +#endif + + /* + * Iterate over every tracepoint looking for ones that belong to the + * parent process, and remove each from the child process. + */ + for (i = 0; i < fasttrap_tpoints.fth_nent; i++) { + fasttrap_tracepoint_t *tp; + fasttrap_bucket_t *bucket = &fasttrap_tpoints.fth_table[i]; + + mutex_enter(&bucket->ftb_mtx); + for (tp = bucket->ftb_data; tp != NULL; tp = tp->ftt_next) { + if (tp->ftt_pid == ppid && + tp->ftt_proc->ftpc_acount != 0) { + int ret = fasttrap_tracepoint_remove(cp, tp); + ASSERT(ret == 0); + + /* + * The count of active providers can only be + * decremented (i.e. to zero) during exec, + * exit, and removal of a meta provider so it + * should be impossible to drop the count + * mid-fork. + */ + ASSERT(tp->ftt_proc->ftpc_acount != 0); +#ifndef illumos + fprc = tp->ftt_proc; +#endif + } + } + mutex_exit(&bucket->ftb_mtx); + +#ifndef illumos + /* + * Unmap any scratch space inherited from the parent's address + * space. + */ + if (fprc != NULL) { + mutex_enter(&fprc->ftpc_mtx); + LIST_FOREACH(scrblk, &fprc->ftpc_scrblks, ftsb_next) { + vm_map_remove(&cp->p_vmspace->vm_map, + scrblk->ftsb_addr, + scrblk->ftsb_addr + FASTTRAP_SCRBLOCK_SIZE); + } + mutex_exit(&fprc->ftpc_mtx); + } +#endif + } + +#ifdef illumos + mutex_enter(&cp->p_lock); + sprunlock(cp); +#else +dup_helpers: + if (p->p_dtrace_helpers != NULL) + dtrace_helpers_duplicate(p, cp); + PROC_LOCK(p); + PROC_LOCK(cp); + _PRELE(cp); +#endif +} + +/* + * This is called from proc_exit() or from exec_common() if p_dtrace_probes + * is set on the proc structure to indicate that there is a pid provider + * associated with this process. + */ +static void +fasttrap_exec_exit(proc_t *p) +{ +#ifndef illumos + struct thread *td; +#endif + +#ifdef illumos + ASSERT(p == curproc); +#else + PROC_LOCK_ASSERT(p, MA_OWNED); + _PHOLD(p); + /* + * Since struct threads may be recycled, we cannot rely on t_dtrace_sscr + * fields to be zeroed by kdtrace_thread_ctor. Thus we must zero it + * ourselves when a process exits. + */ + FOREACH_THREAD_IN_PROC(p, td) + td->t_dtrace_sscr = NULL; + PROC_UNLOCK(p); +#endif + + /* + * We clean up the pid provider for this process here; user-land + * static probes are handled by the meta-provider remove entry point. + */ + fasttrap_provider_retire(p->p_pid, FASTTRAP_PID_NAME, 0); +#ifndef illumos + if (p->p_dtrace_helpers) + dtrace_helpers_destroy(p); + PROC_LOCK(p); + _PRELE(p); +#endif +} + + +/*ARGSUSED*/ +static void +fasttrap_pid_provide(void *arg, dtrace_probedesc_t *desc) +{ + /* + * There are no "default" pid probes. 
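+ * pid probes are created explicitly, either through the fasttrap ioctl + * interface or by the USDT meta-provider, so there is nothing to do + * when the framework asks us to provide probes by description.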
+ */ +} + +static int +fasttrap_tracepoint_enable(proc_t *p, fasttrap_probe_t *probe, uint_t index) +{ + fasttrap_tracepoint_t *tp, *new_tp = NULL; + fasttrap_bucket_t *bucket; + fasttrap_id_t *id; + pid_t pid; + uintptr_t pc; + + ASSERT(index < probe->ftp_ntps); + + pid = probe->ftp_pid; + pc = probe->ftp_tps[index].fit_tp->ftt_pc; + id = &probe->ftp_tps[index].fit_id; + + ASSERT(probe->ftp_tps[index].fit_tp->ftt_pid == pid); + +#ifdef illumos + ASSERT(!(p->p_flag & SVFORK)); +#endif + + /* + * Before we make any modifications, make sure we've imposed a barrier + * on the generation in which this probe was last modified. + */ + fasttrap_mod_barrier(probe->ftp_gen); + + bucket = &fasttrap_tpoints.fth_table[FASTTRAP_TPOINTS_INDEX(pid, pc)]; + + /* + * If the tracepoint has already been enabled, just add our id to the + * list of interested probes. This may be our second time through + * this path in which case we'll have constructed the tracepoint we'd + * like to install. If we can't find a match, and have an allocated + * tracepoint ready to go, enable that one now. + * + * A tracepoint whose process is defunct is also considered defunct. + */ +again: + mutex_enter(&bucket->ftb_mtx); + for (tp = bucket->ftb_data; tp != NULL; tp = tp->ftt_next) { + /* + * Note that it's safe to access the active count on the + * associated proc structure because we know that at least one + * provider (this one) will still be around throughout this + * operation. + */ + if (tp->ftt_pid != pid || tp->ftt_pc != pc || + tp->ftt_proc->ftpc_acount == 0) + continue; + + /* + * Now that we've found a matching tracepoint, it would be + * a decent idea to confirm that the tracepoint is still + * enabled and the trap instruction hasn't been overwritten. + * Since this is a little hairy, we'll punt for now. + */ + + /* + * This can't be the first interested probe. We don't have + * to worry about another thread being in the midst of + * deleting this tracepoint (which would be the only valid + * reason for a tracepoint to have no interested probes) + * since we're holding P_PR_LOCK for this process. + */ + ASSERT(tp->ftt_ids != NULL || tp->ftt_retids != NULL); + + switch (id->fti_ptype) { + case DTFTP_ENTRY: + case DTFTP_OFFSETS: + case DTFTP_IS_ENABLED: + id->fti_next = tp->ftt_ids; + membar_producer(); + tp->ftt_ids = id; + membar_producer(); + break; + + case DTFTP_RETURN: + case DTFTP_POST_OFFSETS: + id->fti_next = tp->ftt_retids; + membar_producer(); + tp->ftt_retids = id; + membar_producer(); + break; + + default: + ASSERT(0); + } + + mutex_exit(&bucket->ftb_mtx); + + if (new_tp != NULL) { + new_tp->ftt_ids = NULL; + new_tp->ftt_retids = NULL; + } + + return (0); + } + + /* + * If we have a good tracepoint ready to go, install it now while + * we have the lock held and no one can screw with us. + */ + if (new_tp != NULL) { + int rc = 0; + + new_tp->ftt_next = bucket->ftb_data; + membar_producer(); + bucket->ftb_data = new_tp; + membar_producer(); + mutex_exit(&bucket->ftb_mtx); + + /* + * Activate the tracepoint in the ISA-specific manner. + * If this fails, we need to report the failure, but + * indicate that this tracepoint must still be disabled + * by calling fasttrap_tracepoint_disable(). + */ + if (fasttrap_tracepoint_install(p, new_tp) != 0) + rc = FASTTRAP_ENABLE_PARTIAL; + + /* + * Increment the count of the number of tracepoints active in + * the victim process. 
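+ * The per-process count lets fork() and exit() cheaply detect whether + * a process is traced at all; see fasttrap_fork() above.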
+ */ +#ifdef illumos + ASSERT(p->p_proc_flag & P_PR_LOCK); +#endif + p->p_dtrace_count++; + + return (rc); + } + + mutex_exit(&bucket->ftb_mtx); + + /* + * Initialize the tracepoint that's been preallocated with the probe. + */ + new_tp = probe->ftp_tps[index].fit_tp; + + ASSERT(new_tp->ftt_pid == pid); + ASSERT(new_tp->ftt_pc == pc); + ASSERT(new_tp->ftt_proc == probe->ftp_prov->ftp_proc); + ASSERT(new_tp->ftt_ids == NULL); + ASSERT(new_tp->ftt_retids == NULL); + + switch (id->fti_ptype) { + case DTFTP_ENTRY: + case DTFTP_OFFSETS: + case DTFTP_IS_ENABLED: + id->fti_next = NULL; + new_tp->ftt_ids = id; + break; + + case DTFTP_RETURN: + case DTFTP_POST_OFFSETS: + id->fti_next = NULL; + new_tp->ftt_retids = id; + break; + + default: + ASSERT(0); + } + +#ifdef __FreeBSD__ + if (SV_PROC_FLAG(p, SV_LP64)) + p->p_model = DATAMODEL_LP64; + else + p->p_model = DATAMODEL_ILP32; +#endif + + /* + * If the ISA-dependent initialization goes to plan, go back to the + * beginning and try to install this freshly made tracepoint. + */ + if (fasttrap_tracepoint_init(p, new_tp, pc, id->fti_ptype) == 0) + goto again; + + new_tp->ftt_ids = NULL; + new_tp->ftt_retids = NULL; + + return (FASTTRAP_ENABLE_FAIL); +} + +static void +fasttrap_tracepoint_disable(proc_t *p, fasttrap_probe_t *probe, uint_t index) +{ + fasttrap_bucket_t *bucket; + fasttrap_provider_t *provider = probe->ftp_prov; + fasttrap_tracepoint_t **pp, *tp; + fasttrap_id_t *id, **idp = NULL; + pid_t pid; + uintptr_t pc; + + ASSERT(index < probe->ftp_ntps); + + pid = probe->ftp_pid; + pc = probe->ftp_tps[index].fit_tp->ftt_pc; + id = &probe->ftp_tps[index].fit_id; + + ASSERT(probe->ftp_tps[index].fit_tp->ftt_pid == pid); + + /* + * Find the tracepoint and make sure that our id is one of the + * ones registered with it. + */ + bucket = &fasttrap_tpoints.fth_table[FASTTRAP_TPOINTS_INDEX(pid, pc)]; + mutex_enter(&bucket->ftb_mtx); + for (tp = bucket->ftb_data; tp != NULL; tp = tp->ftt_next) { + if (tp->ftt_pid == pid && tp->ftt_pc == pc && + tp->ftt_proc == provider->ftp_proc) + break; + } + + /* + * If we somehow lost this tracepoint, we're in a world of hurt. + */ + ASSERT(tp != NULL); + + switch (id->fti_ptype) { + case DTFTP_ENTRY: + case DTFTP_OFFSETS: + case DTFTP_IS_ENABLED: + ASSERT(tp->ftt_ids != NULL); + idp = &tp->ftt_ids; + break; + + case DTFTP_RETURN: + case DTFTP_POST_OFFSETS: + ASSERT(tp->ftt_retids != NULL); + idp = &tp->ftt_retids; + break; + + default: + ASSERT(0); + } + + while ((*idp)->fti_probe != probe) { + idp = &(*idp)->fti_next; + ASSERT(*idp != NULL); + } + + id = *idp; + *idp = id->fti_next; + membar_producer(); + + ASSERT(id->fti_probe == probe); + + /* + * If there are other registered enablings of this tracepoint, we're + * all done, but if this was the last probe associated with this + * tracepoint, we need to remove and free it. + */ + if (tp->ftt_ids != NULL || tp->ftt_retids != NULL) { + + /* + * If the current probe's tracepoint is in use, swap it + * for an unused tracepoint.
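+ * After the swap, fit_tp refers to a tracepoint with no registered + * ids, leaving a pre-allocated tracepoint available the next time + * this probe is enabled.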
+ */ + if (tp == probe->ftp_tps[index].fit_tp) { + fasttrap_probe_t *tmp_probe; + fasttrap_tracepoint_t **tmp_tp; + uint_t tmp_index; + + if (tp->ftt_ids != NULL) { + tmp_probe = tp->ftt_ids->fti_probe; + /* LINTED - alignment */ + tmp_index = FASTTRAP_ID_INDEX(tp->ftt_ids); + tmp_tp = &tmp_probe->ftp_tps[tmp_index].fit_tp; + } else { + tmp_probe = tp->ftt_retids->fti_probe; + /* LINTED - alignment */ + tmp_index = FASTTRAP_ID_INDEX(tp->ftt_retids); + tmp_tp = &tmp_probe->ftp_tps[tmp_index].fit_tp; + } + + ASSERT(*tmp_tp != NULL); + ASSERT(*tmp_tp != probe->ftp_tps[index].fit_tp); + ASSERT((*tmp_tp)->ftt_ids == NULL); + ASSERT((*tmp_tp)->ftt_retids == NULL); + + probe->ftp_tps[index].fit_tp = *tmp_tp; + *tmp_tp = tp; + } + + mutex_exit(&bucket->ftb_mtx); + + /* + * Tag the modified probe with the generation in which it was + * changed. + */ + probe->ftp_gen = fasttrap_mod_gen; + return; + } + + mutex_exit(&bucket->ftb_mtx); + + /* + * We can't safely remove the tracepoint from the set of active + * tracepoints until we've actually removed the fasttrap instruction + * from the process's text. We can, however, operate on this + * tracepoint secure in the knowledge that no other thread is going to + * be looking at it since we hold P_PR_LOCK on the process if it's + * live or we hold the provider lock on the process if it's dead and + * gone. + */ + + /* + * We only need to remove the actual instruction if we're looking + * at an existing process + */ + if (p != NULL) { + /* + * If we fail to restore the instruction we need to kill + * this process since it's in a completely unrecoverable + * state. + */ + if (fasttrap_tracepoint_remove(p, tp) != 0) + fasttrap_sigtrap(p, NULL, pc); + + /* + * Decrement the count of the number of tracepoints active + * in the victim process. + */ +#ifdef illumos + ASSERT(p->p_proc_flag & P_PR_LOCK); +#endif + p->p_dtrace_count--; + + atomic_add_rel_64(&p->p_fasttrap_tp_gen, 1); + } + + /* + * Remove the probe from the hash table of active tracepoints. + */ + mutex_enter(&bucket->ftb_mtx); + pp = (fasttrap_tracepoint_t **)&bucket->ftb_data; + ASSERT(*pp != NULL); + while (*pp != tp) { + pp = &(*pp)->ftt_next; + ASSERT(*pp != NULL); + } + + *pp = tp->ftt_next; + membar_producer(); + + mutex_exit(&bucket->ftb_mtx); + + /* + * Tag the modified probe with the generation in which it was changed. + */ + probe->ftp_gen = fasttrap_mod_gen; +} + +static void +fasttrap_enable_callbacks(void) +{ + /* + * We don't have to play the rw lock game here because we're + * providing something rather than taking something away -- + * we can be sure that no threads have tried to follow this + * function pointer yet. 
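+ * It is enough to install the pointers before the first enabling takes + * fasttrap_pid_count from zero to one; fasttrap_count_mtx serializes + * the update.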
+ */
+ mutex_enter(&fasttrap_count_mtx);
+ if (fasttrap_pid_count == 0) {
+ ASSERT(dtrace_pid_probe_ptr == NULL);
+ ASSERT(dtrace_return_probe_ptr == NULL);
+ dtrace_pid_probe_ptr = &fasttrap_pid_probe;
+ dtrace_return_probe_ptr = &fasttrap_return_probe;
+ }
+ ASSERT(dtrace_pid_probe_ptr == &fasttrap_pid_probe);
+ ASSERT(dtrace_return_probe_ptr == &fasttrap_return_probe);
+ fasttrap_pid_count++;
+ mutex_exit(&fasttrap_count_mtx);
+}
+
+static void
+fasttrap_disable_callbacks(void)
+{
+#ifdef illumos
+ ASSERT(MUTEX_HELD(&cpu_lock));
+#endif
+
+ mutex_enter(&fasttrap_count_mtx);
+ ASSERT(fasttrap_pid_count > 0);
+ fasttrap_pid_count--;
+ if (fasttrap_pid_count == 0) {
+#ifdef illumos
+ cpu_t *cur, *cpu = CPU;
+
+ for (cur = cpu->cpu_next_onln; cur != cpu;
+ cur = cur->cpu_next_onln) {
+ rw_enter(&cur->cpu_ft_lock, RW_WRITER);
+ }
+#endif
+ dtrace_pid_probe_ptr = NULL;
+ dtrace_return_probe_ptr = NULL;
+#ifdef illumos
+ for (cur = cpu->cpu_next_onln; cur != cpu;
+ cur = cur->cpu_next_onln) {
+ rw_exit(&cur->cpu_ft_lock);
+ }
+#endif
+ }
+ mutex_exit(&fasttrap_count_mtx);
+}
+
+/*ARGSUSED*/
+static void
+fasttrap_pid_enable(void *arg, dtrace_id_t id, void *parg)
+{
+ fasttrap_probe_t *probe = parg;
+ proc_t *p = NULL;
+ int i, rc;
+
+ ASSERT(probe != NULL);
+ ASSERT(!probe->ftp_enabled);
+ ASSERT(id == probe->ftp_id);
+#ifdef illumos
+ ASSERT(MUTEX_HELD(&cpu_lock));
+#endif
+
+ /*
+ * Increment the count of enabled probes on this probe's provider;
+ * the provider can't go away while the probe still exists. We
+ * must increment this even if we aren't able to properly enable
+ * this probe.
+ */
+ mutex_enter(&probe->ftp_prov->ftp_mtx);
+ probe->ftp_prov->ftp_rcount++;
+ mutex_exit(&probe->ftp_prov->ftp_mtx);
+
+ /*
+ * If this probe's provider is retired (meaning it was valid in a
+ * previously exec'ed incarnation of this address space), bail out. The
+ * provider can't go away while we're in this code path.
+ */
+ if (probe->ftp_prov->ftp_retired)
+ return;
+
+ /*
+ * If we can't find the process, it may be that we're in the context of
+ * a fork in which the traced process is being born and we're copying
+ * USDT probes. Otherwise, the process is gone so bail.
+ */
+#ifdef illumos
+ if ((p = sprlock(probe->ftp_pid)) == NULL) {
+ if ((curproc->p_flag & SFORKING) == 0)
+ return;
+
+ mutex_enter(&pidlock);
+ p = prfind(probe->ftp_pid);
+
+ if (p == NULL) {
+ /*
+ * So it's not that the target process is being born,
+ * it's that it isn't there at all (and we simply
+ * happen to be forking). Anyway, we know that the
+ * target is definitely gone, so bail out.
+ */
+ mutex_exit(&pidlock);
+ return;
+ }
+
+ /*
+ * Confirm that curproc is indeed forking the process in which
+ * we're trying to enable probes.
+ */
+ ASSERT(p->p_parent == curproc);
+ ASSERT(p->p_stat == SIDL);
+
+ mutex_enter(&p->p_lock);
+ mutex_exit(&pidlock);
+
+ sprlock_proc(p);
+ }
+
+ ASSERT(!(p->p_flag & SVFORK));
+ mutex_exit(&p->p_lock);
+#else
+ if (pget(probe->ftp_pid, PGET_HOLD | PGET_NOTWEXIT, &p) != 0)
+ return;
+#endif
+
+ /*
+ * We have to enable the trap entry point before any user threads have
+ * the chance to execute the trap instruction we're about to place
+ * in their process's text.
+ */
+ fasttrap_enable_callbacks();
+
+ /*
+ * Enable all the tracepoints and add this probe's id to each
+ * tracepoint's list of active probes.
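+ *
+ * The loop below unwinds on failure; schematically (a sketch):
+ * rc == FASTTRAP_ENABLE_FAIL means tracepoint i was never installed,
+ * so only tracepoints 0 .. i - 1 are torn down, while rc ==
+ * FASTTRAP_ENABLE_PARTIAL means tracepoint i is partially installed
+ * and must be torn down along with 0 .. i - 1.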
+ */ + for (i = 0; i < probe->ftp_ntps; i++) { + if ((rc = fasttrap_tracepoint_enable(p, probe, i)) != 0) { + /* + * If enabling the tracepoint failed completely, + * we don't have to disable it; if the failure + * was only partial we must disable it. + */ + if (rc == FASTTRAP_ENABLE_FAIL) + i--; + else + ASSERT(rc == FASTTRAP_ENABLE_PARTIAL); + + /* + * Back up and pull out all the tracepoints we've + * created so far for this probe. + */ + while (i >= 0) { + fasttrap_tracepoint_disable(p, probe, i); + i--; + } + +#ifdef illumos + mutex_enter(&p->p_lock); + sprunlock(p); +#else + PRELE(p); +#endif + + /* + * Since we're not actually enabling this probe, + * drop our reference on the trap table entry. + */ + fasttrap_disable_callbacks(); + return; + } + } +#ifdef illumos + mutex_enter(&p->p_lock); + sprunlock(p); +#else + PRELE(p); +#endif + + probe->ftp_enabled = 1; +} + +/*ARGSUSED*/ +static void +fasttrap_pid_disable(void *arg, dtrace_id_t id, void *parg) +{ + fasttrap_probe_t *probe = parg; + fasttrap_provider_t *provider = probe->ftp_prov; + proc_t *p; + int i, whack = 0; + + ASSERT(id == probe->ftp_id); + + mutex_enter(&provider->ftp_mtx); + + /* + * We won't be able to acquire a /proc-esque lock on the process + * iff the process is dead and gone. In this case, we rely on the + * provider lock as a point of mutual exclusion to prevent other + * DTrace consumers from disabling this probe. + */ + if (pget(probe->ftp_pid, PGET_HOLD | PGET_NOTWEXIT, &p) != 0) + p = NULL; + + /* + * Disable all the associated tracepoints (for fully enabled probes). + */ + if (probe->ftp_enabled) { + for (i = 0; i < probe->ftp_ntps; i++) { + fasttrap_tracepoint_disable(p, probe, i); + } + } + + ASSERT(provider->ftp_rcount > 0); + provider->ftp_rcount--; + + if (p != NULL) { + /* + * Even though we may not be able to remove it entirely, we + * mark this retired provider to get a chance to remove some + * of the associated probes. + */ + if (provider->ftp_retired && !provider->ftp_marked) + whack = provider->ftp_marked = 1; + mutex_exit(&provider->ftp_mtx); + } else { + /* + * If the process is dead, we're just waiting for the + * last probe to be disabled to be able to free it. + */ + if (provider->ftp_rcount == 0 && !provider->ftp_marked) + whack = provider->ftp_marked = 1; + mutex_exit(&provider->ftp_mtx); + } + + if (whack) + fasttrap_pid_cleanup(); + +#ifdef __FreeBSD__ + if (p != NULL) + PRELE(p); +#endif + if (!probe->ftp_enabled) + return; + + probe->ftp_enabled = 0; + +#ifdef illumos + ASSERT(MUTEX_HELD(&cpu_lock)); +#endif + fasttrap_disable_callbacks(); +} + +/*ARGSUSED*/ +static void +fasttrap_pid_getargdesc(void *arg, dtrace_id_t id, void *parg, + dtrace_argdesc_t *desc) +{ + fasttrap_probe_t *probe = parg; + char *str; + int i, ndx; + + desc->dtargd_native[0] = '\0'; + desc->dtargd_xlate[0] = '\0'; + + if (probe->ftp_prov->ftp_retired != 0 || + desc->dtargd_ndx >= probe->ftp_nargs) { + desc->dtargd_ndx = DTRACE_ARGNONE; + return; + } + + ndx = (probe->ftp_argmap != NULL) ? 
+ probe->ftp_argmap[desc->dtargd_ndx] : desc->dtargd_ndx; + + str = probe->ftp_ntypes; + for (i = 0; i < ndx; i++) { + str += strlen(str) + 1; + } + + ASSERT(strlen(str + 1) < sizeof (desc->dtargd_native)); + (void) strcpy(desc->dtargd_native, str); + + if (probe->ftp_xtypes == NULL) + return; + + str = probe->ftp_xtypes; + for (i = 0; i < desc->dtargd_ndx; i++) { + str += strlen(str) + 1; + } + + ASSERT(strlen(str + 1) < sizeof (desc->dtargd_xlate)); + (void) strcpy(desc->dtargd_xlate, str); +} + +/*ARGSUSED*/ +static void +fasttrap_pid_destroy(void *arg, dtrace_id_t id, void *parg) +{ + fasttrap_probe_t *probe = parg; + int i; + size_t size; + + ASSERT(probe != NULL); + ASSERT(!probe->ftp_enabled); + ASSERT(fasttrap_total >= probe->ftp_ntps); + + atomic_add_32(&fasttrap_total, -probe->ftp_ntps); + size = offsetof(fasttrap_probe_t, ftp_tps[probe->ftp_ntps]); + + if (probe->ftp_gen + 1 >= fasttrap_mod_gen) + fasttrap_mod_barrier(probe->ftp_gen); + + for (i = 0; i < probe->ftp_ntps; i++) { + kmem_free(probe->ftp_tps[i].fit_tp, + sizeof (fasttrap_tracepoint_t)); + } + + kmem_free(probe, size); +} + + +static const dtrace_pattr_t pid_attr = { +{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_ISA }, +{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN }, +{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN }, +{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_ISA }, +{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN }, +}; + +static dtrace_pops_t pid_pops = { + .dtps_provide = fasttrap_pid_provide, + .dtps_provide_module = NULL, + .dtps_enable = fasttrap_pid_enable, + .dtps_disable = fasttrap_pid_disable, + .dtps_suspend = NULL, + .dtps_resume = NULL, + .dtps_getargdesc = fasttrap_pid_getargdesc, + .dtps_getargval = fasttrap_pid_getarg, + .dtps_usermode = NULL, + .dtps_destroy = fasttrap_pid_destroy +}; + +static dtrace_pops_t usdt_pops = { + .dtps_provide = fasttrap_pid_provide, + .dtps_provide_module = NULL, + .dtps_enable = fasttrap_pid_enable, + .dtps_disable = fasttrap_pid_disable, + .dtps_suspend = NULL, + .dtps_resume = NULL, + .dtps_getargdesc = fasttrap_pid_getargdesc, + .dtps_getargval = fasttrap_usdt_getarg, + .dtps_usermode = NULL, + .dtps_destroy = fasttrap_pid_destroy +}; + +static fasttrap_proc_t * +fasttrap_proc_lookup(pid_t pid) +{ + fasttrap_bucket_t *bucket; + fasttrap_proc_t *fprc, *new_fprc; + + + bucket = &fasttrap_procs.fth_table[FASTTRAP_PROCS_INDEX(pid)]; + mutex_enter(&bucket->ftb_mtx); + + for (fprc = bucket->ftb_data; fprc != NULL; fprc = fprc->ftpc_next) { + if (fprc->ftpc_pid == pid && fprc->ftpc_acount != 0) { + mutex_enter(&fprc->ftpc_mtx); + mutex_exit(&bucket->ftb_mtx); + fprc->ftpc_rcount++; + atomic_inc_64(&fprc->ftpc_acount); + ASSERT(fprc->ftpc_acount <= fprc->ftpc_rcount); + mutex_exit(&fprc->ftpc_mtx); + + return (fprc); + } + } + + /* + * Drop the bucket lock so we don't try to perform a sleeping + * allocation under it. + */ + mutex_exit(&bucket->ftb_mtx); + + new_fprc = kmem_zalloc(sizeof (fasttrap_proc_t), KM_SLEEP); + new_fprc->ftpc_pid = pid; + new_fprc->ftpc_rcount = 1; + new_fprc->ftpc_acount = 1; +#ifndef illumos + mutex_init(&new_fprc->ftpc_mtx, "fasttrap proc mtx", MUTEX_DEFAULT, + NULL); +#endif + + mutex_enter(&bucket->ftb_mtx); + + /* + * Take another lap through the list to make sure a proc hasn't + * been created for this pid while we weren't under the bucket lock. 
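+ *
+ * The overall shape of this function, as a sketch:
+ *
+ * mutex_enter(bucket); search; found -> return it
+ * mutex_exit(bucket);
+ * new = kmem_zalloc(..., KM_SLEEP); (may sleep)
+ * mutex_enter(bucket); search again;
+ * found -> kmem_free(new); return the existing proc
+ * else -> link in new; mutex_exit(bucket); return new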
+ */
+ for (fprc = bucket->ftb_data; fprc != NULL; fprc = fprc->ftpc_next) {
+ if (fprc->ftpc_pid == pid && fprc->ftpc_acount != 0) {
+ mutex_enter(&fprc->ftpc_mtx);
+ mutex_exit(&bucket->ftb_mtx);
+ fprc->ftpc_rcount++;
+ atomic_inc_64(&fprc->ftpc_acount);
+ ASSERT(fprc->ftpc_acount <= fprc->ftpc_rcount);
+ mutex_exit(&fprc->ftpc_mtx);
+
+ kmem_free(new_fprc, sizeof (fasttrap_proc_t));
+
+ return (fprc);
+ }
+ }
+
+ new_fprc->ftpc_next = bucket->ftb_data;
+ bucket->ftb_data = new_fprc;
+
+ mutex_exit(&bucket->ftb_mtx);
+
+ return (new_fprc);
+}
+
+static void
+fasttrap_proc_release(fasttrap_proc_t *proc)
+{
+ fasttrap_bucket_t *bucket;
+ fasttrap_proc_t *fprc, **fprcp;
+ pid_t pid = proc->ftpc_pid;
+#ifndef illumos
+ fasttrap_scrblock_t *scrblk, *scrblktmp;
+ fasttrap_scrspace_t *scrspc, *scrspctmp;
+ struct proc *p;
+ struct thread *td;
+#endif
+
+ mutex_enter(&proc->ftpc_mtx);
+
+ ASSERT(proc->ftpc_rcount != 0);
+ ASSERT(proc->ftpc_acount <= proc->ftpc_rcount);
+
+ if (--proc->ftpc_rcount != 0) {
+ mutex_exit(&proc->ftpc_mtx);
+ return;
+ }
+
+#ifndef illumos
+ /*
+ * Free all structures used to manage per-thread scratch space.
+ */
+ LIST_FOREACH_SAFE(scrblk, &proc->ftpc_scrblks, ftsb_next,
+ scrblktmp) {
+ LIST_REMOVE(scrblk, ftsb_next);
+ free(scrblk, M_SOLARIS);
+ }
+ LIST_FOREACH_SAFE(scrspc, &proc->ftpc_fscr, ftss_next, scrspctmp) {
+ LIST_REMOVE(scrspc, ftss_next);
+ free(scrspc, M_SOLARIS);
+ }
+ LIST_FOREACH_SAFE(scrspc, &proc->ftpc_ascr, ftss_next, scrspctmp) {
+ LIST_REMOVE(scrspc, ftss_next);
+ free(scrspc, M_SOLARIS);
+ }
+
+ if ((p = pfind(pid)) != NULL) {
+ FOREACH_THREAD_IN_PROC(p, td)
+ td->t_dtrace_sscr = NULL;
+ PROC_UNLOCK(p);
+ }
+#endif
+
+ mutex_exit(&proc->ftpc_mtx);
+
+ /*
+ * There should definitely be no live providers associated with this
+ * process at this point.
+ */
+ ASSERT(proc->ftpc_acount == 0);
+
+ bucket = &fasttrap_procs.fth_table[FASTTRAP_PROCS_INDEX(pid)];
+ mutex_enter(&bucket->ftb_mtx);
+
+ fprcp = (fasttrap_proc_t **)&bucket->ftb_data;
+ while ((fprc = *fprcp) != NULL) {
+ if (fprc == proc)
+ break;
+
+ fprcp = &fprc->ftpc_next;
+ }
+
+ /*
+ * Something strange has happened if we can't find the proc.
+ */
+ ASSERT(fprc != NULL);
+
+ *fprcp = fprc->ftpc_next;
+
+ mutex_exit(&bucket->ftb_mtx);
+
+ kmem_free(fprc, sizeof (fasttrap_proc_t));
+}
+
+/*
+ * Lookup a fasttrap-managed provider based on its name and associated pid.
+ * If the provider doesn't already exist, it is instantiated using the given
+ * pattr (which here must be non-NULL). The provider is returned with its
+ * lock held.
+ */
+static fasttrap_provider_t *
+fasttrap_provider_lookup(pid_t pid, const char *name,
+ const dtrace_pattr_t *pattr)
+{
+ fasttrap_provider_t *fp, *new_fp = NULL;
+ fasttrap_bucket_t *bucket;
+ char provname[DTRACE_PROVNAMELEN];
+ proc_t *p;
+ cred_t *cred;
+
+ ASSERT(strlen(name) < sizeof (fp->ftp_name));
+ ASSERT(pattr != NULL);
+
+ bucket = &fasttrap_provs.fth_table[FASTTRAP_PROVS_INDEX(pid, name)];
+ mutex_enter(&bucket->ftb_mtx);
+
+ /*
+ * Take a lap through the list and return the match if we find it.
+ */
+ for (fp = bucket->ftb_data; fp != NULL; fp = fp->ftp_next) {
+ if (fp->ftp_pid == pid && strcmp(fp->ftp_name, name) == 0 &&
+ !fp->ftp_retired) {
+ mutex_enter(&fp->ftp_mtx);
+ mutex_exit(&bucket->ftb_mtx);
+ return (fp);
+ }
+ }
+
+ /*
+ * Drop the bucket lock so we don't try to perform a sleeping
+ * allocation under it.
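+ *
+ * (kmem_zalloc(..., KM_SLEEP) below can block waiting for memory;
+ * sleeping with ftb_mtx held would stall every lookup that hashes
+ * to this bucket for the duration.)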
+ */
+ mutex_exit(&bucket->ftb_mtx);
+
+ /*
+ * Make sure the process exists, isn't a child created as the result
+ * of a vfork(2), and isn't a zombie (but may be in fork).
+ */
+ if ((p = pfind(pid)) == NULL)
+ return (NULL);
+
+ /*
+ * Increment p_dtrace_probes so that the process knows to inform us
+ * when it exits or execs. fasttrap_provider_free() decrements this
+ * when we're done with this provider.
+ */
+ p->p_dtrace_probes++;
+
+ /*
+ * Grab the credentials for this process so we have
+ * something to pass to dtrace_register().
+ */
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ crhold(p->p_ucred);
+ cred = p->p_ucred;
+ PROC_UNLOCK(p);
+
+ new_fp = kmem_zalloc(sizeof (fasttrap_provider_t), KM_SLEEP);
+ new_fp->ftp_pid = pid;
+ new_fp->ftp_proc = fasttrap_proc_lookup(pid);
+#ifndef illumos
+ mutex_init(&new_fp->ftp_mtx, "provider mtx", MUTEX_DEFAULT, NULL);
+ mutex_init(&new_fp->ftp_cmtx, "lock on creating", MUTEX_DEFAULT, NULL);
+#endif
+
+ ASSERT(new_fp->ftp_proc != NULL);
+
+ mutex_enter(&bucket->ftb_mtx);
+
+ /*
+ * Take another lap through the list to make sure a provider hasn't
+ * been created for this pid while we weren't under the bucket lock.
+ */
+ for (fp = bucket->ftb_data; fp != NULL; fp = fp->ftp_next) {
+ if (fp->ftp_pid == pid && strcmp(fp->ftp_name, name) == 0 &&
+ !fp->ftp_retired) {
+ mutex_enter(&fp->ftp_mtx);
+ mutex_exit(&bucket->ftb_mtx);
+ fasttrap_provider_free(new_fp);
+ crfree(cred);
+ return (fp);
+ }
+ }
+
+ (void) strcpy(new_fp->ftp_name, name);
+
+ /*
+ * Fail and return NULL if either the provider name is too long
+ * or we fail to register this new provider with the DTrace
+ * framework. Note that this is the only place we ever construct
+ * the full provider name -- we keep it in pieces in the provider
+ * structure.
+ */
+ if (snprintf(provname, sizeof (provname), "%s%u", name, (uint_t)pid) >=
+ sizeof (provname) ||
+ dtrace_register(provname, pattr,
+ DTRACE_PRIV_PROC | DTRACE_PRIV_OWNER | DTRACE_PRIV_ZONEOWNER, cred,
+ pattr == &pid_attr ? &pid_pops : &usdt_pops, new_fp,
+ &new_fp->ftp_provid) != 0) {
+ mutex_exit(&bucket->ftb_mtx);
+ fasttrap_provider_free(new_fp);
+ crfree(cred);
+ return (NULL);
+ }
+
+ new_fp->ftp_next = bucket->ftb_data;
+ bucket->ftb_data = new_fp;
+
+ mutex_enter(&new_fp->ftp_mtx);
+ mutex_exit(&bucket->ftb_mtx);
+
+ crfree(cred);
+ return (new_fp);
+}
+
+static void
+fasttrap_provider_free(fasttrap_provider_t *provider)
+{
+ pid_t pid = provider->ftp_pid;
+ proc_t *p;
+
+ /*
+ * There need to be no associated enabled probes, no consumers
+ * creating probes, and no meta providers referencing this provider.
+ */
+ ASSERT(provider->ftp_rcount == 0);
+ ASSERT(provider->ftp_ccount == 0);
+ ASSERT(provider->ftp_mcount == 0);
+
+ /*
+ * If this provider hasn't been retired, we need to explicitly drop the
+ * count of active providers on the associated process structure.
+ */
+ if (!provider->ftp_retired) {
+ atomic_dec_64(&provider->ftp_proc->ftpc_acount);
+ ASSERT(provider->ftp_proc->ftpc_acount <
+ provider->ftp_proc->ftpc_rcount);
+ }
+
+ fasttrap_proc_release(provider->ftp_proc);
+
+#ifndef illumos
+ mutex_destroy(&provider->ftp_mtx);
+ mutex_destroy(&provider->ftp_cmtx);
+#endif
+ kmem_free(provider, sizeof (fasttrap_provider_t));
+
+ /*
+ * Decrement p_dtrace_probes on the process whose provider we're
+ * freeing. We don't have to worry about clobbering someone else's
+ * modifications to it because we have locked the bucket that
+ * corresponds to this process's hash chain in the provider hash
+ * table. Don't sweat it if we can't find the process.
+ */
+ if ((p = pfind(pid)) == NULL) {
+ return;
+ }
+
+ p->p_dtrace_probes--;
+#ifndef illumos
+ PROC_UNLOCK(p);
+#endif
+}
+
+static void
+fasttrap_provider_retire(pid_t pid, const char *name, int mprov)
+{
+ fasttrap_provider_t *fp;
+ fasttrap_bucket_t *bucket;
+ dtrace_provider_id_t provid;
+
+ ASSERT(strlen(name) < sizeof (fp->ftp_name));
+
+ bucket = &fasttrap_provs.fth_table[FASTTRAP_PROVS_INDEX(pid, name)];
+ mutex_enter(&bucket->ftb_mtx);
+
+ for (fp = bucket->ftb_data; fp != NULL; fp = fp->ftp_next) {
+ if (fp->ftp_pid == pid && strcmp(fp->ftp_name, name) == 0 &&
+ !fp->ftp_retired)
+ break;
+ }
+
+ if (fp == NULL) {
+ mutex_exit(&bucket->ftb_mtx);
+ return;
+ }
+
+ mutex_enter(&fp->ftp_mtx);
+ ASSERT(!mprov || fp->ftp_mcount > 0);
+ if (mprov && --fp->ftp_mcount != 0) {
+ mutex_exit(&fp->ftp_mtx);
+ mutex_exit(&bucket->ftb_mtx);
+ return;
+ }
+
+ /*
+ * Mark the provider to be removed in our post-processing step, mark it
+ * retired, and drop the active count on its proc. Marking it indicates
+ * that we should try to remove it; setting the retired flag indicates
+ * that we're done with this provider; dropping the active count on the
+ * proc releases our hold, and when this reaches zero (as it will during
+ * exit or exec) the proc and associated providers become defunct.
+ *
+ * We obviously need to take the bucket lock before the provider lock
+ * to perform the lookup, but we need to drop the provider lock
+ * before calling into the DTrace framework since we acquire the
+ * provider lock in callbacks invoked from the DTrace framework. The
+ * bucket lock therefore protects the integrity of the provider hash
+ * table.
+ */
+ atomic_dec_64(&fp->ftp_proc->ftpc_acount);
+ ASSERT(fp->ftp_proc->ftpc_acount < fp->ftp_proc->ftpc_rcount);
+
+ fp->ftp_retired = 1;
+ fp->ftp_marked = 1;
+ provid = fp->ftp_provid;
+ mutex_exit(&fp->ftp_mtx);
+
+ /*
+ * We don't have to worry about invalidating the same provider twice
+ * since fasttrap_provider_lookup() will ignore providers that have
+ * been marked as retired.
+ */
+ dtrace_invalidate(provid);
+
+ mutex_exit(&bucket->ftb_mtx);
+
+ fasttrap_pid_cleanup();
+}
+
+static int
+fasttrap_uint32_cmp(const void *ap, const void *bp)
+{
+ return (*(const uint32_t *)ap - *(const uint32_t *)bp);
+}
+
+static int
+fasttrap_uint64_cmp(const void *ap, const void *bp)
+{
+ return (*(const uint64_t *)ap - *(const uint64_t *)bp);
+}
+
+static int
+fasttrap_add_probe(fasttrap_probe_spec_t *pdata)
+{
+ fasttrap_provider_t *provider;
+ fasttrap_probe_t *pp;
+ fasttrap_tracepoint_t *tp;
+ char *name;
+ int i, aframes = 0, whack;
+
+ /*
+ * There needs to be at least one desired trace point.
+ */
+ if (pdata->ftps_noffs == 0)
+ return (EINVAL);
+
+ switch (pdata->ftps_type) {
+ case DTFTP_ENTRY:
+ name = "entry";
+ aframes = FASTTRAP_ENTRY_AFRAMES;
+ break;
+ case DTFTP_RETURN:
+ name = "return";
+ aframes = FASTTRAP_RETURN_AFRAMES;
+ break;
+ case DTFTP_OFFSETS:
+ name = NULL;
+ break;
+ default:
+ return (EINVAL);
+ }
+
+ if ((provider = fasttrap_provider_lookup(pdata->ftps_pid,
+ FASTTRAP_PID_NAME, &pid_attr)) == NULL)
+ return (ESRCH);
+
+ /*
+ * Increment this reference count to indicate that a consumer is
+ * actively adding a new probe associated with this provider. This
+ * prevents the provider from being deleted -- we'll need to check
+ * for pending deletions when we drop this reference count.
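+ *
+ * In outline (a sketch of the code that follows): ftp_ccount is
+ * incremented here and held across the dtrace_probe_lookup() and
+ * dtrace_probe_create() calls; once it is dropped, ftp_retired is
+ * checked and, if the provider was retired in the meantime,
+ * fasttrap_pid_cleanup() is invoked to finish the job.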
+ */ + provider->ftp_ccount++; + mutex_exit(&provider->ftp_mtx); + + /* + * Grab the creation lock to ensure consistency between calls to + * dtrace_probe_lookup() and dtrace_probe_create() in the face of + * other threads creating probes. We must drop the provider lock + * before taking this lock to avoid a three-way deadlock with the + * DTrace framework. + */ + mutex_enter(&provider->ftp_cmtx); + + if (name == NULL) { + for (i = 0; i < pdata->ftps_noffs; i++) { + char name_str[17]; + + (void) sprintf(name_str, "%llx", + (unsigned long long)pdata->ftps_offs[i]); + + if (dtrace_probe_lookup(provider->ftp_provid, + pdata->ftps_mod, pdata->ftps_func, name_str) != 0) + continue; + + atomic_inc_32(&fasttrap_total); + + if (fasttrap_total > fasttrap_max) { + atomic_dec_32(&fasttrap_total); + goto no_mem; + } + + pp = kmem_zalloc(sizeof (fasttrap_probe_t), KM_SLEEP); + + pp->ftp_prov = provider; + pp->ftp_faddr = pdata->ftps_pc; + pp->ftp_fsize = pdata->ftps_size; + pp->ftp_pid = pdata->ftps_pid; + pp->ftp_ntps = 1; + + tp = kmem_zalloc(sizeof (fasttrap_tracepoint_t), + KM_SLEEP); + + tp->ftt_proc = provider->ftp_proc; + tp->ftt_pc = pdata->ftps_offs[i] + pdata->ftps_pc; + tp->ftt_pid = pdata->ftps_pid; + + pp->ftp_tps[0].fit_tp = tp; + pp->ftp_tps[0].fit_id.fti_probe = pp; + pp->ftp_tps[0].fit_id.fti_ptype = pdata->ftps_type; + + pp->ftp_id = dtrace_probe_create(provider->ftp_provid, + pdata->ftps_mod, pdata->ftps_func, name_str, + FASTTRAP_OFFSET_AFRAMES, pp); + } + + } else if (dtrace_probe_lookup(provider->ftp_provid, pdata->ftps_mod, + pdata->ftps_func, name) == 0) { + atomic_add_32(&fasttrap_total, pdata->ftps_noffs); + + if (fasttrap_total > fasttrap_max) { + atomic_add_32(&fasttrap_total, -pdata->ftps_noffs); + goto no_mem; + } + + /* + * Make sure all tracepoint program counter values are unique. + * We later assume that each probe has exactly one tracepoint + * for a given pc. + */ + qsort(pdata->ftps_offs, pdata->ftps_noffs, + sizeof (uint64_t), fasttrap_uint64_cmp); + for (i = 1; i < pdata->ftps_noffs; i++) { + if (pdata->ftps_offs[i] > pdata->ftps_offs[i - 1]) + continue; + + atomic_add_32(&fasttrap_total, -pdata->ftps_noffs); + goto no_mem; + } + + ASSERT(pdata->ftps_noffs > 0); + pp = kmem_zalloc(offsetof(fasttrap_probe_t, + ftp_tps[pdata->ftps_noffs]), KM_SLEEP); + + pp->ftp_prov = provider; + pp->ftp_faddr = pdata->ftps_pc; + pp->ftp_fsize = pdata->ftps_size; + pp->ftp_pid = pdata->ftps_pid; + pp->ftp_ntps = pdata->ftps_noffs; + + for (i = 0; i < pdata->ftps_noffs; i++) { + tp = kmem_zalloc(sizeof (fasttrap_tracepoint_t), + KM_SLEEP); + + tp->ftt_proc = provider->ftp_proc; + tp->ftt_pc = pdata->ftps_offs[i] + pdata->ftps_pc; + tp->ftt_pid = pdata->ftps_pid; + + pp->ftp_tps[i].fit_tp = tp; + pp->ftp_tps[i].fit_id.fti_probe = pp; + pp->ftp_tps[i].fit_id.fti_ptype = pdata->ftps_type; + } + + pp->ftp_id = dtrace_probe_create(provider->ftp_provid, + pdata->ftps_mod, pdata->ftps_func, name, aframes, pp); + } + + mutex_exit(&provider->ftp_cmtx); + + /* + * We know that the provider is still valid since we incremented the + * creation reference count. If someone tried to clean up this provider + * while we were using it (e.g. because the process called exec(2) or + * exit(2)), take note of that and try to clean it up now. 
+ */
+ mutex_enter(&provider->ftp_mtx);
+ provider->ftp_ccount--;
+ whack = provider->ftp_retired;
+ mutex_exit(&provider->ftp_mtx);
+
+ if (whack)
+ fasttrap_pid_cleanup();
+
+ return (0);
+
+no_mem:
+ /*
+ * If we've exhausted the allowable resources, we'll try to remove
+ * this provider to free some up. This is to cover the case where
+ * the user has accidentally created many more probes than was
+ * intended (e.g. pid123:::).
+ */
+ mutex_exit(&provider->ftp_cmtx);
+ mutex_enter(&provider->ftp_mtx);
+ provider->ftp_ccount--;
+ provider->ftp_marked = 1;
+ mutex_exit(&provider->ftp_mtx);
+
+ fasttrap_pid_cleanup();
+
+ return (ENOMEM);
+}
+
+/*ARGSUSED*/
+static void *
+fasttrap_meta_provide(void *arg, dtrace_helper_provdesc_t *dhpv, pid_t pid)
+{
+ fasttrap_provider_t *provider;
+
+ /*
+ * A 32-bit unsigned integer (like a pid for example) can be
+ * expressed in 10 or fewer decimal digits. Make sure that we'll
+ * have enough space for the provider name.
+ */
+ if (strlen(dhpv->dthpv_provname) + 10 >=
+ sizeof (provider->ftp_name)) {
+ printf("failed to instantiate provider %s: "
+ "name too long to accommodate pid", dhpv->dthpv_provname);
+ return (NULL);
+ }
+
+ /*
+ * Don't let folks spoof the true pid provider.
+ */
+ if (strcmp(dhpv->dthpv_provname, FASTTRAP_PID_NAME) == 0) {
+ printf("failed to instantiate provider %s: "
+ "%s is an invalid name", dhpv->dthpv_provname,
+ FASTTRAP_PID_NAME);
+ return (NULL);
+ }
+
+ /*
+ * The highest stability class that fasttrap supports is ISA; cap
+ * the stability of the new provider accordingly.
+ */
+ if (dhpv->dthpv_pattr.dtpa_provider.dtat_class > DTRACE_CLASS_ISA)
+ dhpv->dthpv_pattr.dtpa_provider.dtat_class = DTRACE_CLASS_ISA;
+ if (dhpv->dthpv_pattr.dtpa_mod.dtat_class > DTRACE_CLASS_ISA)
+ dhpv->dthpv_pattr.dtpa_mod.dtat_class = DTRACE_CLASS_ISA;
+ if (dhpv->dthpv_pattr.dtpa_func.dtat_class > DTRACE_CLASS_ISA)
+ dhpv->dthpv_pattr.dtpa_func.dtat_class = DTRACE_CLASS_ISA;
+ if (dhpv->dthpv_pattr.dtpa_name.dtat_class > DTRACE_CLASS_ISA)
+ dhpv->dthpv_pattr.dtpa_name.dtat_class = DTRACE_CLASS_ISA;
+ if (dhpv->dthpv_pattr.dtpa_args.dtat_class > DTRACE_CLASS_ISA)
+ dhpv->dthpv_pattr.dtpa_args.dtat_class = DTRACE_CLASS_ISA;
+
+ if ((provider = fasttrap_provider_lookup(pid, dhpv->dthpv_provname,
+ &dhpv->dthpv_pattr)) == NULL) {
+ printf("failed to instantiate provider %s for "
+ "process %u", dhpv->dthpv_provname, (uint_t)pid);
+ return (NULL);
+ }
+
+ /*
+ * Up the meta provider count so this provider isn't removed until
+ * the meta provider has been told to remove it.
+ */
+ provider->ftp_mcount++;
+
+ mutex_exit(&provider->ftp_mtx);
+
+ return (provider);
+}
+
+/*
+ * We know a few things about our context here: we know that the probe being
+ * created doesn't already exist (DTrace won't load DOF at the same address
+ * twice, even if explicitly told to do so) and we know that we are
+ * single-threaded with respect to the meta provider machinery. Knowing that
+ * this is a new probe and that there is no way for us to race with another
+ * operation on this provider allows us an important optimization: we need not
+ * lookup a probe before adding it. Saving this lookup is important because
+ * this code is in the fork path for processes with USDT probes, and lookups
+ * here are potentially very expensive because of long hash conflicts on
+ * module, function and name (DTrace doesn't hash on provider name).
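+ *
+ * For contrast, a sketch of the two paths: fasttrap_add_probe() above
+ * first calls dtrace_probe_lookup(provid, mod, func, name) and skips
+ * creation when the probe already exists, whereas this function calls
+ * dtrace_probe_create() directly with no prior lookup.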
+ */
+/*ARGSUSED*/
+static void
+fasttrap_meta_create_probe(void *arg, void *parg,
+ dtrace_helper_probedesc_t *dhpb)
+{
+ fasttrap_provider_t *provider = parg;
+ fasttrap_probe_t *pp;
+ fasttrap_tracepoint_t *tp;
+ int i, j;
+ uint32_t ntps;
+
+ /*
+ * Since the meta provider count is non-zero we don't have to worry
+ * about this provider disappearing.
+ */
+ ASSERT(provider->ftp_mcount > 0);
+
+ /*
+ * The offsets must be unique.
+ */
+ qsort(dhpb->dthpb_offs, dhpb->dthpb_noffs, sizeof (uint32_t),
+ fasttrap_uint32_cmp);
+ for (i = 1; i < dhpb->dthpb_noffs; i++) {
+ if (dhpb->dthpb_base + dhpb->dthpb_offs[i] <=
+ dhpb->dthpb_base + dhpb->dthpb_offs[i - 1])
+ return;
+ }
+
+ qsort(dhpb->dthpb_enoffs, dhpb->dthpb_nenoffs, sizeof (uint32_t),
+ fasttrap_uint32_cmp);
+ for (i = 1; i < dhpb->dthpb_nenoffs; i++) {
+ if (dhpb->dthpb_base + dhpb->dthpb_enoffs[i] <=
+ dhpb->dthpb_base + dhpb->dthpb_enoffs[i - 1])
+ return;
+ }
+
+ ntps = dhpb->dthpb_noffs + dhpb->dthpb_nenoffs;
+ ASSERT(ntps > 0);
+
+ atomic_add_32(&fasttrap_total, ntps);
+
+ if (fasttrap_total > fasttrap_max) {
+ atomic_add_32(&fasttrap_total, -ntps);
+ return;
+ }
+
+ pp = kmem_zalloc(offsetof(fasttrap_probe_t, ftp_tps[ntps]), KM_SLEEP);
+
+ pp->ftp_prov = provider;
+ pp->ftp_pid = provider->ftp_pid;
+ pp->ftp_ntps = ntps;
+ pp->ftp_nargs = dhpb->dthpb_xargc;
+ pp->ftp_xtypes = dhpb->dthpb_xtypes;
+ pp->ftp_ntypes = dhpb->dthpb_ntypes;
+
+ /*
+ * First create a tracepoint for each actual point of interest.
+ */
+ for (i = 0; i < dhpb->dthpb_noffs; i++) {
+ tp = kmem_zalloc(sizeof (fasttrap_tracepoint_t), KM_SLEEP);
+
+ tp->ftt_proc = provider->ftp_proc;
+ tp->ftt_pc = dhpb->dthpb_base + dhpb->dthpb_offs[i];
+ tp->ftt_pid = provider->ftp_pid;
+
+ pp->ftp_tps[i].fit_tp = tp;
+ pp->ftp_tps[i].fit_id.fti_probe = pp;
+#ifdef __sparc
+ pp->ftp_tps[i].fit_id.fti_ptype = DTFTP_POST_OFFSETS;
+#else
+ pp->ftp_tps[i].fit_id.fti_ptype = DTFTP_OFFSETS;
+#endif
+ }
+
+ /*
+ * Then create a tracepoint for each is-enabled point.
+ */
+ for (j = 0; i < ntps; i++, j++) {
+ tp = kmem_zalloc(sizeof (fasttrap_tracepoint_t), KM_SLEEP);
+
+ tp->ftt_proc = provider->ftp_proc;
+ tp->ftt_pc = dhpb->dthpb_base + dhpb->dthpb_enoffs[j];
+ tp->ftt_pid = provider->ftp_pid;
+
+ pp->ftp_tps[i].fit_tp = tp;
+ pp->ftp_tps[i].fit_id.fti_probe = pp;
+ pp->ftp_tps[i].fit_id.fti_ptype = DTFTP_IS_ENABLED;
+ }
+
+ /*
+ * If the arguments are shuffled around we set the argument remapping
+ * table. Later, when the probe fires, we only remap the arguments
+ * if the table is non-NULL.
+ */
+ for (i = 0; i < dhpb->dthpb_xargc; i++) {
+ if (dhpb->dthpb_args[i] != i) {
+ pp->ftp_argmap = dhpb->dthpb_args;
+ break;
+ }
+ }
+
+ /*
+ * The probe is fully constructed -- register it with DTrace.
+ */
+ pp->ftp_id = dtrace_probe_create(provider->ftp_provid, dhpb->dthpb_mod,
+ dhpb->dthpb_func, dhpb->dthpb_name, FASTTRAP_OFFSET_AFRAMES, pp);
+}
+
+/*ARGSUSED*/
+static void
+fasttrap_meta_remove(void *arg, dtrace_helper_provdesc_t *dhpv, pid_t pid)
+{
+ /*
+ * Clean up the USDT provider. There may be active consumers of the
+ * provider busy adding probes, but no damage will actually befall
+ * the provider until that count has dropped to zero. This just puts
+ * the provider on death row.
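+ *
+ * For orientation, the three provider counts used in this file
+ * (informal summary): ftp_rcount tracks enabled probes, ftp_ccount
+ * tracks consumers mid-way through creating probes, and ftp_mcount
+ * tracks meta-provider references such as the one dropped here.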
+ */ + fasttrap_provider_retire(pid, dhpv->dthpv_provname, 1); +} + +static dtrace_mops_t fasttrap_mops = { + .dtms_create_probe = fasttrap_meta_create_probe, + .dtms_provide_pid = fasttrap_meta_provide, + .dtms_remove_pid = fasttrap_meta_remove +}; + +/*ARGSUSED*/ +static int +fasttrap_open(struct cdev *dev __unused, int oflags __unused, + int devtype __unused, struct thread *td __unused) +{ + return (0); +} + +/*ARGSUSED*/ +static int +fasttrap_ioctl(struct cdev *dev, u_long cmd, caddr_t arg, int fflag, + struct thread *td) +{ + if (!dtrace_attached()) + return (EAGAIN); + + if (cmd == FASTTRAPIOC_MAKEPROBE) { + fasttrap_probe_spec_t *uprobe = *(fasttrap_probe_spec_t **)arg; + fasttrap_probe_spec_t *probe; + uint64_t noffs; + size_t size; + int ret, err; + + if (copyin(&uprobe->ftps_noffs, &noffs, + sizeof (uprobe->ftps_noffs))) + return (EFAULT); + + /* + * Probes must have at least one tracepoint. + */ + if (noffs == 0) + return (EINVAL); + + size = sizeof (fasttrap_probe_spec_t) + + sizeof (probe->ftps_offs[0]) * (noffs - 1); + + if (size > 1024 * 1024) + return (ENOMEM); + + probe = kmem_alloc(size, KM_SLEEP); + + if (copyin(uprobe, probe, size) != 0 || + probe->ftps_noffs != noffs) { + kmem_free(probe, size); + return (EFAULT); + } + + /* + * Verify that the function and module strings contain no + * funny characters. + */ + if (u8_validate(probe->ftps_func, strlen(probe->ftps_func), + NULL, U8_VALIDATE_ENTIRE, &err) < 0) { + ret = EINVAL; + goto err; + } + + if (u8_validate(probe->ftps_mod, strlen(probe->ftps_mod), + NULL, U8_VALIDATE_ENTIRE, &err) < 0) { + ret = EINVAL; + goto err; + } + +#ifdef notyet + if (!PRIV_POLICY_CHOICE(cr, PRIV_ALL, B_FALSE)) { + proc_t *p; + pid_t pid = probe->ftps_pid; + + mutex_enter(&pidlock); + /* + * Report an error if the process doesn't exist + * or is actively being birthed. + */ + if ((p = pfind(pid)) == NULL || p->p_stat == SIDL) { + mutex_exit(&pidlock); + return (ESRCH); + } + mutex_enter(&p->p_lock); + mutex_exit(&pidlock); + + if ((ret = priv_proc_cred_perm(cr, p, NULL, + VREAD | VWRITE)) != 0) { + mutex_exit(&p->p_lock); + return (ret); + } + mutex_exit(&p->p_lock); + } +#endif /* notyet */ + + ret = fasttrap_add_probe(probe); +err: + kmem_free(probe, size); + + return (ret); + + } else if (cmd == FASTTRAPIOC_GETINSTR) { + fasttrap_instr_query_t instr; + fasttrap_tracepoint_t *tp; + uint_t index; +#ifdef notyet + int ret; +#endif + +#ifdef illumos + if (copyin((void *)arg, &instr, sizeof (instr)) != 0) + return (EFAULT); +#endif + +#ifdef notyet + if (!PRIV_POLICY_CHOICE(cr, PRIV_ALL, B_FALSE)) { + proc_t *p; + pid_t pid = instr.ftiq_pid; + + mutex_enter(&pidlock); + /* + * Report an error if the process doesn't exist + * or is actively being birthed. 
+ */
+ if ((p = pfind(pid)) == NULL || p->p_stat == SIDL) {
+ mutex_exit(&pidlock);
+ return (ESRCH);
+ }
+ mutex_enter(&p->p_lock);
+ mutex_exit(&pidlock);
+
+ if ((ret = priv_proc_cred_perm(cr, p, NULL,
+ VREAD)) != 0) {
+ mutex_exit(&p->p_lock);
+ return (ret);
+ }
+
+ mutex_exit(&p->p_lock);
+ }
+#endif /* notyet */
+
+ index = FASTTRAP_TPOINTS_INDEX(instr.ftiq_pid, instr.ftiq_pc);
+
+ mutex_enter(&fasttrap_tpoints.fth_table[index].ftb_mtx);
+ tp = fasttrap_tpoints.fth_table[index].ftb_data;
+ while (tp != NULL) {
+ if (instr.ftiq_pid == tp->ftt_pid &&
+ instr.ftiq_pc == tp->ftt_pc &&
+ tp->ftt_proc->ftpc_acount != 0)
+ break;
+
+ tp = tp->ftt_next;
+ }
+
+ if (tp == NULL) {
+ mutex_exit(&fasttrap_tpoints.fth_table[index].ftb_mtx);
+ return (ENOENT);
+ }
+
+ bcopy(&tp->ftt_instr, &instr.ftiq_instr,
+ sizeof (instr.ftiq_instr));
+ mutex_exit(&fasttrap_tpoints.fth_table[index].ftb_mtx);
+
+ if (copyout(&instr, (void *)arg, sizeof (instr)) != 0)
+ return (EFAULT);
+
+ return (0);
+ }
+
+ return (EINVAL);
+}
+
+static int
+fasttrap_load(void)
+{
+ ulong_t nent;
+ int i, ret;
+
+ /* Create the /dev/dtrace/fasttrap entry. */
+ fasttrap_cdev = make_dev(&fasttrap_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600,
+ "dtrace/fasttrap");
+
+ mtx_init(&fasttrap_cleanup_mtx, "fasttrap clean", "dtrace", MTX_DEF);
+ mutex_init(&fasttrap_count_mtx, "fasttrap count mtx", MUTEX_DEFAULT,
+ NULL);
+
+#ifdef illumos
+ fasttrap_max = ddi_getprop(DDI_DEV_T_ANY, devi, DDI_PROP_DONTPASS,
+ "fasttrap-max-probes", FASTTRAP_MAX_DEFAULT);
+#endif
+ fasttrap_total = 0;
+
+ /*
+ * Conjure up the tracepoints hashtable...
+ */
+#ifdef illumos
+ nent = ddi_getprop(DDI_DEV_T_ANY, devi, DDI_PROP_DONTPASS,
+ "fasttrap-hash-size", FASTTRAP_TPOINTS_DEFAULT_SIZE);
+#else
+ nent = tpoints_hash_size;
+#endif
+
+ if (nent == 0 || nent > 0x1000000)
+ nent = FASTTRAP_TPOINTS_DEFAULT_SIZE;
+
+ tpoints_hash_size = nent;
+
+ if (ISP2(nent))
+ fasttrap_tpoints.fth_nent = nent;
+ else
+ fasttrap_tpoints.fth_nent = 1 << fasttrap_highbit(nent);
+ ASSERT(fasttrap_tpoints.fth_nent > 0);
+ fasttrap_tpoints.fth_mask = fasttrap_tpoints.fth_nent - 1;
+ fasttrap_tpoints.fth_table = kmem_zalloc(fasttrap_tpoints.fth_nent *
+ sizeof (fasttrap_bucket_t), KM_SLEEP);
+#ifndef illumos
+ for (i = 0; i < fasttrap_tpoints.fth_nent; i++)
+ mutex_init(&fasttrap_tpoints.fth_table[i].ftb_mtx,
+ "tracepoints bucket mtx", MUTEX_DEFAULT, NULL);
+#endif
+
+ /*
+ * ... and the providers hash table...
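+ *
+ * (Sizing works as for the tracepoint table above: assuming
+ * fasttrap_highbit() follows the usual highbit() convention of
+ * highbit(1) == 1, a non-power-of-two request such as 1000 buckets
+ * would be rounded up to 1 << 10 = 1024.)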
+ */ + nent = FASTTRAP_PROVIDERS_DEFAULT_SIZE; + if (ISP2(nent)) + fasttrap_provs.fth_nent = nent; + else + fasttrap_provs.fth_nent = 1 << fasttrap_highbit(nent); + ASSERT(fasttrap_provs.fth_nent > 0); + fasttrap_provs.fth_mask = fasttrap_provs.fth_nent - 1; + fasttrap_provs.fth_table = kmem_zalloc(fasttrap_provs.fth_nent * + sizeof (fasttrap_bucket_t), KM_SLEEP); +#ifndef illumos + for (i = 0; i < fasttrap_provs.fth_nent; i++) + mutex_init(&fasttrap_provs.fth_table[i].ftb_mtx, + "providers bucket mtx", MUTEX_DEFAULT, NULL); +#endif + + ret = kproc_create(fasttrap_pid_cleanup_cb, NULL, + &fasttrap_cleanup_proc, 0, 0, "ftcleanup"); + if (ret != 0) { + destroy_dev(fasttrap_cdev); +#ifndef illumos + for (i = 0; i < fasttrap_provs.fth_nent; i++) + mutex_destroy(&fasttrap_provs.fth_table[i].ftb_mtx); + for (i = 0; i < fasttrap_tpoints.fth_nent; i++) + mutex_destroy(&fasttrap_tpoints.fth_table[i].ftb_mtx); +#endif + kmem_free(fasttrap_provs.fth_table, fasttrap_provs.fth_nent * + sizeof (fasttrap_bucket_t)); + mtx_destroy(&fasttrap_cleanup_mtx); + mutex_destroy(&fasttrap_count_mtx); + return (ret); + } + + + /* + * ... and the procs hash table. + */ + nent = FASTTRAP_PROCS_DEFAULT_SIZE; + if (ISP2(nent)) + fasttrap_procs.fth_nent = nent; + else + fasttrap_procs.fth_nent = 1 << fasttrap_highbit(nent); + ASSERT(fasttrap_procs.fth_nent > 0); + fasttrap_procs.fth_mask = fasttrap_procs.fth_nent - 1; + fasttrap_procs.fth_table = kmem_zalloc(fasttrap_procs.fth_nent * + sizeof (fasttrap_bucket_t), KM_SLEEP); +#ifndef illumos + for (i = 0; i < fasttrap_procs.fth_nent; i++) + mutex_init(&fasttrap_procs.fth_table[i].ftb_mtx, + "processes bucket mtx", MUTEX_DEFAULT, NULL); + + rm_init(&fasttrap_tp_lock, "fasttrap tracepoint"); + + /* + * This event handler must run before kdtrace_thread_dtor() since it + * accesses the thread's struct kdtrace_thread. + */ + fasttrap_thread_dtor_tag = EVENTHANDLER_REGISTER(thread_dtor, + fasttrap_thread_dtor, NULL, EVENTHANDLER_PRI_FIRST); +#endif + + /* + * Install our hooks into fork(2), exec(2), and exit(2). + */ + dtrace_fasttrap_fork = &fasttrap_fork; + dtrace_fasttrap_exit = &fasttrap_exec_exit; + dtrace_fasttrap_exec = &fasttrap_exec_exit; + + (void) dtrace_meta_register("fasttrap", &fasttrap_mops, NULL, + &fasttrap_meta_id); + + return (0); +} + +static int +fasttrap_unload(void) +{ + int i, fail = 0; + + /* + * Unregister the meta-provider to make sure no new fasttrap- + * managed providers come along while we're trying to close up + * shop. If we fail to detach, we'll need to re-register as a + * meta-provider. We can fail to unregister as a meta-provider + * if providers we manage still exist. + */ + if (fasttrap_meta_id != DTRACE_METAPROVNONE && + dtrace_meta_unregister(fasttrap_meta_id) != 0) + return (-1); + + /* + * Iterate over all of our providers. If there's still a process + * that corresponds to that pid, fail to detach. + */ + for (i = 0; i < fasttrap_provs.fth_nent; i++) { + fasttrap_provider_t **fpp, *fp; + fasttrap_bucket_t *bucket = &fasttrap_provs.fth_table[i]; + + mutex_enter(&bucket->ftb_mtx); + fpp = (fasttrap_provider_t **)&bucket->ftb_data; + while ((fp = *fpp) != NULL) { + /* + * Acquire and release the lock as a simple way of + * waiting for any other consumer to finish with + * this provider. A thread must first acquire the + * bucket lock so there's no chance of another thread + * blocking on the provider's lock. 
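+ *
+ * Schematically (a sketch of the handshake): a consumer takes
+ * ftb_mtx, then ftp_mtx, then does its work; here, with ftb_mtx
+ * held, the empty mutex_enter/mutex_exit pair on ftp_mtx drains
+ * any such consumer before the provider is unregistered.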
+ */ + mutex_enter(&fp->ftp_mtx); + mutex_exit(&fp->ftp_mtx); + + if (dtrace_unregister(fp->ftp_provid) != 0) { + fail = 1; + fpp = &fp->ftp_next; + } else { + *fpp = fp->ftp_next; + fasttrap_provider_free(fp); + } + } + + mutex_exit(&bucket->ftb_mtx); + } + + if (fail) { + (void) dtrace_meta_register("fasttrap", &fasttrap_mops, NULL, + &fasttrap_meta_id); + + return (-1); + } + + /* + * Stop new processes from entering these hooks now, before the + * fasttrap_cleanup thread runs. That way all processes will hopefully + * be out of these hooks before we free fasttrap_provs.fth_table + */ + ASSERT(dtrace_fasttrap_fork == &fasttrap_fork); + dtrace_fasttrap_fork = NULL; + + ASSERT(dtrace_fasttrap_exec == &fasttrap_exec_exit); + dtrace_fasttrap_exec = NULL; + + ASSERT(dtrace_fasttrap_exit == &fasttrap_exec_exit); + dtrace_fasttrap_exit = NULL; + + mtx_lock(&fasttrap_cleanup_mtx); + fasttrap_cleanup_drain = 1; + /* Wait for the cleanup thread to finish up and signal us. */ + wakeup(&fasttrap_cleanup_cv); + mtx_sleep(&fasttrap_cleanup_drain, &fasttrap_cleanup_mtx, 0, "ftcld", + 0); + fasttrap_cleanup_proc = NULL; + mtx_destroy(&fasttrap_cleanup_mtx); + +#ifdef DEBUG + mutex_enter(&fasttrap_count_mtx); + ASSERT(fasttrap_pid_count == 0); + mutex_exit(&fasttrap_count_mtx); +#endif + +#ifndef illumos + EVENTHANDLER_DEREGISTER(thread_dtor, fasttrap_thread_dtor_tag); + + for (i = 0; i < fasttrap_tpoints.fth_nent; i++) + mutex_destroy(&fasttrap_tpoints.fth_table[i].ftb_mtx); + for (i = 0; i < fasttrap_provs.fth_nent; i++) + mutex_destroy(&fasttrap_provs.fth_table[i].ftb_mtx); + for (i = 0; i < fasttrap_procs.fth_nent; i++) + mutex_destroy(&fasttrap_procs.fth_table[i].ftb_mtx); +#endif + kmem_free(fasttrap_tpoints.fth_table, + fasttrap_tpoints.fth_nent * sizeof (fasttrap_bucket_t)); + fasttrap_tpoints.fth_nent = 0; + + kmem_free(fasttrap_provs.fth_table, + fasttrap_provs.fth_nent * sizeof (fasttrap_bucket_t)); + fasttrap_provs.fth_nent = 0; + + kmem_free(fasttrap_procs.fth_table, + fasttrap_procs.fth_nent * sizeof (fasttrap_bucket_t)); + fasttrap_procs.fth_nent = 0; + +#ifndef illumos + destroy_dev(fasttrap_cdev); + mutex_destroy(&fasttrap_count_mtx); + rm_destroy(&fasttrap_tp_lock); +#endif + + return (0); +} + +/* ARGSUSED */ +static int +fasttrap_modevent(module_t mod __unused, int type, void *data __unused) +{ + int error = 0; + + switch (type) { + case MOD_LOAD: + break; + + case MOD_UNLOAD: + break; + + case MOD_SHUTDOWN: + break; + + default: + error = EOPNOTSUPP; + break; + } + return (error); +} + +SYSINIT(fasttrap_load, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY, fasttrap_load, + NULL); +SYSUNINIT(fasttrap_unload, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY, + fasttrap_unload, NULL); + +DEV_MODULE(fasttrap, fasttrap_modevent, NULL); +MODULE_VERSION(fasttrap, 1); +MODULE_DEPEND(fasttrap, dtrace, 1, 1, 1); +MODULE_DEPEND(fasttrap, opensolaris, 1, 1, 1); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c new file mode 100644 index 000000000000..313e5a351948 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c @@ -0,0 +1,95 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/proc.h> +#include <sys/taskq.h> +#include <sys/vnode.h> + +/* Extensible attribute (xva) routines. */ + +/* + * Zero out the structure, set the size of the requested/returned bitmaps, + * set AT_XVATTR in the embedded vattr_t's va_mask, and set up the pointer + * to the returned attributes array. + */ +void +xva_init(xvattr_t *xvap) +{ + bzero(xvap, sizeof (xvattr_t)); + xvap->xva_mapsize = XVA_MAPSIZE; + xvap->xva_magic = XVA_MAGIC; + xvap->xva_vattr.va_mask = AT_XVATTR; + xvap->xva_rtnattrmapp = &(xvap->xva_rtnattrmap)[0]; +} + +/* + * If AT_XVATTR is set, returns a pointer to the embedded xoptattr_t + * structure. Otherwise, returns NULL. + */ +xoptattr_t * +xva_getxoptattr(xvattr_t *xvap) +{ + xoptattr_t *xoap = NULL; + if (xvap->xva_vattr.va_mask & AT_XVATTR) + xoap = &xvap->xva_xoptattrs; + return (xoap); +} + +/* + * Like vn_rele() except if we are going to call VOP_INACTIVE() then do it + * asynchronously using a taskq. This can avoid deadlocks caused by re-entering + * the file system as a result of releasing the vnode. Note, file systems + * already have to handle the race where the vnode is incremented before the + * inactive routine is called and does its locking. + * + * Warning: Excessive use of this routine can lead to performance problems. + * This is because taskqs throttle back allocation if too many are created. + */ +void +vn_rele_async(vnode_t *vp, taskq_t *taskq) +{ + VERIFY(vp->v_count > 0); + if (refcount_release_if_not_last(&vp->v_usecount)) { + vdrop(vp); + return; + } + VERIFY(taskq_dispatch((taskq_t *)taskq, + (task_func_t *)vrele, vp, TQ_SLEEP) != 0); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.cityhash b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.cityhash new file mode 100644 index 000000000000..e558b2a50358 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.cityhash @@ -0,0 +1,19 @@ +Copyright (c) 2011 Google, Inc. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.cityhash.descrip b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.cityhash.descrip new file mode 100644 index 000000000000..f98cb76dfc91 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.cityhash.descrip @@ -0,0 +1 @@ +CITYHASH CHECKSUM FUNCTIONALITY IN ZFS diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.lz4 b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.lz4 new file mode 100644 index 000000000000..722cc75f01e9 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.lz4 @@ -0,0 +1,30 @@ +LZ4 - Fast LZ compression algorithm +Copyright (C) 2011-2013, Yann Collet. +BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +You can contact the author at : +- LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html +- LZ4 source repository : http://code.google.com/p/lz4/ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.lz4.descrip b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.lz4.descrip new file mode 100644 index 000000000000..211f679b5749 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.lz4.descrip @@ -0,0 +1 @@ +LZ4 COMPRESSION FUNCTIONALITY IN ZFS diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/abd.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/abd.c new file mode 100644 index 000000000000..3730d53f5eb4 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/abd.c @@ -0,0 +1,960 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2014 by Chunwei Chen. All rights reserved. + * Copyright (c) 2016 by Delphix. All rights reserved. + */ + +/* + * ARC buffer data (ABD). + * + * ABDs are an abstract data structure for the ARC which can use two + * different ways of storing the underlying data: + * + * (a) Linear buffer. In this case, all the data in the ABD is stored in one + * contiguous buffer in memory (from a zio_[data_]buf_* kmem cache). + * + * +-------------------+ + * | ABD (linear) | + * | abd_flags = ... | + * | abd_size = ... | +--------------------------------+ + * | abd_buf ------------->| raw buffer of size abd_size | + * +-------------------+ +--------------------------------+ + * no abd_chunks + * + * (b) Scattered buffer. In this case, the data in the ABD is split into + * equal-sized chunks (from the abd_chunk_cache kmem_cache), with pointers + * to the chunks recorded in an array at the end of the ABD structure. + * + * +-------------------+ + * | ABD (scattered) | + * | abd_flags = ... | + * | abd_size = ... | + * | abd_offset = 0 | +-----------+ + * | abd_chunks[0] ----------------------------->| chunk 0 | + * | abd_chunks[1] ---------------------+ +-----------+ + * | ... | | +-----------+ + * | abd_chunks[N-1] ---------+ +------->| chunk 1 | + * +-------------------+ | +-----------+ + * | ... + * | +-----------+ + * +----------------->| chunk N-1 | + * +-----------+ + * + * Using a large proportion of scattered ABDs decreases ARC fragmentation since + * when we are at the limit of allocatable space, using equal-size chunks will + * allow us to quickly reclaim enough space for a new large allocation (assuming + * it is also scattered). + * + * In addition to directly allocating a linear or scattered ABD, it is also + * possible to create an ABD by requesting the "sub-ABD" starting at an offset + * within an existing ABD. In linear buffers this is simple (set abd_buf of + * the new ABD to the starting point within the original raw buffer), but + * scattered ABDs are a little more complex. The new ABD makes a copy of the + * relevant abd_chunks pointers (but not the underlying data). 
However, to + * provide arbitrary rather than only chunk-aligned starting offsets, it also + * tracks an abd_offset field which represents the starting point of the data + * within the first chunk in abd_chunks. For both linear and scattered ABDs, + * creating an offset ABD marks the original ABD as the offset's parent, and the + * original ABD's abd_children refcount is incremented. This data allows us to + * ensure the root ABD isn't deleted before its children. + * + * Most consumers should never need to know what type of ABD they're using -- + * the ABD public API ensures that it's possible to transparently switch from + * using a linear ABD to a scattered one when doing so would be beneficial. + * + * If you need to use the data within an ABD directly, if you know it's linear + * (because you allocated it) you can use abd_to_buf() to access the underlying + * raw buffer. Otherwise, you should use one of the abd_borrow_buf* functions + * which will allocate a raw buffer if necessary. Use the abd_return_buf* + * functions to return any raw buffers that are no longer necessary when you're + * done using them. + * + * There are a variety of ABD APIs that implement basic buffer operations: + * compare, copy, read, write, and fill with zeroes. If you need a custom + * function which progressively accesses the whole ABD, use the abd_iterate_* + * functions. + */ + +#include <sys/abd.h> +#include <sys/param.h> +#include <sys/zio.h> +#include <sys/zfs_context.h> +#include <sys/zfs_znode.h> + +typedef struct abd_stats { + kstat_named_t abdstat_struct_size; + kstat_named_t abdstat_scatter_cnt; + kstat_named_t abdstat_scatter_data_size; + kstat_named_t abdstat_scatter_chunk_waste; + kstat_named_t abdstat_linear_cnt; + kstat_named_t abdstat_linear_data_size; +} abd_stats_t; + +static abd_stats_t abd_stats = { + /* Amount of memory occupied by all of the abd_t struct allocations */ + { "struct_size", KSTAT_DATA_UINT64 }, + /* + * The number of scatter ABDs which are currently allocated, excluding + * ABDs which don't own their data (for instance the ones which were + * allocated through abd_get_offset()). + */ + { "scatter_cnt", KSTAT_DATA_UINT64 }, + /* Amount of data stored in all scatter ABDs tracked by scatter_cnt */ + { "scatter_data_size", KSTAT_DATA_UINT64 }, + /* + * The amount of space wasted at the end of the last chunk across all + * scatter ABDs tracked by scatter_cnt. + */ + { "scatter_chunk_waste", KSTAT_DATA_UINT64 }, + /* + * The number of linear ABDs which are currently allocated, excluding + * ABDs which don't own their data (for instance the ones which were + * allocated through abd_get_offset() and abd_get_from_buf()). If an + * ABD takes ownership of its buf then it will become tracked. + */ + { "linear_cnt", KSTAT_DATA_UINT64 }, + /* Amount of data stored in all linear ABDs tracked by linear_cnt */ + { "linear_data_size", KSTAT_DATA_UINT64 }, +}; + +#define ABDSTAT(stat) (abd_stats.stat.value.ui64) +#define ABDSTAT_INCR(stat, val) \ + atomic_add_64(&abd_stats.stat.value.ui64, (val)) +#define ABDSTAT_BUMP(stat) ABDSTAT_INCR(stat, 1) +#define ABDSTAT_BUMPDOWN(stat) ABDSTAT_INCR(stat, -1) + +/* + * It is possible to make all future ABDs be linear by setting this to B_FALSE. + * Otherwise, ABDs are allocated scattered by default unless the caller uses + * abd_alloc_linear(). + */ +boolean_t zfs_abd_scatter_enabled = B_TRUE; + +/* + * The size of the chunks ABD allocates. Because the sizes allocated from the + * kmem_cache can't change, this tunable can only be modified at boot. 
Changing + * it at runtime would cause ABD iteration to work incorrectly for ABDs which + * were allocated with the old size, so a safeguard has been put in place which + * will cause the machine to panic if you change it and try to access the data + * within a scattered ABD. + */ +size_t zfs_abd_chunk_size = 4096; + +#if defined(__FreeBSD__) && defined(_KERNEL) +SYSCTL_DECL(_vfs_zfs); + +SYSCTL_INT(_vfs_zfs, OID_AUTO, abd_scatter_enabled, CTLFLAG_RWTUN, + &zfs_abd_scatter_enabled, 0, "Enable scattered ARC data buffers"); +SYSCTL_ULONG(_vfs_zfs, OID_AUTO, abd_chunk_size, CTLFLAG_RDTUN, + &zfs_abd_chunk_size, 0, "The size of the chunks ABD allocates"); +#endif + +#ifdef _KERNEL +extern vmem_t *zio_alloc_arena; +#endif + +kmem_cache_t *abd_chunk_cache; +static kstat_t *abd_ksp; + +extern inline boolean_t abd_is_linear(abd_t *abd); +extern inline void abd_copy(abd_t *dabd, abd_t *sabd, size_t size); +extern inline void abd_copy_from_buf(abd_t *abd, const void *buf, size_t size); +extern inline void abd_copy_to_buf(void* buf, abd_t *abd, size_t size); +extern inline int abd_cmp_buf(abd_t *abd, const void *buf, size_t size); +extern inline void abd_zero(abd_t *abd, size_t size); + +static void * +abd_alloc_chunk() +{ + void *c = kmem_cache_alloc(abd_chunk_cache, KM_PUSHPAGE); + ASSERT3P(c, !=, NULL); + return (c); +} + +static void +abd_free_chunk(void *c) +{ + kmem_cache_free(abd_chunk_cache, c); +} + +void +abd_init(void) +{ +#ifdef illumos + vmem_t *data_alloc_arena = NULL; + +#ifdef _KERNEL + data_alloc_arena = zio_alloc_arena; +#endif + + /* + * Since ABD chunks do not appear in crash dumps, we pass KMC_NOTOUCH + * so that no allocator metadata is stored with the buffers. + */ + abd_chunk_cache = kmem_cache_create("abd_chunk", zfs_abd_chunk_size, 0, + NULL, NULL, NULL, NULL, data_alloc_arena, KMC_NOTOUCH); +#else + abd_chunk_cache = kmem_cache_create("abd_chunk", zfs_abd_chunk_size, 0, + NULL, NULL, NULL, NULL, 0, KMC_NOTOUCH | KMC_NODEBUG); +#endif + abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED, + sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); + if (abd_ksp != NULL) { + abd_ksp->ks_data = &abd_stats; + kstat_install(abd_ksp); + } +} + +void +abd_fini(void) +{ + if (abd_ksp != NULL) { + kstat_delete(abd_ksp); + abd_ksp = NULL; + } + + kmem_cache_destroy(abd_chunk_cache); + abd_chunk_cache = NULL; +} + +static inline size_t +abd_chunkcnt_for_bytes(size_t size) +{ + return (P2ROUNDUP(size, zfs_abd_chunk_size) / zfs_abd_chunk_size); +} + +static inline size_t +abd_scatter_chunkcnt(abd_t *abd) +{ + ASSERT(!abd_is_linear(abd)); + return (abd_chunkcnt_for_bytes( + abd->abd_u.abd_scatter.abd_offset + abd->abd_size)); +} + +static inline void +abd_verify(abd_t *abd) +{ + ASSERT3U(abd->abd_size, >, 0); + ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE); + ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR | + ABD_FLAG_OWNER | ABD_FLAG_META)); + IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER)); + IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER); + if (abd_is_linear(abd)) { + ASSERT3P(abd->abd_u.abd_linear.abd_buf, !=, NULL); + } else { + ASSERT3U(abd->abd_u.abd_scatter.abd_offset, <, + zfs_abd_chunk_size); + size_t n = abd_scatter_chunkcnt(abd); + for (int i = 0; i < n; i++) { + ASSERT3P( + abd->abd_u.abd_scatter.abd_chunks[i], !=, NULL); + } + } +} + +static inline abd_t * +abd_alloc_struct(size_t chunkcnt) +{ + size_t size = offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]); + abd_t *abd = 
kmem_alloc(size, KM_PUSHPAGE); + ASSERT3P(abd, !=, NULL); + ABDSTAT_INCR(abdstat_struct_size, size); + + return (abd); +} + +static inline void +abd_free_struct(abd_t *abd) +{ + size_t chunkcnt = abd_is_linear(abd) ? 0 : abd_scatter_chunkcnt(abd); + int size = offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]); + kmem_free(abd, size); + ABDSTAT_INCR(abdstat_struct_size, -size); +} + +/* + * Allocate an ABD, along with its own underlying data buffers. Use this if you + * don't care whether the ABD is linear or not. + */ +abd_t * +abd_alloc(size_t size, boolean_t is_metadata) +{ + if (!zfs_abd_scatter_enabled || size <= zfs_abd_chunk_size) + return (abd_alloc_linear(size, is_metadata)); + + VERIFY3U(size, <=, SPA_MAXBLOCKSIZE); + + size_t n = abd_chunkcnt_for_bytes(size); + abd_t *abd = abd_alloc_struct(n); + + abd->abd_flags = ABD_FLAG_OWNER; + if (is_metadata) { + abd->abd_flags |= ABD_FLAG_META; + } + abd->abd_size = size; + abd->abd_parent = NULL; + refcount_create(&abd->abd_children); + + abd->abd_u.abd_scatter.abd_offset = 0; + abd->abd_u.abd_scatter.abd_chunk_size = zfs_abd_chunk_size; + + for (int i = 0; i < n; i++) { + void *c = abd_alloc_chunk(); + ASSERT3P(c, !=, NULL); + abd->abd_u.abd_scatter.abd_chunks[i] = c; + } + + ABDSTAT_BUMP(abdstat_scatter_cnt); + ABDSTAT_INCR(abdstat_scatter_data_size, size); + ABDSTAT_INCR(abdstat_scatter_chunk_waste, + n * zfs_abd_chunk_size - size); + + return (abd); +} + +static void +abd_free_scatter(abd_t *abd) +{ + size_t n = abd_scatter_chunkcnt(abd); + for (int i = 0; i < n; i++) { + abd_free_chunk(abd->abd_u.abd_scatter.abd_chunks[i]); + } + + refcount_destroy(&abd->abd_children); + ABDSTAT_BUMPDOWN(abdstat_scatter_cnt); + ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size); + ABDSTAT_INCR(abdstat_scatter_chunk_waste, + abd->abd_size - n * zfs_abd_chunk_size); + + abd_free_struct(abd); +} + +/* + * Allocate an ABD that must be linear, along with its own underlying data + * buffer. Only use this when it would be very annoying to write your ABD + * consumer with a scattered ABD. + */ +abd_t * +abd_alloc_linear(size_t size, boolean_t is_metadata) +{ + abd_t *abd = abd_alloc_struct(0); + + VERIFY3U(size, <=, SPA_MAXBLOCKSIZE); + + abd->abd_flags = ABD_FLAG_LINEAR | ABD_FLAG_OWNER; + if (is_metadata) { + abd->abd_flags |= ABD_FLAG_META; + } + abd->abd_size = size; + abd->abd_parent = NULL; + refcount_create(&abd->abd_children); + + if (is_metadata) { + abd->abd_u.abd_linear.abd_buf = zio_buf_alloc(size); + } else { + abd->abd_u.abd_linear.abd_buf = zio_data_buf_alloc(size); + } + + ABDSTAT_BUMP(abdstat_linear_cnt); + ABDSTAT_INCR(abdstat_linear_data_size, size); + + return (abd); +} + +static void +abd_free_linear(abd_t *abd) +{ + if (abd->abd_flags & ABD_FLAG_META) { + zio_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size); + } else { + zio_data_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size); + } + + refcount_destroy(&abd->abd_children); + ABDSTAT_BUMPDOWN(abdstat_linear_cnt); + ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size); + + abd_free_struct(abd); +} + +/* + * Free an ABD. Only use this on ABDs allocated with abd_alloc() or + * abd_alloc_linear(). + */ +void +abd_free(abd_t *abd) +{ + abd_verify(abd); + ASSERT3P(abd->abd_parent, ==, NULL); + ASSERT(abd->abd_flags & ABD_FLAG_OWNER); + if (abd_is_linear(abd)) + abd_free_linear(abd); + else + abd_free_scatter(abd); +} + +/* + * Allocate an ABD of the same format (same metadata flag, same scatterize + * setting) as another ABD. 
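+ *
+ * An illustrative sketch (editor's addition, not from the original source)
+ * of cloning a buffer's shape; "src" is assumed to be an existing ABD:
+ *
+ *	abd_t *dst = abd_alloc_sametype(src, src->abd_size);
+ *	abd_copy(dst, src, src->abd_size);
+ *	...
+ *	abd_free(dst);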
+ */ +abd_t * +abd_alloc_sametype(abd_t *sabd, size_t size) +{ + boolean_t is_metadata = (sabd->abd_flags & ABD_FLAG_META) != 0; + if (abd_is_linear(sabd)) { + return (abd_alloc_linear(size, is_metadata)); + } else { + return (abd_alloc(size, is_metadata)); + } +} + +/* + * If we're going to use this ABD for doing I/O using the block layer, the + * consumer of the ABD data doesn't care if it's scattered or not, and we don't + * plan to store this ABD in memory for a long period of time, we should + * allocate the ABD type that requires the least data copying to do the I/O. + * + * Currently this is linear ABDs, however if ldi_strategy() can ever issue I/Os + * using a scatter/gather list we should switch to that and replace this call + * with vanilla abd_alloc(). + */ +abd_t * +abd_alloc_for_io(size_t size, boolean_t is_metadata) +{ + return (abd_alloc_linear(size, is_metadata)); +} + +/* + * Allocate a new ABD to point to offset off of sabd. It shares the underlying + * buffer data with sabd. Use abd_put() to free. sabd must not be freed while + * any derived ABDs exist. + */ +abd_t * +abd_get_offset(abd_t *sabd, size_t off) +{ + abd_t *abd; + + abd_verify(sabd); + ASSERT3U(off, <=, sabd->abd_size); + + if (abd_is_linear(sabd)) { + abd = abd_alloc_struct(0); + + /* + * Even if this buf is filesystem metadata, we only track that + * if we own the underlying data buffer, which is not true in + * this case. Therefore, we don't ever use ABD_FLAG_META here. + */ + abd->abd_flags = ABD_FLAG_LINEAR; + + abd->abd_u.abd_linear.abd_buf = + (char *)sabd->abd_u.abd_linear.abd_buf + off; + } else { + size_t new_offset = sabd->abd_u.abd_scatter.abd_offset + off; + size_t chunkcnt = abd_scatter_chunkcnt(sabd) - + (new_offset / zfs_abd_chunk_size); + + abd = abd_alloc_struct(chunkcnt); + + /* + * Even if this buf is filesystem metadata, we only track that + * if we own the underlying data buffer, which is not true in + * this case. Therefore, we don't ever use ABD_FLAG_META here. + */ + abd->abd_flags = 0; + + abd->abd_u.abd_scatter.abd_offset = + new_offset % zfs_abd_chunk_size; + abd->abd_u.abd_scatter.abd_chunk_size = zfs_abd_chunk_size; + + /* Copy the scatterlist starting at the correct offset */ + (void) memcpy(&abd->abd_u.abd_scatter.abd_chunks, + &sabd->abd_u.abd_scatter.abd_chunks[new_offset / + zfs_abd_chunk_size], + chunkcnt * sizeof (void *)); + } + + abd->abd_size = sabd->abd_size - off; + abd->abd_parent = sabd; + refcount_create(&abd->abd_children); + (void) refcount_add_many(&sabd->abd_children, abd->abd_size, abd); + + return (abd); +} + +/* + * Allocate a linear ABD structure for buf. You must free this with abd_put() + * since the resulting ABD doesn't own its own buffer. + */ +abd_t * +abd_get_from_buf(void *buf, size_t size) +{ + abd_t *abd = abd_alloc_struct(0); + + VERIFY3U(size, <=, SPA_MAXBLOCKSIZE); + + /* + * Even if this buf is filesystem metadata, we only track that if we + * own the underlying data buffer, which is not true in this case. + * Therefore, we don't ever use ABD_FLAG_META here. + */ + abd->abd_flags = ABD_FLAG_LINEAR; + abd->abd_size = size; + abd->abd_parent = NULL; + refcount_create(&abd->abd_children); + + abd->abd_u.abd_linear.abd_buf = buf; + + return (abd); +} + +/* + * Free an ABD allocated from abd_get_offset() or abd_get_from_buf(). Will not + * free the underlying scatterlist or buffer. 
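+ *
+ * A usage sketch (editor's illustration; "buf" and "size" are assumed to
+ * come from the caller): wrapping an existing raw buffer for the duration
+ * of one operation, without copying and without transferring ownership:
+ *
+ *	abd_t *abd = abd_get_from_buf(buf, size);
+ *	... hand abd to an ABD-aware consumer ...
+ *	abd_put(abd);
+ *
+ * buf itself survives the abd_put() and must still be freed by its owner.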
+ */ +void +abd_put(abd_t *abd) +{ + abd_verify(abd); + ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER)); + + if (abd->abd_parent != NULL) { + (void) refcount_remove_many(&abd->abd_parent->abd_children, + abd->abd_size, abd); + } + + refcount_destroy(&abd->abd_children); + abd_free_struct(abd); +} + +/* + * Get the raw buffer associated with a linear ABD. + */ +void * +abd_to_buf(abd_t *abd) +{ + ASSERT(abd_is_linear(abd)); + abd_verify(abd); + return (abd->abd_u.abd_linear.abd_buf); +} + +/* + * Borrow a raw buffer from an ABD without copying the contents of the ABD + * into the buffer. If the ABD is scattered, this will allocate a raw buffer + * whose contents are undefined. To copy over the existing data in the ABD, use + * abd_borrow_buf_copy() instead. + */ +void * +abd_borrow_buf(abd_t *abd, size_t n) +{ + void *buf; + abd_verify(abd); + ASSERT3U(abd->abd_size, >=, n); + if (abd_is_linear(abd)) { + buf = abd_to_buf(abd); + } else { + buf = zio_buf_alloc(n); + } + (void) refcount_add_many(&abd->abd_children, n, buf); + + return (buf); +} + +void * +abd_borrow_buf_copy(abd_t *abd, size_t n) +{ + void *buf = abd_borrow_buf(abd, n); + if (!abd_is_linear(abd)) { + abd_copy_to_buf(buf, abd, n); + } + return (buf); +} + +/* + * Return a borrowed raw buffer to an ABD. If the ABD is scattered, this will + * not change the contents of the ABD and will ASSERT that you didn't modify + * the buffer since it was borrowed. If you want any changes you made to buf to + * be copied back to abd, use abd_return_buf_copy() instead. + */ +void +abd_return_buf(abd_t *abd, void *buf, size_t n) +{ + abd_verify(abd); + ASSERT3U(abd->abd_size, >=, n); + if (abd_is_linear(abd)) { + ASSERT3P(buf, ==, abd_to_buf(abd)); + } else { + ASSERT0(abd_cmp_buf(abd, buf, n)); + zio_buf_free(buf, n); + } + (void) refcount_remove_many(&abd->abd_children, n, buf); +} + +void +abd_return_buf_copy(abd_t *abd, void *buf, size_t n) +{ + if (!abd_is_linear(abd)) { + abd_copy_from_buf(abd, buf, n); + } + abd_return_buf(abd, buf, n); +} + +/* + * Give this ABD ownership of the buffer that it's storing. Can only be used on + * linear ABDs which were allocated via abd_get_from_buf(), or ones allocated + * with abd_alloc_linear() which subsequently released ownership of their buf + * with abd_release_ownership_of_buf(). 
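+ *
+ * Editor's sketch of the intended hand-off (illustrative only; the buffer
+ * below is allocated with the metadata allocator, so B_TRUE is passed to
+ * keep the eventual free path symmetric):
+ *
+ *	void *buf = zio_buf_alloc(size);
+ *	abd_t *abd = abd_get_from_buf(buf, size);
+ *	abd_take_ownership_of_buf(abd, B_TRUE);
+ *	...
+ *	abd_free(abd);
+ *
+ * After the ownership transfer, abd_free() releases buf as well.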
+ */ +void +abd_take_ownership_of_buf(abd_t *abd, boolean_t is_metadata) +{ + ASSERT(abd_is_linear(abd)); + ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER)); + abd_verify(abd); + + abd->abd_flags |= ABD_FLAG_OWNER; + if (is_metadata) { + abd->abd_flags |= ABD_FLAG_META; + } + + ABDSTAT_BUMP(abdstat_linear_cnt); + ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size); +} + +void +abd_release_ownership_of_buf(abd_t *abd) +{ + ASSERT(abd_is_linear(abd)); + ASSERT(abd->abd_flags & ABD_FLAG_OWNER); + abd_verify(abd); + + abd->abd_flags &= ~ABD_FLAG_OWNER; + /* Disable this flag since we no longer own the data buffer */ + abd->abd_flags &= ~ABD_FLAG_META; + + ABDSTAT_BUMPDOWN(abdstat_linear_cnt); + ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size); +} + +struct abd_iter { + abd_t *iter_abd; /* ABD being iterated through */ + size_t iter_pos; /* position (relative to abd_offset) */ + void *iter_mapaddr; /* addr corresponding to iter_pos */ + size_t iter_mapsize; /* length of data valid at mapaddr */ +}; + +static inline size_t +abd_iter_scatter_chunk_offset(struct abd_iter *aiter) +{ + ASSERT(!abd_is_linear(aiter->iter_abd)); + return ((aiter->iter_abd->abd_u.abd_scatter.abd_offset + + aiter->iter_pos) % zfs_abd_chunk_size); +} + +static inline size_t +abd_iter_scatter_chunk_index(struct abd_iter *aiter) +{ + ASSERT(!abd_is_linear(aiter->iter_abd)); + return ((aiter->iter_abd->abd_u.abd_scatter.abd_offset + + aiter->iter_pos) / zfs_abd_chunk_size); +} + +/* + * Initialize the abd_iter. + */ +static void +abd_iter_init(struct abd_iter *aiter, abd_t *abd) +{ + abd_verify(abd); + aiter->iter_abd = abd; + aiter->iter_pos = 0; + aiter->iter_mapaddr = NULL; + aiter->iter_mapsize = 0; +} + +/* + * Advance the iterator by a certain amount. Cannot be called when a chunk is + * in use. This can be safely called when the aiter has already exhausted, in + * which case this does nothing. + */ +static void +abd_iter_advance(struct abd_iter *aiter, size_t amount) +{ + ASSERT3P(aiter->iter_mapaddr, ==, NULL); + ASSERT0(aiter->iter_mapsize); + + /* There's nothing left to advance to, so do nothing */ + if (aiter->iter_pos == aiter->iter_abd->abd_size) + return; + + aiter->iter_pos += amount; +} + +/* + * Map the current chunk into aiter. This can be safely called when the aiter + * has already exhausted, in which case this does nothing. + */ +static void +abd_iter_map(struct abd_iter *aiter) +{ + void *paddr; + size_t offset = 0; + + ASSERT3P(aiter->iter_mapaddr, ==, NULL); + ASSERT0(aiter->iter_mapsize); + + /* Panic if someone has changed zfs_abd_chunk_size */ + IMPLY(!abd_is_linear(aiter->iter_abd), zfs_abd_chunk_size == + aiter->iter_abd->abd_u.abd_scatter.abd_chunk_size); + + /* There's nothing left to iterate over, so do nothing */ + if (aiter->iter_pos == aiter->iter_abd->abd_size) + return; + + if (abd_is_linear(aiter->iter_abd)) { + offset = aiter->iter_pos; + aiter->iter_mapsize = aiter->iter_abd->abd_size - offset; + paddr = aiter->iter_abd->abd_u.abd_linear.abd_buf; + } else { + size_t index = abd_iter_scatter_chunk_index(aiter); + offset = abd_iter_scatter_chunk_offset(aiter); + aiter->iter_mapsize = zfs_abd_chunk_size - offset; + paddr = aiter->iter_abd->abd_u.abd_scatter.abd_chunks[index]; + } + aiter->iter_mapaddr = (char *)paddr + offset; +} + +/* + * Unmap the current chunk from aiter. This can be safely called when the aiter + * has already exhausted, in which case this does nothing. 
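+ *
+ * For reference, the map/unmap pairing mirrors the loop in
+ * abd_iterate_func() below, roughly:
+ *
+ *	abd_iter_init(&aiter, abd);
+ *	while (size > 0) {
+ *		abd_iter_map(&aiter);
+ *		... use aiter.iter_mapaddr for up to aiter.iter_mapsize ...
+ *		abd_iter_unmap(&aiter);
+ *		abd_iter_advance(&aiter, len);
+ *	}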
+ */ +static void +abd_iter_unmap(struct abd_iter *aiter) +{ + /* There's nothing left to unmap, so do nothing */ + if (aiter->iter_pos == aiter->iter_abd->abd_size) + return; + + ASSERT3P(aiter->iter_mapaddr, !=, NULL); + ASSERT3U(aiter->iter_mapsize, >, 0); + + aiter->iter_mapaddr = NULL; + aiter->iter_mapsize = 0; +} + +int +abd_iterate_func(abd_t *abd, size_t off, size_t size, + abd_iter_func_t *func, void *private) +{ + int ret = 0; + struct abd_iter aiter; + + abd_verify(abd); + ASSERT3U(off + size, <=, abd->abd_size); + + abd_iter_init(&aiter, abd); + abd_iter_advance(&aiter, off); + + while (size > 0) { + abd_iter_map(&aiter); + + size_t len = MIN(aiter.iter_mapsize, size); + ASSERT3U(len, >, 0); + + ret = func(aiter.iter_mapaddr, len, private); + + abd_iter_unmap(&aiter); + + if (ret != 0) + break; + + size -= len; + abd_iter_advance(&aiter, len); + } + + return (ret); +} + +struct buf_arg { + void *arg_buf; +}; + +static int +abd_copy_to_buf_off_cb(void *buf, size_t size, void *private) +{ + struct buf_arg *ba_ptr = private; + + (void) memcpy(ba_ptr->arg_buf, buf, size); + ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size; + + return (0); +} + +/* + * Copy abd to buf. (off is the offset in abd.) + */ +void +abd_copy_to_buf_off(void *buf, abd_t *abd, size_t off, size_t size) +{ + struct buf_arg ba_ptr = { buf }; + + (void) abd_iterate_func(abd, off, size, abd_copy_to_buf_off_cb, + &ba_ptr); +} + +static int +abd_cmp_buf_off_cb(void *buf, size_t size, void *private) +{ + int ret; + struct buf_arg *ba_ptr = private; + + ret = memcmp(buf, ba_ptr->arg_buf, size); + ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size; + + return (ret); +} + +/* + * Compare the contents of abd to buf. (off is the offset in abd.) + */ +int +abd_cmp_buf_off(abd_t *abd, const void *buf, size_t off, size_t size) +{ + struct buf_arg ba_ptr = { (void *) buf }; + + return (abd_iterate_func(abd, off, size, abd_cmp_buf_off_cb, &ba_ptr)); +} + +static int +abd_copy_from_buf_off_cb(void *buf, size_t size, void *private) +{ + struct buf_arg *ba_ptr = private; + + (void) memcpy(buf, ba_ptr->arg_buf, size); + ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size; + + return (0); +} + +/* + * Copy from buf to abd. (off is the offset in abd.) + */ +void +abd_copy_from_buf_off(abd_t *abd, const void *buf, size_t off, size_t size) +{ + struct buf_arg ba_ptr = { (void *) buf }; + + (void) abd_iterate_func(abd, off, size, abd_copy_from_buf_off_cb, + &ba_ptr); +} + +/*ARGSUSED*/ +static int +abd_zero_off_cb(void *buf, size_t size, void *private) +{ + (void) memset(buf, 0, size); + return (0); +} + +/* + * Zero out the abd from a particular offset to the end. + */ +void +abd_zero_off(abd_t *abd, size_t off, size_t size) +{ + (void) abd_iterate_func(abd, off, size, abd_zero_off_cb, NULL); +} + +/* + * Iterate over two ABDs and call func incrementally on the two ABDs' data in + * equal-sized chunks (passed to func as raw buffers). func could be called many + * times during this iteration. 
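+ *
+ * A hypothetical callback (editor's sketch; xor_into_cb is not part of
+ * this file) that XORs the source into the destination, in the style of
+ * the abd_iter_func2_t callbacks defined below:
+ *
+ *	static int
+ *	xor_into_cb(void *dbuf, void *sbuf, size_t size, void *private)
+ *	{
+ *		uint8_t *d = dbuf;
+ *		const uint8_t *s = sbuf;
+ *
+ *		for (size_t i = 0; i < size; i++)
+ *			d[i] ^= s[i];
+ *		return (0);
+ *	}
+ *
+ *	(void) abd_iterate_func2(dabd, sabd, 0, 0, size, xor_into_cb, NULL);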
+ */ +int +abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, + size_t size, abd_iter_func2_t *func, void *private) +{ + int ret = 0; + struct abd_iter daiter, saiter; + + abd_verify(dabd); + abd_verify(sabd); + + ASSERT3U(doff + size, <=, dabd->abd_size); + ASSERT3U(soff + size, <=, sabd->abd_size); + + abd_iter_init(&daiter, dabd); + abd_iter_init(&saiter, sabd); + abd_iter_advance(&daiter, doff); + abd_iter_advance(&saiter, soff); + + while (size > 0) { + abd_iter_map(&daiter); + abd_iter_map(&saiter); + + size_t dlen = MIN(daiter.iter_mapsize, size); + size_t slen = MIN(saiter.iter_mapsize, size); + size_t len = MIN(dlen, slen); + ASSERT(dlen > 0 || slen > 0); + + ret = func(daiter.iter_mapaddr, saiter.iter_mapaddr, len, + private); + + abd_iter_unmap(&saiter); + abd_iter_unmap(&daiter); + + if (ret != 0) + break; + + size -= len; + abd_iter_advance(&daiter, len); + abd_iter_advance(&saiter, len); + } + + return (ret); +} + +/*ARGSUSED*/ +static int +abd_copy_off_cb(void *dbuf, void *sbuf, size_t size, void *private) +{ + (void) memcpy(dbuf, sbuf, size); + return (0); +} + +/* + * Copy from sabd to dabd starting from soff and doff. + */ +void +abd_copy_off(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, size_t size) +{ + (void) abd_iterate_func2(dabd, sabd, doff, soff, size, + abd_copy_off_cb, NULL); +} + +/*ARGSUSED*/ +static int +abd_cmp_cb(void *bufa, void *bufb, size_t size, void *private) +{ + return (memcmp(bufa, bufb, size)); +} + +/* + * Compares the first size bytes of two ABDs. + */ +int +abd_cmp(abd_t *dabd, abd_t *sabd, size_t size) +{ + return (abd_iterate_func2(dabd, sabd, 0, 0, size, abd_cmp_cb, NULL)); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/aggsum.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/aggsum.c new file mode 100644 index 000000000000..814f6c276b0a --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/aggsum.c @@ -0,0 +1,232 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2017 by Delphix. All rights reserved. + */ + +#include <sys/zfs_context.h> +#include <sys/aggsum.h> + +/* + * Aggregate-sum counters are a form of fanned-out counter, used when atomic + * instructions on a single field cause enough CPU cache line contention to + * slow system performance. Due to their increased overhead and the expense + * involved with precisely reading from them, they should only be used in cases + * where the write rate (increment/decrement) is much higher than the read rate + * (get value). + * + * Aggregate sum counters are comprised of two basic parts, the core and the + * buckets. The core counter contains a lock for the entire counter, as well + * as the current upper and lower bounds on the value of the counter. The + * aggsum_bucket structure contains a per-bucket lock to protect the contents of + * the bucket, the current amount that this bucket has changed from the global + * counter (called the delta), and the amount of increment and decrement we have + * "borrowed" from the core counter. + * + * The basic operation of an aggsum is simple. 
Threads that wish to modify the
+ * counter will modify one bucket's counter (determined by their current CPU, to
+ * help minimize lock and cache contention). If the bucket already has
+ * sufficient capacity borrowed from the core structure to handle their request,
+ * they simply modify the delta and return. If the bucket does not, we clear
+ * the bucket's current state (to prevent the borrowed amounts from getting too
+ * large), and borrow more from the core counter. Borrowing is done by adding to
+ * the upper bound (or subtracting from the lower bound) of the core counter,
+ * and setting the borrow value for the bucket to the amount added (or
+ * subtracted). Clearing the bucket is the opposite; we add the current delta
+ * to both the lower and upper bounds of the core counter, subtract the
+ * borrowed amount from the upper bound, and add it back to the lower bound.
+ * Note that only borrowing and clearing require access to the
+ * core counter; since all other operations access CPU-local resources,
+ * performance can be much higher than a traditional counter.
+ *
+ * Threads that wish to read from the counter have a slightly more challenging
+ * task. It is fast to determine the upper and lower bounds of the aggsum; this
+ * does not require grabbing any locks. This suffices for cases where an
+ * approximation of the aggsum's value is acceptable. However, if one needs to
+ * know whether some specific value is above or below the current value in the
+ * aggsum, they invoke aggsum_compare(). This function operates by repeatedly
+ * comparing the target value to the upper and lower bounds of the aggsum, and
+ * then clearing a bucket. This proceeds until the target is outside of the
+ * upper and lower bounds and we return a response, or the last bucket has been
+ * cleared and we know that the target is equal to the aggsum's value. Finally,
+ * the most expensive operation is determining the precise value of the aggsum.
+ * To do this, we clear every bucket and then return the upper bound (which must
+ * be equal to the lower bound). What makes aggsum_compare() and aggsum_value()
+ * expensive is clearing buckets. This involves grabbing the global lock
+ * (serializing against other flushes and borrow operations), grabbing a bucket's
+ * lock (preventing threads on those CPUs from modifying their delta), and
+ * zeroing out the borrowed value (forcing that thread to borrow on its next
+ * request, which will also be expensive). This is what makes aggsums well
+ * suited for write-many read-rarely operations.
+ */
+
+/*
+ * We will borrow aggsum_borrow_multiplier times the current request, so we will
+ * have to get the as_lock approximately every aggsum_borrow_multiplier calls to
+ * aggsum_add().
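+ *
+ * Illustrative walk-through (editor's sketch, not part of the original
+ * source): with the default multiplier of 10, a bucket asked to absorb a
+ * delta of +3 borrows 30 from the core (upper bound += 30, lower bound
+ * -= 30), so several subsequent similarly sized deltas stay bucket-local.
+ * A minimal caller, using only functions defined in this file:
+ *
+ *	aggsum_t as;
+ *
+ *	aggsum_init(&as, 0);
+ *	aggsum_add(&as, 3);
+ *	(void) aggsum_value(&as);
+ *	aggsum_fini(&as);
+ *
+ * The aggsum_add() call is cheap (bucket-local once capacity is borrowed);
+ * aggsum_value() is the expensive path, flushing every bucket.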
+ */ +static uint_t aggsum_borrow_multiplier = 10; + +void +aggsum_init(aggsum_t *as, uint64_t value) +{ + bzero(as, sizeof (*as)); + as->as_lower_bound = as->as_upper_bound = value; + mutex_init(&as->as_lock, NULL, MUTEX_DEFAULT, NULL); + as->as_numbuckets = boot_ncpus; + as->as_buckets = kmem_zalloc(boot_ncpus * sizeof (aggsum_bucket_t), + KM_SLEEP); + for (int i = 0; i < as->as_numbuckets; i++) { + mutex_init(&as->as_buckets[i].asc_lock, + NULL, MUTEX_DEFAULT, NULL); + } +} + +void +aggsum_fini(aggsum_t *as) +{ + for (int i = 0; i < as->as_numbuckets; i++) + mutex_destroy(&as->as_buckets[i].asc_lock); + mutex_destroy(&as->as_lock); +} + +int64_t +aggsum_lower_bound(aggsum_t *as) +{ + return (as->as_lower_bound); +} + +int64_t +aggsum_upper_bound(aggsum_t *as) +{ + return (as->as_upper_bound); +} + +static void +aggsum_flush_bucket(aggsum_t *as, struct aggsum_bucket *asb) +{ + ASSERT(MUTEX_HELD(&as->as_lock)); + ASSERT(MUTEX_HELD(&asb->asc_lock)); + + /* + * We use atomic instructions for this because we read the upper and + * lower bounds without the lock, so we need stores to be atomic. + */ + atomic_add_64((volatile uint64_t *)&as->as_lower_bound, asb->asc_delta); + atomic_add_64((volatile uint64_t *)&as->as_upper_bound, asb->asc_delta); + asb->asc_delta = 0; + atomic_add_64((volatile uint64_t *)&as->as_upper_bound, + -asb->asc_borrowed); + atomic_add_64((volatile uint64_t *)&as->as_lower_bound, + asb->asc_borrowed); + asb->asc_borrowed = 0; +} + +uint64_t +aggsum_value(aggsum_t *as) +{ + int64_t rv; + + mutex_enter(&as->as_lock); + if (as->as_lower_bound == as->as_upper_bound) { + rv = as->as_lower_bound; + for (int i = 0; i < as->as_numbuckets; i++) { + ASSERT0(as->as_buckets[i].asc_delta); + ASSERT0(as->as_buckets[i].asc_borrowed); + } + mutex_exit(&as->as_lock); + return (rv); + } + for (int i = 0; i < as->as_numbuckets; i++) { + struct aggsum_bucket *asb = &as->as_buckets[i]; + mutex_enter(&asb->asc_lock); + aggsum_flush_bucket(as, asb); + mutex_exit(&asb->asc_lock); + } + VERIFY3U(as->as_lower_bound, ==, as->as_upper_bound); + rv = as->as_lower_bound; + mutex_exit(&as->as_lock); + + return (rv); +} + +static void +aggsum_borrow(aggsum_t *as, int64_t delta, struct aggsum_bucket *asb) +{ + int64_t abs_delta = (delta < 0 ? -delta : delta); + mutex_enter(&as->as_lock); + mutex_enter(&asb->asc_lock); + + aggsum_flush_bucket(as, asb); + + atomic_add_64((volatile uint64_t *)&as->as_upper_bound, abs_delta); + atomic_add_64((volatile uint64_t *)&as->as_lower_bound, -abs_delta); + asb->asc_borrowed = abs_delta; + + mutex_exit(&asb->asc_lock); + mutex_exit(&as->as_lock); +} + +void +aggsum_add(aggsum_t *as, int64_t delta) +{ + struct aggsum_bucket *asb = + &as->as_buckets[CPU_SEQID % as->as_numbuckets]; + + for (;;) { + mutex_enter(&asb->asc_lock); + if (asb->asc_delta + delta <= (int64_t)asb->asc_borrowed && + asb->asc_delta + delta >= -(int64_t)asb->asc_borrowed) { + asb->asc_delta += delta; + mutex_exit(&asb->asc_lock); + return; + } + mutex_exit(&asb->asc_lock); + aggsum_borrow(as, delta * aggsum_borrow_multiplier, asb); + } +} + +/* + * Compare the aggsum value to target efficiently. Returns -1 if the value + * represented by the aggsum is less than target, 1 if it's greater, and 0 if + * they are equal. 
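+ *
+ * Editor's usage sketch ("as" and "limit" are assumed caller state): a
+ * caller enforcing a ceiling only pays for bucket flushes when the cached
+ * bounds are actually ambiguous:
+ *
+ *	if (aggsum_compare(&as, limit) >= 0)
+ *		... at or over the limit: throttle or evict ...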
+ */ +int +aggsum_compare(aggsum_t *as, uint64_t target) +{ + if (as->as_upper_bound < target) + return (-1); + if (as->as_lower_bound > target) + return (1); + mutex_enter(&as->as_lock); + for (int i = 0; i < as->as_numbuckets; i++) { + struct aggsum_bucket *asb = &as->as_buckets[i]; + mutex_enter(&asb->asc_lock); + aggsum_flush_bucket(as, asb); + mutex_exit(&asb->asc_lock); + if (as->as_upper_bound < target) { + mutex_exit(&as->as_lock); + return (-1); + } + if (as->as_lower_bound > target) { + mutex_exit(&as->as_lock); + return (1); + } + } + VERIFY3U(as->as_lower_bound, ==, as->as_upper_bound); + ASSERT3U(as->as_lower_bound, ==, target); + mutex_exit(&as->as_lock); + return (0); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c new file mode 100644 index 000000000000..f8baa5adddd3 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c @@ -0,0 +1,8527 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018, Joyent, Inc. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright (c) 2014 by Saso Kiselkov. All rights reserved. + * Copyright 2017 Nexenta Systems, Inc. All rights reserved. + */ + +/* + * DVA-based Adjustable Replacement Cache + * + * While much of the theory of operation used here is + * based on the self-tuning, low overhead replacement cache + * presented by Megiddo and Modha at FAST 2003, there are some + * significant differences: + * + * 1. The Megiddo and Modha model assumes any page is evictable. + * Pages in its cache cannot be "locked" into memory. This makes + * the eviction algorithm simple: evict the last page in the list. + * This also make the performance characteristics easy to reason + * about. Our cache is not so simple. At any given moment, some + * subset of the blocks in the cache are un-evictable because we + * have handed out a reference to them. Blocks are only evictable + * when there are no external references active. This makes + * eviction far more problematic: we choose to evict the evictable + * blocks that are the "lowest" in the list. + * + * There are times when it is not possible to evict the requested + * space. In these circumstances we are unable to adjust the cache + * size. To prevent the cache growing unbounded at these times we + * implement a "cache throttle" that slows the flow of new data + * into the cache until we can make space available. + * + * 2. The Megiddo and Modha model assumes a fixed cache size. + * Pages are evicted when the cache is full and there is a cache + * miss. Our model has a variable sized cache. 
It grows with
+ * high use, but also tries to react to memory pressure from the
+ * operating system: decreasing its size when system memory is
+ * tight.
+ *
+ * 3. The Megiddo and Modha model assumes a fixed page size. All
+ * elements of the cache are therefore exactly the same size. So
+ * when adjusting the cache size following a cache miss, it's simply
+ * a matter of choosing a single page to evict. In our model, we
+ * have variable sized cache blocks (ranging from 512 bytes to
+ * 128K bytes). We therefore choose a set of blocks to evict to make
+ * space for a cache miss that approximates as closely as possible
+ * the space used by the new block.
+ *
+ * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache"
+ * by N. Megiddo & D. Modha, FAST 2003
+ */
+
+/*
+ * The locking model:
+ *
+ * A new reference to a cache buffer can be obtained in two
+ * ways: 1) via a hash table lookup using the DVA as a key,
+ * or 2) via one of the ARC lists. The arc_read() interface
+ * uses method 1, while the internal ARC algorithms for
+ * adjusting the cache use method 2. We therefore provide two
+ * types of locks: 1) the hash table lock array, and 2) the
+ * ARC list locks.
+ *
+ * Buffers do not have their own mutexes; rather, they rely on the
+ * hash table mutexes for the bulk of their protection (i.e. most
+ * fields in the arc_buf_hdr_t are protected by these mutexes).
+ *
+ * buf_hash_find() returns the appropriate mutex (held) when it
+ * locates the requested buffer in the hash table. It returns
+ * NULL for the mutex if the buffer was not in the table.
+ *
+ * buf_hash_remove() expects the appropriate hash mutex to be
+ * already held before it is invoked.
+ *
+ * Each ARC state also has a mutex which is used to protect the
+ * buffer list associated with the state. When attempting to
+ * obtain a hash table lock while holding an ARC list lock you
+ * must use mutex_tryenter() to avoid deadlock. Also note that
+ * the active state mutex must be held before the ghost state mutex.
+ *
+ * It is also possible to register a callback which is run when the
+ * arc_meta_limit is reached and no buffers can be safely evicted. In
+ * this case the arc user should drop a reference on some arc buffers so
+ * they can be reclaimed and the arc_meta_limit honored. For example,
+ * when using the ZPL each dentry holds a reference on a znode. These
+ * dentries must be pruned before the arc buffer holding the znode can
+ * be safely evicted.
+ *
+ * Note that the majority of the performance stats are manipulated
+ * with atomic operations.
+ *
+ * The L2ARC uses the l2ad_mtx on each vdev for the following:
+ *
+ * - L2ARC buflist creation
+ * - L2ARC buflist eviction
+ * - L2ARC write completion, which walks L2ARC buflists
+ * - ARC header destruction, as it removes from L2ARC buflists
+ * - ARC header release, as it removes from L2ARC buflists
+ */
+
+/*
+ * ARC operation:
+ *
+ * Every block that is in the ARC is tracked by an arc_buf_hdr_t structure.
+ * This structure can point either to a block that is still in the cache or to
+ * one that is only accessible in an L2 ARC device, or it can provide
+ * information about a block that was recently evicted. If a block is
+ * only accessible in the L2ARC, then the arc_buf_hdr_t only has enough
+ * information to retrieve it from the L2ARC device. This information is
+ * stored in the l2arc_buf_hdr_t sub-structure of the arc_buf_hdr_t. A block
+ * that is in this state cannot access the data directly.
+ * + * Blocks that are actively being referenced or have not been evicted + * are cached in the L1ARC. The L1ARC (l1arc_buf_hdr_t) is a structure within + * the arc_buf_hdr_t that will point to the data block in memory. A block can + * only be read by a consumer if it has an l1arc_buf_hdr_t. The L1ARC + * caches data in two ways -- in a list of ARC buffers (arc_buf_t) and + * also in the arc_buf_hdr_t's private physical data block pointer (b_pabd). + * + * The L1ARC's data pointer may or may not be uncompressed. The ARC has the + * ability to store the physical data (b_pabd) associated with the DVA of the + * arc_buf_hdr_t. Since the b_pabd is a copy of the on-disk physical block, + * it will match its on-disk compression characteristics. This behavior can be + * disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the + * compressed ARC functionality is disabled, the b_pabd will point to an + * uncompressed version of the on-disk data. + * + * Data in the L1ARC is not accessed by consumers of the ARC directly. Each + * arc_buf_hdr_t can have multiple ARC buffers (arc_buf_t) which reference it. + * Each ARC buffer (arc_buf_t) is being actively accessed by a specific ARC + * consumer. The ARC will provide references to this data and will keep it + * cached until it is no longer in use. The ARC caches only the L1ARC's physical + * data block and will evict any arc_buf_t that is no longer referenced. The + * amount of memory consumed by the arc_buf_ts' data buffers can be seen via the + * "overhead_size" kstat. + * + * Depending on the consumer, an arc_buf_t can be requested in uncompressed or + * compressed form. The typical case is that consumers will want uncompressed + * data, and when that happens a new data buffer is allocated where the data is + * decompressed for them to use. Currently the only consumer who wants + * compressed arc_buf_t's is "zfs send", when it streams data exactly as it + * exists on disk. When this happens, the arc_buf_t's data buffer is shared + * with the arc_buf_hdr_t. + * + * Here is a diagram showing an arc_buf_hdr_t referenced by two arc_buf_t's. The + * first one is owned by a compressed send consumer (and therefore references + * the same compressed data buffer as the arc_buf_hdr_t) and the second could be + * used by any other consumer (and has its own uncompressed copy of the data + * buffer). + * + * arc_buf_hdr_t + * +-----------+ + * | fields | + * | common to | + * | L1- and | + * | L2ARC | + * +-----------+ + * | l2arc_buf_hdr_t + * | | + * +-----------+ + * | l1arc_buf_hdr_t + * | | arc_buf_t + * | b_buf +------------>+-----------+ arc_buf_t + * | b_pabd +-+ |b_next +---->+-----------+ + * +-----------+ | |-----------| |b_next +-->NULL + * | |b_comp = T | +-----------+ + * | |b_data +-+ |b_comp = F | + * | +-----------+ | |b_data +-+ + * +->+------+ | +-----------+ | + * compressed | | | | + * data | |<--------------+ | uncompressed + * +------+ compressed, | data + * shared +-->+------+ + * data | | + * | | + * +------+ + * + * When a consumer reads a block, the ARC must first look to see if the + * arc_buf_hdr_t is cached. If the hdr is cached then the ARC allocates a new + * arc_buf_t and either copies uncompressed data into a new data buffer from an + * existing uncompressed arc_buf_t, decompresses the hdr's b_pabd buffer into a + * new data buffer, or shares the hdr's b_pabd buffer, depending on whether the + * hdr is compressed and the desired compression characteristics of the + * arc_buf_t consumer. 
If the arc_buf_t ends up sharing data with the + * arc_buf_hdr_t and both of them are uncompressed then the arc_buf_t must be + * the last buffer in the hdr's b_buf list, however a shared compressed buf can + * be anywhere in the hdr's list. + * + * The diagram below shows an example of an uncompressed ARC hdr that is + * sharing its data with an arc_buf_t (note that the shared uncompressed buf is + * the last element in the buf list): + * + * arc_buf_hdr_t + * +-----------+ + * | | + * | | + * | | + * +-----------+ + * l2arc_buf_hdr_t| | + * | | + * +-----------+ + * l1arc_buf_hdr_t| | + * | | arc_buf_t (shared) + * | b_buf +------------>+---------+ arc_buf_t + * | | |b_next +---->+---------+ + * | b_pabd +-+ |---------| |b_next +-->NULL + * +-----------+ | | | +---------+ + * | |b_data +-+ | | + * | +---------+ | |b_data +-+ + * +->+------+ | +---------+ | + * | | | | + * uncompressed | | | | + * data +------+ | | + * ^ +->+------+ | + * | uncompressed | | | + * | data | | | + * | +------+ | + * +---------------------------------+ + * + * Writing to the ARC requires that the ARC first discard the hdr's b_pabd + * since the physical block is about to be rewritten. The new data contents + * will be contained in the arc_buf_t. As the I/O pipeline performs the write, + * it may compress the data before writing it to disk. The ARC will be called + * with the transformed data and will bcopy the transformed on-disk block into + * a newly allocated b_pabd. Writes are always done into buffers which have + * either been loaned (and hence are new and don't have other readers) or + * buffers which have been released (and hence have their own hdr, if there + * were originally other readers of the buf's original hdr). This ensures that + * the ARC only needs to update a single buf and its hdr after a write occurs. + * + * When the L2ARC is in use, it will also take advantage of the b_pabd. The + * L2ARC will always write the contents of b_pabd to the L2ARC. This means + * that when compressed ARC is enabled that the L2ARC blocks are identical + * to the on-disk block in the main data pool. This provides a significant + * advantage since the ARC can leverage the bp's checksum when reading from the + * L2ARC to determine if the contents are valid. However, if the compressed + * ARC is disabled, then the L2ARC's block must be transformed to look + * like the physical block in the main data pool before comparing the + * checksum and determining its validity. + */ + +#include <sys/spa.h> +#include <sys/zio.h> +#include <sys/spa_impl.h> +#include <sys/zio_compress.h> +#include <sys/zio_checksum.h> +#include <sys/zfs_context.h> +#include <sys/arc.h> +#include <sys/refcount.h> +#include <sys/vdev.h> +#include <sys/vdev_impl.h> +#include <sys/dsl_pool.h> +#include <sys/zio_checksum.h> +#include <sys/multilist.h> +#include <sys/abd.h> +#ifdef _KERNEL +#include <sys/dnlc.h> +#include <sys/racct.h> +#endif +#include <sys/callb.h> +#include <sys/kstat.h> +#include <sys/trim_map.h> +#include <sys/zthr.h> +#include <zfs_fletcher.h> +#include <sys/sdt.h> +#include <sys/aggsum.h> +#include <sys/cityhash.h> + +#include <machine/vmparam.h> + +#ifdef illumos +#ifndef _KERNEL +/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */ +boolean_t arc_watch = B_FALSE; +int arc_procfd; +#endif +#endif /* illumos */ + +/* + * This thread's job is to keep enough free memory in the system, by + * calling arc_kmem_reap_now() plus arc_shrink(), which improves + * arc_available_memory(). 
+ */ +static zthr_t *arc_reap_zthr; + +/* + * This thread's job is to keep arc_size under arc_c, by calling + * arc_adjust(), which improves arc_is_overflowing(). + */ +static zthr_t *arc_adjust_zthr; + +static kmutex_t arc_adjust_lock; +static kcondvar_t arc_adjust_waiters_cv; +static boolean_t arc_adjust_needed = B_FALSE; + +static kmutex_t arc_dnlc_evicts_lock; +static kcondvar_t arc_dnlc_evicts_cv; +static boolean_t arc_dnlc_evicts_thread_exit; + +uint_t arc_reduce_dnlc_percent = 3; + +/* + * The number of headers to evict in arc_evict_state_impl() before + * dropping the sublist lock and evicting from another sublist. A lower + * value means we're more likely to evict the "correct" header (i.e. the + * oldest header in the arc state), but comes with higher overhead + * (i.e. more invocations of arc_evict_state_impl()). + */ +int zfs_arc_evict_batch_limit = 10; + +/* number of seconds before growing cache again */ +int arc_grow_retry = 60; + +/* + * Minimum time between calls to arc_kmem_reap_soon(). Note that this will + * be converted to ticks, so with the default hz=100, a setting of 15 ms + * will actually wait 2 ticks, or 20ms. + */ +int arc_kmem_cache_reap_retry_ms = 1000; + +/* shift of arc_c for calculating overflow limit in arc_get_data_impl */ +int zfs_arc_overflow_shift = 8; + +/* shift of arc_c for calculating both min and max arc_p */ +int arc_p_min_shift = 4; + +/* log2(fraction of arc to reclaim) */ +int arc_shrink_shift = 7; + +/* + * log2(fraction of ARC which must be free to allow growing). + * I.e. If there is less than arc_c >> arc_no_grow_shift free memory, + * when reading a new block into the ARC, we will evict an equal-sized block + * from the ARC. + * + * This must be less than arc_shrink_shift, so that when we shrink the ARC, + * we will still not allow it to grow. + */ +int arc_no_grow_shift = 5; + + +/* + * minimum lifespan of a prefetch block in clock ticks + * (initialized in arc_init()) + */ +static int zfs_arc_min_prefetch_ms = 1; +static int zfs_arc_min_prescient_prefetch_ms = 6; + +/* + * If this percent of memory is free, don't throttle. + */ +int arc_lotsfree_percent = 10; + +static boolean_t arc_initialized; +extern boolean_t zfs_prefetch_disable; + +/* + * The arc has filled available memory and has now warmed up. + */ +static boolean_t arc_warm; + +/* + * log2 fraction of the zio arena to keep free. + */ +int arc_zio_arena_free_shift = 2; + +/* + * These tunables are for performance analysis. + */ +uint64_t zfs_arc_max; +uint64_t zfs_arc_min; +uint64_t zfs_arc_meta_limit = 0; +uint64_t zfs_arc_meta_min = 0; +uint64_t zfs_arc_dnode_limit = 0; +uint64_t zfs_arc_dnode_reduce_percent = 10; +int zfs_arc_grow_retry = 0; +int zfs_arc_shrink_shift = 0; +int zfs_arc_no_grow_shift = 0; +int zfs_arc_p_min_shift = 0; +uint64_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */ +u_int zfs_arc_free_target = 0; + +/* Absolute min for arc min / max is 16MB. 
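+ * (16 << 20 == 16,777,216 bytes.)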
*/
+static uint64_t arc_abs_min = 16 << 20;
+
+/*
+ * ARC dirty data constraints for arc_tempreserve_space() throttle
+ */
+uint_t zfs_arc_dirty_limit_percent = 50;	/* total dirty data limit */
+uint_t zfs_arc_anon_limit_percent = 25;		/* anon block dirty limit */
+uint_t zfs_arc_pool_dirty_percent = 20;		/* each pool's anon allowance */
+
+boolean_t zfs_compressed_arc_enabled = B_TRUE;
+
+static int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS);
+static int sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS);
+static int sysctl_vfs_zfs_arc_max(SYSCTL_HANDLER_ARGS);
+static int sysctl_vfs_zfs_arc_min(SYSCTL_HANDLER_ARGS);
+static int sysctl_vfs_zfs_arc_no_grow_shift(SYSCTL_HANDLER_ARGS);
+
+#if defined(__FreeBSD__) && defined(_KERNEL)
+static void
+arc_free_target_init(void *unused __unused)
+{
+
+	zfs_arc_free_target = vm_cnt.v_free_target;
+}
+SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY,
+    arc_free_target_init, NULL);
+
+TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit);
+TUNABLE_QUAD("vfs.zfs.arc_meta_min", &zfs_arc_meta_min);
+TUNABLE_INT("vfs.zfs.arc_shrink_shift", &zfs_arc_shrink_shift);
+TUNABLE_INT("vfs.zfs.arc_grow_retry", &zfs_arc_grow_retry);
+TUNABLE_INT("vfs.zfs.arc_no_grow_shift", &zfs_arc_no_grow_shift);
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_max, CTLTYPE_U64 | CTLFLAG_RWTUN,
+    0, sizeof(uint64_t), sysctl_vfs_zfs_arc_max, "QU", "Maximum ARC size");
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_min, CTLTYPE_U64 | CTLFLAG_RWTUN,
+    0, sizeof(uint64_t), sysctl_vfs_zfs_arc_min, "QU", "Minimum ARC size");
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_no_grow_shift, CTLTYPE_U32 | CTLFLAG_RWTUN,
+    0, sizeof(uint32_t), sysctl_vfs_zfs_arc_no_grow_shift, "U",
+    "log2(fraction of ARC which must be free to allow growing)");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_average_blocksize, CTLFLAG_RDTUN,
+    &zfs_arc_average_blocksize, 0,
+    "ARC average blocksize");
+SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_shrink_shift, CTLFLAG_RW,
+    &arc_shrink_shift, 0,
+    "log2(fraction of arc to reclaim)");
+SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_grow_retry, CTLFLAG_RW,
+    &arc_grow_retry, 0,
+    "Wait in seconds before considering growing ARC");
+SYSCTL_INT(_vfs_zfs, OID_AUTO, compressed_arc_enabled, CTLFLAG_RDTUN,
+    &zfs_compressed_arc_enabled, 0,
+    "Enable compressed ARC");
+SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_kmem_cache_reap_retry_ms, CTLFLAG_RWTUN,
+    &arc_kmem_cache_reap_retry_ms, 0,
+    "Interval between ARC kmem_cache reapings");
+
+/*
+ * We don't have a tunable for arc_free_target due to the dependency on
+ * pagedaemon initialisation.
+ */
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target,
+    CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(u_int),
+    sysctl_vfs_zfs_arc_free_target, "IU",
+    "Desired number of free pages below which ARC triggers reclaim");
+
+static int
+sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS)
+{
+	u_int val;
+	int err;
+
+	val = zfs_arc_free_target;
+	err = sysctl_handle_int(oidp, &val, 0, req);
+	if (err != 0 || req->newptr == NULL)
+		return (err);
+
+	if (val < minfree)
+		return (EINVAL);
+	if (val > vm_cnt.v_page_count)
+		return (EINVAL);
+
+	zfs_arc_free_target = val;
+
+	return (0);
+}
+
+/*
+ * Must be declared here, before the definition of the corresponding kstat
+ * macro; otherwise the identical names will confuse the compiler.
+ */ +SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_meta_limit, + CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t), + sysctl_vfs_zfs_arc_meta_limit, "QU", + "ARC metadata limit"); +#endif + +/* + * Note that buffers can be in one of 6 states: + * ARC_anon - anonymous (discussed below) + * ARC_mru - recently used, currently cached + * ARC_mru_ghost - recentely used, no longer in cache + * ARC_mfu - frequently used, currently cached + * ARC_mfu_ghost - frequently used, no longer in cache + * ARC_l2c_only - exists in L2ARC but not other states + * When there are no active references to the buffer, they are + * are linked onto a list in one of these arc states. These are + * the only buffers that can be evicted or deleted. Within each + * state there are multiple lists, one for meta-data and one for + * non-meta-data. Meta-data (indirect blocks, blocks of dnodes, + * etc.) is tracked separately so that it can be managed more + * explicitly: favored over data, limited explicitly. + * + * Anonymous buffers are buffers that are not associated with + * a DVA. These are buffers that hold dirty block copies + * before they are written to stable storage. By definition, + * they are "ref'd" and are considered part of arc_mru + * that cannot be freed. Generally, they will aquire a DVA + * as they are written and migrate onto the arc_mru list. + * + * The ARC_l2c_only state is for buffers that are in the second + * level ARC but no longer in any of the ARC_m* lists. The second + * level ARC itself may also contain buffers that are in any of + * the ARC_m* states - meaning that a buffer can exist in two + * places. The reason for the ARC_l2c_only state is to keep the + * buffer header in the hash table, so that reads that hit the + * second level ARC benefit from these fast lookups. + */ + +typedef struct arc_state { + /* + * list of evictable buffers + */ + multilist_t *arcs_list[ARC_BUFC_NUMTYPES]; + /* + * total amount of evictable data in this state + */ + refcount_t arcs_esize[ARC_BUFC_NUMTYPES]; + /* + * total amount of data in this state; this includes: evictable, + * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA. + */ + refcount_t arcs_size; + /* + * supports the "dbufs" kstat + */ + arc_state_type_t arcs_state; +} arc_state_t; + +/* + * Percentage that can be consumed by dnodes of ARC meta buffers. 
+ */ +int zfs_arc_meta_prune = 10000; +unsigned long zfs_arc_dnode_limit_percent = 10; +int zfs_arc_meta_strategy = ARC_STRATEGY_META_ONLY; +int zfs_arc_meta_adjust_restarts = 4096; + +SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_meta_strategy, CTLFLAG_RWTUN, + &zfs_arc_meta_strategy, 0, + "ARC metadata reclamation strategy " + "(0 = metadata only, 1 = balance data and metadata)"); + +/* The 6 states: */ +static arc_state_t ARC_anon; +static arc_state_t ARC_mru; +static arc_state_t ARC_mru_ghost; +static arc_state_t ARC_mfu; +static arc_state_t ARC_mfu_ghost; +static arc_state_t ARC_l2c_only; + +typedef struct arc_stats { + kstat_named_t arcstat_hits; + kstat_named_t arcstat_misses; + kstat_named_t arcstat_demand_data_hits; + kstat_named_t arcstat_demand_data_misses; + kstat_named_t arcstat_demand_metadata_hits; + kstat_named_t arcstat_demand_metadata_misses; + kstat_named_t arcstat_prefetch_data_hits; + kstat_named_t arcstat_prefetch_data_misses; + kstat_named_t arcstat_prefetch_metadata_hits; + kstat_named_t arcstat_prefetch_metadata_misses; + kstat_named_t arcstat_mru_hits; + kstat_named_t arcstat_mru_ghost_hits; + kstat_named_t arcstat_mfu_hits; + kstat_named_t arcstat_mfu_ghost_hits; + kstat_named_t arcstat_allocated; + kstat_named_t arcstat_deleted; + /* + * Number of buffers that could not be evicted because the hash lock + * was held by another thread. The lock may not necessarily be held + * by something using the same buffer, since hash locks are shared + * by multiple buffers. + */ + kstat_named_t arcstat_mutex_miss; + /* + * Number of buffers skipped when updating the access state due to the + * header having already been released after acquiring the hash lock. + */ + kstat_named_t arcstat_access_skip; + /* + * Number of buffers skipped because they have I/O in progress, are + * indirect prefetch buffers that have not lived long enough, or are + * not from the spa we're trying to evict from. + */ + kstat_named_t arcstat_evict_skip; + /* + * Number of times arc_evict_state() was unable to evict enough + * buffers to reach it's target amount. + */ + kstat_named_t arcstat_evict_not_enough; + kstat_named_t arcstat_evict_l2_cached; + kstat_named_t arcstat_evict_l2_eligible; + kstat_named_t arcstat_evict_l2_ineligible; + kstat_named_t arcstat_evict_l2_skip; + kstat_named_t arcstat_hash_elements; + kstat_named_t arcstat_hash_elements_max; + kstat_named_t arcstat_hash_collisions; + kstat_named_t arcstat_hash_chains; + kstat_named_t arcstat_hash_chain_max; + kstat_named_t arcstat_p; + kstat_named_t arcstat_c; + kstat_named_t arcstat_c_min; + kstat_named_t arcstat_c_max; + /* Not updated directly; only synced in arc_kstat_update. */ + kstat_named_t arcstat_size; + /* + * Number of compressed bytes stored in the arc_buf_hdr_t's b_pabd. + * Note that the compressed bytes may match the uncompressed bytes + * if the block is either not compressed or compressed arc is disabled. + */ + kstat_named_t arcstat_compressed_size; + /* + * Uncompressed size of the data stored in b_pabd. If compressed + * arc is disabled then this value will be identical to the stat + * above. + */ + kstat_named_t arcstat_uncompressed_size; + /* + * Number of bytes stored in all the arc_buf_t's. This is classified + * as "overhead" since this data is typically short-lived and will + * be evicted from the arc when it becomes unreferenced unless the + * zfs_keep_uncompressed_metadata or zfs_keep_uncompressed_level + * values have been set (see comment in dbuf.c for more information). 
+ */ + kstat_named_t arcstat_overhead_size; + /* + * Number of bytes consumed by internal ARC structures necessary + * for tracking purposes; these structures are not actually + * backed by ARC buffers. This includes arc_buf_hdr_t structures + * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only + * caches), and arc_buf_t structures (allocated via arc_buf_t + * cache). + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_hdr_size; + /* + * Number of bytes consumed by ARC buffers of type equal to + * ARC_BUFC_DATA. This is generally consumed by buffers backing + * on disk user data (e.g. plain file contents). + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_data_size; + /* + * Number of bytes consumed by ARC buffers of type equal to + * ARC_BUFC_METADATA. This is generally consumed by buffers + * backing on disk data that is used for internal ZFS + * structures (e.g. ZAP, dnode, indirect blocks, etc). + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_metadata_size; + /* + * Number of bytes consumed by dmu_buf_impl_t objects. + */ + kstat_named_t arcstat_dbuf_size; + /* + * Number of bytes consumed by dnode_t objects. + */ + kstat_named_t arcstat_dnode_size; + /* + * Number of bytes consumed by bonus buffers. + */ + kstat_named_t arcstat_bonus_size; +#if defined(__FreeBSD__) && defined(COMPAT_FREEBSD11) + /* + * Sum of the previous three counters, provided for compatibility. + */ + kstat_named_t arcstat_other_size; +#endif + /* + * Total number of bytes consumed by ARC buffers residing in the + * arc_anon state. This includes *all* buffers in the arc_anon + * state; e.g. data, metadata, evictable, and unevictable buffers + * are all included in this value. + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_anon_size; + /* + * Number of bytes consumed by ARC buffers that meet the + * following criteria: backing buffers of type ARC_BUFC_DATA, + * residing in the arc_anon state, and are eligible for eviction + * (e.g. have no outstanding holds on the buffer). + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_anon_evictable_data; + /* + * Number of bytes consumed by ARC buffers that meet the + * following criteria: backing buffers of type ARC_BUFC_METADATA, + * residing in the arc_anon state, and are eligible for eviction + * (e.g. have no outstanding holds on the buffer). + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_anon_evictable_metadata; + /* + * Total number of bytes consumed by ARC buffers residing in the + * arc_mru state. This includes *all* buffers in the arc_mru + * state; e.g. data, metadata, evictable, and unevictable buffers + * are all included in this value. + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_mru_size; + /* + * Number of bytes consumed by ARC buffers that meet the + * following criteria: backing buffers of type ARC_BUFC_DATA, + * residing in the arc_mru state, and are eligible for eviction + * (e.g. have no outstanding holds on the buffer). + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_mru_evictable_data; + /* + * Number of bytes consumed by ARC buffers that meet the + * following criteria: backing buffers of type ARC_BUFC_METADATA, + * residing in the arc_mru state, and are eligible for eviction + * (e.g. have no outstanding holds on the buffer). 
+	 * Not updated directly; only synced in arc_kstat_update.
+	 */
+	kstat_named_t arcstat_mru_evictable_metadata;
+	/*
+	 * Total number of bytes that *would have been* consumed by ARC
+	 * buffers in the arc_mru_ghost state. The key thing to note
+	 * here is that this size doesn't actually indicate
+	 * RAM consumption. The ghost lists only consist of headers and
+	 * don't actually have ARC buffers linked off of these headers.
+	 * Thus, *if* the headers had associated ARC buffers, these
+	 * buffers *would have* consumed this number of bytes.
+	 * Not updated directly; only synced in arc_kstat_update.
+	 */
+	kstat_named_t arcstat_mru_ghost_size;
+	/*
+	 * Number of bytes that *would have been* consumed by ARC
+	 * buffers that are eligible for eviction, of type
+	 * ARC_BUFC_DATA, and linked off the arc_mru_ghost state.
+	 * Not updated directly; only synced in arc_kstat_update.
+	 */
+	kstat_named_t arcstat_mru_ghost_evictable_data;
+	/*
+	 * Number of bytes that *would have been* consumed by ARC
+	 * buffers that are eligible for eviction, of type
+	 * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state.
+	 * Not updated directly; only synced in arc_kstat_update.
+	 */
+	kstat_named_t arcstat_mru_ghost_evictable_metadata;
+	/*
+	 * Total number of bytes consumed by ARC buffers residing in the
+	 * arc_mfu state. This includes *all* buffers in the arc_mfu
+	 * state; e.g. data, metadata, evictable, and unevictable buffers
+	 * are all included in this value.
+	 * Not updated directly; only synced in arc_kstat_update.
+	 */
+	kstat_named_t arcstat_mfu_size;
+	/*
+	 * Number of bytes consumed by ARC buffers that are eligible for
+	 * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu
+	 * state.
+	 * Not updated directly; only synced in arc_kstat_update.
+	 */
+	kstat_named_t arcstat_mfu_evictable_data;
+	/*
+	 * Number of bytes consumed by ARC buffers that are eligible for
+	 * eviction, of type ARC_BUFC_METADATA, and reside in the
+	 * arc_mfu state.
+	 * Not updated directly; only synced in arc_kstat_update.
+	 */
+	kstat_named_t arcstat_mfu_evictable_metadata;
+	/*
+	 * Total number of bytes that *would have been* consumed by ARC
+	 * buffers in the arc_mfu_ghost state. See the comment above
+	 * arcstat_mru_ghost_size for more details.
+	 * Not updated directly; only synced in arc_kstat_update.
+	 */
+	kstat_named_t arcstat_mfu_ghost_size;
+	/*
+	 * Number of bytes that *would have been* consumed by ARC
+	 * buffers that are eligible for eviction, of type
+	 * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state.
+	 * Not updated directly; only synced in arc_kstat_update.
+	 */
+	kstat_named_t arcstat_mfu_ghost_evictable_data;
+	/*
+	 * Number of bytes that *would have been* consumed by ARC
+	 * buffers that are eligible for eviction, of type
+	 * ARC_BUFC_METADATA, and linked off the arc_mfu_ghost state.
+	 * Not updated directly; only synced in arc_kstat_update.
+ */ + kstat_named_t arcstat_mfu_ghost_evictable_metadata; + kstat_named_t arcstat_l2_hits; + kstat_named_t arcstat_l2_misses; + kstat_named_t arcstat_l2_feeds; + kstat_named_t arcstat_l2_rw_clash; + kstat_named_t arcstat_l2_read_bytes; + kstat_named_t arcstat_l2_write_bytes; + kstat_named_t arcstat_l2_writes_sent; + kstat_named_t arcstat_l2_writes_done; + kstat_named_t arcstat_l2_writes_error; + kstat_named_t arcstat_l2_writes_lock_retry; + kstat_named_t arcstat_l2_evict_lock_retry; + kstat_named_t arcstat_l2_evict_reading; + kstat_named_t arcstat_l2_evict_l1cached; + kstat_named_t arcstat_l2_free_on_write; + kstat_named_t arcstat_l2_abort_lowmem; + kstat_named_t arcstat_l2_cksum_bad; + kstat_named_t arcstat_l2_io_error; + kstat_named_t arcstat_l2_lsize; + kstat_named_t arcstat_l2_psize; + /* Not updated directly; only synced in arc_kstat_update. */ + kstat_named_t arcstat_l2_hdr_size; + kstat_named_t arcstat_l2_write_trylock_fail; + kstat_named_t arcstat_l2_write_passed_headroom; + kstat_named_t arcstat_l2_write_spa_mismatch; + kstat_named_t arcstat_l2_write_in_l2; + kstat_named_t arcstat_l2_write_hdr_io_in_progress; + kstat_named_t arcstat_l2_write_not_cacheable; + kstat_named_t arcstat_l2_write_full; + kstat_named_t arcstat_l2_write_buffer_iter; + kstat_named_t arcstat_l2_write_pios; + kstat_named_t arcstat_l2_write_buffer_bytes_scanned; + kstat_named_t arcstat_l2_write_buffer_list_iter; + kstat_named_t arcstat_l2_write_buffer_list_null_iter; + kstat_named_t arcstat_memory_throttle_count; + kstat_named_t arcstat_memory_direct_count; + kstat_named_t arcstat_memory_indirect_count; + kstat_named_t arcstat_memory_all_bytes; + kstat_named_t arcstat_memory_free_bytes; + kstat_named_t arcstat_memory_available_bytes; + kstat_named_t arcstat_no_grow; + kstat_named_t arcstat_tempreserve; + kstat_named_t arcstat_loaned_bytes; + kstat_named_t arcstat_prune; + /* Not updated directly; only synced in arc_kstat_update. 
*/ + kstat_named_t arcstat_meta_used; + kstat_named_t arcstat_meta_limit; + kstat_named_t arcstat_dnode_limit; + kstat_named_t arcstat_meta_max; + kstat_named_t arcstat_meta_min; + kstat_named_t arcstat_async_upgrade_sync; + kstat_named_t arcstat_demand_hit_predictive_prefetch; + kstat_named_t arcstat_demand_hit_prescient_prefetch; +} arc_stats_t; + +static arc_stats_t arc_stats = { + { "hits", KSTAT_DATA_UINT64 }, + { "misses", KSTAT_DATA_UINT64 }, + { "demand_data_hits", KSTAT_DATA_UINT64 }, + { "demand_data_misses", KSTAT_DATA_UINT64 }, + { "demand_metadata_hits", KSTAT_DATA_UINT64 }, + { "demand_metadata_misses", KSTAT_DATA_UINT64 }, + { "prefetch_data_hits", KSTAT_DATA_UINT64 }, + { "prefetch_data_misses", KSTAT_DATA_UINT64 }, + { "prefetch_metadata_hits", KSTAT_DATA_UINT64 }, + { "prefetch_metadata_misses", KSTAT_DATA_UINT64 }, + { "mru_hits", KSTAT_DATA_UINT64 }, + { "mru_ghost_hits", KSTAT_DATA_UINT64 }, + { "mfu_hits", KSTAT_DATA_UINT64 }, + { "mfu_ghost_hits", KSTAT_DATA_UINT64 }, + { "allocated", KSTAT_DATA_UINT64 }, + { "deleted", KSTAT_DATA_UINT64 }, + { "mutex_miss", KSTAT_DATA_UINT64 }, + { "access_skip", KSTAT_DATA_UINT64 }, + { "evict_skip", KSTAT_DATA_UINT64 }, + { "evict_not_enough", KSTAT_DATA_UINT64 }, + { "evict_l2_cached", KSTAT_DATA_UINT64 }, + { "evict_l2_eligible", KSTAT_DATA_UINT64 }, + { "evict_l2_ineligible", KSTAT_DATA_UINT64 }, + { "evict_l2_skip", KSTAT_DATA_UINT64 }, + { "hash_elements", KSTAT_DATA_UINT64 }, + { "hash_elements_max", KSTAT_DATA_UINT64 }, + { "hash_collisions", KSTAT_DATA_UINT64 }, + { "hash_chains", KSTAT_DATA_UINT64 }, + { "hash_chain_max", KSTAT_DATA_UINT64 }, + { "p", KSTAT_DATA_UINT64 }, + { "c", KSTAT_DATA_UINT64 }, + { "c_min", KSTAT_DATA_UINT64 }, + { "c_max", KSTAT_DATA_UINT64 }, + { "size", KSTAT_DATA_UINT64 }, + { "compressed_size", KSTAT_DATA_UINT64 }, + { "uncompressed_size", KSTAT_DATA_UINT64 }, + { "overhead_size", KSTAT_DATA_UINT64 }, + { "hdr_size", KSTAT_DATA_UINT64 }, + { "data_size", KSTAT_DATA_UINT64 }, + { "metadata_size", KSTAT_DATA_UINT64 }, + { "dbuf_size", KSTAT_DATA_UINT64 }, + { "dnode_size", KSTAT_DATA_UINT64 }, + { "bonus_size", KSTAT_DATA_UINT64 }, +#if defined(__FreeBSD__) && defined(COMPAT_FREEBSD11) + { "other_size", KSTAT_DATA_UINT64 }, +#endif + { "anon_size", KSTAT_DATA_UINT64 }, + { "anon_evictable_data", KSTAT_DATA_UINT64 }, + { "anon_evictable_metadata", KSTAT_DATA_UINT64 }, + { "mru_size", KSTAT_DATA_UINT64 }, + { "mru_evictable_data", KSTAT_DATA_UINT64 }, + { "mru_evictable_metadata", KSTAT_DATA_UINT64 }, + { "mru_ghost_size", KSTAT_DATA_UINT64 }, + { "mru_ghost_evictable_data", KSTAT_DATA_UINT64 }, + { "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, + { "mfu_size", KSTAT_DATA_UINT64 }, + { "mfu_evictable_data", KSTAT_DATA_UINT64 }, + { "mfu_evictable_metadata", KSTAT_DATA_UINT64 }, + { "mfu_ghost_size", KSTAT_DATA_UINT64 }, + { "mfu_ghost_evictable_data", KSTAT_DATA_UINT64 }, + { "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, + { "l2_hits", KSTAT_DATA_UINT64 }, + { "l2_misses", KSTAT_DATA_UINT64 }, + { "l2_feeds", KSTAT_DATA_UINT64 }, + { "l2_rw_clash", KSTAT_DATA_UINT64 }, + { "l2_read_bytes", KSTAT_DATA_UINT64 }, + { "l2_write_bytes", KSTAT_DATA_UINT64 }, + { "l2_writes_sent", KSTAT_DATA_UINT64 }, + { "l2_writes_done", KSTAT_DATA_UINT64 }, + { "l2_writes_error", KSTAT_DATA_UINT64 }, + { "l2_writes_lock_retry", KSTAT_DATA_UINT64 }, + { "l2_evict_lock_retry", KSTAT_DATA_UINT64 }, + { "l2_evict_reading", KSTAT_DATA_UINT64 }, + { "l2_evict_l1cached", KSTAT_DATA_UINT64 }, + { 
"l2_free_on_write", KSTAT_DATA_UINT64 }, + { "l2_abort_lowmem", KSTAT_DATA_UINT64 }, + { "l2_cksum_bad", KSTAT_DATA_UINT64 }, + { "l2_io_error", KSTAT_DATA_UINT64 }, + { "l2_size", KSTAT_DATA_UINT64 }, + { "l2_asize", KSTAT_DATA_UINT64 }, + { "l2_hdr_size", KSTAT_DATA_UINT64 }, + { "l2_write_trylock_fail", KSTAT_DATA_UINT64 }, + { "l2_write_passed_headroom", KSTAT_DATA_UINT64 }, + { "l2_write_spa_mismatch", KSTAT_DATA_UINT64 }, + { "l2_write_in_l2", KSTAT_DATA_UINT64 }, + { "l2_write_io_in_progress", KSTAT_DATA_UINT64 }, + { "l2_write_not_cacheable", KSTAT_DATA_UINT64 }, + { "l2_write_full", KSTAT_DATA_UINT64 }, + { "l2_write_buffer_iter", KSTAT_DATA_UINT64 }, + { "l2_write_pios", KSTAT_DATA_UINT64 }, + { "l2_write_buffer_bytes_scanned", KSTAT_DATA_UINT64 }, + { "l2_write_buffer_list_iter", KSTAT_DATA_UINT64 }, + { "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 }, + { "memory_throttle_count", KSTAT_DATA_UINT64 }, + { "memory_direct_count", KSTAT_DATA_UINT64 }, + { "memory_indirect_count", KSTAT_DATA_UINT64 }, + { "memory_all_bytes", KSTAT_DATA_UINT64 }, + { "memory_free_bytes", KSTAT_DATA_UINT64 }, + { "memory_available_bytes", KSTAT_DATA_UINT64 }, + { "arc_no_grow", KSTAT_DATA_UINT64 }, + { "arc_tempreserve", KSTAT_DATA_UINT64 }, + { "arc_loaned_bytes", KSTAT_DATA_UINT64 }, + { "arc_prune", KSTAT_DATA_UINT64 }, + { "arc_meta_used", KSTAT_DATA_UINT64 }, + { "arc_meta_limit", KSTAT_DATA_UINT64 }, + { "arc_dnode_limit", KSTAT_DATA_UINT64 }, + { "arc_meta_max", KSTAT_DATA_UINT64 }, + { "arc_meta_min", KSTAT_DATA_UINT64 }, + { "async_upgrade_sync", KSTAT_DATA_UINT64 }, + { "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 }, + { "demand_hit_prescient_prefetch", KSTAT_DATA_UINT64 }, +}; + +#define ARCSTAT(stat) (arc_stats.stat.value.ui64) + +#define ARCSTAT_INCR(stat, val) \ + atomic_add_64(&arc_stats.stat.value.ui64, (val)) + +#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) +#define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) + +#define ARCSTAT_MAX(stat, val) { \ + uint64_t m; \ + while ((val) > (m = arc_stats.stat.value.ui64) && \ + (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \ + continue; \ +} + +#define ARCSTAT_MAXSTAT(stat) \ + ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64) + +/* + * We define a macro to allow ARC hits/misses to be easily broken down by + * two separate conditions, giving a total of four different subtypes for + * each of hits and misses (so eight statistics total). + */ +#define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \ + if (cond1) { \ + if (cond2) { \ + ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \ + } else { \ + ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \ + } \ + } else { \ + if (cond2) { \ + ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \ + } else { \ + ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\ + } \ + } + +kstat_t *arc_ksp; +static arc_state_t *arc_anon; +static arc_state_t *arc_mru; +static arc_state_t *arc_mru_ghost; +static arc_state_t *arc_mfu; +static arc_state_t *arc_mfu_ghost; +static arc_state_t *arc_l2c_only; + +/* + * There are several ARC variables that are critical to export as kstats -- + * but we don't want to have to grovel around in the kstat whenever we wish to + * manipulate them. For these variables, we therefore define them to be in + * terms of the statistic variable. This assures that we are not introducing + * the possibility of inconsistency by having shadow copies of the variables, + * while still allowing the code to be readable. 
+ */ +#define arc_p ARCSTAT(arcstat_p) /* target size of MRU */ +#define arc_c ARCSTAT(arcstat_c) /* target size of cache */ +#define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */ +#define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */ +#define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */ +#define arc_dnode_limit ARCSTAT(arcstat_dnode_limit) /* max size for dnodes */ +#define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */ +#define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */ +#define arc_dbuf_size ARCSTAT(arcstat_dbuf_size) /* dbuf metadata */ +#define arc_dnode_size ARCSTAT(arcstat_dnode_size) /* dnode metadata */ +#define arc_bonus_size ARCSTAT(arcstat_bonus_size) /* bonus buffer metadata */ + +/* compressed size of entire arc */ +#define arc_compressed_size ARCSTAT(arcstat_compressed_size) +/* uncompressed size of entire arc */ +#define arc_uncompressed_size ARCSTAT(arcstat_uncompressed_size) +/* number of bytes in the arc from arc_buf_t's */ +#define arc_overhead_size ARCSTAT(arcstat_overhead_size) + +/* + * There are also some ARC variables that we want to export, but that are + * updated so often that having the canonical representation be the statistic + * variable causes a performance bottleneck. We want to use aggsum_t's for these + * instead, but still be able to export the kstat in the same way as before. + * The solution is to always use the aggsum version, except in the kstat update + * callback. + */ +aggsum_t arc_size; +aggsum_t arc_meta_used; +aggsum_t astat_data_size; +aggsum_t astat_metadata_size; +aggsum_t astat_hdr_size; +aggsum_t astat_bonus_size; +aggsum_t astat_dnode_size; +aggsum_t astat_dbuf_size; +aggsum_t astat_l2_hdr_size; + +static list_t arc_prune_list; +static kmutex_t arc_prune_mtx; +static taskq_t *arc_prune_taskq; + +static int arc_no_grow; /* Don't try to grow cache size */ +static hrtime_t arc_growtime; +static uint64_t arc_tempreserve; +static uint64_t arc_loaned_bytes; + +typedef struct arc_callback arc_callback_t; + +struct arc_callback { + void *acb_private; + arc_read_done_func_t *acb_done; + arc_buf_t *acb_buf; + boolean_t acb_compressed; + zio_t *acb_zio_dummy; + zio_t *acb_zio_head; + arc_callback_t *acb_next; +}; + +typedef struct arc_write_callback arc_write_callback_t; + +struct arc_write_callback { + void *awcb_private; + arc_write_done_func_t *awcb_ready; + arc_write_done_func_t *awcb_children_ready; + arc_write_done_func_t *awcb_physdone; + arc_write_done_func_t *awcb_done; + arc_buf_t *awcb_buf; +}; + +/* + * ARC buffers are separated into multiple structs as a memory saving measure: + * - Common fields struct, always defined, and embedded within it: + * - L2-only fields, always allocated but undefined when not in L2ARC + * - L1-only fields, only allocated when in L1ARC + * + * Buffer in L1 Buffer only in L2 + * +------------------------+ +------------------------+ + * | arc_buf_hdr_t | | arc_buf_hdr_t | + * | | | | + * | | | | + * | | | | + * +------------------------+ +------------------------+ + * | l2arc_buf_hdr_t | | l2arc_buf_hdr_t | + * | (undefined if L1-only) | | | + * +------------------------+ +------------------------+ + * | l1arc_buf_hdr_t | + * | | + * | | + * | | + * | | + * +------------------------+ + * + * Because it's possible for the L2ARC to become extremely large, we can wind + * up eating a lot of memory in L2ARC buffer headers, so the size of a header + * is minimized by only allocating the fields necessary for an 
L1-cached buffer + * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and + * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple + * words in pointers. arc_hdr_realloc() is used to switch a header between + * these two allocation states. + */ +typedef struct l1arc_buf_hdr { + kmutex_t b_freeze_lock; + zio_cksum_t *b_freeze_cksum; +#ifdef ZFS_DEBUG + /* + * Used for debugging with kmem_flags - by allocating and freeing + * b_thawed when the buffer is thawed, we get a record of the stack + * trace that thawed it. + */ + void *b_thawed; +#endif + + arc_buf_t *b_buf; + uint32_t b_bufcnt; + /* for waiting on writes to complete */ + kcondvar_t b_cv; + uint8_t b_byteswap; + + /* protected by arc state mutex */ + arc_state_t *b_state; + multilist_node_t b_arc_node; + + /* updated atomically */ + clock_t b_arc_access; + uint32_t b_mru_hits; + uint32_t b_mru_ghost_hits; + uint32_t b_mfu_hits; + uint32_t b_mfu_ghost_hits; + uint32_t b_l2_hits; + + /* self protecting */ + refcount_t b_refcnt; + + arc_callback_t *b_acb; + abd_t *b_pabd; +} l1arc_buf_hdr_t; + +typedef struct l2arc_dev l2arc_dev_t; + +typedef struct l2arc_buf_hdr { + /* protected by arc_buf_hdr mutex */ + l2arc_dev_t *b_dev; /* L2ARC device */ + uint64_t b_daddr; /* disk address, offset byte */ + uint32_t b_hits; + + list_node_t b_l2node; +} l2arc_buf_hdr_t; + +struct arc_buf_hdr { + /* protected by hash lock */ + dva_t b_dva; + uint64_t b_birth; + + arc_buf_contents_t b_type; + arc_buf_hdr_t *b_hash_next; + arc_flags_t b_flags; + + /* + * This field stores the size of the data buffer after + * compression, and is set in the arc's zio completion handlers. + * It is in units of SPA_MINBLOCKSIZE (e.g. 1 == 512 bytes). + * + * While the block pointers can store up to 32MB in their psize + * field, we can only store up to 32MB minus 512B. This is due + * to the bp using a bias of 1, whereas we use a bias of 0 (i.e. + * a field of zeros represents 512B in the bp). We can't use a + * bias of 1 since we need to reserve a psize of zero, here, to + * represent holes and embedded blocks. + * + * This isn't a problem in practice, since the maximum size of a + * buffer is limited to 16MB, so we never need to store 32MB in + * this field. Even in the upstream illumos code base, the + * maximum size of a buffer is limited to 16MB. + */ + uint16_t b_psize; + + /* + * This field stores the size of the data buffer before + * compression, and cannot change once set. It is in units + * of SPA_MINBLOCKSIZE (e.g. 2 == 1024 bytes) + */ + uint16_t b_lsize; /* immutable */ + uint64_t b_spa; /* immutable */ + + /* L2ARC fields. Undefined when not in L2ARC. */ + l2arc_buf_hdr_t b_l2hdr; + /* L1ARC fields. 
Undefined when in l2arc_only state */ + l1arc_buf_hdr_t b_l1hdr; +}; + +#if defined(__FreeBSD__) && defined(_KERNEL) +static int +sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS) +{ + uint64_t val; + int err; + + val = arc_meta_limit; + err = sysctl_handle_64(oidp, &val, 0, req); + if (err != 0 || req->newptr == NULL) + return (err); + + if (val <= 0 || val > arc_c_max) + return (EINVAL); + + arc_meta_limit = val; + return (0); +} + +static int +sysctl_vfs_zfs_arc_no_grow_shift(SYSCTL_HANDLER_ARGS) +{ + uint32_t val; + int err; + + val = arc_no_grow_shift; + err = sysctl_handle_32(oidp, &val, 0, req); + if (err != 0 || req->newptr == NULL) + return (err); + + if (val >= arc_shrink_shift) + return (EINVAL); + + arc_no_grow_shift = val; + return (0); +} + +static int +sysctl_vfs_zfs_arc_max(SYSCTL_HANDLER_ARGS) +{ + uint64_t val; + int err; + + val = zfs_arc_max; + err = sysctl_handle_64(oidp, &val, 0, req); + if (err != 0 || req->newptr == NULL) + return (err); + + if (zfs_arc_max == 0) { + /* Loader tunable so blindly set */ + zfs_arc_max = val; + return (0); + } + + if (val < arc_abs_min || val > kmem_size()) + return (EINVAL); + if (val < arc_c_min) + return (EINVAL); + if (zfs_arc_meta_limit > 0 && val < zfs_arc_meta_limit) + return (EINVAL); + + arc_c_max = val; + + arc_c = arc_c_max; + arc_p = (arc_c >> 1); + + if (zfs_arc_meta_limit == 0) { + /* limit meta-data to 1/4 of the arc capacity */ + arc_meta_limit = arc_c_max / 4; + } + + /* if kmem_flags are set, lets try to use less memory */ + if (kmem_debugging()) + arc_c = arc_c / 2; + + zfs_arc_max = arc_c; + + return (0); +} + +static int +sysctl_vfs_zfs_arc_min(SYSCTL_HANDLER_ARGS) +{ + uint64_t val; + int err; + + val = zfs_arc_min; + err = sysctl_handle_64(oidp, &val, 0, req); + if (err != 0 || req->newptr == NULL) + return (err); + + if (zfs_arc_min == 0) { + /* Loader tunable so blindly set */ + zfs_arc_min = val; + return (0); + } + + if (val < arc_abs_min || val > arc_c_max) + return (EINVAL); + + arc_c_min = val; + + if (zfs_arc_meta_min == 0) + arc_meta_min = arc_c_min / 2; + + if (arc_c < arc_c_min) + arc_c = arc_c_min; + + zfs_arc_min = arc_c_min; + + return (0); +} +#endif + +#define GHOST_STATE(state) \ + ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \ + (state) == arc_l2c_only) + +#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE) +#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) +#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_FLAG_IO_ERROR) +#define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH) +#define HDR_PRESCIENT_PREFETCH(hdr) \ + ((hdr)->b_flags & ARC_FLAG_PRESCIENT_PREFETCH) +#define HDR_COMPRESSION_ENABLED(hdr) \ + ((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC) + +#define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_FLAG_L2CACHE) +#define HDR_L2_READING(hdr) \ + (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \ + ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)) +#define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITING) +#define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_FLAG_L2_EVICTED) +#define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD) +#define HDR_SHARED_DATA(hdr) ((hdr)->b_flags & ARC_FLAG_SHARED_DATA) + +#define HDR_ISTYPE_METADATA(hdr) \ + ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA) +#define HDR_ISTYPE_DATA(hdr) (!HDR_ISTYPE_METADATA(hdr)) + +#define HDR_HAS_L1HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR) +#define HDR_HAS_L2HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR) + +/* For storing compression mode in b_flags */ +#define 
HDR_COMPRESS_OFFSET (highbit64(ARC_FLAG_COMPRESS_0) - 1) + +#define HDR_GET_COMPRESS(hdr) ((enum zio_compress)BF32_GET((hdr)->b_flags, \ + HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS)) +#define HDR_SET_COMPRESS(hdr, cmp) BF32_SET((hdr)->b_flags, \ + HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS, (cmp)); + +#define ARC_BUF_LAST(buf) ((buf)->b_next == NULL) +#define ARC_BUF_SHARED(buf) ((buf)->b_flags & ARC_BUF_FLAG_SHARED) +#define ARC_BUF_COMPRESSED(buf) ((buf)->b_flags & ARC_BUF_FLAG_COMPRESSED) + +/* + * Other sizes + */ + +#define HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) +#define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr)) + +/* + * Hash table routines + */ + +#define HT_LOCK_PAD CACHE_LINE_SIZE + +struct ht_lock { + kmutex_t ht_lock; +#ifdef _KERNEL + unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))]; +#endif +}; + +#define BUF_LOCKS 256 +typedef struct buf_hash_table { + uint64_t ht_mask; + arc_buf_hdr_t **ht_table; + struct ht_lock ht_locks[BUF_LOCKS] __aligned(CACHE_LINE_SIZE); +} buf_hash_table_t; + +static buf_hash_table_t buf_hash_table; + +#define BUF_HASH_INDEX(spa, dva, birth) \ + (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) +#define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) +#define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock)) +#define HDR_LOCK(hdr) \ + (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth))) + +uint64_t zfs_crc64_table[256]; + +/* + * Level 2 ARC + */ + +#define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */ +#define L2ARC_HEADROOM 2 /* num of writes */ +/* + * If we discover during ARC scan any buffers to be compressed, we boost + * our headroom for the next scanning cycle by this percentage multiple. + */ +#define L2ARC_HEADROOM_BOOST 200 +#define L2ARC_FEED_SECS 1 /* caching interval secs */ +#define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */ + +#define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) +#define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) + +/* L2ARC Performance Tunables */ +uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */ +uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */ +uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */ +uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST; +uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ +uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */ +boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ +boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */ +boolean_t l2arc_norw = B_TRUE; /* no reads during writes */ + +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RWTUN, + &l2arc_write_max, 0, "max write size"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RWTUN, + &l2arc_write_boost, 0, "extra write during warmup"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RWTUN, + &l2arc_headroom, 0, "number of dev writes"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RWTUN, + &l2arc_feed_secs, 0, "interval seconds"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RWTUN, + &l2arc_feed_min_ms, 0, "min interval milliseconds"); + +SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RWTUN, + &l2arc_noprefetch, 0, "don't cache prefetch bufs"); +SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RWTUN, + &l2arc_feed_again, 0, "turbo warmup"); +SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RWTUN, + &l2arc_norw, 
0, "no reads during writes");
+
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD,
+ &ARC_anon.arcs_size.rc_count, 0, "size of anonymous state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_esize, CTLFLAG_RD,
+ &ARC_anon.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
+ "size of evictable metadata in anonymous state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_esize, CTLFLAG_RD,
+ &ARC_anon.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
+ "size of evictable data in anonymous state");
+
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD,
+ &ARC_mru.arcs_size.rc_count, 0, "size of mru state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_esize, CTLFLAG_RD,
+ &ARC_mru.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
+ "size of metadata in mru state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_esize, CTLFLAG_RD,
+ &ARC_mru.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
+ "size of data in mru state");
+
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD,
+ &ARC_mru_ghost.arcs_size.rc_count, 0, "size of mru ghost state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_esize, CTLFLAG_RD,
+ &ARC_mru_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
+ "size of metadata in mru ghost state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_esize, CTLFLAG_RD,
+ &ARC_mru_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
+ "size of data in mru ghost state");
+
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD,
+ &ARC_mfu.arcs_size.rc_count, 0, "size of mfu state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_esize, CTLFLAG_RD,
+ &ARC_mfu.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
+ "size of metadata in mfu state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_esize, CTLFLAG_RD,
+ &ARC_mfu.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
+ "size of data in mfu state");
+
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD,
+ &ARC_mfu_ghost.arcs_size.rc_count, 0, "size of mfu ghost state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_esize, CTLFLAG_RD,
+ &ARC_mfu_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
+ "size of metadata in mfu ghost state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_esize, CTLFLAG_RD,
+ &ARC_mfu_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
+ "size of data in mfu ghost state");
+
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD,
+ &ARC_l2c_only.arcs_size.rc_count, 0, "size of l2c_only state");
+
+SYSCTL_UINT(_vfs_zfs, OID_AUTO, arc_min_prefetch_ms, CTLFLAG_RW,
+ &zfs_arc_min_prefetch_ms, 0, "Min life of prefetch block in ms");
+SYSCTL_UINT(_vfs_zfs, OID_AUTO, arc_min_prescient_prefetch_ms, CTLFLAG_RW,
+ &zfs_arc_min_prescient_prefetch_ms, 0, "Min life of prescient prefetched block in ms");
+
+/*
+ * L2ARC Internals
+ */
+struct l2arc_dev {
+ vdev_t *l2ad_vdev; /* vdev */
+ spa_t *l2ad_spa; /* spa */
+ uint64_t l2ad_hand; /* next write location */
+ uint64_t l2ad_start; /* first addr on device */
+ uint64_t l2ad_end; /* last addr on device */
+ boolean_t l2ad_first; /* first sweep through */
+ boolean_t l2ad_writing; /* currently writing */
+ kmutex_t l2ad_mtx; /* lock for buffer list */
+ list_t l2ad_buflist; /* buffer list */
+ list_node_t l2ad_node; /* device list node */
+ refcount_t l2ad_alloc; /* allocated bytes */
+};
+
+static list_t L2ARC_dev_list; /* device list */
+static list_t *l2arc_dev_list; /* device list pointer */
+static kmutex_t l2arc_dev_mtx; /* device list mutex */
+static l2arc_dev_t *l2arc_dev_last; /* last device used */
+static list_t L2ARC_free_on_write; /* free after write buf list */
+static list_t *l2arc_free_on_write; /* free after write list ptr */
+static
kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ +static uint64_t l2arc_ndev; /* number of devices */ + +typedef struct l2arc_read_callback { + arc_buf_hdr_t *l2rcb_hdr; /* read header */ + blkptr_t l2rcb_bp; /* original blkptr */ + zbookmark_phys_t l2rcb_zb; /* original bookmark */ + int l2rcb_flags; /* original flags */ + abd_t *l2rcb_abd; /* temporary buffer */ +} l2arc_read_callback_t; + +typedef struct l2arc_write_callback { + l2arc_dev_t *l2wcb_dev; /* device info */ + arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ +} l2arc_write_callback_t; + +typedef struct l2arc_data_free { + /* protected by l2arc_free_on_write_mtx */ + abd_t *l2df_abd; + size_t l2df_size; + arc_buf_contents_t l2df_type; + list_node_t l2df_list_node; +} l2arc_data_free_t; + +static kmutex_t l2arc_feed_thr_lock; +static kcondvar_t l2arc_feed_thr_cv; +static uint8_t l2arc_thread_exit; + +static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, void *); +static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, void *); +static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, void *); +static void arc_free_data_abd(arc_buf_hdr_t *, abd_t *, uint64_t, void *); +static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, void *); +static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag); +static void arc_hdr_free_pabd(arc_buf_hdr_t *); +static void arc_hdr_alloc_pabd(arc_buf_hdr_t *); +static void arc_access(arc_buf_hdr_t *, kmutex_t *); +static boolean_t arc_is_overflowing(); +static void arc_buf_watch(arc_buf_t *); +static void arc_prune_async(int64_t); + +static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *); +static uint32_t arc_bufc_to_flags(arc_buf_contents_t); +static inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags); +static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags); + +static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *); +static void l2arc_read_done(zio_t *); + +static void +l2arc_trim(const arc_buf_hdr_t *hdr) +{ + l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; + + ASSERT(HDR_HAS_L2HDR(hdr)); + ASSERT(MUTEX_HELD(&dev->l2ad_mtx)); + + if (HDR_GET_PSIZE(hdr) != 0) { + trim_map_free(dev->l2ad_vdev, hdr->b_l2hdr.b_daddr, + HDR_GET_PSIZE(hdr), 0); + } +} + +/* + * We use Cityhash for this. It's fast, and has good hash properties without + * requiring any large static buffers. 
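+ *
+ * The table index is then derived from the hash via BUF_HASH_INDEX(),
+ * defined above:
+ *
+ * idx = buf_hash(spa, dva, birth) & buf_hash_table.ht_mask;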
+ */ +static uint64_t +buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) +{ + return (cityhash4(spa, dva->dva_word[0], dva->dva_word[1], birth)); +} + +#define HDR_EMPTY(hdr) \ + ((hdr)->b_dva.dva_word[0] == 0 && \ + (hdr)->b_dva.dva_word[1] == 0) + +#define HDR_EQUAL(spa, dva, birth, hdr) \ + ((hdr)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ + ((hdr)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ + ((hdr)->b_birth == birth) && ((hdr)->b_spa == spa) + +static void +buf_discard_identity(arc_buf_hdr_t *hdr) +{ + hdr->b_dva.dva_word[0] = 0; + hdr->b_dva.dva_word[1] = 0; + hdr->b_birth = 0; +} + +static arc_buf_hdr_t * +buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp) +{ + const dva_t *dva = BP_IDENTITY(bp); + uint64_t birth = BP_PHYSICAL_BIRTH(bp); + uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); + kmutex_t *hash_lock = BUF_HASH_LOCK(idx); + arc_buf_hdr_t *hdr; + + mutex_enter(hash_lock); + for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL; + hdr = hdr->b_hash_next) { + if (HDR_EQUAL(spa, dva, birth, hdr)) { + *lockp = hash_lock; + return (hdr); + } + } + mutex_exit(hash_lock); + *lockp = NULL; + return (NULL); +} + +/* + * Insert an entry into the hash table. If there is already an element + * equal to elem in the hash table, then the already existing element + * will be returned and the new element will not be inserted. + * Otherwise returns NULL. + * If lockp == NULL, the caller is assumed to already hold the hash lock. + */ +static arc_buf_hdr_t * +buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp) +{ + uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); + kmutex_t *hash_lock = BUF_HASH_LOCK(idx); + arc_buf_hdr_t *fhdr; + uint32_t i; + + ASSERT(!DVA_IS_EMPTY(&hdr->b_dva)); + ASSERT(hdr->b_birth != 0); + ASSERT(!HDR_IN_HASH_TABLE(hdr)); + + if (lockp != NULL) { + *lockp = hash_lock; + mutex_enter(hash_lock); + } else { + ASSERT(MUTEX_HELD(hash_lock)); + } + + for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL; + fhdr = fhdr->b_hash_next, i++) { + if (HDR_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr)) + return (fhdr); + } + + hdr->b_hash_next = buf_hash_table.ht_table[idx]; + buf_hash_table.ht_table[idx] = hdr; + arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE); + + /* collect some hash table performance data */ + if (i > 0) { + ARCSTAT_BUMP(arcstat_hash_collisions); + if (i == 1) + ARCSTAT_BUMP(arcstat_hash_chains); + + ARCSTAT_MAX(arcstat_hash_chain_max, i); + } + + ARCSTAT_BUMP(arcstat_hash_elements); + ARCSTAT_MAXSTAT(arcstat_hash_elements); + + return (NULL); +} + +static void +buf_hash_remove(arc_buf_hdr_t *hdr) +{ + arc_buf_hdr_t *fhdr, **hdrp; + uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); + + ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); + ASSERT(HDR_IN_HASH_TABLE(hdr)); + + hdrp = &buf_hash_table.ht_table[idx]; + while ((fhdr = *hdrp) != hdr) { + ASSERT3P(fhdr, !=, NULL); + hdrp = &fhdr->b_hash_next; + } + *hdrp = hdr->b_hash_next; + hdr->b_hash_next = NULL; + arc_hdr_clear_flags(hdr, ARC_FLAG_IN_HASH_TABLE); + + /* collect some hash table performance data */ + ARCSTAT_BUMPDOWN(arcstat_hash_elements); + + if (buf_hash_table.ht_table[idx] && + buf_hash_table.ht_table[idx]->b_hash_next == NULL) + ARCSTAT_BUMPDOWN(arcstat_hash_chains); +} + +/* + * Global data structures and functions for the buf kmem cache. 
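+ *
+ * Three kmem caches are created in buf_init() below: hdr_full_cache
+ * for headers carrying L1 state, hdr_l2only_cache for L2-only headers,
+ * and buf_cache for arc_buf_t structures.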
+ */ +static kmem_cache_t *hdr_full_cache; +static kmem_cache_t *hdr_l2only_cache; +static kmem_cache_t *buf_cache; + +static void +buf_fini(void) +{ + int i; + + kmem_free(buf_hash_table.ht_table, + (buf_hash_table.ht_mask + 1) * sizeof (void *)); + for (i = 0; i < BUF_LOCKS; i++) + mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); + kmem_cache_destroy(hdr_full_cache); + kmem_cache_destroy(hdr_l2only_cache); + kmem_cache_destroy(buf_cache); +} + +/* + * Constructor callback - called when the cache is empty + * and a new buf is requested. + */ +/* ARGSUSED */ +static int +hdr_full_cons(void *vbuf, void *unused, int kmflag) +{ + arc_buf_hdr_t *hdr = vbuf; + + bzero(hdr, HDR_FULL_SIZE); + cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL); + refcount_create(&hdr->b_l1hdr.b_refcnt); + mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); + multilist_link_init(&hdr->b_l1hdr.b_arc_node); + arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS); + + return (0); +} + +/* ARGSUSED */ +static int +hdr_l2only_cons(void *vbuf, void *unused, int kmflag) +{ + arc_buf_hdr_t *hdr = vbuf; + + bzero(hdr, HDR_L2ONLY_SIZE); + arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); + + return (0); +} + +/* ARGSUSED */ +static int +buf_cons(void *vbuf, void *unused, int kmflag) +{ + arc_buf_t *buf = vbuf; + + bzero(buf, sizeof (arc_buf_t)); + mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL); + arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS); + + return (0); +} + +/* + * Destructor callback - called when a cached buf is + * no longer required. + */ +/* ARGSUSED */ +static void +hdr_full_dest(void *vbuf, void *unused) +{ + arc_buf_hdr_t *hdr = vbuf; + + ASSERT(HDR_EMPTY(hdr)); + cv_destroy(&hdr->b_l1hdr.b_cv); + refcount_destroy(&hdr->b_l1hdr.b_refcnt); + mutex_destroy(&hdr->b_l1hdr.b_freeze_lock); + ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); + arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS); +} + +/* ARGSUSED */ +static void +hdr_l2only_dest(void *vbuf, void *unused) +{ + arc_buf_hdr_t *hdr = vbuf; + + ASSERT(HDR_EMPTY(hdr)); + arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); +} + +/* ARGSUSED */ +static void +buf_dest(void *vbuf, void *unused) +{ + arc_buf_t *buf = vbuf; + + mutex_destroy(&buf->b_evict_lock); + arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS); +} + +/* + * Reclaim callback -- invoked when memory is low. + */ +/* ARGSUSED */ +static void +hdr_recl(void *unused) +{ + dprintf("hdr_recl called\n"); + /* + * umem calls the reclaim func when we destroy the buf cache, + * which is after we do arc_fini(). + */ + if (arc_initialized) + zthr_wakeup(arc_reap_zthr); +} + +static void +buf_init(void) +{ + uint64_t *ct; + uint64_t hsize = 1ULL << 12; + int i, j; + + /* + * The hash table is big enough to fill all of physical memory + * with an average block size of zfs_arc_average_blocksize (default 8K). + * By default, the table will take up + * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers). 
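+ *
+ * As a worked example: with 16GB of physical memory and the default
+ * 8K average block size, the loop below doubles hsize until it reaches
+ * 2^21 buckets, i.e. 2^21 * sizeof (void *) = 16MB of hash table.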
+ */ + while (hsize * zfs_arc_average_blocksize < (uint64_t)physmem * PAGESIZE) + hsize <<= 1; +retry: + buf_hash_table.ht_mask = hsize - 1; + buf_hash_table.ht_table = + kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP); + if (buf_hash_table.ht_table == NULL) { + ASSERT(hsize > (1ULL << 8)); + hsize >>= 1; + goto retry; + } + + hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE, + 0, hdr_full_cons, hdr_full_dest, hdr_recl, NULL, NULL, 0); + hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only", + HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, hdr_recl, + NULL, NULL, 0); + buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), + 0, buf_cons, buf_dest, NULL, NULL, NULL, 0); + + for (i = 0; i < 256; i++) + for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) + *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); + + for (i = 0; i < BUF_LOCKS; i++) { + mutex_init(&buf_hash_table.ht_locks[i].ht_lock, + NULL, MUTEX_DEFAULT, NULL); + } +} + +/* + * This is the size that the buf occupies in memory. If the buf is compressed, + * it will correspond to the compressed size. You should use this method of + * getting the buf size unless you explicitly need the logical size. + */ +int32_t +arc_buf_size(arc_buf_t *buf) +{ + return (ARC_BUF_COMPRESSED(buf) ? + HDR_GET_PSIZE(buf->b_hdr) : HDR_GET_LSIZE(buf->b_hdr)); +} + +int32_t +arc_buf_lsize(arc_buf_t *buf) +{ + return (HDR_GET_LSIZE(buf->b_hdr)); +} + +enum zio_compress +arc_get_compression(arc_buf_t *buf) +{ + return (ARC_BUF_COMPRESSED(buf) ? + HDR_GET_COMPRESS(buf->b_hdr) : ZIO_COMPRESS_OFF); +} + +#define ARC_MINTIME (hz>>4) /* 62 ms */ + +static inline boolean_t +arc_buf_is_shared(arc_buf_t *buf) +{ + boolean_t shared = (buf->b_data != NULL && + buf->b_hdr->b_l1hdr.b_pabd != NULL && + abd_is_linear(buf->b_hdr->b_l1hdr.b_pabd) && + buf->b_data == abd_to_buf(buf->b_hdr->b_l1hdr.b_pabd)); + IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr)); + IMPLY(shared, ARC_BUF_SHARED(buf)); + IMPLY(shared, ARC_BUF_COMPRESSED(buf) || ARC_BUF_LAST(buf)); + + /* + * It would be nice to assert arc_can_share() too, but the "hdr isn't + * already being shared" requirement prevents us from doing that. + */ + + return (shared); +} + +/* + * Free the checksum associated with this header. If there is no checksum, this + * is a no-op. + */ +static inline void +arc_cksum_free(arc_buf_hdr_t *hdr) +{ + ASSERT(HDR_HAS_L1HDR(hdr)); + mutex_enter(&hdr->b_l1hdr.b_freeze_lock); + if (hdr->b_l1hdr.b_freeze_cksum != NULL) { + kmem_free(hdr->b_l1hdr.b_freeze_cksum, sizeof (zio_cksum_t)); + hdr->b_l1hdr.b_freeze_cksum = NULL; + } + mutex_exit(&hdr->b_l1hdr.b_freeze_lock); +} + +/* + * Return true iff at least one of the bufs on hdr is not compressed. + */ +static boolean_t +arc_hdr_has_uncompressed_buf(arc_buf_hdr_t *hdr) +{ + for (arc_buf_t *b = hdr->b_l1hdr.b_buf; b != NULL; b = b->b_next) { + if (!ARC_BUF_COMPRESSED(b)) { + return (B_TRUE); + } + } + return (B_FALSE); +} + +/* + * If we've turned on the ZFS_DEBUG_MODIFY flag, verify that the buf's data + * matches the checksum that is stored in the hdr. If there is no checksum, + * or if the buf is compressed, this is a no-op. 
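+ * (The ZFS_DEBUG_MODIFY bit of the zfs_flags tunable gates this whole
+ * path; when it is clear the function returns immediately.)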
+ */ +static void +arc_cksum_verify(arc_buf_t *buf) +{ + arc_buf_hdr_t *hdr = buf->b_hdr; + zio_cksum_t zc; + + if (!(zfs_flags & ZFS_DEBUG_MODIFY)) + return; + + if (ARC_BUF_COMPRESSED(buf)) { + ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL || + arc_hdr_has_uncompressed_buf(hdr)); + return; + } + + ASSERT(HDR_HAS_L1HDR(hdr)); + + mutex_enter(&hdr->b_l1hdr.b_freeze_lock); + if (hdr->b_l1hdr.b_freeze_cksum == NULL || HDR_IO_ERROR(hdr)) { + mutex_exit(&hdr->b_l1hdr.b_freeze_lock); + return; + } + + fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, &zc); + if (!ZIO_CHECKSUM_EQUAL(*hdr->b_l1hdr.b_freeze_cksum, zc)) + panic("buffer modified while frozen!"); + mutex_exit(&hdr->b_l1hdr.b_freeze_lock); +} + +static boolean_t +arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio) +{ + enum zio_compress compress = BP_GET_COMPRESS(zio->io_bp); + boolean_t valid_cksum; + + ASSERT(!BP_IS_EMBEDDED(zio->io_bp)); + VERIFY3U(BP_GET_PSIZE(zio->io_bp), ==, HDR_GET_PSIZE(hdr)); + + /* + * We rely on the blkptr's checksum to determine if the block + * is valid or not. When compressed arc is enabled, the l2arc + * writes the block to the l2arc just as it appears in the pool. + * This allows us to use the blkptr's checksum to validate the + * data that we just read off of the l2arc without having to store + * a separate checksum in the arc_buf_hdr_t. However, if compressed + * arc is disabled, then the data written to the l2arc is always + * uncompressed and won't match the block as it exists in the main + * pool. When this is the case, we must first compress it if it is + * compressed on the main pool before we can validate the checksum. + */ + if (!HDR_COMPRESSION_ENABLED(hdr) && compress != ZIO_COMPRESS_OFF) { + ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); + uint64_t lsize = HDR_GET_LSIZE(hdr); + uint64_t csize; + + abd_t *cdata = abd_alloc_linear(HDR_GET_PSIZE(hdr), B_TRUE); + csize = zio_compress_data(compress, zio->io_abd, + abd_to_buf(cdata), lsize); + + ASSERT3U(csize, <=, HDR_GET_PSIZE(hdr)); + if (csize < HDR_GET_PSIZE(hdr)) { + /* + * Compressed blocks are always a multiple of the + * smallest ashift in the pool. Ideally, we would + * like to round up the csize to the next + * spa_min_ashift but that value may have changed + * since the block was last written. Instead, + * we rely on the fact that the hdr's psize + * was set to the psize of the block when it was + * last written. We set the csize to that value + * and zero out any part that should not contain + * data. + */ + abd_zero_off(cdata, csize, HDR_GET_PSIZE(hdr) - csize); + csize = HDR_GET_PSIZE(hdr); + } + zio_push_transform(zio, cdata, csize, HDR_GET_PSIZE(hdr), NULL); + } + + /* + * Block pointers always store the checksum for the logical data. + * If the block pointer has the gang bit set, then the checksum + * it represents is for the reconstituted data and not for an + * individual gang member. The zio pipeline, however, must be able to + * determine the checksum of each of the gang constituents so it + * treats the checksum comparison differently than what we need + * for l2arc blocks. This prevents us from using the + * zio_checksum_error() interface directly. Instead we must call the + * zio_checksum_error_impl() so that we can ensure the checksum is + * generated using the correct checksum algorithm and accounts for the + * logical I/O size and not just a gang fragment. 
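+ * (zio_pop_transforms() below undoes the transform pushed above for
+ * the compressed-arc-disabled case once the checksum has been checked.)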
+ */ + valid_cksum = (zio_checksum_error_impl(zio->io_spa, zio->io_bp, + BP_GET_CHECKSUM(zio->io_bp), zio->io_abd, zio->io_size, + zio->io_offset, NULL) == 0); + zio_pop_transforms(zio); + return (valid_cksum); +} + +/* + * Given a buf full of data, if ZFS_DEBUG_MODIFY is enabled this computes a + * checksum and attaches it to the buf's hdr so that we can ensure that the buf + * isn't modified later on. If buf is compressed or there is already a checksum + * on the hdr, this is a no-op (we only checksum uncompressed bufs). + */ +static void +arc_cksum_compute(arc_buf_t *buf) +{ + arc_buf_hdr_t *hdr = buf->b_hdr; + + if (!(zfs_flags & ZFS_DEBUG_MODIFY)) + return; + + ASSERT(HDR_HAS_L1HDR(hdr)); + + mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); + if (hdr->b_l1hdr.b_freeze_cksum != NULL) { + ASSERT(arc_hdr_has_uncompressed_buf(hdr)); + mutex_exit(&hdr->b_l1hdr.b_freeze_lock); + return; + } else if (ARC_BUF_COMPRESSED(buf)) { + mutex_exit(&hdr->b_l1hdr.b_freeze_lock); + return; + } + + ASSERT(!ARC_BUF_COMPRESSED(buf)); + hdr->b_l1hdr.b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), + KM_SLEEP); + fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, + hdr->b_l1hdr.b_freeze_cksum); + mutex_exit(&hdr->b_l1hdr.b_freeze_lock); +#ifdef illumos + arc_buf_watch(buf); +#endif +} + +#ifdef illumos +#ifndef _KERNEL +typedef struct procctl { + long cmd; + prwatch_t prwatch; +} procctl_t; +#endif + +/* ARGSUSED */ +static void +arc_buf_unwatch(arc_buf_t *buf) +{ +#ifndef _KERNEL + if (arc_watch) { + int result; + procctl_t ctl; + ctl.cmd = PCWATCH; + ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; + ctl.prwatch.pr_size = 0; + ctl.prwatch.pr_wflags = 0; + result = write(arc_procfd, &ctl, sizeof (ctl)); + ASSERT3U(result, ==, sizeof (ctl)); + } +#endif +} + +/* ARGSUSED */ +static void +arc_buf_watch(arc_buf_t *buf) +{ +#ifndef _KERNEL + if (arc_watch) { + int result; + procctl_t ctl; + ctl.cmd = PCWATCH; + ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; + ctl.prwatch.pr_size = arc_buf_size(buf); + ctl.prwatch.pr_wflags = WA_WRITE; + result = write(arc_procfd, &ctl, sizeof (ctl)); + ASSERT3U(result, ==, sizeof (ctl)); + } +#endif +} +#endif /* illumos */ + +static arc_buf_contents_t +arc_buf_type(arc_buf_hdr_t *hdr) +{ + arc_buf_contents_t type; + if (HDR_ISTYPE_METADATA(hdr)) { + type = ARC_BUFC_METADATA; + } else { + type = ARC_BUFC_DATA; + } + VERIFY3U(hdr->b_type, ==, type); + return (type); +} + +boolean_t +arc_is_metadata(arc_buf_t *buf) +{ + return (HDR_ISTYPE_METADATA(buf->b_hdr) != 0); +} + +static uint32_t +arc_bufc_to_flags(arc_buf_contents_t type) +{ + switch (type) { + case ARC_BUFC_DATA: + /* metadata field is 0 if buffer contains normal data */ + return (0); + case ARC_BUFC_METADATA: + return (ARC_FLAG_BUFC_METADATA); + default: + break; + } + panic("undefined ARC buffer type!"); + return ((uint32_t)-1); +} + +void +arc_buf_thaw(arc_buf_t *buf) +{ + arc_buf_hdr_t *hdr = buf->b_hdr; + + ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); + ASSERT(!HDR_IO_IN_PROGRESS(hdr)); + + arc_cksum_verify(buf); + + /* + * Compressed buffers do not manipulate the b_freeze_cksum or + * allocate b_thawed. 
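+ * (As a sketch of the lifecycle: arc_buf_freeze() records a fletcher-2
+ * checksum of b_data via arc_cksum_compute(), and a later thaw first
+ * verifies that checksum with arc_cksum_verify() before discarding it.)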
+ */
+ if (ARC_BUF_COMPRESSED(buf)) {
+ ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL ||
+ arc_hdr_has_uncompressed_buf(hdr));
+ return;
+ }
+
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ arc_cksum_free(hdr);
+
+ mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
+#ifdef ZFS_DEBUG
+ if (zfs_flags & ZFS_DEBUG_MODIFY) {
+ if (hdr->b_l1hdr.b_thawed != NULL)
+ kmem_free(hdr->b_l1hdr.b_thawed, 1);
+ hdr->b_l1hdr.b_thawed = kmem_alloc(1, KM_SLEEP);
+ }
+#endif
+
+ mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
+
+#ifdef illumos
+ arc_buf_unwatch(buf);
+#endif
+}
+
+void
+arc_buf_freeze(arc_buf_t *buf)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+ kmutex_t *hash_lock;
+
+ if (!(zfs_flags & ZFS_DEBUG_MODIFY))
+ return;
+
+ if (ARC_BUF_COMPRESSED(buf)) {
+ ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL ||
+ arc_hdr_has_uncompressed_buf(hdr));
+ return;
+ }
+
+ hash_lock = HDR_LOCK(hdr);
+ mutex_enter(hash_lock);
+
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ ASSERT(hdr->b_l1hdr.b_freeze_cksum != NULL ||
+ hdr->b_l1hdr.b_state == arc_anon);
+ arc_cksum_compute(buf);
+ mutex_exit(hash_lock);
+}
+
+/*
+ * The arc_buf_hdr_t's b_flags should never be modified directly. Instead,
+ * the following functions should be used to ensure that the flags are
+ * updated in a thread-safe way. When manipulating the flags either
+ * the hash_lock must be held or the hdr must be undiscoverable. This
+ * ensures that we're not racing with any other threads when updating
+ * the flags.
+ */
+static inline void
+arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags)
+{
+ ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
+ hdr->b_flags |= flags;
+}
+
+static inline void
+arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags)
+{
+ ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
+ hdr->b_flags &= ~flags;
+}
+
+/*
+ * Setting the compression bits in the arc_buf_hdr_t's b_flags is
+ * done in a special way since we have to clear and set bits
+ * at the same time. Consumers that wish to set the compression bits
+ * must use this function to ensure that the flags are updated in a
+ * thread-safe manner.
+ */
+static void
+arc_hdr_set_compress(arc_buf_hdr_t *hdr, enum zio_compress cmp)
+{
+ ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
+
+ /*
+ * Holes and embedded blocks will always have a psize = 0 so
+ * we ignore the compression of the blkptr and set the
+ * arc_buf_hdr_t's compression to ZIO_COMPRESS_OFF.
+ * Holes and embedded blocks remain anonymous so we don't
+ * want to uncompress them. Mark them as uncompressed.
+ */
+ if (!zfs_compressed_arc_enabled || HDR_GET_PSIZE(hdr) == 0) {
+ arc_hdr_clear_flags(hdr, ARC_FLAG_COMPRESSED_ARC);
+ HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF);
+ ASSERT(!HDR_COMPRESSION_ENABLED(hdr));
+ ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF);
+ } else {
+ arc_hdr_set_flags(hdr, ARC_FLAG_COMPRESSED_ARC);
+ HDR_SET_COMPRESS(hdr, cmp);
+ ASSERT3U(HDR_GET_COMPRESS(hdr), ==, cmp);
+ ASSERT(HDR_COMPRESSION_ENABLED(hdr));
+ }
+}
+
+/*
+ * Looks for another buf on the same hdr which has the data decompressed, copies
+ * from it, and returns true. If no such buf exists, returns false.
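+ * (arc_buf_fill(), below, uses this as a cheaper alternative to
+ * decompressing the hdr's b_pabd with zio_decompress_data().)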
+ */
+static boolean_t
+arc_buf_try_copy_decompressed_data(arc_buf_t *buf)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+ boolean_t copied = B_FALSE;
+
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ ASSERT3P(buf->b_data, !=, NULL);
+ ASSERT(!ARC_BUF_COMPRESSED(buf));
+
+ for (arc_buf_t *from = hdr->b_l1hdr.b_buf; from != NULL;
+ from = from->b_next) {
+ /* can't use our own data buffer */
+ if (from == buf) {
+ continue;
+ }
+
+ if (!ARC_BUF_COMPRESSED(from)) {
+ bcopy(from->b_data, buf->b_data, arc_buf_size(buf));
+ copied = B_TRUE;
+ break;
+ }
+ }
+
+ /*
+ * There were no decompressed bufs, so there should not be a
+ * checksum on the hdr either.
+ */
+ EQUIV(!copied, hdr->b_l1hdr.b_freeze_cksum == NULL);
+
+ return (copied);
+}
+
+/*
+ * Given a buf that has a data buffer attached to it, this function will
+ * efficiently fill the buf with data of the specified compression setting from
+ * the hdr and update the hdr's b_freeze_cksum if necessary. If the buf and hdr
+ * are already sharing a data buf, no copy is performed.
+ *
+ * If the buf is marked as compressed but uncompressed data was requested, this
+ * will allocate a new data buffer for the buf, remove that flag, and fill the
+ * buf with uncompressed data. You can't request a compressed buf on a hdr with
+ * uncompressed data, and (since we haven't added support for it yet) if you
+ * want compressed data your buf must already be marked as compressed and have
+ * the correct-sized data buffer.
+ */
+static int
+arc_buf_fill(arc_buf_t *buf, boolean_t compressed)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+ boolean_t hdr_compressed = (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF);
+ dmu_object_byteswap_t bswap = hdr->b_l1hdr.b_byteswap;
+
+ ASSERT3P(buf->b_data, !=, NULL);
+ IMPLY(compressed, hdr_compressed);
+ IMPLY(compressed, ARC_BUF_COMPRESSED(buf));
+
+ if (hdr_compressed == compressed) {
+ if (!arc_buf_is_shared(buf)) {
+ abd_copy_to_buf(buf->b_data, hdr->b_l1hdr.b_pabd,
+ arc_buf_size(buf));
+ }
+ } else {
+ ASSERT(hdr_compressed);
+ ASSERT(!compressed);
+ ASSERT3U(HDR_GET_LSIZE(hdr), !=, HDR_GET_PSIZE(hdr));
+
+ /*
+ * If the buf is sharing its data with the hdr, unlink it and
+ * allocate a new data buffer for the buf.
+ */
+ if (arc_buf_is_shared(buf)) {
+ ASSERT(ARC_BUF_COMPRESSED(buf));
+
+ /* We need to give the buf its own b_data */
+ buf->b_flags &= ~ARC_BUF_FLAG_SHARED;
+ buf->b_data =
+ arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf);
+ arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
+
+ /* Previously overhead was 0; just add new overhead */
+ ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr));
+ } else if (ARC_BUF_COMPRESSED(buf)) {
+ /* We need to reallocate the buf's b_data */
+ arc_free_data_buf(hdr, buf->b_data, HDR_GET_PSIZE(hdr),
+ buf);
+ buf->b_data =
+ arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf);
+
+ /* We increased the size of b_data; update overhead */
+ ARCSTAT_INCR(arcstat_overhead_size,
+ HDR_GET_LSIZE(hdr) - HDR_GET_PSIZE(hdr));
+ }
+
+ /*
+ * Regardless of the buf's previous compression settings, it
+ * should not be compressed at the end of this function.
+ */
+ buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED;
+
+ /*
+ * Try copying the data from another buf which already has a
+ * decompressed version. If that's not possible, it's time to
+ * bite the bullet and decompress the data from the hdr.
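+ * (A successful copy implies the hdr already carries a b_freeze_cksum;
+ * the ASSERT below checks exactly that.)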
+ */ + if (arc_buf_try_copy_decompressed_data(buf)) { + /* Skip byteswapping and checksumming (already done) */ + ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, !=, NULL); + return (0); + } else { + int error = zio_decompress_data(HDR_GET_COMPRESS(hdr), + hdr->b_l1hdr.b_pabd, buf->b_data, + HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr)); + + /* + * Absent hardware errors or software bugs, this should + * be impossible, but log it anyway so we can debug it. + */ + if (error != 0) { + zfs_dbgmsg( + "hdr %p, compress %d, psize %d, lsize %d", + hdr, HDR_GET_COMPRESS(hdr), + HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr)); + return (SET_ERROR(EIO)); + } + } + } + + /* Byteswap the buf's data if necessary */ + if (bswap != DMU_BSWAP_NUMFUNCS) { + ASSERT(!HDR_SHARED_DATA(hdr)); + ASSERT3U(bswap, <, DMU_BSWAP_NUMFUNCS); + dmu_ot_byteswap[bswap].ob_func(buf->b_data, HDR_GET_LSIZE(hdr)); + } + + /* Compute the hdr's checksum if necessary */ + arc_cksum_compute(buf); + + return (0); +} + +int +arc_decompress(arc_buf_t *buf) +{ + return (arc_buf_fill(buf, B_FALSE)); +} + +/* + * Return the size of the block, b_pabd, that is stored in the arc_buf_hdr_t. + */ +static uint64_t +arc_hdr_size(arc_buf_hdr_t *hdr) +{ + uint64_t size; + + if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && + HDR_GET_PSIZE(hdr) > 0) { + size = HDR_GET_PSIZE(hdr); + } else { + ASSERT3U(HDR_GET_LSIZE(hdr), !=, 0); + size = HDR_GET_LSIZE(hdr); + } + return (size); +} + +/* + * Increment the amount of evictable space in the arc_state_t's refcount. + * We account for the space used by the hdr and the arc buf individually + * so that we can add and remove them from the refcount individually. + */ +static void +arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state) +{ + arc_buf_contents_t type = arc_buf_type(hdr); + + ASSERT(HDR_HAS_L1HDR(hdr)); + + if (GHOST_STATE(state)) { + ASSERT0(hdr->b_l1hdr.b_bufcnt); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); + (void) refcount_add_many(&state->arcs_esize[type], + HDR_GET_LSIZE(hdr), hdr); + return; + } + + ASSERT(!GHOST_STATE(state)); + if (hdr->b_l1hdr.b_pabd != NULL) { + (void) refcount_add_many(&state->arcs_esize[type], + arc_hdr_size(hdr), hdr); + } + for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; + buf = buf->b_next) { + if (arc_buf_is_shared(buf)) + continue; + (void) refcount_add_many(&state->arcs_esize[type], + arc_buf_size(buf), buf); + } +} + +/* + * Decrement the amount of evictable space in the arc_state_t's refcount. + * We account for the space used by the hdr and the arc buf individually + * so that we can add and remove them from the refcount individually. + */ +static void +arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state) +{ + arc_buf_contents_t type = arc_buf_type(hdr); + + ASSERT(HDR_HAS_L1HDR(hdr)); + + if (GHOST_STATE(state)) { + ASSERT0(hdr->b_l1hdr.b_bufcnt); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); + (void) refcount_remove_many(&state->arcs_esize[type], + HDR_GET_LSIZE(hdr), hdr); + return; + } + + ASSERT(!GHOST_STATE(state)); + if (hdr->b_l1hdr.b_pabd != NULL) { + (void) refcount_remove_many(&state->arcs_esize[type], + arc_hdr_size(hdr), hdr); + } + for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; + buf = buf->b_next) { + if (arc_buf_is_shared(buf)) + continue; + (void) refcount_remove_many(&state->arcs_esize[type], + arc_buf_size(buf), buf); + } +} + +/* + * Add a reference to this hdr indicating that someone is actively + * referencing that memory. 
When the refcount transitions from 0 to 1,
+ * we remove it from the respective arc_state_t list to indicate that
+ * it is not evictable.
+ */
+static void
+add_reference(arc_buf_hdr_t *hdr, void *tag)
+{
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ if (!MUTEX_HELD(HDR_LOCK(hdr))) {
+ ASSERT(hdr->b_l1hdr.b_state == arc_anon);
+ ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+ ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
+ }
+
+ arc_state_t *state = hdr->b_l1hdr.b_state;
+
+ if ((refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) &&
+ (state != arc_anon)) {
+ /* We don't use the L2-only state list. */
+ if (state != arc_l2c_only) {
+ multilist_remove(state->arcs_list[arc_buf_type(hdr)],
+ hdr);
+ arc_evictable_space_decrement(hdr, state);
+ }
+ /* remove the prefetch flag if we get a reference */
+ arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH);
+ }
+}
+
+/*
+ * Remove a reference from this hdr. When the reference transitions from
+ * 1 to 0 and we're not anonymous, then we add this hdr to the arc_state_t's
+ * list making it eligible for eviction.
+ */
+static int
+remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
+{
+ int cnt;
+ arc_state_t *state = hdr->b_l1hdr.b_state;
+
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
+ ASSERT(!GHOST_STATE(state));
+
+ /*
+ * arc_l2c_only counts as a ghost state so we don't need to explicitly
+ * check to prevent usage of the arc_l2c_only list.
+ */
+ if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) &&
+ (state != arc_anon)) {
+ multilist_insert(state->arcs_list[arc_buf_type(hdr)], hdr);
+ ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0);
+ arc_evictable_space_increment(hdr, state);
+ }
+ return (cnt);
+}
+
+/*
+ * Returns detailed information about a specific arc buffer. When the
+ * state_index argument is set the function will calculate the arc header
+ * list position for its arc state. Since this requires a linear traversal,
+ * callers are strongly encouraged not to do this. However, it can be helpful
+ * for targeted analysis so the functionality is provided.
+ */
+void
+arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index)
+{
+ arc_buf_hdr_t *hdr = ab->b_hdr;
+ l1arc_buf_hdr_t *l1hdr = NULL;
+ l2arc_buf_hdr_t *l2hdr = NULL;
+ arc_state_t *state = NULL;
+
+ memset(abi, 0, sizeof (arc_buf_info_t));
+
+ if (hdr == NULL)
+ return;
+
+ abi->abi_flags = hdr->b_flags;
+
+ if (HDR_HAS_L1HDR(hdr)) {
+ l1hdr = &hdr->b_l1hdr;
+ state = l1hdr->b_state;
+ }
+ if (HDR_HAS_L2HDR(hdr))
+ l2hdr = &hdr->b_l2hdr;
+
+ if (l1hdr) {
+ abi->abi_bufcnt = l1hdr->b_bufcnt;
+ abi->abi_access = l1hdr->b_arc_access;
+ abi->abi_mru_hits = l1hdr->b_mru_hits;
+ abi->abi_mru_ghost_hits = l1hdr->b_mru_ghost_hits;
+ abi->abi_mfu_hits = l1hdr->b_mfu_hits;
+ abi->abi_mfu_ghost_hits = l1hdr->b_mfu_ghost_hits;
+ abi->abi_holds = refcount_count(&l1hdr->b_refcnt);
+ }
+
+ if (l2hdr) {
+ abi->abi_l2arc_dattr = l2hdr->b_daddr;
+ abi->abi_l2arc_hits = l2hdr->b_hits;
+ }
+
+ abi->abi_state_type = state ? state->arcs_state : ARC_STATE_ANON;
+ abi->abi_state_contents = arc_buf_type(hdr);
+ abi->abi_size = arc_hdr_size(hdr);
+}
+
+/*
+ * Move the supplied buffer to the indicated state. The hash lock
+ * for the buffer must be held by the caller.
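+ *
+ * (For example, arc_access() uses this to promote a header from the
+ * arc_mru to the arc_mfu state on a repeated hit, and eviction uses it
+ * to move headers into the corresponding ghost state.)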
+ */ +static void +arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, + kmutex_t *hash_lock) +{ + arc_state_t *old_state; + int64_t refcnt; + uint32_t bufcnt; + boolean_t update_old, update_new; + arc_buf_contents_t buftype = arc_buf_type(hdr); + + /* + * We almost always have an L1 hdr here, since we call arc_hdr_realloc() + * in arc_read() when bringing a buffer out of the L2ARC. However, the + * L1 hdr doesn't always exist when we change state to arc_anon before + * destroying a header, in which case reallocating to add the L1 hdr is + * pointless. + */ + if (HDR_HAS_L1HDR(hdr)) { + old_state = hdr->b_l1hdr.b_state; + refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt); + bufcnt = hdr->b_l1hdr.b_bufcnt; + update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pabd != NULL); + } else { + old_state = arc_l2c_only; + refcnt = 0; + bufcnt = 0; + update_old = B_FALSE; + } + update_new = update_old; + + ASSERT(MUTEX_HELD(hash_lock)); + ASSERT3P(new_state, !=, old_state); + ASSERT(!GHOST_STATE(new_state) || bufcnt == 0); + ASSERT(old_state != arc_anon || bufcnt <= 1); + + /* + * If this buffer is evictable, transfer it from the + * old state list to the new state list. + */ + if (refcnt == 0) { + if (old_state != arc_anon && old_state != arc_l2c_only) { + ASSERT(HDR_HAS_L1HDR(hdr)); + multilist_remove(old_state->arcs_list[buftype], hdr); + + if (GHOST_STATE(old_state)) { + ASSERT0(bufcnt); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + update_old = B_TRUE; + } + arc_evictable_space_decrement(hdr, old_state); + } + if (new_state != arc_anon && new_state != arc_l2c_only) { + + /* + * An L1 header always exists here, since if we're + * moving to some L1-cached state (i.e. not l2c_only or + * anonymous), we realloc the header to add an L1hdr + * beforehand. + */ + ASSERT(HDR_HAS_L1HDR(hdr)); + multilist_insert(new_state->arcs_list[buftype], hdr); + + if (GHOST_STATE(new_state)) { + ASSERT0(bufcnt); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + update_new = B_TRUE; + } + arc_evictable_space_increment(hdr, new_state); + } + } + + ASSERT(!HDR_EMPTY(hdr)); + if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr)) + buf_hash_remove(hdr); + + /* adjust state sizes (ignore arc_l2c_only) */ + + if (update_new && new_state != arc_l2c_only) { + ASSERT(HDR_HAS_L1HDR(hdr)); + if (GHOST_STATE(new_state)) { + ASSERT0(bufcnt); + + /* + * When moving a header to a ghost state, we first + * remove all arc buffers. Thus, we'll have a + * bufcnt of zero, and no arc buffer to use for + * the reference. As a result, we use the arc + * header pointer for the reference. + */ + (void) refcount_add_many(&new_state->arcs_size, + HDR_GET_LSIZE(hdr), hdr); + ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); + } else { + uint32_t buffers = 0; + + /* + * Each individual buffer holds a unique reference, + * thus we must remove each of these references one + * at a time. + */ + for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; + buf = buf->b_next) { + ASSERT3U(bufcnt, !=, 0); + buffers++; + + /* + * When the arc_buf_t is sharing the data + * block with the hdr, the owner of the + * reference belongs to the hdr. Only + * add to the refcount if the arc_buf_t is + * not shared. 
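+				 * (A shared buf's data block is the hdr's
+				 * b_pabd, so its space is covered by the
+				 * arc_hdr_size() reference taken on the hdr
+				 * below.)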
+ */ + if (arc_buf_is_shared(buf)) + continue; + + (void) refcount_add_many(&new_state->arcs_size, + arc_buf_size(buf), buf); + } + ASSERT3U(bufcnt, ==, buffers); + + if (hdr->b_l1hdr.b_pabd != NULL) { + (void) refcount_add_many(&new_state->arcs_size, + arc_hdr_size(hdr), hdr); + } else { + ASSERT(GHOST_STATE(old_state)); + } + } + } + + if (update_old && old_state != arc_l2c_only) { + ASSERT(HDR_HAS_L1HDR(hdr)); + if (GHOST_STATE(old_state)) { + ASSERT0(bufcnt); + ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); + + /* + * When moving a header off of a ghost state, + * the header will not contain any arc buffers. + * We use the arc header pointer for the reference + * which is exactly what we did when we put the + * header on the ghost state. + */ + + (void) refcount_remove_many(&old_state->arcs_size, + HDR_GET_LSIZE(hdr), hdr); + } else { + uint32_t buffers = 0; + + /* + * Each individual buffer holds a unique reference, + * thus we must remove each of these references one + * at a time. + */ + for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; + buf = buf->b_next) { + ASSERT3U(bufcnt, !=, 0); + buffers++; + + /* + * When the arc_buf_t is sharing the data + * block with the hdr, the owner of the + * reference belongs to the hdr. Only + * add to the refcount if the arc_buf_t is + * not shared. + */ + if (arc_buf_is_shared(buf)) + continue; + + (void) refcount_remove_many( + &old_state->arcs_size, arc_buf_size(buf), + buf); + } + ASSERT3U(bufcnt, ==, buffers); + ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); + (void) refcount_remove_many( + &old_state->arcs_size, arc_hdr_size(hdr), hdr); + } + } + + if (HDR_HAS_L1HDR(hdr)) + hdr->b_l1hdr.b_state = new_state; + + /* + * L2 headers should never be on the L2 state list since they don't + * have L1 headers allocated. 
+ */ + ASSERT(multilist_is_empty(arc_l2c_only->arcs_list[ARC_BUFC_DATA]) && + multilist_is_empty(arc_l2c_only->arcs_list[ARC_BUFC_METADATA])); +} + +void +arc_space_consume(uint64_t space, arc_space_type_t type) +{ + ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); + + switch (type) { + case ARC_SPACE_DATA: + aggsum_add(&astat_data_size, space); + break; + case ARC_SPACE_META: + aggsum_add(&astat_metadata_size, space); + break; + case ARC_SPACE_BONUS: + aggsum_add(&astat_bonus_size, space); + break; + case ARC_SPACE_DNODE: + aggsum_add(&astat_dnode_size, space); + break; + case ARC_SPACE_DBUF: + aggsum_add(&astat_dbuf_size, space); + break; + case ARC_SPACE_HDRS: + aggsum_add(&astat_hdr_size, space); + break; + case ARC_SPACE_L2HDRS: + aggsum_add(&astat_l2_hdr_size, space); + break; + } + + if (type != ARC_SPACE_DATA) + aggsum_add(&arc_meta_used, space); + + aggsum_add(&arc_size, space); +} + +void +arc_space_return(uint64_t space, arc_space_type_t type) +{ + ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); + + switch (type) { + case ARC_SPACE_DATA: + aggsum_add(&astat_data_size, -space); + break; + case ARC_SPACE_META: + aggsum_add(&astat_metadata_size, -space); + break; + case ARC_SPACE_BONUS: + aggsum_add(&astat_bonus_size, -space); + break; + case ARC_SPACE_DNODE: + aggsum_add(&astat_dnode_size, -space); + break; + case ARC_SPACE_DBUF: + aggsum_add(&astat_dbuf_size, -space); + break; + case ARC_SPACE_HDRS: + aggsum_add(&astat_hdr_size, -space); + break; + case ARC_SPACE_L2HDRS: + aggsum_add(&astat_l2_hdr_size, -space); + break; + } + + if (type != ARC_SPACE_DATA) { + ASSERT(aggsum_compare(&arc_meta_used, space) >= 0); + /* + * We use the upper bound here rather than the precise value + * because the arc_meta_max value doesn't need to be + * precise. It's only consumed by humans via arcstats. + */ + if (arc_meta_max < aggsum_upper_bound(&arc_meta_used)) + arc_meta_max = aggsum_upper_bound(&arc_meta_used); + aggsum_add(&arc_meta_used, -space); + } + + ASSERT(aggsum_compare(&arc_size, space) >= 0); + aggsum_add(&arc_size, -space); +} + +/* + * Given a hdr and a buf, returns whether that buf can share its b_data buffer + * with the hdr's b_pabd. + */ +static boolean_t +arc_can_share(arc_buf_hdr_t *hdr, arc_buf_t *buf) +{ + /* + * The criteria for sharing a hdr's data are: + * 1. the hdr's compression matches the buf's compression + * 2. the hdr doesn't need to be byteswapped + * 3. the hdr isn't already being shared + * 4. the buf is either compressed or it is the last buf in the hdr list + * + * Criterion #4 maintains the invariant that shared uncompressed + * bufs must be the final buf in the hdr's b_buf list. Reading this, you + * might ask, "if a compressed buf is allocated first, won't that be the + * last thing in the list?", but in that case it's impossible to create + * a shared uncompressed buf anyway (because the hdr must be compressed + * to have the compressed buf). You might also think that #3 is + * sufficient to make this guarantee, however it's possible + * (specifically in the rare L2ARC write race mentioned in + * arc_buf_alloc_impl()) there will be an existing uncompressed buf that + * is sharable, but wasn't at the time of its allocation. Rather than + * allow a new shared uncompressed buf to be created and then shuffle + * the list around to make it the last element, this simply disallows + * sharing if the new buf isn't the first to be added. 
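+	 *
+	 * Expressed compactly, the return expression below is roughly:
+	 *
+	 *	compress_match && !byteswap_pending && !already_shared &&
+	 *	    (compressed_buf || last_buf)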
+	 */
+	ASSERT3P(buf->b_hdr, ==, hdr);
+	boolean_t hdr_compressed = HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF;
+	boolean_t buf_compressed = ARC_BUF_COMPRESSED(buf) != 0;
+	return (buf_compressed == hdr_compressed &&
+	    hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS &&
+	    !HDR_SHARED_DATA(hdr) &&
+	    (ARC_BUF_LAST(buf) || ARC_BUF_COMPRESSED(buf)));
+}
+
+/*
+ * Allocate a buf for this hdr. If you care about the data that's in the hdr,
+ * or if you want a compressed buffer, pass those flags in. Returns 0 if the
+ * copy was made successfully, or an error code otherwise.
+ */
+static int
+arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag, boolean_t compressed,
+    boolean_t fill, arc_buf_t **ret)
+{
+	arc_buf_t *buf;
+
+	ASSERT(HDR_HAS_L1HDR(hdr));
+	ASSERT3U(HDR_GET_LSIZE(hdr), >, 0);
+	VERIFY(hdr->b_type == ARC_BUFC_DATA ||
+	    hdr->b_type == ARC_BUFC_METADATA);
+	ASSERT3P(ret, !=, NULL);
+	ASSERT3P(*ret, ==, NULL);
+
+	buf = *ret = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
+	buf->b_hdr = hdr;
+	buf->b_data = NULL;
+	buf->b_next = hdr->b_l1hdr.b_buf;
+	buf->b_flags = 0;
+
+	add_reference(hdr, tag);
+
+	/*
+	 * We're about to change the hdr's b_flags. We must either
+	 * hold the hash_lock or be undiscoverable.
+	 */
+	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
+
+	/*
+	 * Only honor requests for compressed bufs if the hdr is actually
+	 * compressed.
+	 */
+	if (compressed && HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF)
+		buf->b_flags |= ARC_BUF_FLAG_COMPRESSED;
+
+	/*
+	 * If the hdr's data can be shared then we share the data buffer and
+	 * set the appropriate bit in the hdr's b_flags to indicate the hdr is
+	 * sharing its b_pabd with the arc_buf_t. Otherwise, we allocate a new
+	 * buffer to store the buf's data.
+	 *
+	 * There are two additional restrictions here because we're sharing
+	 * hdr -> buf instead of the usual buf -> hdr. First, the hdr can't be
+	 * actively involved in an L2ARC write, because if this buf is used by
+	 * an arc_write() then the hdr's data buffer will be released when the
+	 * write completes, even though the L2ARC write might still be using it.
+	 * Second, the hdr's ABD must be linear so that the buf's user doesn't
+	 * need to be ABD-aware.
+	 */
+	boolean_t can_share = arc_can_share(hdr, buf) && !HDR_L2_WRITING(hdr) &&
+	    abd_is_linear(hdr->b_l1hdr.b_pabd);
+
+	/* Set up b_data and sharing */
+	if (can_share) {
+		buf->b_data = abd_to_buf(hdr->b_l1hdr.b_pabd);
+		buf->b_flags |= ARC_BUF_FLAG_SHARED;
+		arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA);
+	} else {
+		buf->b_data =
+		    arc_get_data_buf(hdr, arc_buf_size(buf), buf);
+		ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf));
+	}
+	VERIFY3P(buf->b_data, !=, NULL);
+
+	hdr->b_l1hdr.b_buf = buf;
+	hdr->b_l1hdr.b_bufcnt += 1;
+
+	/*
+	 * If the user wants the data from the hdr, we need to either copy or
+	 * decompress the data.
+	 */
+	if (fill) {
+		return (arc_buf_fill(buf, ARC_BUF_COMPRESSED(buf) != 0));
+	}
+
+	return (0);
+}
+
+static char *arc_onloan_tag = "onloan";
+
+static inline void
+arc_loaned_bytes_update(int64_t delta)
+{
+	atomic_add_64(&arc_loaned_bytes, delta);
+
+	/* assert that it did not wrap around */
+	ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0);
+}
+
+/*
+ * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
+ * flight data by arc_tempreserve_space() until they are "returned". Loaned
+ * buffers must be returned to the arc before they can be used by the DMU or
+ * freed.
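+ *
+ * An illustrative lifecycle (caller-side names are hypothetical):
+ *
+ *	abuf = arc_loan_buf(spa, B_FALSE, size);
+ *	... fill abuf->b_data ...
+ *	arc_return_buf(abuf, tag);	(or hand the loan over to the DMU)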
+ */ +arc_buf_t * +arc_loan_buf(spa_t *spa, boolean_t is_metadata, int size) +{ + arc_buf_t *buf = arc_alloc_buf(spa, arc_onloan_tag, + is_metadata ? ARC_BUFC_METADATA : ARC_BUFC_DATA, size); + + arc_loaned_bytes_update(arc_buf_size(buf)); + + return (buf); +} + +arc_buf_t * +arc_loan_compressed_buf(spa_t *spa, uint64_t psize, uint64_t lsize, + enum zio_compress compression_type) +{ + arc_buf_t *buf = arc_alloc_compressed_buf(spa, arc_onloan_tag, + psize, lsize, compression_type); + + arc_loaned_bytes_update(arc_buf_size(buf)); + + return (buf); +} + + +/* + * Return a loaned arc buffer to the arc. + */ +void +arc_return_buf(arc_buf_t *buf, void *tag) +{ + arc_buf_hdr_t *hdr = buf->b_hdr; + + ASSERT3P(buf->b_data, !=, NULL); + ASSERT(HDR_HAS_L1HDR(hdr)); + (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag); + (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); + + arc_loaned_bytes_update(-arc_buf_size(buf)); +} + +/* Detach an arc_buf from a dbuf (tag) */ +void +arc_loan_inuse_buf(arc_buf_t *buf, void *tag) +{ + arc_buf_hdr_t *hdr = buf->b_hdr; + + ASSERT3P(buf->b_data, !=, NULL); + ASSERT(HDR_HAS_L1HDR(hdr)); + (void) refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); + (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, tag); + + arc_loaned_bytes_update(arc_buf_size(buf)); +} + +static void +l2arc_free_abd_on_write(abd_t *abd, size_t size, arc_buf_contents_t type) +{ + l2arc_data_free_t *df = kmem_alloc(sizeof (*df), KM_SLEEP); + + df->l2df_abd = abd; + df->l2df_size = size; + df->l2df_type = type; + mutex_enter(&l2arc_free_on_write_mtx); + list_insert_head(l2arc_free_on_write, df); + mutex_exit(&l2arc_free_on_write_mtx); +} + +static void +arc_hdr_free_on_write(arc_buf_hdr_t *hdr) +{ + arc_state_t *state = hdr->b_l1hdr.b_state; + arc_buf_contents_t type = arc_buf_type(hdr); + uint64_t size = arc_hdr_size(hdr); + + /* protected by hash lock, if in the hash table */ + if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { + ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); + ASSERT(state != arc_anon && state != arc_l2c_only); + + (void) refcount_remove_many(&state->arcs_esize[type], + size, hdr); + } + (void) refcount_remove_many(&state->arcs_size, size, hdr); + if (type == ARC_BUFC_METADATA) { + arc_space_return(size, ARC_SPACE_META); + } else { + ASSERT(type == ARC_BUFC_DATA); + arc_space_return(size, ARC_SPACE_DATA); + } + + l2arc_free_abd_on_write(hdr->b_l1hdr.b_pabd, size, type); +} + +/* + * Share the arc_buf_t's data with the hdr. Whenever we are sharing the + * data buffer, we transfer the refcount ownership to the hdr and update + * the appropriate kstats. + */ +static void +arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) +{ + arc_state_t *state = hdr->b_l1hdr.b_state; + + ASSERT(arc_can_share(hdr, buf)); + ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); + ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); + + /* + * Start sharing the data buffer. We transfer the + * refcount ownership to the hdr since it always owns + * the refcount whenever an arc_buf_t is shared. + */ + refcount_transfer_ownership(&state->arcs_size, buf, hdr); + hdr->b_l1hdr.b_pabd = abd_get_from_buf(buf->b_data, arc_buf_size(buf)); + abd_take_ownership_of_buf(hdr->b_l1hdr.b_pabd, + HDR_ISTYPE_METADATA(hdr)); + arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); + buf->b_flags |= ARC_BUF_FLAG_SHARED; + + /* + * Since we've transferred ownership to the hdr we need + * to increment its compressed and uncompressed kstats and + * decrement the overhead size. 
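+	 * (These updates mirror the inverse ARCSTAT_INCR calls in
+	 * arc_unshare_buf() below.)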
+	 */
+	ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr));
+	ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr));
+	ARCSTAT_INCR(arcstat_overhead_size, -arc_buf_size(buf));
+}
+
+static void
+arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
+{
+	arc_state_t *state = hdr->b_l1hdr.b_state;
+
+	ASSERT(arc_buf_is_shared(buf));
+	ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
+	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
+
+	/*
+	 * We are no longer sharing this buffer so we need
+	 * to transfer its ownership to the rightful owner.
+	 */
+	refcount_transfer_ownership(&state->arcs_size, hdr, buf);
+	arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
+	abd_release_ownership_of_buf(hdr->b_l1hdr.b_pabd);
+	abd_put(hdr->b_l1hdr.b_pabd);
+	hdr->b_l1hdr.b_pabd = NULL;
+	buf->b_flags &= ~ARC_BUF_FLAG_SHARED;
+
+	/*
+	 * Since the buffer is no longer shared between
+	 * the arc buf and the hdr, count it as overhead.
+	 */
+	ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr));
+	ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr));
+	ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf));
+}
+
+/*
+ * Remove an arc_buf_t from the hdr's buf list and return the last
+ * arc_buf_t on the list. If no buffers remain on the list then return
+ * NULL.
+ */
+static arc_buf_t *
+arc_buf_remove(arc_buf_hdr_t *hdr, arc_buf_t *buf)
+{
+	ASSERT(HDR_HAS_L1HDR(hdr));
+	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
+
+	arc_buf_t **bufp = &hdr->b_l1hdr.b_buf;
+	arc_buf_t *lastbuf = NULL;
+
+	/*
+	 * Remove the buf from the hdr list and locate the last
+	 * remaining buffer on the list.
+	 */
+	while (*bufp != NULL) {
+		if (*bufp == buf)
+			*bufp = buf->b_next;
+
+		/*
+		 * If we've removed a buffer in the middle of
+		 * the list then update the lastbuf and update
+		 * bufp.
+		 */
+		if (*bufp != NULL) {
+			lastbuf = *bufp;
+			bufp = &(*bufp)->b_next;
+		}
+	}
+	buf->b_next = NULL;
+	ASSERT3P(lastbuf, !=, buf);
+	IMPLY(hdr->b_l1hdr.b_bufcnt > 0, lastbuf != NULL);
+	IMPLY(hdr->b_l1hdr.b_bufcnt > 0, hdr->b_l1hdr.b_buf != NULL);
+	IMPLY(lastbuf != NULL, ARC_BUF_LAST(lastbuf));
+
+	return (lastbuf);
+}
+
+/*
+ * Free up buf->b_data and pull the arc_buf_t off of the arc_buf_hdr_t's
+ * list and free it.
+ */
+static void
+arc_buf_destroy_impl(arc_buf_t *buf)
+{
+	arc_buf_hdr_t *hdr = buf->b_hdr;
+
+	/*
+	 * Free up the data associated with the buf but only if we're not
+	 * sharing this with the hdr. If we are sharing it with the hdr, the
+	 * hdr is responsible for doing the free.
+	 */
+	if (buf->b_data != NULL) {
+		/*
+		 * We're about to change the hdr's b_flags. We must either
+		 * hold the hash_lock or be undiscoverable.
+		 */
+		ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
+
+		arc_cksum_verify(buf);
+#ifdef illumos
+		arc_buf_unwatch(buf);
+#endif
+
+		if (arc_buf_is_shared(buf)) {
+			arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
+		} else {
+			uint64_t size = arc_buf_size(buf);
+			arc_free_data_buf(hdr, buf->b_data, size, buf);
+			ARCSTAT_INCR(arcstat_overhead_size, -size);
+		}
+		buf->b_data = NULL;
+
+		ASSERT(hdr->b_l1hdr.b_bufcnt > 0);
+		hdr->b_l1hdr.b_bufcnt -= 1;
+	}
+
+	arc_buf_t *lastbuf = arc_buf_remove(hdr, buf);
+
+	if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) {
+		/*
+		 * If the current arc_buf_t is sharing its data buffer with the
+		 * hdr, then reassign the hdr's b_pabd to share it with the new
+		 * buffer at the end of the list. The shared buffer is always
+		 * the last one on the hdr's buffer list.
+		 *
+		 * There is an equivalent case for compressed bufs, but since
+		 * they aren't guaranteed to be the last buf in the list and
+		 * that is an exceedingly rare case, we just allow that space
+		 * to be wasted temporarily.
+		 */
+		if (lastbuf != NULL) {
+			/* Only one buf can be shared at once */
+			VERIFY(!arc_buf_is_shared(lastbuf));
+			/* hdr is uncompressed so can't have compressed buf */
+			VERIFY(!ARC_BUF_COMPRESSED(lastbuf));
+
+			ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
+			arc_hdr_free_pabd(hdr);
+
+			/*
+			 * We must setup a new shared block between the
+			 * last buffer and the hdr. The data would have
+			 * been allocated by the arc buf so we need to transfer
+			 * ownership to the hdr since it's now being shared.
+			 */
+			arc_share_buf(hdr, lastbuf);
+		}
+	} else if (HDR_SHARED_DATA(hdr)) {
+		/*
+		 * Uncompressed shared buffers are always at the end
+		 * of the list. Compressed buffers don't have the
+		 * same requirements. This makes it hard to
+		 * simply assert that the lastbuf is shared so
+		 * we rely on the hdr's compression flags to determine
+		 * if we have a compressed, shared buffer.
+		 */
+		ASSERT3P(lastbuf, !=, NULL);
+		ASSERT(arc_buf_is_shared(lastbuf) ||
+		    HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF);
+	}
+
+	/*
+	 * Free the checksum if we're removing the last uncompressed buf from
+	 * this hdr.
+	 */
+	if (!arc_hdr_has_uncompressed_buf(hdr)) {
+		arc_cksum_free(hdr);
+	}
+
+	/* clean up the buf */
+	buf->b_hdr = NULL;
+	kmem_cache_free(buf_cache, buf);
+}
+
+static void
+arc_hdr_alloc_pabd(arc_buf_hdr_t *hdr)
+{
+	ASSERT3U(HDR_GET_LSIZE(hdr), >, 0);
+	ASSERT(HDR_HAS_L1HDR(hdr));
+	ASSERT(!HDR_SHARED_DATA(hdr));
+
+	ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
+	hdr->b_l1hdr.b_pabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr);
+	hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
+	ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
+
+	ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr));
+	ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr));
+}
+
+static void
+arc_hdr_free_pabd(arc_buf_hdr_t *hdr)
+{
+	ASSERT(HDR_HAS_L1HDR(hdr));
+	ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
+
+	/*
+	 * If the hdr is currently being written to the l2arc then
+	 * we defer freeing the data by adding it to the l2arc_free_on_write
+	 * list. The l2arc will free the data once it's finished
+	 * writing it to the l2arc device.
+	 */
+	if (HDR_L2_WRITING(hdr)) {
+		arc_hdr_free_on_write(hdr);
+		ARCSTAT_BUMP(arcstat_l2_free_on_write);
+	} else {
+		arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd,
+		    arc_hdr_size(hdr), hdr);
+	}
+	hdr->b_l1hdr.b_pabd = NULL;
+	hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
+
+	ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr));
+	ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr));
+}
+
+static arc_buf_hdr_t *
+arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize,
+    enum zio_compress compression_type, arc_buf_contents_t type)
+{
+	arc_buf_hdr_t *hdr;
+
+	VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA);
+
+	hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
+	ASSERT(HDR_EMPTY(hdr));
+	ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
+	ASSERT3P(hdr->b_l1hdr.b_thawed, ==, NULL);
+	HDR_SET_PSIZE(hdr, psize);
+	HDR_SET_LSIZE(hdr, lsize);
+	hdr->b_spa = spa;
+	hdr->b_type = type;
+	hdr->b_flags = 0;
+	arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L1HDR);
+	arc_hdr_set_compress(hdr, compression_type);
+
+	hdr->b_l1hdr.b_state = arc_anon;
+	hdr->b_l1hdr.b_arc_access = 0;
+	hdr->b_l1hdr.b_bufcnt = 0;
+	hdr->b_l1hdr.b_buf = NULL;
+
+	/*
+	 * Allocate the hdr's buffer. This will contain either
+	 * the compressed or uncompressed data depending on the block
+	 * it references and compressed arc enablement.
+	 */
+	arc_hdr_alloc_pabd(hdr);
+	ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+
+	return (hdr);
+}
+
+/*
+ * Transition between the two allocation states for the arc_buf_hdr struct.
+ * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without
+ * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller
+ * version is used when a cache buffer is only in the L2ARC in order to reduce
+ * memory usage.
+ */
+static arc_buf_hdr_t *
+arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
+{
+	ASSERT(HDR_HAS_L2HDR(hdr));
+
+	arc_buf_hdr_t *nhdr;
+	l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
+
+	ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) ||
+	    (old == hdr_l2only_cache && new == hdr_full_cache));
+
+	nhdr = kmem_cache_alloc(new, KM_PUSHPAGE);
+
+	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
+	buf_hash_remove(hdr);
+
+	bcopy(hdr, nhdr, HDR_L2ONLY_SIZE);
+
+	if (new == hdr_full_cache) {
+		arc_hdr_set_flags(nhdr, ARC_FLAG_HAS_L1HDR);
+		/*
+		 * arc_access and arc_change_state need to be aware that a
+		 * header has just come out of L2ARC, so we set its state to
+		 * l2c_only even though it's about to change.
+		 */
+		nhdr->b_l1hdr.b_state = arc_l2c_only;
+
+		/* Verify previous threads set to NULL before freeing */
+		ASSERT3P(nhdr->b_l1hdr.b_pabd, ==, NULL);
+	} else {
+		ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
+		ASSERT0(hdr->b_l1hdr.b_bufcnt);
+		ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
+
+		/*
+		 * If we've reached here, we must have been called from
+		 * arc_evict_hdr(), as such we should have already been
+		 * removed from any ghost list we were previously on
+		 * (which protects us from racing with arc_evict_state),
+		 * thus no locking is needed during this check.
+		 */
+		ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
+
+		/*
+		 * A buffer must not be moved into the arc_l2c_only
+		 * state if it's not finished being written out to the
+		 * l2arc device. Otherwise, the b_l1hdr.b_pabd field
+		 * might try to be accessed, even though it was removed.
+		 */
+		VERIFY(!HDR_L2_WRITING(hdr));
+		VERIFY3P(hdr->b_l1hdr.b_pabd, ==, NULL);
+
+#ifdef ZFS_DEBUG
+		if (hdr->b_l1hdr.b_thawed != NULL) {
+			kmem_free(hdr->b_l1hdr.b_thawed, 1);
+			hdr->b_l1hdr.b_thawed = NULL;
+		}
+#endif
+
+		arc_hdr_clear_flags(nhdr, ARC_FLAG_HAS_L1HDR);
+	}
+	/*
+	 * The header has been reallocated so we need to re-insert it into any
+	 * lists it was on.
+	 */
+	(void) buf_hash_insert(nhdr, NULL);
+
+	ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node));
+
+	mutex_enter(&dev->l2ad_mtx);
+
+	/*
+	 * We must place the realloc'ed header back into the list at
+	 * the same spot. Otherwise, if it's placed earlier in the list,
+	 * l2arc_write_buffers() could find it during the function's
+	 * write phase, and try to write it out to the l2arc.
+	 */
+	list_insert_after(&dev->l2ad_buflist, hdr, nhdr);
+	list_remove(&dev->l2ad_buflist, hdr);
+
+	mutex_exit(&dev->l2ad_mtx);
+
+	/*
+	 * Since we're using the pointer address as the tag when
+	 * incrementing and decrementing the l2ad_alloc refcount, we
+	 * must remove the old pointer (that we're about to destroy) and
+	 * add the new pointer to the refcount. Otherwise we'd remove
+	 * the wrong pointer address when calling arc_hdr_destroy() later.
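+	 * (arc_hdr_size(hdr) and arc_hdr_size(nhdr) are equal here, since
+	 * the psize/lsize fields were copied by the bcopy() above; only
+	 * the tag pointer changes.)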
+ */ + + (void) refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr); + (void) refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(nhdr), nhdr); + + buf_discard_identity(hdr); + kmem_cache_free(old, hdr); + + return (nhdr); +} + +/* + * Allocate a new arc_buf_hdr_t and arc_buf_t and return the buf to the caller. + * The buf is returned thawed since we expect the consumer to modify it. + */ +arc_buf_t * +arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type, int32_t size) +{ + arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), size, size, + ZIO_COMPRESS_OFF, type); + ASSERT(!MUTEX_HELD(HDR_LOCK(hdr))); + + arc_buf_t *buf = NULL; + VERIFY0(arc_buf_alloc_impl(hdr, tag, B_FALSE, B_FALSE, &buf)); + arc_buf_thaw(buf); + + return (buf); +} + +/* + * Allocate a compressed buf in the same manner as arc_alloc_buf. Don't use this + * for bufs containing metadata. + */ +arc_buf_t * +arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize, + enum zio_compress compression_type) +{ + ASSERT3U(lsize, >, 0); + ASSERT3U(lsize, >=, psize); + ASSERT(compression_type > ZIO_COMPRESS_OFF); + ASSERT(compression_type < ZIO_COMPRESS_FUNCTIONS); + + arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, + compression_type, ARC_BUFC_DATA); + ASSERT(!MUTEX_HELD(HDR_LOCK(hdr))); + + arc_buf_t *buf = NULL; + VERIFY0(arc_buf_alloc_impl(hdr, tag, B_TRUE, B_FALSE, &buf)); + arc_buf_thaw(buf); + ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); + + if (!arc_buf_is_shared(buf)) { + /* + * To ensure that the hdr has the correct data in it if we call + * arc_decompress() on this buf before it's been written to + * disk, it's easiest if we just set up sharing between the + * buf and the hdr. + */ + ASSERT(!abd_is_linear(hdr->b_l1hdr.b_pabd)); + arc_hdr_free_pabd(hdr); + arc_share_buf(hdr, buf); + } + + return (buf); +} + +static void +arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr) +{ + l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; + l2arc_dev_t *dev = l2hdr->b_dev; + uint64_t psize = arc_hdr_size(hdr); + + ASSERT(MUTEX_HELD(&dev->l2ad_mtx)); + ASSERT(HDR_HAS_L2HDR(hdr)); + + list_remove(&dev->l2ad_buflist, hdr); + + ARCSTAT_INCR(arcstat_l2_psize, -psize); + ARCSTAT_INCR(arcstat_l2_lsize, -HDR_GET_LSIZE(hdr)); + + vdev_space_update(dev->l2ad_vdev, -psize, 0, 0); + + (void) refcount_remove_many(&dev->l2ad_alloc, psize, hdr); + arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR); +} + +static void +arc_hdr_destroy(arc_buf_hdr_t *hdr) +{ + if (HDR_HAS_L1HDR(hdr)) { + ASSERT(hdr->b_l1hdr.b_buf == NULL || + hdr->b_l1hdr.b_bufcnt > 0); + ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); + ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); + } + ASSERT(!HDR_IO_IN_PROGRESS(hdr)); + ASSERT(!HDR_IN_HASH_TABLE(hdr)); + + if (!HDR_EMPTY(hdr)) + buf_discard_identity(hdr); + + if (HDR_HAS_L2HDR(hdr)) { + l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; + boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx); + + if (!buflist_held) + mutex_enter(&dev->l2ad_mtx); + + /* + * Even though we checked this conditional above, we + * need to check this again now that we have the + * l2ad_mtx. This is because we could be racing with + * another thread calling l2arc_evict() which might have + * destroyed this header's L2 portion as we were waiting + * to acquire the l2ad_mtx. If that happens, we don't + * want to re-destroy the header's L2 portion. 
+		 */
+		if (HDR_HAS_L2HDR(hdr)) {
+			l2arc_trim(hdr);
+			arc_hdr_l2hdr_destroy(hdr);
+		}
+
+		if (!buflist_held)
+			mutex_exit(&dev->l2ad_mtx);
+	}
+
+	if (HDR_HAS_L1HDR(hdr)) {
+		arc_cksum_free(hdr);
+
+		while (hdr->b_l1hdr.b_buf != NULL)
+			arc_buf_destroy_impl(hdr->b_l1hdr.b_buf);
+
+#ifdef ZFS_DEBUG
+		if (hdr->b_l1hdr.b_thawed != NULL) {
+			kmem_free(hdr->b_l1hdr.b_thawed, 1);
+			hdr->b_l1hdr.b_thawed = NULL;
+		}
+#endif
+
+		if (hdr->b_l1hdr.b_pabd != NULL) {
+			arc_hdr_free_pabd(hdr);
+		}
+	}
+
+	ASSERT3P(hdr->b_hash_next, ==, NULL);
+	if (HDR_HAS_L1HDR(hdr)) {
+		ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
+		ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
+		kmem_cache_free(hdr_full_cache, hdr);
+	} else {
+		kmem_cache_free(hdr_l2only_cache, hdr);
+	}
+}
+
+void
+arc_buf_destroy(arc_buf_t *buf, void* tag)
+{
+	arc_buf_hdr_t *hdr = buf->b_hdr;
+	kmutex_t *hash_lock = HDR_LOCK(hdr);
+
+	if (hdr->b_l1hdr.b_state == arc_anon) {
+		ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);
+		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+		VERIFY0(remove_reference(hdr, NULL, tag));
+		arc_hdr_destroy(hdr);
+		return;
+	}
+
+	mutex_enter(hash_lock);
+	ASSERT3P(hdr, ==, buf->b_hdr);
+	ASSERT(hdr->b_l1hdr.b_bufcnt > 0);
+	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
+	ASSERT3P(hdr->b_l1hdr.b_state, !=, arc_anon);
+	ASSERT3P(buf->b_data, !=, NULL);
+
+	(void) remove_reference(hdr, hash_lock, tag);
+	arc_buf_destroy_impl(buf);
+	mutex_exit(hash_lock);
+}
+
+/*
+ * Evict the arc_buf_hdr that is provided as a parameter. The resultant
+ * state of the header is dependent on its state prior to entering this
+ * function. The following transitions are possible:
+ *
+ *	- arc_mru -> arc_mru_ghost
+ *	- arc_mfu -> arc_mfu_ghost
+ *	- arc_mru_ghost -> arc_l2c_only
+ *	- arc_mru_ghost -> deleted
+ *	- arc_mfu_ghost -> arc_l2c_only
+ *	- arc_mfu_ghost -> deleted
+ */
+static int64_t
+arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
+{
+	arc_state_t *evicted_state, *state;
+	int64_t bytes_evicted = 0;
+	int min_lifetime = HDR_PRESCIENT_PREFETCH(hdr) ?
+	    zfs_arc_min_prescient_prefetch_ms : zfs_arc_min_prefetch_ms;
+
+	ASSERT(MUTEX_HELD(hash_lock));
+	ASSERT(HDR_HAS_L1HDR(hdr));
+
+	state = hdr->b_l1hdr.b_state;
+	if (GHOST_STATE(state)) {
+		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+		ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
+
+		/*
+		 * l2arc_write_buffers() relies on a header's L1 portion
+		 * (i.e. its b_pabd field) during its write phase.
+		 * Thus, we cannot push a header onto the arc_l2c_only
+		 * state (removing its L1 piece) until the header is
+		 * done being written to the l2arc.
+		 */
+		if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) {
+			ARCSTAT_BUMP(arcstat_evict_l2_skip);
+			return (bytes_evicted);
+		}
+
+		ARCSTAT_BUMP(arcstat_deleted);
+		bytes_evicted += HDR_GET_LSIZE(hdr);
+
+		DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr);
+
+		ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
+		if (HDR_HAS_L2HDR(hdr)) {
+			/*
+			 * This buffer is cached on the 2nd Level ARC;
+			 * don't destroy the header.
+			 */
+			arc_change_state(arc_l2c_only, hdr, hash_lock);
+			/*
+			 * dropping from L1+L2 cached to L2-only,
+			 * realloc to remove the L1 header.
+			 */
+			hdr = arc_hdr_realloc(hdr, hdr_full_cache,
+			    hdr_l2only_cache);
+		} else {
+			arc_change_state(arc_anon, hdr, hash_lock);
+			arc_hdr_destroy(hdr);
+		}
+		return (bytes_evicted);
+	}
+
+	ASSERT(state == arc_mru || state == arc_mfu);
+	evicted_state = (state == arc_mru) ?
arc_mru_ghost : arc_mfu_ghost; + + /* prefetch buffers have a minimum lifespan */ + if (HDR_IO_IN_PROGRESS(hdr) || + ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) && + ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < min_lifetime * hz)) { + ARCSTAT_BUMP(arcstat_evict_skip); + return (bytes_evicted); + } + + ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); + while (hdr->b_l1hdr.b_buf) { + arc_buf_t *buf = hdr->b_l1hdr.b_buf; + if (!mutex_tryenter(&buf->b_evict_lock)) { + ARCSTAT_BUMP(arcstat_mutex_miss); + break; + } + if (buf->b_data != NULL) + bytes_evicted += HDR_GET_LSIZE(hdr); + mutex_exit(&buf->b_evict_lock); + arc_buf_destroy_impl(buf); + } + + if (HDR_HAS_L2HDR(hdr)) { + ARCSTAT_INCR(arcstat_evict_l2_cached, HDR_GET_LSIZE(hdr)); + } else { + if (l2arc_write_eligible(hdr->b_spa, hdr)) { + ARCSTAT_INCR(arcstat_evict_l2_eligible, + HDR_GET_LSIZE(hdr)); + } else { + ARCSTAT_INCR(arcstat_evict_l2_ineligible, + HDR_GET_LSIZE(hdr)); + } + } + + if (hdr->b_l1hdr.b_bufcnt == 0) { + arc_cksum_free(hdr); + + bytes_evicted += arc_hdr_size(hdr); + + /* + * If this hdr is being evicted and has a compressed + * buffer then we discard it here before we change states. + * This ensures that the accounting is updated correctly + * in arc_free_data_impl(). + */ + arc_hdr_free_pabd(hdr); + + arc_change_state(evicted_state, hdr, hash_lock); + ASSERT(HDR_IN_HASH_TABLE(hdr)); + arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE); + DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr); + } + + return (bytes_evicted); +} + +static uint64_t +arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, + uint64_t spa, int64_t bytes) +{ + multilist_sublist_t *mls; + uint64_t bytes_evicted = 0; + arc_buf_hdr_t *hdr; + kmutex_t *hash_lock; + int evict_count = 0; + + ASSERT3P(marker, !=, NULL); + IMPLY(bytes < 0, bytes == ARC_EVICT_ALL); + + mls = multilist_sublist_lock(ml, idx); + + for (hdr = multilist_sublist_prev(mls, marker); hdr != NULL; + hdr = multilist_sublist_prev(mls, marker)) { + if ((bytes != ARC_EVICT_ALL && bytes_evicted >= bytes) || + (evict_count >= zfs_arc_evict_batch_limit)) + break; + + /* + * To keep our iteration location, move the marker + * forward. Since we're not holding hdr's hash lock, we + * must be very careful and not remove 'hdr' from the + * sublist. Otherwise, other consumers might mistake the + * 'hdr' as not being on a sublist when they call the + * multilist_link_active() function (they all rely on + * the hash lock protecting concurrent insertions and + * removals). multilist_sublist_move_forward() was + * specifically implemented to ensure this is the case + * (only 'marker' will be removed and re-inserted). + */ + multilist_sublist_move_forward(mls, marker); + + /* + * The only case where the b_spa field should ever be + * zero, is the marker headers inserted by + * arc_evict_state(). It's possible for multiple threads + * to be calling arc_evict_state() concurrently (e.g. + * dsl_pool_close() and zio_inject_fault()), so we must + * skip any markers we see from these other threads. + */ + if (hdr->b_spa == 0) + continue; + + /* we're only interested in evicting buffers of a certain spa */ + if (spa != 0 && hdr->b_spa != spa) { + ARCSTAT_BUMP(arcstat_evict_skip); + continue; + } + + hash_lock = HDR_LOCK(hdr); + + /* + * We aren't calling this function from any code path + * that would already be holding a hash lock, so we're + * asserting on this assumption to be defensive in case + * this ever changes. 
Without this check, it would be + * possible to incorrectly increment arcstat_mutex_miss + * below (e.g. if the code changed such that we called + * this function with a hash lock held). + */ + ASSERT(!MUTEX_HELD(hash_lock)); + + if (mutex_tryenter(hash_lock)) { + uint64_t evicted = arc_evict_hdr(hdr, hash_lock); + mutex_exit(hash_lock); + + bytes_evicted += evicted; + + /* + * If evicted is zero, arc_evict_hdr() must have + * decided to skip this header, don't increment + * evict_count in this case. + */ + if (evicted != 0) + evict_count++; + + /* + * If arc_size isn't overflowing, signal any + * threads that might happen to be waiting. + * + * For each header evicted, we wake up a single + * thread. If we used cv_broadcast, we could + * wake up "too many" threads causing arc_size + * to significantly overflow arc_c; since + * arc_get_data_impl() doesn't check for overflow + * when it's woken up (it doesn't because it's + * possible for the ARC to be overflowing while + * full of un-evictable buffers, and the + * function should proceed in this case). + * + * If threads are left sleeping, due to not + * using cv_broadcast here, they will be woken + * up via cv_broadcast in arc_adjust_cb() just + * before arc_adjust_zthr sleeps. + */ + mutex_enter(&arc_adjust_lock); + if (!arc_is_overflowing()) + cv_signal(&arc_adjust_waiters_cv); + mutex_exit(&arc_adjust_lock); + } else { + ARCSTAT_BUMP(arcstat_mutex_miss); + } + } + + multilist_sublist_unlock(mls); + + return (bytes_evicted); +} + +/* + * Evict buffers from the given arc state, until we've removed the + * specified number of bytes. Move the removed buffers to the + * appropriate evict state. + * + * This function makes a "best effort". It skips over any buffers + * it can't get a hash_lock on, and so, may not catch all candidates. + * It may also return without evicting as much space as requested. + * + * If bytes is specified using the special value ARC_EVICT_ALL, this + * will evict all available (i.e. unlocked and evictable) buffers from + * the given arc state; which is used by arc_flush(). + */ +static uint64_t +arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes, + arc_buf_contents_t type) +{ + uint64_t total_evicted = 0; + multilist_t *ml = state->arcs_list[type]; + int num_sublists; + arc_buf_hdr_t **markers; + + IMPLY(bytes < 0, bytes == ARC_EVICT_ALL); + + num_sublists = multilist_get_num_sublists(ml); + + /* + * If we've tried to evict from each sublist, made some + * progress, but still have not hit the target number of bytes + * to evict, we want to keep trying. The markers allow us to + * pick up where we left off for each individual sublist, rather + * than starting from the tail each time. + */ + markers = kmem_zalloc(sizeof (*markers) * num_sublists, KM_SLEEP); + for (int i = 0; i < num_sublists; i++) { + markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP); + + /* + * A b_spa of 0 is used to indicate that this header is + * a marker. This fact is used in arc_adjust_type() and + * arc_evict_state_impl(). + */ + markers[i]->b_spa = 0; + + multilist_sublist_t *mls = multilist_sublist_lock(ml, i); + multilist_sublist_insert_tail(mls, markers[i]); + multilist_sublist_unlock(mls); + } + + /* + * While we haven't hit our target number of bytes to evict, or + * we're evicting all available buffers. + */ + while (total_evicted < bytes || bytes == ARC_EVICT_ALL) { + int sublist_idx = multilist_get_random_index(ml); + uint64_t scan_evicted = 0; + + /* + * Try to reduce pinned dnodes with a floor of arc_dnode_limit. 
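+		 * The adjustment below converts the excess dnode bytes to an
+		 * approximate dnode count (dividing by sizeof (dnode_t)) and
+		 * scales it down by zfs_arc_dnode_reduce_percent.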
+		 * Request that 10% of the LRUs be scanned by the superblock
+		 * shrinker.
+		 */
+		if (type == ARC_BUFC_DATA && aggsum_compare(&astat_dnode_size,
+		    arc_dnode_limit) > 0) {
+			arc_prune_async((aggsum_upper_bound(&astat_dnode_size) -
+			    arc_dnode_limit) / sizeof (dnode_t) /
+			    zfs_arc_dnode_reduce_percent);
+		}
+
+		/*
+		 * Start eviction using a randomly selected sublist,
+		 * this is to try and evenly balance eviction across all
+		 * sublists. Always starting at the same sublist
+		 * (e.g. index 0) would cause evictions to favor certain
+		 * sublists over others.
+		 */
+		for (int i = 0; i < num_sublists; i++) {
+			uint64_t bytes_remaining;
+			uint64_t bytes_evicted;
+
+			if (bytes == ARC_EVICT_ALL)
+				bytes_remaining = ARC_EVICT_ALL;
+			else if (total_evicted < bytes)
+				bytes_remaining = bytes - total_evicted;
+			else
+				break;
+
+			bytes_evicted = arc_evict_state_impl(ml, sublist_idx,
+			    markers[sublist_idx], spa, bytes_remaining);
+
+			scan_evicted += bytes_evicted;
+			total_evicted += bytes_evicted;
+
+			/* we've reached the end, wrap to the beginning */
+			if (++sublist_idx >= num_sublists)
+				sublist_idx = 0;
+		}
+
+		/*
+		 * If we didn't evict anything during this scan, we have
+		 * no reason to believe we'll evict more during another
+		 * scan, so break the loop.
+		 */
+		if (scan_evicted == 0) {
+			/* This isn't possible, let's make that obvious */
+			ASSERT3S(bytes, !=, 0);
+
+			/*
+			 * When bytes is ARC_EVICT_ALL, the only way to
+			 * break the loop is when scan_evicted is zero.
+			 * In that case, we actually have evicted enough,
+			 * so we don't want to increment the kstat.
+			 */
+			if (bytes != ARC_EVICT_ALL) {
+				ASSERT3S(total_evicted, <, bytes);
+				ARCSTAT_BUMP(arcstat_evict_not_enough);
+			}
+
+			break;
+		}
+	}
+
+	for (int i = 0; i < num_sublists; i++) {
+		multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
+		multilist_sublist_remove(mls, markers[i]);
+		multilist_sublist_unlock(mls);
+
+		kmem_cache_free(hdr_full_cache, markers[i]);
+	}
+	kmem_free(markers, sizeof (*markers) * num_sublists);
+
+	return (total_evicted);
+}
+
+/*
+ * Flush all "evictable" data of the given type from the arc state
+ * specified. This will not evict any "active" buffers (i.e. referenced).
+ *
+ * When 'retry' is set to B_FALSE, the function will make a single pass
+ * over the state and evict any buffers that it can. Since it doesn't
+ * continually retry the eviction, it might end up leaving some buffers
+ * in the ARC due to lock misses.
+ *
+ * When 'retry' is set to B_TRUE, the function will continually retry the
+ * eviction until *all* evictable buffers have been removed from the
+ * state. As a result, if concurrent insertions into the state are
+ * allowed (e.g. if the ARC isn't shutting down), this function might
+ * wind up in an infinite loop, continually trying to evict buffers.
+ */
+static uint64_t
+arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type,
+    boolean_t retry)
+{
+	uint64_t evicted = 0;
+
+	while (refcount_count(&state->arcs_esize[type]) != 0) {
+		evicted += arc_evict_state(state, spa, ARC_EVICT_ALL, type);
+
+		if (!retry)
+			break;
+	}
+
+	return (evicted);
+}
+
+/*
+ * Helper function for arc_prune_async(); it is responsible for safely
+ * handling the execution of a registered arc_prune_func_t.
+ */
+static void
+arc_prune_task(void *ptr)
+{
+	arc_prune_t *ap = (arc_prune_t *)ptr;
+	arc_prune_func_t *func = ap->p_pfunc;
+
+	if (func != NULL)
+		func(ap->p_adjust, ap->p_private);
+
+	refcount_remove(&ap->p_refcnt, func);
+}
+
+/*
+ * Notify registered consumers they must drop holds on a portion of the ARC
+ * buffers they reference. This provides a mechanism to ensure the ARC can
+ * honor the arc_meta_limit and reclaim otherwise pinned ARC buffers. This
+ * is analogous to dnlc_reduce_cache() but more generic.
+ *
+ * This operation is performed asynchronously so it may be safely called
+ * in the context of the arc_reclaim_thread(). A reference is taken here
+ * for each registered arc_prune_t and the arc_prune_task() is responsible
+ * for releasing it once the registered arc_prune_func_t has completed.
+ */
+static void
+arc_prune_async(int64_t adjust)
+{
+	arc_prune_t *ap;
+
+	mutex_enter(&arc_prune_mtx);
+	for (ap = list_head(&arc_prune_list); ap != NULL;
+	    ap = list_next(&arc_prune_list, ap)) {
+
+		if (refcount_count(&ap->p_refcnt) >= 2)
+			continue;
+
+		refcount_add(&ap->p_refcnt, ap->p_pfunc);
+		ap->p_adjust = adjust;
+		if (taskq_dispatch(arc_prune_taskq, arc_prune_task,
+		    ap, TQ_SLEEP) == TASKQID_INVALID) {
+			refcount_remove(&ap->p_refcnt, ap->p_pfunc);
+			continue;
+		}
+		ARCSTAT_BUMP(arcstat_prune);
+	}
+	mutex_exit(&arc_prune_mtx);
+}
+
+/*
+ * Evict the specified number of bytes from the state specified,
+ * restricting eviction to the spa and type given. This function
+ * prevents us from trying to evict more from a state's list than
+ * is "evictable", and to skip evicting altogether when passed a
+ * negative value for "bytes". In contrast, arc_evict_state() will
+ * evict everything it can, when passed a negative value for "bytes".
+ */
+static uint64_t
+arc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes,
+    arc_buf_contents_t type)
+{
+	int64_t delta;
+
+	if (bytes > 0 && refcount_count(&state->arcs_esize[type]) > 0) {
+		delta = MIN(refcount_count(&state->arcs_esize[type]), bytes);
+		return (arc_evict_state(state, spa, delta, type));
+	}
+
+	return (0);
+}
+
+/*
+ * The goal of this function is to evict enough meta data buffers from the
+ * ARC in order to enforce the arc_meta_limit. Achieving this is slightly
+ * more complicated than it appears because it is common for data buffers
+ * to have holds on meta data buffers. In addition, dnode meta data buffers
+ * will be held by the dnodes in the block preventing them from being freed.
+ * This means we can't simply traverse the ARC and expect to always find
+ * enough unheld meta data buffer to release.
+ *
+ * Therefore, this function has been updated to make alternating passes
+ * over the ARC releasing data buffers and then newly unheld meta data
+ * buffers. This ensures forward progress is maintained and meta_used
+ * will decrease. Normally this is sufficient, but if required the ARC
+ * will call the registered prune callbacks causing dentry and inodes to
+ * be dropped from the VFS cache. This will make dnode meta data buffers
+ * available for reclaim.
+ */
+static uint64_t
+arc_adjust_meta_balanced(uint64_t meta_used)
+{
+	int64_t delta, prune = 0, adjustmnt;
+	uint64_t total_evicted = 0;
+	arc_buf_contents_t type = ARC_BUFC_DATA;
+	int restarts = MAX(zfs_arc_meta_adjust_restarts, 0);
+
+restart:
+	/*
+	 * This slightly differs from the way we evict from the mru in
+	 * arc_adjust because we don't have a "target" value (i.e. no
+	 * "meta" arc_p). As a result, I think we can completely
+	 * cannibalize the metadata in the MRU before we evict the
+	 * metadata from the MFU. I think we probably need to implement a
+	 * "metadata arc_p" value to do this properly.
+	 */
+	adjustmnt = meta_used - arc_meta_limit;
+
+	if (adjustmnt > 0 && refcount_count(&arc_mru->arcs_esize[type]) > 0) {
+		delta = MIN(refcount_count(&arc_mru->arcs_esize[type]),
+		    adjustmnt);
+		total_evicted += arc_adjust_impl(arc_mru, 0, delta, type);
+		adjustmnt -= delta;
+	}
+
+	/*
+	 * We can't afford to recalculate adjustmnt here. If we do,
+	 * new metadata buffers can sneak into the MRU or ANON lists,
+	 * thus penalizing the MFU metadata. Although the fudge factor is
+	 * small, it has been empirically shown to be significant for
+	 * certain workloads (e.g. creating many empty directories). As
+	 * such, we use the original calculation for adjustmnt, and
+	 * simply decrement the amount of data evicted from the MRU.
+	 */
+
+	if (adjustmnt > 0 && refcount_count(&arc_mfu->arcs_esize[type]) > 0) {
+		delta = MIN(refcount_count(&arc_mfu->arcs_esize[type]),
+		    adjustmnt);
+		total_evicted += arc_adjust_impl(arc_mfu, 0, delta, type);
+	}
+
+	adjustmnt = meta_used - arc_meta_limit;
+
+	if (adjustmnt > 0 &&
+	    refcount_count(&arc_mru_ghost->arcs_esize[type]) > 0) {
+		delta = MIN(adjustmnt,
+		    refcount_count(&arc_mru_ghost->arcs_esize[type]));
+		total_evicted += arc_adjust_impl(arc_mru_ghost, 0, delta, type);
+		adjustmnt -= delta;
+	}
+
+	if (adjustmnt > 0 &&
+	    refcount_count(&arc_mfu_ghost->arcs_esize[type]) > 0) {
+		delta = MIN(adjustmnt,
+		    refcount_count(&arc_mfu_ghost->arcs_esize[type]));
+		total_evicted += arc_adjust_impl(arc_mfu_ghost, 0, delta, type);
+	}
+
+	/*
+	 * If after attempting to make the requested adjustment to the ARC
+	 * the meta limit is still being exceeded then request that the
+	 * higher layers drop some cached objects which have holds on ARC
+	 * meta buffers. Requests to the upper layers will be made with
+	 * increasingly large scan sizes until the ARC is below the limit.
+	 */
+	if (meta_used > arc_meta_limit) {
+		if (type == ARC_BUFC_DATA) {
+			type = ARC_BUFC_METADATA;
+		} else {
+			type = ARC_BUFC_DATA;
+
+			if (zfs_arc_meta_prune) {
+				prune += zfs_arc_meta_prune;
+				arc_prune_async(prune);
+			}
+		}
+
+		if (restarts > 0) {
+			restarts--;
+			goto restart;
+		}
+	}
+	return (total_evicted);
+}
+
+/*
+ * Evict metadata buffers from the cache, such that arc_meta_used is
+ * capped by the arc_meta_limit tunable.
+ */
+static uint64_t
+arc_adjust_meta_only(uint64_t meta_used)
+{
+	uint64_t total_evicted = 0;
+	int64_t target;
+
+	/*
+	 * If we're over the meta limit, we want to evict enough
+	 * metadata to get back under the meta limit. We don't want to
+	 * evict so much that we drop the MRU below arc_p, though. If
+	 * we're over the meta limit more than we're over arc_p, we
+	 * evict some from the MRU here, and some from the MFU below.
+	 */
+	target = MIN((int64_t)(meta_used - arc_meta_limit),
+	    (int64_t)(refcount_count(&arc_anon->arcs_size) +
+	    refcount_count(&arc_mru->arcs_size) - arc_p));
+
+	total_evicted += arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
+
+	/*
+	 * Similar to the above, we want to evict enough bytes to get us
+	 * below the meta limit, but not so much as to drop us below the
+	 * space allotted to the MFU (which is defined as arc_c - arc_p).
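+	 *
+	 * For example (illustrative numbers): with arc_c = 8G, arc_p = 3G,
+	 * meta_used 1G over the limit, and 5.5G in the MFU, the target
+	 * below is MIN(1G, 5.5G - (8G - 3G)) = 512M; the remainder was
+	 * already requested from the MRU above.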
+ */ + target = MIN((int64_t)(meta_used - arc_meta_limit), + (int64_t)(refcount_count(&arc_mfu->arcs_size) - + (arc_c - arc_p))); + + total_evicted += arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); + + return (total_evicted); +} + +static uint64_t +arc_adjust_meta(uint64_t meta_used) +{ + if (zfs_arc_meta_strategy == ARC_STRATEGY_META_ONLY) + return (arc_adjust_meta_only(meta_used)); + else + return (arc_adjust_meta_balanced(meta_used)); +} + +/* + * Return the type of the oldest buffer in the given arc state + * + * This function will select a random sublist of type ARC_BUFC_DATA and + * a random sublist of type ARC_BUFC_METADATA. The tail of each sublist + * is compared, and the type which contains the "older" buffer will be + * returned. + */ +static arc_buf_contents_t +arc_adjust_type(arc_state_t *state) +{ + multilist_t *data_ml = state->arcs_list[ARC_BUFC_DATA]; + multilist_t *meta_ml = state->arcs_list[ARC_BUFC_METADATA]; + int data_idx = multilist_get_random_index(data_ml); + int meta_idx = multilist_get_random_index(meta_ml); + multilist_sublist_t *data_mls; + multilist_sublist_t *meta_mls; + arc_buf_contents_t type; + arc_buf_hdr_t *data_hdr; + arc_buf_hdr_t *meta_hdr; + + /* + * We keep the sublist lock until we're finished, to prevent + * the headers from being destroyed via arc_evict_state(). + */ + data_mls = multilist_sublist_lock(data_ml, data_idx); + meta_mls = multilist_sublist_lock(meta_ml, meta_idx); + + /* + * These two loops are to ensure we skip any markers that + * might be at the tail of the lists due to arc_evict_state(). + */ + + for (data_hdr = multilist_sublist_tail(data_mls); data_hdr != NULL; + data_hdr = multilist_sublist_prev(data_mls, data_hdr)) { + if (data_hdr->b_spa != 0) + break; + } + + for (meta_hdr = multilist_sublist_tail(meta_mls); meta_hdr != NULL; + meta_hdr = multilist_sublist_prev(meta_mls, meta_hdr)) { + if (meta_hdr->b_spa != 0) + break; + } + + if (data_hdr == NULL && meta_hdr == NULL) { + type = ARC_BUFC_DATA; + } else if (data_hdr == NULL) { + ASSERT3P(meta_hdr, !=, NULL); + type = ARC_BUFC_METADATA; + } else if (meta_hdr == NULL) { + ASSERT3P(data_hdr, !=, NULL); + type = ARC_BUFC_DATA; + } else { + ASSERT3P(data_hdr, !=, NULL); + ASSERT3P(meta_hdr, !=, NULL); + + /* The headers can't be on the sublist without an L1 header */ + ASSERT(HDR_HAS_L1HDR(data_hdr)); + ASSERT(HDR_HAS_L1HDR(meta_hdr)); + + if (data_hdr->b_l1hdr.b_arc_access < + meta_hdr->b_l1hdr.b_arc_access) { + type = ARC_BUFC_DATA; + } else { + type = ARC_BUFC_METADATA; + } + } + + multilist_sublist_unlock(meta_mls); + multilist_sublist_unlock(data_mls); + + return (type); +} + +/* + * Evict buffers from the cache, such that arc_size is capped by arc_c. + */ +static uint64_t +arc_adjust(void) +{ + uint64_t total_evicted = 0; + uint64_t bytes; + int64_t target; + uint64_t asize = aggsum_value(&arc_size); + uint64_t ameta = aggsum_value(&arc_meta_used); + + /* + * If we're over arc_meta_limit, we want to correct that before + * potentially evicting data buffers below. + */ + total_evicted += arc_adjust_meta(ameta); + + /* + * Adjust MRU size + * + * If we're over the target cache size, we want to evict enough + * from the list to get back to our target size. We don't want + * to evict too much from the MRU, such that it drops below + * arc_p. So, if we're over our target cache size more than + * the MRU is over arc_p, we'll evict enough to get back to + * arc_p here, and then evict more from the MFU below. 
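+	 *
+	 * For example (illustrative numbers): with asize = 10G, arc_c = 8G,
+	 * anon + mru + ameta = 5G and arc_p = 4G, the target below is
+	 * MIN(10G - 8G, 5G - 4G) = 1G from the MRU; the remaining 1G is
+	 * left for the MFU pass below.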
+	 */
+	target = MIN((int64_t)(asize - arc_c),
+	    (int64_t)(refcount_count(&arc_anon->arcs_size) +
+	    refcount_count(&arc_mru->arcs_size) + ameta - arc_p));
+
+	/*
+	 * If we're below arc_meta_min, always prefer to evict data.
+	 * Otherwise, try to satisfy the requested number of bytes to
+	 * evict from the type which contains older buffers; in an
+	 * effort to keep newer buffers in the cache regardless of their
+	 * type. If we cannot satisfy the number of bytes from this
+	 * type, spill over into the next type.
+	 */
+	if (arc_adjust_type(arc_mru) == ARC_BUFC_METADATA &&
+	    ameta > arc_meta_min) {
+		bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
+		total_evicted += bytes;
+
+		/*
+		 * If we couldn't evict our target number of bytes from
+		 * metadata, we try to get the rest from data.
+		 */
+		target -= bytes;
+
+		total_evicted +=
+		    arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA);
+	} else {
+		bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA);
+		total_evicted += bytes;
+
+		/*
+		 * If we couldn't evict our target number of bytes from
+		 * data, we try to get the rest from metadata.
+		 */
+		target -= bytes;
+
+		total_evicted +=
+		    arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
+	}
+
+	/*
+	 * Re-sum ARC stats after the first round of evictions.
+	 */
+	asize = aggsum_value(&arc_size);
+	ameta = aggsum_value(&arc_meta_used);
+
+	/*
+	 * Adjust MFU size
+	 *
+	 * Now that we've tried to evict enough from the MRU to get its
+	 * size back to arc_p, if we're still above the target cache
+	 * size, we evict the rest from the MFU.
+	 */
+	target = asize - arc_c;
+
+	if (arc_adjust_type(arc_mfu) == ARC_BUFC_METADATA &&
+	    ameta > arc_meta_min) {
+		bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
+		total_evicted += bytes;
+
+		/*
+		 * If we couldn't evict our target number of bytes from
+		 * metadata, we try to get the rest from data.
+		 */
+		target -= bytes;
+
+		total_evicted +=
+		    arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
+	} else {
+		bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
+		total_evicted += bytes;
+
+		/*
+		 * If we couldn't evict our target number of bytes from
+		 * data, we try to get the rest from metadata.
+		 */
+		target -= bytes;
+
+		total_evicted +=
+		    arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
+	}
+
+	/*
+	 * Adjust ghost lists
+	 *
+	 * In addition to the above, the ARC also defines target values
+	 * for the ghost lists. The sum of the mru list and mru ghost
+	 * list should never exceed the target size of the cache, and
+	 * the sum of the mru list, mfu list, mru ghost list, and mfu
+	 * ghost list should never exceed twice the target size of the
+	 * cache. The following logic enforces these limits on the ghost
+	 * caches, and evicts from them as needed.
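+	 *
+	 * For example (illustrative numbers): with arc_c = 8G, 5G on the
+	 * mru list and 4G on the mru ghost list, the first target below is
+	 * 5G + 4G - 8G = 1G, so 1G is evicted from the mru ghost list.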
+ */ + target = refcount_count(&arc_mru->arcs_size) + + refcount_count(&arc_mru_ghost->arcs_size) - arc_c; + + bytes = arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_DATA); + total_evicted += bytes; + + target -= bytes; + + total_evicted += + arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_METADATA); + + /* + * We assume the sum of the mru list and mfu list is less than + * or equal to arc_c (we enforced this above), which means we + * can use the simpler of the two equations below: + * + * mru + mfu + mru ghost + mfu ghost <= 2 * arc_c + * mru ghost + mfu ghost <= arc_c + */ + target = refcount_count(&arc_mru_ghost->arcs_size) + + refcount_count(&arc_mfu_ghost->arcs_size) - arc_c; + + bytes = arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_DATA); + total_evicted += bytes; + + target -= bytes; + + total_evicted += + arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_METADATA); + + return (total_evicted); +} + +void +arc_flush(spa_t *spa, boolean_t retry) +{ + uint64_t guid = 0; + + /* + * If retry is B_TRUE, a spa must not be specified since we have + * no good way to determine if all of a spa's buffers have been + * evicted from an arc state. + */ + ASSERT(!retry || spa == 0); + + if (spa != NULL) + guid = spa_load_guid(spa); + + (void) arc_flush_state(arc_mru, guid, ARC_BUFC_DATA, retry); + (void) arc_flush_state(arc_mru, guid, ARC_BUFC_METADATA, retry); + + (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_DATA, retry); + (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_METADATA, retry); + + (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_DATA, retry); + (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_METADATA, retry); + + (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry); + (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry); +} + +static void +arc_reduce_target_size(int64_t to_free) +{ + uint64_t asize = aggsum_value(&arc_size); + if (arc_c > arc_c_min) { + DTRACE_PROBE4(arc__shrink, uint64_t, arc_c, uint64_t, + arc_c_min, uint64_t, arc_p, uint64_t, to_free); + if (arc_c > arc_c_min + to_free) + atomic_add_64(&arc_c, -to_free); + else + arc_c = arc_c_min; + + atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift)); + if (asize < arc_c) + arc_c = MAX(asize, arc_c_min); + if (arc_p > arc_c) + arc_p = (arc_c >> 1); + + DTRACE_PROBE2(arc__shrunk, uint64_t, arc_c, uint64_t, + arc_p); + + ASSERT(arc_c >= arc_c_min); + ASSERT((int64_t)arc_p >= 0); + } + + if (asize > arc_c) { + DTRACE_PROBE2(arc__shrink_adjust, uint64_t, asize, + uint64_t, arc_c); + /* See comment in arc_adjust_cb_check() on why lock+flag */ + mutex_enter(&arc_adjust_lock); + arc_adjust_needed = B_TRUE; + mutex_exit(&arc_adjust_lock); + zthr_wakeup(arc_adjust_zthr); + } +} + +typedef enum free_memory_reason_t { + FMR_UNKNOWN, + FMR_NEEDFREE, + FMR_LOTSFREE, + FMR_SWAPFS_MINFREE, + FMR_PAGES_PP_MAXIMUM, + FMR_HEAP_ARENA, + FMR_ZIO_ARENA, +} free_memory_reason_t; + +int64_t last_free_memory; +free_memory_reason_t last_free_reason; + +/* + * Additional reserve of pages for pp_reserve. + */ +int64_t arc_pages_pp_reserve = 64; + +/* + * Additional reserve of pages for swapfs. + */ +int64_t arc_swapfs_reserve = 64; + +/* + * Return the amount of memory that can be consumed before reclaim will be + * needed. Positive if there is sufficient free memory, negative indicates + * the amount of memory that needs to be freed up. 
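+ *
+ * For example (illustrative numbers): with 4 KB pages, freemem at 10000
+ * pages and zfs_arc_free_target at 12000 pages, the FreeBSD check below
+ * yields 4096 * (10000 - 12000), i.e. roughly 8 MB should be freed.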
+ */ +static int64_t +arc_available_memory(void) +{ + int64_t lowest = INT64_MAX; + int64_t n; + free_memory_reason_t r = FMR_UNKNOWN; + +#ifdef _KERNEL +#ifdef __FreeBSD__ + /* + * Cooperate with pagedaemon when it's time for it to scan + * and reclaim some pages. + */ + n = PAGESIZE * ((int64_t)freemem - zfs_arc_free_target); + if (n < lowest) { + lowest = n; + r = FMR_LOTSFREE; + } + +#else + if (needfree > 0) { + n = PAGESIZE * (-needfree); + if (n < lowest) { + lowest = n; + r = FMR_NEEDFREE; + } + } + + /* + * check that we're out of range of the pageout scanner. It starts to + * schedule paging if freemem is less than lotsfree and needfree. + * lotsfree is the high-water mark for pageout, and needfree is the + * number of needed free pages. We add extra pages here to make sure + * the scanner doesn't start up while we're freeing memory. + */ + n = PAGESIZE * (freemem - lotsfree - needfree - desfree); + if (n < lowest) { + lowest = n; + r = FMR_LOTSFREE; + } + + /* + * check to make sure that swapfs has enough space so that anon + * reservations can still succeed. anon_resvmem() checks that the + * availrmem is greater than swapfs_minfree, and the number of reserved + * swap pages. We also add a bit of extra here just to prevent + * circumstances from getting really dire. + */ + n = PAGESIZE * (availrmem - swapfs_minfree - swapfs_reserve - + desfree - arc_swapfs_reserve); + if (n < lowest) { + lowest = n; + r = FMR_SWAPFS_MINFREE; + } + + + /* + * Check that we have enough availrmem that memory locking (e.g., via + * mlock(3C) or memcntl(2)) can still succeed. (pages_pp_maximum + * stores the number of pages that cannot be locked; when availrmem + * drops below pages_pp_maximum, page locking mechanisms such as + * page_pp_lock() will fail.) + */ + n = PAGESIZE * (availrmem - pages_pp_maximum - + arc_pages_pp_reserve); + if (n < lowest) { + lowest = n; + r = FMR_PAGES_PP_MAXIMUM; + } + +#endif /* __FreeBSD__ */ +#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC) + /* + * If we're on an i386 platform, it's possible that we'll exhaust the + * kernel heap space before we ever run out of available physical + * memory. Most checks of the size of the heap_area compare against + * tune.t_minarmem, which is the minimum available real memory that we + * can have in the system. However, this is generally fixed at 25 pages + * which is so low that it's useless. In this comparison, we seek to + * calculate the total heap-size, and reclaim if more than 3/4ths of the + * heap is allocated. (Or, in the calculation, if less than 1/4th is + * free) + */ + n = uma_avail() - (long)(uma_limit() / 4); + if (n < lowest) { + lowest = n; + r = FMR_HEAP_ARENA; + } +#endif + + /* + * If zio data pages are being allocated out of a separate heap segment, + * then enforce that the size of available vmem for this arena remains + * above about 1/4th (1/(2^arc_zio_arena_free_shift)) free. + * + * Note that reducing the arc_zio_arena_free_shift keeps more virtual + * memory (in the zio_arena) free, which can avoid memory + * fragmentation issues. 
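+ *
+ * A hypothetical example (sizes assumed for illustration): with
+ * arc_zio_arena_free_shift = 2, as the 1/4th figure above
+ * implies, and 1 GB allocated from the zio_arena, the check
+ * below starts reporting pressure once less than 256 MB
+ * (1 GB >> 2) of the arena remains free.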
+ */
+ if (zio_arena != NULL) {
+ n = (int64_t)vmem_size(zio_arena, VMEM_FREE) -
+ (vmem_size(zio_arena, VMEM_ALLOC) >>
+ arc_zio_arena_free_shift);
+ if (n < lowest) {
+ lowest = n;
+ r = FMR_ZIO_ARENA;
+ }
+ }
+
+#else /* _KERNEL */
+ /* Every 100 calls, free a small amount */
+ if (spa_get_random(100) == 0)
+ lowest = -1024;
+#endif /* _KERNEL */
+
+ last_free_memory = lowest;
+ last_free_reason = r;
+ DTRACE_PROBE2(arc__available_memory, int64_t, lowest, int, r);
+ return (lowest);
+}
+
+
+/*
+ * Determine if the system is under memory pressure and is asking
+ * to reclaim memory. A return value of B_TRUE indicates that the system
+ * is under memory pressure and that the arc should adjust accordingly.
+ */
+static boolean_t
+arc_reclaim_needed(void)
+{
+ return (arc_available_memory() < 0);
+}
+
+extern kmem_cache_t *zio_buf_cache[];
+extern kmem_cache_t *zio_data_buf_cache[];
+extern kmem_cache_t *range_seg_cache;
+extern kmem_cache_t *abd_chunk_cache;
+
+static __noinline void
+arc_kmem_reap_soon(void)
+{
+ size_t i;
+ kmem_cache_t *prev_cache = NULL;
+ kmem_cache_t *prev_data_cache = NULL;
+
+ DTRACE_PROBE(arc__kmem_reap_start);
+#ifdef _KERNEL
+ if (aggsum_compare(&arc_meta_used, arc_meta_limit) >= 0) {
+ /*
+ * We are exceeding our meta-data cache limit.
+ * Purge some DNLC entries to release holds on meta-data.
+ */
+ dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
+ }
+#if defined(__i386)
+ /*
+ * Reclaim unused memory from all kmem caches.
+ */
+ kmem_reap();
+#endif
+#endif
+
+ for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
+ if (zio_buf_cache[i] != prev_cache) {
+ prev_cache = zio_buf_cache[i];
+ kmem_cache_reap_soon(zio_buf_cache[i]);
+ }
+ if (zio_data_buf_cache[i] != prev_data_cache) {
+ prev_data_cache = zio_data_buf_cache[i];
+ kmem_cache_reap_soon(zio_data_buf_cache[i]);
+ }
+ }
+ kmem_cache_reap_soon(abd_chunk_cache);
+ kmem_cache_reap_soon(buf_cache);
+ kmem_cache_reap_soon(hdr_full_cache);
+ kmem_cache_reap_soon(hdr_l2only_cache);
+ kmem_cache_reap_soon(range_seg_cache);
+
+#ifdef illumos
+ if (zio_arena != NULL) {
+ /*
+ * Ask the vmem arena to reclaim unused memory from its
+ * quantum caches.
+ */
+ vmem_qcache_reap(zio_arena);
+ }
+#endif
+ DTRACE_PROBE(arc__kmem_reap_end);
+}
+
+/* ARGSUSED */
+static boolean_t
+arc_adjust_cb_check(void *arg, zthr_t *zthr)
+{
+ /*
+ * This is necessary in order for the mdb ::arc dcmd to
+ * show up-to-date information. Since the ::arc command
+ * does not call the kstat's update function, without
+ * this call, the command may show stale stats for the
+ * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even
+ * with this change, the data might be up to 1 second
+ * out of date (the arc_adjust_zthr has a maximum sleep
+ * time of 1 second); but that should suffice. The
+ * arc_state_t structures can be queried directly if more
+ * accurate information is needed.
+ */
+ if (arc_ksp != NULL)
+ arc_ksp->ks_update(arc_ksp, KSTAT_READ);
+
+ /*
+ * We have to rely on arc_get_data_impl() to tell us when to adjust,
+ * rather than checking if we are overflowing here, so that we are
+ * sure to not leave arc_get_data_impl() waiting on
+ * arc_adjust_waiters_cv. If we have become "not overflowing" since
+ * arc_get_data_impl() checked, we need to wake it up. We could
+ * broadcast the CV here, but arc_get_data_impl() may have not yet
+ * gone to sleep. We would need to use a mutex to ensure that this
+ * function doesn't broadcast until arc_get_data_impl() has gone to
+ * sleep (e.g. the arc_adjust_lock).
However, the lock ordering of
+ * such a lock would necessarily be incorrect with respect to the
+ * zthr_lock, which is held before this function is called, and is
+ * held by arc_get_data_impl() when it calls zthr_wakeup().
+ */
+ return (arc_adjust_needed);
+}
+
+/*
+ * Keep arc_size under arc_c by running arc_adjust which evicts data
+ * from the ARC.
+ */
+/* ARGSUSED */
+static int
+arc_adjust_cb(void *arg, zthr_t *zthr)
+{
+ uint64_t evicted = 0;
+
+ /* Evict from cache */
+ evicted = arc_adjust();
+
+ /*
+ * If evicted is zero, we couldn't evict anything
+ * via arc_adjust(). This could be due to hash lock
+ * collisions, but more likely due to the majority of
+ * arc buffers being unevictable. Therefore, even if
+ * arc_size is above arc_c, another pass is unlikely to
+ * be helpful and could potentially cause us to enter an
+ * infinite loop. Additionally, zthr_iscancelled() is
+ * checked here so that if the arc is shutting down, the
+ * broadcast will wake any remaining arc adjust waiters.
+ */
+ mutex_enter(&arc_adjust_lock);
+ arc_adjust_needed = !zthr_iscancelled(arc_adjust_zthr) &&
+ evicted > 0 && aggsum_compare(&arc_size, arc_c) > 0;
+ if (!arc_adjust_needed) {
+ /*
+ * We're either no longer overflowing, or we
+ * can't evict anything more, so we should wake
+ * up any waiters.
+ */
+ cv_broadcast(&arc_adjust_waiters_cv);
+ }
+ mutex_exit(&arc_adjust_lock);
+
+ return (0);
+}
+
+/* ARGSUSED */
+static boolean_t
+arc_reap_cb_check(void *arg, zthr_t *zthr)
+{
+ int64_t free_memory = arc_available_memory();
+
+ /*
+ * If a kmem reap is already active, don't schedule more. We must
+ * check for this because kmem_cache_reap_soon() won't actually
+ * block on the cache being reaped (this is to prevent callers from
+ * becoming implicitly blocked by a system-wide kmem reap -- which,
+ * on a system with many, many full magazines, can take minutes).
+ */
+ if (!kmem_cache_reap_active() &&
+ free_memory < 0) {
+ arc_no_grow = B_TRUE;
+ arc_warm = B_TRUE;
+ /*
+ * Wait at least zfs_grow_retry (default 60) seconds
+ * before considering growing.
+ */
+ arc_growtime = gethrtime() + SEC2NSEC(arc_grow_retry);
+ return (B_TRUE);
+ } else if (free_memory < arc_c >> arc_no_grow_shift) {
+ arc_no_grow = B_TRUE;
+ } else if (gethrtime() >= arc_growtime) {
+ arc_no_grow = B_FALSE;
+ }
+
+ return (B_FALSE);
+}
+
+/*
+ * Keep enough free memory in the system by reaping the ARC's kmem
+ * caches. To cause more slabs to be reapable, we may reduce the
+ * target size of the cache (arc_c), causing the arc_adjust_cb()
+ * to free more buffers.
+ */
+/* ARGSUSED */
+static int
+arc_reap_cb(void *arg, zthr_t *zthr)
+{
+ int64_t free_memory;
+
+ /*
+ * Kick off asynchronous kmem_reap()'s of all our caches.
+ */
+ arc_kmem_reap_soon();
+
+ /*
+ * Wait at least arc_kmem_cache_reap_retry_ms between
+ * arc_kmem_reap_soon() calls. Without this check it is possible to
+ * end up in a situation where we spend lots of time reaping
+ * caches, while we're near arc_c_min. Waiting here also gives the
+ * subsequent free memory check a chance of finding that the
+ * asynchronous reap has already freed enough memory, and we don't
+ * need to call arc_reduce_target_size().
+ */
+ delay((hz * arc_kmem_cache_reap_retry_ms + 999) / 1000);
+
+ /*
+ * Reduce the target size as needed to maintain the amount of free
+ * memory in the system at a fraction of the arc_size (1/128th by
+ * default). If oversubscribed (free_memory < 0) then reduce the
+ * target arc_size by the deficit amount plus the fractional
+ * amount.
If free memory is positive but less than the fractional
+ * amount, reduce by what is needed to hit the fractional amount.
+ */
+ free_memory = arc_available_memory();
+
+ int64_t to_free =
+ (arc_c >> arc_shrink_shift) - free_memory;
+ if (to_free > 0) {
+#ifdef _KERNEL
+#ifdef illumos
+ to_free = MAX(to_free, ptob(needfree));
+#endif
+#endif
+ arc_reduce_target_size(to_free);
+ }
+
+ return (0);
+}
+
+static u_int arc_dnlc_evicts_arg;
+extern struct vfsops zfs_vfsops;
+
+static void
+arc_dnlc_evicts_thread(void *dummy __unused)
+{
+ callb_cpr_t cpr;
+ u_int percent;
+
+ CALLB_CPR_INIT(&cpr, &arc_dnlc_evicts_lock, callb_generic_cpr, FTAG);
+
+ mutex_enter(&arc_dnlc_evicts_lock);
+ while (!arc_dnlc_evicts_thread_exit) {
+ CALLB_CPR_SAFE_BEGIN(&cpr);
+ (void) cv_wait(&arc_dnlc_evicts_cv, &arc_dnlc_evicts_lock);
+ CALLB_CPR_SAFE_END(&cpr, &arc_dnlc_evicts_lock);
+ if (arc_dnlc_evicts_arg != 0) {
+ percent = arc_dnlc_evicts_arg;
+ mutex_exit(&arc_dnlc_evicts_lock);
+#ifdef _KERNEL
+ vnlru_free(desiredvnodes * percent / 100, &zfs_vfsops);
+#endif
+ mutex_enter(&arc_dnlc_evicts_lock);
+ /*
+ * Clear our token only after vnlru_free()
+ * pass is done, to avoid false queueing of
+ * the requests.
+ */
+ arc_dnlc_evicts_arg = 0;
+ }
+ }
+ arc_dnlc_evicts_thread_exit = FALSE;
+ cv_broadcast(&arc_dnlc_evicts_cv);
+ CALLB_CPR_EXIT(&cpr);
+ thread_exit();
+}
+
+void
+dnlc_reduce_cache(void *arg)
+{
+ u_int percent;
+
+ percent = (u_int)(uintptr_t)arg;
+ mutex_enter(&arc_dnlc_evicts_lock);
+ if (arc_dnlc_evicts_arg == 0) {
+ arc_dnlc_evicts_arg = percent;
+ cv_broadcast(&arc_dnlc_evicts_cv);
+ }
+ mutex_exit(&arc_dnlc_evicts_lock);
+}
+
+/*
+ * Adapt arc info given the number of bytes we are trying to add and
+ * the state that we are coming from. This function is only called
+ * when we are adding new content to the cache.
+ */
+static void
+arc_adapt(int bytes, arc_state_t *state)
+{
+ int mult;
+ uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
+ int64_t mrug_size = refcount_count(&arc_mru_ghost->arcs_size);
+ int64_t mfug_size = refcount_count(&arc_mfu_ghost->arcs_size);
+
+ if (state == arc_l2c_only)
+ return;
+
+ ASSERT(bytes > 0);
+ /*
+ * Adapt the target size of the MRU list:
+ * - if we just hit in the MRU ghost list, then increase
+ * the target size of the MRU list.
+ * - if we just hit in the MFU ghost list, then increase
+ * the target size of the MFU list by decreasing the
+ * target size of the MRU list.
+ */
+ if (state == arc_mru_ghost) {
+ mult = (mrug_size >= mfug_size) ? 1 : (mfug_size / mrug_size);
+ mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
+
+ arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
+ } else if (state == arc_mfu_ghost) {
+ uint64_t delta;
+
+ mult = (mfug_size >= mrug_size) ?
1 : (mrug_size / mfug_size);
+ mult = MIN(mult, 10);
+
+ delta = MIN(bytes * mult, arc_p);
+ arc_p = MAX(arc_p_min, arc_p - delta);
+ }
+ ASSERT((int64_t)arc_p >= 0);
+
+ /*
+ * Wake reap thread if we do not have any available memory
+ */
+ if (arc_reclaim_needed()) {
+ zthr_wakeup(arc_reap_zthr);
+ return;
+ }
+
+ if (arc_no_grow)
+ return;
+
+ if (arc_c >= arc_c_max)
+ return;
+
+ /*
+ * If we're within (2 * maxblocksize) bytes of the target
+ * cache size, increment the target cache size
+ */
+ if (aggsum_compare(&arc_size, arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) >
+ 0) {
+ DTRACE_PROBE1(arc__inc_adapt, int, bytes);
+ atomic_add_64(&arc_c, (int64_t)bytes);
+ if (arc_c > arc_c_max)
+ arc_c = arc_c_max;
+ else if (state == arc_anon)
+ atomic_add_64(&arc_p, (int64_t)bytes);
+ if (arc_p > arc_c)
+ arc_p = arc_c;
+ }
+ ASSERT((int64_t)arc_p >= 0);
+}
+
+/*
+ * Check if arc_size has grown past our upper threshold, determined by
+ * zfs_arc_overflow_shift.
+ */
+static boolean_t
+arc_is_overflowing(void)
+{
+ /* Always allow at least one block of overflow */
+ uint64_t overflow = MAX(SPA_MAXBLOCKSIZE,
+ arc_c >> zfs_arc_overflow_shift);
+
+ /*
+ * We just compare the lower bound here for performance reasons. Our
+ * primary goals are to make sure that the arc never grows without
+ * bound, and that it can reach its maximum size. This check
+ * accomplishes both goals. The maximum amount we could run over by is
+ * 2 * aggsum_borrow_multiplier * NUM_CPUS * the average size of a block
+ * in the ARC. In practice, that's in the tens of MB, which is low
+ * enough to be safe.
+ */
+ return (aggsum_lower_bound(&arc_size) >= arc_c + overflow);
+}
+
+static abd_t *
+arc_get_data_abd(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
+{
+ arc_buf_contents_t type = arc_buf_type(hdr);
+
+ arc_get_data_impl(hdr, size, tag);
+ if (type == ARC_BUFC_METADATA) {
+ return (abd_alloc(size, B_TRUE));
+ } else {
+ ASSERT(type == ARC_BUFC_DATA);
+ return (abd_alloc(size, B_FALSE));
+ }
+}
+
+static void *
+arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
+{
+ arc_buf_contents_t type = arc_buf_type(hdr);
+
+ arc_get_data_impl(hdr, size, tag);
+ if (type == ARC_BUFC_METADATA) {
+ return (zio_buf_alloc(size));
+ } else {
+ ASSERT(type == ARC_BUFC_DATA);
+ return (zio_data_buf_alloc(size));
+ }
+}
+
+/*
+ * Allocate a block and return it to the caller. If we are hitting the
+ * hard limit for the cache size, we must sleep, waiting for the eviction
+ * thread to catch up. If we're past the target size but below the hard
+ * limit, we'll only signal the reclaim thread and continue on.
+ */
+static void
+arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
+{
+ arc_state_t *state = hdr->b_l1hdr.b_state;
+ arc_buf_contents_t type = arc_buf_type(hdr);
+
+ arc_adapt(size, state);
+
+ /*
+ * If arc_size is currently overflowing, and has grown past our
+ * upper limit, we must be adding data faster than the evict
+ * thread can evict. Thus, to ensure we don't compound the
+ * problem by adding more data and forcing arc_size to grow even
+ * further past its target size, we halt and wait for the
+ * eviction thread to catch up.
+ *
+ * It's also possible that the reclaim thread is unable to evict
+ * enough buffers to get arc_size below the overflow limit (e.g.
+ * due to buffers being un-evictable, or hash lock collisions).
+ * In this case, we want to proceed regardless if we're
+ * overflowing; thus we don't use a while loop here.
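+ *
+ * In outline, the flow below (a restatement, not additional
+ * behavior): check for overflow without the lock, take
+ * arc_adjust_lock, re-check, and only if still overflowing wake
+ * arc_adjust_zthr and block once on arc_adjust_waiters_cv.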
+ */
+ if (arc_is_overflowing()) {
+ mutex_enter(&arc_adjust_lock);
+
+ /*
+ * Now that we've acquired the lock, we may no longer be
+ * over the overflow limit; let's check.
+ *
+ * We're ignoring the case of spurious wake ups. If that
+ * were to happen, it'd let this thread consume an ARC
+ * buffer before it should have (i.e. before we're under
+ * the overflow limit and were signalled by the reclaim
+ * thread). As long as that is a rare occurrence, it
+ * shouldn't cause any harm.
+ */
+ if (arc_is_overflowing()) {
+ arc_adjust_needed = B_TRUE;
+ zthr_wakeup(arc_adjust_zthr);
+ (void) cv_wait(&arc_adjust_waiters_cv,
+ &arc_adjust_lock);
+ }
+ mutex_exit(&arc_adjust_lock);
+ }
+
+ VERIFY3U(hdr->b_type, ==, type);
+ if (type == ARC_BUFC_METADATA) {
+ arc_space_consume(size, ARC_SPACE_META);
+ } else {
+ arc_space_consume(size, ARC_SPACE_DATA);
+ }
+
+ /*
+ * Update the state size. Note that ghost states have a
+ * "ghost size" and so don't need to be updated.
+ */
+ if (!GHOST_STATE(state)) {
+
+ (void) refcount_add_many(&state->arcs_size, size, tag);
+
+ /*
+ * If this is reached via arc_read, the link is
+ * protected by the hash lock. If reached via
+ * arc_buf_alloc, the header should not be accessed by
+ * any other thread. And, if reached via arc_read_done,
+ * the hash lock will protect it if it's found in the
+ * hash table; otherwise no other thread should be
+ * trying to [add|remove]_reference it.
+ */
+ if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
+ ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+ (void) refcount_add_many(&state->arcs_esize[type],
+ size, tag);
+ }
+
+ /*
+ * If we are growing the cache, and we are adding anonymous
+ * data, and we have outgrown arc_p, update arc_p
+ */
+ if (aggsum_compare(&arc_size, arc_c) < 0 &&
+ hdr->b_l1hdr.b_state == arc_anon &&
+ (refcount_count(&arc_anon->arcs_size) +
+ refcount_count(&arc_mru->arcs_size) > arc_p))
+ arc_p = MIN(arc_c, arc_p + size);
+ }
+ ARCSTAT_BUMP(arcstat_allocated);
+}
+
+static void
+arc_free_data_abd(arc_buf_hdr_t *hdr, abd_t *abd, uint64_t size, void *tag)
+{
+ arc_free_data_impl(hdr, size, tag);
+ abd_free(abd);
+}
+
+static void
+arc_free_data_buf(arc_buf_hdr_t *hdr, void *buf, uint64_t size, void *tag)
+{
+ arc_buf_contents_t type = arc_buf_type(hdr);
+
+ arc_free_data_impl(hdr, size, tag);
+ if (type == ARC_BUFC_METADATA) {
+ zio_buf_free(buf, size);
+ } else {
+ ASSERT(type == ARC_BUFC_DATA);
+ zio_data_buf_free(buf, size);
+ }
+}
+
+/*
+ * Free the arc data buffer.
+ */
+static void
+arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
+{
+ arc_state_t *state = hdr->b_l1hdr.b_state;
+ arc_buf_contents_t type = arc_buf_type(hdr);
+
+ /* protected by hash lock, if in the hash table */
+ if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
+ ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+ ASSERT(state != arc_anon && state != arc_l2c_only);
+
+ (void) refcount_remove_many(&state->arcs_esize[type],
+ size, tag);
+ }
+ (void) refcount_remove_many(&state->arcs_size, size, tag);
+
+ VERIFY3U(hdr->b_type, ==, type);
+ if (type == ARC_BUFC_METADATA) {
+ arc_space_return(size, ARC_SPACE_META);
+ } else {
+ ASSERT(type == ARC_BUFC_DATA);
+ arc_space_return(size, ARC_SPACE_DATA);
+ }
+}
+
+/*
+ * This routine is called whenever a buffer is accessed.
+ * NOTE: the hash lock is dropped in this function.
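+ *
+ * In summary, the transitions implemented below are:
+ * anon -> mru; mru -> mfu (after ARC_MINTIME); mru ghost -> mfu
+ * (or mru for prefetches); mfu stays mfu; mfu ghost -> mfu (or
+ * mru for prefetches); l2c_only -> mfu.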
+ */ +static void +arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) +{ + clock_t now; + + ASSERT(MUTEX_HELD(hash_lock)); + ASSERT(HDR_HAS_L1HDR(hdr)); + + if (hdr->b_l1hdr.b_state == arc_anon) { + /* + * This buffer is not in the cache, and does not + * appear in our "ghost" list. Add the new buffer + * to the MRU state. + */ + + ASSERT0(hdr->b_l1hdr.b_arc_access); + hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); + DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); + arc_change_state(arc_mru, hdr, hash_lock); + + } else if (hdr->b_l1hdr.b_state == arc_mru) { + now = ddi_get_lbolt(); + + /* + * If this buffer is here because of a prefetch, then either: + * - clear the flag if this is a "referencing" read + * (any subsequent access will bump this into the MFU state). + * or + * - move the buffer to the head of the list if this is + * another prefetch (to make it less likely to be evicted). + */ + if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) { + if (refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { + /* link protected by hash lock */ + ASSERT(multilist_link_active( + &hdr->b_l1hdr.b_arc_node)); + } else { + arc_hdr_clear_flags(hdr, + ARC_FLAG_PREFETCH | + ARC_FLAG_PRESCIENT_PREFETCH); + ARCSTAT_BUMP(arcstat_mru_hits); + } + hdr->b_l1hdr.b_arc_access = now; + return; + } + + /* + * This buffer has been "accessed" only once so far, + * but it is still in the cache. Move it to the MFU + * state. + */ + if (now > hdr->b_l1hdr.b_arc_access + ARC_MINTIME) { + /* + * More than 125ms have passed since we + * instantiated this buffer. Move it to the + * most frequently used state. + */ + hdr->b_l1hdr.b_arc_access = now; + DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); + arc_change_state(arc_mfu, hdr, hash_lock); + } + atomic_inc_32(&hdr->b_l1hdr.b_mru_hits); + ARCSTAT_BUMP(arcstat_mru_hits); + } else if (hdr->b_l1hdr.b_state == arc_mru_ghost) { + arc_state_t *new_state; + /* + * This buffer has been "accessed" recently, but + * was evicted from the cache. Move it to the + * MFU state. + */ + + if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) { + new_state = arc_mru; + if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) { + arc_hdr_clear_flags(hdr, + ARC_FLAG_PREFETCH | + ARC_FLAG_PRESCIENT_PREFETCH); + } + DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); + } else { + new_state = arc_mfu; + DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); + } + + hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); + arc_change_state(new_state, hdr, hash_lock); + + atomic_inc_32(&hdr->b_l1hdr.b_mru_ghost_hits); + ARCSTAT_BUMP(arcstat_mru_ghost_hits); + } else if (hdr->b_l1hdr.b_state == arc_mfu) { + /* + * This buffer has been accessed more than once and is + * still in the cache. Keep it in the MFU state. + * + * NOTE: an add_reference() that occurred when we did + * the arc_read() will have kicked this off the list. + * If it was a prefetch, we will explicitly move it to + * the head of the list now. + */ + + atomic_inc_32(&hdr->b_l1hdr.b_mfu_hits); + ARCSTAT_BUMP(arcstat_mfu_hits); + hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); + } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) { + arc_state_t *new_state = arc_mfu; + /* + * This buffer has been accessed more than once but has + * been evicted from the cache. Move it back to the + * MFU state. + */ + + if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) { + /* + * This is a prefetch access... + * move this block back to the MRU state. 
+ */ + new_state = arc_mru; + } + + hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); + DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); + arc_change_state(new_state, hdr, hash_lock); + + atomic_inc_32(&hdr->b_l1hdr.b_mfu_ghost_hits); + ARCSTAT_BUMP(arcstat_mfu_ghost_hits); + } else if (hdr->b_l1hdr.b_state == arc_l2c_only) { + /* + * This buffer is on the 2nd Level ARC. + */ + + hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); + DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); + arc_change_state(arc_mfu, hdr, hash_lock); + } else { + ASSERT(!"invalid arc state"); + } +} + +/* + * This routine is called by dbuf_hold() to update the arc_access() state + * which otherwise would be skipped for entries in the dbuf cache. + */ +void +arc_buf_access(arc_buf_t *buf) +{ + mutex_enter(&buf->b_evict_lock); + arc_buf_hdr_t *hdr = buf->b_hdr; + + /* + * Avoid taking the hash_lock when possible as an optimization. + * The header must be checked again under the hash_lock in order + * to handle the case where it is concurrently being released. + */ + if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) { + mutex_exit(&buf->b_evict_lock); + ARCSTAT_BUMP(arcstat_access_skip); + return; + } + + kmutex_t *hash_lock = HDR_LOCK(hdr); + mutex_enter(hash_lock); + + if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) { + mutex_exit(hash_lock); + mutex_exit(&buf->b_evict_lock); + ARCSTAT_BUMP(arcstat_access_skip); + return; + } + + mutex_exit(&buf->b_evict_lock); + + ASSERT(hdr->b_l1hdr.b_state == arc_mru || + hdr->b_l1hdr.b_state == arc_mfu); + + DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); + arc_access(hdr, hash_lock); + mutex_exit(hash_lock); + + ARCSTAT_BUMP(arcstat_hits); + ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), + demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, hits); +} + +/* a generic arc_read_done_func_t which you can use */ +/* ARGSUSED */ +void +arc_bcopy_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, + arc_buf_t *buf, void *arg) +{ + if (buf == NULL) + return; + + bcopy(buf->b_data, arg, arc_buf_size(buf)); + arc_buf_destroy(buf, arg); +} + +/* a generic arc_read_done_func_t */ +/* ARGSUSED */ +void +arc_getbuf_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, + arc_buf_t *buf, void *arg) +{ + arc_buf_t **bufp = arg; + if (buf == NULL) { + ASSERT(zio == NULL || zio->io_error != 0); + *bufp = NULL; + } else { + ASSERT(zio == NULL || zio->io_error == 0); + *bufp = buf; + ASSERT(buf->b_data != NULL); + } +} + +static void +arc_hdr_verify(arc_buf_hdr_t *hdr, blkptr_t *bp) +{ + if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) { + ASSERT3U(HDR_GET_PSIZE(hdr), ==, 0); + ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); + } else { + if (HDR_COMPRESSION_ENABLED(hdr)) { + ASSERT3U(HDR_GET_COMPRESS(hdr), ==, + BP_GET_COMPRESS(bp)); + } + ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp)); + ASSERT3U(HDR_GET_PSIZE(hdr), ==, BP_GET_PSIZE(bp)); + } +} + +static void +arc_read_done(zio_t *zio) +{ + arc_buf_hdr_t *hdr = zio->io_private; + kmutex_t *hash_lock = NULL; + arc_callback_t *callback_list; + arc_callback_t *acb; + boolean_t freeable = B_FALSE; + boolean_t no_zio_error = (zio->io_error == 0); + + /* + * The hdr was inserted into hash-table and removed from lists + * prior to starting I/O. We should find this header, since + * it's in the hash table, and it should be legit since it's + * not possible to evict it during the I/O. The only possible + * reason for it not to be found is if we were freed during the + * read. 
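+ * (In that case HDR_IN_HASH_TABLE() below is false and
+ * hash_lock stays NULL; see the handling further down.)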
+ */ + if (HDR_IN_HASH_TABLE(hdr)) { + ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp)); + ASSERT3U(hdr->b_dva.dva_word[0], ==, + BP_IDENTITY(zio->io_bp)->dva_word[0]); + ASSERT3U(hdr->b_dva.dva_word[1], ==, + BP_IDENTITY(zio->io_bp)->dva_word[1]); + + arc_buf_hdr_t *found = buf_hash_find(hdr->b_spa, zio->io_bp, + &hash_lock); + + ASSERT((found == hdr && + DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || + (found == hdr && HDR_L2_READING(hdr))); + ASSERT3P(hash_lock, !=, NULL); + } + + if (no_zio_error) { + /* byteswap if necessary */ + if (BP_SHOULD_BYTESWAP(zio->io_bp)) { + if (BP_GET_LEVEL(zio->io_bp) > 0) { + hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64; + } else { + hdr->b_l1hdr.b_byteswap = + DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp)); + } + } else { + hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; + } + } + + arc_hdr_clear_flags(hdr, ARC_FLAG_L2_EVICTED); + if (l2arc_noprefetch && HDR_PREFETCH(hdr)) + arc_hdr_clear_flags(hdr, ARC_FLAG_L2CACHE); + + callback_list = hdr->b_l1hdr.b_acb; + ASSERT3P(callback_list, !=, NULL); + + if (hash_lock && no_zio_error && hdr->b_l1hdr.b_state == arc_anon) { + /* + * Only call arc_access on anonymous buffers. This is because + * if we've issued an I/O for an evicted buffer, we've already + * called arc_access (to prevent any simultaneous readers from + * getting confused). + */ + arc_access(hdr, hash_lock); + } + + /* + * If a read request has a callback (i.e. acb_done is not NULL), then we + * make a buf containing the data according to the parameters which were + * passed in. The implementation of arc_buf_alloc_impl() ensures that we + * aren't needlessly decompressing the data multiple times. + */ + int callback_cnt = 0; + for (acb = callback_list; acb != NULL; acb = acb->acb_next) { + if (!acb->acb_done) + continue; + + callback_cnt++; + + if (no_zio_error) { + int error = arc_buf_alloc_impl(hdr, acb->acb_private, + acb->acb_compressed, zio->io_error == 0, + &acb->acb_buf); + if (error != 0) { + /* + * Decompression failed. Set io_error + * so that when we call acb_done (below), + * we will indicate that the read failed. + * Note that in the unusual case where one + * callback is compressed and another + * uncompressed, we will mark all of them + * as failed, even though the uncompressed + * one can't actually fail. In this case, + * the hdr will not be anonymous, because + * if there are multiple callbacks, it's + * because multiple threads found the same + * arc buf in the hash table. + */ + zio->io_error = error; + } + } + } + /* + * If there are multiple callbacks, we must have the hash lock, + * because the only way for multiple threads to find this hdr is + * in the hash table. This ensures that if there are multiple + * callbacks, the hdr is not anonymous. If it were anonymous, + * we couldn't use arc_buf_destroy() in the error case below. 
+ */ + ASSERT(callback_cnt < 2 || hash_lock != NULL); + + hdr->b_l1hdr.b_acb = NULL; + arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); + if (callback_cnt == 0) { + ASSERT(HDR_PREFETCH(hdr)); + ASSERT0(hdr->b_l1hdr.b_bufcnt); + ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); + } + + ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) || + callback_list != NULL); + + if (no_zio_error) { + arc_hdr_verify(hdr, zio->io_bp); + } else { + arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR); + if (hdr->b_l1hdr.b_state != arc_anon) + arc_change_state(arc_anon, hdr, hash_lock); + if (HDR_IN_HASH_TABLE(hdr)) + buf_hash_remove(hdr); + freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt); + } + + /* + * Broadcast before we drop the hash_lock to avoid the possibility + * that the hdr (and hence the cv) might be freed before we get to + * the cv_broadcast(). + */ + cv_broadcast(&hdr->b_l1hdr.b_cv); + + if (hash_lock != NULL) { + mutex_exit(hash_lock); + } else { + /* + * This block was freed while we waited for the read to + * complete. It has been removed from the hash table and + * moved to the anonymous state (so that it won't show up + * in the cache). + */ + ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); + freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt); + } + + /* execute each callback and free its structure */ + while ((acb = callback_list) != NULL) { + if (acb->acb_done != NULL) { + if (zio->io_error != 0 && acb->acb_buf != NULL) { + /* + * If arc_buf_alloc_impl() fails during + * decompression, the buf will still be + * allocated, and needs to be freed here. + */ + arc_buf_destroy(acb->acb_buf, acb->acb_private); + acb->acb_buf = NULL; + } + acb->acb_done(zio, &zio->io_bookmark, zio->io_bp, + acb->acb_buf, acb->acb_private); + } + + if (acb->acb_zio_dummy != NULL) { + acb->acb_zio_dummy->io_error = zio->io_error; + zio_nowait(acb->acb_zio_dummy); + } + + callback_list = acb->acb_next; + kmem_free(acb, sizeof (arc_callback_t)); + } + + if (freeable) + arc_hdr_destroy(hdr); +} + +/* + * "Read" the block at the specified DVA (in bp) via the + * cache. If the block is found in the cache, invoke the provided + * callback immediately and return. Note that the `zio' parameter + * in the callback will be NULL in this case, since no IO was + * required. If the block is not in the cache pass the read request + * on to the spa with a substitute callback function, so that the + * requested block will be added to the cache. + * + * If a read request arrives for a block that has a read in-progress, + * either wait for the in-progress read to complete (and return the + * results); or, if this is a read with a "done" func, add a record + * to the read to invoke the "done" func when the read completes, + * and return; or just return. + * + * arc_read_done() will invoke all the requested "done" functions + * for readers of this block. + */ +int +arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_read_done_func_t *done, + void *private, zio_priority_t priority, int zio_flags, + arc_flags_t *arc_flags, const zbookmark_phys_t *zb) +{ + arc_buf_hdr_t *hdr = NULL; + kmutex_t *hash_lock = NULL; + zio_t *rzio; + uint64_t guid = spa_load_guid(spa); + boolean_t compressed_read = (zio_flags & ZIO_FLAG_RAW) != 0; + int rc = 0; + + ASSERT(!BP_IS_EMBEDDED(bp) || + BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA); + +top: + if (!BP_IS_EMBEDDED(bp)) { + /* + * Embedded BP's have no DVA and require no I/O to "read". + * Create an anonymous arc buf to back it. 
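+ * (Put differently: embedded BPs skip the hash lookup below
+ * entirely and fall through with hdr == NULL.)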
+ */ + hdr = buf_hash_find(guid, bp, &hash_lock); + } + + if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_pabd != NULL) { + arc_buf_t *buf = NULL; + *arc_flags |= ARC_FLAG_CACHED; + + if (HDR_IO_IN_PROGRESS(hdr)) { + zio_t *head_zio = hdr->b_l1hdr.b_acb->acb_zio_head; + + ASSERT3P(head_zio, !=, NULL); + if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) && + priority == ZIO_PRIORITY_SYNC_READ) { + /* + * This is a sync read that needs to wait for + * an in-flight async read. Request that the + * zio have its priority upgraded. + */ + zio_change_priority(head_zio, priority); + DTRACE_PROBE1(arc__async__upgrade__sync, + arc_buf_hdr_t *, hdr); + ARCSTAT_BUMP(arcstat_async_upgrade_sync); + } + if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) { + arc_hdr_clear_flags(hdr, + ARC_FLAG_PREDICTIVE_PREFETCH); + } + + if (*arc_flags & ARC_FLAG_WAIT) { + cv_wait(&hdr->b_l1hdr.b_cv, hash_lock); + mutex_exit(hash_lock); + goto top; + } + ASSERT(*arc_flags & ARC_FLAG_NOWAIT); + + if (done) { + arc_callback_t *acb = NULL; + + acb = kmem_zalloc(sizeof (arc_callback_t), + KM_SLEEP); + acb->acb_done = done; + acb->acb_private = private; + acb->acb_compressed = compressed_read; + if (pio != NULL) + acb->acb_zio_dummy = zio_null(pio, + spa, NULL, NULL, NULL, zio_flags); + + ASSERT3P(acb->acb_done, !=, NULL); + acb->acb_zio_head = head_zio; + acb->acb_next = hdr->b_l1hdr.b_acb; + hdr->b_l1hdr.b_acb = acb; + mutex_exit(hash_lock); + return (0); + } + mutex_exit(hash_lock); + return (0); + } + + ASSERT(hdr->b_l1hdr.b_state == arc_mru || + hdr->b_l1hdr.b_state == arc_mfu); + + if (done) { + if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) { + /* + * This is a demand read which does not have to + * wait for i/o because we did a predictive + * prefetch i/o for it, which has completed. + */ + DTRACE_PROBE1( + arc__demand__hit__predictive__prefetch, + arc_buf_hdr_t *, hdr); + ARCSTAT_BUMP( + arcstat_demand_hit_predictive_prefetch); + arc_hdr_clear_flags(hdr, + ARC_FLAG_PREDICTIVE_PREFETCH); + } + + if (hdr->b_flags & ARC_FLAG_PRESCIENT_PREFETCH) { + ARCSTAT_BUMP( + arcstat_demand_hit_prescient_prefetch); + arc_hdr_clear_flags(hdr, + ARC_FLAG_PRESCIENT_PREFETCH); + } + + ASSERT(!BP_IS_EMBEDDED(bp) || !BP_IS_HOLE(bp)); + /* Get a buf with the desired data in it. 
*/
+ rc = arc_buf_alloc_impl(hdr, private,
+ compressed_read, B_TRUE, &buf);
+ if (rc != 0) {
+ arc_buf_destroy(buf, private);
+ buf = NULL;
+ }
+ ASSERT((zio_flags & ZIO_FLAG_SPECULATIVE) ||
+ rc == 0 || rc != ENOENT);
+ } else if (*arc_flags & ARC_FLAG_PREFETCH &&
+ refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
+ arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
+ }
+ DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
+ arc_access(hdr, hash_lock);
+ if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH)
+ arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH);
+ if (*arc_flags & ARC_FLAG_L2CACHE)
+ arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
+ mutex_exit(hash_lock);
+ ARCSTAT_BUMP(arcstat_hits);
+ ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
+ demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
+ data, metadata, hits);
+
+ if (done)
+ done(NULL, zb, bp, buf, private);
+ } else {
+ uint64_t lsize = BP_GET_LSIZE(bp);
+ uint64_t psize = BP_GET_PSIZE(bp);
+ arc_callback_t *acb;
+ vdev_t *vd = NULL;
+ uint64_t addr = 0;
+ boolean_t devw = B_FALSE;
+ uint64_t size;
+
+ if (hdr == NULL) {
+ /* this block is not in the cache */
+ arc_buf_hdr_t *exists = NULL;
+ arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
+ hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize,
+ BP_GET_COMPRESS(bp), type);
+
+ if (!BP_IS_EMBEDDED(bp)) {
+ hdr->b_dva = *BP_IDENTITY(bp);
+ hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
+ exists = buf_hash_insert(hdr, &hash_lock);
+ }
+ if (exists != NULL) {
+ /* somebody beat us to the hash insert */
+ mutex_exit(hash_lock);
+ buf_discard_identity(hdr);
+ arc_hdr_destroy(hdr);
+ goto top; /* restart the IO request */
+ }
+ } else {
+ /*
+ * This block is in the ghost cache. If it was L2-only
+ * (and thus didn't have an L1 hdr), we realloc the
+ * header to add an L1 hdr.
+ */
+ if (!HDR_HAS_L1HDR(hdr)) {
+ hdr = arc_hdr_realloc(hdr, hdr_l2only_cache,
+ hdr_full_cache);
+ }
+ ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
+ ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state));
+ ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+ ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+ ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
+ ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
+
+ /*
+ * This is a delicate dance that we play here.
+ * This hdr is in the ghost list so we access it
+ * to move it out of the ghost list before we
+ * initiate the read. If it's a prefetch then
+ * it won't have a callback so we'll remove the
+ * reference that arc_buf_alloc_impl() created. We
+ * do this after we've called arc_access() to
+ * avoid hitting an assert in remove_reference().
+ */
+ arc_access(hdr, hash_lock);
+ arc_hdr_alloc_pabd(hdr);
+ }
+ ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
+ size = arc_hdr_size(hdr);
+
+ /*
+ * If compression is enabled on the hdr, then we will do
+ * RAW I/O and will store the compressed data in the hdr's
+ * data block. Otherwise, the hdr's data block will contain
+ * the uncompressed data.
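+ *
+ * For example (sizes invented for illustration): an lz4 block
+ * with lsize 128K and psize 40K is read as 40K of raw,
+ * still-compressed bytes into b_pabd; decompression is left to
+ * arc_buf_alloc_impl() when a caller needs an arc_buf_t.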
+ */ + if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) { + zio_flags |= ZIO_FLAG_RAW; + } + + if (*arc_flags & ARC_FLAG_PREFETCH) + arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); + if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH) + arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH); + + if (*arc_flags & ARC_FLAG_L2CACHE) + arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); + if (BP_GET_LEVEL(bp) > 0) + arc_hdr_set_flags(hdr, ARC_FLAG_INDIRECT); + if (*arc_flags & ARC_FLAG_PREDICTIVE_PREFETCH) + arc_hdr_set_flags(hdr, ARC_FLAG_PREDICTIVE_PREFETCH); + ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state)); + + acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); + acb->acb_done = done; + acb->acb_private = private; + acb->acb_compressed = compressed_read; + + ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); + hdr->b_l1hdr.b_acb = acb; + arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); + + if (HDR_HAS_L2HDR(hdr) && + (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) { + devw = hdr->b_l2hdr.b_dev->l2ad_writing; + addr = hdr->b_l2hdr.b_daddr; + /* + * Lock out L2ARC device removal. + */ + if (vdev_is_dead(vd) || + !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER)) + vd = NULL; + } + + /* + * We count both async reads and scrub IOs as asynchronous so + * that both can be upgraded in the event of a cache hit while + * the read IO is still in-flight. + */ + if (priority == ZIO_PRIORITY_ASYNC_READ || + priority == ZIO_PRIORITY_SCRUB) + arc_hdr_set_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ); + else + arc_hdr_clear_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ); + + /* + * At this point, we have a level 1 cache miss. Try again in + * L2ARC if possible. + */ + ASSERT3U(HDR_GET_LSIZE(hdr), ==, lsize); + + DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp, + uint64_t, lsize, zbookmark_phys_t *, zb); + ARCSTAT_BUMP(arcstat_misses); + ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), + demand, prefetch, !HDR_ISTYPE_METADATA(hdr), + data, metadata, misses); +#ifdef _KERNEL +#ifdef RACCT + if (racct_enable) { + PROC_LOCK(curproc); + racct_add_force(curproc, RACCT_READBPS, size); + racct_add_force(curproc, RACCT_READIOPS, 1); + PROC_UNLOCK(curproc); + } +#endif /* RACCT */ + curthread->td_ru.ru_inblock++; +#endif + + if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) { + /* + * Read from the L2ARC if the following are true: + * 1. The L2ARC vdev was previously cached. + * 2. This buffer still has L2ARC metadata. + * 3. This buffer isn't currently writing to the L2ARC. + * 4. The L2ARC entry wasn't evicted, which may + * also have invalidated the vdev. + * 5. This isn't prefetch and l2arc_noprefetch is set. + */ + if (HDR_HAS_L2HDR(hdr) && + !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) && + !(l2arc_noprefetch && HDR_PREFETCH(hdr))) { + l2arc_read_callback_t *cb; + abd_t *abd; + uint64_t asize; + + DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); + ARCSTAT_BUMP(arcstat_l2_hits); + atomic_inc_32(&hdr->b_l2hdr.b_hits); + + cb = kmem_zalloc(sizeof (l2arc_read_callback_t), + KM_SLEEP); + cb->l2rcb_hdr = hdr; + cb->l2rcb_bp = *bp; + cb->l2rcb_zb = *zb; + cb->l2rcb_flags = zio_flags; + + asize = vdev_psize_to_asize(vd, size); + if (asize != size) { + abd = abd_alloc_for_io(asize, + HDR_ISTYPE_METADATA(hdr)); + cb->l2rcb_abd = abd; + } else { + abd = hdr->b_l1hdr.b_pabd; + } + + ASSERT(addr >= VDEV_LABEL_START_SIZE && + addr + asize <= vd->vdev_psize - + VDEV_LABEL_END_SIZE); + + /* + * l2arc read. The SCL_L2ARC lock will be + * released by l2arc_read_done(). + * Issue a null zio if the underlying buffer + * was squashed to zero size by compression. 
+ */ + ASSERT3U(HDR_GET_COMPRESS(hdr), !=, + ZIO_COMPRESS_EMPTY); + rzio = zio_read_phys(pio, vd, addr, + asize, abd, + ZIO_CHECKSUM_OFF, + l2arc_read_done, cb, priority, + zio_flags | ZIO_FLAG_DONT_CACHE | + ZIO_FLAG_CANFAIL | + ZIO_FLAG_DONT_PROPAGATE | + ZIO_FLAG_DONT_RETRY, B_FALSE); + acb->acb_zio_head = rzio; + + if (hash_lock != NULL) + mutex_exit(hash_lock); + + DTRACE_PROBE2(l2arc__read, vdev_t *, vd, + zio_t *, rzio); + ARCSTAT_INCR(arcstat_l2_read_bytes, size); + + if (*arc_flags & ARC_FLAG_NOWAIT) { + zio_nowait(rzio); + return (0); + } + + ASSERT(*arc_flags & ARC_FLAG_WAIT); + if (zio_wait(rzio) == 0) + return (0); + + /* l2arc read error; goto zio_read() */ + if (hash_lock != NULL) + mutex_enter(hash_lock); + } else { + DTRACE_PROBE1(l2arc__miss, + arc_buf_hdr_t *, hdr); + ARCSTAT_BUMP(arcstat_l2_misses); + if (HDR_L2_WRITING(hdr)) + ARCSTAT_BUMP(arcstat_l2_rw_clash); + spa_config_exit(spa, SCL_L2ARC, vd); + } + } else { + if (vd != NULL) + spa_config_exit(spa, SCL_L2ARC, vd); + if (l2arc_ndev != 0) { + DTRACE_PROBE1(l2arc__miss, + arc_buf_hdr_t *, hdr); + ARCSTAT_BUMP(arcstat_l2_misses); + } + } + + rzio = zio_read(pio, spa, bp, hdr->b_l1hdr.b_pabd, size, + arc_read_done, hdr, priority, zio_flags, zb); + acb->acb_zio_head = rzio; + + if (hash_lock != NULL) + mutex_exit(hash_lock); + + if (*arc_flags & ARC_FLAG_WAIT) + return (zio_wait(rzio)); + + ASSERT(*arc_flags & ARC_FLAG_NOWAIT); + zio_nowait(rzio); + } + return (0); +} + +arc_prune_t * +arc_add_prune_callback(arc_prune_func_t *func, void *private) +{ + arc_prune_t *p; + + p = kmem_alloc(sizeof (*p), KM_SLEEP); + p->p_pfunc = func; + p->p_private = private; + list_link_init(&p->p_node); + refcount_create(&p->p_refcnt); + + mutex_enter(&arc_prune_mtx); + refcount_add(&p->p_refcnt, &arc_prune_list); + list_insert_head(&arc_prune_list, p); + mutex_exit(&arc_prune_mtx); + + return (p); +} + +void +arc_remove_prune_callback(arc_prune_t *p) +{ + boolean_t wait = B_FALSE; + mutex_enter(&arc_prune_mtx); + list_remove(&arc_prune_list, p); + if (refcount_remove(&p->p_refcnt, &arc_prune_list) > 0) + wait = B_TRUE; + mutex_exit(&arc_prune_mtx); + + /* wait for arc_prune_task to finish */ + if (wait) + taskq_wait(arc_prune_taskq); + ASSERT0(refcount_count(&p->p_refcnt)); + refcount_destroy(&p->p_refcnt); + kmem_free(p, sizeof (*p)); +} + +/* + * Notify the arc that a block was freed, and thus will never be used again. + */ +void +arc_freed(spa_t *spa, const blkptr_t *bp) +{ + arc_buf_hdr_t *hdr; + kmutex_t *hash_lock; + uint64_t guid = spa_load_guid(spa); + + ASSERT(!BP_IS_EMBEDDED(bp)); + + hdr = buf_hash_find(guid, bp, &hash_lock); + if (hdr == NULL) + return; + + /* + * We might be trying to free a block that is still doing I/O + * (i.e. prefetch) or has a reference (i.e. a dedup-ed, + * dmu_sync-ed block). If this block is being prefetched, then it + * would still have the ARC_FLAG_IO_IN_PROGRESS flag set on the hdr + * until the I/O completes. A block may also have a reference if it is + * part of a dedup-ed, dmu_synced write. The dmu_sync() function would + * have written the new block to its final resting place on disk but + * without the dedup flag set. This would have left the hdr in the MRU + * state and discoverable. When the txg finally syncs it detects that + * the block was overridden in open context and issues an override I/O. + * Since this is a dedup block, the override I/O will determine if the + * block is already in the DDT. 
If so, then it will replace the io_bp + * with the bp from the DDT and allow the I/O to finish. When the I/O + * reaches the done callback, dbuf_write_override_done, it will + * check to see if the io_bp and io_bp_override are identical. + * If they are not, then it indicates that the bp was replaced with + * the bp in the DDT and the override bp is freed. This allows + * us to arrive here with a reference on a block that is being + * freed. So if we have an I/O in progress, or a reference to + * this hdr, then we don't destroy the hdr. + */ + if (!HDR_HAS_L1HDR(hdr) || (!HDR_IO_IN_PROGRESS(hdr) && + refcount_is_zero(&hdr->b_l1hdr.b_refcnt))) { + arc_change_state(arc_anon, hdr, hash_lock); + arc_hdr_destroy(hdr); + mutex_exit(hash_lock); + } else { + mutex_exit(hash_lock); + } + +} + +/* + * Release this buffer from the cache, making it an anonymous buffer. This + * must be done after a read and prior to modifying the buffer contents. + * If the buffer has more than one reference, we must make + * a new hdr for the buffer. + */ +void +arc_release(arc_buf_t *buf, void *tag) +{ + arc_buf_hdr_t *hdr = buf->b_hdr; + + /* + * It would be nice to assert that if it's DMU metadata (level > + * 0 || it's the dnode file), then it must be syncing context. + * But we don't know that information at this level. + */ + + mutex_enter(&buf->b_evict_lock); + + ASSERT(HDR_HAS_L1HDR(hdr)); + + /* + * We don't grab the hash lock prior to this check, because if + * the buffer's header is in the arc_anon state, it won't be + * linked into the hash table. + */ + if (hdr->b_l1hdr.b_state == arc_anon) { + mutex_exit(&buf->b_evict_lock); + ASSERT(!HDR_IO_IN_PROGRESS(hdr)); + ASSERT(!HDR_IN_HASH_TABLE(hdr)); + ASSERT(!HDR_HAS_L2HDR(hdr)); + ASSERT(HDR_EMPTY(hdr)); + ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); + ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1); + ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); + + hdr->b_l1hdr.b_arc_access = 0; + + /* + * If the buf is being overridden then it may already + * have a hdr that is not empty. + */ + buf_discard_identity(hdr); + arc_buf_thaw(buf); + + return; + } + + kmutex_t *hash_lock = HDR_LOCK(hdr); + mutex_enter(hash_lock); + + /* + * This assignment is only valid as long as the hash_lock is + * held, we must be careful not to reference state or the + * b_state field after dropping the lock. + */ + arc_state_t *state = hdr->b_l1hdr.b_state; + ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); + ASSERT3P(state, !=, arc_anon); + + /* this buffer is not on any list */ + ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), >, 0); + + if (HDR_HAS_L2HDR(hdr)) { + mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx); + + /* + * We have to recheck this conditional again now that + * we're holding the l2ad_mtx to prevent a race with + * another thread which might be concurrently calling + * l2arc_evict(). In that case, l2arc_evict() might have + * destroyed the header's L2 portion as we were waiting + * to acquire the l2ad_mtx. + */ + if (HDR_HAS_L2HDR(hdr)) { + l2arc_trim(hdr); + arc_hdr_l2hdr_destroy(hdr); + } + + mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx); + } + + /* + * Do we have more than one buf? 
+ */ + if (hdr->b_l1hdr.b_bufcnt > 1) { + arc_buf_hdr_t *nhdr; + uint64_t spa = hdr->b_spa; + uint64_t psize = HDR_GET_PSIZE(hdr); + uint64_t lsize = HDR_GET_LSIZE(hdr); + enum zio_compress compress = HDR_GET_COMPRESS(hdr); + arc_buf_contents_t type = arc_buf_type(hdr); + VERIFY3U(hdr->b_type, ==, type); + + ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL); + (void) remove_reference(hdr, hash_lock, tag); + + if (arc_buf_is_shared(buf) && !ARC_BUF_COMPRESSED(buf)) { + ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); + ASSERT(ARC_BUF_LAST(buf)); + } + + /* + * Pull the data off of this hdr and attach it to + * a new anonymous hdr. Also find the last buffer + * in the hdr's buffer list. + */ + arc_buf_t *lastbuf = arc_buf_remove(hdr, buf); + ASSERT3P(lastbuf, !=, NULL); + + /* + * If the current arc_buf_t and the hdr are sharing their data + * buffer, then we must stop sharing that block. + */ + if (arc_buf_is_shared(buf)) { + VERIFY(!arc_buf_is_shared(lastbuf)); + + /* + * First, sever the block sharing relationship between + * buf and the arc_buf_hdr_t. + */ + arc_unshare_buf(hdr, buf); + + /* + * Now we need to recreate the hdr's b_pabd. Since we + * have lastbuf handy, we try to share with it, but if + * we can't then we allocate a new b_pabd and copy the + * data from buf into it. + */ + if (arc_can_share(hdr, lastbuf)) { + arc_share_buf(hdr, lastbuf); + } else { + arc_hdr_alloc_pabd(hdr); + abd_copy_from_buf(hdr->b_l1hdr.b_pabd, + buf->b_data, psize); + } + VERIFY3P(lastbuf->b_data, !=, NULL); + } else if (HDR_SHARED_DATA(hdr)) { + /* + * Uncompressed shared buffers are always at the end + * of the list. Compressed buffers don't have the + * same requirements. This makes it hard to + * simply assert that the lastbuf is shared so + * we rely on the hdr's compression flags to determine + * if we have a compressed, shared buffer. + */ + ASSERT(arc_buf_is_shared(lastbuf) || + HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF); + ASSERT(!ARC_BUF_SHARED(buf)); + } + ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); + ASSERT3P(state, !=, arc_l2c_only); + + (void) refcount_remove_many(&state->arcs_size, + arc_buf_size(buf), buf); + + if (refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { + ASSERT3P(state, !=, arc_l2c_only); + (void) refcount_remove_many(&state->arcs_esize[type], + arc_buf_size(buf), buf); + } + + hdr->b_l1hdr.b_bufcnt -= 1; + arc_cksum_verify(buf); +#ifdef illumos + arc_buf_unwatch(buf); +#endif + + mutex_exit(hash_lock); + + /* + * Allocate a new hdr. The new hdr will contain a b_pabd + * buffer which will be freed in arc_write(). 
+ */ + nhdr = arc_hdr_alloc(spa, psize, lsize, compress, type); + ASSERT3P(nhdr->b_l1hdr.b_buf, ==, NULL); + ASSERT0(nhdr->b_l1hdr.b_bufcnt); + ASSERT0(refcount_count(&nhdr->b_l1hdr.b_refcnt)); + VERIFY3U(nhdr->b_type, ==, type); + ASSERT(!HDR_SHARED_DATA(nhdr)); + + nhdr->b_l1hdr.b_buf = buf; + nhdr->b_l1hdr.b_bufcnt = 1; + (void) refcount_add(&nhdr->b_l1hdr.b_refcnt, tag); + buf->b_hdr = nhdr; + + mutex_exit(&buf->b_evict_lock); + (void) refcount_add_many(&arc_anon->arcs_size, + arc_buf_size(buf), buf); + } else { + mutex_exit(&buf->b_evict_lock); + ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) == 1); + /* protected by hash lock, or hdr is on arc_anon */ + ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); + ASSERT(!HDR_IO_IN_PROGRESS(hdr)); + arc_change_state(arc_anon, hdr, hash_lock); + hdr->b_l1hdr.b_arc_access = 0; + mutex_exit(hash_lock); + + buf_discard_identity(hdr); + arc_buf_thaw(buf); + } +} + +int +arc_released(arc_buf_t *buf) +{ + int released; + + mutex_enter(&buf->b_evict_lock); + released = (buf->b_data != NULL && + buf->b_hdr->b_l1hdr.b_state == arc_anon); + mutex_exit(&buf->b_evict_lock); + return (released); +} + +#ifdef ZFS_DEBUG +int +arc_referenced(arc_buf_t *buf) +{ + int referenced; + + mutex_enter(&buf->b_evict_lock); + referenced = (refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt)); + mutex_exit(&buf->b_evict_lock); + return (referenced); +} +#endif + +static void +arc_write_ready(zio_t *zio) +{ + arc_write_callback_t *callback = zio->io_private; + arc_buf_t *buf = callback->awcb_buf; + arc_buf_hdr_t *hdr = buf->b_hdr; + uint64_t psize = BP_IS_HOLE(zio->io_bp) ? 0 : BP_GET_PSIZE(zio->io_bp); + + ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt)); + ASSERT(hdr->b_l1hdr.b_bufcnt > 0); + + /* + * If we're reexecuting this zio because the pool suspended, then + * cleanup any state that was previously set the first time the + * callback was invoked. + */ + if (zio->io_flags & ZIO_FLAG_REEXECUTED) { + arc_cksum_free(hdr); +#ifdef illumos + arc_buf_unwatch(buf); +#endif + if (hdr->b_l1hdr.b_pabd != NULL) { + if (arc_buf_is_shared(buf)) { + arc_unshare_buf(hdr, buf); + } else { + arc_hdr_free_pabd(hdr); + } + } + } + ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); + ASSERT(!HDR_SHARED_DATA(hdr)); + ASSERT(!arc_buf_is_shared(buf)); + + callback->awcb_ready(zio, buf, callback->awcb_private); + + if (HDR_IO_IN_PROGRESS(hdr)) + ASSERT(zio->io_flags & ZIO_FLAG_REEXECUTED); + + arc_cksum_compute(buf); + arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); + + enum zio_compress compress; + if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { + compress = ZIO_COMPRESS_OFF; + } else { + ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(zio->io_bp)); + compress = BP_GET_COMPRESS(zio->io_bp); + } + HDR_SET_PSIZE(hdr, psize); + arc_hdr_set_compress(hdr, compress); + + + /* + * Fill the hdr with data. If the hdr is compressed, the data we want + * is available from the zio, otherwise we can take it from the buf. + * + * We might be able to share the buf's data with the hdr here. However, + * doing so would cause the ARC to be full of linear ABDs if we write a + * lot of shareable data. As a compromise, we check whether scattered + * ABDs are allowed, and assume that if they are then the user wants + * the ARC to be primarily filled with them regardless of the data being + * written. Therefore, if they're allowed then we allocate one and copy + * the data into it; otherwise, we share the data directly if we can. 
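+ *
+ * Restating the branch below: if zfs_abd_scatter_enabled is set
+ * we always allocate a fresh (possibly scattered) b_pabd and
+ * copy into it; only when scattered ABDs are disabled and
+ * arc_can_share() permits do we point the hdr at the buf's
+ * linear data.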
+ */ + if (zfs_abd_scatter_enabled || !arc_can_share(hdr, buf)) { + arc_hdr_alloc_pabd(hdr); + + /* + * Ideally, we would always copy the io_abd into b_pabd, but the + * user may have disabled compressed ARC, thus we must check the + * hdr's compression setting rather than the io_bp's. + */ + if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) { + ASSERT3U(BP_GET_COMPRESS(zio->io_bp), !=, + ZIO_COMPRESS_OFF); + ASSERT3U(psize, >, 0); + + abd_copy(hdr->b_l1hdr.b_pabd, zio->io_abd, psize); + } else { + ASSERT3U(zio->io_orig_size, ==, arc_hdr_size(hdr)); + + abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data, + arc_buf_size(buf)); + } + } else { + ASSERT3P(buf->b_data, ==, abd_to_buf(zio->io_orig_abd)); + ASSERT3U(zio->io_orig_size, ==, arc_buf_size(buf)); + ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); + + arc_share_buf(hdr, buf); + } + + arc_hdr_verify(hdr, zio->io_bp); +} + +static void +arc_write_children_ready(zio_t *zio) +{ + arc_write_callback_t *callback = zio->io_private; + arc_buf_t *buf = callback->awcb_buf; + + callback->awcb_children_ready(zio, buf, callback->awcb_private); +} + +/* + * The SPA calls this callback for each physical write that happens on behalf + * of a logical write. See the comment in dbuf_write_physdone() for details. + */ +static void +arc_write_physdone(zio_t *zio) +{ + arc_write_callback_t *cb = zio->io_private; + if (cb->awcb_physdone != NULL) + cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private); +} + +static void +arc_write_done(zio_t *zio) +{ + arc_write_callback_t *callback = zio->io_private; + arc_buf_t *buf = callback->awcb_buf; + arc_buf_hdr_t *hdr = buf->b_hdr; + + ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); + + if (zio->io_error == 0) { + arc_hdr_verify(hdr, zio->io_bp); + + if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { + buf_discard_identity(hdr); + } else { + hdr->b_dva = *BP_IDENTITY(zio->io_bp); + hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); + } + } else { + ASSERT(HDR_EMPTY(hdr)); + } + + /* + * If the block to be written was all-zero or compressed enough to be + * embedded in the BP, no write was performed so there will be no + * dva/birth/checksum. The buffer must therefore remain anonymous + * (and uncached). + */ + if (!HDR_EMPTY(hdr)) { + arc_buf_hdr_t *exists; + kmutex_t *hash_lock; + + ASSERT3U(zio->io_error, ==, 0); + + arc_cksum_verify(buf); + + exists = buf_hash_insert(hdr, &hash_lock); + if (exists != NULL) { + /* + * This can only happen if we overwrite for + * sync-to-convergence, because we remove + * buffers from the hash table when we arc_free(). 
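+ * (The three cases handled below: sync-to-convergence
+ * rewrites, nopwrite, and dedup.)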
+ */ + if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { + if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) + panic("bad overwrite, hdr=%p exists=%p", + (void *)hdr, (void *)exists); + ASSERT(refcount_is_zero( + &exists->b_l1hdr.b_refcnt)); + arc_change_state(arc_anon, exists, hash_lock); + mutex_exit(hash_lock); + arc_hdr_destroy(exists); + exists = buf_hash_insert(hdr, &hash_lock); + ASSERT3P(exists, ==, NULL); + } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) { + /* nopwrite */ + ASSERT(zio->io_prop.zp_nopwrite); + if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) + panic("bad nopwrite, hdr=%p exists=%p", + (void *)hdr, (void *)exists); + } else { + /* Dedup */ + ASSERT(hdr->b_l1hdr.b_bufcnt == 1); + ASSERT(hdr->b_l1hdr.b_state == arc_anon); + ASSERT(BP_GET_DEDUP(zio->io_bp)); + ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); + } + } + arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); + /* if it's not anon, we are doing a scrub */ + if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon) + arc_access(hdr, hash_lock); + mutex_exit(hash_lock); + } else { + arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); + } + + ASSERT(!refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); + callback->awcb_done(zio, buf, callback->awcb_private); + + abd_put(zio->io_abd); + kmem_free(callback, sizeof (arc_write_callback_t)); +} + +zio_t * +arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, + boolean_t l2arc, const zio_prop_t *zp, arc_write_done_func_t *ready, + arc_write_done_func_t *children_ready, arc_write_done_func_t *physdone, + arc_write_done_func_t *done, void *private, zio_priority_t priority, + int zio_flags, const zbookmark_phys_t *zb) +{ + arc_buf_hdr_t *hdr = buf->b_hdr; + arc_write_callback_t *callback; + zio_t *zio; + zio_prop_t localprop = *zp; + + ASSERT3P(ready, !=, NULL); + ASSERT3P(done, !=, NULL); + ASSERT(!HDR_IO_ERROR(hdr)); + ASSERT(!HDR_IO_IN_PROGRESS(hdr)); + ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); + ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0); + if (l2arc) + arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); + if (ARC_BUF_COMPRESSED(buf)) { + /* + * We're writing a pre-compressed buffer. Make the + * compression algorithm requested by the zio_prop_t match + * the pre-compressed buffer's compression algorithm. + */ + localprop.zp_compress = HDR_GET_COMPRESS(hdr); + + ASSERT3U(HDR_GET_LSIZE(hdr), !=, arc_buf_size(buf)); + zio_flags |= ZIO_FLAG_RAW; + } + callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); + callback->awcb_ready = ready; + callback->awcb_children_ready = children_ready; + callback->awcb_physdone = physdone; + callback->awcb_done = done; + callback->awcb_private = private; + callback->awcb_buf = buf; + + /* + * The hdr's b_pabd is now stale, free it now. A new data block + * will be allocated when the zio pipeline calls arc_write_ready(). + */ + if (hdr->b_l1hdr.b_pabd != NULL) { + /* + * If the buf is currently sharing the data block with + * the hdr then we need to break that relationship here. + * The hdr will remain with a NULL data pointer and the + * buf will take sole ownership of the block. + */ + if (arc_buf_is_shared(buf)) { + arc_unshare_buf(hdr, buf); + } else { + arc_hdr_free_pabd(hdr); + } + VERIFY3P(buf->b_data, !=, NULL); + arc_hdr_set_compress(hdr, ZIO_COMPRESS_OFF); + } + ASSERT(!arc_buf_is_shared(buf)); + ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); + + zio = zio_write(pio, spa, txg, bp, + abd_get_from_buf(buf->b_data, HDR_GET_LSIZE(hdr)), + HDR_GET_LSIZE(hdr), arc_buf_size(buf), &localprop, arc_write_ready, + (children_ready != NULL) ? 
arc_write_children_ready : NULL,
+ arc_write_physdone, arc_write_done, callback,
+ priority, zio_flags, zb);
+
+ return (zio);
+}
+
+static int
+arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg)
+{
+#ifdef _KERNEL
+ uint64_t available_memory = ptob(freemem);
+
+#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC)
+ available_memory = MIN(available_memory, uma_avail());
+#endif
+
+ if (freemem > (uint64_t)physmem * arc_lotsfree_percent / 100)
+ return (0);
+
+ if (txg > spa->spa_lowmem_last_txg) {
+ spa->spa_lowmem_last_txg = txg;
+ spa->spa_lowmem_page_load = 0;
+ }
+ /*
+ * If we are in pageout, we know that memory is already tight,
+ * the arc is already going to be evicting, so we just want to
+ * continue to let page writes occur as quickly as possible.
+ */
+ if (curproc == pageproc) {
+ if (spa->spa_lowmem_page_load >
+ MAX(ptob(minfree), available_memory) / 4)
+ return (SET_ERROR(ERESTART));
+ /* Note: reserve is inflated, so we deflate */
+ atomic_add_64(&spa->spa_lowmem_page_load, reserve / 8);
+ return (0);
+ } else if (spa->spa_lowmem_page_load > 0 && arc_reclaim_needed()) {
+ /* memory is low, delay before restarting */
+ ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
+ return (SET_ERROR(EAGAIN));
+ }
+ spa->spa_lowmem_page_load = 0;
+#endif /* _KERNEL */
+ return (0);
+}
+
+void
+arc_tempreserve_clear(uint64_t reserve)
+{
+ atomic_add_64(&arc_tempreserve, -reserve);
+ ASSERT((int64_t)arc_tempreserve >= 0);
+}
+
+int
+arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg)
+{
+ int error;
+ uint64_t anon_size;
+
+ if (reserve > arc_c/4 && !arc_no_grow) {
+ arc_c = MIN(arc_c_max, reserve * 4);
+ DTRACE_PROBE1(arc__set_reserve, uint64_t, arc_c);
+ }
+ if (reserve > arc_c)
+ return (SET_ERROR(ENOMEM));
+
+ /*
+ * Don't count loaned bufs as in flight dirty data to prevent long
+ * network delays from blocking transactions that are ready to be
+ * assigned to a txg.
+ */
+
+ /* assert that it has not wrapped around */
+ ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0);
+
+ anon_size = MAX((int64_t)(refcount_count(&arc_anon->arcs_size) -
+ arc_loaned_bytes), 0);
+
+ /*
+ * Writes will almost always require additional memory allocations
+ * in order to compress/encrypt/etc the data. We therefore need to
+ * make sure that there is sufficient available memory for this.
+ */
+ error = arc_memory_throttle(spa, reserve, txg);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Throttle writes when the amount of dirty data in the cache
+ * gets too large. We try to keep the cache less than half full
+ * of dirty blocks so that our sync times don't grow too large.
+ *
+ * In the case of one pool being built on another pool, we want
+ * to make sure we don't end up throttling the lower (backing)
+ * pool when the upper pool is the majority contributor to dirty
+ * data. To ensure we make forward progress during throttling, we
+ * also check the current pool's net dirty data and only throttle
+ * if it exceeds zfs_arc_pool_dirty_percent of the anonymous dirty
+ * data in the cache.
+ *
+ * Note: if two requests come in concurrently, we might let them
+ * both succeed, when one of them should fail. Not a huge deal.
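+ *
+ * As a worked example (assuming the default tunable values of
+ * zfs_arc_dirty_limit_percent = 50, zfs_arc_anon_limit_percent = 25
+ * and zfs_arc_pool_dirty_percent = 20): with arc_c at 4GB, a request
+ * only fails with ERESTART once reserve + arc_tempreserve + anon_size
+ * exceeds 2GB, anonymous buffers alone exceed 1GB, and this pool has
+ * contributed more than 20% of that anonymous dirty data.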
+ */ + uint64_t total_dirty = reserve + arc_tempreserve + anon_size; + uint64_t spa_dirty_anon = spa_dirty_data(spa); + + if (total_dirty > arc_c * zfs_arc_dirty_limit_percent / 100 && + anon_size > arc_c * zfs_arc_anon_limit_percent / 100 && + spa_dirty_anon > anon_size * zfs_arc_pool_dirty_percent / 100) { + uint64_t meta_esize = + refcount_count(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); + uint64_t data_esize = + refcount_count(&arc_anon->arcs_esize[ARC_BUFC_DATA]); + dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " + "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n", + arc_tempreserve >> 10, meta_esize >> 10, + data_esize >> 10, reserve >> 10, arc_c >> 10); + return (SET_ERROR(ERESTART)); + } + atomic_add_64(&arc_tempreserve, reserve); + return (0); +} + +static void +arc_kstat_update_state(arc_state_t *state, kstat_named_t *size, + kstat_named_t *evict_data, kstat_named_t *evict_metadata) +{ + size->value.ui64 = refcount_count(&state->arcs_size); + evict_data->value.ui64 = + refcount_count(&state->arcs_esize[ARC_BUFC_DATA]); + evict_metadata->value.ui64 = + refcount_count(&state->arcs_esize[ARC_BUFC_METADATA]); +} + +static int +arc_kstat_update(kstat_t *ksp, int rw) +{ + arc_stats_t *as = ksp->ks_data; + + if (rw == KSTAT_WRITE) { + return (EACCES); + } else { + arc_kstat_update_state(arc_anon, + &as->arcstat_anon_size, + &as->arcstat_anon_evictable_data, + &as->arcstat_anon_evictable_metadata); + arc_kstat_update_state(arc_mru, + &as->arcstat_mru_size, + &as->arcstat_mru_evictable_data, + &as->arcstat_mru_evictable_metadata); + arc_kstat_update_state(arc_mru_ghost, + &as->arcstat_mru_ghost_size, + &as->arcstat_mru_ghost_evictable_data, + &as->arcstat_mru_ghost_evictable_metadata); + arc_kstat_update_state(arc_mfu, + &as->arcstat_mfu_size, + &as->arcstat_mfu_evictable_data, + &as->arcstat_mfu_evictable_metadata); + arc_kstat_update_state(arc_mfu_ghost, + &as->arcstat_mfu_ghost_size, + &as->arcstat_mfu_ghost_evictable_data, + &as->arcstat_mfu_ghost_evictable_metadata); + + ARCSTAT(arcstat_size) = aggsum_value(&arc_size); + ARCSTAT(arcstat_meta_used) = aggsum_value(&arc_meta_used); + ARCSTAT(arcstat_data_size) = aggsum_value(&astat_data_size); + ARCSTAT(arcstat_metadata_size) = + aggsum_value(&astat_metadata_size); + ARCSTAT(arcstat_hdr_size) = aggsum_value(&astat_hdr_size); + ARCSTAT(arcstat_bonus_size) = aggsum_value(&astat_bonus_size); + ARCSTAT(arcstat_dnode_size) = aggsum_value(&astat_dnode_size); + ARCSTAT(arcstat_dbuf_size) = aggsum_value(&astat_dbuf_size); +#if defined(__FreeBSD__) && defined(COMPAT_FREEBSD11) + ARCSTAT(arcstat_other_size) = aggsum_value(&astat_bonus_size) + + aggsum_value(&astat_dnode_size) + + aggsum_value(&astat_dbuf_size); +#endif + ARCSTAT(arcstat_l2_hdr_size) = aggsum_value(&astat_l2_hdr_size); + } + + return (0); +} + +/* + * This function *must* return indices evenly distributed between all + * sublists of the multilist. This is needed due to how the ARC eviction + * code is laid out; arc_evict_state() assumes ARC buffers are evenly + * distributed between all sublists and uses this assumption when + * deciding which sublist to evict from and how much to evict from it. + */ +unsigned int +arc_state_multilist_index_func(multilist_t *ml, void *obj) +{ + arc_buf_hdr_t *hdr = obj; + + /* + * We rely on b_dva to generate evenly distributed index + * numbers using buf_hash below. So, as an added precaution, + * let's make sure we never add empty buffers to the arc lists. 
+ */
+ ASSERT(!HDR_EMPTY(hdr));
+
+ /*
+ * The assumption here is that the hash value for a given
+ * arc_buf_hdr_t will remain constant throughout its lifetime
+ * (i.e. its b_spa, b_dva, and b_birth fields don't change).
+ * Thus, we don't need to store the header's sublist index
+ * on insertion, as this index can be recalculated on removal.
+ *
+ * Also, the low order bits of the hash value are thought to be
+ * distributed evenly. Otherwise, in the case that the multilist
+ * has a power of two number of sublists, each sublist's usage
+ * would not be evenly distributed.
+ */
+ return (buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) %
+ multilist_get_num_sublists(ml));
+}
+
+#ifdef _KERNEL
+static eventhandler_tag arc_event_lowmem = NULL;
+
+static void
+arc_lowmem(void *arg __unused, int howto __unused)
+{
+ int64_t free_memory, to_free;
+
+ arc_no_grow = B_TRUE;
+ arc_warm = B_TRUE;
+ arc_growtime = gethrtime() + SEC2NSEC(arc_grow_retry);
+ free_memory = arc_available_memory();
+ to_free = (arc_c >> arc_shrink_shift) - MIN(free_memory, 0);
+ DTRACE_PROBE2(arc__needfree, int64_t, free_memory, int64_t, to_free);
+ arc_reduce_target_size(to_free);
+
+ mutex_enter(&arc_adjust_lock);
+ arc_adjust_needed = B_TRUE;
+ zthr_wakeup(arc_adjust_zthr);
+
+ /*
+ * It is unsafe to block here in arbitrary threads, because we can come
+ * here from ARC itself and may hold ARC locks and thus risk a deadlock
+ * with the ARC reclaim thread.
+ */
+ if (curproc == pageproc)
+ (void) cv_wait(&arc_adjust_waiters_cv, &arc_adjust_lock);
+ mutex_exit(&arc_adjust_lock);
+}
+#endif
+
+static void
+arc_state_init(void)
+{
+ arc_anon = &ARC_anon;
+ arc_mru = &ARC_mru;
+ arc_mru_ghost = &ARC_mru_ghost;
+ arc_mfu = &ARC_mfu;
+ arc_mfu_ghost = &ARC_mfu_ghost;
+ arc_l2c_only = &ARC_l2c_only;
+
+ arc_mru->arcs_list[ARC_BUFC_METADATA] =
+ multilist_create(sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+ arc_state_multilist_index_func);
+ arc_mru->arcs_list[ARC_BUFC_DATA] =
+ multilist_create(sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+ arc_state_multilist_index_func);
+ arc_mru_ghost->arcs_list[ARC_BUFC_METADATA] =
+ multilist_create(sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+ arc_state_multilist_index_func);
+ arc_mru_ghost->arcs_list[ARC_BUFC_DATA] =
+ multilist_create(sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+ arc_state_multilist_index_func);
+ arc_mfu->arcs_list[ARC_BUFC_METADATA] =
+ multilist_create(sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+ arc_state_multilist_index_func);
+ arc_mfu->arcs_list[ARC_BUFC_DATA] =
+ multilist_create(sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+ arc_state_multilist_index_func);
+ arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA] =
+ multilist_create(sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+ arc_state_multilist_index_func);
+ arc_mfu_ghost->arcs_list[ARC_BUFC_DATA] =
+ multilist_create(sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+ arc_state_multilist_index_func);
+ arc_l2c_only->arcs_list[ARC_BUFC_METADATA] =
+ multilist_create(sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+ arc_state_multilist_index_func);
+ arc_l2c_only->arcs_list[ARC_BUFC_DATA] =
+ multilist_create(sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+ arc_state_multilist_index_func);
+
+ refcount_create(&arc_anon->arcs_esize[ARC_BUFC_METADATA]);
+
refcount_create(&arc_anon->arcs_esize[ARC_BUFC_DATA]); + refcount_create(&arc_mru->arcs_esize[ARC_BUFC_METADATA]); + refcount_create(&arc_mru->arcs_esize[ARC_BUFC_DATA]); + refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]); + refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]); + refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]); + refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_DATA]); + refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]); + refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]); + refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]); + refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]); + + refcount_create(&arc_anon->arcs_size); + refcount_create(&arc_mru->arcs_size); + refcount_create(&arc_mru_ghost->arcs_size); + refcount_create(&arc_mfu->arcs_size); + refcount_create(&arc_mfu_ghost->arcs_size); + refcount_create(&arc_l2c_only->arcs_size); + + aggsum_init(&arc_meta_used, 0); + aggsum_init(&arc_size, 0); + aggsum_init(&astat_data_size, 0); + aggsum_init(&astat_metadata_size, 0); + aggsum_init(&astat_hdr_size, 0); + aggsum_init(&astat_bonus_size, 0); + aggsum_init(&astat_dnode_size, 0); + aggsum_init(&astat_dbuf_size, 0); + aggsum_init(&astat_l2_hdr_size, 0); +} + +static void +arc_state_fini(void) +{ + refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); + refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_DATA]); + refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_METADATA]); + refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_DATA]); + refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]); + refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]); + refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]); + refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_DATA]); + refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]); + refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]); + refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]); + refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]); + + refcount_destroy(&arc_anon->arcs_size); + refcount_destroy(&arc_mru->arcs_size); + refcount_destroy(&arc_mru_ghost->arcs_size); + refcount_destroy(&arc_mfu->arcs_size); + refcount_destroy(&arc_mfu_ghost->arcs_size); + refcount_destroy(&arc_l2c_only->arcs_size); + + multilist_destroy(arc_mru->arcs_list[ARC_BUFC_METADATA]); + multilist_destroy(arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); + multilist_destroy(arc_mfu->arcs_list[ARC_BUFC_METADATA]); + multilist_destroy(arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); + multilist_destroy(arc_mru->arcs_list[ARC_BUFC_DATA]); + multilist_destroy(arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); + multilist_destroy(arc_mfu->arcs_list[ARC_BUFC_DATA]); + multilist_destroy(arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); +} + +uint64_t +arc_max_bytes(void) +{ + return (arc_c_max); +} + +void +arc_init(void) +{ + int i, prefetch_tunable_set = 0; + + /* + * allmem is "all memory that we could possibly use". 
+ */ +#ifdef illumos +#ifdef _KERNEL + uint64_t allmem = ptob(physmem - swapfs_minfree); +#else + uint64_t allmem = (physmem * PAGESIZE) / 2; +#endif +#else + uint64_t allmem = kmem_size(); +#endif + mutex_init(&arc_adjust_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&arc_adjust_waiters_cv, NULL, CV_DEFAULT, NULL); + + mutex_init(&arc_dnlc_evicts_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&arc_dnlc_evicts_cv, NULL, CV_DEFAULT, NULL); + + /* set min cache to 1/32 of all memory, or arc_abs_min, whichever is more */ + arc_c_min = MAX(allmem / 32, arc_abs_min); + /* set max to 5/8 of all memory, or all but 1GB, whichever is more */ + if (allmem >= 1 << 30) + arc_c_max = allmem - (1 << 30); + else + arc_c_max = arc_c_min; + arc_c_max = MAX(allmem * 5 / 8, arc_c_max); + + /* + * In userland, there's only the memory pressure that we artificially + * create (see arc_available_memory()). Don't let arc_c get too + * small, because it can cause transactions to be larger than + * arc_c, causing arc_tempreserve_space() to fail. + */ +#ifndef _KERNEL + arc_c_min = arc_c_max / 2; +#endif + +#ifdef _KERNEL + /* + * Allow the tunables to override our calculations if they are + * reasonable. + */ + if (zfs_arc_max > arc_abs_min && zfs_arc_max < allmem) { + arc_c_max = zfs_arc_max; + arc_c_min = MIN(arc_c_min, arc_c_max); + } + if (zfs_arc_min > arc_abs_min && zfs_arc_min <= arc_c_max) + arc_c_min = zfs_arc_min; +#endif + + arc_c = arc_c_max; + arc_p = (arc_c >> 1); + + /* limit meta-data to 1/4 of the arc capacity */ + arc_meta_limit = arc_c_max / 4; + +#ifdef _KERNEL + /* + * Metadata is stored in the kernel's heap. Don't let us + * use more than half the heap for the ARC. + */ +#ifdef __FreeBSD__ + arc_meta_limit = MIN(arc_meta_limit, uma_limit() / 2); + arc_dnode_limit = arc_meta_limit / 10; +#else + arc_meta_limit = MIN(arc_meta_limit, + vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 2); +#endif +#endif + + /* Allow the tunable to override if it is reasonable */ + if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max) + arc_meta_limit = zfs_arc_meta_limit; + + if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0) + arc_c_min = arc_meta_limit / 2; + + if (zfs_arc_meta_min > 0) { + arc_meta_min = zfs_arc_meta_min; + } else { + arc_meta_min = arc_c_min / 2; + } + + /* Valid range: <arc_meta_min> - <arc_c_max> */ + if ((zfs_arc_dnode_limit) && (zfs_arc_dnode_limit != arc_dnode_limit) && + (zfs_arc_dnode_limit >= zfs_arc_meta_min) && + (zfs_arc_dnode_limit <= arc_c_max)) + arc_dnode_limit = zfs_arc_dnode_limit; + + if (zfs_arc_grow_retry > 0) + arc_grow_retry = zfs_arc_grow_retry; + + if (zfs_arc_shrink_shift > 0) + arc_shrink_shift = zfs_arc_shrink_shift; + + if (zfs_arc_no_grow_shift > 0) + arc_no_grow_shift = zfs_arc_no_grow_shift; + /* + * Ensure that arc_no_grow_shift is less than arc_shrink_shift. + */ + if (arc_no_grow_shift >= arc_shrink_shift) + arc_no_grow_shift = arc_shrink_shift - 1; + + if (zfs_arc_p_min_shift > 0) + arc_p_min_shift = zfs_arc_p_min_shift; + + /* if kmem_flags are set, lets try to use less memory */ + if (kmem_debugging()) + arc_c = arc_c / 2; + if (arc_c < arc_c_min) + arc_c = arc_c_min; + + zfs_arc_min = arc_c_min; + zfs_arc_max = arc_c_max; + + arc_state_init(); + + /* + * The arc must be "uninitialized", so that hdr_recl() (which is + * registered by buf_init()) will not access arc_reap_zthr before + * it is created. 
+ */
+ ASSERT(!arc_initialized);
+ buf_init();
+
+ list_create(&arc_prune_list, sizeof (arc_prune_t),
+ offsetof(arc_prune_t, p_node));
+ mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL);
+
+ arc_prune_taskq = taskq_create("arc_prune", max_ncpus, minclsyspri,
+ max_ncpus, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
+
+ arc_dnlc_evicts_thread_exit = FALSE;
+
+ arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
+ sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
+
+ if (arc_ksp != NULL) {
+ arc_ksp->ks_data = &arc_stats;
+ arc_ksp->ks_update = arc_kstat_update;
+ kstat_install(arc_ksp);
+ }
+
+ arc_adjust_zthr = zthr_create_timer(arc_adjust_cb_check,
+ arc_adjust_cb, NULL, SEC2NSEC(1));
+ arc_reap_zthr = zthr_create_timer(arc_reap_cb_check,
+ arc_reap_cb, NULL, SEC2NSEC(1));
+
+#ifdef _KERNEL
+ arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL,
+ EVENTHANDLER_PRI_FIRST);
+#endif
+
+ (void) thread_create(NULL, 0, arc_dnlc_evicts_thread, NULL, 0, &p0,
+ TS_RUN, minclsyspri);
+
+ arc_initialized = B_TRUE;
+ arc_warm = B_FALSE;
+
+ /*
+ * Calculate maximum amount of dirty data per pool.
+ *
+ * If it has been set by /etc/system, take that.
+ * Otherwise, use a percentage of physical memory defined by
+ * zfs_dirty_data_max_percent (default 10%) with a cap at
+ * zfs_dirty_data_max_max (default 4GB).
+ */
+ if (zfs_dirty_data_max == 0) {
+ zfs_dirty_data_max = ptob(physmem) *
+ zfs_dirty_data_max_percent / 100;
+ zfs_dirty_data_max = MIN(zfs_dirty_data_max,
+ zfs_dirty_data_max_max);
+ }
+
+#ifdef _KERNEL
+ if (TUNABLE_INT_FETCH("vfs.zfs.prefetch_disable", &zfs_prefetch_disable))
+ prefetch_tunable_set = 1;
+
+#ifdef __i386__
+ if (prefetch_tunable_set == 0) {
+ printf("ZFS NOTICE: Prefetch is disabled by default on i386 "
+ "-- to enable,\n");
+ printf(" add \"vfs.zfs.prefetch_disable=0\" "
+ "to /boot/loader.conf.\n");
+ zfs_prefetch_disable = 1;
+ }
+#else
+ if ((((uint64_t)physmem * PAGESIZE) < (1ULL << 32)) &&
+ prefetch_tunable_set == 0) {
+ printf("ZFS NOTICE: Prefetch is disabled by default if less "
+ "than 4GB of RAM is present;\n"
+ " to enable, add \"vfs.zfs.prefetch_disable=0\" "
+ "to /boot/loader.conf.\n");
+ zfs_prefetch_disable = 1;
+ }
+#endif
+ /* Warn about ZFS memory and address space requirements. */
+ if (((uint64_t)physmem * PAGESIZE) < (256 + 128 + 64) * (1 << 20)) {
+ printf("ZFS WARNING: Recommended minimum RAM size is 512MB; "
+ "expect unstable behavior.\n");
+ }
+ if (allmem < 512 * (1 << 20)) {
+ printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; "
+ "expect unstable behavior.\n");
+ printf(" Consider tuning vm.kmem_size and "
+ "vm.kmem_size_max\n");
+ printf(" in /boot/loader.conf.\n");
+ }
+#endif
+}
+
+void
+arc_fini(void)
+{
+ arc_prune_t *p;
+
+#ifdef _KERNEL
+ if (arc_event_lowmem != NULL)
+ EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem);
+#endif
+
+ /* Use B_TRUE to ensure *all* buffers are evicted */
+ arc_flush(NULL, B_TRUE);
+
+ mutex_enter(&arc_dnlc_evicts_lock);
+ arc_dnlc_evicts_thread_exit = TRUE;
+ /*
+ * The dnlc evicts thread will set arc_dnlc_evicts_thread_exit
+ * to FALSE when it is finished exiting; we're waiting for that.
+ */ + while (arc_dnlc_evicts_thread_exit) { + cv_signal(&arc_dnlc_evicts_cv); + cv_wait(&arc_dnlc_evicts_cv, &arc_dnlc_evicts_lock); + } + mutex_exit(&arc_dnlc_evicts_lock); + + arc_initialized = B_FALSE; + + if (arc_ksp != NULL) { + kstat_delete(arc_ksp); + arc_ksp = NULL; + } + + taskq_wait(arc_prune_taskq); + taskq_destroy(arc_prune_taskq); + + mutex_enter(&arc_prune_mtx); + while ((p = list_head(&arc_prune_list)) != NULL) { + list_remove(&arc_prune_list, p); + refcount_remove(&p->p_refcnt, &arc_prune_list); + refcount_destroy(&p->p_refcnt); + kmem_free(p, sizeof (*p)); + } + mutex_exit(&arc_prune_mtx); + + list_destroy(&arc_prune_list); + mutex_destroy(&arc_prune_mtx); + + (void) zthr_cancel(arc_adjust_zthr); + zthr_destroy(arc_adjust_zthr); + + mutex_destroy(&arc_dnlc_evicts_lock); + cv_destroy(&arc_dnlc_evicts_cv); + + (void) zthr_cancel(arc_reap_zthr); + zthr_destroy(arc_reap_zthr); + + mutex_destroy(&arc_adjust_lock); + cv_destroy(&arc_adjust_waiters_cv); + + arc_state_fini(); + buf_fini(); + + ASSERT0(arc_loaned_bytes); +} + +/* + * Level 2 ARC + * + * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk. + * It uses dedicated storage devices to hold cached data, which are populated + * using large infrequent writes. The main role of this cache is to boost + * the performance of random read workloads. The intended L2ARC devices + * include short-stroked disks, solid state disks, and other media with + * substantially faster read latency than disk. + * + * +-----------------------+ + * | ARC | + * +-----------------------+ + * | ^ ^ + * | | | + * l2arc_feed_thread() arc_read() + * | | | + * | l2arc read | + * V | | + * +---------------+ | + * | L2ARC | | + * +---------------+ | + * | ^ | + * l2arc_write() | | + * | | | + * V | | + * +-------+ +-------+ + * | vdev | | vdev | + * | cache | | cache | + * +-------+ +-------+ + * +=========+ .-----. + * : L2ARC : |-_____-| + * : devices : | Disks | + * +=========+ `-_____-' + * + * Read requests are satisfied from the following sources, in order: + * + * 1) ARC + * 2) vdev cache of L2ARC devices + * 3) L2ARC devices + * 4) vdev cache of disks + * 5) disks + * + * Some L2ARC device types exhibit extremely slow write performance. + * To accommodate for this there are some significant differences between + * the L2ARC and traditional cache design: + * + * 1. There is no eviction path from the ARC to the L2ARC. Evictions from + * the ARC behave as usual, freeing buffers and placing headers on ghost + * lists. The ARC does not send buffers to the L2ARC during eviction as + * this would add inflated write latencies for all ARC memory pressure. + * + * 2. The L2ARC attempts to cache data from the ARC before it is evicted. + * It does this by periodically scanning buffers from the eviction-end of + * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are + * not already there. It scans until a headroom of buffers is satisfied, + * which itself is a buffer for ARC eviction. If a compressible buffer is + * found during scanning and selected for writing to an L2ARC device, we + * temporarily boost scanning headroom during the next scan cycle to make + * sure we adapt to compression effects (which might significantly reduce + * the data volume we write to L2ARC). 
The thread that does this is + * l2arc_feed_thread(), illustrated below; example sizes are included to + * provide a better sense of ratio than this diagram: + * + * head --> tail + * +---------------------+----------+ + * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC + * +---------------------+----------+ | o L2ARC eligible + * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer + * +---------------------+----------+ | + * 15.9 Gbytes ^ 32 Mbytes | + * headroom | + * l2arc_feed_thread() + * | + * l2arc write hand <--[oooo]--' + * | 8 Mbyte + * | write max + * V + * +==============================+ + * L2ARC dev |####|#|###|###| |####| ... | + * +==============================+ + * 32 Gbytes + * + * 3. If an ARC buffer is copied to the L2ARC but then hit instead of + * evicted, then the L2ARC has cached a buffer much sooner than it probably + * needed to, potentially wasting L2ARC device bandwidth and storage. It is + * safe to say that this is an uncommon case, since buffers at the end of + * the ARC lists have moved there due to inactivity. + * + * 4. If the ARC evicts faster than the L2ARC can maintain a headroom, + * then the L2ARC simply misses copying some buffers. This serves as a + * pressure valve to prevent heavy read workloads from both stalling the ARC + * with waits and clogging the L2ARC with writes. This also helps prevent + * the potential for the L2ARC to churn if it attempts to cache content too + * quickly, such as during backups of the entire pool. + * + * 5. After system boot and before the ARC has filled main memory, there are + * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru + * lists can remain mostly static. Instead of searching from tail of these + * lists as pictured, the l2arc_feed_thread() will search from the list heads + * for eligible buffers, greatly increasing its chance of finding them. + * + * The L2ARC device write speed is also boosted during this time so that + * the L2ARC warms up faster. Since there have been no ARC evictions yet, + * there are no L2ARC reads, and no fear of degrading read performance + * through increased writes. + * + * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that + * the vdev queue can aggregate them into larger and fewer writes. Each + * device is written to in a rotor fashion, sweeping writes through + * available space then repeating. + * + * 7. The L2ARC does not store dirty content. It never needs to flush + * write buffers back to disk based storage. + * + * 8. If an ARC buffer is written (and dirtied) which also exists in the + * L2ARC, the now stale L2ARC buffer is immediately dropped. + * + * The performance of the L2ARC can be tweaked by a number of tunables, which + * may be necessary for different workloads: + * + * l2arc_write_max max write bytes per interval + * l2arc_write_boost extra write bytes during device warmup + * l2arc_noprefetch skip caching prefetched buffers + * l2arc_headroom number of max device writes to precache + * l2arc_headroom_boost when we find compressed buffers during ARC + * scanning, we multiply headroom by this + * percentage factor for the next scan cycle, + * since more compressed buffers are likely to + * be present + * l2arc_feed_secs seconds between L2ARC writing + * + * Tunables may be removed or added as future performance improvements are + * integrated, and also may become zpool properties. 
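+ *
+ * For instance, assuming the compiled-in defaults (l2arc_write_max of
+ * 8MB, l2arc_headroom of 2, l2arc_headroom_boost of 200 and
+ * l2arc_feed_secs of 1), a feed cycle runs roughly once per second,
+ * writes at most 8MB (16MB while the ARC is still warming up, via
+ * l2arc_write_boost) and scans a headroom of twice the write size
+ * ahead of its starting point -- doubled again when compressed ARC
+ * is enabled.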
+ * + * There are three key functions that control how the L2ARC warms up: + * + * l2arc_write_eligible() check if a buffer is eligible to cache + * l2arc_write_size() calculate how much to write + * l2arc_write_interval() calculate sleep delay between writes + * + * These three functions determine what to write, how much, and how quickly + * to send writes. + */ + +static boolean_t +l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr) +{ + /* + * A buffer is *not* eligible for the L2ARC if it: + * 1. belongs to a different spa. + * 2. is already cached on the L2ARC. + * 3. has an I/O in progress (it may be an incomplete read). + * 4. is flagged not eligible (zfs property). + */ + if (hdr->b_spa != spa_guid) { + ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch); + return (B_FALSE); + } + if (HDR_HAS_L2HDR(hdr)) { + ARCSTAT_BUMP(arcstat_l2_write_in_l2); + return (B_FALSE); + } + if (HDR_IO_IN_PROGRESS(hdr)) { + ARCSTAT_BUMP(arcstat_l2_write_hdr_io_in_progress); + return (B_FALSE); + } + if (!HDR_L2CACHE(hdr)) { + ARCSTAT_BUMP(arcstat_l2_write_not_cacheable); + return (B_FALSE); + } + + return (B_TRUE); +} + +static uint64_t +l2arc_write_size(void) +{ + uint64_t size; + + /* + * Make sure our globals have meaningful values in case the user + * altered them. + */ + size = l2arc_write_max; + if (size == 0) { + cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must " + "be greater than zero, resetting it to the default (%d)", + L2ARC_WRITE_SIZE); + size = l2arc_write_max = L2ARC_WRITE_SIZE; + } + + if (arc_warm == B_FALSE) + size += l2arc_write_boost; + + return (size); + +} + +static clock_t +l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote) +{ + clock_t interval, next, now; + + /* + * If the ARC lists are busy, increase our write rate; if the + * lists are stale, idle back. This is achieved by checking + * how much we previously wrote - if it was more than half of + * what we wanted, schedule the next write much sooner. + */ + if (l2arc_feed_again && wrote > (wanted / 2)) + interval = (hz * l2arc_feed_min_ms) / 1000; + else + interval = hz * l2arc_feed_secs; + + now = ddi_get_lbolt(); + next = MAX(now, MIN(now + interval, began + interval)); + + return (next); +} + +/* + * Cycle through L2ARC devices. This is how L2ARC load balances. + * If a device is returned, this also returns holding the spa config lock. + */ +static l2arc_dev_t * +l2arc_dev_get_next(void) +{ + l2arc_dev_t *first, *next = NULL; + + /* + * Lock out the removal of spas (spa_namespace_lock), then removal + * of cache devices (l2arc_dev_mtx). Once a device has been selected, + * both locks will be dropped and a spa config lock held instead. 
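+ *
+ * Spelled out, the locking sequence below is:
+ *
+ *	1. mutex_enter(&spa_namespace_lock)	(no spa can be removed)
+ *	2. mutex_enter(&l2arc_dev_mtx)		(no cache device can be removed)
+ *	3. advance to the next non-faulted cache device
+ *	4. mutex_exit(&l2arc_dev_mtx)
+ *	5. spa_config_enter(spa, SCL_L2ARC, dev, RW_READER)
+ *	6. mutex_exit(&spa_namespace_lock)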
+ */ + mutex_enter(&spa_namespace_lock); + mutex_enter(&l2arc_dev_mtx); + + /* if there are no vdevs, there is nothing to do */ + if (l2arc_ndev == 0) + goto out; + + first = NULL; + next = l2arc_dev_last; + do { + /* loop around the list looking for a non-faulted vdev */ + if (next == NULL) { + next = list_head(l2arc_dev_list); + } else { + next = list_next(l2arc_dev_list, next); + if (next == NULL) + next = list_head(l2arc_dev_list); + } + + /* if we have come back to the start, bail out */ + if (first == NULL) + first = next; + else if (next == first) + break; + + } while (vdev_is_dead(next->l2ad_vdev)); + + /* if we were unable to find any usable vdevs, return NULL */ + if (vdev_is_dead(next->l2ad_vdev)) + next = NULL; + + l2arc_dev_last = next; + +out: + mutex_exit(&l2arc_dev_mtx); + + /* + * Grab the config lock to prevent the 'next' device from being + * removed while we are writing to it. + */ + if (next != NULL) + spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER); + mutex_exit(&spa_namespace_lock); + + return (next); +} + +/* + * Free buffers that were tagged for destruction. + */ +static void +l2arc_do_free_on_write() +{ + list_t *buflist; + l2arc_data_free_t *df, *df_prev; + + mutex_enter(&l2arc_free_on_write_mtx); + buflist = l2arc_free_on_write; + + for (df = list_tail(buflist); df; df = df_prev) { + df_prev = list_prev(buflist, df); + ASSERT3P(df->l2df_abd, !=, NULL); + abd_free(df->l2df_abd); + list_remove(buflist, df); + kmem_free(df, sizeof (l2arc_data_free_t)); + } + + mutex_exit(&l2arc_free_on_write_mtx); +} + +/* + * A write to a cache device has completed. Update all headers to allow + * reads from these buffers to begin. + */ +static void +l2arc_write_done(zio_t *zio) +{ + l2arc_write_callback_t *cb; + l2arc_dev_t *dev; + list_t *buflist; + arc_buf_hdr_t *head, *hdr, *hdr_prev; + kmutex_t *hash_lock; + int64_t bytes_dropped = 0; + + cb = zio->io_private; + ASSERT3P(cb, !=, NULL); + dev = cb->l2wcb_dev; + ASSERT3P(dev, !=, NULL); + head = cb->l2wcb_head; + ASSERT3P(head, !=, NULL); + buflist = &dev->l2ad_buflist; + ASSERT3P(buflist, !=, NULL); + DTRACE_PROBE2(l2arc__iodone, zio_t *, zio, + l2arc_write_callback_t *, cb); + + if (zio->io_error != 0) + ARCSTAT_BUMP(arcstat_l2_writes_error); + + /* + * All writes completed, or an error was hit. + */ +top: + mutex_enter(&dev->l2ad_mtx); + for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) { + hdr_prev = list_prev(buflist, hdr); + + hash_lock = HDR_LOCK(hdr); + + /* + * We cannot use mutex_enter or else we can deadlock + * with l2arc_write_buffers (due to swapping the order + * the hash lock and l2ad_mtx are taken). + */ + if (!mutex_tryenter(hash_lock)) { + /* + * Missed the hash lock. We must retry so we + * don't leave the ARC_FLAG_L2_WRITING bit set. + */ + ARCSTAT_BUMP(arcstat_l2_writes_lock_retry); + + /* + * We don't want to rescan the headers we've + * already marked as having been written out, so + * we reinsert the head node so we can pick up + * where we left off. + */ + list_remove(buflist, head); + list_insert_after(buflist, hdr, head); + + mutex_exit(&dev->l2ad_mtx); + + /* + * We wait for the hash lock to become available + * to try and prevent busy waiting, and increase + * the chance we'll be able to acquire the lock + * the next time around. + */ + mutex_enter(hash_lock); + mutex_exit(hash_lock); + goto top; + } + + /* + * We could not have been moved into the arc_l2c_only + * state while in-flight due to our ARC_FLAG_L2_WRITING + * bit being set. Let's just ensure that's being enforced. 
+ */ + ASSERT(HDR_HAS_L1HDR(hdr)); + + if (zio->io_error != 0) { + /* + * Error - drop L2ARC entry. + */ + list_remove(buflist, hdr); + l2arc_trim(hdr); + arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR); + + ARCSTAT_INCR(arcstat_l2_psize, -arc_hdr_size(hdr)); + ARCSTAT_INCR(arcstat_l2_lsize, -HDR_GET_LSIZE(hdr)); + + bytes_dropped += arc_hdr_size(hdr); + (void) refcount_remove_many(&dev->l2ad_alloc, + arc_hdr_size(hdr), hdr); + } + + /* + * Allow ARC to begin reads and ghost list evictions to + * this L2ARC entry. + */ + arc_hdr_clear_flags(hdr, ARC_FLAG_L2_WRITING); + + mutex_exit(hash_lock); + } + + atomic_inc_64(&l2arc_writes_done); + list_remove(buflist, head); + ASSERT(!HDR_HAS_L1HDR(head)); + kmem_cache_free(hdr_l2only_cache, head); + mutex_exit(&dev->l2ad_mtx); + + vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0); + + l2arc_do_free_on_write(); + + kmem_free(cb, sizeof (l2arc_write_callback_t)); +} + +/* + * A read to a cache device completed. Validate buffer contents before + * handing over to the regular ARC routines. + */ +static void +l2arc_read_done(zio_t *zio) +{ + l2arc_read_callback_t *cb; + arc_buf_hdr_t *hdr; + kmutex_t *hash_lock; + boolean_t valid_cksum; + + ASSERT3P(zio->io_vd, !=, NULL); + ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE); + + spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd); + + cb = zio->io_private; + ASSERT3P(cb, !=, NULL); + hdr = cb->l2rcb_hdr; + ASSERT3P(hdr, !=, NULL); + + hash_lock = HDR_LOCK(hdr); + mutex_enter(hash_lock); + ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); + + /* + * If the data was read into a temporary buffer, + * move it and free the buffer. + */ + if (cb->l2rcb_abd != NULL) { + ASSERT3U(arc_hdr_size(hdr), <, zio->io_size); + if (zio->io_error == 0) { + abd_copy(hdr->b_l1hdr.b_pabd, cb->l2rcb_abd, + arc_hdr_size(hdr)); + } + + /* + * The following must be done regardless of whether + * there was an error: + * - free the temporary buffer + * - point zio to the real ARC buffer + * - set zio size accordingly + * These are required because zio is either re-used for + * an I/O of the block in the case of the error + * or the zio is passed to arc_read_done() and it + * needs real data. + */ + abd_free(cb->l2rcb_abd); + zio->io_size = zio->io_orig_size = arc_hdr_size(hdr); + zio->io_abd = zio->io_orig_abd = hdr->b_l1hdr.b_pabd; + } + + ASSERT3P(zio->io_abd, !=, NULL); + + /* + * Check this survived the L2ARC journey. + */ + ASSERT3P(zio->io_abd, ==, hdr->b_l1hdr.b_pabd); + zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ + zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ + + valid_cksum = arc_cksum_is_equal(hdr, zio); + if (valid_cksum && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) { + mutex_exit(hash_lock); + zio->io_private = hdr; + arc_read_done(zio); + } else { + mutex_exit(hash_lock); + /* + * Buffer didn't survive caching. Increment stats and + * reissue to the original storage device. + */ + if (zio->io_error != 0) { + ARCSTAT_BUMP(arcstat_l2_io_error); + } else { + zio->io_error = SET_ERROR(EIO); + } + if (!valid_cksum) + ARCSTAT_BUMP(arcstat_l2_cksum_bad); + + /* + * If there's no waiter, issue an async i/o to the primary + * storage now. If there *is* a waiter, the caller must + * issue the i/o in a context where it's OK to block. 
+ */
+ if (zio->io_waiter == NULL) {
+ zio_t *pio = zio_unique_parent(zio);
+
+ ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
+
+ zio_nowait(zio_read(pio, zio->io_spa, zio->io_bp,
+ hdr->b_l1hdr.b_pabd, zio->io_size, arc_read_done,
+ hdr, zio->io_priority, cb->l2rcb_flags,
+ &cb->l2rcb_zb));
+ }
+ }
+
+ kmem_free(cb, sizeof (l2arc_read_callback_t));
+}
+
+/*
+ * This is the list priority from which the L2ARC will search for pages to
+ * cache. This is used within loops (0..3) to cycle through lists in the
+ * desired order. This order can have a significant effect on cache
+ * performance.
+ *
+ * Currently the metadata lists are hit first, MFU then MRU, followed by
+ * the data lists. This function returns a locked list, and also returns
+ * the lock pointer.
+ */
+static multilist_sublist_t *
+l2arc_sublist_lock(int list_num)
+{
+ multilist_t *ml = NULL;
+ unsigned int idx;
+
+ ASSERT(list_num >= 0 && list_num <= 3);
+
+ switch (list_num) {
+ case 0:
+ ml = arc_mfu->arcs_list[ARC_BUFC_METADATA];
+ break;
+ case 1:
+ ml = arc_mru->arcs_list[ARC_BUFC_METADATA];
+ break;
+ case 2:
+ ml = arc_mfu->arcs_list[ARC_BUFC_DATA];
+ break;
+ case 3:
+ ml = arc_mru->arcs_list[ARC_BUFC_DATA];
+ break;
+ }
+
+ /*
+ * Return a randomly-selected sublist. This is acceptable
+ * because the caller feeds only a little bit of data for each
+ * call (8MB). Subsequent calls will result in different
+ * sublists being selected.
+ */
+ idx = multilist_get_random_index(ml);
+ return (multilist_sublist_lock(ml, idx));
+}
+
+/*
+ * Evict buffers from the device write hand to the distance specified in
+ * bytes. This distance may span populated buffers, or it may span nothing.
+ * This clears a region on the L2ARC device, making it ready for writing.
+ * If the 'all' boolean is set, every buffer is evicted.
+ */
+static void
+l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
+{
+ list_t *buflist;
+ arc_buf_hdr_t *hdr, *hdr_prev;
+ kmutex_t *hash_lock;
+ uint64_t taddr;
+
+ buflist = &dev->l2ad_buflist;
+
+ if (!all && dev->l2ad_first) {
+ /*
+ * This is the first sweep through the device. There is
+ * nothing to evict.
+ */
+ return;
+ }
+
+ if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
+ /*
+ * When nearing the end of the device, evict to the end
+ * before the device write hand jumps to the start.
+ */
+ taddr = dev->l2ad_end;
+ } else {
+ taddr = dev->l2ad_hand + distance;
+ }
+ DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
+ uint64_t, taddr, boolean_t, all);
+
+top:
+ mutex_enter(&dev->l2ad_mtx);
+ for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) {
+ hdr_prev = list_prev(buflist, hdr);
+
+ hash_lock = HDR_LOCK(hdr);
+
+ /*
+ * We cannot use mutex_enter or else we can deadlock
+ * with l2arc_write_buffers (due to swapping the order
+ * the hash lock and l2ad_mtx are taken).
+ */
+ if (!mutex_tryenter(hash_lock)) {
+ /*
+ * Missed the hash lock. Retry.
+ */
+ ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
+ mutex_exit(&dev->l2ad_mtx);
+ mutex_enter(hash_lock);
+ mutex_exit(hash_lock);
+ goto top;
+ }
+
+ /*
+ * A header can't be on this list if it doesn't have an
+ * L2 header.
+ */
+ ASSERT(HDR_HAS_L2HDR(hdr));
+
+ /* Ensure this header has finished being written. */
+ ASSERT(!HDR_L2_WRITING(hdr));
+ ASSERT(!HDR_L2_WRITE_HEAD(hdr));
+
+ if (!all && (hdr->b_l2hdr.b_daddr >= taddr ||
+ hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) {
+ /*
+ * We've evicted to the target address,
+ * or the end of the device.
+ */
+ mutex_exit(hash_lock);
+ break;
+ }
+
+ if (!HDR_HAS_L1HDR(hdr)) {
+ ASSERT(!HDR_L2_READING(hdr));
+ /*
+ * This doesn't exist in the ARC. Destroy.
+ * arc_hdr_destroy() will call list_remove()
+ * and decrement arcstat_l2_lsize.
+ */
+ arc_change_state(arc_anon, hdr, hash_lock);
+ arc_hdr_destroy(hdr);
+ } else {
+ ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only);
+ ARCSTAT_BUMP(arcstat_l2_evict_l1cached);
+ /*
+ * Invalidate issued or about to be issued
+ * reads, since we may be about to write
+ * over this location.
+ */
+ if (HDR_L2_READING(hdr)) {
+ ARCSTAT_BUMP(arcstat_l2_evict_reading);
+ arc_hdr_set_flags(hdr, ARC_FLAG_L2_EVICTED);
+ }
+
+ arc_hdr_l2hdr_destroy(hdr);
+ }
+ mutex_exit(hash_lock);
+ }
+ mutex_exit(&dev->l2ad_mtx);
+}
+
+/*
+ * Find and write ARC buffers to the L2ARC device.
+ *
+ * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid
+ * for reading until they have completed writing.
+ * The headroom scanned ahead of the write hand is multiplied by
+ * l2arc_headroom_boost when compressed ARC is in use, since compressed
+ * buffers allow more data to fit into each device write.
+ *
+ * Returns the number of bytes actually written (which may be smaller than
+ * the delta by which the device hand has changed due to alignment).
+ */
+static uint64_t
+l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
+{
+ arc_buf_hdr_t *hdr, *hdr_prev, *head;
+ uint64_t write_asize, write_psize, write_lsize, headroom;
+ boolean_t full;
+ l2arc_write_callback_t *cb;
+ zio_t *pio, *wzio;
+ uint64_t guid = spa_load_guid(spa);
+ int try;
+
+ ASSERT3P(dev->l2ad_vdev, !=, NULL);
+
+ pio = NULL;
+ write_lsize = write_asize = write_psize = 0;
+ full = B_FALSE;
+ head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE);
+ arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR);
+
+ ARCSTAT_BUMP(arcstat_l2_write_buffer_iter);
+ /*
+ * Copy buffers for L2ARC writing.
+ */
+ for (try = 0; try <= 3; try++) {
+ multilist_sublist_t *mls = l2arc_sublist_lock(try);
+ uint64_t passed_sz = 0;
+
+ ARCSTAT_BUMP(arcstat_l2_write_buffer_list_iter);
+
+ /*
+ * L2ARC fast warmup.
+ *
+ * Until the ARC is warm and starts to evict, read from the
+ * head of the ARC lists rather than the tail.
+ */
+ if (arc_warm == B_FALSE)
+ hdr = multilist_sublist_head(mls);
+ else
+ hdr = multilist_sublist_tail(mls);
+ if (hdr == NULL)
+ ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter);
+
+ headroom = target_sz * l2arc_headroom;
+ if (zfs_compressed_arc_enabled)
+ headroom = (headroom * l2arc_headroom_boost) / 100;
+
+ for (; hdr; hdr = hdr_prev) {
+ kmutex_t *hash_lock;
+
+ if (arc_warm == B_FALSE)
+ hdr_prev = multilist_sublist_next(mls, hdr);
+ else
+ hdr_prev = multilist_sublist_prev(mls, hdr);
+ ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned,
+ HDR_GET_LSIZE(hdr));
+
+ hash_lock = HDR_LOCK(hdr);
+ if (!mutex_tryenter(hash_lock)) {
+ ARCSTAT_BUMP(arcstat_l2_write_trylock_fail);
+ /*
+ * Skip this buffer rather than waiting.
+ */
+ continue;
+ }
+
+ passed_sz += HDR_GET_LSIZE(hdr);
+ if (passed_sz > headroom) {
+ /*
+ * Searched too far.
+ */
+ mutex_exit(hash_lock);
+ ARCSTAT_BUMP(arcstat_l2_write_passed_headroom);
+ break;
+ }
+
+ if (!l2arc_write_eligible(guid, hdr)) {
+ mutex_exit(hash_lock);
+ continue;
+ }
+
+ /*
+ * We rely on the L1 portion of the header below, so
+ * it's invalid for this header to have been evicted out
+ * of the ghost cache prior to being written out. The
+ * ARC_FLAG_L2_WRITING bit ensures this won't happen.
+ */ + ASSERT(HDR_HAS_L1HDR(hdr)); + + ASSERT3U(HDR_GET_PSIZE(hdr), >, 0); + ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); + ASSERT3U(arc_hdr_size(hdr), >, 0); + uint64_t psize = arc_hdr_size(hdr); + uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, + psize); + + if ((write_asize + asize) > target_sz) { + full = B_TRUE; + mutex_exit(hash_lock); + ARCSTAT_BUMP(arcstat_l2_write_full); + break; + } + + if (pio == NULL) { + /* + * Insert a dummy header on the buflist so + * l2arc_write_done() can find where the + * write buffers begin without searching. + */ + mutex_enter(&dev->l2ad_mtx); + list_insert_head(&dev->l2ad_buflist, head); + mutex_exit(&dev->l2ad_mtx); + + cb = kmem_alloc( + sizeof (l2arc_write_callback_t), KM_SLEEP); + cb->l2wcb_dev = dev; + cb->l2wcb_head = head; + pio = zio_root(spa, l2arc_write_done, cb, + ZIO_FLAG_CANFAIL); + ARCSTAT_BUMP(arcstat_l2_write_pios); + } + + hdr->b_l2hdr.b_dev = dev; + hdr->b_l2hdr.b_daddr = dev->l2ad_hand; + arc_hdr_set_flags(hdr, + ARC_FLAG_L2_WRITING | ARC_FLAG_HAS_L2HDR); + + mutex_enter(&dev->l2ad_mtx); + list_insert_head(&dev->l2ad_buflist, hdr); + mutex_exit(&dev->l2ad_mtx); + + (void) refcount_add_many(&dev->l2ad_alloc, psize, hdr); + + /* + * Normally the L2ARC can use the hdr's data, but if + * we're sharing data between the hdr and one of its + * bufs, L2ARC needs its own copy of the data so that + * the ZIO below can't race with the buf consumer. + * Another case where we need to create a copy of the + * data is when the buffer size is not device-aligned + * and we need to pad the block to make it such. + * That also keeps the clock hand suitably aligned. + * + * To ensure that the copy will be available for the + * lifetime of the ZIO and be cleaned up afterwards, we + * add it to the l2arc_free_on_write queue. + */ + abd_t *to_write; + if (!HDR_SHARED_DATA(hdr) && psize == asize) { + to_write = hdr->b_l1hdr.b_pabd; + } else { + to_write = abd_alloc_for_io(asize, + HDR_ISTYPE_METADATA(hdr)); + abd_copy(to_write, hdr->b_l1hdr.b_pabd, psize); + if (asize != psize) { + abd_zero_off(to_write, psize, + asize - psize); + } + l2arc_free_abd_on_write(to_write, asize, + arc_buf_type(hdr)); + } + wzio = zio_write_phys(pio, dev->l2ad_vdev, + hdr->b_l2hdr.b_daddr, asize, to_write, + ZIO_CHECKSUM_OFF, NULL, hdr, + ZIO_PRIORITY_ASYNC_WRITE, + ZIO_FLAG_CANFAIL, B_FALSE); + + write_lsize += HDR_GET_LSIZE(hdr); + DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, + zio_t *, wzio); + + write_psize += psize; + write_asize += asize; + dev->l2ad_hand += asize; + + mutex_exit(hash_lock); + + (void) zio_nowait(wzio); + } + + multilist_sublist_unlock(mls); + + if (full == B_TRUE) + break; + } + + /* No buffers selected for writing? */ + if (pio == NULL) { + ASSERT0(write_lsize); + ASSERT(!HDR_HAS_L1HDR(head)); + kmem_cache_free(hdr_l2only_cache, head); + return (0); + } + + ASSERT3U(write_psize, <=, target_sz); + ARCSTAT_BUMP(arcstat_l2_writes_sent); + ARCSTAT_INCR(arcstat_l2_write_bytes, write_psize); + ARCSTAT_INCR(arcstat_l2_lsize, write_lsize); + ARCSTAT_INCR(arcstat_l2_psize, write_psize); + vdev_space_update(dev->l2ad_vdev, write_psize, 0, 0); + + /* + * Bump device hand to the device start if it is approaching the end. + * l2arc_evict() will already have evicted ahead for this case. 
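+ *
+ * For example, with l2ad_end at 32GB and a target_sz of 8MB, a hand
+ * sitting anywhere past 32GB - 8MB can no longer fit a full write,
+ * so it wraps back to l2ad_start and l2ad_first is cleared.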
+ */ + if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) { + dev->l2ad_hand = dev->l2ad_start; + dev->l2ad_first = B_FALSE; + } + + dev->l2ad_writing = B_TRUE; + (void) zio_wait(pio); + dev->l2ad_writing = B_FALSE; + + return (write_asize); +} + +/* + * This thread feeds the L2ARC at regular intervals. This is the beating + * heart of the L2ARC. + */ +/* ARGSUSED */ +static void +l2arc_feed_thread(void *unused __unused) +{ + callb_cpr_t cpr; + l2arc_dev_t *dev; + spa_t *spa; + uint64_t size, wrote; + clock_t begin, next = ddi_get_lbolt(); + + CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG); + + mutex_enter(&l2arc_feed_thr_lock); + + while (l2arc_thread_exit == 0) { + CALLB_CPR_SAFE_BEGIN(&cpr); + (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock, + next - ddi_get_lbolt()); + CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock); + next = ddi_get_lbolt() + hz; + + /* + * Quick check for L2ARC devices. + */ + mutex_enter(&l2arc_dev_mtx); + if (l2arc_ndev == 0) { + mutex_exit(&l2arc_dev_mtx); + continue; + } + mutex_exit(&l2arc_dev_mtx); + begin = ddi_get_lbolt(); + + /* + * This selects the next l2arc device to write to, and in + * doing so the next spa to feed from: dev->l2ad_spa. This + * will return NULL if there are now no l2arc devices or if + * they are all faulted. + * + * If a device is returned, its spa's config lock is also + * held to prevent device removal. l2arc_dev_get_next() + * will grab and release l2arc_dev_mtx. + */ + if ((dev = l2arc_dev_get_next()) == NULL) + continue; + + spa = dev->l2ad_spa; + ASSERT3P(spa, !=, NULL); + + /* + * If the pool is read-only then force the feed thread to + * sleep a little longer. + */ + if (!spa_writeable(spa)) { + next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz; + spa_config_exit(spa, SCL_L2ARC, dev); + continue; + } + + /* + * Avoid contributing to memory pressure. + */ + if (arc_reclaim_needed()) { + ARCSTAT_BUMP(arcstat_l2_abort_lowmem); + spa_config_exit(spa, SCL_L2ARC, dev); + continue; + } + + ARCSTAT_BUMP(arcstat_l2_feeds); + + size = l2arc_write_size(); + + /* + * Evict L2ARC buffers that will be overwritten. + */ + l2arc_evict(dev, size, B_FALSE); + + /* + * Write ARC buffers. + */ + wrote = l2arc_write_buffers(spa, dev, size); + + /* + * Calculate interval between writes. + */ + next = l2arc_write_interval(begin, size, wrote); + spa_config_exit(spa, SCL_L2ARC, dev); + } + + l2arc_thread_exit = 0; + cv_broadcast(&l2arc_feed_thr_cv); + CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */ + thread_exit(); +} + +boolean_t +l2arc_vdev_present(vdev_t *vd) +{ + l2arc_dev_t *dev; + + mutex_enter(&l2arc_dev_mtx); + for (dev = list_head(l2arc_dev_list); dev != NULL; + dev = list_next(l2arc_dev_list, dev)) { + if (dev->l2ad_vdev == vd) + break; + } + mutex_exit(&l2arc_dev_mtx); + + return (dev != NULL); +} + +/* + * Add a vdev for use by the L2ARC. By this point the spa has already + * validated the vdev and opened it. + */ +void +l2arc_add_vdev(spa_t *spa, vdev_t *vd) +{ + l2arc_dev_t *adddev; + + ASSERT(!l2arc_vdev_present(vd)); + + vdev_ashift_optimize(vd); + + /* + * Create a new l2arc device entry. 
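+ * The usable range starts at VDEV_LABEL_START_SIZE so that the write
+ * hand never touches the leading vdev labels and boot region.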
+ */ + adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP); + adddev->l2ad_spa = spa; + adddev->l2ad_vdev = vd; + adddev->l2ad_start = VDEV_LABEL_START_SIZE; + adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd); + adddev->l2ad_hand = adddev->l2ad_start; + adddev->l2ad_first = B_TRUE; + adddev->l2ad_writing = B_FALSE; + + mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL); + /* + * This is a list of all ARC buffers that are still valid on the + * device. + */ + list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node)); + + vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand); + refcount_create(&adddev->l2ad_alloc); + + /* + * Add device to global list + */ + mutex_enter(&l2arc_dev_mtx); + list_insert_head(l2arc_dev_list, adddev); + atomic_inc_64(&l2arc_ndev); + mutex_exit(&l2arc_dev_mtx); +} + +/* + * Remove a vdev from the L2ARC. + */ +void +l2arc_remove_vdev(vdev_t *vd) +{ + l2arc_dev_t *dev, *nextdev, *remdev = NULL; + + /* + * Find the device by vdev + */ + mutex_enter(&l2arc_dev_mtx); + for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) { + nextdev = list_next(l2arc_dev_list, dev); + if (vd == dev->l2ad_vdev) { + remdev = dev; + break; + } + } + ASSERT3P(remdev, !=, NULL); + + /* + * Remove device from global list + */ + list_remove(l2arc_dev_list, remdev); + l2arc_dev_last = NULL; /* may have been invalidated */ + atomic_dec_64(&l2arc_ndev); + mutex_exit(&l2arc_dev_mtx); + + /* + * Clear all buflists and ARC references. L2ARC device flush. + */ + l2arc_evict(remdev, 0, B_TRUE); + list_destroy(&remdev->l2ad_buflist); + mutex_destroy(&remdev->l2ad_mtx); + refcount_destroy(&remdev->l2ad_alloc); + kmem_free(remdev, sizeof (l2arc_dev_t)); +} + +void +l2arc_init(void) +{ + l2arc_thread_exit = 0; + l2arc_ndev = 0; + l2arc_writes_sent = 0; + l2arc_writes_done = 0; + + mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL); + mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL); + + l2arc_dev_list = &L2ARC_dev_list; + l2arc_free_on_write = &L2ARC_free_on_write; + list_create(l2arc_dev_list, sizeof (l2arc_dev_t), + offsetof(l2arc_dev_t, l2ad_node)); + list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t), + offsetof(l2arc_data_free_t, l2df_list_node)); +} + +void +l2arc_fini(void) +{ + /* + * This is called from dmu_fini(), which is called from spa_fini(); + * Because of this, we can assume that all l2arc devices have + * already been removed when the pools themselves were removed. 
+ */ + + l2arc_do_free_on_write(); + + mutex_destroy(&l2arc_feed_thr_lock); + cv_destroy(&l2arc_feed_thr_cv); + mutex_destroy(&l2arc_dev_mtx); + mutex_destroy(&l2arc_free_on_write_mtx); + + list_destroy(l2arc_dev_list); + list_destroy(l2arc_free_on_write); +} + +void +l2arc_start(void) +{ + if (!(spa_mode_global & FWRITE)) + return; + + (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0, + TS_RUN, minclsyspri); +} + +void +l2arc_stop(void) +{ + if (!(spa_mode_global & FWRITE)) + return; + + mutex_enter(&l2arc_feed_thr_lock); + cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */ + l2arc_thread_exit = 1; + while (l2arc_thread_exit != 0) + cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock); + mutex_exit(&l2arc_feed_thr_lock); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/blkptr.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/blkptr.c new file mode 100644 index 000000000000..d7a7fdb0e1b1 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/blkptr.c @@ -0,0 +1,152 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2013, 2016 by Delphix. All rights reserved. + */ + +#include <sys/zfs_context.h> +#include <sys/zio.h> +#include <sys/zio_compress.h> + +/* + * Embedded-data Block Pointers + * + * Normally, block pointers point (via their DVAs) to a block which holds data. + * If the data that we need to store is very small, this is an inefficient + * use of space, because a block must be at minimum 1 sector (typically 512 + * bytes or 4KB). Additionally, reading these small blocks tends to generate + * more random reads. + * + * Embedded-data Block Pointers allow small pieces of data (the "payload", + * up to 112 bytes) to be stored in the block pointer itself, instead of + * being pointed to. The "Pointer" part of this name is a bit of a + * misnomer, as nothing is pointed to. + * + * BP_EMBEDDED_TYPE_DATA block pointers allow highly-compressible data to + * be embedded in the block pointer. The logic for this is handled in + * the SPA, by the zio pipeline. Therefore most code outside the zio + * pipeline doesn't need special-cases to handle these block pointers. + * + * See spa.h for details on the exact layout of embedded block pointers. + */ + +void +encode_embedded_bp_compressed(blkptr_t *bp, void *data, + enum zio_compress comp, int uncompressed_size, int compressed_size) +{ + uint64_t *bp64 = (uint64_t *)bp; + uint64_t w = 0; + uint8_t *data8 = data; + + ASSERT3U(compressed_size, <=, BPE_PAYLOAD_SIZE); + ASSERT(uncompressed_size == compressed_size || + comp != ZIO_COMPRESS_OFF); + ASSERT3U(comp, >=, ZIO_COMPRESS_OFF); + ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS); + + bzero(bp, sizeof (*bp)); + BP_SET_EMBEDDED(bp, B_TRUE); + BP_SET_COMPRESS(bp, comp); + BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); + BPE_SET_LSIZE(bp, uncompressed_size); + BPE_SET_PSIZE(bp, compressed_size); + + /* + * Encode the byte array into the words of the block pointer. + * First byte goes into low bits of first word (little endian). 
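+ * For example, payload byte i is placed at bit offset (i % 8) * 8 of
+ * the current word, so bytes 0-7 fill the first word (the first
+ * blk_dva word). Whenever the word just filled is followed by a
+ * non-payload word (BPE_IS_PAYLOADWORD is false for blk_prop and
+ * blk_birth), that word is skipped, which is how the full 112-byte
+ * payload fits in 14 of the block pointer's 16 words.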
+ */ + for (int i = 0; i < compressed_size; i++) { + BF64_SET(w, (i % sizeof (w)) * NBBY, NBBY, data8[i]); + if (i % sizeof (w) == sizeof (w) - 1) { + /* we've reached the end of a word */ + ASSERT3P(bp64, <, bp + 1); + *bp64 = w; + bp64++; + if (!BPE_IS_PAYLOADWORD(bp, bp64)) + bp64++; + w = 0; + } + } + /* write last partial word */ + if (bp64 < (uint64_t *)(bp + 1)) + *bp64 = w; +} + +/* + * buf must be at least BPE_GET_PSIZE(bp) bytes long (which will never be + * more than BPE_PAYLOAD_SIZE bytes). + */ +void +decode_embedded_bp_compressed(const blkptr_t *bp, void *buf) +{ + int psize; + uint8_t *buf8 = buf; + uint64_t w = 0; + const uint64_t *bp64 = (const uint64_t *)bp; + + ASSERT(BP_IS_EMBEDDED(bp)); + + psize = BPE_GET_PSIZE(bp); + + /* + * Decode the words of the block pointer into the byte array. + * Low bits of first word are the first byte (little endian). + */ + for (int i = 0; i < psize; i++) { + if (i % sizeof (w) == 0) { + /* beginning of a word */ + ASSERT3P(bp64, <, bp + 1); + w = *bp64; + bp64++; + if (!BPE_IS_PAYLOADWORD(bp, bp64)) + bp64++; + } + buf8[i] = BF64_GET(w, (i % sizeof (w)) * NBBY, NBBY); + } +} + +/* + * Fill in the buffer with the (decompressed) payload of the embedded + * blkptr_t. Takes into account compression and byteorder (the payload is + * treated as a stream of bytes). + * Return 0 on success, or ENOSPC if it won't fit in the buffer. + */ +int +decode_embedded_bp(const blkptr_t *bp, void *buf, int buflen) +{ + int lsize, psize; + + ASSERT(BP_IS_EMBEDDED(bp)); + + lsize = BPE_GET_LSIZE(bp); + psize = BPE_GET_PSIZE(bp); + + if (lsize > buflen) + return (ENOSPC); + ASSERT3U(lsize, ==, buflen); + + if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) { + uint8_t dstbuf[BPE_PAYLOAD_SIZE]; + decode_embedded_bp_compressed(bp, dstbuf); + VERIFY0(zio_decompress_data_buf(BP_GET_COMPRESS(bp), + dstbuf, buf, psize, buflen)); + } else { + ASSERT3U(lsize, ==, psize); + decode_embedded_bp_compressed(bp, buf); + } + + return (0); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c new file mode 100644 index 000000000000..ee12db3a266d --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c @@ -0,0 +1,77 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. 
+ */ + +#include <sys/bplist.h> +#include <sys/zfs_context.h> + + +void +bplist_create(bplist_t *bpl) +{ + mutex_init(&bpl->bpl_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&bpl->bpl_list, sizeof (bplist_entry_t), + offsetof(bplist_entry_t, bpe_node)); +} + +void +bplist_destroy(bplist_t *bpl) +{ + list_destroy(&bpl->bpl_list); + mutex_destroy(&bpl->bpl_lock); +} + +void +bplist_append(bplist_t *bpl, const blkptr_t *bp) +{ + bplist_entry_t *bpe = kmem_alloc(sizeof (*bpe), KM_SLEEP); + + mutex_enter(&bpl->bpl_lock); + bpe->bpe_blk = *bp; + list_insert_tail(&bpl->bpl_list, bpe); + mutex_exit(&bpl->bpl_lock); +} + +/* + * To aid debugging, we keep the most recently removed entry. This way if + * we are in the callback, we can easily locate the entry. + */ +static bplist_entry_t *bplist_iterate_last_removed; + +void +bplist_iterate(bplist_t *bpl, bplist_itor_t *func, void *arg, dmu_tx_t *tx) +{ + bplist_entry_t *bpe; + + mutex_enter(&bpl->bpl_lock); + while (bpe = list_head(&bpl->bpl_list)) { + bplist_iterate_last_removed = bpe; + list_remove(&bpl->bpl_list, bpe); + mutex_exit(&bpl->bpl_lock); + func(arg, &bpe->bpe_blk, tx); + kmem_free(bpe, sizeof (*bpe)); + mutex_enter(&bpl->bpl_lock); + } + mutex_exit(&bpl->bpl_lock); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c new file mode 100644 index 000000000000..bbdd765214fc --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c @@ -0,0 +1,606 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2016 by Delphix. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + * Copyright (c) 2017 Datto Inc. + */ + +#include <sys/bpobj.h> +#include <sys/zfs_context.h> +#include <sys/refcount.h> +#include <sys/dsl_pool.h> +#include <sys/zfeature.h> +#include <sys/zap.h> + +/* + * Return an empty bpobj, preferably the empty dummy one (dp_empty_bpobj). 
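+ *
+ * (Editor's note: with SPA_FEATURE_EMPTY_BPOBJ active, every caller
+ * shares the single dp_empty_bpobj and only bumps the feature refcount,
+ * which bpobj_decr_empty() later drops, instead of allocating a fresh
+ * object for each empty list; e.g. a typical call is
+ * bpobj_alloc_empty(os, SPA_OLD_MAXBLOCKSIZE, tx).)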
+ */ +uint64_t +bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx) +{ + spa_t *spa = dmu_objset_spa(os); + dsl_pool_t *dp = dmu_objset_pool(os); + + if (spa_feature_is_enabled(spa, SPA_FEATURE_EMPTY_BPOBJ)) { + if (!spa_feature_is_active(spa, SPA_FEATURE_EMPTY_BPOBJ)) { + ASSERT0(dp->dp_empty_bpobj); + dp->dp_empty_bpobj = + bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx); + VERIFY(zap_add(os, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1, + &dp->dp_empty_bpobj, tx) == 0); + } + spa_feature_incr(spa, SPA_FEATURE_EMPTY_BPOBJ, tx); + ASSERT(dp->dp_empty_bpobj != 0); + return (dp->dp_empty_bpobj); + } else { + return (bpobj_alloc(os, blocksize, tx)); + } +} + +void +bpobj_decr_empty(objset_t *os, dmu_tx_t *tx) +{ + dsl_pool_t *dp = dmu_objset_pool(os); + + spa_feature_decr(dmu_objset_spa(os), SPA_FEATURE_EMPTY_BPOBJ, tx); + if (!spa_feature_is_active(dmu_objset_spa(os), + SPA_FEATURE_EMPTY_BPOBJ)) { + VERIFY3U(0, ==, zap_remove(dp->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_EMPTY_BPOBJ, tx)); + VERIFY3U(0, ==, dmu_object_free(os, dp->dp_empty_bpobj, tx)); + dp->dp_empty_bpobj = 0; + } +} + +uint64_t +bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx) +{ + int size; + + if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_BPOBJ_ACCOUNT) + size = BPOBJ_SIZE_V0; + else if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS) + size = BPOBJ_SIZE_V1; + else + size = sizeof (bpobj_phys_t); + + return (dmu_object_alloc(os, DMU_OT_BPOBJ, blocksize, + DMU_OT_BPOBJ_HDR, size, tx)); +} + +void +bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx) +{ + int64_t i; + bpobj_t bpo; + dmu_object_info_t doi; + int epb; + dmu_buf_t *dbuf = NULL; + + ASSERT(obj != dmu_objset_pool(os)->dp_empty_bpobj); + VERIFY3U(0, ==, bpobj_open(&bpo, os, obj)); + + mutex_enter(&bpo.bpo_lock); + + if (!bpo.bpo_havesubobj || bpo.bpo_phys->bpo_subobjs == 0) + goto out; + + VERIFY3U(0, ==, dmu_object_info(os, bpo.bpo_phys->bpo_subobjs, &doi)); + epb = doi.doi_data_block_size / sizeof (uint64_t); + + for (i = bpo.bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) { + uint64_t *objarray; + uint64_t offset, blkoff; + + offset = i * sizeof (uint64_t); + blkoff = P2PHASE(i, epb); + + if (dbuf == NULL || dbuf->db_offset > offset) { + if (dbuf) + dmu_buf_rele(dbuf, FTAG); + VERIFY3U(0, ==, dmu_buf_hold(os, + bpo.bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0)); + } + + ASSERT3U(offset, >=, dbuf->db_offset); + ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size); + + objarray = dbuf->db_data; + bpobj_free(os, objarray[blkoff], tx); + } + if (dbuf) { + dmu_buf_rele(dbuf, FTAG); + dbuf = NULL; + } + VERIFY3U(0, ==, dmu_object_free(os, bpo.bpo_phys->bpo_subobjs, tx)); + +out: + mutex_exit(&bpo.bpo_lock); + bpobj_close(&bpo); + + VERIFY3U(0, ==, dmu_object_free(os, obj, tx)); +} + +int +bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object) +{ + dmu_object_info_t doi; + int err; + + err = dmu_object_info(os, object, &doi); + if (err) + return (err); + + bzero(bpo, sizeof (*bpo)); + mutex_init(&bpo->bpo_lock, NULL, MUTEX_DEFAULT, NULL); + + ASSERT(bpo->bpo_dbuf == NULL); + ASSERT(bpo->bpo_phys == NULL); + ASSERT(object != 0); + ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ); + ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPOBJ_HDR); + + err = dmu_bonus_hold(os, object, bpo, &bpo->bpo_dbuf); + if (err) + return (err); + + bpo->bpo_os = os; + bpo->bpo_object = object; + bpo->bpo_epb = doi.doi_data_block_size >> SPA_BLKPTRSHIFT; + bpo->bpo_havecomp = (doi.doi_bonus_size > BPOBJ_SIZE_V0); + bpo->bpo_havesubobj = 
(doi.doi_bonus_size > BPOBJ_SIZE_V1); + bpo->bpo_phys = bpo->bpo_dbuf->db_data; + return (0); +} + +boolean_t +bpobj_is_open(const bpobj_t *bpo) +{ + return (bpo->bpo_object != 0); +} + +void +bpobj_close(bpobj_t *bpo) +{ + /* Lame workaround for closing a bpobj that was never opened. */ + if (bpo->bpo_object == 0) + return; + + dmu_buf_rele(bpo->bpo_dbuf, bpo); + if (bpo->bpo_cached_dbuf != NULL) + dmu_buf_rele(bpo->bpo_cached_dbuf, bpo); + bpo->bpo_dbuf = NULL; + bpo->bpo_phys = NULL; + bpo->bpo_cached_dbuf = NULL; + bpo->bpo_object = 0; + + mutex_destroy(&bpo->bpo_lock); +} + +boolean_t +bpobj_is_empty(bpobj_t *bpo) +{ + return (bpo->bpo_phys->bpo_num_blkptrs == 0 && + (!bpo->bpo_havesubobj || bpo->bpo_phys->bpo_num_subobjs == 0)); +} + +static int +bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx, + boolean_t free) +{ + dmu_object_info_t doi; + int epb; + int64_t i; + int err = 0; + dmu_buf_t *dbuf = NULL; + + ASSERT(bpobj_is_open(bpo)); + mutex_enter(&bpo->bpo_lock); + + if (free) + dmu_buf_will_dirty(bpo->bpo_dbuf, tx); + + for (i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= 0; i--) { + blkptr_t *bparray; + blkptr_t *bp; + uint64_t offset, blkoff; + + offset = i * sizeof (blkptr_t); + blkoff = P2PHASE(i, bpo->bpo_epb); + + if (dbuf == NULL || dbuf->db_offset > offset) { + if (dbuf) + dmu_buf_rele(dbuf, FTAG); + err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, offset, + FTAG, &dbuf, 0); + if (err) + break; + } + + ASSERT3U(offset, >=, dbuf->db_offset); + ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size); + + bparray = dbuf->db_data; + bp = &bparray[blkoff]; + err = func(arg, bp, tx); + if (err) + break; + if (free) { + bpo->bpo_phys->bpo_bytes -= + bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp); + ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0); + if (bpo->bpo_havecomp) { + bpo->bpo_phys->bpo_comp -= BP_GET_PSIZE(bp); + bpo->bpo_phys->bpo_uncomp -= BP_GET_UCSIZE(bp); + } + bpo->bpo_phys->bpo_num_blkptrs--; + ASSERT3S(bpo->bpo_phys->bpo_num_blkptrs, >=, 0); + } + } + if (dbuf) { + dmu_buf_rele(dbuf, FTAG); + dbuf = NULL; + } + if (free) { + VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os, bpo->bpo_object, + (i + 1) * sizeof (blkptr_t), -1ULL, tx)); + } + if (err || !bpo->bpo_havesubobj || bpo->bpo_phys->bpo_subobjs == 0) + goto out; + + ASSERT(bpo->bpo_havecomp); + err = dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi); + if (err) { + mutex_exit(&bpo->bpo_lock); + return (err); + } + ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ); + epb = doi.doi_data_block_size / sizeof (uint64_t); + + for (i = bpo->bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) { + uint64_t *objarray; + uint64_t offset, blkoff; + bpobj_t sublist; + uint64_t used_before, comp_before, uncomp_before; + uint64_t used_after, comp_after, uncomp_after; + + offset = i * sizeof (uint64_t); + blkoff = P2PHASE(i, epb); + + if (dbuf == NULL || dbuf->db_offset > offset) { + if (dbuf) + dmu_buf_rele(dbuf, FTAG); + err = dmu_buf_hold(bpo->bpo_os, + bpo->bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0); + if (err) + break; + } + + ASSERT3U(offset, >=, dbuf->db_offset); + ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size); + + objarray = dbuf->db_data; + err = bpobj_open(&sublist, bpo->bpo_os, objarray[blkoff]); + if (err) + break; + if (free) { + err = bpobj_space(&sublist, + &used_before, &comp_before, &uncomp_before); + if (err != 0) { + bpobj_close(&sublist); + break; + } + } + err = bpobj_iterate_impl(&sublist, func, arg, tx, free); + if (free) { + VERIFY3U(0, ==, bpobj_space(&sublist, + 
&used_after, &comp_after, &uncomp_after)); + bpo->bpo_phys->bpo_bytes -= used_before - used_after; + ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0); + bpo->bpo_phys->bpo_comp -= comp_before - comp_after; + bpo->bpo_phys->bpo_uncomp -= + uncomp_before - uncomp_after; + } + + bpobj_close(&sublist); + if (err) + break; + if (free) { + err = dmu_object_free(bpo->bpo_os, + objarray[blkoff], tx); + if (err) + break; + bpo->bpo_phys->bpo_num_subobjs--; + ASSERT3S(bpo->bpo_phys->bpo_num_subobjs, >=, 0); + } + } + if (dbuf) { + dmu_buf_rele(dbuf, FTAG); + dbuf = NULL; + } + if (free) { + VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os, + bpo->bpo_phys->bpo_subobjs, + (i + 1) * sizeof (uint64_t), -1ULL, tx)); + } + +out: + /* If there are no entries, there should be no bytes. */ + if (bpobj_is_empty(bpo)) { + ASSERT0(bpo->bpo_phys->bpo_bytes); + ASSERT0(bpo->bpo_phys->bpo_comp); + ASSERT0(bpo->bpo_phys->bpo_uncomp); + } + + mutex_exit(&bpo->bpo_lock); + return (err); +} + +/* + * Iterate and remove the entries. If func returns nonzero, iteration + * will stop and that entry will not be removed. + */ +int +bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx) +{ + return (bpobj_iterate_impl(bpo, func, arg, tx, B_TRUE)); +} + +/* + * Iterate the entries. If func returns nonzero, iteration will stop. + */ +int +bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx) +{ + return (bpobj_iterate_impl(bpo, func, arg, tx, B_FALSE)); +} + +void +bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx) +{ + bpobj_t subbpo; + uint64_t used, comp, uncomp, subsubobjs; + + ASSERT(bpobj_is_open(bpo)); + ASSERT(subobj != 0); + ASSERT(bpo->bpo_havesubobj); + ASSERT(bpo->bpo_havecomp); + ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj); + + if (subobj == dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj) { + bpobj_decr_empty(bpo->bpo_os, tx); + return; + } + + VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj)); + VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp)); + + if (bpobj_is_empty(&subbpo)) { + /* No point in having an empty subobj. */ + bpobj_close(&subbpo); + bpobj_free(bpo->bpo_os, subobj, tx); + return; + } + + mutex_enter(&bpo->bpo_lock); + dmu_buf_will_dirty(bpo->bpo_dbuf, tx); + if (bpo->bpo_phys->bpo_subobjs == 0) { + bpo->bpo_phys->bpo_subobjs = dmu_object_alloc(bpo->bpo_os, + DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE, + DMU_OT_NONE, 0, tx); + } + + dmu_object_info_t doi; + ASSERT0(dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi)); + ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ); + + dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, + bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), + sizeof (subobj), &subobj, tx); + bpo->bpo_phys->bpo_num_subobjs++; + + /* + * If subobj has only one block of subobjs, then move subobj's + * subobjs to bpo's subobj list directly. This reduces + * recursion in bpobj_iterate due to nested subobjs. + */ + subsubobjs = subbpo.bpo_phys->bpo_subobjs; + if (subsubobjs != 0) { + dmu_object_info_t doi; + + VERIFY3U(0, ==, dmu_object_info(bpo->bpo_os, subsubobjs, &doi)); + if (doi.doi_max_offset == doi.doi_data_block_size) { + dmu_buf_t *subdb; + uint64_t numsubsub = subbpo.bpo_phys->bpo_num_subobjs; + + VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, subsubobjs, + 0, FTAG, &subdb, 0)); + /* + * Make sure that we are not asking dmu_write() + * to write more data than we have in our buffer. 
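+ *
+ * (Editor's note: the doi_max_offset == doi_data_block_size test above
+ * guarantees that subsubobjs occupies a single block, so subdb->db_size
+ * covers all numsubsub entries.)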
+ */ + VERIFY3U(subdb->db_size, >=, + numsubsub * sizeof (subobj)); + dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, + bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), + numsubsub * sizeof (subobj), subdb->db_data, tx); + dmu_buf_rele(subdb, FTAG); + bpo->bpo_phys->bpo_num_subobjs += numsubsub; + + dmu_buf_will_dirty(subbpo.bpo_dbuf, tx); + subbpo.bpo_phys->bpo_subobjs = 0; + VERIFY3U(0, ==, dmu_object_free(bpo->bpo_os, + subsubobjs, tx)); + } + } + bpo->bpo_phys->bpo_bytes += used; + bpo->bpo_phys->bpo_comp += comp; + bpo->bpo_phys->bpo_uncomp += uncomp; + mutex_exit(&bpo->bpo_lock); + + bpobj_close(&subbpo); +} + +void +bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx) +{ + blkptr_t stored_bp = *bp; + uint64_t offset; + int blkoff; + blkptr_t *bparray; + + ASSERT(bpobj_is_open(bpo)); + ASSERT(!BP_IS_HOLE(bp)); + ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj); + + if (BP_IS_EMBEDDED(bp)) { + /* + * The bpobj will compress better without the payload. + * + * Note that we store EMBEDDED bp's because they have an + * uncompressed size, which must be accounted for. An + * alternative would be to add their size to bpo_uncomp + * without storing the bp, but that would create additional + * complications: bpo_uncomp would be inconsistent with the + * set of BP's stored, and bpobj_iterate() wouldn't visit + * all the space accounted for in the bpobj. + */ + bzero(&stored_bp, sizeof (stored_bp)); + stored_bp.blk_prop = bp->blk_prop; + stored_bp.blk_birth = bp->blk_birth; + } else if (!BP_GET_DEDUP(bp)) { + /* The bpobj will compress better without the checksum */ + bzero(&stored_bp.blk_cksum, sizeof (stored_bp.blk_cksum)); + } + + /* We never need the fill count. */ + stored_bp.blk_fill = 0; + + mutex_enter(&bpo->bpo_lock); + + offset = bpo->bpo_phys->bpo_num_blkptrs * sizeof (stored_bp); + blkoff = P2PHASE(bpo->bpo_phys->bpo_num_blkptrs, bpo->bpo_epb); + + if (bpo->bpo_cached_dbuf == NULL || + offset < bpo->bpo_cached_dbuf->db_offset || + offset >= bpo->bpo_cached_dbuf->db_offset + + bpo->bpo_cached_dbuf->db_size) { + if (bpo->bpo_cached_dbuf) + dmu_buf_rele(bpo->bpo_cached_dbuf, bpo); + VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, + offset, bpo, &bpo->bpo_cached_dbuf, 0)); + } + + dmu_buf_will_dirty(bpo->bpo_cached_dbuf, tx); + bparray = bpo->bpo_cached_dbuf->db_data; + bparray[blkoff] = stored_bp; + + dmu_buf_will_dirty(bpo->bpo_dbuf, tx); + bpo->bpo_phys->bpo_num_blkptrs++; + bpo->bpo_phys->bpo_bytes += + bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp); + if (bpo->bpo_havecomp) { + bpo->bpo_phys->bpo_comp += BP_GET_PSIZE(bp); + bpo->bpo_phys->bpo_uncomp += BP_GET_UCSIZE(bp); + } + mutex_exit(&bpo->bpo_lock); +} + +struct space_range_arg { + spa_t *spa; + uint64_t mintxg; + uint64_t maxtxg; + uint64_t used; + uint64_t comp; + uint64_t uncomp; +}; + +/* ARGSUSED */ +static int +space_range_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +{ + struct space_range_arg *sra = arg; + + if (bp->blk_birth > sra->mintxg && bp->blk_birth <= sra->maxtxg) { + if (dsl_pool_sync_context(spa_get_dsl(sra->spa))) + sra->used += bp_get_dsize_sync(sra->spa, bp); + else + sra->used += bp_get_dsize(sra->spa, bp); + sra->comp += BP_GET_PSIZE(bp); + sra->uncomp += BP_GET_UCSIZE(bp); + } + return (0); +} + +int +bpobj_space(bpobj_t *bpo, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) +{ + ASSERT(bpobj_is_open(bpo)); + mutex_enter(&bpo->bpo_lock); + + *usedp = bpo->bpo_phys->bpo_bytes; + if (bpo->bpo_havecomp) { + *compp = bpo->bpo_phys->bpo_comp; + 
*uncompp = bpo->bpo_phys->bpo_uncomp; + mutex_exit(&bpo->bpo_lock); + return (0); + } else { + mutex_exit(&bpo->bpo_lock); + return (bpobj_space_range(bpo, 0, UINT64_MAX, + usedp, compp, uncompp)); + } +} + +/* + * Return the amount of space in the bpobj which is: + * mintxg < blk_birth <= maxtxg + */ +int +bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg, + uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) +{ + struct space_range_arg sra = { 0 }; + int err; + + ASSERT(bpobj_is_open(bpo)); + + /* + * As an optimization, if they want the whole txg range, just + * get bpo_bytes rather than iterating over the bps. + */ + if (mintxg < TXG_INITIAL && maxtxg == UINT64_MAX && bpo->bpo_havecomp) + return (bpobj_space(bpo, usedp, compp, uncompp)); + + sra.spa = dmu_objset_spa(bpo->bpo_os); + sra.mintxg = mintxg; + sra.maxtxg = maxtxg; + + err = bpobj_iterate_nofree(bpo, space_range_cb, &sra, NULL); + *usedp = sra.used; + *compp = sra.comp; + *uncompp = sra.uncomp; + return (err); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c new file mode 100644 index 000000000000..c74d07236c1b --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c @@ -0,0 +1,301 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2011, 2015 by Delphix. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + */ + +#include <sys/arc.h> +#include <sys/bptree.h> +#include <sys/dmu.h> +#include <sys/dmu_objset.h> +#include <sys/dmu_tx.h> +#include <sys/dmu_traverse.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_pool.h> +#include <sys/dnode.h> +#include <sys/refcount.h> +#include <sys/spa.h> + +/* + * A bptree is a queue of root block pointers from destroyed datasets. When a + * dataset is destroyed its root block pointer is put on the end of the pool's + * bptree queue so the dataset's blocks can be freed asynchronously by + * dsl_scan_sync. This allows the delete operation to finish without traversing + * all the dataset's blocks. + * + * Note that while bt_begin and bt_end are only ever incremented in this code, + * they are effectively reset to 0 every time the entire bptree is freed because + * the bptree's object is destroyed and re-created. 
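+ *
+ * (The sketch that follows, up to where the original file resumes, is an
+ * editor's illustration and not part of this source; the helper name
+ * bptree_sketch_nentries is hypothetical.)
+ */
+
+/*
+ * Illustrative only: the number of records currently queued is simply
+ * bt_end - bt_begin, because both cursors only ever move forward and the
+ * live entries are exactly the half-open range [bt_begin, bt_end).
+ */
+static inline uint64_t
+bptree_sketch_nentries(const bptree_phys_t *bt)
+{
+ return (bt->bt_end - bt->bt_begin);
+}
+
+/*
+ * (End of editor's sketch; the original file resumes below.)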
+ */
+
+typedef struct bptree_args {
+ bptree_phys_t *ba_phys; /* data in bonus buffer, dirtied if freeing */
+ boolean_t ba_free; /* true if freeing during traversal */
+
+ bptree_itor_t *ba_func; /* function to call for each block pointer */
+ void *ba_arg; /* caller supplied argument to ba_func */
+ dmu_tx_t *ba_tx; /* caller supplied tx, NULL if not freeing */
+} bptree_args_t;
+
+uint64_t
+bptree_alloc(objset_t *os, dmu_tx_t *tx)
+{
+ uint64_t obj;
+ dmu_buf_t *db;
+ bptree_phys_t *bt;
+
+ obj = dmu_object_alloc(os, DMU_OTN_UINT64_METADATA,
+ SPA_OLD_MAXBLOCKSIZE, DMU_OTN_UINT64_METADATA,
+ sizeof (bptree_phys_t), tx);
+
+ /*
+ * Bonus buffer contents are already initialized to 0, but for
+ * readability we make it explicit.
+ */
+ VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
+ dmu_buf_will_dirty(db, tx);
+ bt = db->db_data;
+ bt->bt_begin = 0;
+ bt->bt_end = 0;
+ bt->bt_bytes = 0;
+ bt->bt_comp = 0;
+ bt->bt_uncomp = 0;
+ dmu_buf_rele(db, FTAG);
+
+ return (obj);
+}
+
+int
+bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
+{
+ dmu_buf_t *db;
+ bptree_phys_t *bt;
+
+ VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
+ bt = db->db_data;
+ ASSERT3U(bt->bt_begin, ==, bt->bt_end);
+ ASSERT0(bt->bt_bytes);
+ ASSERT0(bt->bt_comp);
+ ASSERT0(bt->bt_uncomp);
+ dmu_buf_rele(db, FTAG);
+
+ return (dmu_object_free(os, obj, tx));
+}
+
+boolean_t
+bptree_is_empty(objset_t *os, uint64_t obj)
+{
+ dmu_buf_t *db;
+ bptree_phys_t *bt;
+ boolean_t rv;
+
+ VERIFY0(dmu_bonus_hold(os, obj, FTAG, &db));
+ bt = db->db_data;
+ rv = (bt->bt_begin == bt->bt_end);
+ dmu_buf_rele(db, FTAG);
+ return (rv);
+}
+
+void
+bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg,
+ uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx)
+{
+ dmu_buf_t *db;
+ bptree_phys_t *bt;
+ bptree_entry_phys_t bte = { 0 };
+
+ /*
+ * bptree objects are in the pool mos, therefore they can only be
+ * modified in syncing context. Furthermore, this is only modified
+ * by the sync thread, so no locking is necessary.
+ */
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
+ bt = db->db_data;
+
+ bte.be_birth_txg = birth_txg;
+ bte.be_bp = *bp;
+ dmu_write(os, obj, bt->bt_end * sizeof (bte), sizeof (bte), &bte, tx);
+
+ dmu_buf_will_dirty(db, tx);
+ bt->bt_end++;
+ bt->bt_bytes += bytes;
+ bt->bt_comp += comp;
+ bt->bt_uncomp += uncomp;
+ dmu_buf_rele(db, FTAG);
+}
+
+/* ARGSUSED */
+static int
+bptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+ const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
+{
+ int err;
+ struct bptree_args *ba = arg;
+
+ if (bp == NULL || BP_IS_HOLE(bp))
+ return (0);
+
+ err = ba->ba_func(ba->ba_arg, bp, ba->ba_tx);
+ if (err == 0 && ba->ba_free) {
+ ba->ba_phys->bt_bytes -= bp_get_dsize_sync(spa, bp);
+ ba->ba_phys->bt_comp -= BP_GET_PSIZE(bp);
+ ba->ba_phys->bt_uncomp -= BP_GET_UCSIZE(bp);
+ }
+ return (err);
+}
+
+/*
+ * If "free" is set:
+ * - It is assumed that "func" will be freeing the block pointers.
+ * - If "func" returns nonzero, the bookmark will be remembered and
+ * iteration will be restarted from this point on next invocation.
+ * - If an i/o error is encountered (e.g. "func" returns EIO or ECKSUM),
+ * bptree_iterate will remember the bookmark, continue traversing
+ * any additional entries, and return 0.
+ *
+ * If "free" is not set, traversal will stop and return an error if
+ * an i/o error is encountered.
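+ *
+ * (Editor's note: on resume, the bookmark saved into the entry's be_zb by
+ * an interrupted free is handed back to traverse_dataset_destroyed(), so
+ * the next pass picks up exactly where the previous one stopped.)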
+ * + * In either case, if zfs_free_leak_on_eio is set, i/o errors will be + * ignored and traversal will continue (i.e. TRAVERSE_HARD will be passed to + * traverse_dataset_destroyed()). + */ +int +bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func, + void *arg, dmu_tx_t *tx) +{ + boolean_t ioerr = B_FALSE; + int err; + uint64_t i; + dmu_buf_t *db; + struct bptree_args ba; + + ASSERT(!free || dmu_tx_is_syncing(tx)); + + err = dmu_bonus_hold(os, obj, FTAG, &db); + if (err != 0) + return (err); + + if (free) + dmu_buf_will_dirty(db, tx); + + ba.ba_phys = db->db_data; + ba.ba_free = free; + ba.ba_func = func; + ba.ba_arg = arg; + ba.ba_tx = tx; + + err = 0; + for (i = ba.ba_phys->bt_begin; i < ba.ba_phys->bt_end; i++) { + bptree_entry_phys_t bte; + int flags = TRAVERSE_PREFETCH_METADATA | TRAVERSE_POST; + + err = dmu_read(os, obj, i * sizeof (bte), sizeof (bte), + &bte, DMU_READ_NO_PREFETCH); + if (err != 0) + break; + + if (zfs_free_leak_on_eio) + flags |= TRAVERSE_HARD; + zfs_dbgmsg("bptree index %lld: traversing from min_txg=%lld " + "bookmark %lld/%lld/%lld/%lld", + (longlong_t)i, + (longlong_t)bte.be_birth_txg, + (longlong_t)bte.be_zb.zb_objset, + (longlong_t)bte.be_zb.zb_object, + (longlong_t)bte.be_zb.zb_level, + (longlong_t)bte.be_zb.zb_blkid); + err = traverse_dataset_destroyed(os->os_spa, &bte.be_bp, + bte.be_birth_txg, &bte.be_zb, flags, + bptree_visit_cb, &ba); + if (free) { + /* + * The callback has freed the visited block pointers. + * Record our traversal progress on disk, either by + * updating this record's bookmark, or by logically + * removing this record by advancing bt_begin. + */ + if (err != 0) { + /* save bookmark for future resume */ + ASSERT3U(bte.be_zb.zb_objset, ==, + ZB_DESTROYED_OBJSET); + ASSERT0(bte.be_zb.zb_level); + dmu_write(os, obj, i * sizeof (bte), + sizeof (bte), &bte, tx); + if (err == EIO || err == ECKSUM || + err == ENXIO) { + /* + * Skip the rest of this tree and + * continue on to the next entry. + */ + err = 0; + ioerr = B_TRUE; + } else { + break; + } + } else if (ioerr) { + /* + * This entry is finished, but there were + * i/o errors on previous entries, so we + * can't adjust bt_begin. Set this entry's + * be_birth_txg such that it will be + * treated as a no-op in future traversals. + */ + bte.be_birth_txg = UINT64_MAX; + dmu_write(os, obj, i * sizeof (bte), + sizeof (bte), &bte, tx); + } + + if (!ioerr) { + ba.ba_phys->bt_begin++; + (void) dmu_free_range(os, obj, + i * sizeof (bte), sizeof (bte), tx); + } + } else if (err != 0) { + break; + } + } + + ASSERT(!free || err != 0 || ioerr || + ba.ba_phys->bt_begin == ba.ba_phys->bt_end); + + /* if all blocks are free there should be no used space */ + if (ba.ba_phys->bt_begin == ba.ba_phys->bt_end) { + if (zfs_free_leak_on_eio) { + ba.ba_phys->bt_bytes = 0; + ba.ba_phys->bt_comp = 0; + ba.ba_phys->bt_uncomp = 0; + } + + ASSERT0(ba.ba_phys->bt_bytes); + ASSERT0(ba.ba_phys->bt_comp); + ASSERT0(ba.ba_phys->bt_uncomp); + } + + dmu_buf_rele(db, FTAG); + + return (err); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bqueue.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bqueue.c new file mode 100644 index 000000000000..1ddc697b5424 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bqueue.c @@ -0,0 +1,111 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. 
+ * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2014 by Delphix. All rights reserved. + */ + +#include <sys/bqueue.h> +#include <sys/zfs_context.h> + +static inline bqueue_node_t * +obj2node(bqueue_t *q, void *data) +{ + return ((bqueue_node_t *)((char *)data + q->bq_node_offset)); +} + +/* + * Initialize a blocking queue The maximum capacity of the queue is set to + * size. Types that want to be stored in a bqueue must contain a bqueue_node_t, + * and offset should give its offset from the start of the struct. Return 0 on + * success, or -1 on failure. + */ +int +bqueue_init(bqueue_t *q, uint64_t size, size_t node_offset) +{ + list_create(&q->bq_list, node_offset + sizeof (bqueue_node_t), + node_offset + offsetof(bqueue_node_t, bqn_node)); + cv_init(&q->bq_add_cv, NULL, CV_DEFAULT, NULL); + cv_init(&q->bq_pop_cv, NULL, CV_DEFAULT, NULL); + mutex_init(&q->bq_lock, NULL, MUTEX_DEFAULT, NULL); + q->bq_node_offset = node_offset; + q->bq_size = 0; + q->bq_maxsize = size; + return (0); +} + +/* + * Destroy a blocking queue. This function asserts that there are no + * elements in the queue, and no one is blocked on the condition + * variables. + */ +void +bqueue_destroy(bqueue_t *q) +{ + ASSERT0(q->bq_size); + cv_destroy(&q->bq_add_cv); + cv_destroy(&q->bq_pop_cv); + mutex_destroy(&q->bq_lock); + list_destroy(&q->bq_list); +} + +/* + * Add data to q, consuming size units of capacity. If there is insufficient + * capacity to consume size units, block until capacity exists. Asserts size is + * > 0. + */ +void +bqueue_enqueue(bqueue_t *q, void *data, uint64_t item_size) +{ + ASSERT3U(item_size, >, 0); + ASSERT3U(item_size, <, q->bq_maxsize); + mutex_enter(&q->bq_lock); + obj2node(q, data)->bqn_size = item_size; + while (q->bq_size + item_size > q->bq_maxsize) { + cv_wait(&q->bq_add_cv, &q->bq_lock); + } + q->bq_size += item_size; + list_insert_tail(&q->bq_list, data); + cv_signal(&q->bq_pop_cv); + mutex_exit(&q->bq_lock); +} +/* + * Take the first element off of q. If there are no elements on the queue, wait + * until one is put there. Return the removed element. + */ +void * +bqueue_dequeue(bqueue_t *q) +{ + void *ret; + uint64_t item_size; + mutex_enter(&q->bq_lock); + while (q->bq_size == 0) { + cv_wait(&q->bq_pop_cv, &q->bq_lock); + } + ret = list_remove_head(&q->bq_list); + item_size = obj2node(q, ret)->bqn_size; + q->bq_size -= item_size; + mutex_exit(&q->bq_lock); + cv_signal(&q->bq_add_cv); + return (ret); +} + +/* + * Returns true if the space used is 0. + */ +boolean_t +bqueue_empty(bqueue_t *q) +{ + return (q->bq_size == 0); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/cityhash.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/cityhash.c new file mode 100644 index 000000000000..2b62edad0342 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/cityhash.c @@ -0,0 +1,63 @@ +// Copyright (c) 2011 Google, Inc. 
+// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +/* + * Copyright (c) 2017 by Delphix. All rights reserved. + */ + +#include <sys/cityhash.h> + +#define HASH_K1 0xb492b66fbe98f273ULL +#define HASH_K2 0x9ae16a3b2f90404fULL + +/* + * Bitwise right rotate. Normally this will compile to a single + * instruction. + */ +static inline uint64_t +rotate(uint64_t val, int shift) +{ + // Avoid shifting by 64: doing so yields an undefined result. + return (shift == 0 ? val : (val >> shift) | (val << (64 - shift))); +} + +static inline uint64_t +cityhash_helper(uint64_t u, uint64_t v, uint64_t mul) +{ + uint64_t a = (u ^ v) * mul; + a ^= (a >> 47); + uint64_t b = (v ^ a) * mul; + b ^= (b >> 47); + b *= mul; + return (b); +} + +uint64_t +cityhash4(uint64_t w1, uint64_t w2, uint64_t w3, uint64_t w4) +{ + uint64_t mul = HASH_K2 + 64; + uint64_t a = w1 * HASH_K1; + uint64_t b = w2; + uint64_t c = w4 * mul; + uint64_t d = w3 * HASH_K2; + return (cityhash_helper(rotate(a + b, 43) + rotate(c, 30) + d, + a + rotate(b + HASH_K2, 18) + c, mul)); + +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c new file mode 100644 index 000000000000..9012baa0a994 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c @@ -0,0 +1,4266 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. + * Copyright (c) 2013, Joyent, Inc. All rights reserved. 
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + */ + +#include <sys/zfs_context.h> +#include <sys/dmu.h> +#include <sys/dmu_send.h> +#include <sys/dmu_impl.h> +#include <sys/dbuf.h> +#include <sys/dmu_objset.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_dir.h> +#include <sys/dmu_tx.h> +#include <sys/spa.h> +#include <sys/zio.h> +#include <sys/dmu_zfetch.h> +#include <sys/sa.h> +#include <sys/sa_impl.h> +#include <sys/zfeature.h> +#include <sys/blkptr.h> +#include <sys/range_tree.h> +#include <sys/callb.h> +#include <sys/abd.h> +#include <sys/vdev.h> +#include <sys/cityhash.h> +#include <sys/spa_impl.h> + +kstat_t *dbuf_ksp; + +typedef struct dbuf_stats { + /* + * Various statistics about the size of the dbuf cache. + */ + kstat_named_t cache_count; + kstat_named_t cache_size_bytes; + kstat_named_t cache_size_bytes_max; + /* + * Statistics regarding the bounds on the dbuf cache size. + */ + kstat_named_t cache_target_bytes; + kstat_named_t cache_lowater_bytes; + kstat_named_t cache_hiwater_bytes; + /* + * Total number of dbuf cache evictions that have occurred. + */ + kstat_named_t cache_total_evicts; + /* + * The distribution of dbuf levels in the dbuf cache and + * the total size of all dbufs at each level. + */ + kstat_named_t cache_levels[DN_MAX_LEVELS]; + kstat_named_t cache_levels_bytes[DN_MAX_LEVELS]; + /* + * Statistics about the dbuf hash table. + */ + kstat_named_t hash_hits; + kstat_named_t hash_misses; + kstat_named_t hash_collisions; + kstat_named_t hash_elements; + kstat_named_t hash_elements_max; + /* + * Number of sublists containing more than one dbuf in the dbuf + * hash table. Keep track of the longest hash chain. + */ + kstat_named_t hash_chains; + kstat_named_t hash_chain_max; + /* + * Number of times a dbuf_create() discovers that a dbuf was + * already created and in the dbuf hash table. + */ + kstat_named_t hash_insert_race; + /* + * Statistics about the size of the metadata dbuf cache. + */ + kstat_named_t metadata_cache_count; + kstat_named_t metadata_cache_size_bytes; + kstat_named_t metadata_cache_size_bytes_max; + /* + * For diagnostic purposes, this is incremented whenever we can't add + * something to the metadata cache because it's full, and instead put + * the data in the regular dbuf cache. 
+ */ + kstat_named_t metadata_cache_overflow; +} dbuf_stats_t; + +dbuf_stats_t dbuf_stats = { + { "cache_count", KSTAT_DATA_UINT64 }, + { "cache_size_bytes", KSTAT_DATA_UINT64 }, + { "cache_size_bytes_max", KSTAT_DATA_UINT64 }, + { "cache_target_bytes", KSTAT_DATA_UINT64 }, + { "cache_lowater_bytes", KSTAT_DATA_UINT64 }, + { "cache_hiwater_bytes", KSTAT_DATA_UINT64 }, + { "cache_total_evicts", KSTAT_DATA_UINT64 }, + { { "cache_levels_N", KSTAT_DATA_UINT64 } }, + { { "cache_levels_bytes_N", KSTAT_DATA_UINT64 } }, + { "hash_hits", KSTAT_DATA_UINT64 }, + { "hash_misses", KSTAT_DATA_UINT64 }, + { "hash_collisions", KSTAT_DATA_UINT64 }, + { "hash_elements", KSTAT_DATA_UINT64 }, + { "hash_elements_max", KSTAT_DATA_UINT64 }, + { "hash_chains", KSTAT_DATA_UINT64 }, + { "hash_chain_max", KSTAT_DATA_UINT64 }, + { "hash_insert_race", KSTAT_DATA_UINT64 }, + { "metadata_cache_count", KSTAT_DATA_UINT64 }, + { "metadata_cache_size_bytes", KSTAT_DATA_UINT64 }, + { "metadata_cache_size_bytes_max", KSTAT_DATA_UINT64 }, + { "metadata_cache_overflow", KSTAT_DATA_UINT64 } +}; + +#define DBUF_STAT_INCR(stat, val) \ + atomic_add_64(&dbuf_stats.stat.value.ui64, (val)); +#define DBUF_STAT_DECR(stat, val) \ + DBUF_STAT_INCR(stat, -(val)); +#define DBUF_STAT_BUMP(stat) \ + DBUF_STAT_INCR(stat, 1); +#define DBUF_STAT_BUMPDOWN(stat) \ + DBUF_STAT_INCR(stat, -1); +#define DBUF_STAT_MAX(stat, v) { \ + uint64_t _m; \ + while ((v) > (_m = dbuf_stats.stat.value.ui64) && \ + (_m != atomic_cas_64(&dbuf_stats.stat.value.ui64, _m, (v))))\ + continue; \ +} + +struct dbuf_hold_impl_data { + /* Function arguments */ + dnode_t *dh_dn; + uint8_t dh_level; + uint64_t dh_blkid; + boolean_t dh_fail_sparse; + boolean_t dh_fail_uncached; + void *dh_tag; + dmu_buf_impl_t **dh_dbp; + /* Local variables */ + dmu_buf_impl_t *dh_db; + dmu_buf_impl_t *dh_parent; + blkptr_t *dh_bp; + int dh_err; + dbuf_dirty_record_t *dh_dr; + int dh_depth; +}; + +static void __dbuf_hold_impl_init(struct dbuf_hold_impl_data *dh, + dnode_t *dn, uint8_t level, uint64_t blkid, boolean_t fail_sparse, + boolean_t fail_uncached, + void *tag, dmu_buf_impl_t **dbp, int depth); +static int __dbuf_hold_impl(struct dbuf_hold_impl_data *dh); + +static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); +static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); + +#ifndef __lint +extern inline void dmu_buf_init_user(dmu_buf_user_t *dbu, + dmu_buf_evict_func_t *evict_func_sync, + dmu_buf_evict_func_t *evict_func_async, + dmu_buf_t **clear_on_evict_dbufp); +#endif /* ! __lint */ + +/* + * Global data structures and functions for the dbuf cache. + */ +static kmem_cache_t *dbuf_kmem_cache; +static taskq_t *dbu_evict_taskq; + +static kthread_t *dbuf_cache_evict_thread; +static kmutex_t dbuf_evict_lock; +static kcondvar_t dbuf_evict_cv; +static boolean_t dbuf_evict_thread_exit; + +/* + * There are two dbuf caches; each dbuf can only be in one of them at a time. + * + * 1. Cache of metadata dbufs, to help make read-heavy administrative commands + * from /sbin/zfs run faster. The "metadata cache" specifically stores dbufs + * that represent the metadata that describes filesystems/snapshots/ + * bookmarks/properties/etc. We only evict from this cache when we export a + * pool, to short-circuit as much I/O as possible for all administrative + * commands that need the metadata. 
There is no eviction policy for this + * cache, because we try to only include types in it which would occupy a + * very small amount of space per object but create a large impact on the + * performance of these commands. Instead, after it reaches a maximum size + * (which should only happen on very small memory systems with a very large + * number of filesystem objects), we stop taking new dbufs into the + * metadata cache, instead putting them in the normal dbuf cache. + * + * 2. LRU cache of dbufs. The dbuf cache maintains a list of dbufs that + * are not currently held but have been recently released. These dbufs + * are not eligible for arc eviction until they are aged out of the cache. + * Dbufs that are aged out of the cache will be immediately destroyed and + * become eligible for arc eviction. + * + * Dbufs are added to these caches once the last hold is released. If a dbuf is + * later accessed and still exists in the dbuf cache, then it will be removed + * from the cache and later re-added to the head of the cache. + * + * If a given dbuf meets the requirements for the metadata cache, it will go + * there, otherwise it will be considered for the generic LRU dbuf cache. The + * caches and the refcounts tracking their sizes are stored in an array indexed + * by those caches' matching enum values (from dbuf_cached_state_t). + */ +typedef struct dbuf_cache { + multilist_t *cache; + refcount_t size; +} dbuf_cache_t; +dbuf_cache_t dbuf_caches[DB_CACHE_MAX]; + +/* Size limits for the caches */ +uint64_t dbuf_cache_max_bytes = 0; +uint64_t dbuf_metadata_cache_max_bytes = 0; +/* Set the default sizes of the caches to log2 fraction of arc size */ +int dbuf_cache_shift = 5; +int dbuf_metadata_cache_shift = 6; + +/* + * For diagnostic purposes, this is incremented whenever we can't add + * something to the metadata cache because it's full, and instead put + * the data in the regular dbuf cache. + */ +uint64_t dbuf_metadata_cache_overflow; + +/* + * The LRU dbuf cache uses a three-stage eviction policy: + * - A low water marker designates when the dbuf eviction thread + * should stop evicting from the dbuf cache. + * - When we reach the maximum size (aka mid water mark), we + * signal the eviction thread to run. + * - The high water mark indicates when the eviction thread + * is unable to keep up with the incoming load and eviction must + * happen in the context of the calling thread. + * + * The dbuf cache: + * (max size) + * low water mid water hi water + * +----------------------------------------+----------+----------+ + * | | | | + * | | | | + * | | | | + * | | | | + * +----------------------------------------+----------+----------+ + * stop signal evict + * evicting eviction directly + * thread + * + * The high and low water marks indicate the operating range for the eviction + * thread. The low water mark is, by default, 90% of the total size of the + * cache and the high water mark is at 110% (both of these percentages can be + * changed by setting dbuf_cache_lowater_pct and dbuf_cache_hiwater_pct, + * respectively). The eviction thread will try to ensure that the cache remains + * within this range by waking up every second and checking if the cache is + * above the low water mark. The thread can also be woken up by callers adding + * elements into the cache if the cache is larger than the mid water (i.e max + * cache size). 
Once the eviction thread is woken up and eviction is required,
+ * it will continue evicting buffers until it's able to reduce the cache size
+ * to the low water mark. If the cache size continues to grow and hits the high
+ * water mark, then callers adding elements to the cache will begin to evict
+ * directly from the cache until the cache is no longer above the high water
+ * mark.
+ */
+
+/*
+ * The percentage above and below the maximum cache size.
+ */
+uint_t dbuf_cache_hiwater_pct = 10;
+uint_t dbuf_cache_lowater_pct = 10;
+
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, dbuf_cache_max_bytes, CTLFLAG_RWTUN,
+ &dbuf_cache_max_bytes, 0, "dbuf cache size in bytes");
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, dbuf_metadata_cache_max_bytes, CTLFLAG_RWTUN,
+ &dbuf_metadata_cache_max_bytes, 0, "dbuf metadata cache size in bytes");
+SYSCTL_INT(_vfs_zfs, OID_AUTO, dbuf_cache_shift, CTLFLAG_RDTUN,
+ &dbuf_cache_shift, 0, "dbuf cache size as log2 fraction of ARC");
+SYSCTL_INT(_vfs_zfs, OID_AUTO, dbuf_metadata_cache_shift, CTLFLAG_RDTUN,
+ &dbuf_metadata_cache_shift, 0,
+ "dbuf metadata cache size as log2 fraction of ARC");
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, dbuf_metadata_cache_overflow, CTLFLAG_RD,
+ &dbuf_metadata_cache_overflow, 0, "dbuf metadata cache overflow");
+SYSCTL_UINT(_vfs_zfs, OID_AUTO, dbuf_cache_hiwater_pct, CTLFLAG_RWTUN,
+ &dbuf_cache_hiwater_pct, 0, "max percent above the dbuf cache size");
+SYSCTL_UINT(_vfs_zfs, OID_AUTO, dbuf_cache_lowater_pct, CTLFLAG_RWTUN,
+ &dbuf_cache_lowater_pct, 0, "max percent below the dbuf cache size");
+
+/* ARGSUSED */
+static int
+dbuf_cons(void *vdb, void *unused, int kmflag)
+{
+ dmu_buf_impl_t *db = vdb;
+ bzero(db, sizeof (dmu_buf_impl_t));
+
+ mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
+ multilist_link_init(&db->db_cache_link);
+ refcount_create(&db->db_holds);
+
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+dbuf_dest(void *vdb, void *unused)
+{
+ dmu_buf_impl_t *db = vdb;
+ mutex_destroy(&db->db_mtx);
+ cv_destroy(&db->db_changed);
+ ASSERT(!multilist_link_active(&db->db_cache_link));
+ refcount_destroy(&db->db_holds);
+}
+
+/*
+ * dbuf hash table routines
+ */
+static dbuf_hash_table_t dbuf_hash_table;
+
+static uint64_t dbuf_hash_count;
+
+/*
+ * We use Cityhash for this. It's fast, and has good hash properties without
+ * requiring any large static buffers.
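+ *
+ * (Editor's note: the 64-bit result is reduced to a hash bucket with a
+ * power-of-two mask, idx = hv & hash_table_mask, and to a multilist
+ * sublist index with a modulo, so the low-order bits in particular must
+ * be well mixed.)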
+ */ +static uint64_t +dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid) +{ + return (cityhash4((uintptr_t)os, obj, (uint64_t)lvl, blkid)); +} + +#define DBUF_EQUAL(dbuf, os, obj, level, blkid) \ + ((dbuf)->db.db_object == (obj) && \ + (dbuf)->db_objset == (os) && \ + (dbuf)->db_level == (level) && \ + (dbuf)->db_blkid == (blkid)) + +dmu_buf_impl_t * +dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid) +{ + dbuf_hash_table_t *h = &dbuf_hash_table; + uint64_t hv = dbuf_hash(os, obj, level, blkid); + uint64_t idx = hv & h->hash_table_mask; + dmu_buf_impl_t *db; + + mutex_enter(DBUF_HASH_MUTEX(h, idx)); + for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) { + if (DBUF_EQUAL(db, os, obj, level, blkid)) { + mutex_enter(&db->db_mtx); + if (db->db_state != DB_EVICTING) { + mutex_exit(DBUF_HASH_MUTEX(h, idx)); + return (db); + } + mutex_exit(&db->db_mtx); + } + } + mutex_exit(DBUF_HASH_MUTEX(h, idx)); + return (NULL); +} + +static dmu_buf_impl_t * +dbuf_find_bonus(objset_t *os, uint64_t object) +{ + dnode_t *dn; + dmu_buf_impl_t *db = NULL; + + if (dnode_hold(os, object, FTAG, &dn) == 0) { + rw_enter(&dn->dn_struct_rwlock, RW_READER); + if (dn->dn_bonus != NULL) { + db = dn->dn_bonus; + mutex_enter(&db->db_mtx); + } + rw_exit(&dn->dn_struct_rwlock); + dnode_rele(dn, FTAG); + } + return (db); +} + +/* + * Insert an entry into the hash table. If there is already an element + * equal to elem in the hash table, then the already existing element + * will be returned and the new element will not be inserted. + * Otherwise returns NULL. + */ +static dmu_buf_impl_t * +dbuf_hash_insert(dmu_buf_impl_t *db) +{ + dbuf_hash_table_t *h = &dbuf_hash_table; + objset_t *os = db->db_objset; + uint64_t obj = db->db.db_object; + int level = db->db_level; + uint64_t blkid, hv, idx; + dmu_buf_impl_t *dbf; + uint32_t i; + + blkid = db->db_blkid; + hv = dbuf_hash(os, obj, level, blkid); + idx = hv & h->hash_table_mask; + + mutex_enter(DBUF_HASH_MUTEX(h, idx)); + for (dbf = h->hash_table[idx], i = 0; dbf != NULL; + dbf = dbf->db_hash_next, i++) { + if (DBUF_EQUAL(dbf, os, obj, level, blkid)) { + mutex_enter(&dbf->db_mtx); + if (dbf->db_state != DB_EVICTING) { + mutex_exit(DBUF_HASH_MUTEX(h, idx)); + return (dbf); + } + mutex_exit(&dbf->db_mtx); + } + } + + if (i > 0) { + DBUF_STAT_BUMP(hash_collisions); + if (i == 1) + DBUF_STAT_BUMP(hash_chains); + + DBUF_STAT_MAX(hash_chain_max, i); + } + + mutex_enter(&db->db_mtx); + db->db_hash_next = h->hash_table[idx]; + h->hash_table[idx] = db; + mutex_exit(DBUF_HASH_MUTEX(h, idx)); + atomic_inc_64(&dbuf_hash_count); + DBUF_STAT_MAX(hash_elements_max, dbuf_hash_count); + + return (NULL); +} + +/* + * Remove an entry from the hash table. It must be in the EVICTING state. + */ +static void +dbuf_hash_remove(dmu_buf_impl_t *db) +{ + dbuf_hash_table_t *h = &dbuf_hash_table; + uint64_t hv, idx; + dmu_buf_impl_t *dbf, **dbp; + + hv = dbuf_hash(db->db_objset, db->db.db_object, + db->db_level, db->db_blkid); + idx = hv & h->hash_table_mask; + + /* + * We mustn't hold db_mtx to maintain lock ordering: + * DBUF_HASH_MUTEX > db_mtx. 
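+ *
+ * (Editor's note: dropping db_mtx is safe here because the dbuf is
+ * already in DB_EVICTING state with zero holds, so concurrent lookups
+ * such as dbuf_find() will skip it; see the ASSERTs below.)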
+ */ + ASSERT(refcount_is_zero(&db->db_holds)); + ASSERT(db->db_state == DB_EVICTING); + ASSERT(!MUTEX_HELD(&db->db_mtx)); + + mutex_enter(DBUF_HASH_MUTEX(h, idx)); + dbp = &h->hash_table[idx]; + while ((dbf = *dbp) != db) { + dbp = &dbf->db_hash_next; + ASSERT(dbf != NULL); + } + *dbp = db->db_hash_next; + db->db_hash_next = NULL; + if (h->hash_table[idx] && + h->hash_table[idx]->db_hash_next == NULL) + DBUF_STAT_BUMPDOWN(hash_chains); + mutex_exit(DBUF_HASH_MUTEX(h, idx)); + atomic_dec_64(&dbuf_hash_count); +} + +typedef enum { + DBVU_EVICTING, + DBVU_NOT_EVICTING +} dbvu_verify_type_t; + +static void +dbuf_verify_user(dmu_buf_impl_t *db, dbvu_verify_type_t verify_type) +{ +#ifdef ZFS_DEBUG + int64_t holds; + + if (db->db_user == NULL) + return; + + /* Only data blocks support the attachment of user data. */ + ASSERT(db->db_level == 0); + + /* Clients must resolve a dbuf before attaching user data. */ + ASSERT(db->db.db_data != NULL); + ASSERT3U(db->db_state, ==, DB_CACHED); + + holds = refcount_count(&db->db_holds); + if (verify_type == DBVU_EVICTING) { + /* + * Immediate eviction occurs when holds == dirtycnt. + * For normal eviction buffers, holds is zero on + * eviction, except when dbuf_fix_old_data() calls + * dbuf_clear_data(). However, the hold count can grow + * during eviction even though db_mtx is held (see + * dmu_bonus_hold() for an example), so we can only + * test the generic invariant that holds >= dirtycnt. + */ + ASSERT3U(holds, >=, db->db_dirtycnt); + } else { + if (db->db_user_immediate_evict == TRUE) + ASSERT3U(holds, >=, db->db_dirtycnt); + else + ASSERT3U(holds, >, 0); + } +#endif +} + +static void +dbuf_evict_user(dmu_buf_impl_t *db) +{ + dmu_buf_user_t *dbu = db->db_user; + + ASSERT(MUTEX_HELD(&db->db_mtx)); + + if (dbu == NULL) + return; + + dbuf_verify_user(db, DBVU_EVICTING); + db->db_user = NULL; + +#ifdef ZFS_DEBUG + if (dbu->dbu_clear_on_evict_dbufp != NULL) + *dbu->dbu_clear_on_evict_dbufp = NULL; +#endif + + /* + * There are two eviction callbacks - one that we call synchronously + * and one that we invoke via a taskq. The async one is useful for + * avoiding lock order reversals and limiting stack depth. + * + * Note that if we have a sync callback but no async callback, + * it's likely that the sync callback will free the structure + * containing the dbu. In that case we need to take care to not + * dereference dbu after calling the sync evict func. + */ + boolean_t has_async = (dbu->dbu_evict_func_async != NULL); + + if (dbu->dbu_evict_func_sync != NULL) + dbu->dbu_evict_func_sync(dbu); + + if (has_async) { + taskq_dispatch_ent(dbu_evict_taskq, dbu->dbu_evict_func_async, + dbu, 0, &dbu->dbu_tqent); + } +} + +boolean_t +dbuf_is_metadata(dmu_buf_impl_t *db) +{ + if (db->db_level > 0) { + return (B_TRUE); + } else { + boolean_t is_metadata; + + DB_DNODE_ENTER(db); + is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type); + DB_DNODE_EXIT(db); + + return (is_metadata); + } +} + +/* + * This returns whether this dbuf should be stored in the metadata cache, which + * is based on whether it's from one of the dnode types that store data related + * to traversing dataset hierarchies. 
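+ *
+ * (Editor's note: exactly which types qualify is decided by the
+ * DMU_OT_IS_METADATA_CACHED() check below; broadly, DSL directory,
+ * dataset and snapshot-map objects and similar pool-navigation
+ * metadata.)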
+ */
+static boolean_t
+dbuf_include_in_metadata_cache(dmu_buf_impl_t *db)
+{
+ DB_DNODE_ENTER(db);
+ dmu_object_type_t type = DB_DNODE(db)->dn_type;
+ DB_DNODE_EXIT(db);
+
+ /* Check if this dbuf is one of the types we care about */
+ if (DMU_OT_IS_METADATA_CACHED(type)) {
+ /* If we hit this, then we set something up wrong in dmu_ot */
+ ASSERT(DMU_OT_IS_METADATA(type));
+
+ /*
+ * Sanity check for small-memory systems: don't allocate too
+ * much memory for this purpose.
+ */
+ if (refcount_count(&dbuf_caches[DB_DBUF_METADATA_CACHE].size) >
+ dbuf_metadata_cache_max_bytes) {
+ dbuf_metadata_cache_overflow++;
+ DTRACE_PROBE1(dbuf__metadata__cache__overflow,
+ dmu_buf_impl_t *, db);
+ return (B_FALSE);
+ }
+
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
+/*
+ * This function *must* return indices evenly distributed between all
+ * sublists of the multilist. This is needed due to how the dbuf eviction
+ * code is laid out; dbuf_evict_thread() assumes dbufs are evenly
+ * distributed between all sublists and uses this assumption when
+ * deciding which sublist to evict from and how much to evict from it.
+ */
+unsigned int
+dbuf_cache_multilist_index_func(multilist_t *ml, void *obj)
+{
+ dmu_buf_impl_t *db = obj;
+
+ /*
+ * The assumption here is that the hash value for a given
+ * dmu_buf_impl_t will remain constant throughout its lifetime
+ * (i.e. its objset, object, level and blkid fields don't change).
+ * Thus, we don't need to store the dbuf's sublist index
+ * on insertion, as this index can be recalculated on removal.
+ *
+ * Also, the low order bits of the hash value are thought to be
+ * distributed evenly. Otherwise, in the case that the multilist
+ * has a power of two number of sublists, each sublist's usage
+ * would not be evenly distributed.
+ */
+ return (dbuf_hash(db->db_objset, db->db.db_object,
+ db->db_level, db->db_blkid) %
+ multilist_get_num_sublists(ml));
+}
+
+static inline unsigned long
+dbuf_cache_target_bytes(void)
+{
+ return (MIN(dbuf_cache_max_bytes,
+ arc_max_bytes() >> dbuf_cache_shift));
+}
+
+static inline uint64_t
+dbuf_cache_hiwater_bytes(void)
+{
+ uint64_t dbuf_cache_target = dbuf_cache_target_bytes();
+ return (dbuf_cache_target +
+ (dbuf_cache_target * dbuf_cache_hiwater_pct) / 100);
+}
+
+static inline uint64_t
+dbuf_cache_lowater_bytes(void)
+{
+ uint64_t dbuf_cache_target = dbuf_cache_target_bytes();
+ return (dbuf_cache_target -
+ (dbuf_cache_target * dbuf_cache_lowater_pct) / 100);
+}
+
+static inline boolean_t
+dbuf_cache_above_hiwater(void)
+{
+ return (refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) >
+ dbuf_cache_hiwater_bytes());
+}
+
+static inline boolean_t
+dbuf_cache_above_lowater(void)
+{
+ return (refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) >
+ dbuf_cache_lowater_bytes());
+}
+
+/*
+ * Evict the oldest eligible dbuf from the dbuf cache.
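+ *
+ * ("Oldest" is approximate: a random sublist is chosen and, starting at
+ * its tail (least recently added), the first dbuf whose mutex can be
+ * taken is evicted; the even sublist distribution from
+ * dbuf_cache_multilist_index_func() makes this a fair stand-in for
+ * global LRU order.)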
+ */
+static void
+dbuf_evict_one(void)
+{
+	int idx = multilist_get_random_index(dbuf_caches[DB_DBUF_CACHE].cache);
+	multilist_sublist_t *mls = multilist_sublist_lock(
+	    dbuf_caches[DB_DBUF_CACHE].cache, idx);
+
+	ASSERT(!MUTEX_HELD(&dbuf_evict_lock));
+
+	dmu_buf_impl_t *db = multilist_sublist_tail(mls);
+	while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) {
+		db = multilist_sublist_prev(mls, db);
+	}
+
+	DTRACE_PROBE2(dbuf__evict__one, dmu_buf_impl_t *, db,
+	    multilist_sublist_t *, mls);
+
+	if (db != NULL) {
+		multilist_sublist_remove(mls, db);
+		multilist_sublist_unlock(mls);
+		(void) refcount_remove_many(&dbuf_caches[DB_DBUF_CACHE].size,
+		    db->db.db_size, db);
+		DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]);
+		DBUF_STAT_BUMPDOWN(cache_count);
+		DBUF_STAT_DECR(cache_levels_bytes[db->db_level],
+		    db->db.db_size);
+		ASSERT3U(db->db_caching_status, ==, DB_DBUF_CACHE);
+		db->db_caching_status = DB_NO_CACHE;
+		dbuf_destroy(db);
+		DBUF_STAT_MAX(cache_size_bytes_max,
+		    refcount_count(&dbuf_caches[DB_DBUF_CACHE].size));
+		DBUF_STAT_BUMP(cache_total_evicts);
+	} else {
+		multilist_sublist_unlock(mls);
+	}
+}
+
+/*
+ * The dbuf evict thread is responsible for aging out dbufs from the
+ * cache. Once the cache has reached its maximum size, dbufs are removed
+ * and destroyed. The eviction thread will continue running until the size
+ * of the dbuf cache is at or below the maximum size. Once the dbuf is aged
+ * out of the cache it is destroyed and becomes eligible for arc eviction.
+ */
+/* ARGSUSED */
+static void
+dbuf_evict_thread(void *unused __unused)
+{
+	callb_cpr_t cpr;
+
+	CALLB_CPR_INIT(&cpr, &dbuf_evict_lock, callb_generic_cpr, FTAG);
+
+	mutex_enter(&dbuf_evict_lock);
+	while (!dbuf_evict_thread_exit) {
+		while (!dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) {
+			CALLB_CPR_SAFE_BEGIN(&cpr);
+			(void) cv_timedwait_hires(&dbuf_evict_cv,
+			    &dbuf_evict_lock, SEC2NSEC(1), MSEC2NSEC(1), 0);
+			CALLB_CPR_SAFE_END(&cpr, &dbuf_evict_lock);
+		}
+		mutex_exit(&dbuf_evict_lock);
+
+		/*
+		 * Keep evicting as long as we're above the low water mark
+		 * for the cache. We do this without holding the locks to
+		 * minimize lock contention.
+		 */
+		while (dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) {
+			dbuf_evict_one();
+		}
+
+		mutex_enter(&dbuf_evict_lock);
+	}
+
+	dbuf_evict_thread_exit = B_FALSE;
+	cv_broadcast(&dbuf_evict_cv);
+	CALLB_CPR_EXIT(&cpr);	/* drops dbuf_evict_lock */
+	thread_exit();
+}
+
+/*
+ * Wake up the dbuf eviction thread if the dbuf cache is at its max size.
+ * If the dbuf cache is at its high water mark, then evict a dbuf from the
+ * dbuf cache using the caller's context.
+ */
+static void
+dbuf_evict_notify(void)
+{
+	/*
+	 * We check if we should evict without holding the dbuf_evict_lock,
+	 * because it's OK to occasionally make the wrong decision here,
+	 * and grabbing the lock results in massive lock contention.
+	 */
+	if (refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) >
+	    dbuf_cache_max_bytes) {
+		if (dbuf_cache_above_hiwater())
+			dbuf_evict_one();
+		cv_signal(&dbuf_evict_cv);
+	}
+}
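+
+/*
+ * Editorial sketch (not part of this import, excluded from compilation):
+ * the notify pattern above in miniature. The size check is deliberately
+ * done without a lock because a stale answer is harmless; at worst we
+ * evict once too early or signal once too late. Uses POSIX threads, and
+ * all ex_-prefixed names are hypothetical.
+ */
+#if 0
+#include <pthread.h>
+#include <stdint.h>
+
+static pthread_cond_t ex_cv = PTHREAD_COND_INITIALIZER;
+static volatile uint64_t ex_size;	/* updated elsewhere */
+static uint64_t ex_max, ex_hiwater;
+
+static void
+ex_evict_one(void)
+{
+	/* evict a single buffer; stands in for dbuf_evict_one() */
+}
+
+static void
+ex_evict_notify(void)
+{
+	/* Racy, deliberately unlocked read; a wrong answer is harmless. */
+	if (ex_size > ex_max) {
+		if (ex_size > ex_hiwater)
+			ex_evict_one();		/* caller's context */
+		pthread_cond_signal(&ex_cv);	/* wake the evict thread */
+	}
+}
+#endif
+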
+static int
+dbuf_kstat_update(kstat_t *ksp, int rw)
+{
+	dbuf_stats_t *ds = ksp->ks_data;
+
+	if (rw == KSTAT_WRITE) {
+		return (SET_ERROR(EACCES));
+	} else {
+		ds->metadata_cache_size_bytes.value.ui64 =
+		    refcount_count(&dbuf_caches[DB_DBUF_METADATA_CACHE].size);
+		ds->cache_size_bytes.value.ui64 =
+		    refcount_count(&dbuf_caches[DB_DBUF_CACHE].size);
+		ds->cache_target_bytes.value.ui64 = dbuf_cache_target_bytes();
+		ds->cache_hiwater_bytes.value.ui64 = dbuf_cache_hiwater_bytes();
+		ds->cache_lowater_bytes.value.ui64 = dbuf_cache_lowater_bytes();
+		ds->hash_elements.value.ui64 = dbuf_hash_count;
+	}
+
+	return (0);
+}
+
+void
+dbuf_init(void)
+{
+	uint64_t hsize = 1ULL << 16;
+	dbuf_hash_table_t *h = &dbuf_hash_table;
+	int i;
+
+	/*
+	 * The hash table is big enough to fill all of physical memory
+	 * with an average 4K block size. The table will take up
+	 * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers).
+	 */
+	while (hsize * 4096 < (uint64_t)physmem * PAGESIZE)
+		hsize <<= 1;
+
+retry:
+	h->hash_table_mask = hsize - 1;
+	h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
+	if (h->hash_table == NULL) {
+		/* XXX - we should really return an error instead of assert */
+		ASSERT(hsize > (1ULL << 10));
+		hsize >>= 1;
+		goto retry;
+	}
+
+	dbuf_kmem_cache = kmem_cache_create("dmu_buf_impl_t",
+	    sizeof (dmu_buf_impl_t),
+	    0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
+
+	for (i = 0; i < DBUF_MUTEXES; i++)
+		mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
+
+	dbuf_stats_init(h);
+	/*
+	 * Set up the parameters for the dbuf caches. We set the sizes of the
+	 * dbuf cache and the metadata cache to 1/32nd and 1/16th (default)
+	 * of the size of the ARC, respectively. If the values are set in
+	 * /etc/system and they're not greater than the size of the ARC, then
+	 * we honor those values.
+	 */
+	if (dbuf_cache_max_bytes == 0 ||
+	    dbuf_cache_max_bytes >= arc_max_bytes()) {
+		dbuf_cache_max_bytes = arc_max_bytes() >> dbuf_cache_shift;
+	}
+	if (dbuf_metadata_cache_max_bytes == 0 ||
+	    dbuf_metadata_cache_max_bytes >= arc_max_bytes()) {
+		dbuf_metadata_cache_max_bytes =
+		    arc_max_bytes() >> dbuf_metadata_cache_shift;
+	}
+
+	/*
+	 * All entries are queued via taskq_dispatch_ent(), so min/maxalloc
+	 * configuration is not required.
+ */ + dbu_evict_taskq = taskq_create("dbu_evict", 1, minclsyspri, 0, 0, 0); + + for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) { + dbuf_caches[dcs].cache = + multilist_create(sizeof (dmu_buf_impl_t), + offsetof(dmu_buf_impl_t, db_cache_link), + dbuf_cache_multilist_index_func); + refcount_create(&dbuf_caches[dcs].size); + } + + dbuf_evict_thread_exit = B_FALSE; + mutex_init(&dbuf_evict_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&dbuf_evict_cv, NULL, CV_DEFAULT, NULL); + dbuf_cache_evict_thread = thread_create(NULL, 0, dbuf_evict_thread, + NULL, 0, &p0, TS_RUN, minclsyspri); + +#ifdef __linux__ + /* + * XXX FreeBSD's SPL lacks KSTAT_TYPE_NAMED support - TODO + */ + dbuf_ksp = kstat_create("zfs", 0, "dbufstats", "misc", + KSTAT_TYPE_NAMED, sizeof (dbuf_stats) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + if (dbuf_ksp != NULL) { + dbuf_ksp->ks_data = &dbuf_stats; + dbuf_ksp->ks_update = dbuf_kstat_update; + kstat_install(dbuf_ksp); + + for (i = 0; i < DN_MAX_LEVELS; i++) { + snprintf(dbuf_stats.cache_levels[i].name, + KSTAT_STRLEN, "cache_level_%d", i); + dbuf_stats.cache_levels[i].data_type = + KSTAT_DATA_UINT64; + snprintf(dbuf_stats.cache_levels_bytes[i].name, + KSTAT_STRLEN, "cache_level_%d_bytes", i); + dbuf_stats.cache_levels_bytes[i].data_type = + KSTAT_DATA_UINT64; + } + } +#endif +} + +void +dbuf_fini(void) +{ + dbuf_hash_table_t *h = &dbuf_hash_table; + int i; + + dbuf_stats_destroy(); + + for (i = 0; i < DBUF_MUTEXES; i++) + mutex_destroy(&h->hash_mutexes[i]); + kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); + kmem_cache_destroy(dbuf_kmem_cache); + taskq_destroy(dbu_evict_taskq); + + mutex_enter(&dbuf_evict_lock); + dbuf_evict_thread_exit = B_TRUE; + while (dbuf_evict_thread_exit) { + cv_signal(&dbuf_evict_cv); + cv_wait(&dbuf_evict_cv, &dbuf_evict_lock); + } + mutex_exit(&dbuf_evict_lock); + + mutex_destroy(&dbuf_evict_lock); + cv_destroy(&dbuf_evict_cv); + + for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) { + refcount_destroy(&dbuf_caches[dcs].size); + multilist_destroy(dbuf_caches[dcs].cache); + } + + if (dbuf_ksp != NULL) { + kstat_delete(dbuf_ksp); + dbuf_ksp = NULL; + } +} + +/* + * Other stuff. + */ + +#ifdef ZFS_DEBUG +static void +dbuf_verify(dmu_buf_impl_t *db) +{ + dnode_t *dn; + dbuf_dirty_record_t *dr; + + ASSERT(MUTEX_HELD(&db->db_mtx)); + + if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY)) + return; + + ASSERT(db->db_objset != NULL); + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + if (dn == NULL) { + ASSERT(db->db_parent == NULL); + ASSERT(db->db_blkptr == NULL); + } else { + ASSERT3U(db->db.db_object, ==, dn->dn_object); + ASSERT3P(db->db_objset, ==, dn->dn_objset); + ASSERT3U(db->db_level, <, dn->dn_nlevels); + ASSERT(db->db_blkid == DMU_BONUS_BLKID || + db->db_blkid == DMU_SPILL_BLKID || + !avl_is_empty(&dn->dn_dbufs)); + } + if (db->db_blkid == DMU_BONUS_BLKID) { + ASSERT(dn != NULL); + ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); + ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID); + } else if (db->db_blkid == DMU_SPILL_BLKID) { + ASSERT(dn != NULL); + ASSERT0(db->db.db_offset); + } else { + ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size); + } + + for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next) + ASSERT(dr->dr_dbuf == db); + + for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next) + ASSERT(dr->dr_dbuf == db); + + /* + * We can't assert that db_size matches dn_datablksz because it + * can be momentarily different when another thread is doing + * dnode_set_blksz(). 
+ */ + if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) { + dr = db->db_data_pending; + /* + * It should only be modified in syncing context, so + * make sure we only have one copy of the data. + */ + ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf); + } + + /* verify db->db_blkptr */ + if (db->db_blkptr) { + if (db->db_parent == dn->dn_dbuf) { + /* db is pointed to by the dnode */ + /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */ + if (DMU_OBJECT_IS_SPECIAL(db->db.db_object)) + ASSERT(db->db_parent == NULL); + else + ASSERT(db->db_parent != NULL); + if (db->db_blkid != DMU_SPILL_BLKID) + ASSERT3P(db->db_blkptr, ==, + &dn->dn_phys->dn_blkptr[db->db_blkid]); + } else { + /* db is pointed to by an indirect block */ + int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT; + ASSERT3U(db->db_parent->db_level, ==, db->db_level+1); + ASSERT3U(db->db_parent->db.db_object, ==, + db->db.db_object); + /* + * dnode_grow_indblksz() can make this fail if we don't + * have the struct_rwlock. XXX indblksz no longer + * grows. safe to do this now? + */ + if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) { + ASSERT3P(db->db_blkptr, ==, + ((blkptr_t *)db->db_parent->db.db_data + + db->db_blkid % epb)); + } + } + } + if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) && + (db->db_buf == NULL || db->db_buf->b_data) && + db->db.db_data && db->db_blkid != DMU_BONUS_BLKID && + db->db_state != DB_FILL && !dn->dn_free_txg) { + /* + * If the blkptr isn't set but they have nonzero data, + * it had better be dirty, otherwise we'll lose that + * data when we evict this buffer. + * + * There is an exception to this rule for indirect blocks; in + * this case, if the indirect block is a hole, we fill in a few + * fields on each of the child blocks (importantly, birth time) + * to prevent hole birth times from being lost when you + * partially fill in a hole. + */ + if (db->db_dirtycnt == 0) { + if (db->db_level == 0) { + uint64_t *buf = db->db.db_data; + int i; + + for (i = 0; i < db->db.db_size >> 3; i++) { + ASSERT(buf[i] == 0); + } + } else { + blkptr_t *bps = db->db.db_data; + ASSERT3U(1 << DB_DNODE(db)->dn_indblkshift, ==, + db->db.db_size); + /* + * We want to verify that all the blkptrs in the + * indirect block are holes, but we may have + * automatically set up a few fields for them. + * We iterate through each blkptr and verify + * they only have those fields set. + */ + for (int i = 0; + i < db->db.db_size / sizeof (blkptr_t); + i++) { + blkptr_t *bp = &bps[i]; + ASSERT(ZIO_CHECKSUM_IS_ZERO( + &bp->blk_cksum)); + ASSERT( + DVA_IS_EMPTY(&bp->blk_dva[0]) && + DVA_IS_EMPTY(&bp->blk_dva[1]) && + DVA_IS_EMPTY(&bp->blk_dva[2])); + ASSERT0(bp->blk_fill); + ASSERT0(bp->blk_pad[0]); + ASSERT0(bp->blk_pad[1]); + ASSERT(!BP_IS_EMBEDDED(bp)); + ASSERT(BP_IS_HOLE(bp)); + ASSERT0(bp->blk_phys_birth); + } + } + } + } + DB_DNODE_EXIT(db); +} +#endif + +static void +dbuf_clear_data(dmu_buf_impl_t *db) +{ + ASSERT(MUTEX_HELD(&db->db_mtx)); + dbuf_evict_user(db); + ASSERT3P(db->db_buf, ==, NULL); + db->db.db_data = NULL; + if (db->db_state != DB_NOFILL) + db->db_state = DB_UNCACHED; +} + +static void +dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) +{ + ASSERT(MUTEX_HELD(&db->db_mtx)); + ASSERT(buf != NULL); + + db->db_buf = buf; + ASSERT(buf->b_data != NULL); + db->db.db_data = buf->b_data; +} + +/* + * Loan out an arc_buf for read. Return the loaned arc_buf. 
+ */ +arc_buf_t * +dbuf_loan_arcbuf(dmu_buf_impl_t *db) +{ + arc_buf_t *abuf; + + ASSERT(db->db_blkid != DMU_BONUS_BLKID); + mutex_enter(&db->db_mtx); + if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) { + int blksz = db->db.db_size; + spa_t *spa = db->db_objset->os_spa; + + mutex_exit(&db->db_mtx); + abuf = arc_loan_buf(spa, B_FALSE, blksz); + bcopy(db->db.db_data, abuf->b_data, blksz); + } else { + abuf = db->db_buf; + arc_loan_inuse_buf(abuf, db); + db->db_buf = NULL; + dbuf_clear_data(db); + mutex_exit(&db->db_mtx); + } + return (abuf); +} + +/* + * Calculate which level n block references the data at the level 0 offset + * provided. + */ +uint64_t +dbuf_whichblock(dnode_t *dn, int64_t level, uint64_t offset) +{ + if (dn->dn_datablkshift != 0 && dn->dn_indblkshift != 0) { + /* + * The level n blkid is equal to the level 0 blkid divided by + * the number of level 0s in a level n block. + * + * The level 0 blkid is offset >> datablkshift = + * offset / 2^datablkshift. + * + * The number of level 0s in a level n is the number of block + * pointers in an indirect block, raised to the power of level. + * This is 2^(indblkshift - SPA_BLKPTRSHIFT)^level = + * 2^(level*(indblkshift - SPA_BLKPTRSHIFT)). + * + * Thus, the level n blkid is: offset / + * ((2^datablkshift)*(2^(level*(indblkshift - SPA_BLKPTRSHIFT))) + * = offset / 2^(datablkshift + level * + * (indblkshift - SPA_BLKPTRSHIFT)) + * = offset >> (datablkshift + level * + * (indblkshift - SPA_BLKPTRSHIFT)) + */ + return (offset >> (dn->dn_datablkshift + level * + (dn->dn_indblkshift - SPA_BLKPTRSHIFT))); + } else { + ASSERT3U(offset, <, dn->dn_datablksz); + return (0); + } +} + +static void +dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, + arc_buf_t *buf, void *vdb) +{ + dmu_buf_impl_t *db = vdb; + + mutex_enter(&db->db_mtx); + ASSERT3U(db->db_state, ==, DB_READ); + /* + * All reads are synchronous, so we must have a hold on the dbuf + */ + ASSERT(refcount_count(&db->db_holds) > 0); + ASSERT(db->db_buf == NULL); + ASSERT(db->db.db_data == NULL); + if (buf == NULL) { + /* i/o error */ + ASSERT(zio == NULL || zio->io_error != 0); + ASSERT(db->db_blkid != DMU_BONUS_BLKID); + ASSERT3P(db->db_buf, ==, NULL); + db->db_state = DB_UNCACHED; + } else if (db->db_level == 0 && db->db_freed_in_flight) { + /* freed in flight */ + ASSERT(zio == NULL || zio->io_error == 0); + if (buf == NULL) { + buf = arc_alloc_buf(db->db_objset->os_spa, + db, DBUF_GET_BUFC_TYPE(db), db->db.db_size); + } + arc_release(buf, db); + bzero(buf->b_data, db->db.db_size); + arc_buf_freeze(buf); + db->db_freed_in_flight = FALSE; + dbuf_set_data(db, buf); + db->db_state = DB_CACHED; + } else { + /* success */ + ASSERT(zio == NULL || zio->io_error == 0); + dbuf_set_data(db, buf); + db->db_state = DB_CACHED; + } + cv_broadcast(&db->db_changed); + dbuf_rele_and_unlock(db, NULL, B_FALSE); +} + +static void +dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) +{ + dnode_t *dn; + zbookmark_phys_t zb; + arc_flags_t aflags = ARC_FLAG_NOWAIT; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + ASSERT(!refcount_is_zero(&db->db_holds)); + /* We need the struct_rwlock to prevent db_blkptr from changing. */ + ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); + ASSERT(MUTEX_HELD(&db->db_mtx)); + ASSERT(db->db_state == DB_UNCACHED); + ASSERT(db->db_buf == NULL); + + if (db->db_blkid == DMU_BONUS_BLKID) { + /* + * The bonus length stored in the dnode may be less than + * the maximum available space in the bonus buffer. 
+ */ + int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen); + int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots); + + ASSERT3U(bonuslen, <=, db->db.db_size); + db->db.db_data = zio_buf_alloc(max_bonuslen); + arc_space_consume(max_bonuslen, ARC_SPACE_BONUS); + if (bonuslen < max_bonuslen) + bzero(db->db.db_data, max_bonuslen); + if (bonuslen) + bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen); + DB_DNODE_EXIT(db); + db->db_state = DB_CACHED; + mutex_exit(&db->db_mtx); + return; + } + + /* + * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync() + * processes the delete record and clears the bp while we are waiting + * for the dn_mtx (resulting in a "no" from block_freed). + */ + if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) || + (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) || + BP_IS_HOLE(db->db_blkptr)))) { + arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); + + dbuf_set_data(db, arc_alloc_buf(db->db_objset->os_spa, db, type, + db->db.db_size)); + bzero(db->db.db_data, db->db.db_size); + + if (db->db_blkptr != NULL && db->db_level > 0 && + BP_IS_HOLE(db->db_blkptr) && + db->db_blkptr->blk_birth != 0) { + blkptr_t *bps = db->db.db_data; + for (int i = 0; i < ((1 << + DB_DNODE(db)->dn_indblkshift) / sizeof (blkptr_t)); + i++) { + blkptr_t *bp = &bps[i]; + ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, + 1 << dn->dn_indblkshift); + BP_SET_LSIZE(bp, + BP_GET_LEVEL(db->db_blkptr) == 1 ? + dn->dn_datablksz : + BP_GET_LSIZE(db->db_blkptr)); + BP_SET_TYPE(bp, BP_GET_TYPE(db->db_blkptr)); + BP_SET_LEVEL(bp, + BP_GET_LEVEL(db->db_blkptr) - 1); + BP_SET_BIRTH(bp, db->db_blkptr->blk_birth, 0); + } + } + DB_DNODE_EXIT(db); + db->db_state = DB_CACHED; + mutex_exit(&db->db_mtx); + return; + } + + DB_DNODE_EXIT(db); + + db->db_state = DB_READ; + mutex_exit(&db->db_mtx); + + if (DBUF_IS_L2CACHEABLE(db)) + aflags |= ARC_FLAG_L2CACHE; + + SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ? + db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET, + db->db.db_object, db->db_level, db->db_blkid); + + dbuf_add_ref(db, NULL); + + (void) arc_read(zio, db->db_objset->os_spa, db->db_blkptr, + dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, + (flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED, + &aflags, &zb); +} + +/* + * This is our just-in-time copy function. It makes a copy of buffers that + * have been modified in a previous transaction group before we access them in + * the current active group. + * + * This function is used in three places: when we are dirtying a buffer for the + * first time in a txg, when we are freeing a range in a dnode that includes + * this buffer, and when we are accessing a buffer which was received compressed + * and later referenced in a WRITE_BYREF record. + * + * Note that when we are called from dbuf_free_range() we do not put a hold on + * the buffer, we just traverse the active dbuf list for the dnode. + */ +static void +dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) +{ + dbuf_dirty_record_t *dr = db->db_last_dirty; + + ASSERT(MUTEX_HELD(&db->db_mtx)); + ASSERT(db->db.db_data != NULL); + ASSERT(db->db_level == 0); + ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT); + + if (dr == NULL || + (dr->dt.dl.dr_data != + ((db->db_blkid == DMU_BONUS_BLKID) ? 
db->db.db_data : db->db_buf)))
+		return;
+
+	/*
+	 * If the last dirty record for this dbuf has not yet synced
+	 * and it's referencing the dbuf data, either:
+	 *	reset the reference to point to a new copy,
+	 * or (if there are no active holders)
+	 *	just null out the current db_data pointer.
+	 */
+	ASSERT(dr->dr_txg >= txg - 2);
+	if (db->db_blkid == DMU_BONUS_BLKID) {
+		/* Note that the data bufs here are zio_bufs */
+		dnode_t *dn = DB_DNODE(db);
+		int bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
+		dr->dt.dl.dr_data = zio_buf_alloc(bonuslen);
+		arc_space_consume(bonuslen, ARC_SPACE_BONUS);
+		bcopy(db->db.db_data, dr->dt.dl.dr_data, bonuslen);
+	} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
+		int size = arc_buf_size(db->db_buf);
+		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+		spa_t *spa = db->db_objset->os_spa;
+		enum zio_compress compress_type =
+		    arc_get_compression(db->db_buf);
+
+		if (compress_type == ZIO_COMPRESS_OFF) {
+			dr->dt.dl.dr_data = arc_alloc_buf(spa, db, type, size);
+		} else {
+			ASSERT3U(type, ==, ARC_BUFC_DATA);
+			dr->dt.dl.dr_data = arc_alloc_compressed_buf(spa, db,
+			    size, arc_buf_lsize(db->db_buf), compress_type);
+		}
+		bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
+	} else {
+		db->db_buf = NULL;
+		dbuf_clear_data(db);
+	}
+}
+
+int
+dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
+{
+	int err = 0;
+	boolean_t prefetch;
+	dnode_t *dn;
+
+	/*
+	 * We don't have to hold the mutex to check db_state because it
+	 * can't be freed while we have a hold on the buffer.
+	 */
+	ASSERT(!refcount_is_zero(&db->db_holds));
+
+	if (db->db_state == DB_NOFILL)
+		return (SET_ERROR(EIO));
+
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+	if ((flags & DB_RF_HAVESTRUCT) == 0)
+		rw_enter(&dn->dn_struct_rwlock, RW_READER);
+
+	prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
+	    (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL &&
+	    DBUF_IS_CACHEABLE(db);
+
+	mutex_enter(&db->db_mtx);
+	if (db->db_state == DB_CACHED) {
+		/*
+		 * If the arc buf is compressed, we need to decompress it to
+		 * read the data. This could happen during the "zfs receive" of
+		 * a stream which is compressed and deduplicated.
+		 */
+		if (db->db_buf != NULL &&
+		    arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF) {
+			dbuf_fix_old_data(db,
+			    spa_syncing_txg(dmu_objset_spa(db->db_objset)));
+			err = arc_decompress(db->db_buf);
+			dbuf_set_data(db, db->db_buf);
+		}
+		mutex_exit(&db->db_mtx);
+		if (prefetch)
+			dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE);
+		if ((flags & DB_RF_HAVESTRUCT) == 0)
+			rw_exit(&dn->dn_struct_rwlock);
+		DB_DNODE_EXIT(db);
+		DBUF_STAT_BUMP(hash_hits);
+	} else if (db->db_state == DB_UNCACHED) {
+		spa_t *spa = dn->dn_objset->os_spa;
+		boolean_t need_wait = B_FALSE;
+
+		if (zio == NULL &&
+		    db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) {
+			zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
+			need_wait = B_TRUE;
+		}
+		dbuf_read_impl(db, zio, flags);
+
+		/* dbuf_read_impl has dropped db_mtx for us */
+
+		if (prefetch)
+			dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE);
+
+		if ((flags & DB_RF_HAVESTRUCT) == 0)
+			rw_exit(&dn->dn_struct_rwlock);
+		DB_DNODE_EXIT(db);
+		DBUF_STAT_BUMP(hash_misses);
+
+		if (need_wait)
+			err = zio_wait(zio);
+	} else {
+		/*
+		 * Another reader came in while the dbuf was in flight
+		 * between UNCACHED and CACHED. Either a writer will finish
+		 * writing the buffer (sending the dbuf to CACHED) or the
+		 * first reader's request will reach the read_done callback
+		 * and send the dbuf to CACHED. Otherwise, a failure
+		 * occurred and the dbuf went to UNCACHED.
+		 */
+		mutex_exit(&db->db_mtx);
+		if (prefetch)
+			dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE);
+		if ((flags & DB_RF_HAVESTRUCT) == 0)
+			rw_exit(&dn->dn_struct_rwlock);
+		DB_DNODE_EXIT(db);
+		DBUF_STAT_BUMP(hash_misses);
+
+		/* Skip the wait per the caller's request. */
+		mutex_enter(&db->db_mtx);
+		if ((flags & DB_RF_NEVERWAIT) == 0) {
+			while (db->db_state == DB_READ ||
+			    db->db_state == DB_FILL) {
+				ASSERT(db->db_state == DB_READ ||
+				    (flags & DB_RF_HAVESTRUCT) == 0);
+				DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *,
+				    db, zio_t *, zio);
+				cv_wait(&db->db_changed, &db->db_mtx);
+			}
+			if (db->db_state == DB_UNCACHED)
+				err = SET_ERROR(EIO);
+		}
+		mutex_exit(&db->db_mtx);
+	}
+
+	return (err);
+}
+
+static void
+dbuf_noread(dmu_buf_impl_t *db)
+{
+	ASSERT(!refcount_is_zero(&db->db_holds));
+	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+	mutex_enter(&db->db_mtx);
+	while (db->db_state == DB_READ || db->db_state == DB_FILL)
+		cv_wait(&db->db_changed, &db->db_mtx);
+	if (db->db_state == DB_UNCACHED) {
+		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+		spa_t *spa = db->db_objset->os_spa;
+
+		ASSERT(db->db_buf == NULL);
+		ASSERT(db->db.db_data == NULL);
+		dbuf_set_data(db, arc_alloc_buf(spa, db, type, db->db.db_size));
+		db->db_state = DB_FILL;
+	} else if (db->db_state == DB_NOFILL) {
+		dbuf_clear_data(db);
+	} else {
+		ASSERT3U(db->db_state, ==, DB_CACHED);
+	}
+	mutex_exit(&db->db_mtx);
+}
+
+void
+dbuf_unoverride(dbuf_dirty_record_t *dr)
+{
+	dmu_buf_impl_t *db = dr->dr_dbuf;
+	blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
+	uint64_t txg = dr->dr_txg;
+
+	ASSERT(MUTEX_HELD(&db->db_mtx));
+	/*
+	 * This assert is valid because dmu_sync() expects to be called by
+	 * a zilog's get_data while holding a range lock. This call only
+	 * comes from dbuf_dirty() callers who must also hold a range lock.
+	 */
+	ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
+	ASSERT(db->db_level == 0);
+
+	if (db->db_blkid == DMU_BONUS_BLKID ||
+	    dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
+		return;
+
+	ASSERT(db->db_data_pending != dr);
+
+	/* free this block */
+	if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite)
+		zio_free(db->db_objset->os_spa, txg, bp);
+
+	dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
+	dr->dt.dl.dr_nopwrite = B_FALSE;
+
+	/*
+	 * Release the already-written buffer, so we leave it in
+	 * a consistent dirty state. Note that all callers are
+	 * modifying the buffer, so they will immediately do
+	 * another (redundant) arc_release(). Therefore, leave
+	 * the buf thawed to save the effort of freezing &
+	 * immediately re-thawing it.
+	 */
+	arc_release(dr->dt.dl.dr_data, db);
+}
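+
+/*
+ * Editorial sketch (not part of this import, excluded from compilation):
+ * the shape of the range walk done by dbuf_free_range() below. A sorted
+ * array stands in for the dn_dbufs AVL tree, and the ex_-prefixed names
+ * are hypothetical.
+ */
+#if 0
+#include <stddef.h>
+#include <stdint.h>
+
+struct ex_dbuf {
+	uint8_t level;
+	uint64_t blkid;
+};
+
+static void
+ex_walk_free_range(struct ex_dbuf *dbufs, size_t n,
+    uint64_t start, uint64_t end, void (*visit)(struct ex_dbuf *))
+{
+	for (size_t i = 0; i < n; i++) {
+		struct ex_dbuf *db = &dbufs[i];
+
+		/* Stop at the first dbuf past the level-0 range. */
+		if (db->level != 0 || db->blkid > end)
+			break;
+		/* avl_nearest(..., AVL_AFTER) skips these in the real code. */
+		if (db->blkid < start)
+			continue;
+		visit(db);	/* evict or clear, as dbuf_free_range() does */
+	}
+}
+#endif
+
+/*
+ * Evict (if it's unreferenced) or clear (if it's referenced) any level-0
+ * data blocks in the free range, so that any future readers will find
+ * empty blocks.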
+ */
+void
+dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
+    dmu_tx_t *tx)
+{
+	dmu_buf_impl_t db_search;
+	dmu_buf_impl_t *db, *db_next;
+	uint64_t txg = tx->tx_txg;
+	avl_index_t where;
+
+	if (end_blkid > dn->dn_maxblkid &&
+	    !(start_blkid == DMU_SPILL_BLKID || end_blkid == DMU_SPILL_BLKID))
+		end_blkid = dn->dn_maxblkid;
+	dprintf_dnode(dn, "start=%llu end=%llu\n", start_blkid, end_blkid);
+
+	db_search.db_level = 0;
+	db_search.db_blkid = start_blkid;
+	db_search.db_state = DB_SEARCH;
+
+	mutex_enter(&dn->dn_dbufs_mtx);
+	db = avl_find(&dn->dn_dbufs, &db_search, &where);
+	ASSERT3P(db, ==, NULL);
+
+	db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
+
+	for (; db != NULL; db = db_next) {
+		db_next = AVL_NEXT(&dn->dn_dbufs, db);
+		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+
+		if (db->db_level != 0 || db->db_blkid > end_blkid) {
+			break;
+		}
+		ASSERT3U(db->db_blkid, >=, start_blkid);
+
+		/* found a level 0 buffer in the range */
+		mutex_enter(&db->db_mtx);
+		if (dbuf_undirty(db, tx)) {
+			/* mutex has been dropped and dbuf destroyed */
+			continue;
+		}
+
+		if (db->db_state == DB_UNCACHED ||
+		    db->db_state == DB_NOFILL ||
+		    db->db_state == DB_EVICTING) {
+			ASSERT(db->db.db_data == NULL);
+			mutex_exit(&db->db_mtx);
+			continue;
+		}
+		if (db->db_state == DB_READ || db->db_state == DB_FILL) {
+			/* will be handled in dbuf_read_done or dbuf_rele */
+			db->db_freed_in_flight = TRUE;
+			mutex_exit(&db->db_mtx);
+			continue;
+		}
+		if (refcount_count(&db->db_holds) == 0) {
+			ASSERT(db->db_buf);
+			dbuf_destroy(db);
+			continue;
+		}
+		/* The dbuf is referenced */
+
+		if (db->db_last_dirty != NULL) {
+			dbuf_dirty_record_t *dr = db->db_last_dirty;
+
+			if (dr->dr_txg == txg) {
+				/*
+				 * This buffer is "in-use", re-adjust the file
+				 * size to reflect that this buffer may
+				 * contain new data when we sync.
+				 */
+				if (db->db_blkid != DMU_SPILL_BLKID &&
+				    db->db_blkid > dn->dn_maxblkid)
+					dn->dn_maxblkid = db->db_blkid;
+				dbuf_unoverride(dr);
+			} else {
+				/*
+				 * This dbuf is not dirty in the open context.
+				 * Either uncache it (if it's not referenced in
+				 * the open context) or reset its contents to
+				 * empty.
+				 */
+				dbuf_fix_old_data(db, txg);
+			}
+		}
+		/* clear the contents if it's cached */
+		if (db->db_state == DB_CACHED) {
+			ASSERT(db->db.db_data != NULL);
+			arc_release(db->db_buf, db);
+			bzero(db->db.db_data, db->db.db_size);
+			arc_buf_freeze(db->db_buf);
+		}
+
+		mutex_exit(&db->db_mtx);
+	}
+	mutex_exit(&dn->dn_dbufs_mtx);
+}
+
+void
+dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
+{
+	arc_buf_t *buf, *obuf;
+	int osize = db->db.db_size;
+	arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+	dnode_t *dn;
+
+	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+
+	/* XXX does *this* func really need the lock? */
+	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
+
+	/*
+	 * This call to dmu_buf_will_dirty() with the dn_struct_rwlock held
+	 * is OK, because there can be no other references to the db
+	 * when we are changing its size, so no concurrent DB_FILL can
+	 * be happening.
+ */ + /* + * XXX we should be doing a dbuf_read, checking the return + * value and returning that up to our callers + */ + dmu_buf_will_dirty(&db->db, tx); + + /* create the data buffer for the new block */ + buf = arc_alloc_buf(dn->dn_objset->os_spa, db, type, size); + + /* copy old block data to the new block */ + obuf = db->db_buf; + bcopy(obuf->b_data, buf->b_data, MIN(osize, size)); + /* zero the remainder */ + if (size > osize) + bzero((uint8_t *)buf->b_data + osize, size - osize); + + mutex_enter(&db->db_mtx); + dbuf_set_data(db, buf); + arc_buf_destroy(obuf, db); + db->db.db_size = size; + + if (db->db_level == 0) { + ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg); + db->db_last_dirty->dt.dl.dr_data = buf; + } + mutex_exit(&db->db_mtx); + + dmu_objset_willuse_space(dn->dn_objset, size - osize, tx); + DB_DNODE_EXIT(db); +} + +void +dbuf_release_bp(dmu_buf_impl_t *db) +{ + objset_t *os = db->db_objset; + + ASSERT(dsl_pool_sync_context(dmu_objset_pool(os))); + ASSERT(arc_released(os->os_phys_buf) || + list_link_active(&os->os_dsl_dataset->ds_synced_link)); + ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf)); + + (void) arc_release(db->db_buf, db); +} + +/* + * We already have a dirty record for this TXG, and we are being + * dirtied again. + */ +static void +dbuf_redirty(dbuf_dirty_record_t *dr) +{ + dmu_buf_impl_t *db = dr->dr_dbuf; + + ASSERT(MUTEX_HELD(&db->db_mtx)); + + if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) { + /* + * If this buffer has already been written out, + * we now need to reset its state. + */ + dbuf_unoverride(dr); + if (db->db.db_object != DMU_META_DNODE_OBJECT && + db->db_state != DB_NOFILL) { + /* Already released on initial dirty, so just thaw. */ + ASSERT(arc_released(db->db_buf)); + arc_buf_thaw(db->db_buf); + } + } +} + +dbuf_dirty_record_t * +dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) +{ + dnode_t *dn; + objset_t *os; + dbuf_dirty_record_t **drp, *dr; + int drop_struct_lock = FALSE; + int txgoff = tx->tx_txg & TXG_MASK; + + ASSERT(tx->tx_txg != 0); + ASSERT(!refcount_is_zero(&db->db_holds)); + DMU_TX_DIRTY_BUF(tx, db); + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + /* + * Shouldn't dirty a regular buffer in syncing context. Private + * objects may be dirtied in syncing context, but only if they + * were already pre-dirtied in open context. + */ +#ifdef DEBUG + if (dn->dn_objset->os_dsl_dataset != NULL) { + rrw_enter(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, + RW_READER, FTAG); + } + ASSERT(!dmu_tx_is_syncing(tx) || + BP_IS_HOLE(dn->dn_objset->os_rootbp) || + DMU_OBJECT_IS_SPECIAL(dn->dn_object) || + dn->dn_objset->os_dsl_dataset == NULL); + if (dn->dn_objset->os_dsl_dataset != NULL) + rrw_exit(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, FTAG); +#endif + /* + * We make this assert for private objects as well, but after we + * check if we're already dirty. They are allowed to re-dirty + * in syncing context. + */ + ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || + dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == + (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); + + mutex_enter(&db->db_mtx); + /* + * XXX make this true for indirects too? The problem is that + * transactions created with dmu_tx_create_assigned() from + * syncing context don't bother holding ahead. 
+ */ + ASSERT(db->db_level != 0 || + db->db_state == DB_CACHED || db->db_state == DB_FILL || + db->db_state == DB_NOFILL); + + mutex_enter(&dn->dn_mtx); + /* + * Don't set dirtyctx to SYNC if we're just modifying this as we + * initialize the objset. + */ + if (dn->dn_dirtyctx == DN_UNDIRTIED) { + if (dn->dn_objset->os_dsl_dataset != NULL) { + rrw_enter(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, + RW_READER, FTAG); + } + if (!BP_IS_HOLE(dn->dn_objset->os_rootbp)) { + dn->dn_dirtyctx = (dmu_tx_is_syncing(tx) ? + DN_DIRTY_SYNC : DN_DIRTY_OPEN); + ASSERT(dn->dn_dirtyctx_firstset == NULL); + dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP); + } + if (dn->dn_objset->os_dsl_dataset != NULL) { + rrw_exit(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, + FTAG); + } + } + mutex_exit(&dn->dn_mtx); + + if (db->db_blkid == DMU_SPILL_BLKID) + dn->dn_have_spill = B_TRUE; + + /* + * If this buffer is already dirty, we're done. + */ + drp = &db->db_last_dirty; + ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg || + db->db.db_object == DMU_META_DNODE_OBJECT); + while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg) + drp = &dr->dr_next; + if (dr && dr->dr_txg == tx->tx_txg) { + DB_DNODE_EXIT(db); + + dbuf_redirty(dr); + mutex_exit(&db->db_mtx); + return (dr); + } + + /* + * Only valid if not already dirty. + */ + ASSERT(dn->dn_object == 0 || + dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == + (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); + + ASSERT3U(dn->dn_nlevels, >, db->db_level); + + /* + * We should only be dirtying in syncing context if it's the + * mos or we're initializing the os or it's a special object. + * However, we are allowed to dirty in syncing context provided + * we already dirtied it in open context. Hence we must make + * this assertion only if we're not already dirty. + */ + os = dn->dn_objset; + VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(os->os_spa)); +#ifdef DEBUG + if (dn->dn_objset->os_dsl_dataset != NULL) + rrw_enter(&os->os_dsl_dataset->ds_bp_rwlock, RW_READER, FTAG); + ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) || + os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp)); + if (dn->dn_objset->os_dsl_dataset != NULL) + rrw_exit(&os->os_dsl_dataset->ds_bp_rwlock, FTAG); +#endif + ASSERT(db->db.db_size != 0); + + dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); + + if (db->db_blkid != DMU_BONUS_BLKID) { + dmu_objset_willuse_space(os, db->db.db_size, tx); + } + + /* + * If this buffer is dirty in an old transaction group we need + * to make a copy of it so that the changes we make in this + * transaction group won't leak out when we sync the older txg. + */ + dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP); + list_link_init(&dr->dr_dirty_node); + if (db->db_level == 0) { + void *data_old = db->db_buf; + + if (db->db_state != DB_NOFILL) { + if (db->db_blkid == DMU_BONUS_BLKID) { + dbuf_fix_old_data(db, tx->tx_txg); + data_old = db->db.db_data; + } else if (db->db.db_object != DMU_META_DNODE_OBJECT) { + /* + * Release the data buffer from the cache so + * that we can modify it without impacting + * possible other users of this cached data + * block. Note that indirect blocks and + * private objects are not released until the + * syncing state (since they are only modified + * then). 
+ */ + arc_release(db->db_buf, db); + dbuf_fix_old_data(db, tx->tx_txg); + data_old = db->db_buf; + } + ASSERT(data_old != NULL); + } + dr->dt.dl.dr_data = data_old; + } else { + mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL); + list_create(&dr->dt.di.dr_children, + sizeof (dbuf_dirty_record_t), + offsetof(dbuf_dirty_record_t, dr_dirty_node)); + } + if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL) + dr->dr_accounted = db->db.db_size; + dr->dr_dbuf = db; + dr->dr_txg = tx->tx_txg; + dr->dr_next = *drp; + *drp = dr; + + /* + * We could have been freed_in_flight between the dbuf_noread + * and dbuf_dirty. We win, as though the dbuf_noread() had + * happened after the free. + */ + if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && + db->db_blkid != DMU_SPILL_BLKID) { + mutex_enter(&dn->dn_mtx); + if (dn->dn_free_ranges[txgoff] != NULL) { + range_tree_clear(dn->dn_free_ranges[txgoff], + db->db_blkid, 1); + } + mutex_exit(&dn->dn_mtx); + db->db_freed_in_flight = FALSE; + } + + /* + * This buffer is now part of this txg + */ + dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg); + db->db_dirtycnt += 1; + ASSERT3U(db->db_dirtycnt, <=, 3); + + mutex_exit(&db->db_mtx); + + if (db->db_blkid == DMU_BONUS_BLKID || + db->db_blkid == DMU_SPILL_BLKID) { + mutex_enter(&dn->dn_mtx); + ASSERT(!list_link_active(&dr->dr_dirty_node)); + list_insert_tail(&dn->dn_dirty_records[txgoff], dr); + mutex_exit(&dn->dn_mtx); + dnode_setdirty(dn, tx); + DB_DNODE_EXIT(db); + return (dr); + } + + /* + * The dn_struct_rwlock prevents db_blkptr from changing + * due to a write from syncing context completing + * while we are running, so we want to acquire it before + * looking at db_blkptr. + */ + if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { + rw_enter(&dn->dn_struct_rwlock, RW_READER); + drop_struct_lock = TRUE; + } + + /* + * We need to hold the dn_struct_rwlock to make this assertion, + * because it protects dn_phys / dn_next_nlevels from changing. + */ + ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) || + dn->dn_phys->dn_nlevels > db->db_level || + dn->dn_next_nlevels[txgoff] > db->db_level || + dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level || + dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level); + + /* + * If we are overwriting a dedup BP, then unless it is snapshotted, + * when we get to syncing context we will need to decrement its + * refcount in the DDT. Prefetch the relevant DDT block so that + * syncing context won't have to wait for the i/o. + */ + ddt_prefetch(os->os_spa, db->db_blkptr); + + if (db->db_level == 0) { + dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock); + ASSERT(dn->dn_maxblkid >= db->db_blkid); + } + + if (db->db_level+1 < dn->dn_nlevels) { + dmu_buf_impl_t *parent = db->db_parent; + dbuf_dirty_record_t *di; + int parent_held = FALSE; + + if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) { + int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + + parent = dbuf_hold_level(dn, db->db_level+1, + db->db_blkid >> epbs, FTAG); + ASSERT(parent != NULL); + parent_held = TRUE; + } + if (drop_struct_lock) + rw_exit(&dn->dn_struct_rwlock); + ASSERT3U(db->db_level+1, ==, parent->db_level); + di = dbuf_dirty(parent, tx); + if (parent_held) + dbuf_rele(parent, FTAG); + + mutex_enter(&db->db_mtx); + /* + * Since we've dropped the mutex, it's possible that + * dbuf_undirty() might have changed this out from under us. 
+ */ + if (db->db_last_dirty == dr || + dn->dn_object == DMU_META_DNODE_OBJECT) { + mutex_enter(&di->dt.di.dr_mtx); + ASSERT3U(di->dr_txg, ==, tx->tx_txg); + ASSERT(!list_link_active(&dr->dr_dirty_node)); + list_insert_tail(&di->dt.di.dr_children, dr); + mutex_exit(&di->dt.di.dr_mtx); + dr->dr_parent = di; + } + mutex_exit(&db->db_mtx); + } else { + ASSERT(db->db_level+1 == dn->dn_nlevels); + ASSERT(db->db_blkid < dn->dn_nblkptr); + ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf); + mutex_enter(&dn->dn_mtx); + ASSERT(!list_link_active(&dr->dr_dirty_node)); + list_insert_tail(&dn->dn_dirty_records[txgoff], dr); + mutex_exit(&dn->dn_mtx); + if (drop_struct_lock) + rw_exit(&dn->dn_struct_rwlock); + } + + dnode_setdirty(dn, tx); + DB_DNODE_EXIT(db); + return (dr); +} + +/* + * Undirty a buffer in the transaction group referenced by the given + * transaction. Return whether this evicted the dbuf. + */ +static boolean_t +dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) +{ + dnode_t *dn; + uint64_t txg = tx->tx_txg; + dbuf_dirty_record_t *dr, **drp; + + ASSERT(txg != 0); + + /* + * Due to our use of dn_nlevels below, this can only be called + * in open context, unless we are operating on the MOS. + * From syncing context, dn_nlevels may be different from the + * dn_nlevels used when dbuf was dirtied. + */ + ASSERT(db->db_objset == + dmu_objset_pool(db->db_objset)->dp_meta_objset || + txg != spa_syncing_txg(dmu_objset_spa(db->db_objset))); + ASSERT(db->db_blkid != DMU_BONUS_BLKID); + ASSERT0(db->db_level); + ASSERT(MUTEX_HELD(&db->db_mtx)); + + /* + * If this buffer is not dirty, we're done. + */ + for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next) + if (dr->dr_txg <= txg) + break; + if (dr == NULL || dr->dr_txg < txg) + return (B_FALSE); + ASSERT(dr->dr_txg == txg); + ASSERT(dr->dr_dbuf == db); + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + + dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); + + ASSERT(db->db.db_size != 0); + + dsl_pool_undirty_space(dmu_objset_pool(dn->dn_objset), + dr->dr_accounted, txg); + + *drp = dr->dr_next; + + /* + * Note that there are three places in dbuf_dirty() + * where this dirty record may be put on a list. + * Make sure to do a list_remove corresponding to + * every one of those list_insert calls. 
+ */
+	if (dr->dr_parent) {
+		mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
+		list_remove(&dr->dr_parent->dt.di.dr_children, dr);
+		mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
+	} else if (db->db_blkid == DMU_SPILL_BLKID ||
+	    db->db_level + 1 == dn->dn_nlevels) {
+		ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
+		mutex_enter(&dn->dn_mtx);
+		list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
+		mutex_exit(&dn->dn_mtx);
+	}
+	DB_DNODE_EXIT(db);
+
+	if (db->db_state != DB_NOFILL) {
+		dbuf_unoverride(dr);
+
+		ASSERT(db->db_buf != NULL);
+		ASSERT(dr->dt.dl.dr_data != NULL);
+		if (dr->dt.dl.dr_data != db->db_buf)
+			arc_buf_destroy(dr->dt.dl.dr_data, db);
+	}
+
+	kmem_free(dr, sizeof (dbuf_dirty_record_t));
+
+	ASSERT(db->db_dirtycnt > 0);
+	db->db_dirtycnt -= 1;
+
+	if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
+		ASSERT(db->db_state == DB_NOFILL || arc_released(db->db_buf));
+		dbuf_destroy(db);
+		return (B_TRUE);
+	}
+
+	return (B_FALSE);
+}
+
+void
+dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
+{
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+	int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;
+
+	ASSERT(tx->tx_txg != 0);
+	ASSERT(!refcount_is_zero(&db->db_holds));
+
+	/*
+	 * Quick check for dirtiness. For already dirty blocks, this
+	 * reduces runtime of this function by >90%, and overall performance
+	 * by 50% for some workloads (e.g. file deletion with indirect blocks
+	 * cached).
+	 */
+	mutex_enter(&db->db_mtx);
+	dbuf_dirty_record_t *dr;
+	for (dr = db->db_last_dirty;
+	    dr != NULL && dr->dr_txg >= tx->tx_txg; dr = dr->dr_next) {
+		/*
+		 * It's possible that it is already dirty but not cached,
+		 * because there are some calls to dbuf_dirty() that don't
+		 * go through dmu_buf_will_dirty().
+		 */
+		if (dr->dr_txg == tx->tx_txg && db->db_state == DB_CACHED) {
+			/* This dbuf is already dirty and cached. */
+			dbuf_redirty(dr);
+			mutex_exit(&db->db_mtx);
+			return;
+		}
+	}
+	mutex_exit(&db->db_mtx);
+
+	DB_DNODE_ENTER(db);
+	if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
+		rf |= DB_RF_HAVESTRUCT;
+	DB_DNODE_EXIT(db);
+	(void) dbuf_read(db, NULL, rf);
+	(void) dbuf_dirty(db, tx);
+}
+
+void
+dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
+{
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+	db->db_state = DB_NOFILL;
+
+	dmu_buf_will_fill(db_fake, tx);
+}
+
+void
+dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
+{
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+	ASSERT(tx->tx_txg != 0);
+	ASSERT(db->db_level == 0);
+	ASSERT(!refcount_is_zero(&db->db_holds));
+
+	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
+	    dmu_tx_private_ok(tx));
+
+	dbuf_noread(db);
+	(void) dbuf_dirty(db, tx);
+}
+
+#pragma weak dmu_buf_fill_done = dbuf_fill_done
+/* ARGSUSED */
+void
+dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
+{
+	mutex_enter(&db->db_mtx);
+	DBUF_VERIFY(db);
+
+	if (db->db_state == DB_FILL) {
+		if (db->db_level == 0 && db->db_freed_in_flight) {
+			ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+			/* we were freed while filling */
+			/* XXX dbuf_undirty?
*/ + bzero(db->db.db_data, db->db.db_size); + db->db_freed_in_flight = FALSE; + } + db->db_state = DB_CACHED; + cv_broadcast(&db->db_changed); + } + mutex_exit(&db->db_mtx); +} + +void +dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data, + bp_embedded_type_t etype, enum zio_compress comp, + int uncompressed_size, int compressed_size, int byteorder, + dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; + struct dirty_leaf *dl; + dmu_object_type_t type; + + if (etype == BP_EMBEDDED_TYPE_DATA) { + ASSERT(spa_feature_is_active(dmu_objset_spa(db->db_objset), + SPA_FEATURE_EMBEDDED_DATA)); + } + + DB_DNODE_ENTER(db); + type = DB_DNODE(db)->dn_type; + DB_DNODE_EXIT(db); + + ASSERT0(db->db_level); + ASSERT(db->db_blkid != DMU_BONUS_BLKID); + + dmu_buf_will_not_fill(dbuf, tx); + + ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg); + dl = &db->db_last_dirty->dt.dl; + encode_embedded_bp_compressed(&dl->dr_overridden_by, + data, comp, uncompressed_size, compressed_size); + BPE_SET_ETYPE(&dl->dr_overridden_by, etype); + BP_SET_TYPE(&dl->dr_overridden_by, type); + BP_SET_LEVEL(&dl->dr_overridden_by, 0); + BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder); + + dl->dr_override_state = DR_OVERRIDDEN; + dl->dr_overridden_by.blk_birth = db->db_last_dirty->dr_txg; +} + +/* + * Directly assign a provided arc buf to a given dbuf if it's not referenced + * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf. + */ +void +dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) +{ + ASSERT(!refcount_is_zero(&db->db_holds)); + ASSERT(db->db_blkid != DMU_BONUS_BLKID); + ASSERT(db->db_level == 0); + ASSERT3U(dbuf_is_metadata(db), ==, arc_is_metadata(buf)); + ASSERT(buf != NULL); + ASSERT(arc_buf_lsize(buf) == db->db.db_size); + ASSERT(tx->tx_txg != 0); + + arc_return_buf(buf, db); + ASSERT(arc_released(buf)); + + mutex_enter(&db->db_mtx); + + while (db->db_state == DB_READ || db->db_state == DB_FILL) + cv_wait(&db->db_changed, &db->db_mtx); + + ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED); + + if (db->db_state == DB_CACHED && + refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) { + mutex_exit(&db->db_mtx); + (void) dbuf_dirty(db, tx); + bcopy(buf->b_data, db->db.db_data, db->db.db_size); + arc_buf_destroy(buf, db); + xuio_stat_wbuf_copied(); + return; + } + + xuio_stat_wbuf_nocopy(); + if (db->db_state == DB_CACHED) { + dbuf_dirty_record_t *dr = db->db_last_dirty; + + ASSERT(db->db_buf != NULL); + if (dr != NULL && dr->dr_txg == tx->tx_txg) { + ASSERT(dr->dt.dl.dr_data == db->db_buf); + if (!arc_released(db->db_buf)) { + ASSERT(dr->dt.dl.dr_override_state == + DR_OVERRIDDEN); + arc_release(db->db_buf, db); + } + dr->dt.dl.dr_data = buf; + arc_buf_destroy(db->db_buf, db); + } else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) { + arc_release(db->db_buf, db); + arc_buf_destroy(db->db_buf, db); + } + db->db_buf = NULL; + } + ASSERT(db->db_buf == NULL); + dbuf_set_data(db, buf); + db->db_state = DB_FILL; + mutex_exit(&db->db_mtx); + (void) dbuf_dirty(db, tx); + dmu_buf_fill_done(&db->db, tx); +} + +void +dbuf_destroy(dmu_buf_impl_t *db) +{ + dnode_t *dn; + dmu_buf_impl_t *parent = db->db_parent; + dmu_buf_impl_t *dndb; + + ASSERT(MUTEX_HELD(&db->db_mtx)); + ASSERT(refcount_is_zero(&db->db_holds)); + + if (db->db_buf != NULL) { + arc_buf_destroy(db->db_buf, db); + db->db_buf = NULL; + } + + if (db->db_blkid == DMU_BONUS_BLKID) { + int slots = DB_DNODE(db)->dn_num_slots; + int bonuslen = DN_SLOTS_TO_BONUSLEN(slots); + if (db->db.db_data != NULL) { + 
zio_buf_free(db->db.db_data, bonuslen); + arc_space_return(bonuslen, ARC_SPACE_BONUS); + db->db_state = DB_UNCACHED; + } + } + + dbuf_clear_data(db); + + if (multilist_link_active(&db->db_cache_link)) { + ASSERT(db->db_caching_status == DB_DBUF_CACHE || + db->db_caching_status == DB_DBUF_METADATA_CACHE); + + multilist_remove(dbuf_caches[db->db_caching_status].cache, db); + (void) refcount_remove_many( + &dbuf_caches[db->db_caching_status].size, + db->db.db_size, db); + + if (db->db_caching_status == DB_DBUF_METADATA_CACHE) { + DBUF_STAT_BUMPDOWN(metadata_cache_count); + } else { + DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]); + DBUF_STAT_BUMPDOWN(cache_count); + DBUF_STAT_DECR(cache_levels_bytes[db->db_level], + db->db.db_size); + } + db->db_caching_status = DB_NO_CACHE; + } + + ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); + ASSERT(db->db_data_pending == NULL); + + db->db_state = DB_EVICTING; + db->db_blkptr = NULL; + + /* + * Now that db_state is DB_EVICTING, nobody else can find this via + * the hash table. We can now drop db_mtx, which allows us to + * acquire the dn_dbufs_mtx. + */ + mutex_exit(&db->db_mtx); + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + dndb = dn->dn_dbuf; + if (db->db_blkid != DMU_BONUS_BLKID) { + boolean_t needlock = !MUTEX_HELD(&dn->dn_dbufs_mtx); + if (needlock) + mutex_enter(&dn->dn_dbufs_mtx); + avl_remove(&dn->dn_dbufs, db); + atomic_dec_32(&dn->dn_dbufs_count); + membar_producer(); + DB_DNODE_EXIT(db); + if (needlock) + mutex_exit(&dn->dn_dbufs_mtx); + /* + * Decrementing the dbuf count means that the hold corresponding + * to the removed dbuf is no longer discounted in dnode_move(), + * so the dnode cannot be moved until after we release the hold. + * The membar_producer() ensures visibility of the decremented + * value in dnode_move(), since DB_DNODE_EXIT doesn't actually + * release any lock. + */ + mutex_enter(&dn->dn_mtx); + dnode_rele_and_unlock(dn, db, B_TRUE); + db->db_dnode_handle = NULL; + + dbuf_hash_remove(db); + } else { + DB_DNODE_EXIT(db); + } + + ASSERT(refcount_is_zero(&db->db_holds)); + + db->db_parent = NULL; + + ASSERT(db->db_buf == NULL); + ASSERT(db->db.db_data == NULL); + ASSERT(db->db_hash_next == NULL); + ASSERT(db->db_blkptr == NULL); + ASSERT(db->db_data_pending == NULL); + ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE); + ASSERT(!multilist_link_active(&db->db_cache_link)); + + kmem_cache_free(dbuf_kmem_cache, db); + arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF); + + /* + * If this dbuf is referenced from an indirect dbuf, + * decrement the ref count on the indirect dbuf. + */ + if (parent && parent != dndb) { + mutex_enter(&parent->db_mtx); + dbuf_rele_and_unlock(parent, db, B_TRUE); + } +} + +/* + * Note: While bpp will always be updated if the function returns success, + * parentp will not be updated if the dnode does not have dn_dbuf filled in; + * this happens when the dnode is the meta-dnode, or a userused or groupused + * object. 
+ */ +__attribute__((always_inline)) +static inline int +dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, + dmu_buf_impl_t **parentp, blkptr_t **bpp, struct dbuf_hold_impl_data *dh) +{ + *parentp = NULL; + *bpp = NULL; + + ASSERT(blkid != DMU_BONUS_BLKID); + + if (blkid == DMU_SPILL_BLKID) { + mutex_enter(&dn->dn_mtx); + if (dn->dn_have_spill && + (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) + *bpp = DN_SPILL_BLKPTR(dn->dn_phys); + else + *bpp = NULL; + dbuf_add_ref(dn->dn_dbuf, NULL); + *parentp = dn->dn_dbuf; + mutex_exit(&dn->dn_mtx); + return (0); + } + + int nlevels = + (dn->dn_phys->dn_nlevels == 0) ? 1 : dn->dn_phys->dn_nlevels; + int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + + ASSERT3U(level * epbs, <, 64); + ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); + /* + * This assertion shouldn't trip as long as the max indirect block size + * is less than 1M. The reason for this is that up to that point, + * the number of levels required to address an entire object with blocks + * of size SPA_MINBLOCKSIZE satisfies nlevels * epbs + 1 <= 64. In + * other words, if N * epbs + 1 > 64, then if (N-1) * epbs + 1 > 55 + * (i.e. we can address the entire object), objects will all use at most + * N-1 levels and the assertion won't overflow. However, once epbs is + * 13, 4 * 13 + 1 = 53, but 5 * 13 + 1 = 66. Then, 4 levels will not be + * enough to address an entire object, so objects will have 5 levels, + * but then this assertion will overflow. + * + * All this is to say that if we ever increase DN_MAX_INDBLKSHIFT, we + * need to redo this logic to handle overflows. + */ + ASSERT(level >= nlevels || + ((nlevels - level - 1) * epbs) + + highbit64(dn->dn_phys->dn_nblkptr) <= 64); + if (level >= nlevels || + blkid >= ((uint64_t)dn->dn_phys->dn_nblkptr << + ((nlevels - level - 1) * epbs)) || + (fail_sparse && + blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) { + /* the buffer has no parent yet */ + return (SET_ERROR(ENOENT)); + } else if (level < nlevels-1) { + /* this block is referenced from an indirect block */ + int err; + if (dh == NULL) { + err = dbuf_hold_impl(dn, level+1, + blkid >> epbs, fail_sparse, FALSE, NULL, parentp); + } else { + __dbuf_hold_impl_init(dh + 1, dn, dh->dh_level + 1, + blkid >> epbs, fail_sparse, FALSE, NULL, + parentp, dh->dh_depth + 1); + err = __dbuf_hold_impl(dh + 1); + } + if (err) + return (err); + err = dbuf_read(*parentp, NULL, + (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL)); + if (err) { + dbuf_rele(*parentp, NULL); + *parentp = NULL; + return (err); + } + *bpp = ((blkptr_t *)(*parentp)->db.db_data) + + (blkid & ((1ULL << epbs) - 1)); + if (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs))) + ASSERT(BP_IS_HOLE(*bpp)); + return (0); + } else { + /* the block is referenced from the dnode */ + ASSERT3U(level, ==, nlevels-1); + ASSERT(dn->dn_phys->dn_nblkptr == 0 || + blkid < dn->dn_phys->dn_nblkptr); + if (dn->dn_dbuf) { + dbuf_add_ref(dn->dn_dbuf, NULL); + *parentp = dn->dn_dbuf; + } + *bpp = &dn->dn_phys->dn_blkptr[blkid]; + return (0); + } +} + +static dmu_buf_impl_t * +dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, + dmu_buf_impl_t *parent, blkptr_t *blkptr) +{ + objset_t *os = dn->dn_objset; + dmu_buf_impl_t *db, *odb; + + ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); + ASSERT(dn->dn_type != DMU_OT_NONE); + + db = kmem_cache_alloc(dbuf_kmem_cache, KM_SLEEP); + + db->db_objset = os; + db->db.db_object = dn->dn_object; + db->db_level = level; + db->db_blkid = blkid; + db->db_last_dirty = NULL; + 
db->db_dirtycnt = 0;
+	db->db_dnode_handle = dn->dn_handle;
+	db->db_parent = parent;
+	db->db_blkptr = blkptr;
+
+	db->db_user = NULL;
+	db->db_user_immediate_evict = FALSE;
+	db->db_freed_in_flight = FALSE;
+	db->db_pending_evict = FALSE;
+
+	if (blkid == DMU_BONUS_BLKID) {
+		ASSERT3P(parent, ==, dn->dn_dbuf);
+		db->db.db_size = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) -
+		    (dn->dn_nblkptr-1) * sizeof (blkptr_t);
+		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
+		db->db.db_offset = DMU_BONUS_BLKID;
+		db->db_state = DB_UNCACHED;
+		db->db_caching_status = DB_NO_CACHE;
+		/* the bonus dbuf is not placed in the hash table */
+		arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF);
+		return (db);
+	} else if (blkid == DMU_SPILL_BLKID) {
+		db->db.db_size = (blkptr != NULL) ?
+		    BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
+		db->db.db_offset = 0;
+	} else {
+		int blocksize =
+		    db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz;
+		db->db.db_size = blocksize;
+		db->db.db_offset = db->db_blkid * blocksize;
+	}
+
+	/*
+	 * Hold the dn_dbufs_mtx while we get the new dbuf
+	 * in the hash table *and* add it to the dbufs list.
+	 * This prevents a possible deadlock with someone
+	 * trying to look up this dbuf before it's added to the
+	 * dn_dbufs list.
+	 */
+	mutex_enter(&dn->dn_dbufs_mtx);
+	db->db_state = DB_EVICTING;
+	if ((odb = dbuf_hash_insert(db)) != NULL) {
+		/* someone else inserted it first */
+		kmem_cache_free(dbuf_kmem_cache, db);
+		mutex_exit(&dn->dn_dbufs_mtx);
+		DBUF_STAT_BUMP(hash_insert_race);
+		return (odb);
+	}
+	avl_add(&dn->dn_dbufs, db);
+
+	db->db_state = DB_UNCACHED;
+	db->db_caching_status = DB_NO_CACHE;
+	mutex_exit(&dn->dn_dbufs_mtx);
+	arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF);
+
+	if (parent && parent != dn->dn_dbuf)
+		dbuf_add_ref(parent, db);
+
+	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
+	    refcount_count(&dn->dn_holds) > 0);
+	(void) refcount_add(&dn->dn_holds, db);
+	atomic_inc_32(&dn->dn_dbufs_count);
+
+	dprintf_dbuf(db, "db=%p\n", db);
+
+	return (db);
+}
+
+typedef struct dbuf_prefetch_arg {
+	spa_t *dpa_spa;	/* The spa to issue the prefetch in. */
+	zbookmark_phys_t dpa_zb; /* The target block to prefetch. */
+	int dpa_epbs;	/* Entries (blkptr_t's) Per Block Shift. */
+	int dpa_curlevel; /* The current level that we're reading */
+	dnode_t *dpa_dnode; /* The dnode associated with the prefetch */
+	zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */
+	zio_t *dpa_zio;	/* The parent zio_t for all prefetches. */
+	arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. */
+} dbuf_prefetch_arg_t;
+
+/*
+ * Actually issue the prefetch read for the block given.
+ */
+static void
+dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp)
+{
+	if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
+		return;
+
+	arc_flags_t aflags =
+	    dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
+
+	ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
+	ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level);
+	ASSERT(dpa->dpa_zio != NULL);
+	(void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, NULL, NULL,
+	    dpa->dpa_prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
+	    &aflags, &dpa->dpa_zb);
+}
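+
+/*
+ * Editorial sketch (not part of this import, excluded from compilation):
+ * the blkid arithmetic used by the indirect-prefetch callback below. The
+ * level-0 target blkid shifted right by epbs once per level climbed gives
+ * the blkid of the covering indirect block at that level. With epbs = 10
+ * (128K indirect blocks holding 1024 128-byte blkptrs), level-0 blkid
+ * 1234567 is covered by level-1 blkid 1205 and level-2 blkid 1. The
+ * ex_-prefixed name is hypothetical.
+ */
+#if 0
+#include <stdint.h>
+
+/* Same shift as the curblkid/nextblkid computations below. */
+static uint64_t
+ex_blkid_at_level(uint64_t blkid0, int epbs, int level)
+{
+	return (blkid0 >> (epbs * level));
+}
+#endif
+
+/*
+ * Called when an indirect block above our prefetch target is read in. This
+ * will either read in the next indirect block down the tree or issue the
+ * actual prefetch if the next block down is our target.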
+ */ +static void +dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb, + const blkptr_t *iobp, arc_buf_t *abuf, void *private) +{ + dbuf_prefetch_arg_t *dpa = private; + + ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel); + ASSERT3S(dpa->dpa_curlevel, >, 0); + + if (abuf == NULL) { + ASSERT(zio == NULL || zio->io_error != 0); + kmem_free(dpa, sizeof (*dpa)); + return; + } + ASSERT(zio == NULL || zio->io_error == 0); + + /* + * The dpa_dnode is only valid if we are called with a NULL + * zio. This indicates that the arc_read() returned without + * first calling zio_read() to issue a physical read. Once + * a physical read is made the dpa_dnode must be invalidated + * as the locks guarding it may have been dropped. If the + * dpa_dnode is still valid, then we want to add it to the dbuf + * cache. To do so, we must hold the dbuf associated with the block + * we just prefetched, read its contents so that we associate it + * with an arc_buf_t, and then release it. + */ + if (zio != NULL) { + ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel); + if (zio->io_flags & ZIO_FLAG_RAW) { + ASSERT3U(BP_GET_PSIZE(zio->io_bp), ==, zio->io_size); + } else { + ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size); + } + ASSERT3P(zio->io_spa, ==, dpa->dpa_spa); + + dpa->dpa_dnode = NULL; + } else if (dpa->dpa_dnode != NULL) { + uint64_t curblkid = dpa->dpa_zb.zb_blkid >> + (dpa->dpa_epbs * (dpa->dpa_curlevel - + dpa->dpa_zb.zb_level)); + dmu_buf_impl_t *db = dbuf_hold_level(dpa->dpa_dnode, + dpa->dpa_curlevel, curblkid, FTAG); + (void) dbuf_read(db, NULL, + DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_HAVESTRUCT); + dbuf_rele(db, FTAG); + } + + dpa->dpa_curlevel--; + + uint64_t nextblkid = dpa->dpa_zb.zb_blkid >> + (dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level)); + blkptr_t *bp = ((blkptr_t *)abuf->b_data) + + P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs); + if (BP_IS_HOLE(bp)) { + kmem_free(dpa, sizeof (*dpa)); + } else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) { + ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid); + dbuf_issue_final_prefetch(dpa, bp); + kmem_free(dpa, sizeof (*dpa)); + } else { + arc_flags_t iter_aflags = ARC_FLAG_NOWAIT; + zbookmark_phys_t zb; + + /* flag if L2ARC eligible, l2arc_noprefetch then decides */ + if (dpa->dpa_aflags & ARC_FLAG_L2CACHE) + iter_aflags |= ARC_FLAG_L2CACHE; + + ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp)); + + SET_BOOKMARK(&zb, dpa->dpa_zb.zb_objset, + dpa->dpa_zb.zb_object, dpa->dpa_curlevel, nextblkid); + + (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, + bp, dbuf_prefetch_indirect_done, dpa, dpa->dpa_prio, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, + &iter_aflags, &zb); + } + + arc_buf_destroy(abuf, private); +} + +/* + * Issue prefetch reads for the given block on the given level. If the indirect + * blocks above that block are not in memory, we will read them in + * asynchronously. As a result, this call never blocks waiting for a read to + * complete. + */ +void +dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio, + arc_flags_t aflags) +{ + blkptr_t bp; + int epbs, nlevels, curlevel; + uint64_t curblkid; + + ASSERT(blkid != DMU_BONUS_BLKID); + ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); + + if (blkid > dn->dn_maxblkid) + return; + + if (dnode_block_freed(dn, blkid)) + return; + + /* + * This dnode hasn't been written to disk yet, so there's nothing to + * prefetch.
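+ * (Editorial note: a dnode that has never been synced still has + * dn_phys->dn_nlevels == 0 and dn_phys->dn_nblkptr == 0, so the + * level >= nlevels and dn_nblkptr == 0 tests below short-circuit + * that case.)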
+ */ + nlevels = dn->dn_phys->dn_nlevels; + if (level >= nlevels || dn->dn_phys->dn_nblkptr == 0) + return; + + epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; + if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level)) + return; + + dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object, + level, blkid); + if (db != NULL) { + mutex_exit(&db->db_mtx); + /* + * This dbuf already exists. It is either CACHED, or + * (we assume) about to be read or filled. + */ + return; + } + + /* + * Find the closest ancestor (indirect block) of the target block + * that is present in the cache. In this indirect block, we will + * find the bp that is at curlevel, curblkid. + */ + curlevel = level; + curblkid = blkid; + while (curlevel < nlevels - 1) { + int parent_level = curlevel + 1; + uint64_t parent_blkid = curblkid >> epbs; + dmu_buf_impl_t *db; + + if (dbuf_hold_impl(dn, parent_level, parent_blkid, + FALSE, TRUE, FTAG, &db) == 0) { + blkptr_t *bpp = db->db_buf->b_data; + bp = bpp[P2PHASE(curblkid, 1 << epbs)]; + dbuf_rele(db, FTAG); + break; + } + + curlevel = parent_level; + curblkid = parent_blkid; + } + + if (curlevel == nlevels - 1) { + /* No cached indirect blocks found. */ + ASSERT3U(curblkid, <, dn->dn_phys->dn_nblkptr); + bp = dn->dn_phys->dn_blkptr[curblkid]; + } + if (BP_IS_HOLE(&bp)) + return; + + ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp)); + + zio_t *pio = zio_root(dmu_objset_spa(dn->dn_objset), NULL, NULL, + ZIO_FLAG_CANFAIL); + + dbuf_prefetch_arg_t *dpa = kmem_zalloc(sizeof (*dpa), KM_SLEEP); + dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; + SET_BOOKMARK(&dpa->dpa_zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET, + dn->dn_object, level, blkid); + dpa->dpa_curlevel = curlevel; + dpa->dpa_prio = prio; + dpa->dpa_aflags = aflags; + dpa->dpa_spa = dn->dn_objset->os_spa; + dpa->dpa_dnode = dn; + dpa->dpa_epbs = epbs; + dpa->dpa_zio = pio; + + /* flag if L2ARC eligible, l2arc_noprefetch then decides */ + if (DNODE_LEVEL_IS_L2CACHEABLE(dn, level)) + dpa->dpa_aflags |= ARC_FLAG_L2CACHE; + + /* + * If we have the indirect just above us, no need to do the asynchronous + * prefetch chain; we'll just run the last step ourselves. If we're at + * a higher level, though, we want to issue the prefetches for all the + * indirect blocks asynchronously, so we can go on with whatever we were + * doing. + */ + if (curlevel == level) { + ASSERT3U(curblkid, ==, blkid); + dbuf_issue_final_prefetch(dpa, &bp); + kmem_free(dpa, sizeof (*dpa)); + } else { + arc_flags_t iter_aflags = ARC_FLAG_NOWAIT; + zbookmark_phys_t zb; + + /* flag if L2ARC eligible, l2arc_noprefetch then decides */ + if (DNODE_LEVEL_IS_L2CACHEABLE(dn, level)) + iter_aflags |= ARC_FLAG_L2CACHE; + + SET_BOOKMARK(&zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET, + dn->dn_object, curlevel, curblkid); + (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, + &bp, dbuf_prefetch_indirect_done, dpa, prio, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, + &iter_aflags, &zb); + } + /* + * We use pio here instead of dpa_zio since it's possible that + * dpa may have already been freed. + */ + zio_nowait(pio); +} + +#define DBUF_HOLD_IMPL_MAX_DEPTH 20 + +/* + * Helper function for __dbuf_hold_impl() to copy a buffer. Handles + * the case of compressed and uncompressed buffers by allocating the + * new buffer with arc_alloc_compressed_buf() or arc_alloc_buf(), + * respectively. + * + * NOTE: Declared noinline to avoid stack bloat in __dbuf_hold_impl().
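+ * (With DBUF_HOLD_IMPL_MAX_DEPTH defined as 20 above, any stack this + * helper consumed inline would be paid once per recursion level of + * __dbuf_hold_impl(); keeping it out of line bounds the per-level frame.)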
+ */ +noinline static void +dbuf_hold_copy(struct dbuf_hold_impl_data *dh) +{ + dnode_t *dn = dh->dh_dn; + dmu_buf_impl_t *db = dh->dh_db; + dbuf_dirty_record_t *dr = dh->dh_dr; + arc_buf_t *data = dr->dt.dl.dr_data; + + enum zio_compress compress_type = arc_get_compression(data); + + if (compress_type != ZIO_COMPRESS_OFF) { + dbuf_set_data(db, arc_alloc_compressed_buf( + dn->dn_objset->os_spa, db, arc_buf_size(data), + arc_buf_lsize(data), compress_type)); + } else { + dbuf_set_data(db, arc_alloc_buf(dn->dn_objset->os_spa, db, + DBUF_GET_BUFC_TYPE(db), db->db.db_size)); + } + + bcopy(data->b_data, db->db.db_data, arc_buf_size(data)); +} + +/* + * Returns with db_holds incremented, and db_mtx not held. + * Note: dn_struct_rwlock must be held. + */ +static int +__dbuf_hold_impl(struct dbuf_hold_impl_data *dh) +{ + ASSERT3S(dh->dh_depth, <, DBUF_HOLD_IMPL_MAX_DEPTH); + dh->dh_parent = NULL; + + ASSERT(dh->dh_blkid != DMU_BONUS_BLKID); + ASSERT(RW_LOCK_HELD(&dh->dh_dn->dn_struct_rwlock)); + ASSERT3U(dh->dh_dn->dn_nlevels, >, dh->dh_level); + + *(dh->dh_dbp) = NULL; + + /* dbuf_find() returns with db_mtx held */ + dh->dh_db = dbuf_find(dh->dh_dn->dn_objset, dh->dh_dn->dn_object, + dh->dh_level, dh->dh_blkid); + + if (dh->dh_db == NULL) { + dh->dh_bp = NULL; + + if (dh->dh_fail_uncached) + return (SET_ERROR(ENOENT)); + + ASSERT3P(dh->dh_parent, ==, NULL); + dh->dh_err = dbuf_findbp(dh->dh_dn, dh->dh_level, dh->dh_blkid, + dh->dh_fail_sparse, &dh->dh_parent, &dh->dh_bp, dh); + if (dh->dh_fail_sparse) { + if (dh->dh_err == 0 && + dh->dh_bp && BP_IS_HOLE(dh->dh_bp)) + dh->dh_err = SET_ERROR(ENOENT); + if (dh->dh_err) { + if (dh->dh_parent) + dbuf_rele(dh->dh_parent, NULL); + return (dh->dh_err); + } + } + if (dh->dh_err && dh->dh_err != ENOENT) + return (dh->dh_err); + dh->dh_db = dbuf_create(dh->dh_dn, dh->dh_level, dh->dh_blkid, + dh->dh_parent, dh->dh_bp); + } + + if (dh->dh_fail_uncached && dh->dh_db->db_state != DB_CACHED) { + mutex_exit(&dh->dh_db->db_mtx); + return (SET_ERROR(ENOENT)); + } + + if (dh->dh_db->db_buf != NULL) { + arc_buf_access(dh->dh_db->db_buf); + ASSERT3P(dh->dh_db->db.db_data, ==, dh->dh_db->db_buf->b_data); + } + + ASSERT(dh->dh_db->db_buf == NULL || arc_referenced(dh->dh_db->db_buf)); + + /* + * If this buffer is currently syncing out, and we are + * still referencing it from db_data, we need to make a copy + * of it in case we decide we want to dirty it again in this txg.
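+ * (The copy is made by dbuf_hold_copy(); the test below fires only while + * the pending dirty record still shares its arc_buf with db_buf, i.e. + * dr->dt.dl.dr_data == db->db_buf.)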
+ */ + if (dh->dh_db->db_level == 0 && + dh->dh_db->db_blkid != DMU_BONUS_BLKID && + dh->dh_dn->dn_object != DMU_META_DNODE_OBJECT && + dh->dh_db->db_state == DB_CACHED && dh->dh_db->db_data_pending) { + dh->dh_dr = dh->dh_db->db_data_pending; + if (dh->dh_dr->dt.dl.dr_data == dh->dh_db->db_buf) + dbuf_hold_copy(dh); + } + + if (multilist_link_active(&dh->dh_db->db_cache_link)) { + ASSERT(refcount_is_zero(&dh->dh_db->db_holds)); + ASSERT(dh->dh_db->db_caching_status == DB_DBUF_CACHE || + dh->dh_db->db_caching_status == DB_DBUF_METADATA_CACHE); + + multilist_remove( + dbuf_caches[dh->dh_db->db_caching_status].cache, + dh->dh_db); + (void) refcount_remove_many( + &dbuf_caches[dh->dh_db->db_caching_status].size, + dh->dh_db->db.db_size, dh->dh_db); + + if (dh->dh_db->db_caching_status == DB_DBUF_METADATA_CACHE) { + DBUF_STAT_BUMPDOWN(metadata_cache_count); + } else { + DBUF_STAT_BUMPDOWN(cache_levels[dh->dh_db->db_level]); + DBUF_STAT_BUMPDOWN(cache_count); + DBUF_STAT_DECR(cache_levels_bytes[dh->dh_db->db_level], + dh->dh_db->db.db_size); + } + dh->dh_db->db_caching_status = DB_NO_CACHE; + } + (void) refcount_add(&dh->dh_db->db_holds, dh->dh_tag); + DBUF_VERIFY(dh->dh_db); + mutex_exit(&dh->dh_db->db_mtx); + + /* NOTE: we can't rele the parent until after we drop the db_mtx */ + if (dh->dh_parent) + dbuf_rele(dh->dh_parent, NULL); + + ASSERT3P(DB_DNODE(dh->dh_db), ==, dh->dh_dn); + ASSERT3U(dh->dh_db->db_blkid, ==, dh->dh_blkid); + ASSERT3U(dh->dh_db->db_level, ==, dh->dh_level); + *(dh->dh_dbp) = dh->dh_db; + + return (0); +} + +/* + * The following code preserves the recursive function dbuf_hold_impl() + * but moves the local variables AND function arguments to the heap to + * minimize the stack frame size. Enough space is initially allocated + * on the heap for 20 levels of recursion. + */ +int +dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, + boolean_t fail_sparse, boolean_t fail_uncached, + void *tag, dmu_buf_impl_t **dbp) +{ + struct dbuf_hold_impl_data *dh; + int error; + + dh = kmem_alloc(sizeof (struct dbuf_hold_impl_data) * + DBUF_HOLD_IMPL_MAX_DEPTH, KM_SLEEP); + __dbuf_hold_impl_init(dh, dn, level, blkid, fail_sparse, + fail_uncached, tag, dbp, 0); + + error = __dbuf_hold_impl(dh); + + kmem_free(dh, sizeof (struct dbuf_hold_impl_data) * + DBUF_HOLD_IMPL_MAX_DEPTH); + + return (error); +} + +static void +__dbuf_hold_impl_init(struct dbuf_hold_impl_data *dh, + dnode_t *dn, uint8_t level, uint64_t blkid, + boolean_t fail_sparse, boolean_t fail_uncached, + void *tag, dmu_buf_impl_t **dbp, int depth) +{ + dh->dh_dn = dn; + dh->dh_level = level; + dh->dh_blkid = blkid; + + dh->dh_fail_sparse = fail_sparse; + dh->dh_fail_uncached = fail_uncached; + + dh->dh_tag = tag; + dh->dh_dbp = dbp; + + dh->dh_db = NULL; + dh->dh_parent = NULL; + dh->dh_bp = NULL; + dh->dh_err = 0; + dh->dh_dr = NULL; + + dh->dh_depth = depth; +} + +dmu_buf_impl_t * +dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag) +{ + return (dbuf_hold_level(dn, 0, blkid, tag)); +} + +dmu_buf_impl_t * +dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag) +{ + dmu_buf_impl_t *db; + int err = dbuf_hold_impl(dn, level, blkid, FALSE, FALSE, tag, &db); + return (err ?
NULL : db); +} + +void +dbuf_create_bonus(dnode_t *dn) +{ + ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); + + ASSERT(dn->dn_bonus == NULL); + dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL); +} + +int +dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + dnode_t *dn; + + if (db->db_blkid != DMU_SPILL_BLKID) + return (SET_ERROR(ENOTSUP)); + if (blksz == 0) + blksz = SPA_MINBLOCKSIZE; + ASSERT3U(blksz, <=, spa_maxblocksize(dmu_objset_spa(db->db_objset))); + blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE); + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + dbuf_new_size(db, blksz, tx); + rw_exit(&dn->dn_struct_rwlock); + DB_DNODE_EXIT(db); + + return (0); +} + +void +dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx) +{ + dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx); +} + +#pragma weak dmu_buf_add_ref = dbuf_add_ref +void +dbuf_add_ref(dmu_buf_impl_t *db, void *tag) +{ + int64_t holds = refcount_add(&db->db_holds, tag); + ASSERT3S(holds, >, 1); +} + +#pragma weak dmu_buf_try_add_ref = dbuf_try_add_ref +boolean_t +dbuf_try_add_ref(dmu_buf_t *db_fake, objset_t *os, uint64_t obj, uint64_t blkid, + void *tag) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + dmu_buf_impl_t *found_db; + boolean_t result = B_FALSE; + + if (db->db_blkid == DMU_BONUS_BLKID) + found_db = dbuf_find_bonus(os, obj); + else + found_db = dbuf_find(os, obj, 0, blkid); + + if (found_db != NULL) { + if (db == found_db && dbuf_refcount(db) > db->db_dirtycnt) { + (void) refcount_add(&db->db_holds, tag); + result = B_TRUE; + } + mutex_exit(&db->db_mtx); + } + return (result); +} + +/* + * If you call dbuf_rele() you had better not be referencing the dnode handle + * unless you have some other direct or indirect hold on the dnode. (An indirect + * hold is a hold on one of the dnode's dbufs, including the bonus buffer.) + * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the + * dnode's parent dbuf evicting its dnode handles. + */ +void +dbuf_rele(dmu_buf_impl_t *db, void *tag) +{ + mutex_enter(&db->db_mtx); + dbuf_rele_and_unlock(db, tag, B_FALSE); +} + +void +dmu_buf_rele(dmu_buf_t *db, void *tag) +{ + dbuf_rele((dmu_buf_impl_t *)db, tag); +} + +/* + * dbuf_rele() for an already-locked dbuf. This is necessary to allow + * db_dirtycnt and db_holds to be updated atomically. The 'evicting' + * argument should be set if we are already in the dbuf-evicting code + * path, in which case we don't want to recursively evict. This allows us to + * avoid deeply nested stacks that would have a call flow similar to this: + * + * dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify() + * ^ | + * | | + * +-----dbuf_destroy()<--dbuf_evict_one()<--------+ + * + */ +void +dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag, boolean_t evicting) +{ + int64_t holds; + + ASSERT(MUTEX_HELD(&db->db_mtx)); + DBUF_VERIFY(db); + + /* + * Remove the reference to the dbuf before removing its hold on the + * dnode so we can guarantee in dnode_move() that a referenced bonus + * buffer has a corresponding dnode hold. + */ + holds = refcount_remove(&db->db_holds, tag); + ASSERT(holds >= 0); + + /* + * We can't freeze indirects if there is a possibility that they + * may be modified in the current syncing context. + */ + if (db->db_buf != NULL && + holds == (db->db_level == 0 ? 
db->db_dirtycnt : 0)) { + arc_buf_freeze(db->db_buf); + } + + if (holds == db->db_dirtycnt && + db->db_level == 0 && db->db_user_immediate_evict) + dbuf_evict_user(db); + + if (holds == 0) { + if (db->db_blkid == DMU_BONUS_BLKID) { + dnode_t *dn; + boolean_t evict_dbuf = db->db_pending_evict; + + /* + * If the dnode moves here, we cannot cross this + * barrier until the move completes. + */ + DB_DNODE_ENTER(db); + + dn = DB_DNODE(db); + atomic_dec_32(&dn->dn_dbufs_count); + + /* + * Decrementing the dbuf count means that the bonus + * buffer's dnode hold is no longer discounted in + * dnode_move(). The dnode cannot move until after + * the dnode_rele() below. + */ + DB_DNODE_EXIT(db); + + /* + * Do not reference db after its lock is dropped. + * Another thread may evict it. + */ + mutex_exit(&db->db_mtx); + + if (evict_dbuf) + dnode_evict_bonus(dn); + + dnode_rele(dn, db); + } else if (db->db_buf == NULL) { + /* + * This is a special case: we never associated this + * dbuf with any data allocated from the ARC. + */ + ASSERT(db->db_state == DB_UNCACHED || + db->db_state == DB_NOFILL); + dbuf_destroy(db); + } else if (arc_released(db->db_buf)) { + /* + * This dbuf has anonymous data associated with it. + */ + dbuf_destroy(db); + } else { + boolean_t do_arc_evict = B_FALSE; + blkptr_t bp; + spa_t *spa = dmu_objset_spa(db->db_objset); + + if (!DBUF_IS_CACHEABLE(db) && + db->db_blkptr != NULL && + !BP_IS_HOLE(db->db_blkptr) && + !BP_IS_EMBEDDED(db->db_blkptr)) { + do_arc_evict = B_TRUE; + bp = *db->db_blkptr; + } + + if (!DBUF_IS_CACHEABLE(db) || + db->db_pending_evict) { + dbuf_destroy(db); + } else if (!multilist_link_active(&db->db_cache_link)) { + ASSERT3U(db->db_caching_status, ==, + DB_NO_CACHE); + + dbuf_cached_state_t dcs = + dbuf_include_in_metadata_cache(db) ? 
+ DB_DBUF_METADATA_CACHE : DB_DBUF_CACHE; + db->db_caching_status = dcs; + + multilist_insert(dbuf_caches[dcs].cache, db); + (void) refcount_add_many(&dbuf_caches[dcs].size, + db->db.db_size, db); + + if (dcs == DB_DBUF_METADATA_CACHE) { + DBUF_STAT_BUMP(metadata_cache_count); + DBUF_STAT_MAX( + metadata_cache_size_bytes_max, + refcount_count( + &dbuf_caches[dcs].size)); + } else { + DBUF_STAT_BUMP( + cache_levels[db->db_level]); + DBUF_STAT_BUMP(cache_count); + DBUF_STAT_INCR( + cache_levels_bytes[db->db_level], + db->db.db_size); + DBUF_STAT_MAX(cache_size_bytes_max, + refcount_count( + &dbuf_caches[dcs].size)); + } + mutex_exit(&db->db_mtx); + + if (db->db_caching_status == DB_DBUF_CACHE && + !evicting) { + dbuf_evict_notify(); + } + } + + if (do_arc_evict) + arc_freed(spa, &bp); + } else { + mutex_exit(&db->db_mtx); + } + +} + +#pragma weak dmu_buf_refcount = dbuf_refcount +uint64_t +dbuf_refcount(dmu_buf_impl_t *db) +{ + return (refcount_count(&db->db_holds)); +} + +void * +dmu_buf_replace_user(dmu_buf_t *db_fake, dmu_buf_user_t *old_user, + dmu_buf_user_t *new_user) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + + mutex_enter(&db->db_mtx); + dbuf_verify_user(db, DBVU_NOT_EVICTING); + if (db->db_user == old_user) + db->db_user = new_user; + else + old_user = db->db_user; + dbuf_verify_user(db, DBVU_NOT_EVICTING); + mutex_exit(&db->db_mtx); + + return (old_user); +} + +void * +dmu_buf_set_user(dmu_buf_t *db_fake, dmu_buf_user_t *user) +{ + return (dmu_buf_replace_user(db_fake, NULL, user)); +} + +void * +dmu_buf_set_user_ie(dmu_buf_t *db_fake, dmu_buf_user_t *user) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + + db->db_user_immediate_evict = TRUE; + return (dmu_buf_set_user(db_fake, user)); +} + +void * +dmu_buf_remove_user(dmu_buf_t *db_fake, dmu_buf_user_t *user) +{ + return (dmu_buf_replace_user(db_fake, user, NULL)); +} + +void * +dmu_buf_get_user(dmu_buf_t *db_fake) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + + dbuf_verify_user(db, DBVU_NOT_EVICTING); + return (db->db_user); +} + +void +dmu_buf_user_evict_wait() +{ + taskq_wait(dbu_evict_taskq); +} + +blkptr_t * +dmu_buf_get_blkptr(dmu_buf_t *db) +{ + dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; + return (dbi->db_blkptr); +} + +objset_t * +dmu_buf_get_objset(dmu_buf_t *db) +{ + dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; + return (dbi->db_objset); +} + +dnode_t * +dmu_buf_dnode_enter(dmu_buf_t *db) +{ + dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; + DB_DNODE_ENTER(dbi); + return (DB_DNODE(dbi)); +} + +void +dmu_buf_dnode_exit(dmu_buf_t *db) +{ + dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; + DB_DNODE_EXIT(dbi); +} + +static void +dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db) +{ + /* ASSERT(dmu_tx_is_syncing(tx)) */ + ASSERT(MUTEX_HELD(&db->db_mtx)); + + if (db->db_blkptr != NULL) + return; + + if (db->db_blkid == DMU_SPILL_BLKID) { + db->db_blkptr = DN_SPILL_BLKPTR(dn->dn_phys); + BP_ZERO(db->db_blkptr); + return; + } + if (db->db_level == dn->dn_phys->dn_nlevels-1) { + /* + * This buffer was allocated at a time when there were + * no blkptrs available from the dnode, or it was + * inappropriate to hook it in (i.e., nlevels mismatch).
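+ * (When that happens the dbuf is re-parented here: a level nlevels-1 + * buffer is wired directly to dn_phys->dn_blkptr[], while lower-level + * buffers point into their parent indirect's db_data, as the else + * branch below computes.)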
+ */ + ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr); + ASSERT(db->db_parent == NULL); + db->db_parent = dn->dn_dbuf; + db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid]; + DBUF_VERIFY(db); + } else { + dmu_buf_impl_t *parent = db->db_parent; + int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; + + ASSERT(dn->dn_phys->dn_nlevels > 1); + if (parent == NULL) { + mutex_exit(&db->db_mtx); + rw_enter(&dn->dn_struct_rwlock, RW_READER); + parent = dbuf_hold_level(dn, db->db_level + 1, + db->db_blkid >> epbs, db); + rw_exit(&dn->dn_struct_rwlock); + mutex_enter(&db->db_mtx); + db->db_parent = parent; + } + db->db_blkptr = (blkptr_t *)parent->db.db_data + + (db->db_blkid & ((1ULL << epbs) - 1)); + DBUF_VERIFY(db); + } +} + +/* + * dbuf_sync_indirect() is called recursively from dbuf_sync_list() so it + * is critical that we not allow the compiler to inline this function into + * dbuf_sync_list(), which would drastically bloat the stack usage. + */ +noinline static void +dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = dr->dr_dbuf; + dnode_t *dn; + zio_t *zio; + + ASSERT(dmu_tx_is_syncing(tx)); + + dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); + + mutex_enter(&db->db_mtx); + + ASSERT(db->db_level > 0); + DBUF_VERIFY(db); + + /* Read the block if it hasn't been read yet. */ + if (db->db_buf == NULL) { + mutex_exit(&db->db_mtx); + (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); + mutex_enter(&db->db_mtx); + } + ASSERT3U(db->db_state, ==, DB_CACHED); + ASSERT(db->db_buf != NULL); + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + /* Indirect block size must match what the dnode thinks it is. */ + ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); + dbuf_check_blkptr(dn, db); + DB_DNODE_EXIT(db); + + /* Provide the pending dirty record to child dbufs */ + db->db_data_pending = dr; + + mutex_exit(&db->db_mtx); + + dbuf_write(dr, db->db_buf, tx); + + zio = dr->dr_zio; + mutex_enter(&dr->dt.di.dr_mtx); + dbuf_sync_list(&dr->dt.di.dr_children, db->db_level - 1, tx); + ASSERT(list_head(&dr->dt.di.dr_children) == NULL); + mutex_exit(&dr->dt.di.dr_mtx); + zio_nowait(zio); +} + +/* + * dbuf_sync_leaf() is called recursively from dbuf_sync_list() so it is + * critical that we not allow the compiler to inline this function into + * dbuf_sync_list(), which would drastically bloat the stack usage. + */ +noinline static void +dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) +{ + arc_buf_t **datap = &dr->dt.dl.dr_data; + dmu_buf_impl_t *db = dr->dr_dbuf; + dnode_t *dn; + objset_t *os; + uint64_t txg = tx->tx_txg; + + ASSERT(dmu_tx_is_syncing(tx)); + + dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); + + mutex_enter(&db->db_mtx); + /* + * To be synced, we must be dirtied. But we + * might have been freed after the dirty. + */ + if (db->db_state == DB_UNCACHED) { + /* This buffer has been freed since it was dirtied */ + ASSERT(db->db.db_data == NULL); + } else if (db->db_state == DB_FILL) { + /* This buffer was freed and is now being re-filled */ + ASSERT(db->db.db_data != dr->dt.dl.dr_data); + } else { + ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL); + } + DBUF_VERIFY(db); + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + + if (db->db_blkid == DMU_SPILL_BLKID) { + mutex_enter(&dn->dn_mtx); + if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) { + /* + * In the previous transaction group, the bonus buffer + * was entirely used to store the attributes for the + * dnode which overrode the dn_spill field.
However, + * when adding more attributes to the file a spill + * block was required to hold the extra attributes. + * + * Make sure to clear the garbage left in the dn_spill + * field from the previous attributes in the bonus + * buffer. Otherwise, after writing out the spill + * block to the newly allocated dva, it will free + * the old block pointed to by the invalid dn_spill. + */ + db->db_blkptr = NULL; + } + dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR; + mutex_exit(&dn->dn_mtx); + } + + /* + * If this is a bonus buffer, simply copy the bonus data into the + * dnode. It will be written out when the dnode is synced (and it + * will be synced, since it must have been dirty for dbuf_sync to + * be called). + */ + if (db->db_blkid == DMU_BONUS_BLKID) { + dbuf_dirty_record_t **drp; + + ASSERT(*datap != NULL); + ASSERT0(db->db_level); + ASSERT3U(DN_MAX_BONUS_LEN(dn->dn_phys), <=, + DN_SLOTS_TO_BONUSLEN(dn->dn_phys->dn_extra_slots + 1)); + bcopy(*datap, DN_BONUS(dn->dn_phys), + DN_MAX_BONUS_LEN(dn->dn_phys)); + DB_DNODE_EXIT(db); + + if (*datap != db->db.db_data) { + int slots = DB_DNODE(db)->dn_num_slots; + int bonuslen = DN_SLOTS_TO_BONUSLEN(slots); + zio_buf_free(*datap, bonuslen); + arc_space_return(bonuslen, ARC_SPACE_BONUS); + } + db->db_data_pending = NULL; + drp = &db->db_last_dirty; + while (*drp != dr) + drp = &(*drp)->dr_next; + ASSERT(dr->dr_next == NULL); + ASSERT(dr->dr_dbuf == db); + *drp = dr->dr_next; + if (dr->dr_dbuf->db_level != 0) { + mutex_destroy(&dr->dt.di.dr_mtx); + list_destroy(&dr->dt.di.dr_children); + } + kmem_free(dr, sizeof (dbuf_dirty_record_t)); + ASSERT(db->db_dirtycnt > 0); + db->db_dirtycnt -= 1; + dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg, B_FALSE); + return; + } + + os = dn->dn_objset; + + /* + * This function may have dropped the db_mtx lock allowing a dmu_sync + * operation to sneak in. As a result, we need to ensure that we + * don't check the dr_override_state until we have returned from + * dbuf_check_blkptr. + */ + dbuf_check_blkptr(dn, db); + + /* + * If this buffer is in the middle of an immediate write, + * wait for the synchronous IO to complete. + */ + while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) { + ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); + cv_wait(&db->db_changed, &db->db_mtx); + ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN); + } + + if (db->db_state != DB_NOFILL && + dn->dn_object != DMU_META_DNODE_OBJECT && + refcount_count(&db->db_holds) > 1 && + dr->dt.dl.dr_override_state != DR_OVERRIDDEN && + *datap == db->db_buf) { + /* + * If this buffer is currently "in use" (i.e., there + * are active holds and db_data still references it), + * then make a copy before we start the write so that + * any modifications from the open txg will not leak + * into this write. + * + * NOTE: this copy does not need to be made for + * objects only modified in the syncing context (e.g. + * DMU_OT_DNODE blocks).
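+ * + * In outline (editorial summary of the copy below): an uncompressed + * buffer is re-allocated with arc_alloc_buf() and a compressed one with + * arc_alloc_compressed_buf() at matching psize/lsize, after which the + * live contents are bcopy()ed into the private copy.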
+ */ + int psize = arc_buf_size(*datap); + arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); + enum zio_compress compress_type = arc_get_compression(*datap); + + if (compress_type == ZIO_COMPRESS_OFF) { + *datap = arc_alloc_buf(os->os_spa, db, type, psize); + } else { + ASSERT3U(type, ==, ARC_BUFC_DATA); + int lsize = arc_buf_lsize(*datap); + *datap = arc_alloc_compressed_buf(os->os_spa, db, + psize, lsize, compress_type); + } + bcopy(db->db.db_data, (*datap)->b_data, psize); + } + db->db_data_pending = dr; + + mutex_exit(&db->db_mtx); + + dbuf_write(dr, *datap, tx); + + ASSERT(!list_link_active(&dr->dr_dirty_node)); + if (dn->dn_object == DMU_META_DNODE_OBJECT) { + list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr); + DB_DNODE_EXIT(db); + } else { + /* + * Although zio_nowait() does not "wait for an IO", it does + * initiate the IO. If this is an empty write it seems plausible + * that the IO could actually be completed before the nowait + * returns. We need to DB_DNODE_EXIT() first in case + * zio_nowait() invalidates the dbuf. + */ + DB_DNODE_EXIT(db); + zio_nowait(dr->dr_zio); + } +} + +void +dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx) +{ + dbuf_dirty_record_t *dr; + + while ((dr = list_head(list)) != NULL) { + if (dr->dr_zio != NULL) { + /* + * If we find an already initialized zio then we + * are processing the meta-dnode, and we have finished. + * The dbufs for all dnodes are put back on the list + * during processing, so that we can zio_wait() + * these IOs after initiating all child IOs. + */ + ASSERT3U(dr->dr_dbuf->db.db_object, ==, + DMU_META_DNODE_OBJECT); + break; + } + if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID && + dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) { + VERIFY3U(dr->dr_dbuf->db_level, ==, level); + } + list_remove(list, dr); + if (dr->dr_dbuf->db_level > 0) + dbuf_sync_indirect(dr, tx); + else + dbuf_sync_leaf(dr, tx); + } +} + +/* ARGSUSED */ +static void +dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) +{ + dmu_buf_impl_t *db = vdb; + dnode_t *dn; + blkptr_t *bp = zio->io_bp; + blkptr_t *bp_orig = &zio->io_bp_orig; + spa_t *spa = zio->io_spa; + int64_t delta; + uint64_t fill = 0; + int i; + + ASSERT3P(db->db_blkptr, !=, NULL); + ASSERT3P(&db->db_data_pending->dr_bp_copy, ==, bp); + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig); + dnode_diduse_space(dn, delta - zio->io_prev_space_delta); + zio->io_prev_space_delta = delta; + + if (bp->blk_birth != 0) { + ASSERT((db->db_blkid != DMU_SPILL_BLKID && + BP_GET_TYPE(bp) == dn->dn_type) || + (db->db_blkid == DMU_SPILL_BLKID && + BP_GET_TYPE(bp) == dn->dn_bonustype) || + BP_IS_EMBEDDED(bp)); + ASSERT(BP_GET_LEVEL(bp) == db->db_level); + } + + mutex_enter(&db->db_mtx); + +#ifdef ZFS_DEBUG + if (db->db_blkid == DMU_SPILL_BLKID) { + ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); + ASSERT(!(BP_IS_HOLE(bp)) && + db->db_blkptr == DN_SPILL_BLKPTR(dn->dn_phys)); + } +#endif + + if (db->db_level == 0) { + mutex_enter(&dn->dn_mtx); + if (db->db_blkid > dn->dn_phys->dn_maxblkid && + db->db_blkid != DMU_SPILL_BLKID) + dn->dn_phys->dn_maxblkid = db->db_blkid; + mutex_exit(&dn->dn_mtx); + + if (dn->dn_type == DMU_OT_DNODE) { + i = 0; + while (i < db->db.db_size) { + dnode_phys_t *dnp = db->db.db_data + i; + + i += DNODE_MIN_SIZE; + if (dnp->dn_type != DMU_OT_NONE) { + fill++; + i += dnp->dn_extra_slots * + DNODE_MIN_SIZE; + } + } + } else { + if (BP_IS_HOLE(bp)) { + fill = 0; + } else { + fill = 1; + } + } + } else { + blkptr_t *ibp =
db->db.db_data; + ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); + for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) { + if (BP_IS_HOLE(ibp)) + continue; + fill += BP_GET_FILL(ibp); + } + } + DB_DNODE_EXIT(db); + + if (!BP_IS_EMBEDDED(bp)) + bp->blk_fill = fill; + + mutex_exit(&db->db_mtx); + + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + *db->db_blkptr = *bp; + rw_exit(&dn->dn_struct_rwlock); +} + +/* ARGSUSED */ +/* + * This function gets called just prior to running through the compression + * stage of the zio pipeline. If we're an indirect block comprised of only + * holes, then we want this indirect to be compressed away to a hole. In + * order to do that we must zero out any information about the holes that + * this indirect points to before we try to compress it. + */ +static void +dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb) +{ + dmu_buf_impl_t *db = vdb; + dnode_t *dn; + blkptr_t *bp; + unsigned int epbs, i; + + ASSERT3U(db->db_level, >, 0); + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; + ASSERT3U(epbs, <, 31); + + /* Determine if all our children are holes */ + for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++) { + if (!BP_IS_HOLE(bp)) + break; + } + + /* + * If all the children are holes, then zero them all out so that + * we may get compressed away. + */ + if (i == 1 << epbs) { + /* + * We only found holes. Grab the rwlock to prevent + * anybody from reading the blocks we're about to + * zero out. + */ + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + bzero(db->db.db_data, db->db.db_size); + rw_exit(&dn->dn_struct_rwlock); + } + DB_DNODE_EXIT(db); +} + +/* + * The SPA will call this callback several times for each zio - once + * for every physical child i/o (zio->io_phys_children times). This + * allows the DMU to monitor the progress of each logical i/o. For example, + * there may be 2 copies of an indirect block, or many fragments of a RAID-Z + * block. There may be a long delay before all copies/fragments are completed, + * so this callback allows us to retire dirty space gradually, as the physical + * i/os complete. + */ +/* ARGSUSED */ +static void +dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg) +{ + dmu_buf_impl_t *db = arg; + objset_t *os = db->db_objset; + dsl_pool_t *dp = dmu_objset_pool(os); + dbuf_dirty_record_t *dr; + int delta = 0; + + dr = db->db_data_pending; + ASSERT3U(dr->dr_txg, ==, zio->io_txg); + + /* + * The callback will be called io_phys_children times. Retire one + * portion of our dirty space each time we are called. Any rounding + * error will be cleaned up by dsl_pool_sync()'s call to + * dsl_pool_undirty_space(). + */ + delta = dr->dr_accounted / zio->io_phys_children; + dsl_pool_undirty_space(dp, delta, zio->io_txg); +} + +/* ARGSUSED */ +static void +dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) +{ + dmu_buf_impl_t *db = vdb; + blkptr_t *bp_orig = &zio->io_bp_orig; + blkptr_t *bp = db->db_blkptr; + objset_t *os = db->db_objset; + dmu_tx_t *tx = os->os_synctx; + dbuf_dirty_record_t **drp, *dr; + + ASSERT0(zio->io_error); + ASSERT(db->db_blkptr == bp); + + /* + * For nopwrites and rewrites we ensure that the bp matches our + * original and bypass all the accounting.
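+ * (A nopwrite or rewrite leaves the existing block in place, so there is + * no new allocation to charge and no old block to kill; the BP_EQUAL() + * assertion below checks that the zio really did preserve the original + * bp.)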
+ */ + if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) { + ASSERT(BP_EQUAL(bp, bp_orig)); + } else { + dsl_dataset_t *ds = os->os_dsl_dataset; + (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE); + dsl_dataset_block_born(ds, bp, tx); + } + + mutex_enter(&db->db_mtx); + + DBUF_VERIFY(db); + + drp = &db->db_last_dirty; + while ((dr = *drp) != db->db_data_pending) + drp = &dr->dr_next; + ASSERT(!list_link_active(&dr->dr_dirty_node)); + ASSERT(dr->dr_dbuf == db); + ASSERT(dr->dr_next == NULL); + *drp = dr->dr_next; + +#ifdef ZFS_DEBUG + if (db->db_blkid == DMU_SPILL_BLKID) { + dnode_t *dn; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); + ASSERT(!(BP_IS_HOLE(db->db_blkptr)) && + db->db_blkptr == DN_SPILL_BLKPTR(dn->dn_phys)); + DB_DNODE_EXIT(db); + } +#endif + + if (db->db_level == 0) { + ASSERT(db->db_blkid != DMU_BONUS_BLKID); + ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); + if (db->db_state != DB_NOFILL) { + if (dr->dt.dl.dr_data != db->db_buf) + arc_buf_destroy(dr->dt.dl.dr_data, db); + } + } else { + dnode_t *dn; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + ASSERT(list_head(&dr->dt.di.dr_children) == NULL); + ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift); + if (!BP_IS_HOLE(db->db_blkptr)) { + int epbs = + dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; + ASSERT3U(db->db_blkid, <=, + dn->dn_phys->dn_maxblkid >> (db->db_level * epbs)); + ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, + db->db.db_size); + } + DB_DNODE_EXIT(db); + mutex_destroy(&dr->dt.di.dr_mtx); + list_destroy(&dr->dt.di.dr_children); + } + kmem_free(dr, sizeof (dbuf_dirty_record_t)); + + cv_broadcast(&db->db_changed); + ASSERT(db->db_dirtycnt > 0); + db->db_dirtycnt -= 1; + db->db_data_pending = NULL; + dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE); +} + +static void +dbuf_write_nofill_ready(zio_t *zio) +{ + dbuf_write_ready(zio, NULL, zio->io_private); +} + +static void +dbuf_write_nofill_done(zio_t *zio) +{ + dbuf_write_done(zio, NULL, zio->io_private); +} + +static void +dbuf_write_override_ready(zio_t *zio) +{ + dbuf_dirty_record_t *dr = zio->io_private; + dmu_buf_impl_t *db = dr->dr_dbuf; + + dbuf_write_ready(zio, NULL, db); +} + +static void +dbuf_write_override_done(zio_t *zio) +{ + dbuf_dirty_record_t *dr = zio->io_private; + dmu_buf_impl_t *db = dr->dr_dbuf; + blkptr_t *obp = &dr->dt.dl.dr_overridden_by; + + mutex_enter(&db->db_mtx); + if (!BP_EQUAL(zio->io_bp, obp)) { + if (!BP_IS_HOLE(obp)) + dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp); + arc_release(dr->dt.dl.dr_data, db); + } + mutex_exit(&db->db_mtx); + dbuf_write_done(zio, NULL, db); + + if (zio->io_abd != NULL) + abd_put(zio->io_abd); +} + +typedef struct dbuf_remap_impl_callback_arg { + objset_t *drica_os; + uint64_t drica_blk_birth; + dmu_tx_t *drica_tx; +} dbuf_remap_impl_callback_arg_t; + +static void +dbuf_remap_impl_callback(uint64_t vdev, uint64_t offset, uint64_t size, + void *arg) +{ + dbuf_remap_impl_callback_arg_t *drica = arg; + objset_t *os = drica->drica_os; + spa_t *spa = dmu_objset_spa(os); + dmu_tx_t *tx = drica->drica_tx; + + ASSERT(dsl_pool_sync_context(spa_get_dsl(spa))); + + if (os == spa_meta_objset(spa)) { + spa_vdev_indirect_mark_obsolete(spa, vdev, offset, size, tx); + } else { + dsl_dataset_block_remapped(dmu_objset_ds(os), vdev, offset, + size, drica->drica_blk_birth, tx); + } +} + +static void +dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, dmu_tx_t *tx) +{ + blkptr_t bp_copy = *bp; + spa_t *spa = 
dmu_objset_spa(dn->dn_objset); + dbuf_remap_impl_callback_arg_t drica; + + ASSERT(dsl_pool_sync_context(spa_get_dsl(spa))); + + drica.drica_os = dn->dn_objset; + drica.drica_blk_birth = bp->blk_birth; + drica.drica_tx = tx; + if (spa_remap_blkptr(spa, &bp_copy, dbuf_remap_impl_callback, + &drica)) { + /* + * The struct_rwlock prevents dbuf_read_impl() from + * dereferencing the BP while we are changing it. To + * avoid lock contention, only grab it when we are actually + * changing the BP. + */ + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + *bp = bp_copy; + rw_exit(&dn->dn_struct_rwlock); + } +} + +/* + * Returns true if a dbuf_remap would modify the dbuf. We do this by attempting + * to remap a copy of every bp in the dbuf. + */ +boolean_t +dbuf_can_remap(const dmu_buf_impl_t *db) +{ + spa_t *spa = dmu_objset_spa(db->db_objset); + blkptr_t *bp = db->db.db_data; + boolean_t ret = B_FALSE; + + ASSERT3U(db->db_level, >, 0); + ASSERT3S(db->db_state, ==, DB_CACHED); + + ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL)); + + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + for (int i = 0; i < db->db.db_size >> SPA_BLKPTRSHIFT; i++) { + blkptr_t bp_copy = bp[i]; + if (spa_remap_blkptr(spa, &bp_copy, NULL, NULL)) { + ret = B_TRUE; + break; + } + } + spa_config_exit(spa, SCL_VDEV, FTAG); + + return (ret); +} + +boolean_t +dnode_needs_remap(const dnode_t *dn) +{ + spa_t *spa = dmu_objset_spa(dn->dn_objset); + boolean_t ret = B_FALSE; + + if (dn->dn_phys->dn_nlevels == 0) { + return (B_FALSE); + } + + ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL)); + + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + for (int j = 0; j < dn->dn_phys->dn_nblkptr; j++) { + blkptr_t bp_copy = dn->dn_phys->dn_blkptr[j]; + if (spa_remap_blkptr(spa, &bp_copy, NULL, NULL)) { + ret = B_TRUE; + break; + } + } + spa_config_exit(spa, SCL_VDEV, FTAG); + + return (ret); +} + +/* + * Remap any existing BP's to concrete vdevs, if possible. + */ +static void +dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx) +{ + spa_t *spa = dmu_objset_spa(db->db_objset); + ASSERT(dsl_pool_sync_context(spa_get_dsl(spa))); + + if (!spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL)) + return; + + if (db->db_level > 0) { + blkptr_t *bp = db->db.db_data; + for (int i = 0; i < db->db.db_size >> SPA_BLKPTRSHIFT; i++) { + dbuf_remap_impl(dn, &bp[i], tx); + } + } else if (db->db.db_object == DMU_META_DNODE_OBJECT) { + dnode_phys_t *dnp = db->db.db_data; + ASSERT3U(db->db_dnode_handle->dnh_dnode->dn_type, ==, + DMU_OT_DNODE); + for (int i = 0; i < db->db.db_size >> DNODE_SHIFT; + i += dnp[i].dn_extra_slots + 1) { + for (int j = 0; j < dnp[i].dn_nblkptr; j++) { + dbuf_remap_impl(dn, &dnp[i].dn_blkptr[j], tx); + } + } + } +} + + +/* Issue I/O to commit a dirty buffer to disk. */ +static void +dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = dr->dr_dbuf; + dnode_t *dn; + objset_t *os; + dmu_buf_impl_t *parent = db->db_parent; + uint64_t txg = tx->tx_txg; + zbookmark_phys_t zb; + zio_prop_t zp; + zio_t *zio; + int wp_flag = 0; + + ASSERT(dmu_tx_is_syncing(tx)); + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + os = dn->dn_objset; + + if (db->db_state != DB_NOFILL) { + if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) { + /* + * Private object buffers are released here rather + * than in dbuf_dirty() since they are only modified + * in the syncing context and we don't want the + * overhead of making multiple copies of the data. 
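+ * (Hence the two cases just below: a hole block pointer means this buffer + * was never written, so arc_buf_thaw() suffices; otherwise + * dbuf_release_bp() drops the reference to the previously written copy + * before dbuf_remap() runs.)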
+ */ + if (BP_IS_HOLE(db->db_blkptr)) { + arc_buf_thaw(data); + } else { + dbuf_release_bp(db); + } + dbuf_remap(dn, db, tx); + } + } + + if (parent != dn->dn_dbuf) { + /* Our parent is an indirect block. */ + /* We have a dirty parent that has been scheduled for write. */ + ASSERT(parent && parent->db_data_pending); + /* Our parent's buffer is one level closer to the dnode. */ + ASSERT(db->db_level == parent->db_level-1); + /* + * We're about to modify our parent's db_data by modifying + * our block pointer, so the parent must be released. + */ + ASSERT(arc_released(parent->db_buf)); + zio = parent->db_data_pending->dr_zio; + } else { + /* Our parent is the dnode itself. */ + ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 && + db->db_blkid != DMU_SPILL_BLKID) || + (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0)); + if (db->db_blkid != DMU_SPILL_BLKID) + ASSERT3P(db->db_blkptr, ==, + &dn->dn_phys->dn_blkptr[db->db_blkid]); + zio = dn->dn_zio; + } + + ASSERT(db->db_level == 0 || data == db->db_buf); + ASSERT3U(db->db_blkptr->blk_birth, <=, txg); + ASSERT(zio); + + SET_BOOKMARK(&zb, os->os_dsl_dataset ? + os->os_dsl_dataset->ds_object : DMU_META_OBJSET, + db->db.db_object, db->db_level, db->db_blkid); + + if (db->db_blkid == DMU_SPILL_BLKID) + wp_flag = WP_SPILL; + wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0; + + dmu_write_policy(os, dn, db->db_level, wp_flag, &zp); + DB_DNODE_EXIT(db); + + /* + * We copy the blkptr now (rather than when we instantiate the dirty + * record), because its value can change between open context and + * syncing context. We do not need to hold dn_struct_rwlock to read + * db_blkptr because we are in syncing context. + */ + dr->dr_bp_copy = *db->db_blkptr; + + if (db->db_level == 0 && + dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { + /* + * The BP for this block has been provided by open context + * (by dmu_sync() or dmu_buf_write_embedded()). + */ + abd_t *contents = (data != NULL) ? + abd_get_from_buf(data->b_data, arc_buf_size(data)) : NULL; + + dr->dr_zio = zio_write(zio, os->os_spa, txg, &dr->dr_bp_copy, + contents, db->db.db_size, db->db.db_size, &zp, + dbuf_write_override_ready, NULL, NULL, + dbuf_write_override_done, + dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); + mutex_enter(&db->db_mtx); + dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; + zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by, + dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite); + mutex_exit(&db->db_mtx); + } else if (db->db_state == DB_NOFILL) { + ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF || + zp.zp_checksum == ZIO_CHECKSUM_NOPARITY); + dr->dr_zio = zio_write(zio, os->os_spa, txg, + &dr->dr_bp_copy, NULL, db->db.db_size, db->db.db_size, &zp, + dbuf_write_nofill_ready, NULL, NULL, + dbuf_write_nofill_done, db, + ZIO_PRIORITY_ASYNC_WRITE, + ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb); + } else { + ASSERT(arc_released(data)); + + /* + * For indirect blocks, we want to set up the children + * ready callback so that we can properly handle an indirect + * block that only contains holes.
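+ * (The check itself lives in dbuf_write_children_ready() above, which + * runs just before the compression stage of the child zio.)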
+ */ + arc_write_done_func_t *children_ready_cb = NULL; + if (db->db_level != 0) + children_ready_cb = dbuf_write_children_ready; + + dr->dr_zio = arc_write(zio, os->os_spa, txg, + &dr->dr_bp_copy, data, DBUF_IS_L2CACHEABLE(db), + &zp, dbuf_write_ready, children_ready_cb, + dbuf_write_physdone, dbuf_write_done, db, + ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); + } +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf_stats.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf_stats.c new file mode 100644 index 000000000000..1d1ba4edfa07 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf_stats.c @@ -0,0 +1,242 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#include <sys/zfs_context.h> +#include <sys/dbuf.h> +#include <sys/dmu_objset.h> + +/* + * Calculate the index of the arc header for the state, disabled by default. + */ +int zfs_dbuf_state_index = 0; + +/* + * ========================================================================== + * Dbuf Hash Read Routines + * ========================================================================== + */ +typedef struct dbuf_stats_t { + kmutex_t lock; + kstat_t *kstat; + dbuf_hash_table_t *hash; + int idx; +} dbuf_stats_t; + +static dbuf_stats_t dbuf_stats_hash_table; + +static int +dbuf_stats_hash_table_headers(char *buf, size_t size) +{ + size = snprintf(buf, size - 1, + "%-88s | %-124s | %s\n" + "%-16s %-8s %-8s %-8s %-8s %-8s %-8s %-5s %-5s %5s | " + "%-5s %-5s %-6s %-8s %-6s %-8s %-12s " + "%-6s %-6s %-6s %-6s %-6s %-8s %-8s %-8s %-5s | " + "%-6s %-6s %-8s %-8s %-6s %-6s %-5s %-8s %-8s\n", + "dbuf", "arcbuf", "dnode", "pool", "objset", "object", "level", + "blkid", "offset", "dbsize", "meta", "state", "dbholds", "list", + "atype", "index", "flags", "count", "asize", "access", "mru", "gmru", + "mfu", "gmfu", "l2", "l2_dattr", "l2_asize", "l2_comp", "aholds", + "dtype", "btype", "data_bs", "meta_bs", "bsize", + "lvls", "dholds", "blocks", "dsize"); + buf[size] = '\0'; + + return (0); +} + +int +__dbuf_stats_hash_table_data(char *buf, size_t size, dmu_buf_impl_t *db) +{ + arc_buf_info_t abi = { 0 }; + dmu_object_info_t doi = { 0 }; + dnode_t *dn = DB_DNODE(db); + + if (db->db_buf) + arc_buf_info(db->db_buf, &abi, zfs_dbuf_state_index); + + if (dn) + __dmu_object_info_from_dnode(dn, &doi); + + size = snprintf(buf, size - 1, + "%-16s %-8llu %-8lld %-8lld %-8lld %-8llu %-8llu %-5d %-5d %-5lu | " + "%-5d %-5d %-6lld 0x%-6x %-6lu %-8llu %-12llu " + "%-6lu %-6lu %-6lu %-6lu %-6lu %-8llu %-8llu %-8d %-5lu | " + "%-6d %-6d %-8lu %-8lu %-6llu %-6lu %-5lu %-8llu %-8llu\n", + /* dmu_buf_impl_t */ + spa_name(dn->dn_objset->os_spa), + (u_longlong_t)dmu_objset_id(db->db_objset), + (longlong_t)db->db.db_object, + 
(longlong_t)db->db_level, + (longlong_t)db->db_blkid, + (u_longlong_t)db->db.db_offset, + (u_longlong_t)db->db.db_size, + !!dbuf_is_metadata(db), + db->db_state, + (ulong_t)refcount_count(&db->db_holds), + /* arc_buf_info_t */ + abi.abi_state_type, + abi.abi_state_contents, + (longlong_t)abi.abi_state_index, + abi.abi_flags, + (ulong_t)abi.abi_bufcnt, + (u_longlong_t)abi.abi_size, + (u_longlong_t)abi.abi_access, + (ulong_t)abi.abi_mru_hits, + (ulong_t)abi.abi_mru_ghost_hits, + (ulong_t)abi.abi_mfu_hits, + (ulong_t)abi.abi_mfu_ghost_hits, + (ulong_t)abi.abi_l2arc_hits, + (u_longlong_t)abi.abi_l2arc_dattr, + (u_longlong_t)abi.abi_l2arc_asize, + abi.abi_l2arc_compress, + (ulong_t)abi.abi_holds, + /* dmu_object_info_t */ + doi.doi_type, + doi.doi_bonus_type, + (ulong_t)doi.doi_data_block_size, + (ulong_t)doi.doi_metadata_block_size, + (u_longlong_t)doi.doi_bonus_size, + (ulong_t)doi.doi_indirection, + (ulong_t)refcount_count(&dn->dn_holds), + (u_longlong_t)doi.doi_fill_count, + (u_longlong_t)doi.doi_max_offset); + buf[size] = '\0'; + + return (size); +} + +static int +dbuf_stats_hash_table_data(char *buf, size_t size, void *data) +{ + dbuf_stats_t *dsh = (dbuf_stats_t *)data; + dbuf_hash_table_t *h = dsh->hash; + dmu_buf_impl_t *db; + int length, error = 0; + + ASSERT3S(dsh->idx, >=, 0); + ASSERT3S(dsh->idx, <=, h->hash_table_mask); + memset(buf, 0, size); + + mutex_enter(DBUF_HASH_MUTEX(h, dsh->idx)); + for (db = h->hash_table[dsh->idx]; db != NULL; db = db->db_hash_next) { + /* + * Returning ENOMEM will cause the data and header functions + * to be called with larger scratch buffers. + */ + if (size < 512) { + error = ENOMEM; + break; + } + + mutex_enter(&db->db_mtx); + mutex_exit(DBUF_HASH_MUTEX(h, dsh->idx)); + + length = __dbuf_stats_hash_table_data(buf, size, db); + buf += length; + size -= length; + + mutex_exit(&db->db_mtx); + mutex_enter(DBUF_HASH_MUTEX(h, dsh->idx)); + } + mutex_exit(DBUF_HASH_MUTEX(h, dsh->idx)); + + return (error); +} + +static void * +dbuf_stats_hash_table_addr(kstat_t *ksp, off_t n) +{ + dbuf_stats_t *dsh = ksp->ks_private; + + ASSERT(MUTEX_HELD(&dsh->lock)); + + if (n <= dsh->hash->hash_table_mask) { + dsh->idx = n; + return (dsh); + } + + return (NULL); +} + +#ifndef __FreeBSD__ +/* + * XXX The FreeBSD SPL is missing support for KSTAT_TYPE_RAW; + * we can enable this as soon as that's implemented. See the + * lindebugfs module for similar callback semantics.
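+ * (Until that support exists, the FreeBSD build falls through to the + * empty dbuf_stats_hash_table_init()/_destroy() stubs in the #else + * branch below, so dbuf_stats_init() and dbuf_stats_destroy() remain + * callable.)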
+ */ +static void +dbuf_stats_hash_table_init(dbuf_hash_table_t *hash) +{ + dbuf_stats_t *dsh = &dbuf_stats_hash_table; + kstat_t *ksp; + + mutex_init(&dsh->lock, NULL, MUTEX_DEFAULT, NULL); + dsh->hash = hash; + + ksp = kstat_create("zfs", 0, "dbufs", "misc", + KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); + dsh->kstat = ksp; + + if (ksp) { + ksp->ks_lock = &dsh->lock; + ksp->ks_ndata = UINT32_MAX; + ksp->ks_private = dsh; + kstat_set_raw_ops(ksp, dbuf_stats_hash_table_headers, + dbuf_stats_hash_table_data, dbuf_stats_hash_table_addr); + kstat_install(ksp); + } +} + +static void +dbuf_stats_hash_table_destroy(void) +{ + dbuf_stats_t *dsh = &dbuf_stats_hash_table; + kstat_t *ksp; + + ksp = dsh->kstat; + if (ksp) + kstat_delete(ksp); + + mutex_destroy(&dsh->lock); +} +#else +static void +dbuf_stats_hash_table_init(dbuf_hash_table_t *hash) +{ +} + +static void +dbuf_stats_hash_table_destroy(void) +{ +} +#endif + +void +dbuf_stats_init(dbuf_hash_table_t *hash) +{ + dbuf_stats_hash_table_init(hash); +} + +void +dbuf_stats_destroy(void) +{ + dbuf_stats_hash_table_destroy(); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c new file mode 100644 index 000000000000..9cd612382402 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c @@ -0,0 +1,1177 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2016 by Delphix. All rights reserved. + */ + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/spa_impl.h> +#include <sys/zio.h> +#include <sys/ddt.h> +#include <sys/zap.h> +#include <sys/dmu_tx.h> +#include <sys/arc.h> +#include <sys/dsl_pool.h> +#include <sys/zio_checksum.h> +#include <sys/zio_compress.h> +#include <sys/dsl_scan.h> +#include <sys/abd.h> + +/* + * Enable/disable prefetching of dedup-ed blocks which are going to be freed. 
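+ * (On FreeBSD the tunable below is exported as the vfs.zfs.dedup.prefetch + * sysctl by the SYSCTL_INT declaration that follows; it defaults to + * enabled.)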
+ */ +int zfs_dedup_prefetch = 1; + +SYSCTL_DECL(_vfs_zfs); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, dedup, CTLFLAG_RW, 0, "ZFS DEDUP"); +SYSCTL_INT(_vfs_zfs_dedup, OID_AUTO, prefetch, CTLFLAG_RWTUN, &zfs_dedup_prefetch, + 0, "Enable/disable prefetching of dedup-ed blocks which are going to be freed"); + +static const ddt_ops_t *ddt_ops[DDT_TYPES] = { + &ddt_zap_ops, +}; + +static const char *ddt_class_name[DDT_CLASSES] = { + "ditto", + "duplicate", + "unique", +}; + +static void +ddt_object_create(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + dmu_tx_t *tx) +{ + spa_t *spa = ddt->ddt_spa; + objset_t *os = ddt->ddt_os; + uint64_t *objectp = &ddt->ddt_object[type][class]; + boolean_t prehash = zio_checksum_table[ddt->ddt_checksum].ci_flags & + ZCHECKSUM_FLAG_DEDUP; + char name[DDT_NAMELEN]; + + ddt_object_name(ddt, type, class, name); + + ASSERT(*objectp == 0); + VERIFY(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash) == 0); + ASSERT(*objectp != 0); + + VERIFY(zap_add(os, DMU_POOL_DIRECTORY_OBJECT, name, + sizeof (uint64_t), 1, objectp, tx) == 0); + + VERIFY(zap_add(os, spa->spa_ddt_stat_object, name, + sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), + &ddt->ddt_histogram[type][class], tx) == 0); +} + +static void +ddt_object_destroy(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + dmu_tx_t *tx) +{ + spa_t *spa = ddt->ddt_spa; + objset_t *os = ddt->ddt_os; + uint64_t *objectp = &ddt->ddt_object[type][class]; + uint64_t count; + char name[DDT_NAMELEN]; + + ddt_object_name(ddt, type, class, name); + + ASSERT(*objectp != 0); + VERIFY(ddt_object_count(ddt, type, class, &count) == 0 && count == 0); + ASSERT(ddt_histogram_empty(&ddt->ddt_histogram[type][class])); + VERIFY(zap_remove(os, DMU_POOL_DIRECTORY_OBJECT, name, tx) == 0); + VERIFY(zap_remove(os, spa->spa_ddt_stat_object, name, tx) == 0); + VERIFY(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx) == 0); + bzero(&ddt->ddt_object_stats[type][class], sizeof (ddt_object_t)); + + *objectp = 0; +} + +static int +ddt_object_load(ddt_t *ddt, enum ddt_type type, enum ddt_class class) +{ + ddt_object_t *ddo = &ddt->ddt_object_stats[type][class]; + dmu_object_info_t doi; + uint64_t count; + char name[DDT_NAMELEN]; + int error; + + ddt_object_name(ddt, type, class, name); + + error = zap_lookup(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name, + sizeof (uint64_t), 1, &ddt->ddt_object[type][class]); + + if (error != 0) + return (error); + + VERIFY0(zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name, + sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), + &ddt->ddt_histogram[type][class])); + + /* + * Seed the cached statistics. + */ + VERIFY(ddt_object_info(ddt, type, class, &doi) == 0); + + error = ddt_object_count(ddt, type, class, &count); + if (error) + return (error); + + ddo->ddo_count = count; + ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9; + ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size; + + return (0); +} + +static void +ddt_object_sync(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + dmu_tx_t *tx) +{ + ddt_object_t *ddo = &ddt->ddt_object_stats[type][class]; + dmu_object_info_t doi; + uint64_t count; + char name[DDT_NAMELEN]; + + ddt_object_name(ddt, type, class, name); + + VERIFY(zap_update(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name, + sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), + &ddt->ddt_histogram[type][class], tx) == 0); + + /* + * Cache DDT statistics; this is the only time they'll change.
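+ * (ddt_get_dedup_object_stats() later sums exactly these cached + * ddo_count/ddo_dspace/ddo_mspace values rather than re-reading the + * on-disk objects.)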
+ */ + VERIFY(ddt_object_info(ddt, type, class, &doi) == 0); + VERIFY(ddt_object_count(ddt, type, class, &count) == 0); + + ddo->ddo_count = count; + ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9; + ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size; +} + +static int +ddt_object_lookup(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + ddt_entry_t *dde) +{ + if (!ddt_object_exists(ddt, type, class)) + return (SET_ERROR(ENOENT)); + + return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os, + ddt->ddt_object[type][class], dde)); +} + +static void +ddt_object_prefetch(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + ddt_entry_t *dde) +{ + if (!ddt_object_exists(ddt, type, class)) + return; + + ddt_ops[type]->ddt_op_prefetch(ddt->ddt_os, + ddt->ddt_object[type][class], dde); +} + +int +ddt_object_update(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + ddt_entry_t *dde, dmu_tx_t *tx) +{ + ASSERT(ddt_object_exists(ddt, type, class)); + + return (ddt_ops[type]->ddt_op_update(ddt->ddt_os, + ddt->ddt_object[type][class], dde, tx)); +} + +static int +ddt_object_remove(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + ddt_entry_t *dde, dmu_tx_t *tx) +{ + ASSERT(ddt_object_exists(ddt, type, class)); + + return (ddt_ops[type]->ddt_op_remove(ddt->ddt_os, + ddt->ddt_object[type][class], dde, tx)); +} + +int +ddt_object_walk(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + uint64_t *walk, ddt_entry_t *dde) +{ + ASSERT(ddt_object_exists(ddt, type, class)); + + return (ddt_ops[type]->ddt_op_walk(ddt->ddt_os, + ddt->ddt_object[type][class], dde, walk)); +} + +int +ddt_object_count(ddt_t *ddt, enum ddt_type type, enum ddt_class class, uint64_t *count) +{ + ASSERT(ddt_object_exists(ddt, type, class)); + + return (ddt_ops[type]->ddt_op_count(ddt->ddt_os, + ddt->ddt_object[type][class], count)); +} + +int +ddt_object_info(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + dmu_object_info_t *doi) +{ + if (!ddt_object_exists(ddt, type, class)) + return (SET_ERROR(ENOENT)); + + return (dmu_object_info(ddt->ddt_os, ddt->ddt_object[type][class], + doi)); +} + +boolean_t +ddt_object_exists(ddt_t *ddt, enum ddt_type type, enum ddt_class class) +{ + return (!!ddt->ddt_object[type][class]); +} + +void +ddt_object_name(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + char *name) +{ + (void) sprintf(name, DMU_POOL_DDT, + zio_checksum_table[ddt->ddt_checksum].ci_name, + ddt_ops[type]->ddt_op_name, ddt_class_name[class]); +} + +void +ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, uint64_t txg) +{ + ASSERT(txg != 0); + + for (int d = 0; d < SPA_DVAS_PER_BP; d++) + bp->blk_dva[d] = ddp->ddp_dva[d]; + BP_SET_BIRTH(bp, txg, ddp->ddp_phys_birth); +} + +void +ddt_bp_create(enum zio_checksum checksum, + const ddt_key_t *ddk, const ddt_phys_t *ddp, blkptr_t *bp) +{ + BP_ZERO(bp); + + if (ddp != NULL) + ddt_bp_fill(ddp, bp, ddp->ddp_phys_birth); + + bp->blk_cksum = ddk->ddk_cksum; + bp->blk_fill = 1; + + BP_SET_LSIZE(bp, DDK_GET_LSIZE(ddk)); + BP_SET_PSIZE(bp, DDK_GET_PSIZE(ddk)); + BP_SET_COMPRESS(bp, DDK_GET_COMPRESS(ddk)); + BP_SET_CHECKSUM(bp, checksum); + BP_SET_TYPE(bp, DMU_OT_DEDUP); + BP_SET_LEVEL(bp, 0); + BP_SET_DEDUP(bp, 0); + BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); +} + +void +ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp) +{ + ddk->ddk_cksum = bp->blk_cksum; + ddk->ddk_prop = 0; + + DDK_SET_LSIZE(ddk, BP_GET_LSIZE(bp)); + DDK_SET_PSIZE(ddk, BP_GET_PSIZE(bp)); + DDK_SET_COMPRESS(ddk, BP_GET_COMPRESS(bp)); +} + +void +ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t 
*bp) +{ + ASSERT(ddp->ddp_phys_birth == 0); + + for (int d = 0; d < SPA_DVAS_PER_BP; d++) + ddp->ddp_dva[d] = bp->blk_dva[d]; + ddp->ddp_phys_birth = BP_PHYSICAL_BIRTH(bp); +} + +void +ddt_phys_clear(ddt_phys_t *ddp) +{ + bzero(ddp, sizeof (*ddp)); +} + +void +ddt_phys_addref(ddt_phys_t *ddp) +{ + ddp->ddp_refcnt++; +} + +void +ddt_phys_decref(ddt_phys_t *ddp) +{ + ASSERT((int64_t)ddp->ddp_refcnt > 0); + ddp->ddp_refcnt--; +} + +void +ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, uint64_t txg) +{ + blkptr_t blk; + + ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); + ddt_phys_clear(ddp); + zio_free(ddt->ddt_spa, txg, &blk); +} + +ddt_phys_t * +ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp) +{ + ddt_phys_t *ddp = (ddt_phys_t *)dde->dde_phys; + + for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { + if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_dva[0]) && + BP_PHYSICAL_BIRTH(bp) == ddp->ddp_phys_birth) + return (ddp); + } + return (NULL); +} + +uint64_t +ddt_phys_total_refcnt(const ddt_entry_t *dde) +{ + uint64_t refcnt = 0; + + for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) + refcnt += dde->dde_phys[p].ddp_refcnt; + + return (refcnt); +} + +static void +ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds) +{ + spa_t *spa = ddt->ddt_spa; + ddt_phys_t *ddp = dde->dde_phys; + ddt_key_t *ddk = &dde->dde_key; + uint64_t lsize = DDK_GET_LSIZE(ddk); + uint64_t psize = DDK_GET_PSIZE(ddk); + + bzero(dds, sizeof (*dds)); + + for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { + uint64_t dsize = 0; + uint64_t refcnt = ddp->ddp_refcnt; + + if (ddp->ddp_phys_birth == 0) + continue; + + for (int d = 0; d < SPA_DVAS_PER_BP; d++) + dsize += dva_get_dsize_sync(spa, &ddp->ddp_dva[d]); + + dds->dds_blocks += 1; + dds->dds_lsize += lsize; + dds->dds_psize += psize; + dds->dds_dsize += dsize; + + dds->dds_ref_blocks += refcnt; + dds->dds_ref_lsize += lsize * refcnt; + dds->dds_ref_psize += psize * refcnt; + dds->dds_ref_dsize += dsize * refcnt; + } +} + +void +ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg) +{ + const uint64_t *s = (const uint64_t *)src; + uint64_t *d = (uint64_t *)dst; + uint64_t *d_end = (uint64_t *)(dst + 1); + + ASSERT(neg == 0 || neg == -1ULL); /* add or subtract */ + + while (d < d_end) + *d++ += (*s++ ^ neg) - neg; +} + +static void +ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg) +{ + ddt_stat_t dds; + ddt_histogram_t *ddh; + int bucket; + + ddt_stat_generate(ddt, dde, &dds); + + bucket = highbit64(dds.dds_ref_blocks) - 1; + ASSERT(bucket >= 0); + + ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class]; + + ddt_stat_add(&ddh->ddh_stat[bucket], &dds, neg); +} + +void +ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src) +{ + for (int h = 0; h < 64; h++) + ddt_stat_add(&dst->ddh_stat[h], &src->ddh_stat[h], 0); +} + +void +ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh) +{ + bzero(dds, sizeof (*dds)); + + for (int h = 0; h < 64; h++) + ddt_stat_add(dds, &ddh->ddh_stat[h], 0); +} + +boolean_t +ddt_histogram_empty(const ddt_histogram_t *ddh) +{ + const uint64_t *s = (const uint64_t *)ddh; + const uint64_t *s_end = (const uint64_t *)(ddh + 1); + + while (s < s_end) + if (*s++ != 0) + return (B_FALSE); + + return (B_TRUE); +} + +void +ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total) +{ + /* Sum the statistics we cached in ddt_object_sync(). 
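+ * The space totals summed here are divided by the entry count at
+ * the end of this function, so ddo_dspace and ddo_mspace are
+ * reported as per-entry averages.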
*/ + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + ddt_t *ddt = spa->spa_ddt[c]; + for (enum ddt_type type = 0; type < DDT_TYPES; type++) { + for (enum ddt_class class = 0; class < DDT_CLASSES; + class++) { + ddt_object_t *ddo = + &ddt->ddt_object_stats[type][class]; + ddo_total->ddo_count += ddo->ddo_count; + ddo_total->ddo_dspace += ddo->ddo_dspace; + ddo_total->ddo_mspace += ddo->ddo_mspace; + } + } + } + + /* ... and compute the averages. */ + if (ddo_total->ddo_count != 0) { + ddo_total->ddo_dspace /= ddo_total->ddo_count; + ddo_total->ddo_mspace /= ddo_total->ddo_count; + } +} + +void +ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh) +{ + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + ddt_t *ddt = spa->spa_ddt[c]; + for (enum ddt_type type = 0; type < DDT_TYPES; type++) { + for (enum ddt_class class = 0; class < DDT_CLASSES; + class++) { + ddt_histogram_add(ddh, + &ddt->ddt_histogram_cache[type][class]); + } + } + } +} + +void +ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total) +{ + ddt_histogram_t *ddh_total; + + ddh_total = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP); + ddt_get_dedup_histogram(spa, ddh_total); + ddt_histogram_stat(dds_total, ddh_total); + kmem_free(ddh_total, sizeof (ddt_histogram_t)); +} + +uint64_t +ddt_get_dedup_dspace(spa_t *spa) +{ + ddt_stat_t dds_total = { 0 }; + + ddt_get_dedup_stats(spa, &dds_total); + return (dds_total.dds_ref_dsize - dds_total.dds_dsize); +} + +uint64_t +ddt_get_pool_dedup_ratio(spa_t *spa) +{ + ddt_stat_t dds_total = { 0 }; + + ddt_get_dedup_stats(spa, &dds_total); + if (dds_total.dds_dsize == 0) + return (100); + + return (dds_total.dds_ref_dsize * 100 / dds_total.dds_dsize); +} + +int +ddt_ditto_copies_needed(ddt_t *ddt, ddt_entry_t *dde, ddt_phys_t *ddp_willref) +{ + spa_t *spa = ddt->ddt_spa; + uint64_t total_refcnt = 0; + uint64_t ditto = spa->spa_dedup_ditto; + int total_copies = 0; + int desired_copies = 0; + + for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { + ddt_phys_t *ddp = &dde->dde_phys[p]; + zio_t *zio = dde->dde_lead_zio[p]; + uint64_t refcnt = ddp->ddp_refcnt; /* committed refs */ + if (zio != NULL) + refcnt += zio->io_parent_count; /* pending refs */ + if (ddp == ddp_willref) + refcnt++; /* caller's ref */ + if (refcnt != 0) { + total_refcnt += refcnt; + total_copies += p; + } + } + + if (ditto == 0 || ditto > UINT32_MAX) + ditto = UINT32_MAX; + + if (total_refcnt >= 1) + desired_copies++; + if (total_refcnt >= ditto) + desired_copies++; + if (total_refcnt >= ditto * ditto) + desired_copies++; + + return (MAX(desired_copies, total_copies) - total_copies); +} + +int +ddt_ditto_copies_present(ddt_entry_t *dde) +{ + ddt_phys_t *ddp = &dde->dde_phys[DDT_PHYS_DITTO]; + dva_t *dva = ddp->ddp_dva; + int copies = 0 - DVA_GET_GANG(dva); + + for (int d = 0; d < SPA_DVAS_PER_BP; d++, dva++) + if (DVA_IS_VALID(dva)) + copies++; + + ASSERT(copies >= 0 && copies < SPA_DVAS_PER_BP); + + return (copies); +} + +size_t +ddt_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len) +{ + uchar_t *version = dst++; + int cpfunc = ZIO_COMPRESS_ZLE; + zio_compress_info_t *ci = &zio_compress_table[cpfunc]; + size_t c_len; + + ASSERT(d_len >= s_len + 1); /* no compression plus version byte */ + + c_len = ci->ci_compress(src, dst, s_len, d_len - 1, ci->ci_level); + + if (c_len == s_len) { + cpfunc = ZIO_COMPRESS_OFF; + bcopy(src, dst, s_len); + } + + *version = cpfunc; + /* CONSTCOND */ + if (ZFS_HOST_BYTEORDER) + *version |= DDT_COMPRESS_BYTEORDER_MASK; + + return (c_len + 
1); +} + +void +ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len) +{ + uchar_t version = *src++; + int cpfunc = version & DDT_COMPRESS_FUNCTION_MASK; + zio_compress_info_t *ci = &zio_compress_table[cpfunc]; + + if (ci->ci_decompress != NULL) + (void) ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level); + else + bcopy(src, dst, d_len); + + if (((version & DDT_COMPRESS_BYTEORDER_MASK) != 0) != + (ZFS_HOST_BYTEORDER != 0)) + byteswap_uint64_array(dst, d_len); +} + +ddt_t * +ddt_select_by_checksum(spa_t *spa, enum zio_checksum c) +{ + return (spa->spa_ddt[c]); +} + +ddt_t * +ddt_select(spa_t *spa, const blkptr_t *bp) +{ + return (spa->spa_ddt[BP_GET_CHECKSUM(bp)]); +} + +void +ddt_enter(ddt_t *ddt) +{ + mutex_enter(&ddt->ddt_lock); +} + +void +ddt_exit(ddt_t *ddt) +{ + mutex_exit(&ddt->ddt_lock); +} + +static ddt_entry_t * +ddt_alloc(const ddt_key_t *ddk) +{ + ddt_entry_t *dde; + + dde = kmem_zalloc(sizeof (ddt_entry_t), KM_SLEEP); + cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL); + + dde->dde_key = *ddk; + + return (dde); +} + +static void +ddt_free(ddt_entry_t *dde) +{ + ASSERT(!dde->dde_loading); + + for (int p = 0; p < DDT_PHYS_TYPES; p++) + ASSERT(dde->dde_lead_zio[p] == NULL); + + if (dde->dde_repair_abd != NULL) + abd_free(dde->dde_repair_abd); + + cv_destroy(&dde->dde_cv); + kmem_free(dde, sizeof (*dde)); +} + +void +ddt_remove(ddt_t *ddt, ddt_entry_t *dde) +{ + ASSERT(MUTEX_HELD(&ddt->ddt_lock)); + + avl_remove(&ddt->ddt_tree, dde); + ddt_free(dde); +} + +ddt_entry_t * +ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add) +{ + ddt_entry_t *dde, dde_search; + enum ddt_type type; + enum ddt_class class; + avl_index_t where; + int error; + + ASSERT(MUTEX_HELD(&ddt->ddt_lock)); + + ddt_key_fill(&dde_search.dde_key, bp); + + dde = avl_find(&ddt->ddt_tree, &dde_search, &where); + if (dde == NULL) { + if (!add) + return (NULL); + dde = ddt_alloc(&dde_search.dde_key); + avl_insert(&ddt->ddt_tree, dde, where); + } + + while (dde->dde_loading) + cv_wait(&dde->dde_cv, &ddt->ddt_lock); + + if (dde->dde_loaded) + return (dde); + + dde->dde_loading = B_TRUE; + + ddt_exit(ddt); + + error = ENOENT; + + for (type = 0; type < DDT_TYPES; type++) { + for (class = 0; class < DDT_CLASSES; class++) { + error = ddt_object_lookup(ddt, type, class, dde); + if (error != ENOENT) { + ASSERT0(error); + break; + } + } + if (error != ENOENT) + break; + } + + ddt_enter(ddt); + + ASSERT(dde->dde_loaded == B_FALSE); + ASSERT(dde->dde_loading == B_TRUE); + + dde->dde_type = type; /* will be DDT_TYPES if no entry found */ + dde->dde_class = class; /* will be DDT_CLASSES if no entry found */ + dde->dde_loaded = B_TRUE; + dde->dde_loading = B_FALSE; + + if (error == 0) + ddt_stat_update(ddt, dde, -1ULL); + + cv_broadcast(&dde->dde_cv); + + return (dde); +} + +void +ddt_prefetch(spa_t *spa, const blkptr_t *bp) +{ + ddt_t *ddt; + ddt_entry_t dde; + + if (!zfs_dedup_prefetch || bp == NULL || !BP_GET_DEDUP(bp)) + return; + + /* + * We only remove the DDT once all tables are empty and only + * prefetch dedup blocks when there are entries in the DDT. + * Thus no locking is required as the DDT can't disappear on us. 
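+ * ddt_object_prefetch() quietly ignores tables that do not exist,
+ * so the full type/class sweep below is cheap for absent tables.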
+ */ + ddt = ddt_select(spa, bp); + ddt_key_fill(&dde.dde_key, bp); + + for (enum ddt_type type = 0; type < DDT_TYPES; type++) { + for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { + ddt_object_prefetch(ddt, type, class, &dde); + } + } +} + +int +ddt_entry_compare(const void *x1, const void *x2) +{ + const ddt_entry_t *dde1 = x1; + const ddt_entry_t *dde2 = x2; + const uint64_t *u1 = (const uint64_t *)&dde1->dde_key; + const uint64_t *u2 = (const uint64_t *)&dde2->dde_key; + + for (int i = 0; i < DDT_KEY_WORDS; i++) { + if (u1[i] < u2[i]) + return (-1); + if (u1[i] > u2[i]) + return (1); + } + + return (0); +} + +static ddt_t * +ddt_table_alloc(spa_t *spa, enum zio_checksum c) +{ + ddt_t *ddt; + + ddt = kmem_zalloc(sizeof (*ddt), KM_SLEEP); + + mutex_init(&ddt->ddt_lock, NULL, MUTEX_DEFAULT, NULL); + avl_create(&ddt->ddt_tree, ddt_entry_compare, + sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node)); + avl_create(&ddt->ddt_repair_tree, ddt_entry_compare, + sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node)); + ddt->ddt_checksum = c; + ddt->ddt_spa = spa; + ddt->ddt_os = spa->spa_meta_objset; + + return (ddt); +} + +static void +ddt_table_free(ddt_t *ddt) +{ + ASSERT(avl_numnodes(&ddt->ddt_tree) == 0); + ASSERT(avl_numnodes(&ddt->ddt_repair_tree) == 0); + avl_destroy(&ddt->ddt_tree); + avl_destroy(&ddt->ddt_repair_tree); + mutex_destroy(&ddt->ddt_lock); + kmem_free(ddt, sizeof (*ddt)); +} + +void +ddt_create(spa_t *spa) +{ + spa->spa_dedup_checksum = ZIO_DEDUPCHECKSUM; + + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) + spa->spa_ddt[c] = ddt_table_alloc(spa, c); +} + +int +ddt_load(spa_t *spa) +{ + int error; + + ddt_create(spa); + + error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_DDT_STATS, sizeof (uint64_t), 1, + &spa->spa_ddt_stat_object); + + if (error) + return (error == ENOENT ? 0 : error); + + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + ddt_t *ddt = spa->spa_ddt[c]; + for (enum ddt_type type = 0; type < DDT_TYPES; type++) { + for (enum ddt_class class = 0; class < DDT_CLASSES; + class++) { + error = ddt_object_load(ddt, type, class); + if (error != 0 && error != ENOENT) + return (error); + } + } + + /* + * Seed the cached histograms. + */ + bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache, + sizeof (ddt->ddt_histogram)); + } + + return (0); +} + +void +ddt_unload(spa_t *spa) +{ + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + if (spa->spa_ddt[c]) { + ddt_table_free(spa->spa_ddt[c]); + spa->spa_ddt[c] = NULL; + } + } +} + +boolean_t +ddt_class_contains(spa_t *spa, enum ddt_class max_class, const blkptr_t *bp) +{ + ddt_t *ddt; + ddt_entry_t dde; + + if (!BP_GET_DEDUP(bp)) + return (B_FALSE); + + if (max_class == DDT_CLASS_UNIQUE) + return (B_TRUE); + + ddt = spa->spa_ddt[BP_GET_CHECKSUM(bp)]; + + ddt_key_fill(&dde.dde_key, bp); + + for (enum ddt_type type = 0; type < DDT_TYPES; type++) + for (enum ddt_class class = 0; class <= max_class; class++) + if (ddt_object_lookup(ddt, type, class, &dde) == 0) + return (B_TRUE); + + return (B_FALSE); +} + +ddt_entry_t * +ddt_repair_start(ddt_t *ddt, const blkptr_t *bp) +{ + ddt_key_t ddk; + ddt_entry_t *dde; + + ddt_key_fill(&ddk, bp); + + dde = ddt_alloc(&ddk); + + for (enum ddt_type type = 0; type < DDT_TYPES; type++) { + for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { + /* + * We can only do repair if there are multiple copies + * of the block. 
For anything in the UNIQUE class, + * there's definitely only one copy, so don't even try. + */ + if (class != DDT_CLASS_UNIQUE && + ddt_object_lookup(ddt, type, class, dde) == 0) + return (dde); + } + } + + bzero(dde->dde_phys, sizeof (dde->dde_phys)); + + return (dde); +} + +void +ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde) +{ + avl_index_t where; + + ddt_enter(ddt); + + if (dde->dde_repair_abd != NULL && spa_writeable(ddt->ddt_spa) && + avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL) + avl_insert(&ddt->ddt_repair_tree, dde, where); + else + ddt_free(dde); + + ddt_exit(ddt); +} + +static void +ddt_repair_entry_done(zio_t *zio) +{ + ddt_entry_t *rdde = zio->io_private; + + ddt_free(rdde); +} + +static void +ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio) +{ + ddt_phys_t *ddp = dde->dde_phys; + ddt_phys_t *rddp = rdde->dde_phys; + ddt_key_t *ddk = &dde->dde_key; + ddt_key_t *rddk = &rdde->dde_key; + zio_t *zio; + blkptr_t blk; + + zio = zio_null(rio, rio->io_spa, NULL, + ddt_repair_entry_done, rdde, rio->io_flags); + + for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++, rddp++) { + if (ddp->ddp_phys_birth == 0 || + ddp->ddp_phys_birth != rddp->ddp_phys_birth || + bcmp(ddp->ddp_dva, rddp->ddp_dva, sizeof (ddp->ddp_dva))) + continue; + ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); + zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk, + rdde->dde_repair_abd, DDK_GET_PSIZE(rddk), NULL, NULL, + ZIO_PRIORITY_SYNC_WRITE, ZIO_DDT_CHILD_FLAGS(zio), NULL)); + } + + zio_nowait(zio); +} + +static void +ddt_repair_table(ddt_t *ddt, zio_t *rio) +{ + spa_t *spa = ddt->ddt_spa; + ddt_entry_t *dde, *rdde_next, *rdde; + avl_tree_t *t = &ddt->ddt_repair_tree; + blkptr_t blk; + + if (spa_sync_pass(spa) > 1) + return; + + ddt_enter(ddt); + for (rdde = avl_first(t); rdde != NULL; rdde = rdde_next) { + rdde_next = AVL_NEXT(t, rdde); + avl_remove(&ddt->ddt_repair_tree, rdde); + ddt_exit(ddt); + ddt_bp_create(ddt->ddt_checksum, &rdde->dde_key, NULL, &blk); + dde = ddt_repair_start(ddt, &blk); + ddt_repair_entry(ddt, dde, rdde, rio); + ddt_repair_done(ddt, dde); + ddt_enter(ddt); + } + ddt_exit(ddt); +} + +static void +ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg) +{ + dsl_pool_t *dp = ddt->ddt_spa->spa_dsl_pool; + ddt_phys_t *ddp = dde->dde_phys; + ddt_key_t *ddk = &dde->dde_key; + enum ddt_type otype = dde->dde_type; + enum ddt_type ntype = DDT_TYPE_CURRENT; + enum ddt_class oclass = dde->dde_class; + enum ddt_class nclass; + uint64_t total_refcnt = 0; + + ASSERT(dde->dde_loaded); + ASSERT(!dde->dde_loading); + + for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { + ASSERT(dde->dde_lead_zio[p] == NULL); + ASSERT((int64_t)ddp->ddp_refcnt >= 0); + if (ddp->ddp_phys_birth == 0) { + ASSERT(ddp->ddp_refcnt == 0); + continue; + } + if (p == DDT_PHYS_DITTO) { + if (ddt_ditto_copies_needed(ddt, dde, NULL) == 0) + ddt_phys_free(ddt, ddk, ddp, txg); + continue; + } + if (ddp->ddp_refcnt == 0) + ddt_phys_free(ddt, ddk, ddp, txg); + total_refcnt += ddp->ddp_refcnt; + } + + if (dde->dde_phys[DDT_PHYS_DITTO].ddp_phys_birth != 0) + nclass = DDT_CLASS_DITTO; + else if (total_refcnt > 1) + nclass = DDT_CLASS_DUPLICATE; + else + nclass = DDT_CLASS_UNIQUE; + + if (otype != DDT_TYPES && + (otype != ntype || oclass != nclass || total_refcnt == 0)) { + VERIFY(ddt_object_remove(ddt, otype, oclass, dde, tx) == 0); + ASSERT(ddt_object_lookup(ddt, otype, oclass, dde) == ENOENT); + } + + if (total_refcnt != 0) { + dde->dde_type = ntype; + dde->dde_class = nclass; + 
ddt_stat_update(ddt, dde, 0); + if (!ddt_object_exists(ddt, ntype, nclass)) + ddt_object_create(ddt, ntype, nclass, tx); + VERIFY(ddt_object_update(ddt, ntype, nclass, dde, tx) == 0); + + /* + * If the class changes, the order that we scan this bp + * changes. If it decreases, we could miss it, so + * scan it right now. (This covers both class changing + * while we are doing ddt_walk(), and when we are + * traversing.) + */ + if (nclass < oclass) { + dsl_scan_ddt_entry(dp->dp_scan, + ddt->ddt_checksum, dde, tx); + } + } +} + +static void +ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg) +{ + spa_t *spa = ddt->ddt_spa; + ddt_entry_t *dde; + void *cookie = NULL; + + if (avl_numnodes(&ddt->ddt_tree) == 0) + return; + + ASSERT(spa->spa_uberblock.ub_version >= SPA_VERSION_DEDUP); + + if (spa->spa_ddt_stat_object == 0) { + spa->spa_ddt_stat_object = zap_create_link(ddt->ddt_os, + DMU_OT_DDT_STATS, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_DDT_STATS, tx); + } + + while ((dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) { + ddt_sync_entry(ddt, dde, tx, txg); + ddt_free(dde); + } + + for (enum ddt_type type = 0; type < DDT_TYPES; type++) { + uint64_t add, count = 0; + for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { + if (ddt_object_exists(ddt, type, class)) { + ddt_object_sync(ddt, type, class, tx); + VERIFY(ddt_object_count(ddt, type, class, + &add) == 0); + count += add; + } + } + for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { + if (count == 0 && ddt_object_exists(ddt, type, class)) + ddt_object_destroy(ddt, type, class, tx); + } + } + + bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache, + sizeof (ddt->ddt_histogram)); +} + +void +ddt_sync(spa_t *spa, uint64_t txg) +{ + dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; + dmu_tx_t *tx; + zio_t *rio; + + ASSERT(spa_syncing_txg(spa) == txg); + + tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); + + rio = zio_root(spa, NULL, NULL, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SELF_HEAL); + + /* + * This function may cause an immediate scan of ddt blocks (see + * the comment above dsl_scan_ddt() for details). We set the + * scan's root zio here so that we can wait for any scan IOs in + * addition to the regular ddt IOs. 
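+ * (rio is waited on near the bottom of this function, after which
+ * scn_zio_root is cleared again for the next txg.)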
+ */ + ASSERT3P(scn->scn_zio_root, ==, NULL); + scn->scn_zio_root = rio; + + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + ddt_t *ddt = spa->spa_ddt[c]; + if (ddt == NULL) + continue; + ddt_sync_table(ddt, tx, txg); + ddt_repair_table(ddt, rio); + } + + (void) zio_wait(rio); + scn->scn_zio_root = NULL; + + dmu_tx_commit(tx); +} + +int +ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde) +{ + do { + do { + do { + ddt_t *ddt = spa->spa_ddt[ddb->ddb_checksum]; + int error = ENOENT; + if (ddt_object_exists(ddt, ddb->ddb_type, + ddb->ddb_class)) { + error = ddt_object_walk(ddt, + ddb->ddb_type, ddb->ddb_class, + &ddb->ddb_cursor, dde); + } + dde->dde_type = ddb->ddb_type; + dde->dde_class = ddb->ddb_class; + if (error == 0) + return (0); + if (error != ENOENT) + return (error); + ddb->ddb_cursor = 0; + } while (++ddb->ddb_checksum < ZIO_CHECKSUM_FUNCTIONS); + ddb->ddb_checksum = 0; + } while (++ddb->ddb_type < DDT_TYPES); + ddb->ddb_type = 0; + } while (++ddb->ddb_class < DDT_CLASSES); + + return (SET_ERROR(ENOENT)); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt_zap.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt_zap.c new file mode 100644 index 000000000000..f42b4ddaac00 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt_zap.c @@ -0,0 +1,153 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/zio.h> +#include <sys/ddt.h> +#include <sys/zap.h> +#include <sys/dmu_tx.h> + +int ddt_zap_leaf_blockshift = 12; +int ddt_zap_indirect_blockshift = 12; + +static int +ddt_zap_create(objset_t *os, uint64_t *objectp, dmu_tx_t *tx, boolean_t prehash) +{ + zap_flags_t flags = ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY; + + if (prehash) + flags |= ZAP_FLAG_PRE_HASHED_KEY; + + *objectp = zap_create_flags(os, 0, flags, DMU_OT_DDT_ZAP, + ddt_zap_leaf_blockshift, ddt_zap_indirect_blockshift, + DMU_OT_NONE, 0, tx); + + return (*objectp == 0 ? 
ENOTSUP : 0); +} + +static int +ddt_zap_destroy(objset_t *os, uint64_t object, dmu_tx_t *tx) +{ + return (zap_destroy(os, object, tx)); +} + +static int +ddt_zap_lookup(objset_t *os, uint64_t object, ddt_entry_t *dde) +{ + uchar_t cbuf[sizeof (dde->dde_phys) + 1]; + uint64_t one, csize; + int error; + + error = zap_length_uint64(os, object, (uint64_t *)&dde->dde_key, + DDT_KEY_WORDS, &one, &csize); + if (error) + return (error); + + ASSERT(one == 1); + ASSERT(csize <= sizeof (cbuf)); + + error = zap_lookup_uint64(os, object, (uint64_t *)&dde->dde_key, + DDT_KEY_WORDS, 1, csize, cbuf); + if (error) + return (error); + + ddt_decompress(cbuf, dde->dde_phys, csize, sizeof (dde->dde_phys)); + + return (0); +} + +static void +ddt_zap_prefetch(objset_t *os, uint64_t object, ddt_entry_t *dde) +{ + (void) zap_prefetch_uint64(os, object, (uint64_t *)&dde->dde_key, + DDT_KEY_WORDS); +} + +static int +ddt_zap_update(objset_t *os, uint64_t object, ddt_entry_t *dde, dmu_tx_t *tx) +{ + uchar_t cbuf[sizeof (dde->dde_phys) + 1]; + uint64_t csize; + + csize = ddt_compress(dde->dde_phys, cbuf, + sizeof (dde->dde_phys), sizeof (cbuf)); + + return (zap_update_uint64(os, object, (uint64_t *)&dde->dde_key, + DDT_KEY_WORDS, 1, csize, cbuf, tx)); +} + +static int +ddt_zap_remove(objset_t *os, uint64_t object, ddt_entry_t *dde, dmu_tx_t *tx) +{ + return (zap_remove_uint64(os, object, (uint64_t *)&dde->dde_key, + DDT_KEY_WORDS, tx)); +} + +static int +ddt_zap_walk(objset_t *os, uint64_t object, ddt_entry_t *dde, uint64_t *walk) +{ + zap_cursor_t zc; + zap_attribute_t za; + int error; + + zap_cursor_init_serialized(&zc, os, object, *walk); + if ((error = zap_cursor_retrieve(&zc, &za)) == 0) { + uchar_t cbuf[sizeof (dde->dde_phys) + 1]; + uint64_t csize = za.za_num_integers; + ASSERT(za.za_integer_length == 1); + error = zap_lookup_uint64(os, object, (uint64_t *)za.za_name, + DDT_KEY_WORDS, 1, csize, cbuf); + ASSERT(error == 0); + if (error == 0) { + ddt_decompress(cbuf, dde->dde_phys, csize, + sizeof (dde->dde_phys)); + dde->dde_key = *(ddt_key_t *)za.za_name; + } + zap_cursor_advance(&zc); + *walk = zap_cursor_serialize(&zc); + } + zap_cursor_fini(&zc); + return (error); +} + +static int +ddt_zap_count(objset_t *os, uint64_t object, uint64_t *count) +{ + + return (zap_count(os, object, count)); +} + +const ddt_ops_t ddt_zap_ops = { + "zap", + ddt_zap_create, + ddt_zap_destroy, + ddt_zap_lookup, + ddt_zap_prefetch, + ddt_zap_update, + ddt_zap_remove, + ddt_zap_walk, + ddt_zap_count, +}; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c new file mode 100644 index 000000000000..a0d66a99a3f0 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c @@ -0,0 +1,2687 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2017 by Delphix. All rights reserved. + */ +/* Copyright (c) 2013 by Saso Kiselkov. All rights reserved. */ +/* Copyright (c) 2013, Joyent, Inc. All rights reserved. */ +/* Copyright 2016 Nexenta Systems, Inc. All rights reserved. */ + +#include <sys/dmu.h> +#include <sys/dmu_impl.h> +#include <sys/dmu_tx.h> +#include <sys/dbuf.h> +#include <sys/dnode.h> +#include <sys/zfs_context.h> +#include <sys/dmu_objset.h> +#include <sys/dmu_traverse.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_pool.h> +#include <sys/dsl_synctask.h> +#include <sys/dsl_prop.h> +#include <sys/dmu_zfetch.h> +#include <sys/zfs_ioctl.h> +#include <sys/zap.h> +#include <sys/zio_checksum.h> +#include <sys/zio_compress.h> +#include <sys/sa.h> +#include <sys/zfeature.h> +#include <sys/abd.h> +#ifdef _KERNEL +#include <sys/racct.h> +#include <sys/vm.h> +#include <sys/zfs_znode.h> +#endif + +/* + * Enable/disable nopwrite feature. + */ +int zfs_nopwrite_enabled = 1; +SYSCTL_DECL(_vfs_zfs); +SYSCTL_INT(_vfs_zfs, OID_AUTO, nopwrite_enabled, CTLFLAG_RDTUN, + &zfs_nopwrite_enabled, 0, "Enable nopwrite feature"); + +/* + * Tunable to control percentage of dirtied blocks from frees in one TXG. + * After this threshold is crossed, additional dirty blocks from frees + * wait until the next TXG. + * A value of zero will disable this throttle. + */ +uint32_t zfs_per_txg_dirty_frees_percent = 30; +SYSCTL_INT(_vfs_zfs, OID_AUTO, per_txg_dirty_frees_percent, CTLFLAG_RWTUN, + &zfs_per_txg_dirty_frees_percent, 0, "Percentage of dirtied blocks from frees in one txg"); + +/* + * This can be used for testing, to ensure that certain actions happen + * while in the middle of a remap (which might otherwise complete too + * quickly). 
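+ * A nonzero value delays each L1 remap by that many ticks via the
+ * delay() call in dmu_object_remap_one_indirect(); no sysctl is
+ * attached to it here, so it is presumably meant to be set from a
+ * debugger or a test patch.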
+ */ +int zfs_object_remap_one_indirect_delay_ticks = 0; + +const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { + { DMU_BSWAP_UINT8, TRUE, FALSE, "unallocated" }, + { DMU_BSWAP_ZAP, TRUE, TRUE, "object directory" }, + { DMU_BSWAP_UINT64, TRUE, TRUE, "object array" }, + { DMU_BSWAP_UINT8, TRUE, FALSE, "packed nvlist" }, + { DMU_BSWAP_UINT64, TRUE, FALSE, "packed nvlist size" }, + { DMU_BSWAP_UINT64, TRUE, FALSE, "bpobj" }, + { DMU_BSWAP_UINT64, TRUE, FALSE, "bpobj header" }, + { DMU_BSWAP_UINT64, TRUE, FALSE, "SPA space map header" }, + { DMU_BSWAP_UINT64, TRUE, FALSE, "SPA space map" }, + { DMU_BSWAP_UINT64, TRUE, FALSE, "ZIL intent log" }, + { DMU_BSWAP_DNODE, TRUE, FALSE, "DMU dnode" }, + { DMU_BSWAP_OBJSET, TRUE, TRUE, "DMU objset" }, + { DMU_BSWAP_UINT64, TRUE, TRUE, "DSL directory" }, + { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL directory child map" }, + { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL dataset snap map" }, + { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL props" }, + { DMU_BSWAP_UINT64, TRUE, TRUE, "DSL dataset" }, + { DMU_BSWAP_ZNODE, TRUE, FALSE, "ZFS znode" }, + { DMU_BSWAP_OLDACL, TRUE, FALSE, "ZFS V0 ACL" }, + { DMU_BSWAP_UINT8, FALSE, FALSE, "ZFS plain file" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS directory" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS master node" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS delete queue" }, + { DMU_BSWAP_UINT8, FALSE, FALSE, "zvol object" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, "zvol prop" }, + { DMU_BSWAP_UINT8, FALSE, FALSE, "other uint8[]" }, + { DMU_BSWAP_UINT64, FALSE, FALSE, "other uint64[]" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, "other ZAP" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, "persistent error log" }, + { DMU_BSWAP_UINT8, TRUE, FALSE, "SPA history" }, + { DMU_BSWAP_UINT64, TRUE, FALSE, "SPA history offsets" }, + { DMU_BSWAP_ZAP, TRUE, TRUE, "Pool properties" }, + { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL permissions" }, + { DMU_BSWAP_ACL, TRUE, FALSE, "ZFS ACL" }, + { DMU_BSWAP_UINT8, TRUE, FALSE, "ZFS SYSACL" }, + { DMU_BSWAP_UINT8, TRUE, FALSE, "FUID table" }, + { DMU_BSWAP_UINT64, TRUE, FALSE, "FUID table size" }, + { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL dataset next clones" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, "scan work queue" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS user/group used" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS user/group quota" }, + { DMU_BSWAP_ZAP, TRUE, TRUE, "snapshot refcount tags" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, "DDT ZAP algorithm" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, "DDT statistics" }, + { DMU_BSWAP_UINT8, TRUE, FALSE, "System attributes" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, "SA master node" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, "SA attr registration" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, "SA attr layouts" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, "scan translations" }, + { DMU_BSWAP_UINT8, FALSE, FALSE, "deduplicated block" }, + { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL deadlist map" }, + { DMU_BSWAP_UINT64, TRUE, TRUE, "DSL deadlist map hdr" }, + { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL dir clones" }, + { DMU_BSWAP_UINT64, TRUE, FALSE, "bpobj subobj" } +}; + +const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = { + { byteswap_uint8_array, "uint8" }, + { byteswap_uint16_array, "uint16" }, + { byteswap_uint32_array, "uint32" }, + { byteswap_uint64_array, "uint64" }, + { zap_byteswap, "zap" }, + { dnode_buf_byteswap, "dnode" }, + { dmu_objset_byteswap, "objset" }, + { zfs_znode_byteswap, "znode" }, + { zfs_oldacl_byteswap, "oldacl" }, + { zfs_acl_byteswap, "acl" } +}; + +int +dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset, + void *tag, dmu_buf_t **dbp) +{ + 
uint64_t blkid; + dmu_buf_impl_t *db; + + blkid = dbuf_whichblock(dn, 0, offset); + rw_enter(&dn->dn_struct_rwlock, RW_READER); + db = dbuf_hold(dn, blkid, tag); + rw_exit(&dn->dn_struct_rwlock); + + if (db == NULL) { + *dbp = NULL; + return (SET_ERROR(EIO)); + } + + *dbp = &db->db; + return (0); +} +int +dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset, + void *tag, dmu_buf_t **dbp) +{ + dnode_t *dn; + uint64_t blkid; + dmu_buf_impl_t *db; + int err; + + err = dnode_hold(os, object, FTAG, &dn); + if (err) + return (err); + blkid = dbuf_whichblock(dn, 0, offset); + rw_enter(&dn->dn_struct_rwlock, RW_READER); + db = dbuf_hold(dn, blkid, tag); + rw_exit(&dn->dn_struct_rwlock); + dnode_rele(dn, FTAG); + + if (db == NULL) { + *dbp = NULL; + return (SET_ERROR(EIO)); + } + + *dbp = &db->db; + return (err); +} + +int +dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset, + void *tag, dmu_buf_t **dbp, int flags) +{ + int err; + int db_flags = DB_RF_CANFAIL; + + if (flags & DMU_READ_NO_PREFETCH) + db_flags |= DB_RF_NOPREFETCH; + + err = dmu_buf_hold_noread_by_dnode(dn, offset, tag, dbp); + if (err == 0) { + dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp); + err = dbuf_read(db, NULL, db_flags); + if (err != 0) { + dbuf_rele(db, tag); + *dbp = NULL; + } + } + + return (err); +} + +int +dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, + void *tag, dmu_buf_t **dbp, int flags) +{ + int err; + int db_flags = DB_RF_CANFAIL; + + if (flags & DMU_READ_NO_PREFETCH) + db_flags |= DB_RF_NOPREFETCH; + + err = dmu_buf_hold_noread(os, object, offset, tag, dbp); + if (err == 0) { + dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp); + err = dbuf_read(db, NULL, db_flags); + if (err != 0) { + dbuf_rele(db, tag); + *dbp = NULL; + } + } + + return (err); +} + +int +dmu_bonus_max(void) +{ + return (DN_OLD_MAX_BONUSLEN); +} + +int +dmu_set_bonus(dmu_buf_t *db_fake, int newsize, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + dnode_t *dn; + int error; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + + if (dn->dn_bonus != db) { + error = SET_ERROR(EINVAL); + } else if (newsize < 0 || newsize > db_fake->db_size) { + error = SET_ERROR(EINVAL); + } else { + dnode_setbonuslen(dn, newsize, tx); + error = 0; + } + + DB_DNODE_EXIT(db); + return (error); +} + +int +dmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + dnode_t *dn; + int error; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + + if (!DMU_OT_IS_VALID(type)) { + error = SET_ERROR(EINVAL); + } else if (dn->dn_bonus != db) { + error = SET_ERROR(EINVAL); + } else { + dnode_setbonus_type(dn, type, tx); + error = 0; + } + + DB_DNODE_EXIT(db); + return (error); +} + +dmu_object_type_t +dmu_get_bonustype(dmu_buf_t *db_fake) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + dnode_t *dn; + dmu_object_type_t type; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + type = dn->dn_bonustype; + DB_DNODE_EXIT(db); + + return (type); +} + +int +dmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx) +{ + dnode_t *dn; + int error; + + error = dnode_hold(os, object, FTAG, &dn); + dbuf_rm_spill(dn, tx); + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + dnode_rm_spill(dn, tx); + rw_exit(&dn->dn_struct_rwlock); + dnode_rele(dn, FTAG); + return (error); +} + +/* + * returns ENOENT, EIO, or 0. 
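+ *
+ * A minimal usage sketch (the caller shown is hypothetical, not
+ * part of this file):
+ *
+ *	dmu_buf_t *db;
+ *	if (dmu_bonus_hold(os, object, FTAG, &db) == 0) {
+ *		... read up to db->db_size bytes from db->db_data ...
+ *		dmu_buf_rele(db, FTAG);
+ *	}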
+ */ +int +dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp) +{ + dnode_t *dn; + dmu_buf_impl_t *db; + int error; + + error = dnode_hold(os, object, FTAG, &dn); + if (error) + return (error); + + rw_enter(&dn->dn_struct_rwlock, RW_READER); + if (dn->dn_bonus == NULL) { + rw_exit(&dn->dn_struct_rwlock); + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + if (dn->dn_bonus == NULL) + dbuf_create_bonus(dn); + } + db = dn->dn_bonus; + + /* as long as the bonus buf is held, the dnode will be held */ + if (refcount_add(&db->db_holds, tag) == 1) { + VERIFY(dnode_add_ref(dn, db)); + atomic_inc_32(&dn->dn_dbufs_count); + } + + /* + * Wait to drop dn_struct_rwlock until after adding the bonus dbuf's + * hold and incrementing the dbuf count to ensure that dnode_move() sees + * a dnode hold for every dbuf. + */ + rw_exit(&dn->dn_struct_rwlock); + + dnode_rele(dn, FTAG); + + VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH)); + + *dbp = &db->db; + return (0); +} + +/* + * returns ENOENT, EIO, or 0. + * + * This interface will allocate a blank spill dbuf when a spill blk + * doesn't already exist on the dnode. + * + * if you only want to find an already existing spill db, then + * dmu_spill_hold_existing() should be used. + */ +int +dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, void *tag, dmu_buf_t **dbp) +{ + dmu_buf_impl_t *db = NULL; + int err; + + if ((flags & DB_RF_HAVESTRUCT) == 0) + rw_enter(&dn->dn_struct_rwlock, RW_READER); + + db = dbuf_hold(dn, DMU_SPILL_BLKID, tag); + + if ((flags & DB_RF_HAVESTRUCT) == 0) + rw_exit(&dn->dn_struct_rwlock); + + ASSERT(db != NULL); + err = dbuf_read(db, NULL, flags); + if (err == 0) + *dbp = &db->db; + else + dbuf_rele(db, tag); + return (err); +} + +int +dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus; + dnode_t *dn; + int err; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + + if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_SA) { + err = SET_ERROR(EINVAL); + } else { + rw_enter(&dn->dn_struct_rwlock, RW_READER); + + if (!dn->dn_have_spill) { + err = SET_ERROR(ENOENT); + } else { + err = dmu_spill_hold_by_dnode(dn, + DB_RF_HAVESTRUCT | DB_RF_CANFAIL, tag, dbp); + } + + rw_exit(&dn->dn_struct_rwlock); + } + + DB_DNODE_EXIT(db); + return (err); +} + +int +dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus; + dnode_t *dn; + int err; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + err = dmu_spill_hold_by_dnode(dn, DB_RF_CANFAIL, tag, dbp); + DB_DNODE_EXIT(db); + + return (err); +} + +/* + * Note: longer-term, we should modify all of the dmu_buf_*() interfaces + * to take a held dnode rather than <os, object> -- the lookup is wasteful, + * and can induce severe lock contention when writing to several files + * whose dnodes are in the same block. + */ +int +dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, + boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags) +{ + dmu_buf_t **dbp; + uint64_t blkid, nblks, i; + uint32_t dbuf_flags; + int err; + zio_t *zio; + + ASSERT(length <= DMU_MAX_ACCESS); + + /* + * Note: We directly notify the prefetch code of this read, so that + * we can tell it about the multi-block read. dbuf_read() only knows + * about the one block it is accessing. 
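+ * (That notification is the dmu_zfetch() call below, which is handed
+ * the whole blkid..blkid+nblks range at once.)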
+ */ + dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT | + DB_RF_NOPREFETCH; + + rw_enter(&dn->dn_struct_rwlock, RW_READER); + if (dn->dn_datablkshift) { + int blkshift = dn->dn_datablkshift; + nblks = (P2ROUNDUP(offset + length, 1ULL << blkshift) - + P2ALIGN(offset, 1ULL << blkshift)) >> blkshift; + } else { + if (offset + length > dn->dn_datablksz) { + zfs_panic_recover("zfs: accessing past end of object " + "%llx/%llx (size=%u access=%llu+%llu)", + (longlong_t)dn->dn_objset-> + os_dsl_dataset->ds_object, + (longlong_t)dn->dn_object, dn->dn_datablksz, + (longlong_t)offset, (longlong_t)length); + rw_exit(&dn->dn_struct_rwlock); + return (SET_ERROR(EIO)); + } + nblks = 1; + } + dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP); + +#if defined(_KERNEL) && defined(RACCT) + if (racct_enable && !read) { + PROC_LOCK(curproc); + racct_add_force(curproc, RACCT_WRITEBPS, length); + racct_add_force(curproc, RACCT_WRITEIOPS, nblks); + PROC_UNLOCK(curproc); + } +#endif + + zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); + blkid = dbuf_whichblock(dn, 0, offset); + for (i = 0; i < nblks; i++) { + dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag); + if (db == NULL) { + rw_exit(&dn->dn_struct_rwlock); + dmu_buf_rele_array(dbp, nblks, tag); + zio_nowait(zio); + return (SET_ERROR(EIO)); + } + + /* initiate async i/o */ + if (read) + (void) dbuf_read(db, zio, dbuf_flags); +#ifdef _KERNEL + else + curthread->td_ru.ru_oublock++; +#endif + dbp[i] = &db->db; + } + + if ((flags & DMU_READ_NO_PREFETCH) == 0 && + DNODE_META_IS_CACHEABLE(dn) && length <= zfetch_array_rd_sz) { + dmu_zfetch(&dn->dn_zfetch, blkid, nblks, + read && DNODE_IS_CACHEABLE(dn)); + } + rw_exit(&dn->dn_struct_rwlock); + + /* wait for async i/o */ + err = zio_wait(zio); + if (err) { + dmu_buf_rele_array(dbp, nblks, tag); + return (err); + } + + /* wait for other io to complete */ + if (read) { + for (i = 0; i < nblks; i++) { + dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i]; + mutex_enter(&db->db_mtx); + while (db->db_state == DB_READ || + db->db_state == DB_FILL) + cv_wait(&db->db_changed, &db->db_mtx); + if (db->db_state == DB_UNCACHED) + err = SET_ERROR(EIO); + mutex_exit(&db->db_mtx); + if (err) { + dmu_buf_rele_array(dbp, nblks, tag); + return (err); + } + } + } + + *numbufsp = nblks; + *dbpp = dbp; + return (0); +} + +static int +dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, + uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) +{ + dnode_t *dn; + int err; + + err = dnode_hold(os, object, FTAG, &dn); + if (err) + return (err); + + err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, + numbufsp, dbpp, DMU_READ_PREFETCH); + + dnode_rele(dn, FTAG); + + return (err); +} + +int +dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset, + uint64_t length, boolean_t read, void *tag, int *numbufsp, + dmu_buf_t ***dbpp) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + dnode_t *dn; + int err; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, + numbufsp, dbpp, DMU_READ_PREFETCH); + DB_DNODE_EXIT(db); + + return (err); +} + +void +dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag) +{ + int i; + dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake; + + if (numbufs == 0) + return; + + for (i = 0; i < numbufs; i++) { + if (dbp[i]) + dbuf_rele(dbp[i], tag); + } + + kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs); +} + +/* + * Issue prefetch i/os for the given blocks. 
If level is greater than 0, the + * indirect blocks prefetched will be those that point to the blocks containing + * the data starting at offset, and continuing to offset + len. + * + * Note that if the indirect blocks above the blocks being prefetched are not in + * cache, they will be asynchronously read in. + */ +void +dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, + uint64_t len, zio_priority_t pri) +{ + dnode_t *dn; + uint64_t blkid; + int nblks, err; + + if (len == 0) { /* they're interested in the bonus buffer */ + dn = DMU_META_DNODE(os); + + if (object == 0 || object >= DN_MAX_OBJECT) + return; + + rw_enter(&dn->dn_struct_rwlock, RW_READER); + blkid = dbuf_whichblock(dn, level, + object * sizeof (dnode_phys_t)); + dbuf_prefetch(dn, level, blkid, pri, 0); + rw_exit(&dn->dn_struct_rwlock); + return; + } + + /* + * XXX - Note, if the dnode for the requested object is not + * already cached, we will do a *synchronous* read in the + * dnode_hold() call. The same is true for any indirects. + */ + err = dnode_hold(os, object, FTAG, &dn); + if (err != 0) + return; + + rw_enter(&dn->dn_struct_rwlock, RW_READER); + /* + * offset + len - 1 is the last byte we want to prefetch for, and offset + * is the first. Then dbuf_whichblock(dn, level, off + len - 1) is the + * last block we want to prefetch, and dbuf_whichblock(dn, level, + * offset) is the first. Then the number we need to prefetch is the + * last - first + 1. + */ + if (level > 0 || dn->dn_datablkshift != 0) { + nblks = dbuf_whichblock(dn, level, offset + len - 1) - + dbuf_whichblock(dn, level, offset) + 1; + } else { + nblks = (offset < dn->dn_datablksz); + } + + if (nblks != 0) { + blkid = dbuf_whichblock(dn, level, offset); + for (int i = 0; i < nblks; i++) + dbuf_prefetch(dn, level, blkid + i, pri, 0); + } + + rw_exit(&dn->dn_struct_rwlock); + + dnode_rele(dn, FTAG); +} + +/* + * Get the next "chunk" of file data to free. We traverse the file from + * the end so that the file gets shorter over time (if we crash in the + * middle, this will leave us in a better state). We find allocated file + * data by simply searching the allocated level 1 indirects. + * + * On input, *start should be the first offset that does not need to be + * freed (e.g. "offset + length"). On return, *start will be the first + * offset that should be freed. + */ +static int +get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum) +{ + uint64_t maxblks = DMU_MAX_ACCESS >> (dn->dn_indblkshift + 1); + /* bytes of data covered by a level-1 indirect block */ + uint64_t iblkrange = + dn->dn_datablksz * EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT); + + ASSERT3U(minimum, <=, *start); + + if (*start - minimum <= iblkrange * maxblks) { + *start = minimum; + return (0); + } + ASSERT(ISP2(iblkrange)); + + for (uint64_t blks = 0; *start > minimum && blks < maxblks; blks++) { + int err; + + /* + * dnode_next_offset(BACKWARDS) will find an allocated L1 + * indirect block at or before the input offset. We must + * decrement *start so that it is at the end of the region + * to search. 
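+ * (The byte at *start itself is the first one that must be kept, so
+ * the backwards search has to begin one byte earlier.)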
+ */ + (*start)--; + err = dnode_next_offset(dn, + DNODE_FIND_BACKWARDS, start, 2, 1, 0); + + /* if there are no indirect blocks before start, we are done */ + if (err == ESRCH) { + *start = minimum; + break; + } else if (err != 0) { + return (err); + } + + /* set start to the beginning of this L1 indirect */ + *start = P2ALIGN(*start, iblkrange); + } + if (*start < minimum) + *start = minimum; + return (0); +} + +/* + * If this objset is of type DMU_OST_ZFS, return true if the vfs's unmounted + * flag is set; otherwise return false. + * Used below in dmu_free_long_range_impl() to allow aborting when unmounting. + */ +/*ARGSUSED*/ +static boolean_t +dmu_objset_zfs_unmounting(objset_t *os) +{ +#ifdef _KERNEL + if (dmu_objset_type(os) == DMU_OST_ZFS) + return (zfs_get_vfs_flag_unmounted(os)); +#endif + return (B_FALSE); +} + +static int +dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset, + uint64_t length) +{ + uint64_t object_size = (dn->dn_maxblkid + 1) * dn->dn_datablksz; + int err; + uint64_t dirty_frees_threshold; + dsl_pool_t *dp = dmu_objset_pool(os); + + if (offset >= object_size) + return (0); + + if (zfs_per_txg_dirty_frees_percent <= 100) + dirty_frees_threshold = + zfs_per_txg_dirty_frees_percent * zfs_dirty_data_max / 100; + else + dirty_frees_threshold = zfs_dirty_data_max / 4; + + if (length == DMU_OBJECT_END || offset + length > object_size) + length = object_size - offset; + + while (length != 0) { + uint64_t chunk_end, chunk_begin, chunk_len; + uint64_t long_free_dirty_all_txgs = 0; + dmu_tx_t *tx; + + if (dmu_objset_zfs_unmounting(dn->dn_objset)) + return (SET_ERROR(EINTR)); + + chunk_end = chunk_begin = offset + length; + + /* move chunk_begin backwards to the beginning of this chunk */ + err = get_next_chunk(dn, &chunk_begin, offset); + if (err) + return (err); + ASSERT3U(chunk_begin, >=, offset); + ASSERT3U(chunk_begin, <=, chunk_end); + + chunk_len = chunk_end - chunk_begin; + + mutex_enter(&dp->dp_lock); + for (int t = 0; t < TXG_SIZE; t++) { + long_free_dirty_all_txgs += + dp->dp_long_free_dirty_pertxg[t]; + } + mutex_exit(&dp->dp_lock); + + /* + * To avoid filling up a TXG with just frees, wait for + * the next TXG to open before freeing more chunks if + * we have reached the threshold of frees. + */ + if (dirty_frees_threshold != 0 && + long_free_dirty_all_txgs >= dirty_frees_threshold) { + txg_wait_open(dp, 0); + continue; + } + + tx = dmu_tx_create(os); + dmu_tx_hold_free(tx, dn->dn_object, chunk_begin, chunk_len); + + /* + * Mark this transaction as typically resulting in a net + * reduction in space used. 
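+ * This lets the dmu_tx_assign() below succeed even when the pool is
+ * almost full, since the free is expected to return space to it.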
+ */ + dmu_tx_mark_netfree(tx); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) { + dmu_tx_abort(tx); + return (err); + } + + mutex_enter(&dp->dp_lock); + dp->dp_long_free_dirty_pertxg[dmu_tx_get_txg(tx) & TXG_MASK] += + chunk_len; + mutex_exit(&dp->dp_lock); + DTRACE_PROBE3(free__long__range, + uint64_t, long_free_dirty_all_txgs, uint64_t, chunk_len, + uint64_t, dmu_tx_get_txg(tx)); + dnode_free_range(dn, chunk_begin, chunk_len, tx); + dmu_tx_commit(tx); + + length -= chunk_len; + } + return (0); +} + +int +dmu_free_long_range(objset_t *os, uint64_t object, + uint64_t offset, uint64_t length) +{ + dnode_t *dn; + int err; + + err = dnode_hold(os, object, FTAG, &dn); + if (err != 0) + return (err); + err = dmu_free_long_range_impl(os, dn, offset, length); + + /* + * It is important to zero out the maxblkid when freeing the entire + * file, so that (a) subsequent calls to dmu_free_long_range_impl() + * will take the fast path, and (b) dnode_reallocate() can verify + * that the entire file has been freed. + */ + if (err == 0 && offset == 0 && length == DMU_OBJECT_END) + dn->dn_maxblkid = 0; + + dnode_rele(dn, FTAG); + return (err); +} + +int +dmu_free_long_object(objset_t *os, uint64_t object) +{ + dmu_tx_t *tx; + int err; + + err = dmu_free_long_range(os, object, 0, DMU_OBJECT_END); + if (err != 0) + return (err); + + tx = dmu_tx_create(os); + dmu_tx_hold_bonus(tx, object); + dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END); + dmu_tx_mark_netfree(tx); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err == 0) { + err = dmu_object_free(os, object, tx); + dmu_tx_commit(tx); + } else { + dmu_tx_abort(tx); + } + + return (err); +} + +int +dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, + uint64_t size, dmu_tx_t *tx) +{ + dnode_t *dn; + int err = dnode_hold(os, object, FTAG, &dn); + if (err) + return (err); + ASSERT(offset < UINT64_MAX); + ASSERT(size == -1ULL || size <= UINT64_MAX - offset); + dnode_free_range(dn, offset, size, tx); + dnode_rele(dn, FTAG); + return (0); +} + +static int +dmu_read_impl(dnode_t *dn, uint64_t offset, uint64_t size, + void *buf, uint32_t flags) +{ + dmu_buf_t **dbp; + int numbufs, err = 0; + + /* + * Deal with odd block sizes, where there can't be data past the first + * block. If we ever do the tail block optimization, we will need to + * handle that here as well. + */ + if (dn->dn_maxblkid == 0) { + int newsz = offset > dn->dn_datablksz ? 0 : + MIN(size, dn->dn_datablksz - offset); + bzero((char *)buf + newsz, size - newsz); + size = newsz; + } + + while (size > 0) { + uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2); + int i; + + /* + * NB: we could do this block-at-a-time, but it's nice + * to be reading in parallel. 
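+ * (dmu_buf_hold_array_by_dnode() issues all the reads under a single
+ * root zio and waits for them together, so the copy loop below walks
+ * buffers that were filled concurrently.)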
+ */ + err = dmu_buf_hold_array_by_dnode(dn, offset, mylen, + TRUE, FTAG, &numbufs, &dbp, flags); + if (err) + break; + + for (i = 0; i < numbufs; i++) { + int tocpy; + int bufoff; + dmu_buf_t *db = dbp[i]; + + ASSERT(size > 0); + + bufoff = offset - db->db_offset; + tocpy = (int)MIN(db->db_size - bufoff, size); + + bcopy((char *)db->db_data + bufoff, buf, tocpy); + + offset += tocpy; + size -= tocpy; + buf = (char *)buf + tocpy; + } + dmu_buf_rele_array(dbp, numbufs, FTAG); + } + return (err); +} + +int +dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, + void *buf, uint32_t flags) +{ + dnode_t *dn; + int err; + + err = dnode_hold(os, object, FTAG, &dn); + if (err != 0) + return (err); + + err = dmu_read_impl(dn, offset, size, buf, flags); + dnode_rele(dn, FTAG); + return (err); +} + +int +dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf, + uint32_t flags) +{ + return (dmu_read_impl(dn, offset, size, buf, flags)); +} + +static void +dmu_write_impl(dmu_buf_t **dbp, int numbufs, uint64_t offset, uint64_t size, + const void *buf, dmu_tx_t *tx) +{ + int i; + + for (i = 0; i < numbufs; i++) { + int tocpy; + int bufoff; + dmu_buf_t *db = dbp[i]; + + ASSERT(size > 0); + + bufoff = offset - db->db_offset; + tocpy = (int)MIN(db->db_size - bufoff, size); + + ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); + + if (tocpy == db->db_size) + dmu_buf_will_fill(db, tx); + else + dmu_buf_will_dirty(db, tx); + + bcopy(buf, (char *)db->db_data + bufoff, tocpy); + + if (tocpy == db->db_size) + dmu_buf_fill_done(db, tx); + + offset += tocpy; + size -= tocpy; + buf = (char *)buf + tocpy; + } +} + +void +dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, + const void *buf, dmu_tx_t *tx) +{ + dmu_buf_t **dbp; + int numbufs; + + if (size == 0) + return; + + VERIFY0(dmu_buf_hold_array(os, object, offset, size, + FALSE, FTAG, &numbufs, &dbp)); + dmu_write_impl(dbp, numbufs, offset, size, buf, tx); + dmu_buf_rele_array(dbp, numbufs, FTAG); +} + +void +dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, + const void *buf, dmu_tx_t *tx) +{ + dmu_buf_t **dbp; + int numbufs; + + if (size == 0) + return; + + VERIFY0(dmu_buf_hold_array_by_dnode(dn, offset, size, + FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH)); + dmu_write_impl(dbp, numbufs, offset, size, buf, tx); + dmu_buf_rele_array(dbp, numbufs, FTAG); +} + +static int +dmu_object_remap_one_indirect(objset_t *os, dnode_t *dn, + uint64_t last_removal_txg, uint64_t offset) +{ + uint64_t l1blkid = dbuf_whichblock(dn, 1, offset); + int err = 0; + + rw_enter(&dn->dn_struct_rwlock, RW_READER); + dmu_buf_impl_t *dbuf = dbuf_hold_level(dn, 1, l1blkid, FTAG); + ASSERT3P(dbuf, !=, NULL); + + /* + * If the block hasn't been written yet, this default will ensure + * we don't try to remap it. + */ + uint64_t birth = UINT64_MAX; + ASSERT3U(last_removal_txg, !=, UINT64_MAX); + if (dbuf->db_blkptr != NULL) + birth = dbuf->db_blkptr->blk_birth; + rw_exit(&dn->dn_struct_rwlock); + + /* + * If this L1 was already written after the last removal, then we've + * already tried to remap it. 
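+ * (Only an L1 born at or before last_removal_txg can still reference
+ * the removed device; anything written later was already allocated
+ * on a concrete vdev.)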
+ */ + if (birth <= last_removal_txg && + dbuf_read(dbuf, NULL, DB_RF_MUST_SUCCEED) == 0 && + dbuf_can_remap(dbuf)) { + dmu_tx_t *tx = dmu_tx_create(os); + dmu_tx_hold_remap_l1indirect(tx, dn->dn_object); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err == 0) { + (void) dbuf_dirty(dbuf, tx); + dmu_tx_commit(tx); + } else { + dmu_tx_abort(tx); + } + } + + dbuf_rele(dbuf, FTAG); + + delay(zfs_object_remap_one_indirect_delay_ticks); + + return (err); +} + +/* + * Remap all blockpointers in the object, if possible, so that they reference + * only concrete vdevs. + * + * To do this, iterate over the L0 blockpointers and remap any that reference + * an indirect vdev. Note that we only examine L0 blockpointers; since we + * cannot guarantee that we can remap all blockpointers anyway (due to split + * blocks), we do not want to make the code unnecessarily complicated to + * catch the unlikely case that there is an L1 block on an indirect vdev that + * contains no indirect blockpointers. + */ +int +dmu_object_remap_indirects(objset_t *os, uint64_t object, + uint64_t last_removal_txg) +{ + uint64_t offset, l1span; + int err; + dnode_t *dn; + + err = dnode_hold(os, object, FTAG, &dn); + if (err != 0) { + return (err); + } + + if (dn->dn_nlevels <= 1) { + if (issig(JUSTLOOKING) && issig(FORREAL)) { + err = SET_ERROR(EINTR); + } + + /* + * If the dnode has no indirect blocks, we cannot dirty them. + * We still want to remap the blkptr(s) in the dnode if + * appropriate, so mark it as dirty. + */ + if (err == 0 && dnode_needs_remap(dn)) { + dmu_tx_t *tx = dmu_tx_create(os); + dmu_tx_hold_bonus(tx, dn->dn_object); + if ((err = dmu_tx_assign(tx, TXG_WAIT)) == 0) { + dnode_setdirty(dn, tx); + dmu_tx_commit(tx); + } else { + dmu_tx_abort(tx); + } + } + + dnode_rele(dn, FTAG); + return (err); + } + + offset = 0; + l1span = 1ULL << (dn->dn_indblkshift - SPA_BLKPTRSHIFT + + dn->dn_datablkshift); + /* + * Find the next L1 indirect that is not a hole. 
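+ * Each iteration below then advances offset by l1span, the number of
+ * bytes mapped by one L1 block, so every non-hole L1 is visited once.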
+ */ + while (dnode_next_offset(dn, 0, &offset, 2, 1, 0) == 0) { + if (issig(JUSTLOOKING) && issig(FORREAL)) { + err = SET_ERROR(EINTR); + break; + } + if ((err = dmu_object_remap_one_indirect(os, dn, + last_removal_txg, offset)) != 0) { + break; + } + offset += l1span; + } + + dnode_rele(dn, FTAG); + return (err); +} + +void +dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, + dmu_tx_t *tx) +{ + dmu_buf_t **dbp; + int numbufs, i; + + if (size == 0) + return; + + VERIFY(0 == dmu_buf_hold_array(os, object, offset, size, + FALSE, FTAG, &numbufs, &dbp)); + + for (i = 0; i < numbufs; i++) { + dmu_buf_t *db = dbp[i]; + + dmu_buf_will_not_fill(db, tx); + } + dmu_buf_rele_array(dbp, numbufs, FTAG); +} + +void +dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset, + void *data, uint8_t etype, uint8_t comp, int uncompressed_size, + int compressed_size, int byteorder, dmu_tx_t *tx) +{ + dmu_buf_t *db; + + ASSERT3U(etype, <, NUM_BP_EMBEDDED_TYPES); + ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS); + VERIFY0(dmu_buf_hold_noread(os, object, offset, + FTAG, &db)); + + dmu_buf_write_embedded(db, + data, (bp_embedded_type_t)etype, (enum zio_compress)comp, + uncompressed_size, compressed_size, byteorder, tx); + + dmu_buf_rele(db, FTAG); +} + +/* + * DMU support for xuio + */ +kstat_t *xuio_ksp = NULL; + +int +dmu_xuio_init(xuio_t *xuio, int nblk) +{ + dmu_xuio_t *priv; + uio_t *uio = &xuio->xu_uio; + + uio->uio_iovcnt = nblk; + uio->uio_iov = kmem_zalloc(nblk * sizeof (iovec_t), KM_SLEEP); + + priv = kmem_zalloc(sizeof (dmu_xuio_t), KM_SLEEP); + priv->cnt = nblk; + priv->bufs = kmem_zalloc(nblk * sizeof (arc_buf_t *), KM_SLEEP); + priv->iovp = uio->uio_iov; + XUIO_XUZC_PRIV(xuio) = priv; + + if (XUIO_XUZC_RW(xuio) == UIO_READ) + XUIOSTAT_INCR(xuiostat_onloan_rbuf, nblk); + else + XUIOSTAT_INCR(xuiostat_onloan_wbuf, nblk); + + return (0); +} + +void +dmu_xuio_fini(xuio_t *xuio) +{ + dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); + int nblk = priv->cnt; + + kmem_free(priv->iovp, nblk * sizeof (iovec_t)); + kmem_free(priv->bufs, nblk * sizeof (arc_buf_t *)); + kmem_free(priv, sizeof (dmu_xuio_t)); + + if (XUIO_XUZC_RW(xuio) == UIO_READ) + XUIOSTAT_INCR(xuiostat_onloan_rbuf, -nblk); + else + XUIOSTAT_INCR(xuiostat_onloan_wbuf, -nblk); +} + +/* + * Initialize iov[priv->next] and priv->bufs[priv->next] with { off, n, abuf } + * and increase priv->next by 1. 
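+ *
+ * The abuf passed in is typically on loan from the ARC; see for
+ * example the dbuf_loan_arcbuf() call in dmu_read_uio_dnode() below.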
+ */ +int +dmu_xuio_add(xuio_t *xuio, arc_buf_t *abuf, offset_t off, size_t n) +{ + struct iovec *iov; + uio_t *uio = &xuio->xu_uio; + dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); + int i = priv->next++; + + ASSERT(i < priv->cnt); + ASSERT(off + n <= arc_buf_lsize(abuf)); + iov = uio->uio_iov + i; + iov->iov_base = (char *)abuf->b_data + off; + iov->iov_len = n; + priv->bufs[i] = abuf; + return (0); +} + +int +dmu_xuio_cnt(xuio_t *xuio) +{ + dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); + return (priv->cnt); +} + +arc_buf_t * +dmu_xuio_arcbuf(xuio_t *xuio, int i) +{ + dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); + + ASSERT(i < priv->cnt); + return (priv->bufs[i]); +} + +void +dmu_xuio_clear(xuio_t *xuio, int i) +{ + dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); + + ASSERT(i < priv->cnt); + priv->bufs[i] = NULL; +} + +static void +xuio_stat_init(void) +{ + xuio_ksp = kstat_create("zfs", 0, "xuio_stats", "misc", + KSTAT_TYPE_NAMED, sizeof (xuio_stats) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + if (xuio_ksp != NULL) { + xuio_ksp->ks_data = &xuio_stats; + kstat_install(xuio_ksp); + } +} + +static void +xuio_stat_fini(void) +{ + if (xuio_ksp != NULL) { + kstat_delete(xuio_ksp); + xuio_ksp = NULL; + } +} + +void +xuio_stat_wbuf_copied(void) +{ + XUIOSTAT_BUMP(xuiostat_wbuf_copied); +} + +void +xuio_stat_wbuf_nocopy(void) +{ + XUIOSTAT_BUMP(xuiostat_wbuf_nocopy); +} + +#ifdef _KERNEL +int +dmu_read_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size) +{ + dmu_buf_t **dbp; + int numbufs, i, err; + xuio_t *xuio = NULL; + + /* + * NB: we could do this block-at-a-time, but it's nice + * to be reading in parallel. + */ + err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size, + TRUE, FTAG, &numbufs, &dbp, 0); + if (err) + return (err); + +#ifdef UIO_XUIO + if (uio->uio_extflg == UIO_XUIO) + xuio = (xuio_t *)uio; +#endif + + for (i = 0; i < numbufs; i++) { + int tocpy; + int bufoff; + dmu_buf_t *db = dbp[i]; + + ASSERT(size > 0); + + bufoff = uio->uio_loffset - db->db_offset; + tocpy = (int)MIN(db->db_size - bufoff, size); + + if (xuio) { + dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; + arc_buf_t *dbuf_abuf = dbi->db_buf; + arc_buf_t *abuf = dbuf_loan_arcbuf(dbi); + err = dmu_xuio_add(xuio, abuf, bufoff, tocpy); + if (!err) { + uio->uio_resid -= tocpy; + uio->uio_loffset += tocpy; + } + + if (abuf == dbuf_abuf) + XUIOSTAT_BUMP(xuiostat_rbuf_nocopy); + else + XUIOSTAT_BUMP(xuiostat_rbuf_copied); + } else { +#ifdef illumos + err = uiomove((char *)db->db_data + bufoff, tocpy, + UIO_READ, uio); +#else + err = vn_io_fault_uiomove((char *)db->db_data + bufoff, + tocpy, uio); +#endif + } + if (err) + break; + + size -= tocpy; + } + dmu_buf_rele_array(dbp, numbufs, FTAG); + + return (err); +} + +/* + * Read 'size' bytes into the uio buffer. + * From object zdb->db_object. + * Starting at offset uio->uio_loffset. + * + * If the caller already has a dbuf in the target object + * (e.g. its bonus buffer), this routine is faster than dmu_read_uio(), + * because we don't have to find the dnode_t for the object. + */ +int +dmu_read_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb; + dnode_t *dn; + int err; + + if (size == 0) + return (0); + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + err = dmu_read_uio_dnode(dn, uio, size); + DB_DNODE_EXIT(db); + + return (err); +} + +/* + * Read 'size' bytes into the uio buffer. + * From the specified object + * Starting at offset uio->uio_loffset. 
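+ *
+ * Minimal caller sketch (editor's illustration; "off" and "len" are
+ * hypothetical, iovec setup and error handling elided):
+ *
+ * uio->uio_loffset = off;
+ * uio->uio_resid = len;
+ * err = dmu_read_uio(os, object, uio, len);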
+ */ +int +dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size) +{ + dnode_t *dn; + int err; + + if (size == 0) + return (0); + + err = dnode_hold(os, object, FTAG, &dn); + if (err) + return (err); + + err = dmu_read_uio_dnode(dn, uio, size); + + dnode_rele(dn, FTAG); + + return (err); +} + +int +dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx) +{ + dmu_buf_t **dbp; + int numbufs; + int err = 0; + int i; + + err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size, + FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH); + if (err) + return (err); + + for (i = 0; i < numbufs; i++) { + int tocpy; + int bufoff; + dmu_buf_t *db = dbp[i]; + + ASSERT(size > 0); + + bufoff = uio->uio_loffset - db->db_offset; + tocpy = (int)MIN(db->db_size - bufoff, size); + + ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); + + if (tocpy == db->db_size) + dmu_buf_will_fill(db, tx); + else + dmu_buf_will_dirty(db, tx); + +#ifdef illumos + /* + * XXX uiomove could block forever (eg. nfs-backed + * pages). There needs to be a uiolockdown() function + * to lock the pages in memory, so that uiomove won't + * block. + */ + err = uiomove((char *)db->db_data + bufoff, tocpy, + UIO_WRITE, uio); +#else + err = vn_io_fault_uiomove((char *)db->db_data + bufoff, tocpy, + uio); +#endif + + if (tocpy == db->db_size) + dmu_buf_fill_done(db, tx); + + if (err) + break; + + size -= tocpy; + } + + dmu_buf_rele_array(dbp, numbufs, FTAG); + return (err); +} + +/* + * Write 'size' bytes from the uio buffer. + * To object zdb->db_object. + * Starting at offset uio->uio_loffset. + * + * If the caller already has a dbuf in the target object + * (e.g. its bonus buffer), this routine is faster than dmu_write_uio(), + * because we don't have to find the dnode_t for the object. + */ +int +dmu_write_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size, + dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb; + dnode_t *dn; + int err; + + if (size == 0) + return (0); + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + err = dmu_write_uio_dnode(dn, uio, size, tx); + DB_DNODE_EXIT(db); + + return (err); +} + +/* + * Write 'size' bytes from the uio buffer. + * To the specified object. + * Starting at offset uio->uio_loffset. 
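+ *
+ * The caller must supply an assigned transaction covering the write
+ * range; a hedged sketch (error handling elided):
+ *
+ * dmu_tx_t *tx = dmu_tx_create(os);
+ * dmu_tx_hold_write(tx, object, uio->uio_loffset, size);
+ * if (dmu_tx_assign(tx, TXG_WAIT) == 0) {
+ * err = dmu_write_uio(os, object, uio, size, tx);
+ * dmu_tx_commit(tx);
+ * } else
+ * dmu_tx_abort(tx);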
+ */ +int +dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size, + dmu_tx_t *tx) +{ + dnode_t *dn; + int err; + + if (size == 0) + return (0); + + err = dnode_hold(os, object, FTAG, &dn); + if (err) + return (err); + + err = dmu_write_uio_dnode(dn, uio, size, tx); + + dnode_rele(dn, FTAG); + + return (err); +} + +#ifdef illumos +int +dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, + page_t *pp, dmu_tx_t *tx) +{ + dmu_buf_t **dbp; + int numbufs, i; + int err; + + if (size == 0) + return (0); + + err = dmu_buf_hold_array(os, object, offset, size, + FALSE, FTAG, &numbufs, &dbp); + if (err) + return (err); + + for (i = 0; i < numbufs; i++) { + int tocpy, copied, thiscpy; + int bufoff; + dmu_buf_t *db = dbp[i]; + caddr_t va; + + ASSERT(size > 0); + ASSERT3U(db->db_size, >=, PAGESIZE); + + bufoff = offset - db->db_offset; + tocpy = (int)MIN(db->db_size - bufoff, size); + + ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); + + if (tocpy == db->db_size) + dmu_buf_will_fill(db, tx); + else + dmu_buf_will_dirty(db, tx); + + for (copied = 0; copied < tocpy; copied += PAGESIZE) { + ASSERT3U(pp->p_offset, ==, db->db_offset + bufoff); + thiscpy = MIN(PAGESIZE, tocpy - copied); + va = zfs_map_page(pp, S_READ); + bcopy(va, (char *)db->db_data + bufoff, thiscpy); + zfs_unmap_page(pp, va); + pp = pp->p_next; + bufoff += PAGESIZE; + } + + if (tocpy == db->db_size) + dmu_buf_fill_done(db, tx); + + offset += tocpy; + size -= tocpy; + } + dmu_buf_rele_array(dbp, numbufs, FTAG); + return (err); +} + +#else /* !illumos */ + +int +dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, + vm_page_t *ma, dmu_tx_t *tx) +{ + dmu_buf_t **dbp; + struct sf_buf *sf; + int numbufs, i; + int err; + + if (size == 0) + return (0); + + err = dmu_buf_hold_array(os, object, offset, size, + FALSE, FTAG, &numbufs, &dbp); + if (err) + return (err); + + for (i = 0; i < numbufs; i++) { + int tocpy, copied, thiscpy; + int bufoff; + dmu_buf_t *db = dbp[i]; + caddr_t va; + + ASSERT(size > 0); + ASSERT3U(db->db_size, >=, PAGESIZE); + + bufoff = offset - db->db_offset; + tocpy = (int)MIN(db->db_size - bufoff, size); + + ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); + + if (tocpy == db->db_size) + dmu_buf_will_fill(db, tx); + else + dmu_buf_will_dirty(db, tx); + + for (copied = 0; copied < tocpy; copied += PAGESIZE) { + ASSERT3U(ptoa((*ma)->pindex), ==, db->db_offset + bufoff); + thiscpy = MIN(PAGESIZE, tocpy - copied); + va = zfs_map_page(*ma, &sf); + bcopy(va, (char *)db->db_data + bufoff, thiscpy); + zfs_unmap_page(sf); + ma += 1; + bufoff += PAGESIZE; + } + + if (tocpy == db->db_size) + dmu_buf_fill_done(db, tx); + + offset += tocpy; + size -= tocpy; + } + dmu_buf_rele_array(dbp, numbufs, FTAG); + return (err); +} + +int +dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count, + int *rbehind, int *rahead, int last_size) +{ + struct sf_buf *sf; + vm_object_t vmobj; + vm_page_t m; + dmu_buf_t **dbp; + dmu_buf_t *db; + caddr_t va; + int numbufs, i; + int bufoff, pgoff, tocpy; + int mi, di; + int err; + + ASSERT3U(ma[0]->pindex + count - 1, ==, ma[count - 1]->pindex); + ASSERT(last_size <= PAGE_SIZE); + + err = dmu_buf_hold_array(os, object, IDX_TO_OFF(ma[0]->pindex), + IDX_TO_OFF(count - 1) + last_size, TRUE, FTAG, &numbufs, &dbp); + if (err != 0) + return (err); + +#ifdef DEBUG + IMPLY(last_size < PAGE_SIZE, *rahead == 0); + if (dbp[0]->db_offset != 0 || numbufs > 1) { + for (i = 0; i < numbufs; i++) { + 
ASSERT(ISP2(dbp[i]->db_size));
+ ASSERT((dbp[i]->db_offset % dbp[i]->db_size) == 0);
+ ASSERT3U(dbp[i]->db_size, ==, dbp[0]->db_size);
+ }
+ }
+#endif
+
+ vmobj = ma[0]->object;
+ zfs_vmobject_wlock(vmobj);
+
+ db = dbp[0];
+ for (i = 0; i < *rbehind; i++) {
+ m = vm_page_grab(vmobj, ma[0]->pindex - 1 - i,
+ VM_ALLOC_NORMAL | VM_ALLOC_NOWAIT | VM_ALLOC_NOBUSY);
+ if (m == NULL)
+ break;
+ if (m->valid != 0) {
+ ASSERT3U(m->valid, ==, VM_PAGE_BITS_ALL);
+ break;
+ }
+ ASSERT(m->dirty == 0);
+ ASSERT(!pmap_page_is_mapped(m));
+
+ ASSERT(db->db_size > PAGE_SIZE);
+ bufoff = IDX_TO_OFF(m->pindex) % db->db_size;
+ va = zfs_map_page(m, &sf);
+ bcopy((char *)db->db_data + bufoff, va, PAGESIZE);
+ zfs_unmap_page(sf);
+ m->valid = VM_PAGE_BITS_ALL;
+ vm_page_lock(m);
+ if ((m->busy_lock & VPB_BIT_WAITERS) != 0)
+ vm_page_activate(m);
+ else
+ vm_page_deactivate(m);
+ vm_page_unlock(m);
+ }
+ *rbehind = i;
+
+ bufoff = IDX_TO_OFF(ma[0]->pindex) % db->db_size;
+ pgoff = 0;
+ for (mi = 0, di = 0; mi < count && di < numbufs; ) {
+ if (pgoff == 0) {
+ m = ma[mi];
+ if (m != bogus_page) {
+ vm_page_assert_xbusied(m);
+ ASSERT(m->valid == 0);
+ ASSERT(m->dirty == 0);
+ ASSERT(!pmap_page_is_mapped(m));
+ va = zfs_map_page(m, &sf);
+ }
+ }
+ if (bufoff == 0)
+ db = dbp[di];
+
+ if (m != bogus_page) {
+ ASSERT3U(IDX_TO_OFF(m->pindex) + pgoff, ==,
+ db->db_offset + bufoff);
+ }
+
+ /*
+ * We do not need to clamp the copy size by the file
+ * size as the last block is zero-filled beyond the
+ * end of file anyway.
+ */
+ tocpy = MIN(db->db_size - bufoff, PAGESIZE - pgoff);
+ if (m != bogus_page)
+ bcopy((char *)db->db_data + bufoff, va + pgoff, tocpy);
+
+ pgoff += tocpy;
+ ASSERT(pgoff <= PAGESIZE);
+ if (pgoff == PAGESIZE) {
+ if (m != bogus_page) {
+ zfs_unmap_page(sf);
+ m->valid = VM_PAGE_BITS_ALL;
+ }
+ ASSERT(mi < count);
+ mi++;
+ pgoff = 0;
+ }
+
+ bufoff += tocpy;
+ ASSERT(bufoff <= db->db_size);
+ if (bufoff == db->db_size) {
+ ASSERT(di < numbufs);
+ di++;
+ bufoff = 0;
+ }
+ }
+
+#ifdef DEBUG
+ /*
+ * Three possibilities:
+ * - last requested page ends at a buffer boundary and, thus,
+ * all pages and buffers have been iterated;
+ * - all requested pages are filled, but the last buffer
+ * has not been exhausted;
+ * the read-ahead is possible only in this case;
+ * - all buffers have been read, but the last page has not been
+ * fully filled;
+ * this is only possible if the file has only a single buffer
+ * with a size that is not a multiple of the page size.
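+ *
+ * Example of the third case (editor's illustration): a 6K file held
+ * in a single 6K buffer spans two 4K pages; the loop exhausts the
+ * buffer with the second page only half filled, so pgoff != 0 on
+ * exit and the rest of that page is zero-filled below.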
+ */ + if (mi == count) { + ASSERT(di >= numbufs - 1); + IMPLY(*rahead != 0, di == numbufs - 1); + IMPLY(*rahead != 0, bufoff != 0); + ASSERT(pgoff == 0); + } + if (di == numbufs) { + ASSERT(mi >= count - 1); + ASSERT(*rahead == 0); + IMPLY(pgoff == 0, mi == count); + if (pgoff != 0) { + ASSERT(mi == count - 1); + ASSERT((dbp[0]->db_size & PAGE_MASK) != 0); + } + } +#endif + if (pgoff != 0) { + ASSERT(m != bogus_page); + bzero(va + pgoff, PAGESIZE - pgoff); + zfs_unmap_page(sf); + m->valid = VM_PAGE_BITS_ALL; + } + + for (i = 0; i < *rahead; i++) { + m = vm_page_grab(vmobj, ma[count - 1]->pindex + 1 + i, + VM_ALLOC_NORMAL | VM_ALLOC_NOWAIT | VM_ALLOC_NOBUSY); + if (m == NULL) + break; + if (m->valid != 0) { + ASSERT3U(m->valid, ==, VM_PAGE_BITS_ALL); + break; + } + ASSERT(m->dirty == 0); + ASSERT(!pmap_page_is_mapped(m)); + + ASSERT(db->db_size > PAGE_SIZE); + bufoff = IDX_TO_OFF(m->pindex) % db->db_size; + tocpy = MIN(db->db_size - bufoff, PAGESIZE); + va = zfs_map_page(m, &sf); + bcopy((char *)db->db_data + bufoff, va, tocpy); + if (tocpy < PAGESIZE) { + ASSERT(i == *rahead - 1); + ASSERT((db->db_size & PAGE_MASK) != 0); + bzero(va + tocpy, PAGESIZE - tocpy); + } + zfs_unmap_page(sf); + m->valid = VM_PAGE_BITS_ALL; + vm_page_lock(m); + if ((m->busy_lock & VPB_BIT_WAITERS) != 0) + vm_page_activate(m); + else + vm_page_deactivate(m); + vm_page_unlock(m); + } + *rahead = i; + zfs_vmobject_wunlock(vmobj); + + dmu_buf_rele_array(dbp, numbufs, FTAG); + return (0); +} +#endif /* illumos */ +#endif /* _KERNEL */ + +/* + * Allocate a loaned anonymous arc buffer. + */ +arc_buf_t * +dmu_request_arcbuf(dmu_buf_t *handle, int size) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle; + + return (arc_loan_buf(db->db_objset->os_spa, B_FALSE, size)); +} + +/* + * Free a loaned arc buffer. + */ +void +dmu_return_arcbuf(arc_buf_t *buf) +{ + arc_return_buf(buf, FTAG); + arc_buf_destroy(buf, FTAG); +} + +/* + * When possible directly assign passed loaned arc buffer to a dbuf. + * If this is not possible copy the contents of passed arc buf via + * dmu_write(). + */ +void +dmu_assign_arcbuf_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf, + dmu_tx_t *tx) +{ + dmu_buf_impl_t *db; + uint32_t blksz = (uint32_t)arc_buf_lsize(buf); + uint64_t blkid; + + rw_enter(&dn->dn_struct_rwlock, RW_READER); + blkid = dbuf_whichblock(dn, 0, offset); + VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL); + rw_exit(&dn->dn_struct_rwlock); + + /* + * We can only assign if the offset is aligned, the arc buf is the + * same size as the dbuf, and the dbuf is not metadata. 
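+ *
+ * A hedged sketch of the zero-copy write this enables (names from
+ * this file; buffer fill and error handling elided):
+ *
+ * arc_buf_t *abuf = dmu_request_arcbuf(handle, blksz);
+ * bcopy(src, abuf->b_data, blksz);
+ * dmu_assign_arcbuf(handle, offset, abuf, tx);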
+ */ + if (offset == db->db.db_offset && blksz == db->db.db_size) { +#ifdef _KERNEL + curthread->td_ru.ru_oublock++; +#ifdef RACCT + if (racct_enable) { + PROC_LOCK(curproc); + racct_add_force(curproc, RACCT_WRITEBPS, blksz); + racct_add_force(curproc, RACCT_WRITEIOPS, 1); + PROC_UNLOCK(curproc); + } +#endif /* RACCT */ +#endif /* _KERNEL */ + dbuf_assign_arcbuf(db, buf, tx); + dbuf_rele(db, FTAG); + } else { + objset_t *os; + uint64_t object; + + /* compressed bufs must always be assignable to their dbuf */ + ASSERT3U(arc_get_compression(buf), ==, ZIO_COMPRESS_OFF); + ASSERT(!(buf->b_flags & ARC_BUF_FLAG_COMPRESSED)); + + os = dn->dn_objset; + object = dn->dn_object; + + dbuf_rele(db, FTAG); + dmu_write(os, object, offset, blksz, buf->b_data, tx); + dmu_return_arcbuf(buf); + XUIOSTAT_BUMP(xuiostat_wbuf_copied); + } +} + +void +dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, + dmu_tx_t *tx) +{ + dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle; + + DB_DNODE_ENTER(dbuf); + dmu_assign_arcbuf_dnode(DB_DNODE(dbuf), offset, buf, tx); + DB_DNODE_EXIT(dbuf); +} + +typedef struct { + dbuf_dirty_record_t *dsa_dr; + dmu_sync_cb_t *dsa_done; + zgd_t *dsa_zgd; + dmu_tx_t *dsa_tx; +} dmu_sync_arg_t; + +/* ARGSUSED */ +static void +dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg) +{ + dmu_sync_arg_t *dsa = varg; + dmu_buf_t *db = dsa->dsa_zgd->zgd_db; + blkptr_t *bp = zio->io_bp; + + if (zio->io_error == 0) { + if (BP_IS_HOLE(bp)) { + /* + * A block of zeros may compress to a hole, but the + * block size still needs to be known for replay. + */ + BP_SET_LSIZE(bp, db->db_size); + } else if (!BP_IS_EMBEDDED(bp)) { + ASSERT(BP_GET_LEVEL(bp) == 0); + bp->blk_fill = 1; + } + } +} + +static void +dmu_sync_late_arrival_ready(zio_t *zio) +{ + dmu_sync_ready(zio, NULL, zio->io_private); +} + +/* ARGSUSED */ +static void +dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) +{ + dmu_sync_arg_t *dsa = varg; + dbuf_dirty_record_t *dr = dsa->dsa_dr; + dmu_buf_impl_t *db = dr->dr_dbuf; + + mutex_enter(&db->db_mtx); + ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC); + if (zio->io_error == 0) { + dr->dt.dl.dr_nopwrite = !!(zio->io_flags & ZIO_FLAG_NOPWRITE); + if (dr->dt.dl.dr_nopwrite) { + blkptr_t *bp = zio->io_bp; + blkptr_t *bp_orig = &zio->io_bp_orig; + uint8_t chksum = BP_GET_CHECKSUM(bp_orig); + + ASSERT(BP_EQUAL(bp, bp_orig)); + VERIFY(BP_EQUAL(bp, db->db_blkptr)); + ASSERT(zio->io_prop.zp_compress != ZIO_COMPRESS_OFF); + ASSERT(zio_checksum_table[chksum].ci_flags & + ZCHECKSUM_FLAG_NOPWRITE); + } + dr->dt.dl.dr_overridden_by = *zio->io_bp; + dr->dt.dl.dr_override_state = DR_OVERRIDDEN; + dr->dt.dl.dr_copies = zio->io_prop.zp_copies; + + /* + * Old style holes are filled with all zeros, whereas + * new-style holes maintain their lsize, type, level, + * and birth time (see zio_write_compress). While we + * need to reset the BP_SET_LSIZE() call that happened + * in dmu_sync_ready for old style holes, we do *not* + * want to wipe out the information contained in new + * style holes. Thus, only zero out the block pointer if + * it's an old style hole. 
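+ *
+ * (An old style hole is an all-zero BP, hence the blk_birth == 0
+ * test below; new style holes carry a nonzero logical birth txg
+ * once the hole_birth feature is active. Editor's note.)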
+ */
+ if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by) &&
+ dr->dt.dl.dr_overridden_by.blk_birth == 0)
+ BP_ZERO(&dr->dt.dl.dr_overridden_by);
+ } else {
+ dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
+ }
+ cv_broadcast(&db->db_changed);
+ mutex_exit(&db->db_mtx);
+
+ dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
+
+ kmem_free(dsa, sizeof (*dsa));
+}
+
+static void
+dmu_sync_late_arrival_done(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+ dmu_sync_arg_t *dsa = zio->io_private;
+ blkptr_t *bp_orig = &zio->io_bp_orig;
+
+ if (zio->io_error == 0 && !BP_IS_HOLE(bp)) {
+ ASSERT(!(zio->io_flags & ZIO_FLAG_NOPWRITE));
+ ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig));
+ ASSERT(zio->io_bp->blk_birth == zio->io_txg);
+ ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa));
+ zio_free(zio->io_spa, zio->io_txg, zio->io_bp);
+ }
+
+ dmu_tx_commit(dsa->dsa_tx);
+
+ dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
+
+ abd_put(zio->io_abd);
+ kmem_free(dsa, sizeof (*dsa));
+}
+
+static int
+dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
+ zio_prop_t *zp, zbookmark_phys_t *zb)
+{
+ dmu_sync_arg_t *dsa;
+ dmu_tx_t *tx;
+
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_space(tx, zgd->zgd_db->db_size);
+ if (dmu_tx_assign(tx, TXG_WAIT) != 0) {
+ dmu_tx_abort(tx);
+ /* Make zl_get_data do txg_wait_synced() */
+ return (SET_ERROR(EIO));
+ }
+
+ /*
+ * In order to prevent the zgd's lwb from being freed prior to
+ * dmu_sync_late_arrival_done() being called, we have to ensure
+ * the lwb's "max txg" takes this tx's txg into account.
+ */
+ zil_lwb_add_txg(zgd->zgd_lwb, dmu_tx_get_txg(tx));
+
+ dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
+ dsa->dsa_dr = NULL;
+ dsa->dsa_done = done;
+ dsa->dsa_zgd = zgd;
+ dsa->dsa_tx = tx;
+
+ /*
+ * Since we are currently syncing this txg, it's nontrivial to
+ * determine what BP to nopwrite against, so we disable nopwrite.
+ *
+ * When syncing, the db_blkptr is initially the BP of the previous
+ * txg. We can not nopwrite against it because it will be changed
+ * (this is similar to the non-late-arrival case where the dbuf is
+ * dirty in a future txg).
+ *
+ * Then dbuf_write_ready() sets db_blkptr to the location we will write.
+ * We can not nopwrite against it because although the BP will not
+ * (typically) be changed, the data has not yet been persisted to this
+ * location.
+ *
+ * Finally, when dbuf_write_done() is called, it is theoretically
+ * possible to always nopwrite, because the data that was written in
+ * this txg is the same data that we are trying to write. However we
+ * would need to check that this dbuf is not dirty in any future
+ * txgs (as we do in the normal dmu_sync() path). For simplicity, we
+ * don't nopwrite in this case.
+ */
+ zp->zp_nopwrite = B_FALSE;
+
+ zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp,
+ abd_get_from_buf(zgd->zgd_db->db_data, zgd->zgd_db->db_size),
+ zgd->zgd_db->db_size, zgd->zgd_db->db_size, zp,
+ dmu_sync_late_arrival_ready, NULL, NULL, dmu_sync_late_arrival_done,
+ dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb));
+
+ return (0);
+}
+
+/*
+ * Intent log support: sync the block associated with db to disk.
+ * N.B. and XXX: the caller is responsible for making sure that the
+ * data isn't changing while dmu_sync() is writing it.
+ *
+ * Return values:
+ *
+ * EEXIST: this txg has already been synced, so there's nothing to do.
+ * The caller should not log the write.
+ *
+ * ENOENT: the block was dbuf_free_range()'d, so there's nothing to do.
+ * The caller should not log the write. + * + * EALREADY: this block is already in the process of being synced. + * The caller should track its progress (somehow). + * + * EIO: could not do the I/O. + * The caller should do a txg_wait_synced(). + * + * 0: the I/O has been initiated. + * The caller should log this blkptr in the done callback. + * It is possible that the I/O will fail, in which case + * the error will be reported to the done callback and + * propagated to pio from zio_done(). + */ +int +dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)zgd->zgd_db; + objset_t *os = db->db_objset; + dsl_dataset_t *ds = os->os_dsl_dataset; + dbuf_dirty_record_t *dr; + dmu_sync_arg_t *dsa; + zbookmark_phys_t zb; + zio_prop_t zp; + dnode_t *dn; + + ASSERT(pio != NULL); + ASSERT(txg != 0); + + SET_BOOKMARK(&zb, ds->ds_object, + db->db.db_object, db->db_level, db->db_blkid); + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, &zp); + DB_DNODE_EXIT(db); + + /* + * If we're frozen (running ziltest), we always need to generate a bp. + */ + if (txg > spa_freeze_txg(os->os_spa)) + return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb)); + + /* + * Grabbing db_mtx now provides a barrier between dbuf_sync_leaf() + * and us. If we determine that this txg is not yet syncing, + * but it begins to sync a moment later, that's OK because the + * sync thread will block in dbuf_sync_leaf() until we drop db_mtx. + */ + mutex_enter(&db->db_mtx); + + if (txg <= spa_last_synced_txg(os->os_spa)) { + /* + * This txg has already synced. There's nothing to do. + */ + mutex_exit(&db->db_mtx); + return (SET_ERROR(EEXIST)); + } + + if (txg <= spa_syncing_txg(os->os_spa)) { + /* + * This txg is currently syncing, so we can't mess with + * the dirty record anymore; just write a new log block. + */ + mutex_exit(&db->db_mtx); + return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb)); + } + + dr = db->db_last_dirty; + while (dr && dr->dr_txg != txg) + dr = dr->dr_next; + + if (dr == NULL) { + /* + * There's no dr for this dbuf, so it must have been freed. + * There's no need to log writes to freed blocks, so we're done. + */ + mutex_exit(&db->db_mtx); + return (SET_ERROR(ENOENT)); + } + + ASSERT(dr->dr_next == NULL || dr->dr_next->dr_txg < txg); + + if (db->db_blkptr != NULL) { + /* + * We need to fill in zgd_bp with the current blkptr so that + * the nopwrite code can check if we're writing the same + * data that's already on disk. We can only nopwrite if we + * are sure that after making the copy, db_blkptr will not + * change until our i/o completes. We ensure this by + * holding the db_mtx, and only allowing nopwrite if the + * block is not already dirty (see below). This is verified + * by dmu_sync_done(), which VERIFYs that the db_blkptr has + * not changed. + */ + *zgd->zgd_bp = *db->db_blkptr; + } + + /* + * Assume the on-disk data is X, the current syncing data (in + * txg - 1) is Y, and the current in-memory data is Z (currently + * in dmu_sync). + * + * We usually want to perform a nopwrite if X and Z are the + * same. However, if Y is different (i.e. the BP is going to + * change before this write takes effect), then a nopwrite will + * be incorrect - we would override with X, which could have + * been freed when Y was written. 
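+ *
+ * A concrete timeline (editor's illustration): X is on disk, txg N
+ * is syncing Y over it, and dmu_sync() runs in txg N+1 with Z == X.
+ * A nopwrite would log X's old BP, but once txg N completes that BP
+ * may reference freed (and possibly reallocated) space, so replay
+ * could read stale data.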
+ * + * (Note that this is not a concern when we are nop-writing from + * syncing context, because X and Y must be identical, because + * all previous txgs have been synced.) + * + * Therefore, we disable nopwrite if the current BP could change + * before this TXG. There are two ways it could change: by + * being dirty (dr_next is non-NULL), or by being freed + * (dnode_block_freed()). This behavior is verified by + * zio_done(), which VERIFYs that the override BP is identical + * to the on-disk BP. + */ + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + if (dr->dr_next != NULL || dnode_block_freed(dn, db->db_blkid)) + zp.zp_nopwrite = B_FALSE; + DB_DNODE_EXIT(db); + + ASSERT(dr->dr_txg == txg); + if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC || + dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { + /* + * We have already issued a sync write for this buffer, + * or this buffer has already been synced. It could not + * have been dirtied since, or we would have cleared the state. + */ + mutex_exit(&db->db_mtx); + return (SET_ERROR(EALREADY)); + } + + ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); + dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC; + mutex_exit(&db->db_mtx); + + dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP); + dsa->dsa_dr = dr; + dsa->dsa_done = done; + dsa->dsa_zgd = zgd; + dsa->dsa_tx = NULL; + + zio_nowait(arc_write(pio, os->os_spa, txg, + zgd->zgd_bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db), + &zp, dmu_sync_ready, NULL, NULL, dmu_sync_done, dsa, + ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb)); + + return (0); +} + +int +dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs, + dmu_tx_t *tx) +{ + dnode_t *dn; + int err; + + err = dnode_hold(os, object, FTAG, &dn); + if (err) + return (err); + err = dnode_set_blksz(dn, size, ibs, tx); + dnode_rele(dn, FTAG); + return (err); +} + +void +dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum, + dmu_tx_t *tx) +{ + dnode_t *dn; + + /* + * Send streams include each object's checksum function. This + * check ensures that the receiving system can understand the + * checksum function transmitted. + */ + ASSERT3U(checksum, <, ZIO_CHECKSUM_LEGACY_FUNCTIONS); + + VERIFY0(dnode_hold(os, object, FTAG, &dn)); + ASSERT3U(checksum, <, ZIO_CHECKSUM_FUNCTIONS); + dn->dn_checksum = checksum; + dnode_setdirty(dn, tx); + dnode_rele(dn, FTAG); +} + +void +dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, + dmu_tx_t *tx) +{ + dnode_t *dn; + + /* + * Send streams include each object's compression function. This + * check ensures that the receiving system can understand the + * compression function transmitted. + */ + ASSERT3U(compress, <, ZIO_COMPRESS_LEGACY_FUNCTIONS); + + VERIFY0(dnode_hold(os, object, FTAG, &dn)); + dn->dn_compress = compress; + dnode_setdirty(dn, tx); + dnode_rele(dn, FTAG); +} + +int zfs_mdcomp_disable = 0; +SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RWTUN, + &zfs_mdcomp_disable, 0, "Disable metadata compression"); + +/* + * When the "redundant_metadata" property is set to "most", only indirect + * blocks of this level and higher will have an additional ditto block. + */ +int zfs_redundant_metadata_most_ditto_level = 2; + +void +dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) +{ + dmu_object_type_t type = dn ? 
dn->dn_type : DMU_OT_OBJSET;
+ boolean_t ismd = (level > 0 || DMU_OT_IS_METADATA(type) ||
+ (wp & WP_SPILL));
+ enum zio_checksum checksum = os->os_checksum;
+ enum zio_compress compress = os->os_compress;
+ enum zio_checksum dedup_checksum = os->os_dedup_checksum;
+ boolean_t dedup = B_FALSE;
+ boolean_t nopwrite = B_FALSE;
+ boolean_t dedup_verify = os->os_dedup_verify;
+ int copies = os->os_copies;
+
+ /*
+ * We maintain different write policies for each of the following
+ * types of data:
+ * 1. metadata
+ * 2. preallocated blocks (i.e. level-0 blocks of a dump device)
+ * 3. all other level 0 blocks
+ */
+ if (ismd) {
+ if (zfs_mdcomp_disable) {
+ compress = ZIO_COMPRESS_EMPTY;
+ } else {
+ /*
+ * XXX -- we should design a compression algorithm
+ * that specializes in arrays of bps.
+ */
+ compress = zio_compress_select(os->os_spa,
+ ZIO_COMPRESS_ON, ZIO_COMPRESS_ON);
+ }
+
+ /*
+ * Metadata always gets checksummed. If the data
+ * checksum is multi-bit correctable, and it's not a
+ * ZBT-style checksum, then it's suitable for metadata
+ * as well. Otherwise, the metadata checksum defaults
+ * to fletcher4.
+ */
+ if (!(zio_checksum_table[checksum].ci_flags &
+ ZCHECKSUM_FLAG_METADATA) ||
+ (zio_checksum_table[checksum].ci_flags &
+ ZCHECKSUM_FLAG_EMBEDDED))
+ checksum = ZIO_CHECKSUM_FLETCHER_4;
+
+ if (os->os_redundant_metadata == ZFS_REDUNDANT_METADATA_ALL ||
+ (os->os_redundant_metadata ==
+ ZFS_REDUNDANT_METADATA_MOST &&
+ (level >= zfs_redundant_metadata_most_ditto_level ||
+ DMU_OT_IS_METADATA(type) || (wp & WP_SPILL))))
+ copies++;
+ } else if (wp & WP_NOFILL) {
+ ASSERT(level == 0);
+
+ /*
+ * If we're writing preallocated blocks, we aren't actually
+ * writing them so don't set any policy properties. These
+ * blocks are currently only used by an external subsystem
+ * outside of zfs (i.e. dump) and not written by the zio
+ * pipeline.
+ */
+ compress = ZIO_COMPRESS_OFF;
+ checksum = ZIO_CHECKSUM_NOPARITY;
+ } else {
+ compress = zio_compress_select(os->os_spa, dn->dn_compress,
+ compress);
+
+ checksum = (dedup_checksum == ZIO_CHECKSUM_OFF) ?
+ zio_checksum_select(dn->dn_checksum, checksum) :
+ dedup_checksum;
+
+ /*
+ * Determine dedup setting. If we are in dmu_sync(),
+ * we won't actually dedup now because that's all
+ * done in syncing context; but we do want to use the
+ * dedup checksum. If the checksum is not strong
+ * enough to ensure unique signatures, force
+ * dedup_verify.
+ */
+ if (dedup_checksum != ZIO_CHECKSUM_OFF) {
+ dedup = (wp & WP_DMU_SYNC) ? B_FALSE : B_TRUE;
+ if (!(zio_checksum_table[checksum].ci_flags &
+ ZCHECKSUM_FLAG_DEDUP))
+ dedup_verify = B_TRUE;
+ }
+
+ /*
+ * Enable nopwrite if we have a secure enough checksum
+ * algorithm (see comment in zio_nop_write) and
+ * compression is enabled. We don't enable nopwrite if
+ * dedup is enabled as the two features are mutually
+ * exclusive.
+ */
+ nopwrite = (!dedup && (zio_checksum_table[checksum].ci_flags &
+ ZCHECKSUM_FLAG_NOPWRITE) &&
+ compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled);
+ }
+
+ zp->zp_checksum = checksum;
+ zp->zp_compress = compress;
+ ASSERT3U(zp->zp_compress, !=, ZIO_COMPRESS_INHERIT);
+
+ zp->zp_type = (wp & WP_SPILL) ?
dn->dn_bonustype : type; + zp->zp_level = level; + zp->zp_copies = MIN(copies, spa_max_replication(os->os_spa)); + zp->zp_dedup = dedup; + zp->zp_dedup_verify = dedup && dedup_verify; + zp->zp_nopwrite = nopwrite; +} + +int +dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) +{ + dnode_t *dn; + int err; + + /* + * Sync any current changes before + * we go trundling through the block pointers. + */ + err = dmu_object_wait_synced(os, object); + if (err) { + return (err); + } + + err = dnode_hold(os, object, FTAG, &dn); + if (err) { + return (err); + } + + err = dnode_next_offset(dn, (hole ? DNODE_FIND_HOLE : 0), off, 1, 1, 0); + dnode_rele(dn, FTAG); + + return (err); +} + +/* + * Given the ZFS object, if it contains any dirty nodes + * this function flushes all dirty blocks to disk. This + * ensures the DMU object info is updated. A more efficient + * future version might just find the TXG with the maximum + * ID and wait for that to be synced. + */ +int +dmu_object_wait_synced(objset_t *os, uint64_t object) +{ + dnode_t *dn; + int error, i; + + error = dnode_hold(os, object, FTAG, &dn); + if (error) { + return (error); + } + + for (i = 0; i < TXG_SIZE; i++) { + if (list_link_active(&dn->dn_dirty_link[i])) { + break; + } + } + dnode_rele(dn, FTAG); + if (i != TXG_SIZE) { + txg_wait_synced(dmu_objset_pool(os), 0); + } + + return (0); +} + +void +__dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) +{ + dnode_phys_t *dnp = dn->dn_phys; + + doi->doi_data_block_size = dn->dn_datablksz; + doi->doi_metadata_block_size = dn->dn_indblkshift ? + 1ULL << dn->dn_indblkshift : 0; + doi->doi_type = dn->dn_type; + doi->doi_bonus_type = dn->dn_bonustype; + doi->doi_bonus_size = dn->dn_bonuslen; + doi->doi_dnodesize = dn->dn_num_slots << DNODE_SHIFT; + doi->doi_indirection = dn->dn_nlevels; + doi->doi_checksum = dn->dn_checksum; + doi->doi_compress = dn->dn_compress; + doi->doi_nblkptr = dn->dn_nblkptr; + doi->doi_physical_blocks_512 = (DN_USED_BYTES(dnp) + 256) >> 9; + doi->doi_max_offset = (dn->dn_maxblkid + 1) * dn->dn_datablksz; + doi->doi_fill_count = 0; + for (int i = 0; i < dnp->dn_nblkptr; i++) + doi->doi_fill_count += BP_GET_FILL(&dnp->dn_blkptr[i]); +} + +void +dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) +{ + rw_enter(&dn->dn_struct_rwlock, RW_READER); + mutex_enter(&dn->dn_mtx); + + __dmu_object_info_from_dnode(dn, doi); + + mutex_exit(&dn->dn_mtx); + rw_exit(&dn->dn_struct_rwlock); +} + +/* + * Get information on a DMU object. + * If doi is NULL, just indicates whether the object exists. + */ +int +dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi) +{ + dnode_t *dn; + int err = dnode_hold(os, object, FTAG, &dn); + + if (err) + return (err); + + if (doi != NULL) + dmu_object_info_from_dnode(dn, doi); + + dnode_rele(dn, FTAG); + return (0); +} + +/* + * As above, but faster; can be used when you have a held dbuf in hand. + */ +void +dmu_object_info_from_db(dmu_buf_t *db_fake, dmu_object_info_t *doi) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + + DB_DNODE_ENTER(db); + dmu_object_info_from_dnode(DB_DNODE(db), doi); + DB_DNODE_EXIT(db); +} + +/* + * Faster still when you only care about the size. + * This is specifically optimized for zfs_getattr(). 
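+ *
+ * Worked example of the arithmetic below (editor's illustration):
+ * with DN_USED_BYTES = 3584 and SPA_MINBLOCKSIZE = 512, *nblk512 is
+ * (3584 + 256) >> 9 = 7, i.e. space usage rounded to the nearest
+ * count of 512-byte units, plus dn_num_slots for the dnode itself.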
+ */ +void +dmu_object_size_from_db(dmu_buf_t *db_fake, uint32_t *blksize, + u_longlong_t *nblk512) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + dnode_t *dn; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + + *blksize = dn->dn_datablksz; + /* add in number of slots used for the dnode itself */ + *nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >> + SPA_MINBLOCKSHIFT) + dn->dn_num_slots; + DB_DNODE_EXIT(db); +} + +void +dmu_object_dnsize_from_db(dmu_buf_t *db_fake, int *dnsize) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + dnode_t *dn; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + *dnsize = dn->dn_num_slots << DNODE_SHIFT; + DB_DNODE_EXIT(db); +} + +void +byteswap_uint64_array(void *vbuf, size_t size) +{ + uint64_t *buf = vbuf; + size_t count = size >> 3; + int i; + + ASSERT((size & 7) == 0); + + for (i = 0; i < count; i++) + buf[i] = BSWAP_64(buf[i]); +} + +void +byteswap_uint32_array(void *vbuf, size_t size) +{ + uint32_t *buf = vbuf; + size_t count = size >> 2; + int i; + + ASSERT((size & 3) == 0); + + for (i = 0; i < count; i++) + buf[i] = BSWAP_32(buf[i]); +} + +void +byteswap_uint16_array(void *vbuf, size_t size) +{ + uint16_t *buf = vbuf; + size_t count = size >> 1; + int i; + + ASSERT((size & 1) == 0); + + for (i = 0; i < count; i++) + buf[i] = BSWAP_16(buf[i]); +} + +/* ARGSUSED */ +void +byteswap_uint8_array(void *vbuf, size_t size) +{ +} + +void +dmu_init(void) +{ + abd_init(); + zfs_dbgmsg_init(); + sa_cache_init(); + xuio_stat_init(); + dmu_objset_init(); + dnode_init(); + zfetch_init(); + zio_compress_init(); + l2arc_init(); + arc_init(); + dbuf_init(); +} + +void +dmu_fini(void) +{ + arc_fini(); /* arc depends on l2arc, so arc must go first */ + l2arc_fini(); + zfetch_fini(); + zio_compress_fini(); + dbuf_fini(); + dnode_fini(); + dmu_objset_fini(); + xuio_stat_fini(); + sa_cache_fini(); + zfs_dbgmsg_fini(); + abd_fini(); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c new file mode 100644 index 000000000000..e7bfdaa90e97 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c @@ -0,0 +1,251 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. 
+ */ + +#include <sys/dmu.h> +#include <sys/dmu_impl.h> +#include <sys/dmu_tx.h> +#include <sys/dbuf.h> +#include <sys/dnode.h> +#include <sys/zfs_context.h> +#include <sys/dmu_objset.h> +#include <sys/dmu_traverse.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_pool.h> +#include <sys/dsl_synctask.h> +#include <sys/zfs_ioctl.h> +#include <sys/zap.h> +#include <sys/zio_checksum.h> +#include <sys/zfs_znode.h> + +struct diffarg { + struct file *da_fp; /* file to which we are reporting */ + offset_t *da_offp; + int da_err; /* error that stopped diff search */ + dmu_diff_record_t da_ddr; + kthread_t *da_td; +}; + +static int +write_bytes(struct diffarg *da) +{ + struct uio auio; + struct iovec aiov; + + aiov.iov_base = (caddr_t)&da->da_ddr; + aiov.iov_len = sizeof (da->da_ddr); + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_resid = aiov.iov_len; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_rw = UIO_WRITE; + auio.uio_offset = (off_t)-1; + auio.uio_td = da->da_td; +#ifdef _KERNEL + if (da->da_fp->f_type == DTYPE_VNODE) + bwillwrite(); + return (fo_write(da->da_fp, &auio, da->da_td->td_ucred, 0, da->da_td)); +#else + fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__); + return (EOPNOTSUPP); +#endif +} + +static int +write_record(struct diffarg *da) +{ + + if (da->da_ddr.ddr_type == DDR_NONE) { + da->da_err = 0; + return (0); + } + + da->da_err = write_bytes(da); + *da->da_offp += sizeof (da->da_ddr); + return (da->da_err); +} + +static int +report_free_dnode_range(struct diffarg *da, uint64_t first, uint64_t last) +{ + ASSERT(first <= last); + if (da->da_ddr.ddr_type != DDR_FREE || + first != da->da_ddr.ddr_last + 1) { + if (write_record(da) != 0) + return (da->da_err); + da->da_ddr.ddr_type = DDR_FREE; + da->da_ddr.ddr_first = first; + da->da_ddr.ddr_last = last; + return (0); + } + da->da_ddr.ddr_last = last; + return (0); +} + +static int +report_dnode(struct diffarg *da, uint64_t object, dnode_phys_t *dnp) +{ + ASSERT(dnp != NULL); + if (dnp->dn_type == DMU_OT_NONE) + return (report_free_dnode_range(da, object, object)); + + if (da->da_ddr.ddr_type != DDR_INUSE || + object != da->da_ddr.ddr_last + 1) { + if (write_record(da) != 0) + return (da->da_err); + da->da_ddr.ddr_type = DDR_INUSE; + da->da_ddr.ddr_first = da->da_ddr.ddr_last = object; + return (0); + } + da->da_ddr.ddr_last = object; + return (0); +} + +#define DBP_SPAN(dnp, level) \ + (((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \ + (level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) + +/* ARGSUSED */ +static int +diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, + const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) +{ + struct diffarg *da = arg; + int err = 0; + + if (issig(JUSTLOOKING) && issig(FORREAL)) + return (SET_ERROR(EINTR)); + + if (bp == NULL || zb->zb_object != DMU_META_DNODE_OBJECT) + return (0); + + if (BP_IS_HOLE(bp)) { + uint64_t span = DBP_SPAN(dnp, zb->zb_level); + uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT; + + err = report_free_dnode_range(da, dnobj, + dnobj + (span >> DNODE_SHIFT) - 1); + if (err) + return (err); + } else if (zb->zb_level == 0) { + dnode_phys_t *blk; + arc_buf_t *abuf; + arc_flags_t aflags = ARC_FLAG_WAIT; + int blksz = BP_GET_LSIZE(bp); + int i; + + if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, + &aflags, zb) != 0) + return (SET_ERROR(EIO)); + + blk = abuf->b_data; + for (i = 0; i < blksz >> DNODE_SHIFT; i++) { + uint64_t dnobj = (zb->zb_blkid << + 
(DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i; + err = report_dnode(da, dnobj, blk+i); + if (err) + break; + } + arc_buf_destroy(abuf, &abuf); + if (err) + return (err); + /* Don't care about the data blocks */ + return (TRAVERSE_VISIT_NO_CHILDREN); + } + return (0); +} + +int +dmu_diff(const char *tosnap_name, const char *fromsnap_name, +#ifdef illumos + struct vnode *vp, offset_t *offp) +#else + struct file *fp, offset_t *offp) +#endif +{ + struct diffarg da; + dsl_dataset_t *fromsnap; + dsl_dataset_t *tosnap; + dsl_pool_t *dp; + int error; + uint64_t fromtxg; + + if (strchr(tosnap_name, '@') == NULL || + strchr(fromsnap_name, '@') == NULL) + return (SET_ERROR(EINVAL)); + + error = dsl_pool_hold(tosnap_name, FTAG, &dp); + if (error != 0) + return (error); + + error = dsl_dataset_hold(dp, tosnap_name, FTAG, &tosnap); + if (error != 0) { + dsl_pool_rele(dp, FTAG); + return (error); + } + + error = dsl_dataset_hold(dp, fromsnap_name, FTAG, &fromsnap); + if (error != 0) { + dsl_dataset_rele(tosnap, FTAG); + dsl_pool_rele(dp, FTAG); + return (error); + } + + if (!dsl_dataset_is_before(tosnap, fromsnap, 0)) { + dsl_dataset_rele(fromsnap, FTAG); + dsl_dataset_rele(tosnap, FTAG); + dsl_pool_rele(dp, FTAG); + return (SET_ERROR(EXDEV)); + } + + fromtxg = dsl_dataset_phys(fromsnap)->ds_creation_txg; + dsl_dataset_rele(fromsnap, FTAG); + + dsl_dataset_long_hold(tosnap, FTAG); + dsl_pool_rele(dp, FTAG); + + da.da_fp = fp; + da.da_offp = offp; + da.da_ddr.ddr_type = DDR_NONE; + da.da_ddr.ddr_first = da.da_ddr.ddr_last = 0; + da.da_err = 0; + da.da_td = curthread; + + error = traverse_dataset(tosnap, fromtxg, + TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, diff_cb, &da); + + if (error != 0) { + da.da_err = error; + } else { + /* we set the da.da_err we return as side-effect */ + (void) write_record(&da); + } + + dsl_dataset_long_rele(tosnap, FTAG); + dsl_dataset_rele(tosnap, FTAG); + + return (da.da_err); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c new file mode 100644 index 000000000000..f830076f767f --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c @@ -0,0 +1,349 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013, 2017 by Delphix. All rights reserved. + * Copyright 2014 HybridCluster. All rights reserved. 
+ */
+
+#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_tx.h>
+#include <sys/dnode.h>
+#include <sys/zap.h>
+#include <sys/zfeature.h>
+#include <sys/dsl_dataset.h>
+
+
+static uint64_t
+dmu_object_alloc_impl(objset_t *os, dmu_object_type_t ot, int blocksize,
+ int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
+ int dnodesize, dmu_tx_t *tx)
+{
+ uint64_t object;
+ uint64_t L1_dnode_count = DNODES_PER_BLOCK <<
+ (DMU_META_DNODE(os)->dn_indblkshift - SPA_BLKPTRSHIFT);
+ dnode_t *dn = NULL;
+ int dn_slots = dnodesize >> DNODE_SHIFT;
+ boolean_t restarted = B_FALSE;
+
+ if (dn_slots == 0) {
+ dn_slots = DNODE_MIN_SLOTS;
+ } else {
+ ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS);
+ ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS);
+ }
+
+ mutex_enter(&os->os_obj_lock);
+ for (;;) {
+ object = os->os_obj_next;
+ /*
+ * Each time we polish off an L1 bp worth of dnodes (2^12
+ * objects), move to another L1 bp that's still
+ * reasonably sparse (at most 1/4 full). Look from the
+ * beginning at most once per txg. If we still can't
+ * allocate from that L1 block, search for an empty L0
+ * block, which will quickly skip to the end of the
+ * metadnode if no nearby L0 blocks are empty. This
+ * fallback avoids a pathology where full dnode blocks
+ * containing large dnodes appear sparse because they
+ * have a low blk_fill, leading to many failed
+ * allocation attempts. In the long term a better
+ * mechanism to search for sparse metadnode regions,
+ * such as spacemaps, could be implemented.
+ *
+ * os_rescan_dnodes is set during txg sync if enough objects
+ * have been freed since the previous rescan to justify
+ * backfilling again.
+ *
+ * Note that dmu_traverse depends on the behavior that we use
+ * multiple blocks of the dnode object before going back to
+ * reuse objects. Any change to this algorithm should preserve
+ * that property or find another solution to the issues
+ * described in traverse_visitbp.
+ */
+ if (P2PHASE(object, L1_dnode_count) == 0) {
+ uint64_t offset;
+ uint64_t blkfill;
+ int minlvl;
+ int error;
+ if (os->os_rescan_dnodes) {
+ offset = 0;
+ os->os_rescan_dnodes = B_FALSE;
+ } else {
+ offset = object << DNODE_SHIFT;
+ }
+ blkfill = restarted ? 1 : DNODES_PER_BLOCK >> 2;
+ minlvl = restarted ? 1 : 2;
+ restarted = B_TRUE;
+ error = dnode_next_offset(DMU_META_DNODE(os),
+ DNODE_FIND_HOLE, &offset, minlvl, blkfill, 0);
+ if (error == 0)
+ object = offset >> DNODE_SHIFT;
+ }
+ os->os_obj_next = object + dn_slots;
+
+ /*
+ * XXX We should check for an i/o error here and return
+ * up to our caller. Actually we should pre-read it in
+ * dmu_tx_assign(), but there is currently no mechanism
+ * to do so.
+ */
+ (void) dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, dn_slots,
+ FTAG, &dn);
+ if (dn)
+ break;
+
+ if (dmu_object_next(os, &object, B_TRUE, 0) == 0)
+ os->os_obj_next = object;
+ else
+ /*
+ * Skip to next known valid starting point for a dnode.
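+ *
+ * (P2ROUNDUP(x, align) rounds x up to the next multiple of
+ * align; e.g. with DNODES_PER_BLOCK = 32, P2ROUNDUP(70, 32)
+ * is 96, so the search resumes at the start of the following
+ * dnode block. Editor's illustration.)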
+ */ + os->os_obj_next = P2ROUNDUP(object + 1, + DNODES_PER_BLOCK); + } + + dnode_allocate(dn, ot, blocksize, indirect_blockshift, + bonustype, bonuslen, dn_slots, tx); + mutex_exit(&os->os_obj_lock); + + dmu_tx_add_new_object(tx, dn); + dnode_rele(dn, FTAG); + + return (object); +} + +uint64_t +dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) +{ + return dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype, + bonuslen, 0, tx); +} + +uint64_t +dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot, int blocksize, + int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, + dmu_tx_t *tx) +{ + return dmu_object_alloc_impl(os, ot, blocksize, indirect_blockshift, + bonustype, bonuslen, 0, tx); +} + +uint64_t +dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize, + dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) +{ + return (dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype, + bonuslen, dnodesize, tx)); +} + +int +dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot, + int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) +{ + return (dmu_object_claim_dnsize(os, object, ot, blocksize, bonustype, + bonuslen, 0, tx)); +} + +int +dmu_object_claim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot, + int blocksize, dmu_object_type_t bonustype, int bonuslen, + int dnodesize, dmu_tx_t *tx) +{ + dnode_t *dn; + int dn_slots = dnodesize >> DNODE_SHIFT; + int err; + + if (dn_slots == 0) + dn_slots = DNODE_MIN_SLOTS; + ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS); + ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS); + + if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx)) + return (SET_ERROR(EBADF)); + + err = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, dn_slots, + FTAG, &dn); + if (err) + return (err); + dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, dn_slots, tx); + dmu_tx_add_new_object(tx, dn); + + dnode_rele(dn, FTAG); + + return (0); +} + +int +dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot, + int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) +{ + return (dmu_object_reclaim_dnsize(os, object, ot, blocksize, bonustype, + bonuslen, 0, tx)); +} + +int +dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot, + int blocksize, dmu_object_type_t bonustype, int bonuslen, int dnodesize, + dmu_tx_t *tx) +{ + dnode_t *dn; + int dn_slots = dnodesize >> DNODE_SHIFT; + int err; + + if (object == DMU_META_DNODE_OBJECT) + return (SET_ERROR(EBADF)); + + err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0, + FTAG, &dn); + if (err) + return (err); + + dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, dn_slots, tx); + + dnode_rele(dn, FTAG); + return (err); +} + + +int +dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx) +{ + dnode_t *dn; + int err; + + ASSERT(object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx)); + + err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0, + FTAG, &dn); + if (err) + return (err); + + ASSERT(dn->dn_type != DMU_OT_NONE); + /* + * If we don't create this free range, we'll leak indirect blocks when + * we get to freeing the dnode in syncing context. 
+ */ + dnode_free_range(dn, 0, DMU_OBJECT_END, tx); + dnode_free(dn, tx); + dnode_rele(dn, FTAG); + + return (0); +} + +/* + * Return (in *objectp) the next object which is allocated (or a hole) + * after *object, taking into account only objects that may have been modified + * after the specified txg. + */ +int +dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg) +{ + uint64_t offset; + dmu_object_info_t doi; + struct dsl_dataset *ds = os->os_dsl_dataset; + int dnodesize; + int error; + + /* + * Avoid expensive dnode hold if this dataset doesn't use large dnodes. + */ + if (ds && ds->ds_feature_inuse[SPA_FEATURE_LARGE_DNODE]) { + error = dmu_object_info(os, *objectp, &doi); + if (error && !(error == EINVAL && *objectp == 0)) + return (SET_ERROR(error)); + else + dnodesize = doi.doi_dnodesize; + } else { + dnodesize = DNODE_MIN_SIZE; + } + + if (*objectp == 0) + offset = 1 << DNODE_SHIFT; + else + offset = (*objectp << DNODE_SHIFT) + dnodesize; + + error = dnode_next_offset(DMU_META_DNODE(os), + (hole ? DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg); + + *objectp = offset >> DNODE_SHIFT; + + return (error); +} + +/* + * Turn this object from old_type into DMU_OTN_ZAP_METADATA, and bump the + * refcount on SPA_FEATURE_EXTENSIBLE_DATASET. + * + * Only for use from syncing context, on MOS objects. + */ +void +dmu_object_zapify(objset_t *mos, uint64_t object, dmu_object_type_t old_type, + dmu_tx_t *tx) +{ + dnode_t *dn; + + ASSERT(dmu_tx_is_syncing(tx)); + + VERIFY0(dnode_hold(mos, object, FTAG, &dn)); + if (dn->dn_type == DMU_OTN_ZAP_METADATA) { + dnode_rele(dn, FTAG); + return; + } + ASSERT3U(dn->dn_type, ==, old_type); + ASSERT0(dn->dn_maxblkid); + + /* + * We must initialize the ZAP data before changing the type, + * so that concurrent calls to *_is_zapified() can determine if + * the object has been completely zapified by checking the type. + */ + mzap_create_impl(mos, object, 0, 0, tx); + + dn->dn_next_type[tx->tx_txg & TXG_MASK] = dn->dn_type = + DMU_OTN_ZAP_METADATA; + dnode_setdirty(dn, tx); + dnode_rele(dn, FTAG); + + spa_feature_incr(dmu_objset_spa(mos), + SPA_FEATURE_EXTENSIBLE_DATASET, tx); +} + +void +dmu_object_free_zapified(objset_t *mos, uint64_t object, dmu_tx_t *tx) +{ + dnode_t *dn; + dmu_object_type_t t; + + ASSERT(dmu_tx_is_syncing(tx)); + + VERIFY0(dnode_hold(mos, object, FTAG, &dn)); + t = dn->dn_type; + dnode_rele(dn, FTAG); + + if (t == DMU_OTN_ZAP_METADATA) { + spa_feature_decr(dmu_objset_spa(mos), + SPA_FEATURE_EXTENSIBLE_DATASET, tx); + } + VERIFY0(dmu_object_free(mos, object, tx)); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c new file mode 100644 index 000000000000..2e36e0e5eec1 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c @@ -0,0 +1,2401 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright (c) 2015, STRATO AG, Inc. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ * Copyright 2017 Nexenta Systems, Inc.
+ */
+
+/* Portions Copyright 2010 Robert Milkowski */
+
+#include <sys/cred.h>
+#include <sys/zfs_context.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dsl_deleg.h>
+#include <sys/dnode.h>
+#include <sys/dbuf.h>
+#include <sys/zvol.h>
+#include <sys/dmu_tx.h>
+#include <sys/zap.h>
+#include <sys/zil.h>
+#include <sys/dmu_impl.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/sa.h>
+#include <sys/zfs_onexit.h>
+#include <sys/dsl_destroy.h>
+#include <sys/vdev.h>
+#include <sys/zfeature.h>
+#include "zfs_namecheck.h"
+
+/*
+ * Needed to close a window in dnode_move() that allows the objset to be freed
+ * before it can be safely accessed.
+ */
+krwlock_t os_lock;
+
+/*
+ * Tunable to override the maximum number of threads for the parallelization
+ * of dmu_objset_find_dp, needed to speed up the import of pools with many
+ * datasets.
+ * Default is 4 times the number of leaf vdevs.
+ */
+int dmu_find_threads = 0;
+
+/*
+ * Backfill lower metadnode objects after this many have been freed.
+ * Backfilling negatively impacts object creation rates, so only do it
+ * if there are enough holes to fill.
+ */
+int dmu_rescan_dnode_threshold = 131072;
+
+static void dmu_objset_find_dp_cb(void *arg);
+
+void
+dmu_objset_init(void)
+{
+ rw_init(&os_lock, NULL, RW_DEFAULT, NULL);
+}
+
+void
+dmu_objset_fini(void)
+{
+ rw_destroy(&os_lock);
+}
+
+spa_t *
+dmu_objset_spa(objset_t *os)
+{
+ return (os->os_spa);
+}
+
+zilog_t *
+dmu_objset_zil(objset_t *os)
+{
+ return (os->os_zil);
+}
+
+dsl_pool_t *
+dmu_objset_pool(objset_t *os)
+{
+ dsl_dataset_t *ds;
+
+ if ((ds = os->os_dsl_dataset) != NULL && ds->ds_dir)
+ return (ds->ds_dir->dd_pool);
+ else
+ return (spa_get_dsl(os->os_spa));
+}
+
+dsl_dataset_t *
+dmu_objset_ds(objset_t *os)
+{
+ return (os->os_dsl_dataset);
+}
+
+dmu_objset_type_t
+dmu_objset_type(objset_t *os)
+{
+ return (os->os_phys->os_type);
+}
+
+void
+dmu_objset_name(objset_t *os, char *buf)
+{
+ dsl_dataset_name(os->os_dsl_dataset, buf);
+}
+
+uint64_t
+dmu_objset_id(objset_t *os)
+{
+ dsl_dataset_t *ds = os->os_dsl_dataset;
+
+ return (ds ? ds->ds_object : 0);
+}
+
+uint64_t
+dmu_objset_dnodesize(objset_t *os)
+{
+ return (os->os_dnodesize);
+}
+
+zfs_sync_type_t
+dmu_objset_syncprop(objset_t *os)
+{
+ return (os->os_sync);
+}
+
+zfs_logbias_op_t
+dmu_objset_logbias(objset_t *os)
+{
+ return (os->os_logbias);
+}
+
+static void
+checksum_changed_cb(void *arg, uint64_t newval)
+{
+ objset_t *os = arg;
+
+ /*
+ * Inheritance should have been done by now.
+ */ + ASSERT(newval != ZIO_CHECKSUM_INHERIT); + + os->os_checksum = zio_checksum_select(newval, ZIO_CHECKSUM_ON_VALUE); +} + +static void +compression_changed_cb(void *arg, uint64_t newval) +{ + objset_t *os = arg; + + /* + * Inheritance and range checking should have been done by now. + */ + ASSERT(newval != ZIO_COMPRESS_INHERIT); + + os->os_compress = zio_compress_select(os->os_spa, newval, + ZIO_COMPRESS_ON); +} + +static void +copies_changed_cb(void *arg, uint64_t newval) +{ + objset_t *os = arg; + + /* + * Inheritance and range checking should have been done by now. + */ + ASSERT(newval > 0); + ASSERT(newval <= spa_max_replication(os->os_spa)); + + os->os_copies = newval; +} + +static void +dedup_changed_cb(void *arg, uint64_t newval) +{ + objset_t *os = arg; + spa_t *spa = os->os_spa; + enum zio_checksum checksum; + + /* + * Inheritance should have been done by now. + */ + ASSERT(newval != ZIO_CHECKSUM_INHERIT); + + checksum = zio_checksum_dedup_select(spa, newval, ZIO_CHECKSUM_OFF); + + os->os_dedup_checksum = checksum & ZIO_CHECKSUM_MASK; + os->os_dedup_verify = !!(checksum & ZIO_CHECKSUM_VERIFY); +} + +static void +primary_cache_changed_cb(void *arg, uint64_t newval) +{ + objset_t *os = arg; + + /* + * Inheritance and range checking should have been done by now. + */ + ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE || + newval == ZFS_CACHE_METADATA); + + os->os_primary_cache = newval; +} + +static void +secondary_cache_changed_cb(void *arg, uint64_t newval) +{ + objset_t *os = arg; + + /* + * Inheritance and range checking should have been done by now. + */ + ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE || + newval == ZFS_CACHE_METADATA); + + os->os_secondary_cache = newval; +} + +static void +sync_changed_cb(void *arg, uint64_t newval) +{ + objset_t *os = arg; + + /* + * Inheritance and range checking should have been done by now. + */ + ASSERT(newval == ZFS_SYNC_STANDARD || newval == ZFS_SYNC_ALWAYS || + newval == ZFS_SYNC_DISABLED); + + os->os_sync = newval; + if (os->os_zil) + zil_set_sync(os->os_zil, newval); +} + +static void +redundant_metadata_changed_cb(void *arg, uint64_t newval) +{ + objset_t *os = arg; + + /* + * Inheritance and range checking should have been done by now. + */ + ASSERT(newval == ZFS_REDUNDANT_METADATA_ALL || + newval == ZFS_REDUNDANT_METADATA_MOST); + + os->os_redundant_metadata = newval; +} + +static void +dnodesize_changed_cb(void *arg, uint64_t newval) +{ + objset_t *os = arg; + + switch (newval) { + case ZFS_DNSIZE_LEGACY: + os->os_dnodesize = DNODE_MIN_SIZE; + break; + case ZFS_DNSIZE_AUTO: + /* + * Choose a dnode size that will work well for most + * workloads if the user specified "auto". Future code + * improvements could dynamically select a dnode size + * based on observed workload patterns. 
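+ *
+ * As a rough illustration (assuming the current DNODE_MIN_SIZE of
+ * 512 bytes), the assignment below makes
+ *
+ *     zfs set dnodesize=auto <dataset>
+ *
+ * behave like an explicit "dnodesize=1k": large enough for a bigger
+ * bonus buffer, without committing to the largest supported size.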
+ */ + os->os_dnodesize = DNODE_MIN_SIZE * 2; + break; + case ZFS_DNSIZE_1K: + case ZFS_DNSIZE_2K: + case ZFS_DNSIZE_4K: + case ZFS_DNSIZE_8K: + case ZFS_DNSIZE_16K: + os->os_dnodesize = newval; + break; + } +} + +static void +logbias_changed_cb(void *arg, uint64_t newval) +{ + objset_t *os = arg; + + ASSERT(newval == ZFS_LOGBIAS_LATENCY || + newval == ZFS_LOGBIAS_THROUGHPUT); + os->os_logbias = newval; + if (os->os_zil) + zil_set_logbias(os->os_zil, newval); +} + +static void +recordsize_changed_cb(void *arg, uint64_t newval) +{ + objset_t *os = arg; + + os->os_recordsize = newval; +} + +void +dmu_objset_byteswap(void *buf, size_t size) +{ + objset_phys_t *osp = buf; + + ASSERT(size == OBJSET_OLD_PHYS_SIZE || size == sizeof (objset_phys_t)); + dnode_byteswap(&osp->os_meta_dnode); + byteswap_uint64_array(&osp->os_zil_header, sizeof (zil_header_t)); + osp->os_type = BSWAP_64(osp->os_type); + osp->os_flags = BSWAP_64(osp->os_flags); + if (size == sizeof (objset_phys_t)) { + dnode_byteswap(&osp->os_userused_dnode); + dnode_byteswap(&osp->os_groupused_dnode); + } +} + +/* + * The hash is a CRC-based hash of the objset_t pointer and the object number. + */ +static uint64_t +dnode_hash(const objset_t *os, uint64_t obj) +{ + uintptr_t osv = (uintptr_t)os; + uint64_t crc = -1ULL; + + ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); + /* + * The low 6 bits of the pointer don't have much entropy, because + * the objset_t is larger than 2^6 bytes long. + */ + crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF]; + crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF]; + crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF]; + crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 16)) & 0xFF]; + + crc ^= (osv>>14) ^ (obj>>24); + + return (crc); +} + +unsigned int +dnode_multilist_index_func(multilist_t *ml, void *obj) +{ + dnode_t *dn = obj; + return (dnode_hash(dn->dn_objset, dn->dn_object) % + multilist_get_num_sublists(ml)); +} + +/* + * Instantiates the objset_t in-memory structure corresponding to the + * objset_phys_t that's pointed to by the specified blkptr_t. + */ +int +dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, + objset_t **osp) +{ + objset_t *os; + int i, err; + + ASSERT(ds == NULL || MUTEX_HELD(&ds->ds_opening_lock)); + +#if 0 + /* + * The $ORIGIN dataset (if it exists) doesn't have an associated + * objset, so there's no reason to open it. The $ORIGIN dataset + * will not exist on pools older than SPA_VERSION_ORIGIN. + */ + if (ds != NULL && spa_get_dsl(spa) != NULL && + spa_get_dsl(spa)->dp_origin_snap != NULL) { + ASSERT3P(ds->ds_dir, !=, + spa_get_dsl(spa)->dp_origin_snap->ds_dir); + } +#endif + + os = kmem_zalloc(sizeof (objset_t), KM_SLEEP); + os->os_dsl_dataset = ds; + os->os_spa = spa; + os->os_rootbp = bp; + if (!BP_IS_HOLE(os->os_rootbp)) { + arc_flags_t aflags = ARC_FLAG_WAIT; + zbookmark_phys_t zb; + SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, + ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); + + if (DMU_OS_IS_L2CACHEABLE(os)) + aflags |= ARC_FLAG_L2CACHE; + + dprintf_bp(os->os_rootbp, "reading %s", ""); + err = arc_read(NULL, spa, os->os_rootbp, + arc_getbuf_func, &os->os_phys_buf, + ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb); + if (err != 0) { + kmem_free(os, sizeof (objset_t)); + /* convert checksum errors into IO errors */ + if (err == ECKSUM) + err = SET_ERROR(EIO); + return (err); + } + + /* Increase the blocksize if we are permitted. 
*/ + if (spa_version(spa) >= SPA_VERSION_USERSPACE && + arc_buf_size(os->os_phys_buf) < sizeof (objset_phys_t)) { + arc_buf_t *buf = arc_alloc_buf(spa, &os->os_phys_buf, + ARC_BUFC_METADATA, sizeof (objset_phys_t)); + bzero(buf->b_data, sizeof (objset_phys_t)); + bcopy(os->os_phys_buf->b_data, buf->b_data, + arc_buf_size(os->os_phys_buf)); + arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf); + os->os_phys_buf = buf; + } + + os->os_phys = os->os_phys_buf->b_data; + os->os_flags = os->os_phys->os_flags; + } else { + int size = spa_version(spa) >= SPA_VERSION_USERSPACE ? + sizeof (objset_phys_t) : OBJSET_OLD_PHYS_SIZE; + os->os_phys_buf = arc_alloc_buf(spa, &os->os_phys_buf, + ARC_BUFC_METADATA, size); + os->os_phys = os->os_phys_buf->b_data; + bzero(os->os_phys, size); + } + + /* + * Note: the changed_cb will be called once before the register + * func returns, thus changing the checksum/compression from the + * default (fletcher2/off). Snapshots don't need to know about + * checksum/compression/copies. + */ + if (ds != NULL) { + boolean_t needlock = B_FALSE; + + /* + * Note: it's valid to open the objset if the dataset is + * long-held, in which case the pool_config lock will not + * be held. + */ + if (!dsl_pool_config_held(dmu_objset_pool(os))) { + needlock = B_TRUE; + dsl_pool_config_enter(dmu_objset_pool(os), FTAG); + } + err = dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE), + primary_cache_changed_cb, os); + if (err == 0) { + err = dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE), + secondary_cache_changed_cb, os); + } + if (!ds->ds_is_snapshot) { + if (err == 0) { + err = dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_CHECKSUM), + checksum_changed_cb, os); + } + if (err == 0) { + err = dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_COMPRESSION), + compression_changed_cb, os); + } + if (err == 0) { + err = dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_COPIES), + copies_changed_cb, os); + } + if (err == 0) { + err = dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_DEDUP), + dedup_changed_cb, os); + } + if (err == 0) { + err = dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_LOGBIAS), + logbias_changed_cb, os); + } + if (err == 0) { + err = dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_SYNC), + sync_changed_cb, os); + } + if (err == 0) { + err = dsl_prop_register(ds, + zfs_prop_to_name( + ZFS_PROP_REDUNDANT_METADATA), + redundant_metadata_changed_cb, os); + } + if (err == 0) { + err = dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_RECORDSIZE), + recordsize_changed_cb, os); + } + if (err == 0) { + err = dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_DNODESIZE), + dnodesize_changed_cb, os); + } + } + if (needlock) + dsl_pool_config_exit(dmu_objset_pool(os), FTAG); + if (err != 0) { + arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf); + kmem_free(os, sizeof (objset_t)); + return (err); + } + } else { + /* It's the meta-objset. */ + os->os_checksum = ZIO_CHECKSUM_FLETCHER_4; + os->os_compress = ZIO_COMPRESS_ON; + os->os_copies = spa_max_replication(spa); + os->os_dedup_checksum = ZIO_CHECKSUM_OFF; + os->os_dedup_verify = B_FALSE; + os->os_logbias = ZFS_LOGBIAS_LATENCY; + os->os_sync = ZFS_SYNC_STANDARD; + os->os_primary_cache = ZFS_CACHE_ALL; + os->os_secondary_cache = ZFS_CACHE_ALL; + os->os_dnodesize = DNODE_MIN_SIZE; + } + /* + * These properties will be filled in by the logic in zfs_get_zplprop() + * when they are queried for the first time. 
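+ *
+ * A consumer-side sketch (illustrative only; the real lookup lives
+ * in the ZPL):
+ *
+ *     uint64_t zplver;
+ *     error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zplver);
+ *
+ * after which the answer can be cached in os_version below so that
+ * later queries avoid another ZAP lookup.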
+ */ + os->os_version = OBJSET_PROP_UNINITIALIZED; + os->os_normalization = OBJSET_PROP_UNINITIALIZED; + os->os_utf8only = OBJSET_PROP_UNINITIALIZED; + os->os_casesensitivity = OBJSET_PROP_UNINITIALIZED; + + if (ds == NULL || !ds->ds_is_snapshot) + os->os_zil_header = os->os_phys->os_zil_header; + os->os_zil = zil_alloc(os, &os->os_zil_header); + + for (i = 0; i < TXG_SIZE; i++) { + os->os_dirty_dnodes[i] = multilist_create(sizeof (dnode_t), + offsetof(dnode_t, dn_dirty_link[i]), + dnode_multilist_index_func); + } + list_create(&os->os_dnodes, sizeof (dnode_t), + offsetof(dnode_t, dn_link)); + list_create(&os->os_downgraded_dbufs, sizeof (dmu_buf_impl_t), + offsetof(dmu_buf_impl_t, db_link)); + + mutex_init(&os->os_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&os->os_userused_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL); + + dnode_special_open(os, &os->os_phys->os_meta_dnode, + DMU_META_DNODE_OBJECT, &os->os_meta_dnode); + if (arc_buf_size(os->os_phys_buf) >= sizeof (objset_phys_t)) { + dnode_special_open(os, &os->os_phys->os_userused_dnode, + DMU_USERUSED_OBJECT, &os->os_userused_dnode); + dnode_special_open(os, &os->os_phys->os_groupused_dnode, + DMU_GROUPUSED_OBJECT, &os->os_groupused_dnode); + } + + *osp = os; + return (0); +} + +int +dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp) +{ + int err = 0; + + /* + * We shouldn't be doing anything with dsl_dataset_t's unless the + * pool_config lock is held, or the dataset is long-held. + */ + ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool) || + dsl_dataset_long_held(ds)); + + mutex_enter(&ds->ds_opening_lock); + if (ds->ds_objset == NULL) { + objset_t *os; + rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); + err = dmu_objset_open_impl(dsl_dataset_get_spa(ds), + ds, dsl_dataset_get_blkptr(ds), &os); + rrw_exit(&ds->ds_bp_rwlock, FTAG); + + if (err == 0) { + mutex_enter(&ds->ds_lock); + ASSERT(ds->ds_objset == NULL); + ds->ds_objset = os; + mutex_exit(&ds->ds_lock); + } + } + *osp = ds->ds_objset; + mutex_exit(&ds->ds_opening_lock); + return (err); +} + +/* + * Holds the pool while the objset is held. Therefore only one objset + * can be held at a time. + */ +int +dmu_objset_hold(const char *name, void *tag, objset_t **osp) +{ + dsl_pool_t *dp; + dsl_dataset_t *ds; + int err; + + err = dsl_pool_hold(name, tag, &dp); + if (err != 0) + return (err); + err = dsl_dataset_hold(dp, name, tag, &ds); + if (err != 0) { + dsl_pool_rele(dp, tag); + return (err); + } + + err = dmu_objset_from_ds(ds, osp); + if (err != 0) { + dsl_dataset_rele(ds, tag); + dsl_pool_rele(dp, tag); + } + + return (err); +} + +static int +dmu_objset_own_impl(dsl_dataset_t *ds, dmu_objset_type_t type, + boolean_t readonly, void *tag, objset_t **osp) +{ + int err; + + err = dmu_objset_from_ds(ds, osp); + if (err != 0) { + dsl_dataset_disown(ds, tag); + } else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) { + dsl_dataset_disown(ds, tag); + return (SET_ERROR(EINVAL)); + } else if (!readonly && dsl_dataset_is_snapshot(ds)) { + dsl_dataset_disown(ds, tag); + return (SET_ERROR(EROFS)); + } + return (err); +} + +/* + * dsl_pool must not be held when this is called. + * Upon successful return, there will be a longhold on the dataset, + * and the dsl_pool will not be held. 
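+ *
+ * Typical usage, as a sketch (names illustrative, error handling
+ * abbreviated):
+ *
+ *     objset_t *os;
+ *     error = dmu_objset_own("pool/fs", DMU_OST_ZFS, B_TRUE, FTAG, &os);
+ *     if (error == 0) {
+ *         ... the dataset is long-held and cannot be destroyed ...
+ *         dmu_objset_disown(os, FTAG);
+ *     }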
+ */
+int
+dmu_objset_own(const char *name, dmu_objset_type_t type,
+ boolean_t readonly, void *tag, objset_t **osp)
+{
+ dsl_pool_t *dp;
+ dsl_dataset_t *ds;
+ int err;
+
+ err = dsl_pool_hold(name, FTAG, &dp);
+ if (err != 0)
+ return (err);
+ err = dsl_dataset_own(dp, name, tag, &ds);
+ if (err != 0) {
+ dsl_pool_rele(dp, FTAG);
+ return (err);
+ }
+ err = dmu_objset_own_impl(ds, type, readonly, tag, osp);
+ dsl_pool_rele(dp, FTAG);
+
+ return (err);
+}
+
+int
+dmu_objset_own_obj(dsl_pool_t *dp, uint64_t obj, dmu_objset_type_t type,
+ boolean_t readonly, void *tag, objset_t **osp)
+{
+ dsl_dataset_t *ds;
+ int err;
+
+ err = dsl_dataset_own_obj(dp, obj, tag, &ds);
+ if (err != 0)
+ return (err);
+
+ return (dmu_objset_own_impl(ds, type, readonly, tag, osp));
+}
+
+void
+dmu_objset_rele(objset_t *os, void *tag)
+{
+ dsl_pool_t *dp = dmu_objset_pool(os);
+ dsl_dataset_rele(os->os_dsl_dataset, tag);
+ dsl_pool_rele(dp, tag);
+}
+
+/*
+ * When we are called, ds MUST refer to a dataset that is owned by 'tag';
+ * that is, it is held and long held by 'tag' and ds_owner == tag. We will
+ * then release and reacquire ownership of the dataset while holding the
+ * pool config_rwlock so that no intervening namespace or ownership changes
+ * can occur.
+ *
+ * This exists solely to accommodate zfs_ioc_userspace_upgrade()'s desire to
+ * release the hold on its dataset and acquire a new one on the dataset of the
+ * same name so that it can be partially torn down and reconstructed.
+ */
+void
+dmu_objset_refresh_ownership(dsl_dataset_t *ds, dsl_dataset_t **newds,
+ void *tag)
+{
+ dsl_pool_t *dp;
+ char name[ZFS_MAX_DATASET_NAME_LEN];
+
+ VERIFY3P(ds, !=, NULL);
+ VERIFY3P(ds->ds_owner, ==, tag);
+ VERIFY(dsl_dataset_long_held(ds));
+
+ dsl_dataset_name(ds, name);
+ dp = ds->ds_dir->dd_pool;
+ dsl_pool_config_enter(dp, FTAG);
+ dsl_dataset_disown(ds, tag);
+ VERIFY0(dsl_dataset_own(dp, name, tag, newds));
+ dsl_pool_config_exit(dp, FTAG);
+}
+
+void
+dmu_objset_disown(objset_t *os, void *tag)
+{
+ dsl_dataset_disown(os->os_dsl_dataset, tag);
+}
+
+void
+dmu_objset_evict_dbufs(objset_t *os)
+{
+ dnode_t dn_marker;
+ dnode_t *dn;
+
+ mutex_enter(&os->os_lock);
+ dn = list_head(&os->os_dnodes);
+ while (dn != NULL) {
+ /*
+ * Skip dnodes without holds. We have to do this dance
+ * because dnode_add_ref() only works if there is already a
+ * hold. If the dnode has no holds, then it has no dbufs.
+ */
+ if (dnode_add_ref(dn, FTAG)) {
+ list_insert_after(&os->os_dnodes, dn, &dn_marker);
+ mutex_exit(&os->os_lock);
+
+ dnode_evict_dbufs(dn);
+ dnode_rele(dn, FTAG);
+
+ mutex_enter(&os->os_lock);
+ dn = list_next(&os->os_dnodes, &dn_marker);
+ list_remove(&os->os_dnodes, &dn_marker);
+ } else {
+ dn = list_next(&os->os_dnodes, dn);
+ }
+ }
+ mutex_exit(&os->os_lock);
+
+ if (DMU_USERUSED_DNODE(os) != NULL) {
+ dnode_evict_dbufs(DMU_GROUPUSED_DNODE(os));
+ dnode_evict_dbufs(DMU_USERUSED_DNODE(os));
+ }
+ dnode_evict_dbufs(DMU_META_DNODE(os));
+}
+
+/*
+ * Objset eviction processing is split into two pieces.
+ * The first marks the objset as evicting, evicts any dbufs that
+ * have a refcount of zero, and then queues up the objset for the
+ * second phase of eviction. Once os->os_dnodes has been cleared by
+ * dnode_buf_pageout()->dnode_destroy(), the second phase is executed.
+ * The second phase closes the special dnodes, dequeues the objset from
+ * the list of those undergoing eviction, and finally frees the objset.
+ *
+ * NOTE: Due to asynchronous eviction processing (invocation of
+ * dnode_buf_pageout()), it is possible for the meta dnode for the
+ * objset to have no holds even though os->os_dnodes is not empty.
+ */
+void
+dmu_objset_evict(objset_t *os)
+{
+ dsl_dataset_t *ds = os->os_dsl_dataset;
+
+ for (int t = 0; t < TXG_SIZE; t++)
+ ASSERT(!dmu_objset_is_dirty(os, t));
+
+ if (ds)
+ dsl_prop_unregister_all(ds, os);
+
+ if (os->os_sa)
+ sa_tear_down(os);
+
+ dmu_objset_evict_dbufs(os);
+
+ mutex_enter(&os->os_lock);
+ spa_evicting_os_register(os->os_spa, os);
+ if (list_is_empty(&os->os_dnodes)) {
+ mutex_exit(&os->os_lock);
+ dmu_objset_evict_done(os);
+ } else {
+ mutex_exit(&os->os_lock);
+ }
+}
+
+void
+dmu_objset_evict_done(objset_t *os)
+{
+ ASSERT3P(list_head(&os->os_dnodes), ==, NULL);
+
+ dnode_special_close(&os->os_meta_dnode);
+ if (DMU_USERUSED_DNODE(os)) {
+ dnode_special_close(&os->os_userused_dnode);
+ dnode_special_close(&os->os_groupused_dnode);
+ }
+ zil_free(os->os_zil);
+
+ arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf);
+
+ /*
+ * This is a barrier to prevent the objset from going away in
+ * dnode_move() until we can safely ensure that the objset is still in
+ * use. We consider the objset valid before the barrier and invalid
+ * after the barrier.
+ */
+ rw_enter(&os_lock, RW_READER);
+ rw_exit(&os_lock);
+
+ mutex_destroy(&os->os_lock);
+ mutex_destroy(&os->os_userused_lock);
+ mutex_destroy(&os->os_obj_lock);
+ mutex_destroy(&os->os_user_ptr_lock);
+ for (int i = 0; i < TXG_SIZE; i++) {
+ multilist_destroy(os->os_dirty_dnodes[i]);
+ }
+ spa_evicting_os_deregister(os->os_spa, os);
+ kmem_free(os, sizeof (objset_t));
+}
+
+timestruc_t
+dmu_objset_snap_cmtime(objset_t *os)
+{
+ return (dsl_dir_snap_cmtime(os->os_dsl_dataset->ds_dir));
+}
+
+/* called from dsl for meta-objset */
+objset_t *
+dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
+ dmu_objset_type_t type, dmu_tx_t *tx)
+{
+ objset_t *os;
+ dnode_t *mdn;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ if (ds != NULL)
+ VERIFY0(dmu_objset_from_ds(ds, &os));
+ else
+ VERIFY0(dmu_objset_open_impl(spa, NULL, bp, &os));
+
+ mdn = DMU_META_DNODE(os);
+
+ dnode_allocate(mdn, DMU_OT_DNODE, DNODE_BLOCK_SIZE, DN_MAX_INDBLKSHIFT,
+ DMU_OT_NONE, 0, DNODE_MIN_SLOTS, tx);
+
+ /*
+ * We don't want to have to increase the meta-dnode's nlevels
+ * later, because then we could do it in quiescing context while
+ * we are also accessing it in open context.
+ *
+ * This precaution is not necessary for the MOS (ds == NULL),
+ * because the MOS is only updated in syncing context.
+ * This is most fortunate: the MOS is the only objset that
+ * needs to be synced multiple times as spa_sync() iterates
+ * to convergence, so minimizing its dn_nlevels matters.
+ */
+ if (ds != NULL) {
+ int levels = 1;
+
+ /*
+ * Determine the number of levels necessary for the meta-dnode
+ * to contain DN_MAX_OBJECT dnodes. Note that in order to
+ * ensure that we do not overflow 64 bits, there has to be
+ * a nlevels that gives us a number of blocks > DN_MAX_OBJECT
+ * but < 2^64. Therefore,
+ * (mdn->dn_indblkshift - SPA_BLKPTRSHIFT) (10) must be
+ * less than (64 - log2(DN_MAX_OBJECT)) (16).
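+ *
+ * Worked example with the current constants (for illustration,
+ * assuming dn_nblkptr of 3 for the meta-dnode): its 16K data blocks
+ * give datablkshift - DNODE_SHIFT = 14 - 9 = 5 (32 dnodes per
+ * block), and its 128K indirect blocks hold 2^(17 - 7) = 1024
+ * block pointers each. The loop below therefore evaluates
+ * 3 << (5 + 10 * (levels - 1)) and settles on levels = 6, the
+ * smallest value for which that count reaches DN_MAX_OBJECT (2^48)
+ * dnodes.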
+ */ + while ((uint64_t)mdn->dn_nblkptr << + (mdn->dn_datablkshift - DNODE_SHIFT + + (levels - 1) * (mdn->dn_indblkshift - SPA_BLKPTRSHIFT)) < + DN_MAX_OBJECT) + levels++; + + mdn->dn_next_nlevels[tx->tx_txg & TXG_MASK] = + mdn->dn_nlevels = levels; + } + + ASSERT(type != DMU_OST_NONE); + ASSERT(type != DMU_OST_ANY); + ASSERT(type < DMU_OST_NUMTYPES); + os->os_phys->os_type = type; + if (dmu_objset_userused_enabled(os)) { + os->os_phys->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE; + os->os_flags = os->os_phys->os_flags; + } + + dsl_dataset_dirty(ds, tx); + + return (os); +} + +typedef struct dmu_objset_create_arg { + const char *doca_name; + cred_t *doca_cred; + void (*doca_userfunc)(objset_t *os, void *arg, + cred_t *cr, dmu_tx_t *tx); + void *doca_userarg; + dmu_objset_type_t doca_type; + uint64_t doca_flags; +} dmu_objset_create_arg_t; + +/*ARGSUSED*/ +static int +dmu_objset_create_check(void *arg, dmu_tx_t *tx) +{ + dmu_objset_create_arg_t *doca = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dir_t *pdd; + const char *tail; + int error; + + if (strchr(doca->doca_name, '@') != NULL) + return (SET_ERROR(EINVAL)); + + if (strlen(doca->doca_name) >= ZFS_MAX_DATASET_NAME_LEN) + return (SET_ERROR(ENAMETOOLONG)); + + if (dataset_nestcheck(doca->doca_name) != 0) + return (SET_ERROR(ENAMETOOLONG)); + + error = dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail); + if (error != 0) + return (error); + if (tail == NULL) { + dsl_dir_rele(pdd, FTAG); + return (SET_ERROR(EEXIST)); + } + error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL, + doca->doca_cred); + dsl_dir_rele(pdd, FTAG); + + return (error); +} + +static void +dmu_objset_create_sync(void *arg, dmu_tx_t *tx) +{ + dmu_objset_create_arg_t *doca = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dir_t *pdd; + const char *tail; + dsl_dataset_t *ds; + uint64_t obj; + blkptr_t *bp; + objset_t *os; + + VERIFY0(dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail)); + + obj = dsl_dataset_create_sync(pdd, tail, NULL, doca->doca_flags, + doca->doca_cred, tx); + + VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds)); + rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); + bp = dsl_dataset_get_blkptr(ds); + os = dmu_objset_create_impl(pdd->dd_pool->dp_spa, + ds, bp, doca->doca_type, tx); + rrw_exit(&ds->ds_bp_rwlock, FTAG); + + if (doca->doca_userfunc != NULL) { + doca->doca_userfunc(os, doca->doca_userarg, + doca->doca_cred, tx); + } + + spa_history_log_internal_ds(ds, "create", tx, ""); + dsl_dataset_rele(ds, FTAG); + dsl_dir_rele(pdd, FTAG); +} + +int +dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags, + void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg) +{ + dmu_objset_create_arg_t doca; + + doca.doca_name = name; + doca.doca_cred = CRED(); + doca.doca_flags = flags; + doca.doca_userfunc = func; + doca.doca_userarg = arg; + doca.doca_type = type; + + return (dsl_sync_task(name, + dmu_objset_create_check, dmu_objset_create_sync, &doca, + 5, ZFS_SPACE_CHECK_NORMAL)); +} + +typedef struct dmu_objset_clone_arg { + const char *doca_clone; + const char *doca_origin; + cred_t *doca_cred; +} dmu_objset_clone_arg_t; + +/*ARGSUSED*/ +static int +dmu_objset_clone_check(void *arg, dmu_tx_t *tx) +{ + dmu_objset_clone_arg_t *doca = arg; + dsl_dir_t *pdd; + const char *tail; + int error; + dsl_dataset_t *origin; + dsl_pool_t *dp = dmu_tx_pool(tx); + + if (strchr(doca->doca_clone, '@') != NULL) + return (SET_ERROR(EINVAL)); + + if (strlen(doca->doca_clone) >= ZFS_MAX_DATASET_NAME_LEN) + 
return (SET_ERROR(ENAMETOOLONG)); + + error = dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail); + if (error != 0) + return (error); + if (tail == NULL) { + dsl_dir_rele(pdd, FTAG); + return (SET_ERROR(EEXIST)); + } + + error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL, + doca->doca_cred); + if (error != 0) { + dsl_dir_rele(pdd, FTAG); + return (SET_ERROR(EDQUOT)); + } + dsl_dir_rele(pdd, FTAG); + + error = dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin); + if (error != 0) + return (error); + + /* You can only clone snapshots, not the head datasets. */ + if (!origin->ds_is_snapshot) { + dsl_dataset_rele(origin, FTAG); + return (SET_ERROR(EINVAL)); + } + dsl_dataset_rele(origin, FTAG); + + return (0); +} + +static void +dmu_objset_clone_sync(void *arg, dmu_tx_t *tx) +{ + dmu_objset_clone_arg_t *doca = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dir_t *pdd; + const char *tail; + dsl_dataset_t *origin, *ds; + uint64_t obj; + char namebuf[ZFS_MAX_DATASET_NAME_LEN]; + + VERIFY0(dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail)); + VERIFY0(dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin)); + + obj = dsl_dataset_create_sync(pdd, tail, origin, 0, + doca->doca_cred, tx); + + VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds)); + dsl_dataset_name(origin, namebuf); + spa_history_log_internal_ds(ds, "clone", tx, + "origin=%s (%llu)", namebuf, origin->ds_object); + dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele(origin, FTAG); + dsl_dir_rele(pdd, FTAG); +} + +int +dmu_objset_clone(const char *clone, const char *origin) +{ + dmu_objset_clone_arg_t doca; + + doca.doca_clone = clone; + doca.doca_origin = origin; + doca.doca_cred = CRED(); + + return (dsl_sync_task(clone, + dmu_objset_clone_check, dmu_objset_clone_sync, &doca, + 5, ZFS_SPACE_CHECK_NORMAL)); +} + +static int +dmu_objset_remap_indirects_impl(objset_t *os, uint64_t last_removed_txg) +{ + int error = 0; + uint64_t object = 0; + while ((error = dmu_object_next(os, &object, B_FALSE, 0)) == 0) { + error = dmu_object_remap_indirects(os, object, + last_removed_txg); + /* + * If the ZPL removed the object before we managed to dnode_hold + * it, we would get an ENOENT. If the ZPL declares its intent + * to remove the object (dnode_free) before we manage to + * dnode_hold it, we would get an EEXIST. In either case, we + * want to continue remapping the other objects in the objset; + * in all other cases, we want to break early. + */ + if (error != 0 && error != ENOENT && error != EEXIST) { + break; + } + } + if (error == ESRCH) { + error = 0; + } + return (error); +} + +int +dmu_objset_remap_indirects(const char *fsname) +{ + int error = 0; + objset_t *os = NULL; + uint64_t last_removed_txg; + uint64_t remap_start_txg; + dsl_dir_t *dd; + + error = dmu_objset_hold(fsname, FTAG, &os); + if (error != 0) { + return (error); + } + dd = dmu_objset_ds(os)->ds_dir; + + if (!spa_feature_is_enabled(dmu_objset_spa(os), + SPA_FEATURE_OBSOLETE_COUNTS)) { + dmu_objset_rele(os, FTAG); + return (SET_ERROR(ENOTSUP)); + } + + if (dsl_dataset_is_snapshot(dmu_objset_ds(os))) { + dmu_objset_rele(os, FTAG); + return (SET_ERROR(EINVAL)); + } + + /* + * If there has not been a removal, we're done. + */ + last_removed_txg = spa_get_last_removal_txg(dmu_objset_spa(os)); + if (last_removed_txg == -1ULL) { + dmu_objset_rele(os, FTAG); + return (0); + } + + /* + * If we have remapped since the last removal, we're done. 
+ */
+ if (dsl_dir_is_zapified(dd)) {
+ uint64_t last_remap_txg;
+ if (zap_lookup(spa_meta_objset(dmu_objset_spa(os)),
+ dd->dd_object, DD_FIELD_LAST_REMAP_TXG,
+ sizeof (last_remap_txg), 1, &last_remap_txg) == 0 &&
+ last_remap_txg > last_removed_txg) {
+ dmu_objset_rele(os, FTAG);
+ return (0);
+ }
+ }
+
+ dsl_dataset_long_hold(dmu_objset_ds(os), FTAG);
+ dsl_pool_rele(dmu_objset_pool(os), FTAG);
+
+ remap_start_txg = spa_last_synced_txg(dmu_objset_spa(os));
+ error = dmu_objset_remap_indirects_impl(os, last_removed_txg);
+ if (error == 0) {
+ /*
+ * We update the last_remap_txg to be the start txg so that
+ * we can guarantee that every block older than last_remap_txg
+ * that can be remapped has been remapped.
+ */
+ error = dsl_dir_update_last_remap_txg(dd, remap_start_txg);
+ }
+
+ dsl_dataset_long_rele(dmu_objset_ds(os), FTAG);
+ dsl_dataset_rele(dmu_objset_ds(os), FTAG);
+
+ return (error);
+}
+
+int
+dmu_objset_snapshot_one(const char *fsname, const char *snapname)
+{
+ int err;
+ char *longsnap = kmem_asprintf("%s@%s", fsname, snapname);
+ nvlist_t *snaps = fnvlist_alloc();
+
+ fnvlist_add_boolean(snaps, longsnap);
+ strfree(longsnap);
+ err = dsl_dataset_snapshot(snaps, NULL, NULL);
+ fnvlist_free(snaps);
+ return (err);
+}
+
+static void
+dmu_objset_sync_dnodes(multilist_sublist_t *list, dmu_tx_t *tx)
+{
+ dnode_t *dn;
+
+ while ((dn = multilist_sublist_head(list)) != NULL) {
+ ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
+ ASSERT(dn->dn_dbuf->db_data_pending);
+ /*
+ * Initialize dn_zio outside dnode_sync() because the
+ * meta-dnode needs to set it outside dnode_sync().
+ */
+ dn->dn_zio = dn->dn_dbuf->db_data_pending->dr_zio;
+ ASSERT(dn->dn_zio);
+
+ ASSERT3U(dn->dn_nlevels, <=, DN_MAX_LEVELS);
+ multilist_sublist_remove(list, dn);
+
+ multilist_t *newlist = dn->dn_objset->os_synced_dnodes;
+ if (newlist != NULL) {
+ (void) dnode_add_ref(dn, newlist);
+ multilist_insert(newlist, dn);
+ }
+
+ dnode_sync(dn, tx);
+ }
+}
+
+/* ARGSUSED */
+static void
+dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg)
+{
+ blkptr_t *bp = zio->io_bp;
+ objset_t *os = arg;
+ dnode_phys_t *dnp = &os->os_phys->os_meta_dnode;
+
+ ASSERT(!BP_IS_EMBEDDED(bp));
+ ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_OBJSET);
+ ASSERT0(BP_GET_LEVEL(bp));
+
+ /*
+ * Update rootbp fill count: it should be the number of objects
+ * allocated in the object set (not counting the "special"
+ * objects that are stored in the objset_phys_t -- the meta
+ * dnode and user/group accounting objects).
+ */ + bp->blk_fill = 0; + for (int i = 0; i < dnp->dn_nblkptr; i++) + bp->blk_fill += BP_GET_FILL(&dnp->dn_blkptr[i]); + if (os->os_dsl_dataset != NULL) + rrw_enter(&os->os_dsl_dataset->ds_bp_rwlock, RW_WRITER, FTAG); + *os->os_rootbp = *bp; + if (os->os_dsl_dataset != NULL) + rrw_exit(&os->os_dsl_dataset->ds_bp_rwlock, FTAG); +} + +/* ARGSUSED */ +static void +dmu_objset_write_done(zio_t *zio, arc_buf_t *abuf, void *arg) +{ + blkptr_t *bp = zio->io_bp; + blkptr_t *bp_orig = &zio->io_bp_orig; + objset_t *os = arg; + + if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { + ASSERT(BP_EQUAL(bp, bp_orig)); + } else { + dsl_dataset_t *ds = os->os_dsl_dataset; + dmu_tx_t *tx = os->os_synctx; + + (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE); + dsl_dataset_block_born(ds, bp, tx); + } + kmem_free(bp, sizeof (*bp)); +} + +typedef struct sync_dnodes_arg { + multilist_t *sda_list; + int sda_sublist_idx; + multilist_t *sda_newlist; + dmu_tx_t *sda_tx; +} sync_dnodes_arg_t; + +static void +sync_dnodes_task(void *arg) +{ + sync_dnodes_arg_t *sda = arg; + + multilist_sublist_t *ms = + multilist_sublist_lock(sda->sda_list, sda->sda_sublist_idx); + + dmu_objset_sync_dnodes(ms, sda->sda_tx); + + multilist_sublist_unlock(ms); + + kmem_free(sda, sizeof (*sda)); +} + + +/* called from dsl */ +void +dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) +{ + int txgoff; + zbookmark_phys_t zb; + zio_prop_t zp; + zio_t *zio; + list_t *list; + dbuf_dirty_record_t *dr; + blkptr_t *blkptr_copy = kmem_alloc(sizeof (*os->os_rootbp), KM_SLEEP); + *blkptr_copy = *os->os_rootbp; + + dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg); + + ASSERT(dmu_tx_is_syncing(tx)); + /* XXX the write_done callback should really give us the tx... */ + os->os_synctx = tx; + + if (os->os_dsl_dataset == NULL) { + /* + * This is the MOS. If we have upgraded, + * spa_max_replication() could change, so reset + * os_copies here. + */ + os->os_copies = spa_max_replication(os->os_spa); + } + + /* + * Create the root block IO + */ + SET_BOOKMARK(&zb, os->os_dsl_dataset ? + os->os_dsl_dataset->ds_object : DMU_META_OBJSET, + ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); + arc_release(os->os_phys_buf, &os->os_phys_buf); + + dmu_write_policy(os, NULL, 0, 0, &zp); + + zio = arc_write(pio, os->os_spa, tx->tx_txg, + blkptr_copy, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os), + &zp, dmu_objset_write_ready, NULL, NULL, dmu_objset_write_done, + os, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); + + /* + * Sync special dnodes - the parent IO for the sync is the root block + */ + DMU_META_DNODE(os)->dn_zio = zio; + dnode_sync(DMU_META_DNODE(os), tx); + + os->os_phys->os_flags = os->os_flags; + + if (DMU_USERUSED_DNODE(os) && + DMU_USERUSED_DNODE(os)->dn_type != DMU_OT_NONE) { + DMU_USERUSED_DNODE(os)->dn_zio = zio; + dnode_sync(DMU_USERUSED_DNODE(os), tx); + DMU_GROUPUSED_DNODE(os)->dn_zio = zio; + dnode_sync(DMU_GROUPUSED_DNODE(os), tx); + } + + txgoff = tx->tx_txg & TXG_MASK; + + if (dmu_objset_userused_enabled(os)) { + /* + * We must create the list here because it uses the + * dn_dirty_link[] of this txg. But it may already + * exist because we call dsl_dataset_sync() twice per txg. 
+ */ + if (os->os_synced_dnodes == NULL) { + os->os_synced_dnodes = + multilist_create(sizeof (dnode_t), + offsetof(dnode_t, dn_dirty_link[txgoff]), + dnode_multilist_index_func); + } else { + ASSERT3U(os->os_synced_dnodes->ml_offset, ==, + offsetof(dnode_t, dn_dirty_link[txgoff])); + } + } + + for (int i = 0; + i < multilist_get_num_sublists(os->os_dirty_dnodes[txgoff]); i++) { + sync_dnodes_arg_t *sda = kmem_alloc(sizeof (*sda), KM_SLEEP); + sda->sda_list = os->os_dirty_dnodes[txgoff]; + sda->sda_sublist_idx = i; + sda->sda_tx = tx; + (void) taskq_dispatch(dmu_objset_pool(os)->dp_sync_taskq, + sync_dnodes_task, sda, 0); + /* callback frees sda */ + } + taskq_wait(dmu_objset_pool(os)->dp_sync_taskq); + + list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff]; + while ((dr = list_head(list)) != NULL) { + ASSERT0(dr->dr_dbuf->db_level); + list_remove(list, dr); + if (dr->dr_zio) + zio_nowait(dr->dr_zio); + } + + /* Enable dnode backfill if enough objects have been freed. */ + if (os->os_freed_dnodes >= dmu_rescan_dnode_threshold) { + os->os_rescan_dnodes = B_TRUE; + os->os_freed_dnodes = 0; + } + + /* + * Free intent log blocks up to this tx. + */ + zil_sync(os->os_zil, tx); + os->os_phys->os_zil_header = os->os_zil_header; + zio_nowait(zio); +} + +boolean_t +dmu_objset_is_dirty(objset_t *os, uint64_t txg) +{ + return (!multilist_is_empty(os->os_dirty_dnodes[txg & TXG_MASK])); +} + +static objset_used_cb_t *used_cbs[DMU_OST_NUMTYPES]; + +void +dmu_objset_register_type(dmu_objset_type_t ost, objset_used_cb_t *cb) +{ + used_cbs[ost] = cb; +} + +boolean_t +dmu_objset_userused_enabled(objset_t *os) +{ + return (spa_version(os->os_spa) >= SPA_VERSION_USERSPACE && + used_cbs[os->os_phys->os_type] != NULL && + DMU_USERUSED_DNODE(os) != NULL); +} + +typedef struct userquota_node { + uint64_t uqn_id; + int64_t uqn_delta; + avl_node_t uqn_node; +} userquota_node_t; + +typedef struct userquota_cache { + avl_tree_t uqc_user_deltas; + avl_tree_t uqc_group_deltas; +} userquota_cache_t; + +static int +userquota_compare(const void *l, const void *r) +{ + const userquota_node_t *luqn = l; + const userquota_node_t *ruqn = r; + + if (luqn->uqn_id < ruqn->uqn_id) + return (-1); + if (luqn->uqn_id > ruqn->uqn_id) + return (1); + return (0); +} + +static void +do_userquota_cacheflush(objset_t *os, userquota_cache_t *cache, dmu_tx_t *tx) +{ + void *cookie; + userquota_node_t *uqn; + + ASSERT(dmu_tx_is_syncing(tx)); + + cookie = NULL; + while ((uqn = avl_destroy_nodes(&cache->uqc_user_deltas, + &cookie)) != NULL) { + /* + * os_userused_lock protects against concurrent calls to + * zap_increment_int(). It's needed because zap_increment_int() + * is not thread-safe (i.e. not atomic). 
+ */ + mutex_enter(&os->os_userused_lock); + VERIFY0(zap_increment_int(os, DMU_USERUSED_OBJECT, + uqn->uqn_id, uqn->uqn_delta, tx)); + mutex_exit(&os->os_userused_lock); + kmem_free(uqn, sizeof (*uqn)); + } + avl_destroy(&cache->uqc_user_deltas); + + cookie = NULL; + while ((uqn = avl_destroy_nodes(&cache->uqc_group_deltas, + &cookie)) != NULL) { + mutex_enter(&os->os_userused_lock); + VERIFY0(zap_increment_int(os, DMU_GROUPUSED_OBJECT, + uqn->uqn_id, uqn->uqn_delta, tx)); + mutex_exit(&os->os_userused_lock); + kmem_free(uqn, sizeof (*uqn)); + } + avl_destroy(&cache->uqc_group_deltas); +} + +static void +userquota_update_cache(avl_tree_t *avl, uint64_t id, int64_t delta) +{ + userquota_node_t search = { .uqn_id = id }; + avl_index_t idx; + + userquota_node_t *uqn = avl_find(avl, &search, &idx); + if (uqn == NULL) { + uqn = kmem_zalloc(sizeof (*uqn), KM_SLEEP); + uqn->uqn_id = id; + avl_insert(avl, uqn, idx); + } + uqn->uqn_delta += delta; +} + +static void +do_userquota_update(userquota_cache_t *cache, uint64_t used, uint64_t flags, + uint64_t user, uint64_t group, boolean_t subtract) +{ + if ((flags & DNODE_FLAG_USERUSED_ACCOUNTED)) { + int64_t delta = DNODE_MIN_SIZE + used; + if (subtract) + delta = -delta; + + userquota_update_cache(&cache->uqc_user_deltas, user, delta); + userquota_update_cache(&cache->uqc_group_deltas, group, delta); + } +} + +typedef struct userquota_updates_arg { + objset_t *uua_os; + int uua_sublist_idx; + dmu_tx_t *uua_tx; +} userquota_updates_arg_t; + +static void +userquota_updates_task(void *arg) +{ + userquota_updates_arg_t *uua = arg; + objset_t *os = uua->uua_os; + dmu_tx_t *tx = uua->uua_tx; + dnode_t *dn; + userquota_cache_t cache = { 0 }; + + multilist_sublist_t *list = + multilist_sublist_lock(os->os_synced_dnodes, uua->uua_sublist_idx); + + ASSERT(multilist_sublist_head(list) == NULL || + dmu_objset_userused_enabled(os)); + avl_create(&cache.uqc_user_deltas, userquota_compare, + sizeof (userquota_node_t), offsetof(userquota_node_t, uqn_node)); + avl_create(&cache.uqc_group_deltas, userquota_compare, + sizeof (userquota_node_t), offsetof(userquota_node_t, uqn_node)); + + while ((dn = multilist_sublist_head(list)) != NULL) { + int flags; + ASSERT(!DMU_OBJECT_IS_SPECIAL(dn->dn_object)); + ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE || + dn->dn_phys->dn_flags & + DNODE_FLAG_USERUSED_ACCOUNTED); + + flags = dn->dn_id_flags; + ASSERT(flags); + if (flags & DN_ID_OLD_EXIST) { + do_userquota_update(&cache, + dn->dn_oldused, dn->dn_oldflags, + dn->dn_olduid, dn->dn_oldgid, B_TRUE); + } + if (flags & DN_ID_NEW_EXIST) { + do_userquota_update(&cache, + DN_USED_BYTES(dn->dn_phys), + dn->dn_phys->dn_flags, dn->dn_newuid, + dn->dn_newgid, B_FALSE); + } + + mutex_enter(&dn->dn_mtx); + dn->dn_oldused = 0; + dn->dn_oldflags = 0; + if (dn->dn_id_flags & DN_ID_NEW_EXIST) { + dn->dn_olduid = dn->dn_newuid; + dn->dn_oldgid = dn->dn_newgid; + dn->dn_id_flags |= DN_ID_OLD_EXIST; + if (dn->dn_bonuslen == 0) + dn->dn_id_flags |= DN_ID_CHKED_SPILL; + else + dn->dn_id_flags |= DN_ID_CHKED_BONUS; + } + dn->dn_id_flags &= ~(DN_ID_NEW_EXIST); + mutex_exit(&dn->dn_mtx); + + multilist_sublist_remove(list, dn); + dnode_rele(dn, os->os_synced_dnodes); + } + do_userquota_cacheflush(os, &cache, tx); + multilist_sublist_unlock(list); + kmem_free(uua, sizeof (*uua)); +} + +void +dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx) +{ + if (!dmu_objset_userused_enabled(os)) + return; + + /* Allocate the user/groupused objects if necessary. 
*/ + if (DMU_USERUSED_DNODE(os)->dn_type == DMU_OT_NONE) { + VERIFY0(zap_create_claim(os, + DMU_USERUSED_OBJECT, + DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx)); + VERIFY0(zap_create_claim(os, + DMU_GROUPUSED_OBJECT, + DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx)); + } + + for (int i = 0; + i < multilist_get_num_sublists(os->os_synced_dnodes); i++) { + userquota_updates_arg_t *uua = + kmem_alloc(sizeof (*uua), KM_SLEEP); + uua->uua_os = os; + uua->uua_sublist_idx = i; + uua->uua_tx = tx; + /* note: caller does taskq_wait() */ + (void) taskq_dispatch(dmu_objset_pool(os)->dp_sync_taskq, + userquota_updates_task, uua, 0); + /* callback frees uua */ + } +} + +/* + * Returns a pointer to data to find uid/gid from + * + * If a dirty record for transaction group that is syncing can't + * be found then NULL is returned. In the NULL case it is assumed + * the uid/gid aren't changing. + */ +static void * +dmu_objset_userquota_find_data(dmu_buf_impl_t *db, dmu_tx_t *tx) +{ + dbuf_dirty_record_t *dr, **drp; + void *data; + + if (db->db_dirtycnt == 0) + return (db->db.db_data); /* Nothing is changing */ + + for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next) + if (dr->dr_txg == tx->tx_txg) + break; + + if (dr == NULL) { + data = NULL; + } else { + dnode_t *dn; + + DB_DNODE_ENTER(dr->dr_dbuf); + dn = DB_DNODE(dr->dr_dbuf); + + if (dn->dn_bonuslen == 0 && + dr->dr_dbuf->db_blkid == DMU_SPILL_BLKID) + data = dr->dt.dl.dr_data->b_data; + else + data = dr->dt.dl.dr_data; + + DB_DNODE_EXIT(dr->dr_dbuf); + } + + return (data); +} + +void +dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx) +{ + objset_t *os = dn->dn_objset; + void *data = NULL; + dmu_buf_impl_t *db = NULL; + uint64_t *user = NULL; + uint64_t *group = NULL; + int flags = dn->dn_id_flags; + int error; + boolean_t have_spill = B_FALSE; + + if (!dmu_objset_userused_enabled(dn->dn_objset)) + return; + + if (before && (flags & (DN_ID_CHKED_BONUS|DN_ID_OLD_EXIST| + DN_ID_CHKED_SPILL))) + return; + + if (before && dn->dn_bonuslen != 0) + data = DN_BONUS(dn->dn_phys); + else if (!before && dn->dn_bonuslen != 0) { + if (dn->dn_bonus) { + db = dn->dn_bonus; + mutex_enter(&db->db_mtx); + data = dmu_objset_userquota_find_data(db, tx); + } else { + data = DN_BONUS(dn->dn_phys); + } + } else if (dn->dn_bonuslen == 0 && dn->dn_bonustype == DMU_OT_SA) { + int rf = 0; + + if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) + rf |= DB_RF_HAVESTRUCT; + error = dmu_spill_hold_by_dnode(dn, + rf | DB_RF_MUST_SUCCEED, + FTAG, (dmu_buf_t **)&db); + ASSERT(error == 0); + mutex_enter(&db->db_mtx); + data = (before) ? db->db.db_data : + dmu_objset_userquota_find_data(db, tx); + have_spill = B_TRUE; + } else { + mutex_enter(&dn->dn_mtx); + dn->dn_id_flags |= DN_ID_CHKED_BONUS; + mutex_exit(&dn->dn_mtx); + return; + } + + if (before) { + ASSERT(data); + user = &dn->dn_olduid; + group = &dn->dn_oldgid; + } else if (data) { + user = &dn->dn_newuid; + group = &dn->dn_newgid; + } + + /* + * Must always call the callback in case the object + * type has changed and that type isn't an object type to track + */ + error = used_cbs[os->os_phys->os_type](dn->dn_bonustype, data, + user, group); + + /* + * Preserve existing uid/gid when the callback can't determine + * what the new uid/gid are and the callback returned EEXIST. + * The EEXIST error tells us to just use the existing uid/gid. + * If we don't know what the old values are then just assign + * them to 0, since that is a new file being created. 
+ */ + if (!before && data == NULL && error == EEXIST) { + if (flags & DN_ID_OLD_EXIST) { + dn->dn_newuid = dn->dn_olduid; + dn->dn_newgid = dn->dn_oldgid; + } else { + dn->dn_newuid = 0; + dn->dn_newgid = 0; + } + error = 0; + } + + if (db) + mutex_exit(&db->db_mtx); + + mutex_enter(&dn->dn_mtx); + if (error == 0 && before) + dn->dn_id_flags |= DN_ID_OLD_EXIST; + if (error == 0 && !before) + dn->dn_id_flags |= DN_ID_NEW_EXIST; + + if (have_spill) { + dn->dn_id_flags |= DN_ID_CHKED_SPILL; + } else { + dn->dn_id_flags |= DN_ID_CHKED_BONUS; + } + mutex_exit(&dn->dn_mtx); + if (have_spill) + dmu_buf_rele((dmu_buf_t *)db, FTAG); +} + +boolean_t +dmu_objset_userspace_present(objset_t *os) +{ + return (os->os_phys->os_flags & + OBJSET_FLAG_USERACCOUNTING_COMPLETE); +} + +int +dmu_objset_userspace_upgrade(objset_t *os) +{ + uint64_t obj; + int err = 0; + + if (dmu_objset_userspace_present(os)) + return (0); + if (!dmu_objset_userused_enabled(os)) + return (SET_ERROR(ENOTSUP)); + if (dmu_objset_is_snapshot(os)) + return (SET_ERROR(EINVAL)); + + /* + * We simply need to mark every object dirty, so that it will be + * synced out and now accounted. If this is called + * concurrently, or if we already did some work before crashing, + * that's fine, since we track each object's accounted state + * independently. + */ + + for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) { + dmu_tx_t *tx; + dmu_buf_t *db; + int objerr; + + if (issig(JUSTLOOKING) && issig(FORREAL)) + return (SET_ERROR(EINTR)); + + objerr = dmu_bonus_hold(os, obj, FTAG, &db); + if (objerr != 0) + continue; + tx = dmu_tx_create(os); + dmu_tx_hold_bonus(tx, obj); + objerr = dmu_tx_assign(tx, TXG_WAIT); + if (objerr != 0) { + dmu_tx_abort(tx); + continue; + } + dmu_buf_will_dirty(db, tx); + dmu_buf_rele(db, FTAG); + dmu_tx_commit(tx); + } + + os->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE; + txg_wait_synced(dmu_objset_pool(os), 0); + return (0); +} + +void +dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, + uint64_t *usedobjsp, uint64_t *availobjsp) +{ + dsl_dataset_space(os->os_dsl_dataset, refdbytesp, availbytesp, + usedobjsp, availobjsp); +} + +uint64_t +dmu_objset_fsid_guid(objset_t *os) +{ + return (dsl_dataset_fsid_guid(os->os_dsl_dataset)); +} + +void +dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat) +{ + stat->dds_type = os->os_phys->os_type; + if (os->os_dsl_dataset) + dsl_dataset_fast_stat(os->os_dsl_dataset, stat); +} + +void +dmu_objset_stats(objset_t *os, nvlist_t *nv) +{ + ASSERT(os->os_dsl_dataset || + os->os_phys->os_type == DMU_OST_META); + + if (os->os_dsl_dataset != NULL) + dsl_dataset_stats(os->os_dsl_dataset, nv); + + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_TYPE, + os->os_phys->os_type); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERACCOUNTING, + dmu_objset_userspace_present(os)); +} + +int +dmu_objset_is_snapshot(objset_t *os) +{ + if (os->os_dsl_dataset != NULL) + return (os->os_dsl_dataset->ds_is_snapshot); + else + return (B_FALSE); +} + +int +dmu_snapshot_realname(objset_t *os, char *name, char *real, int maxlen, + boolean_t *conflict) +{ + dsl_dataset_t *ds = os->os_dsl_dataset; + uint64_t ignored; + + if (dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0) + return (SET_ERROR(ENOENT)); + + return (zap_lookup_norm(ds->ds_dir->dd_pool->dp_meta_objset, + dsl_dataset_phys(ds)->ds_snapnames_zapobj, name, 8, 1, &ignored, + MT_NORMALIZE, real, maxlen, conflict)); +} + +int +dmu_snapshot_list_next(objset_t *os, int namelen, char *name, + uint64_t *idp, uint64_t 
*offp, boolean_t *case_conflict) +{ + dsl_dataset_t *ds = os->os_dsl_dataset; + zap_cursor_t cursor; + zap_attribute_t attr; + + ASSERT(dsl_pool_config_held(dmu_objset_pool(os))); + + if (dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0) + return (SET_ERROR(ENOENT)); + + zap_cursor_init_serialized(&cursor, + ds->ds_dir->dd_pool->dp_meta_objset, + dsl_dataset_phys(ds)->ds_snapnames_zapobj, *offp); + + if (zap_cursor_retrieve(&cursor, &attr) != 0) { + zap_cursor_fini(&cursor); + return (SET_ERROR(ENOENT)); + } + + if (strlen(attr.za_name) + 1 > namelen) { + zap_cursor_fini(&cursor); + return (SET_ERROR(ENAMETOOLONG)); + } + + (void) strcpy(name, attr.za_name); + if (idp) + *idp = attr.za_first_integer; + if (case_conflict) + *case_conflict = attr.za_normalization_conflict; + zap_cursor_advance(&cursor); + *offp = zap_cursor_serialize(&cursor); + zap_cursor_fini(&cursor); + + return (0); +} + +int +dmu_dir_list_next(objset_t *os, int namelen, char *name, + uint64_t *idp, uint64_t *offp) +{ + dsl_dir_t *dd = os->os_dsl_dataset->ds_dir; + zap_cursor_t cursor; + zap_attribute_t attr; + + /* there is no next dir on a snapshot! */ + if (os->os_dsl_dataset->ds_object != + dsl_dir_phys(dd)->dd_head_dataset_obj) + return (SET_ERROR(ENOENT)); + + zap_cursor_init_serialized(&cursor, + dd->dd_pool->dp_meta_objset, + dsl_dir_phys(dd)->dd_child_dir_zapobj, *offp); + + if (zap_cursor_retrieve(&cursor, &attr) != 0) { + zap_cursor_fini(&cursor); + return (SET_ERROR(ENOENT)); + } + + if (strlen(attr.za_name) + 1 > namelen) { + zap_cursor_fini(&cursor); + return (SET_ERROR(ENAMETOOLONG)); + } + + (void) strcpy(name, attr.za_name); + if (idp) + *idp = attr.za_first_integer; + zap_cursor_advance(&cursor); + *offp = zap_cursor_serialize(&cursor); + zap_cursor_fini(&cursor); + + return (0); +} + +typedef struct dmu_objset_find_ctx { + taskq_t *dc_tq; + dsl_pool_t *dc_dp; + uint64_t dc_ddobj; + char *dc_ddname; /* last component of ddobj's name */ + int (*dc_func)(dsl_pool_t *, dsl_dataset_t *, void *); + void *dc_arg; + int dc_flags; + kmutex_t *dc_error_lock; + int *dc_error; +} dmu_objset_find_ctx_t; + +static void +dmu_objset_find_dp_impl(dmu_objset_find_ctx_t *dcp) +{ + dsl_pool_t *dp = dcp->dc_dp; + dsl_dir_t *dd; + dsl_dataset_t *ds; + zap_cursor_t zc; + zap_attribute_t *attr; + uint64_t thisobj; + int err = 0; + + /* don't process if there already was an error */ + if (*dcp->dc_error != 0) + goto out; + + /* + * Note: passing the name (dc_ddname) here is optional, but it + * improves performance because we don't need to call + * zap_value_search() to determine the name. + */ + err = dsl_dir_hold_obj(dp, dcp->dc_ddobj, dcp->dc_ddname, FTAG, &dd); + if (err != 0) + goto out; + + /* Don't visit hidden ($MOS & $ORIGIN) objsets. */ + if (dd->dd_myname[0] == '$') { + dsl_dir_rele(dd, FTAG); + goto out; + } + + thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj; + attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); + + /* + * Iterate over all children. 
+ */
+ if (dcp->dc_flags & DS_FIND_CHILDREN) {
+ for (zap_cursor_init(&zc, dp->dp_meta_objset,
+ dsl_dir_phys(dd)->dd_child_dir_zapobj);
+ zap_cursor_retrieve(&zc, attr) == 0;
+ (void) zap_cursor_advance(&zc)) {
+ ASSERT3U(attr->za_integer_length, ==,
+ sizeof (uint64_t));
+ ASSERT3U(attr->za_num_integers, ==, 1);
+
+ dmu_objset_find_ctx_t *child_dcp =
+ kmem_alloc(sizeof (*child_dcp), KM_SLEEP);
+ *child_dcp = *dcp;
+ child_dcp->dc_ddobj = attr->za_first_integer;
+ child_dcp->dc_ddname = spa_strdup(attr->za_name);
+ if (dcp->dc_tq != NULL)
+ (void) taskq_dispatch(dcp->dc_tq,
+ dmu_objset_find_dp_cb, child_dcp, TQ_SLEEP);
+ else
+ dmu_objset_find_dp_impl(child_dcp);
+ }
+ zap_cursor_fini(&zc);
+ }
+
+ /*
+ * Iterate over all snapshots.
+ */
+ if (dcp->dc_flags & DS_FIND_SNAPSHOTS) {
+ dsl_dataset_t *ds;
+ err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
+
+ if (err == 0) {
+ uint64_t snapobj;
+
+ snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
+ dsl_dataset_rele(ds, FTAG);
+
+ for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj);
+ zap_cursor_retrieve(&zc, attr) == 0;
+ (void) zap_cursor_advance(&zc)) {
+ ASSERT3U(attr->za_integer_length, ==,
+ sizeof (uint64_t));
+ ASSERT3U(attr->za_num_integers, ==, 1);
+
+ err = dsl_dataset_hold_obj(dp,
+ attr->za_first_integer, FTAG, &ds);
+ if (err != 0)
+ break;
+ err = dcp->dc_func(dp, ds, dcp->dc_arg);
+ dsl_dataset_rele(ds, FTAG);
+ if (err != 0)
+ break;
+ }
+ zap_cursor_fini(&zc);
+ }
+ }
+
+ kmem_free(attr, sizeof (zap_attribute_t));
+
+ if (err != 0) {
+ dsl_dir_rele(dd, FTAG);
+ goto out;
+ }
+
+ /*
+ * Apply to self.
+ */
+ err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
+
+ /*
+ * Note: we hold the dir while calling dsl_dataset_hold_obj() so
+ * that the dir will remain cached, and we won't have to re-instantiate
+ * it (which could be expensive due to finding its name via
+ * zap_value_search()).
+ */
+ dsl_dir_rele(dd, FTAG);
+ if (err != 0)
+ goto out;
+ err = dcp->dc_func(dp, ds, dcp->dc_arg);
+ dsl_dataset_rele(ds, FTAG);
+
+out:
+ if (err != 0) {
+ mutex_enter(dcp->dc_error_lock);
+ /* only keep first error */
+ if (*dcp->dc_error == 0)
+ *dcp->dc_error = err;
+ mutex_exit(dcp->dc_error_lock);
+ }
+
+ if (dcp->dc_ddname != NULL)
+ spa_strfree(dcp->dc_ddname);
+ kmem_free(dcp, sizeof (*dcp));
+}
+
+static void
+dmu_objset_find_dp_cb(void *arg)
+{
+ dmu_objset_find_ctx_t *dcp = arg;
+ dsl_pool_t *dp = dcp->dc_dp;
+
+ /*
+ * We need to get a pool_config_lock here, as there are several
+ * assert(pool_config_held) down the stack. Getting a lock via
+ * dsl_pool_config_enter is risky, as it might be stalled by a
+ * pending writer. This would deadlock, as the write lock can
+ * only be granted when our parent thread gives up the lock.
+ * The _prio interface gives us priority over a pending writer.
+ */
+ dsl_pool_config_enter_prio(dp, FTAG);
+
+ dmu_objset_find_dp_impl(dcp);
+
+ dsl_pool_config_exit(dp, FTAG);
+}
+
+/*
+ * Find objsets under and including ddobj, call func(ds) on each.
+ * The order for the enumeration is completely undefined.
+ * func is called with dsl_pool_config held.
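+ *
+ * Usage sketch (hypothetical callback; error handling elided):
+ *
+ *     static int
+ *     count_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
+ *     {
+ *         (*(uint64_t *)arg)++;
+ *         return (0);
+ *     }
+ *
+ *     uint64_t count = 0;
+ *     error = dmu_objset_find_dp(dp, ddobj, count_cb, &count,
+ *         DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
+ *
+ * A nonzero return from func stops the traversal of that branch, and
+ * the first such error is what dmu_objset_find_dp() returns.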
+ */
+int
+dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj,
+ int func(dsl_pool_t *, dsl_dataset_t *, void *), void *arg, int flags)
+{
+ int error = 0;
+ taskq_t *tq = NULL;
+ int ntasks;
+ dmu_objset_find_ctx_t *dcp;
+ kmutex_t err_lock;
+
+ mutex_init(&err_lock, NULL, MUTEX_DEFAULT, NULL);
+ dcp = kmem_alloc(sizeof (*dcp), KM_SLEEP);
+ dcp->dc_tq = NULL;
+ dcp->dc_dp = dp;
+ dcp->dc_ddobj = ddobj;
+ dcp->dc_ddname = NULL;
+ dcp->dc_func = func;
+ dcp->dc_arg = arg;
+ dcp->dc_flags = flags;
+ dcp->dc_error_lock = &err_lock;
+ dcp->dc_error = &error;
+
+ if ((flags & DS_FIND_SERIALIZE) || dsl_pool_config_held_writer(dp)) {
+ /*
+ * In case a write lock is held we can't make use of
+ * parallelism, as down the stack of the worker threads
+ * the lock is asserted via dsl_pool_config_held.
+ * In case of a read lock this is solved by getting a read
+ * lock in each worker thread, which isn't possible in case
+ * of a writer lock. So we fall back to the synchronous path
+ * here.
+ * In the future it might be possible to get some magic into
+ * dsl_pool_config_held in a way that it returns true for
+ * the worker threads so that a single lock held from this
+ * thread suffices. For now, stay single threaded.
+ */
+ dmu_objset_find_dp_impl(dcp);
+ mutex_destroy(&err_lock);
+
+ return (error);
+ }
+
+ ntasks = dmu_find_threads;
+ if (ntasks == 0)
+ ntasks = vdev_count_leaves(dp->dp_spa) * 4;
+ tq = taskq_create("dmu_objset_find", ntasks, minclsyspri, ntasks,
+ INT_MAX, 0);
+ if (tq == NULL) {
+ kmem_free(dcp, sizeof (*dcp));
+ mutex_destroy(&err_lock);
+
+ return (SET_ERROR(ENOMEM));
+ }
+ dcp->dc_tq = tq;
+
+ /* dcp will be freed by task */
+ (void) taskq_dispatch(tq, dmu_objset_find_dp_cb, dcp, TQ_SLEEP);
+
+ /*
+ * PORTING: this code relies on the property of taskq_wait to wait
+ * until no more tasks are queued and no more tasks are active. As
+ * we always queue new tasks from within other tasks, taskq_wait
+ * reliably waits for the full recursion to finish, even though we
+ * enqueue new tasks after taskq_wait has been called.
+ * On platforms other than illumos, taskq_wait may not have this
+ * property.
+ */
+ taskq_wait(tq);
+ taskq_destroy(tq);
+ mutex_destroy(&err_lock);
+
+ return (error);
+}
+
+/*
+ * Find all objsets under name, and for each, call 'func(child_name, arg)'.
+ * The dp_config_rwlock must not be held when this is called, and it
+ * will not be held when the callback is called.
+ * Therefore this function should only be used when the pool is not changing
+ * (e.g. in syncing context), or the callback can deal with the possible races.
+ */
+static int
+dmu_objset_find_impl(spa_t *spa, const char *name,
+ int func(const char *, void *), void *arg, int flags)
+{
+ dsl_dir_t *dd;
+ dsl_pool_t *dp = spa_get_dsl(spa);
+ dsl_dataset_t *ds;
+ zap_cursor_t zc;
+ zap_attribute_t *attr;
+ char *child;
+ uint64_t thisobj;
+ int err;
+
+ dsl_pool_config_enter(dp, FTAG);
+
+ err = dsl_dir_hold(dp, name, FTAG, &dd, NULL);
+ if (err != 0) {
+ dsl_pool_config_exit(dp, FTAG);
+ return (err);
+ }
+
+ /* Don't visit hidden ($MOS & $ORIGIN) objsets. */
+ if (dd->dd_myname[0] == '$') {
+ dsl_dir_rele(dd, FTAG);
+ dsl_pool_config_exit(dp, FTAG);
+ return (0);
+ }
+
+ thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj;
+ attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
+
+ /*
+ * Iterate over all children.
+ */ + if (flags & DS_FIND_CHILDREN) { + for (zap_cursor_init(&zc, dp->dp_meta_objset, + dsl_dir_phys(dd)->dd_child_dir_zapobj); + zap_cursor_retrieve(&zc, attr) == 0; + (void) zap_cursor_advance(&zc)) { + ASSERT3U(attr->za_integer_length, ==, + sizeof (uint64_t)); + ASSERT3U(attr->za_num_integers, ==, 1); + + child = kmem_asprintf("%s/%s", name, attr->za_name); + dsl_pool_config_exit(dp, FTAG); + err = dmu_objset_find_impl(spa, child, + func, arg, flags); + dsl_pool_config_enter(dp, FTAG); + strfree(child); + if (err != 0) + break; + } + zap_cursor_fini(&zc); + + if (err != 0) { + dsl_dir_rele(dd, FTAG); + dsl_pool_config_exit(dp, FTAG); + kmem_free(attr, sizeof (zap_attribute_t)); + return (err); + } + } + + /* + * Iterate over all snapshots. + */ + if (flags & DS_FIND_SNAPSHOTS) { + err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds); + + if (err == 0) { + uint64_t snapobj; + + snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj; + dsl_dataset_rele(ds, FTAG); + + for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj); + zap_cursor_retrieve(&zc, attr) == 0; + (void) zap_cursor_advance(&zc)) { + ASSERT3U(attr->za_integer_length, ==, + sizeof (uint64_t)); + ASSERT3U(attr->za_num_integers, ==, 1); + + child = kmem_asprintf("%s@%s", + name, attr->za_name); + dsl_pool_config_exit(dp, FTAG); + err = func(child, arg); + dsl_pool_config_enter(dp, FTAG); + strfree(child); + if (err != 0) + break; + } + zap_cursor_fini(&zc); + } + } + + dsl_dir_rele(dd, FTAG); + kmem_free(attr, sizeof (zap_attribute_t)); + dsl_pool_config_exit(dp, FTAG); + + if (err != 0) + return (err); + + /* Apply to self. */ + return (func(name, arg)); +} + +/* + * See comment above dmu_objset_find_impl(). + */ +int +dmu_objset_find(char *name, int func(const char *, void *), void *arg, + int flags) +{ + spa_t *spa; + int error; + + error = spa_open(name, &spa, FTAG); + if (error != 0) + return (error); + error = dmu_objset_find_impl(spa, name, func, arg, flags); + spa_close(spa, FTAG); + return (error); +} + +void +dmu_objset_set_user(objset_t *os, void *user_ptr) +{ + ASSERT(MUTEX_HELD(&os->os_user_ptr_lock)); + os->os_user_ptr = user_ptr; +} + +void * +dmu_objset_get_user(objset_t *os) +{ + ASSERT(MUTEX_HELD(&os->os_user_ptr_lock)); + return (os->os_user_ptr); +} + +/* + * Determine name of filesystem, given name of snapshot. + * buf must be at least ZFS_MAX_DATASET_NAME_LEN bytes + */ +int +dmu_fsname(const char *snapname, char *buf) +{ + char *atp = strchr(snapname, '@'); + if (atp == NULL) + return (SET_ERROR(EINVAL)); + if (atp - snapname >= ZFS_MAX_DATASET_NAME_LEN) + return (SET_ERROR(ENAMETOOLONG)); + (void) strlcpy(buf, snapname, atp - snapname + 1); + return (0); +} + +/* + * Call when we think we're going to write/free space in open context to track + * the amount of dirty data in the open txg, which is also the amount + * of memory that can not be evicted until this txg syncs. 
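+ *
+ * Sketch of a typical charge (illustrative): a caller about to dirty
+ * one 128K block in open context would call
+ *
+ *     dmu_objset_willuse_space(os, SPA_OLD_MAXBLOCKSIZE, tx);
+ *
+ * and the accounting below inflates that by spa_get_worst_case_asize()
+ * to bound copies, raidz parity and gang-block overhead.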
+ */ +void +dmu_objset_willuse_space(objset_t *os, int64_t space, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = os->os_dsl_dataset; + int64_t aspace = spa_get_worst_case_asize(os->os_spa, space); + + if (ds != NULL) { + dsl_dir_willuse_space(ds->ds_dir, aspace, tx); + dsl_pool_dirty_space(dmu_tx_pool(tx), space, tx); + } +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c new file mode 100644 index 000000000000..8d78d1d5ec18 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c @@ -0,0 +1,3455 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2011, 2015 by Delphix. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright (c) 2012, Martin Matuska <mm@FreeBSD.org>. All rights reserved. + * Copyright 2014 HybridCluster. All rights reserved. + * Copyright 2016 RackTop Systems. + * Copyright (c) 2014 Integros [integros.com] + */ + +#include <sys/dmu.h> +#include <sys/dmu_impl.h> +#include <sys/dmu_tx.h> +#include <sys/dbuf.h> +#include <sys/dnode.h> +#include <sys/zfs_context.h> +#include <sys/dmu_objset.h> +#include <sys/dmu_traverse.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_prop.h> +#include <sys/dsl_pool.h> +#include <sys/dsl_synctask.h> +#include <sys/zfs_ioctl.h> +#include <sys/zap.h> +#include <sys/zio_checksum.h> +#include <sys/zfs_znode.h> +#include <zfs_fletcher.h> +#include <sys/avl.h> +#include <sys/ddt.h> +#include <sys/zfs_onexit.h> +#include <sys/dmu_send.h> +#include <sys/dsl_destroy.h> +#include <sys/blkptr.h> +#include <sys/dsl_bookmark.h> +#include <sys/zfeature.h> +#include <sys/bqueue.h> + +#ifdef __FreeBSD__ +#undef dump_write +#define dump_write dmu_dump_write +#endif + +/* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */ +int zfs_send_corrupt_data = B_FALSE; +int zfs_send_queue_length = 16 * 1024 * 1024; +int zfs_recv_queue_length = 16 * 1024 * 1024; +/* Set this tunable to FALSE to disable setting of DRR_FLAG_FREERECORDS */ +int zfs_send_set_freerecords_bit = B_TRUE; + +#ifdef _KERNEL +TUNABLE_INT("vfs.zfs.send_set_freerecords_bit", &zfs_send_set_freerecords_bit); +#endif + +static char *dmu_recv_tag = "dmu_recv_tag"; +const char *recv_clone_name = "%recv"; + +/* + * Use this to override the recordsize calculation for fast zfs send estimates. 
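+ * Leave this at 0 to derive the record size from the dataset itself
+ * (volblocksize for zvols, recordsize otherwise); set it to a block
+ * size in bytes to skip that per-dataset property lookup.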
+ */ +uint64_t zfs_override_estimate_recordsize = 0; + +#define BP_SPAN(datablkszsec, indblkshift, level) \ + (((uint64_t)datablkszsec) << (SPA_MINBLOCKSHIFT + \ + (level) * (indblkshift - SPA_BLKPTRSHIFT))) + +static void byteswap_record(dmu_replay_record_t *drr); + +struct send_thread_arg { + bqueue_t q; + dsl_dataset_t *ds; /* Dataset to traverse */ + uint64_t fromtxg; /* Traverse from this txg */ + int flags; /* flags to pass to traverse_dataset */ + int error_code; + boolean_t cancel; + zbookmark_phys_t resume; +}; + +struct send_block_record { + boolean_t eos_marker; /* Marks the end of the stream */ + blkptr_t bp; + zbookmark_phys_t zb; + uint8_t indblkshift; + uint16_t datablkszsec; + bqueue_node_t ln; +}; + +static int +dump_bytes(dmu_sendarg_t *dsp, void *buf, int len) +{ + dsl_dataset_t *ds = dmu_objset_ds(dsp->dsa_os); + struct uio auio; + struct iovec aiov; + + /* + * The code does not rely on this (len being a multiple of 8). We keep + * this assertion because of the corresponding assertion in + * receive_read(). Keeping this assertion ensures that we do not + * inadvertently break backwards compatibility (causing the assertion + * in receive_read() to trigger on old software). + * + * Removing the assertions could be rolled into a new feature that uses + * data that isn't 8-byte aligned; if the assertions were removed, a + * feature flag would have to be added. + */ + + ASSERT0(len % 8); + + aiov.iov_base = buf; + aiov.iov_len = len; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_resid = len; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_rw = UIO_WRITE; + auio.uio_offset = (off_t)-1; + auio.uio_td = dsp->dsa_td; +#ifdef _KERNEL + if (dsp->dsa_fp->f_type == DTYPE_VNODE) + bwillwrite(); + dsp->dsa_err = fo_write(dsp->dsa_fp, &auio, dsp->dsa_td->td_ucred, 0, + dsp->dsa_td); +#else + fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__); + dsp->dsa_err = EOPNOTSUPP; +#endif + mutex_enter(&ds->ds_sendstream_lock); + *dsp->dsa_off += len; + mutex_exit(&ds->ds_sendstream_lock); + + return (dsp->dsa_err); +} + +/* + * For all record types except BEGIN, fill in the checksum (overlaid in + * drr_u.drr_checksum.drr_checksum). The checksum verifies everything + * up to the start of the checksum itself. + */ +static int +dump_record(dmu_sendarg_t *dsp, void *payload, int payload_len) +{ + ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), + ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t)); + (void) fletcher_4_incremental_native(dsp->dsa_drr, + offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), + &dsp->dsa_zc); + if (dsp->dsa_drr->drr_type == DRR_BEGIN) { + dsp->dsa_sent_begin = B_TRUE; + } else { + ASSERT(ZIO_CHECKSUM_IS_ZERO(&dsp->dsa_drr->drr_u. + drr_checksum.drr_checksum)); + dsp->dsa_drr->drr_u.drr_checksum.drr_checksum = dsp->dsa_zc; + } + if (dsp->dsa_drr->drr_type == DRR_END) { + dsp->dsa_sent_end = B_TRUE; + } + (void) fletcher_4_incremental_native(&dsp->dsa_drr-> + drr_u.drr_checksum.drr_checksum, + sizeof (zio_cksum_t), &dsp->dsa_zc); + if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) + return (SET_ERROR(EINTR)); + if (payload_len != 0) { + (void) fletcher_4_incremental_native(payload, payload_len, + &dsp->dsa_zc); + if (dump_bytes(dsp, payload, payload_len) != 0) + return (SET_ERROR(EINTR)); + } + return (0); +} + +/* + * Fill in the drr_free struct, or perform aggregation if the previous record is + * also a free record, and the two are adjacent. 
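+ *
+ * For example, a free of (object 5, offset 0, length 128K) followed
+ * immediately by one at (object 5, offset 128K, length 128K) is folded
+ * into a single pending DRR_FREE record covering 256K.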
+ * + * Note that we send free records even for a full send, because we want to be + * able to receive a full send as a clone, which requires a list of all the free + * and freeobject records that were generated on the source. + */ +static int +dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, + uint64_t length) +{ + struct drr_free *drrf = &(dsp->dsa_drr->drr_u.drr_free); + + /* + * When we receive a free record, dbuf_free_range() assumes + * that the receiving system doesn't have any dbufs in the range + * being freed. This is always true because there is a one-record + * constraint: we only send one WRITE record for any given + * object,offset. We know that the one-record constraint is + * true because we always send data in increasing order by + * object,offset. + * + * If the increasing-order constraint ever changes, we should find + * another way to assert that the one-record constraint is still + * satisfied. + */ + ASSERT(object > dsp->dsa_last_data_object || + (object == dsp->dsa_last_data_object && + offset > dsp->dsa_last_data_offset)); + + if (length != -1ULL && offset + length < offset) + length = -1ULL; + + /* + * If there is a pending op, but it's not PENDING_FREE, push it out, + * since free block aggregation can only be done for blocks of the + * same type (i.e., DRR_FREE records can only be aggregated with + * other DRR_FREE records. DRR_FREEOBJECTS records can only be + * aggregated with other DRR_FREEOBJECTS records. + */ + if (dsp->dsa_pending_op != PENDING_NONE && + dsp->dsa_pending_op != PENDING_FREE) { + if (dump_record(dsp, NULL, 0) != 0) + return (SET_ERROR(EINTR)); + dsp->dsa_pending_op = PENDING_NONE; + } + + if (dsp->dsa_pending_op == PENDING_FREE) { + /* + * There should never be a PENDING_FREE if length is -1 + * (because dump_dnode is the only place where this + * function is called with a -1, and only after flushing + * any pending record). + */ + ASSERT(length != -1ULL); + /* + * Check to see whether this free block can be aggregated + * with pending one. + */ + if (drrf->drr_object == object && drrf->drr_offset + + drrf->drr_length == offset) { + drrf->drr_length += length; + return (0); + } else { + /* not a continuation. Push out pending record */ + if (dump_record(dsp, NULL, 0) != 0) + return (SET_ERROR(EINTR)); + dsp->dsa_pending_op = PENDING_NONE; + } + } + /* create a FREE record and make it pending */ + bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); + dsp->dsa_drr->drr_type = DRR_FREE; + drrf->drr_object = object; + drrf->drr_offset = offset; + drrf->drr_length = length; + drrf->drr_toguid = dsp->dsa_toguid; + if (length == -1ULL) { + if (dump_record(dsp, NULL, 0) != 0) + return (SET_ERROR(EINTR)); + } else { + dsp->dsa_pending_op = PENDING_FREE; + } + + return (0); +} + +static int +dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type, + uint64_t object, uint64_t offset, int lsize, int psize, const blkptr_t *bp, + void *data) +{ + uint64_t payload_size; + struct drr_write *drrw = &(dsp->dsa_drr->drr_u.drr_write); + + /* + * We send data in increasing object, offset order. + * See comment in dump_free() for details. 
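+ * dsa_last_data_object and dsa_last_data_offset track the high-water
+ * mark so that the assertion below can verify the ordering.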
+ */ + ASSERT(object > dsp->dsa_last_data_object || + (object == dsp->dsa_last_data_object && + offset > dsp->dsa_last_data_offset)); + dsp->dsa_last_data_object = object; + dsp->dsa_last_data_offset = offset + lsize - 1; + + /* + * If there is any kind of pending aggregation (currently either + * a grouping of free objects or free blocks), push it out to + * the stream, since aggregation can't be done across operations + * of different types. + */ + if (dsp->dsa_pending_op != PENDING_NONE) { + if (dump_record(dsp, NULL, 0) != 0) + return (SET_ERROR(EINTR)); + dsp->dsa_pending_op = PENDING_NONE; + } + /* write a WRITE record */ + bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); + dsp->dsa_drr->drr_type = DRR_WRITE; + drrw->drr_object = object; + drrw->drr_type = type; + drrw->drr_offset = offset; + drrw->drr_toguid = dsp->dsa_toguid; + drrw->drr_logical_size = lsize; + + /* only set the compression fields if the buf is compressed */ + if (lsize != psize) { + ASSERT(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_COMPRESSED); + ASSERT(!BP_IS_EMBEDDED(bp)); + ASSERT(!BP_SHOULD_BYTESWAP(bp)); + ASSERT(!DMU_OT_IS_METADATA(BP_GET_TYPE(bp))); + ASSERT3U(BP_GET_COMPRESS(bp), !=, ZIO_COMPRESS_OFF); + ASSERT3S(psize, >, 0); + ASSERT3S(lsize, >=, psize); + + drrw->drr_compressiontype = BP_GET_COMPRESS(bp); + drrw->drr_compressed_size = psize; + payload_size = drrw->drr_compressed_size; + } else { + payload_size = drrw->drr_logical_size; + } + + if (bp == NULL || BP_IS_EMBEDDED(bp)) { + /* + * There's no pre-computed checksum for partial-block + * writes or embedded BP's, so (like + * fletcher4-checkummed blocks) userland will have to + * compute a dedup-capable checksum itself. + */ + drrw->drr_checksumtype = ZIO_CHECKSUM_OFF; + } else { + drrw->drr_checksumtype = BP_GET_CHECKSUM(bp); + if (zio_checksum_table[drrw->drr_checksumtype].ci_flags & + ZCHECKSUM_FLAG_DEDUP) + drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP; + DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp)); + DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp)); + DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp)); + drrw->drr_key.ddk_cksum = bp->blk_cksum; + } + + if (dump_record(dsp, data, payload_size) != 0) + return (SET_ERROR(EINTR)); + return (0); +} + +static int +dump_write_embedded(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, + int blksz, const blkptr_t *bp) +{ + char buf[BPE_PAYLOAD_SIZE]; + struct drr_write_embedded *drrw = + &(dsp->dsa_drr->drr_u.drr_write_embedded); + + if (dsp->dsa_pending_op != PENDING_NONE) { + if (dump_record(dsp, NULL, 0) != 0) + return (EINTR); + dsp->dsa_pending_op = PENDING_NONE; + } + + ASSERT(BP_IS_EMBEDDED(bp)); + + bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); + dsp->dsa_drr->drr_type = DRR_WRITE_EMBEDDED; + drrw->drr_object = object; + drrw->drr_offset = offset; + drrw->drr_length = blksz; + drrw->drr_toguid = dsp->dsa_toguid; + drrw->drr_compression = BP_GET_COMPRESS(bp); + drrw->drr_etype = BPE_GET_ETYPE(bp); + drrw->drr_lsize = BPE_GET_LSIZE(bp); + drrw->drr_psize = BPE_GET_PSIZE(bp); + + decode_embedded_bp_compressed(bp, buf); + + if (dump_record(dsp, buf, P2ROUNDUP(drrw->drr_psize, 8)) != 0) + return (EINTR); + return (0); +} + +static int +dump_spill(dmu_sendarg_t *dsp, uint64_t object, int blksz, void *data) +{ + struct drr_spill *drrs = &(dsp->dsa_drr->drr_u.drr_spill); + + if (dsp->dsa_pending_op != PENDING_NONE) { + if (dump_record(dsp, NULL, 0) != 0) + return (SET_ERROR(EINTR)); + dsp->dsa_pending_op = PENDING_NONE; + } + + /* write a SPILL record */ + bzero(dsp->dsa_drr, sizeof 
(dmu_replay_record_t)); + dsp->dsa_drr->drr_type = DRR_SPILL; + drrs->drr_object = object; + drrs->drr_length = blksz; + drrs->drr_toguid = dsp->dsa_toguid; + + if (dump_record(dsp, data, blksz) != 0) + return (SET_ERROR(EINTR)); + return (0); +} + +static int +dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs) +{ + struct drr_freeobjects *drrfo = &(dsp->dsa_drr->drr_u.drr_freeobjects); + + /* + * If there is a pending op, but it's not PENDING_FREEOBJECTS, + * push it out, since free block aggregation can only be done for + * blocks of the same type (i.e., DRR_FREE records can only be + * aggregated with other DRR_FREE records. DRR_FREEOBJECTS records + * can only be aggregated with other DRR_FREEOBJECTS records. + */ + if (dsp->dsa_pending_op != PENDING_NONE && + dsp->dsa_pending_op != PENDING_FREEOBJECTS) { + if (dump_record(dsp, NULL, 0) != 0) + return (SET_ERROR(EINTR)); + dsp->dsa_pending_op = PENDING_NONE; + } + if (dsp->dsa_pending_op == PENDING_FREEOBJECTS) { + /* + * See whether this free object array can be aggregated + * with pending one + */ + if (drrfo->drr_firstobj + drrfo->drr_numobjs == firstobj) { + drrfo->drr_numobjs += numobjs; + return (0); + } else { + /* can't be aggregated. Push out pending record */ + if (dump_record(dsp, NULL, 0) != 0) + return (SET_ERROR(EINTR)); + dsp->dsa_pending_op = PENDING_NONE; + } + } + + /* write a FREEOBJECTS record */ + bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); + dsp->dsa_drr->drr_type = DRR_FREEOBJECTS; + drrfo->drr_firstobj = firstobj; + drrfo->drr_numobjs = numobjs; + drrfo->drr_toguid = dsp->dsa_toguid; + + dsp->dsa_pending_op = PENDING_FREEOBJECTS; + + return (0); +} + +static int +dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp) +{ + struct drr_object *drro = &(dsp->dsa_drr->drr_u.drr_object); + + if (object < dsp->dsa_resume_object) { + /* + * Note: when resuming, we will visit all the dnodes in + * the block of dnodes that we are resuming from. In + * this case it's unnecessary to send the dnodes prior to + * the one we are resuming from. We should be at most one + * block's worth of dnodes behind the resume point. + */ + ASSERT3U(dsp->dsa_resume_object - object, <, + 1 << (DNODE_BLOCK_SHIFT - DNODE_SHIFT)); + return (0); + } + + if (dnp == NULL || dnp->dn_type == DMU_OT_NONE) + return (dump_freeobjects(dsp, object, 1)); + + if (dsp->dsa_pending_op != PENDING_NONE) { + if (dump_record(dsp, NULL, 0) != 0) + return (SET_ERROR(EINTR)); + dsp->dsa_pending_op = PENDING_NONE; + } + + /* write an OBJECT record */ + bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); + dsp->dsa_drr->drr_type = DRR_OBJECT; + drro->drr_object = object; + drro->drr_type = dnp->dn_type; + drro->drr_bonustype = dnp->dn_bonustype; + drro->drr_blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT; + drro->drr_bonuslen = dnp->dn_bonuslen; + drro->drr_dn_slots = dnp->dn_extra_slots + 1; + drro->drr_checksumtype = dnp->dn_checksum; + drro->drr_compress = dnp->dn_compress; + drro->drr_toguid = dsp->dsa_toguid; + + if (!(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) && + drro->drr_blksz > SPA_OLD_MAXBLOCKSIZE) + drro->drr_blksz = SPA_OLD_MAXBLOCKSIZE; + + if (dump_record(dsp, DN_BONUS(dnp), + P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0) { + return (SET_ERROR(EINTR)); + } + + /* Free anything past the end of the file. 
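+ * A length of -1ULL means "to the end of the object"; dump_free()
+ * flushes such a record immediately instead of leaving it pending.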
*/ + if (dump_free(dsp, object, (dnp->dn_maxblkid + 1) * + (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL) != 0) + return (SET_ERROR(EINTR)); + if (dsp->dsa_err != 0) + return (SET_ERROR(EINTR)); + return (0); +} + +static boolean_t +backup_do_embed(dmu_sendarg_t *dsp, const blkptr_t *bp) +{ + if (!BP_IS_EMBEDDED(bp)) + return (B_FALSE); + + /* + * Compression function must be legacy, or explicitly enabled. + */ + if ((BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_LEGACY_FUNCTIONS && + !(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_LZ4))) + return (B_FALSE); + + /* + * Embed type must be explicitly enabled. + */ + switch (BPE_GET_ETYPE(bp)) { + case BP_EMBEDDED_TYPE_DATA: + if (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) + return (B_TRUE); + break; + default: + return (B_FALSE); + } + return (B_FALSE); +} + +/* + * This is the callback function to traverse_dataset that acts as the worker + * thread for dmu_send_impl. + */ +/*ARGSUSED*/ +static int +send_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, + const zbookmark_phys_t *zb, const struct dnode_phys *dnp, void *arg) +{ + struct send_thread_arg *sta = arg; + struct send_block_record *record; + uint64_t record_size; + int err = 0; + + ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT || + zb->zb_object >= sta->resume.zb_object); + + if (sta->cancel) + return (SET_ERROR(EINTR)); + + if (bp == NULL) { + ASSERT3U(zb->zb_level, ==, ZB_DNODE_LEVEL); + return (0); + } else if (zb->zb_level < 0) { + return (0); + } + + record = kmem_zalloc(sizeof (struct send_block_record), KM_SLEEP); + record->eos_marker = B_FALSE; + record->bp = *bp; + record->zb = *zb; + record->indblkshift = dnp->dn_indblkshift; + record->datablkszsec = dnp->dn_datablkszsec; + record_size = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT; + bqueue_enqueue(&sta->q, record, record_size); + + return (err); +} + +/* + * This function kicks off the traverse_dataset. It also handles setting the + * error code of the thread in case something goes wrong, and pushes the End of + * Stream record when the traverse_dataset call has finished. If there is no + * dataset to traverse, the thread immediately pushes End of Stream marker. + */ +static void +send_traverse_thread(void *arg) +{ + struct send_thread_arg *st_arg = arg; + int err; + struct send_block_record *data; + + if (st_arg->ds != NULL) { + err = traverse_dataset_resume(st_arg->ds, + st_arg->fromtxg, &st_arg->resume, + st_arg->flags, send_cb, st_arg); + + if (err != EINTR) + st_arg->error_code = err; + } + data = kmem_zalloc(sizeof (*data), KM_SLEEP); + data->eos_marker = B_TRUE; + bqueue_enqueue(&st_arg->q, data, 1); + thread_exit(); +} + +/* + * This function actually handles figuring out what kind of record needs to be + * dumped, reading the data (which has hopefully been prefetched), and calling + * the appropriate helper function. + */ +static int +do_dump(dmu_sendarg_t *dsa, struct send_block_record *data) +{ + dsl_dataset_t *ds = dmu_objset_ds(dsa->dsa_os); + const blkptr_t *bp = &data->bp; + const zbookmark_phys_t *zb = &data->zb; + uint8_t indblkshift = data->indblkshift; + uint16_t dblkszsec = data->datablkszsec; + spa_t *spa = ds->ds_dir->dd_pool->dp_spa; + dmu_object_type_t type = bp ? 
BP_GET_TYPE(bp) : DMU_OT_NONE; + int err = 0; + + ASSERT3U(zb->zb_level, >=, 0); + + ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT || + zb->zb_object >= dsa->dsa_resume_object); + + if (zb->zb_object != DMU_META_DNODE_OBJECT && + DMU_OBJECT_IS_SPECIAL(zb->zb_object)) { + return (0); + } else if (BP_IS_HOLE(bp) && + zb->zb_object == DMU_META_DNODE_OBJECT) { + uint64_t span = BP_SPAN(dblkszsec, indblkshift, zb->zb_level); + uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT; + err = dump_freeobjects(dsa, dnobj, span >> DNODE_SHIFT); + } else if (BP_IS_HOLE(bp)) { + uint64_t span = BP_SPAN(dblkszsec, indblkshift, zb->zb_level); + uint64_t offset = zb->zb_blkid * span; + err = dump_free(dsa, zb->zb_object, offset, span); + } else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) { + return (0); + } else if (type == DMU_OT_DNODE) { + int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; + arc_flags_t aflags = ARC_FLAG_WAIT; + arc_buf_t *abuf; + + ASSERT0(zb->zb_level); + + if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, + &aflags, zb) != 0) + return (SET_ERROR(EIO)); + + dnode_phys_t *blk = abuf->b_data; + uint64_t dnobj = zb->zb_blkid * epb; + for (int i = 0; i < epb; i += blk[i].dn_extra_slots + 1) { + err = dump_dnode(dsa, dnobj + i, blk + i); + if (err != 0) + break; + } + arc_buf_destroy(abuf, &abuf); + } else if (type == DMU_OT_SA) { + arc_flags_t aflags = ARC_FLAG_WAIT; + arc_buf_t *abuf; + int blksz = BP_GET_LSIZE(bp); + + if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, + &aflags, zb) != 0) + return (SET_ERROR(EIO)); + + err = dump_spill(dsa, zb->zb_object, blksz, abuf->b_data); + arc_buf_destroy(abuf, &abuf); + } else if (backup_do_embed(dsa, bp)) { + /* it's an embedded level-0 block of a regular object */ + int blksz = dblkszsec << SPA_MINBLOCKSHIFT; + ASSERT0(zb->zb_level); + err = dump_write_embedded(dsa, zb->zb_object, + zb->zb_blkid * blksz, blksz, bp); + } else { + /* it's a level-0 block of a regular object */ + arc_flags_t aflags = ARC_FLAG_WAIT; + arc_buf_t *abuf; + int blksz = dblkszsec << SPA_MINBLOCKSHIFT; + uint64_t offset; + + /* + * If we have large blocks stored on disk but the send flags + * don't allow us to send large blocks, we split the data from + * the arc buf into chunks. 
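+ * Each chunk is emitted as its own DRR_WRITE record of at most
+ * SPA_OLD_MAXBLOCKSIZE bytes (see the split_large_blocks loop below).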
+ */
+ boolean_t split_large_blocks = blksz > SPA_OLD_MAXBLOCKSIZE &&
+ !(dsa->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS);
+ /*
+ * We should only request compressed data from the ARC if all
+ * the following are true:
+ * - stream compression was requested
+ * - we aren't splitting large blocks into smaller chunks
+ * - the data won't need to be byteswapped before sending
+ * - this isn't an embedded block
+ * - this isn't metadata (if receiving on a different endian
+ * system it can be byteswapped more easily)
+ */
+ boolean_t request_compressed =
+ (dsa->dsa_featureflags & DMU_BACKUP_FEATURE_COMPRESSED) &&
+ !split_large_blocks && !BP_SHOULD_BYTESWAP(bp) &&
+ !BP_IS_EMBEDDED(bp) && !DMU_OT_IS_METADATA(BP_GET_TYPE(bp));
+
+ ASSERT0(zb->zb_level);
+ ASSERT(zb->zb_object > dsa->dsa_resume_object ||
+ (zb->zb_object == dsa->dsa_resume_object &&
+ zb->zb_blkid * blksz >= dsa->dsa_resume_offset));
+
+ ASSERT3U(blksz, ==, BP_GET_LSIZE(bp));
+
+ enum zio_flag zioflags = ZIO_FLAG_CANFAIL;
+ if (request_compressed)
+ zioflags |= ZIO_FLAG_RAW;
+ if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
+ ZIO_PRIORITY_ASYNC_READ, zioflags, &aflags, zb) != 0) {
+ if (zfs_send_corrupt_data) {
+ /* Send a block filled with 0x"zfs badd bloc" */
+ abuf = arc_alloc_buf(spa, &abuf, ARC_BUFC_DATA,
+ blksz);
+ uint64_t *ptr;
+ for (ptr = abuf->b_data;
+ (char *)ptr < (char *)abuf->b_data + blksz;
+ ptr++)
+ *ptr = 0x2f5baddb10cULL;
+ } else {
+ return (SET_ERROR(EIO));
+ }
+ }
+
+ offset = zb->zb_blkid * blksz;
+
+ if (split_large_blocks) {
+ ASSERT3U(arc_get_compression(abuf), ==,
+ ZIO_COMPRESS_OFF);
+ char *buf = abuf->b_data;
+ while (blksz > 0 && err == 0) {
+ int n = MIN(blksz, SPA_OLD_MAXBLOCKSIZE);
+ err = dump_write(dsa, type, zb->zb_object,
+ offset, n, n, NULL, buf);
+ offset += n;
+ buf += n;
+ blksz -= n;
+ }
+ } else {
+ err = dump_write(dsa, type, zb->zb_object, offset,
+ blksz, arc_buf_size(abuf), bp, abuf->b_data);
+ }
+ arc_buf_destroy(abuf, &abuf);
+ }
+
+ ASSERT(err == 0 || err == EINTR);
+ return (err);
+}
+
+/*
+ * Pop the new data off the queue, and free the old data.
+ */
+static struct send_block_record *
+get_next_record(bqueue_t *bq, struct send_block_record *data)
+{
+ struct send_block_record *tmp = bqueue_dequeue(bq);
+ kmem_free(data, sizeof (*data));
+ return (tmp);
+}
+
+/*
+ * Actually do the bulk of the work in a zfs send.
+ *
+ * Note: Releases dp using the specified tag.
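+ *
+ * The send is a two-thread pipeline: send_traverse_thread() walks the
+ * dataset and enqueues send_block_records on a bqueue, while this
+ * thread dequeues them and emits stream records via do_dump().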
+ */ +static int +dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds, + zfs_bookmark_phys_t *ancestor_zb, boolean_t is_clone, + boolean_t embedok, boolean_t large_block_ok, boolean_t compressok, + int outfd, uint64_t resumeobj, uint64_t resumeoff, +#ifdef illumos + vnode_t *vp, offset_t *off) +#else + struct file *fp, offset_t *off) +#endif +{ + objset_t *os; + dmu_replay_record_t *drr; + dmu_sendarg_t *dsp; + int err; + uint64_t fromtxg = 0; + uint64_t featureflags = 0; + struct send_thread_arg to_arg = { 0 }; + + err = dmu_objset_from_ds(to_ds, &os); + if (err != 0) { + dsl_pool_rele(dp, tag); + return (err); + } + + drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); + drr->drr_type = DRR_BEGIN; + drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC; + DMU_SET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo, + DMU_SUBSTREAM); + +#ifdef _KERNEL + if (dmu_objset_type(os) == DMU_OST_ZFS) { + uint64_t version; + if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &version) != 0) { + kmem_free(drr, sizeof (dmu_replay_record_t)); + dsl_pool_rele(dp, tag); + return (SET_ERROR(EINVAL)); + } + if (version >= ZPL_VERSION_SA) { + featureflags |= DMU_BACKUP_FEATURE_SA_SPILL; + } + } +#endif + + if (large_block_ok && to_ds->ds_feature_inuse[SPA_FEATURE_LARGE_BLOCKS]) + featureflags |= DMU_BACKUP_FEATURE_LARGE_BLOCKS; + if (to_ds->ds_feature_inuse[SPA_FEATURE_LARGE_DNODE]) + featureflags |= DMU_BACKUP_FEATURE_LARGE_DNODE; + if (embedok && + spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) { + featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA; + if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) + featureflags |= DMU_BACKUP_FEATURE_LZ4; + } + if (compressok) { + featureflags |= DMU_BACKUP_FEATURE_COMPRESSED; + } + if ((featureflags & + (DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_COMPRESSED)) != + 0 && spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) { + featureflags |= DMU_BACKUP_FEATURE_LZ4; + } + + if (resumeobj != 0 || resumeoff != 0) { + featureflags |= DMU_BACKUP_FEATURE_RESUMING; + } + + DMU_SET_FEATUREFLAGS(drr->drr_u.drr_begin.drr_versioninfo, + featureflags); + + drr->drr_u.drr_begin.drr_creation_time = + dsl_dataset_phys(to_ds)->ds_creation_time; + drr->drr_u.drr_begin.drr_type = dmu_objset_type(os); + if (is_clone) + drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE; + drr->drr_u.drr_begin.drr_toguid = dsl_dataset_phys(to_ds)->ds_guid; + if (dsl_dataset_phys(to_ds)->ds_flags & DS_FLAG_CI_DATASET) + drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA; + if (zfs_send_set_freerecords_bit) + drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_FREERECORDS; + + if (ancestor_zb != NULL) { + drr->drr_u.drr_begin.drr_fromguid = + ancestor_zb->zbm_guid; + fromtxg = ancestor_zb->zbm_creation_txg; + } + dsl_dataset_name(to_ds, drr->drr_u.drr_begin.drr_toname); + if (!to_ds->ds_is_snapshot) { + (void) strlcat(drr->drr_u.drr_begin.drr_toname, "@--head--", + sizeof (drr->drr_u.drr_begin.drr_toname)); + } + + dsp = kmem_zalloc(sizeof (dmu_sendarg_t), KM_SLEEP); + + dsp->dsa_drr = drr; + dsp->dsa_outfd = outfd; + dsp->dsa_proc = curproc; + dsp->dsa_td = curthread; + dsp->dsa_fp = fp; + dsp->dsa_os = os; + dsp->dsa_off = off; + dsp->dsa_toguid = dsl_dataset_phys(to_ds)->ds_guid; + dsp->dsa_pending_op = PENDING_NONE; + dsp->dsa_featureflags = featureflags; + dsp->dsa_resume_object = resumeobj; + dsp->dsa_resume_offset = resumeoff; + + mutex_enter(&to_ds->ds_sendstream_lock); + list_insert_head(&to_ds->ds_sendstreams, dsp); + mutex_exit(&to_ds->ds_sendstream_lock); + + 
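/*
+ * Take a long hold so the dataset can't be destroyed while the send
+ * runs without the pool config lock, which we drop just below.
+ */
+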
dsl_dataset_long_hold(to_ds, FTAG); + dsl_pool_rele(dp, tag); + + void *payload = NULL; + size_t payload_len = 0; + if (resumeobj != 0 || resumeoff != 0) { + dmu_object_info_t to_doi; + err = dmu_object_info(os, resumeobj, &to_doi); + if (err != 0) + goto out; + SET_BOOKMARK(&to_arg.resume, to_ds->ds_object, resumeobj, 0, + resumeoff / to_doi.doi_data_block_size); + + nvlist_t *nvl = fnvlist_alloc(); + fnvlist_add_uint64(nvl, "resume_object", resumeobj); + fnvlist_add_uint64(nvl, "resume_offset", resumeoff); + payload = fnvlist_pack(nvl, &payload_len); + drr->drr_payloadlen = payload_len; + fnvlist_free(nvl); + } + + err = dump_record(dsp, payload, payload_len); + fnvlist_pack_free(payload, payload_len); + if (err != 0) { + err = dsp->dsa_err; + goto out; + } + + err = bqueue_init(&to_arg.q, zfs_send_queue_length, + offsetof(struct send_block_record, ln)); + to_arg.error_code = 0; + to_arg.cancel = B_FALSE; + to_arg.ds = to_ds; + to_arg.fromtxg = fromtxg; + to_arg.flags = TRAVERSE_PRE | TRAVERSE_PREFETCH; + (void) thread_create(NULL, 0, send_traverse_thread, &to_arg, 0, &p0, + TS_RUN, minclsyspri); + + struct send_block_record *to_data; + to_data = bqueue_dequeue(&to_arg.q); + + while (!to_data->eos_marker && err == 0) { + err = do_dump(dsp, to_data); + to_data = get_next_record(&to_arg.q, to_data); + if (issig(JUSTLOOKING) && issig(FORREAL)) + err = EINTR; + } + + if (err != 0) { + to_arg.cancel = B_TRUE; + while (!to_data->eos_marker) { + to_data = get_next_record(&to_arg.q, to_data); + } + } + kmem_free(to_data, sizeof (*to_data)); + + bqueue_destroy(&to_arg.q); + + if (err == 0 && to_arg.error_code != 0) + err = to_arg.error_code; + + if (err != 0) + goto out; + + if (dsp->dsa_pending_op != PENDING_NONE) + if (dump_record(dsp, NULL, 0) != 0) + err = SET_ERROR(EINTR); + + if (err != 0) { + if (err == EINTR && dsp->dsa_err != 0) + err = dsp->dsa_err; + goto out; + } + + bzero(drr, sizeof (dmu_replay_record_t)); + drr->drr_type = DRR_END; + drr->drr_u.drr_end.drr_checksum = dsp->dsa_zc; + drr->drr_u.drr_end.drr_toguid = dsp->dsa_toguid; + + if (dump_record(dsp, NULL, 0) != 0) + err = dsp->dsa_err; + +out: + mutex_enter(&to_ds->ds_sendstream_lock); + list_remove(&to_ds->ds_sendstreams, dsp); + mutex_exit(&to_ds->ds_sendstream_lock); + + VERIFY(err != 0 || (dsp->dsa_sent_begin && dsp->dsa_sent_end)); + + kmem_free(drr, sizeof (dmu_replay_record_t)); + kmem_free(dsp, sizeof (dmu_sendarg_t)); + + dsl_dataset_long_rele(to_ds, FTAG); + + return (err); +} + +int +dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, + boolean_t embedok, boolean_t large_block_ok, boolean_t compressok, +#ifdef illumos + int outfd, vnode_t *vp, offset_t *off) +#else + int outfd, struct file *fp, offset_t *off) +#endif +{ + dsl_pool_t *dp; + dsl_dataset_t *ds; + dsl_dataset_t *fromds = NULL; + int err; + + err = dsl_pool_hold(pool, FTAG, &dp); + if (err != 0) + return (err); + + err = dsl_dataset_hold_obj(dp, tosnap, FTAG, &ds); + if (err != 0) { + dsl_pool_rele(dp, FTAG); + return (err); + } + + if (fromsnap != 0) { + zfs_bookmark_phys_t zb; + boolean_t is_clone; + + err = dsl_dataset_hold_obj(dp, fromsnap, FTAG, &fromds); + if (err != 0) { + dsl_dataset_rele(ds, FTAG); + dsl_pool_rele(dp, FTAG); + return (err); + } + if (!dsl_dataset_is_before(ds, fromds, 0)) + err = SET_ERROR(EXDEV); + zb.zbm_creation_time = + dsl_dataset_phys(fromds)->ds_creation_time; + zb.zbm_creation_txg = dsl_dataset_phys(fromds)->ds_creation_txg; + zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid; + is_clone = (fromds->ds_dir != 
ds->ds_dir); + dsl_dataset_rele(fromds, FTAG); + err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, + embedok, large_block_ok, compressok, outfd, 0, 0, fp, off); + } else { + err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, + embedok, large_block_ok, compressok, outfd, 0, 0, fp, off); + } + dsl_dataset_rele(ds, FTAG); + return (err); +} + +int +dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok, + boolean_t large_block_ok, boolean_t compressok, int outfd, + uint64_t resumeobj, uint64_t resumeoff, +#ifdef illumos + vnode_t *vp, offset_t *off) +#else + struct file *fp, offset_t *off) +#endif +{ + dsl_pool_t *dp; + dsl_dataset_t *ds; + int err; + boolean_t owned = B_FALSE; + + if (fromsnap != NULL && strpbrk(fromsnap, "@#") == NULL) + return (SET_ERROR(EINVAL)); + + err = dsl_pool_hold(tosnap, FTAG, &dp); + if (err != 0) + return (err); + + if (strchr(tosnap, '@') == NULL && spa_writeable(dp->dp_spa)) { + /* + * We are sending a filesystem or volume. Ensure + * that it doesn't change by owning the dataset. + */ + err = dsl_dataset_own(dp, tosnap, FTAG, &ds); + owned = B_TRUE; + } else { + err = dsl_dataset_hold(dp, tosnap, FTAG, &ds); + } + if (err != 0) { + dsl_pool_rele(dp, FTAG); + return (err); + } + + if (fromsnap != NULL) { + zfs_bookmark_phys_t zb; + boolean_t is_clone = B_FALSE; + int fsnamelen = strchr(tosnap, '@') - tosnap; + + /* + * If the fromsnap is in a different filesystem, then + * mark the send stream as a clone. + */ + if (strncmp(tosnap, fromsnap, fsnamelen) != 0 || + (fromsnap[fsnamelen] != '@' && + fromsnap[fsnamelen] != '#')) { + is_clone = B_TRUE; + } + + if (strchr(fromsnap, '@')) { + dsl_dataset_t *fromds; + err = dsl_dataset_hold(dp, fromsnap, FTAG, &fromds); + if (err == 0) { + if (!dsl_dataset_is_before(ds, fromds, 0)) + err = SET_ERROR(EXDEV); + zb.zbm_creation_time = + dsl_dataset_phys(fromds)->ds_creation_time; + zb.zbm_creation_txg = + dsl_dataset_phys(fromds)->ds_creation_txg; + zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid; + is_clone = (ds->ds_dir != fromds->ds_dir); + dsl_dataset_rele(fromds, FTAG); + } + } else { + err = dsl_bookmark_lookup(dp, fromsnap, ds, &zb); + } + if (err != 0) { + dsl_dataset_rele(ds, FTAG); + dsl_pool_rele(dp, FTAG); + return (err); + } + err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, + embedok, large_block_ok, compressok, + outfd, resumeobj, resumeoff, fp, off); + } else { + err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, + embedok, large_block_ok, compressok, + outfd, resumeobj, resumeoff, fp, off); + } + if (owned) + dsl_dataset_disown(ds, FTAG); + else + dsl_dataset_rele(ds, FTAG); + return (err); +} + +static int +dmu_adjust_send_estimate_for_indirects(dsl_dataset_t *ds, uint64_t uncompressed, + uint64_t compressed, boolean_t stream_compressed, uint64_t *sizep) +{ + int err = 0; + uint64_t size; + /* + * Assume that space (both on-disk and in-stream) is dominated by + * data. We will adjust for indirect blocks and the copies property, + * but ignore per-object space used (eg, dnodes and DRR_OBJECT records). + */ + uint64_t recordsize; + uint64_t record_count; + objset_t *os; + VERIFY0(dmu_objset_from_ds(ds, &os)); + + /* Assume all (uncompressed) blocks are recordsize. 
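+ * For example (illustrative numbers only): 1 GB of changed data at a
+ * 128K recordsize gives record_count = 8192, so below we subtract
+ * 8192 * sizeof (blkptr_t) for indirect blocks and add
+ * 8192 * sizeof (dmu_replay_record_t) for per-record headers.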
*/ + if (zfs_override_estimate_recordsize != 0) { + recordsize = zfs_override_estimate_recordsize; + } else if (os->os_phys->os_type == DMU_OST_ZVOL) { + err = dsl_prop_get_int_ds(ds, + zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &recordsize); + } else { + err = dsl_prop_get_int_ds(ds, + zfs_prop_to_name(ZFS_PROP_RECORDSIZE), &recordsize); + } + if (err != 0) + return (err); + record_count = uncompressed / recordsize; + + /* + * If we're estimating a send size for a compressed stream, use the + * compressed data size to estimate the stream size. Otherwise, use the + * uncompressed data size. + */ + size = stream_compressed ? compressed : uncompressed; + + /* + * Subtract out approximate space used by indirect blocks. + * Assume most space is used by data blocks (non-indirect, non-dnode). + * Assume no ditto blocks or internal fragmentation. + * + * Therefore, space used by indirect blocks is sizeof(blkptr_t) per + * block. + */ + size -= record_count * sizeof (blkptr_t); + + /* Add in the space for the record associated with each block. */ + size += record_count * sizeof (dmu_replay_record_t); + + *sizep = size; + + return (0); +} + +int +dmu_send_estimate(dsl_dataset_t *ds, dsl_dataset_t *fromds, + boolean_t stream_compressed, uint64_t *sizep) +{ + dsl_pool_t *dp = ds->ds_dir->dd_pool; + int err; + uint64_t uncomp, comp; + + ASSERT(dsl_pool_config_held(dp)); + + /* tosnap must be a snapshot */ + if (!ds->ds_is_snapshot) + return (SET_ERROR(EINVAL)); + + /* fromsnap, if provided, must be a snapshot */ + if (fromds != NULL && !fromds->ds_is_snapshot) + return (SET_ERROR(EINVAL)); + + /* + * fromsnap must be an earlier snapshot from the same fs as tosnap, + * or the origin's fs. + */ + if (fromds != NULL && !dsl_dataset_is_before(ds, fromds, 0)) + return (SET_ERROR(EXDEV)); + + /* Get compressed and uncompressed size estimates of changed data. */ + if (fromds == NULL) { + uncomp = dsl_dataset_phys(ds)->ds_uncompressed_bytes; + comp = dsl_dataset_phys(ds)->ds_compressed_bytes; + } else { + uint64_t used; + err = dsl_dataset_space_written(fromds, ds, + &used, &comp, &uncomp); + if (err != 0) + return (err); + } + + err = dmu_adjust_send_estimate_for_indirects(ds, uncomp, comp, + stream_compressed, sizep); + /* + * Add the size of the BEGIN and END records to the estimate. + */ + *sizep += 2 * sizeof (dmu_replay_record_t); + return (err); +} + +struct calculate_send_arg { + uint64_t uncompressed; + uint64_t compressed; +}; + +/* + * Simple callback used to traverse the blocks of a snapshot and sum their + * uncompressed and compressed sizes. + */ +/* ARGSUSED */ +static int +dmu_calculate_send_traversal(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, + const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) +{ + struct calculate_send_arg *space = arg; + if (bp != NULL && !BP_IS_HOLE(bp)) { + space->uncompressed += BP_GET_UCSIZE(bp); + space->compressed += BP_GET_PSIZE(bp); + } + return (0); +} + +/* + * Given a desination snapshot and a TXG, calculate the approximate size of a + * send stream sent from that TXG. from_txg may be zero, indicating that the + * whole snapshot will be sent. 
+ */ +int +dmu_send_estimate_from_txg(dsl_dataset_t *ds, uint64_t from_txg, + boolean_t stream_compressed, uint64_t *sizep) +{ + dsl_pool_t *dp = ds->ds_dir->dd_pool; + int err; + struct calculate_send_arg size = { 0 }; + + ASSERT(dsl_pool_config_held(dp)); + + /* tosnap must be a snapshot */ + if (!ds->ds_is_snapshot) + return (SET_ERROR(EINVAL)); + + /* verify that from_txg is before the provided snapshot was taken */ + if (from_txg >= dsl_dataset_phys(ds)->ds_creation_txg) { + return (SET_ERROR(EXDEV)); + } + + /* + * traverse the blocks of the snapshot with birth times after + * from_txg, summing their uncompressed size + */ + err = traverse_dataset(ds, from_txg, TRAVERSE_POST, + dmu_calculate_send_traversal, &size); + if (err) + return (err); + + err = dmu_adjust_send_estimate_for_indirects(ds, size.uncompressed, + size.compressed, stream_compressed, sizep); + return (err); +} + +typedef struct dmu_recv_begin_arg { + const char *drba_origin; + dmu_recv_cookie_t *drba_cookie; + cred_t *drba_cred; + uint64_t drba_snapobj; +} dmu_recv_begin_arg_t; + +static int +recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds, + uint64_t fromguid) +{ + uint64_t val; + int error; + dsl_pool_t *dp = ds->ds_dir->dd_pool; + + /* temporary clone name must not exist */ + error = zap_lookup(dp->dp_meta_objset, + dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, recv_clone_name, + 8, 1, &val); + if (error != ENOENT) + return (error == 0 ? EBUSY : error); + + /* new snapshot name must not exist */ + error = zap_lookup(dp->dp_meta_objset, + dsl_dataset_phys(ds)->ds_snapnames_zapobj, + drba->drba_cookie->drc_tosnap, 8, 1, &val); + if (error != ENOENT) + return (error == 0 ? EEXIST : error); + + /* + * Check snapshot limit before receiving. We'll recheck again at the + * end, but might as well abort before receiving if we're already over + * the limit. + * + * Note that we do not check the file system limit with + * dsl_dir_fscount_check because the temporary %clones don't count + * against that limit. + */ + error = dsl_fs_ss_limit_check(ds->ds_dir, 1, ZFS_PROP_SNAPSHOT_LIMIT, + NULL, drba->drba_cred); + if (error != 0) + return (error); + + if (fromguid != 0) { + dsl_dataset_t *snap; + uint64_t obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; + + /* Find snapshot in this dir that matches fromguid. */ + while (obj != 0) { + error = dsl_dataset_hold_obj(dp, obj, FTAG, + &snap); + if (error != 0) + return (SET_ERROR(ENODEV)); + if (snap->ds_dir != ds->ds_dir) { + dsl_dataset_rele(snap, FTAG); + return (SET_ERROR(ENODEV)); + } + if (dsl_dataset_phys(snap)->ds_guid == fromguid) + break; + obj = dsl_dataset_phys(snap)->ds_prev_snap_obj; + dsl_dataset_rele(snap, FTAG); + } + if (obj == 0) + return (SET_ERROR(ENODEV)); + + if (drba->drba_cookie->drc_force) { + drba->drba_snapobj = obj; + } else { + /* + * If we are not forcing, there must be no + * changes since fromsnap. + */ + if (dsl_dataset_modified_since_snap(ds, snap)) { + dsl_dataset_rele(snap, FTAG); + return (SET_ERROR(ETXTBSY)); + } + drba->drba_snapobj = ds->ds_prev->ds_object; + } + + dsl_dataset_rele(snap, FTAG); + } else { + /* if full, then must be forced */ + if (!drba->drba_cookie->drc_force) + return (SET_ERROR(EEXIST)); + /* start from $ORIGIN@$ORIGIN, if supported */ + drba->drba_snapobj = dp->dp_origin_snap != NULL ? 
+ dp->dp_origin_snap->ds_object : 0; + } + + return (0); + +} + +static int +dmu_recv_begin_check(void *arg, dmu_tx_t *tx) +{ + dmu_recv_begin_arg_t *drba = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + struct drr_begin *drrb = drba->drba_cookie->drc_drrb; + uint64_t fromguid = drrb->drr_fromguid; + int flags = drrb->drr_flags; + int error; + uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); + dsl_dataset_t *ds; + const char *tofs = drba->drba_cookie->drc_tofs; + + /* already checked */ + ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC); + ASSERT(!(featureflags & DMU_BACKUP_FEATURE_RESUMING)); + + if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == + DMU_COMPOUNDSTREAM || + drrb->drr_type >= DMU_OST_NUMTYPES || + ((flags & DRR_FLAG_CLONE) && drba->drba_origin == NULL)) + return (SET_ERROR(EINVAL)); + + /* Verify pool version supports SA if SA_SPILL feature set */ + if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) && + spa_version(dp->dp_spa) < SPA_VERSION_SA) + return (SET_ERROR(ENOTSUP)); + + if (drba->drba_cookie->drc_resumable && + !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EXTENSIBLE_DATASET)) + return (SET_ERROR(ENOTSUP)); + + /* + * The receiving code doesn't know how to translate a WRITE_EMBEDDED + * record to a plain WRITE record, so the pool must have the + * EMBEDDED_DATA feature enabled if the stream has WRITE_EMBEDDED + * records. Same with WRITE_EMBEDDED records that use LZ4 compression. + */ + if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) && + !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) + return (SET_ERROR(ENOTSUP)); + if ((featureflags & DMU_BACKUP_FEATURE_LZ4) && + !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) + return (SET_ERROR(ENOTSUP)); + + /* + * The receiving code doesn't know how to translate large blocks + * to smaller ones, so the pool must have the LARGE_BLOCKS + * feature enabled if the stream has LARGE_BLOCKS. + */ + if ((featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) && + !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS)) + return (SET_ERROR(ENOTSUP)); + + /* + * The receiving code doesn't know how to translate large dnodes + * to smaller ones, so the pool must have the LARGE_DNODE + * feature enabled if the stream has LARGE_DNODE. + */ + if ((featureflags & DMU_BACKUP_FEATURE_LARGE_DNODE) && + !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_DNODE)) + return (SET_ERROR(ENOTSUP)); + + error = dsl_dataset_hold(dp, tofs, FTAG, &ds); + if (error == 0) { + /* target fs already exists; recv into temp clone */ + + /* Can't recv a clone into an existing fs */ + if (flags & DRR_FLAG_CLONE || drba->drba_origin) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EINVAL)); + } + + error = recv_begin_check_existing_impl(drba, ds, fromguid); + dsl_dataset_rele(ds, FTAG); + } else if (error == ENOENT) { + /* target fs does not exist; must be a full backup or clone */ + char buf[ZFS_MAX_DATASET_NAME_LEN]; + + /* + * If it's a non-clone incremental, we are missing the + * target fs, so fail the recv. + */ + if (fromguid != 0 && !(flags & DRR_FLAG_CLONE || + drba->drba_origin)) + return (SET_ERROR(ENOENT)); + + /* + * If we're receiving a full send as a clone, and it doesn't + * contain all the necessary free records and freeobject + * records, reject it. 
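+ * The sender advertises that it sent them by setting
+ * DRR_FLAG_FREERECORDS in the BEGIN record (see
+ * zfs_send_set_freerecords_bit).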
+ */ + if (fromguid == 0 && drba->drba_origin && + !(flags & DRR_FLAG_FREERECORDS)) + return (SET_ERROR(EINVAL)); + + /* Open the parent of tofs */ + ASSERT3U(strlen(tofs), <, sizeof (buf)); + (void) strlcpy(buf, tofs, strrchr(tofs, '/') - tofs + 1); + error = dsl_dataset_hold(dp, buf, FTAG, &ds); + if (error != 0) + return (error); + + /* + * Check filesystem and snapshot limits before receiving. We'll + * recheck snapshot limits again at the end (we create the + * filesystems and increment those counts during begin_sync). + */ + error = dsl_fs_ss_limit_check(ds->ds_dir, 1, + ZFS_PROP_FILESYSTEM_LIMIT, NULL, drba->drba_cred); + if (error != 0) { + dsl_dataset_rele(ds, FTAG); + return (error); + } + + error = dsl_fs_ss_limit_check(ds->ds_dir, 1, + ZFS_PROP_SNAPSHOT_LIMIT, NULL, drba->drba_cred); + if (error != 0) { + dsl_dataset_rele(ds, FTAG); + return (error); + } + + if (drba->drba_origin != NULL) { + dsl_dataset_t *origin; + error = dsl_dataset_hold(dp, drba->drba_origin, + FTAG, &origin); + if (error != 0) { + dsl_dataset_rele(ds, FTAG); + return (error); + } + if (!origin->ds_is_snapshot) { + dsl_dataset_rele(origin, FTAG); + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EINVAL)); + } + if (dsl_dataset_phys(origin)->ds_guid != fromguid && + fromguid != 0) { + dsl_dataset_rele(origin, FTAG); + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(ENODEV)); + } + dsl_dataset_rele(origin, FTAG); + } + dsl_dataset_rele(ds, FTAG); + error = 0; + } + return (error); +} + +static void +dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) +{ + dmu_recv_begin_arg_t *drba = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + objset_t *mos = dp->dp_meta_objset; + struct drr_begin *drrb = drba->drba_cookie->drc_drrb; + const char *tofs = drba->drba_cookie->drc_tofs; + dsl_dataset_t *ds, *newds; + uint64_t dsobj; + int error; + uint64_t crflags = 0; + + if (drrb->drr_flags & DRR_FLAG_CI_DATA) + crflags |= DS_FLAG_CI_DATASET; + + error = dsl_dataset_hold(dp, tofs, FTAG, &ds); + if (error == 0) { + /* create temporary clone */ + dsl_dataset_t *snap = NULL; + if (drba->drba_snapobj != 0) { + VERIFY0(dsl_dataset_hold_obj(dp, + drba->drba_snapobj, FTAG, &snap)); + } + dsobj = dsl_dataset_create_sync(ds->ds_dir, recv_clone_name, + snap, crflags, drba->drba_cred, tx); + if (drba->drba_snapobj != 0) + dsl_dataset_rele(snap, FTAG); + dsl_dataset_rele(ds, FTAG); + } else { + dsl_dir_t *dd; + const char *tail; + dsl_dataset_t *origin = NULL; + + VERIFY0(dsl_dir_hold(dp, tofs, FTAG, &dd, &tail)); + + if (drba->drba_origin != NULL) { + VERIFY0(dsl_dataset_hold(dp, drba->drba_origin, + FTAG, &origin)); + } + + /* Create new dataset. 
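+ * It is named after the final component of tofs and becomes either
+ * a full receive target or a clone of drba_origin.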
*/ + dsobj = dsl_dataset_create_sync(dd, + strrchr(tofs, '/') + 1, + origin, crflags, drba->drba_cred, tx); + if (origin != NULL) + dsl_dataset_rele(origin, FTAG); + dsl_dir_rele(dd, FTAG); + drba->drba_cookie->drc_newfs = B_TRUE; + } + VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &newds)); + + if (drba->drba_cookie->drc_resumable) { + dsl_dataset_zapify(newds, tx); + if (drrb->drr_fromguid != 0) { + VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_FROMGUID, + 8, 1, &drrb->drr_fromguid, tx)); + } + VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_TOGUID, + 8, 1, &drrb->drr_toguid, tx)); + VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_TONAME, + 1, strlen(drrb->drr_toname) + 1, drrb->drr_toname, tx)); + uint64_t one = 1; + uint64_t zero = 0; + VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_OBJECT, + 8, 1, &one, tx)); + VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_OFFSET, + 8, 1, &zero, tx)); + VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_BYTES, + 8, 1, &zero, tx)); + if (DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & + DMU_BACKUP_FEATURE_LARGE_BLOCKS) { + VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_LARGEBLOCK, + 8, 1, &one, tx)); + } + if (DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & + DMU_BACKUP_FEATURE_EMBED_DATA) { + VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_EMBEDOK, + 8, 1, &one, tx)); + } + if (DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & + DMU_BACKUP_FEATURE_COMPRESSED) { + VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_COMPRESSOK, + 8, 1, &one, tx)); + } + } + + dmu_buf_will_dirty(newds->ds_dbuf, tx); + dsl_dataset_phys(newds)->ds_flags |= DS_FLAG_INCONSISTENT; + + /* + * If we actually created a non-clone, we need to create the + * objset in our new dataset. + */ + rrw_enter(&newds->ds_bp_rwlock, RW_READER, FTAG); + if (BP_IS_HOLE(dsl_dataset_get_blkptr(newds))) { + (void) dmu_objset_create_impl(dp->dp_spa, + newds, dsl_dataset_get_blkptr(newds), drrb->drr_type, tx); + } + rrw_exit(&newds->ds_bp_rwlock, FTAG); + + drba->drba_cookie->drc_ds = newds; + + spa_history_log_internal_ds(newds, "receive", tx, ""); +} + +static int +dmu_recv_resume_begin_check(void *arg, dmu_tx_t *tx) +{ + dmu_recv_begin_arg_t *drba = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + struct drr_begin *drrb = drba->drba_cookie->drc_drrb; + int error; + uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); + dsl_dataset_t *ds; + const char *tofs = drba->drba_cookie->drc_tofs; + + /* already checked */ + ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC); + ASSERT(featureflags & DMU_BACKUP_FEATURE_RESUMING); + + if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == + DMU_COMPOUNDSTREAM || + drrb->drr_type >= DMU_OST_NUMTYPES) + return (SET_ERROR(EINVAL)); + + /* Verify pool version supports SA if SA_SPILL feature set */ + if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) && + spa_version(dp->dp_spa) < SPA_VERSION_SA) + return (SET_ERROR(ENOTSUP)); + + /* + * The receiving code doesn't know how to translate a WRITE_EMBEDDED + * record to a plain WRITE record, so the pool must have the + * EMBEDDED_DATA feature enabled if the stream has WRITE_EMBEDDED + * records. Same with WRITE_EMBEDDED records that use LZ4 compression. 
+ */ + if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) && + !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) + return (SET_ERROR(ENOTSUP)); + if ((featureflags & DMU_BACKUP_FEATURE_LZ4) && + !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) + return (SET_ERROR(ENOTSUP)); + + /* 6 extra bytes for /%recv */ + char recvname[ZFS_MAX_DATASET_NAME_LEN + 6]; + + (void) snprintf(recvname, sizeof (recvname), "%s/%s", + tofs, recv_clone_name); + + if (dsl_dataset_hold(dp, recvname, FTAG, &ds) != 0) { + /* %recv does not exist; continue in tofs */ + error = dsl_dataset_hold(dp, tofs, FTAG, &ds); + if (error != 0) + return (error); + } + + /* check that ds is marked inconsistent */ + if (!DS_IS_INCONSISTENT(ds)) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EINVAL)); + } + + /* check that there is resuming data, and that the toguid matches */ + if (!dsl_dataset_is_zapified(ds)) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EINVAL)); + } + uint64_t val; + error = zap_lookup(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_TOGUID, sizeof (val), 1, &val); + if (error != 0 || drrb->drr_toguid != val) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EINVAL)); + } + + /* + * Check if the receive is still running. If so, it will be owned. + * Note that nothing else can own the dataset (e.g. after the receive + * fails) because it will be marked inconsistent. + */ + if (dsl_dataset_has_owner(ds)) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EBUSY)); + } + + /* There should not be any snapshots of this fs yet. */ + if (ds->ds_prev != NULL && ds->ds_prev->ds_dir == ds->ds_dir) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EINVAL)); + } + + /* + * Note: resume point will be checked when we process the first WRITE + * record. + */ + + /* check that the origin matches */ + val = 0; + (void) zap_lookup(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_FROMGUID, sizeof (val), 1, &val); + if (drrb->drr_fromguid != val) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EINVAL)); + } + + dsl_dataset_rele(ds, FTAG); + return (0); +} + +static void +dmu_recv_resume_begin_sync(void *arg, dmu_tx_t *tx) +{ + dmu_recv_begin_arg_t *drba = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + const char *tofs = drba->drba_cookie->drc_tofs; + dsl_dataset_t *ds; + uint64_t dsobj; + /* 6 extra bytes for /%recv */ + char recvname[ZFS_MAX_DATASET_NAME_LEN + 6]; + + (void) snprintf(recvname, sizeof (recvname), "%s/%s", + tofs, recv_clone_name); + + if (dsl_dataset_hold(dp, recvname, FTAG, &ds) != 0) { + /* %recv does not exist; continue in tofs */ + VERIFY0(dsl_dataset_hold(dp, tofs, FTAG, &ds)); + drba->drba_cookie->drc_newfs = B_TRUE; + } + + /* clear the inconsistent flag so that we can own it */ + ASSERT(DS_IS_INCONSISTENT(ds)); + dmu_buf_will_dirty(ds->ds_dbuf, tx); + dsl_dataset_phys(ds)->ds_flags &= ~DS_FLAG_INCONSISTENT; + dsobj = ds->ds_object; + dsl_dataset_rele(ds, FTAG); + + VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &ds)); + + dmu_buf_will_dirty(ds->ds_dbuf, tx); + dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_INCONSISTENT; + + rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); + ASSERT(!BP_IS_HOLE(dsl_dataset_get_blkptr(ds))); + rrw_exit(&ds->ds_bp_rwlock, FTAG); + + drba->drba_cookie->drc_ds = ds; + + spa_history_log_internal_ds(ds, "resume receive", tx, ""); +} + +/* + * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin() + * succeeds; otherwise we will leak the holds on the datasets. 
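+ * (The begin sync task owns the target dataset with dmu_recv_tag;
+ * only the later stages of the receive release that ownership.)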
+ */ +int +dmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin, + boolean_t force, boolean_t resumable, char *origin, dmu_recv_cookie_t *drc) +{ + dmu_recv_begin_arg_t drba = { 0 }; + + bzero(drc, sizeof (dmu_recv_cookie_t)); + drc->drc_drr_begin = drr_begin; + drc->drc_drrb = &drr_begin->drr_u.drr_begin; + drc->drc_tosnap = tosnap; + drc->drc_tofs = tofs; + drc->drc_force = force; + drc->drc_resumable = resumable; + drc->drc_cred = CRED(); + drc->drc_clone = (origin != NULL); + + if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) { + drc->drc_byteswap = B_TRUE; + (void) fletcher_4_incremental_byteswap(drr_begin, + sizeof (dmu_replay_record_t), &drc->drc_cksum); + byteswap_record(drr_begin); + } else if (drc->drc_drrb->drr_magic == DMU_BACKUP_MAGIC) { + (void) fletcher_4_incremental_native(drr_begin, + sizeof (dmu_replay_record_t), &drc->drc_cksum); + } else { + return (SET_ERROR(EINVAL)); + } + + drba.drba_origin = origin; + drba.drba_cookie = drc; + drba.drba_cred = CRED(); + + if (DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo) & + DMU_BACKUP_FEATURE_RESUMING) { + return (dsl_sync_task(tofs, + dmu_recv_resume_begin_check, dmu_recv_resume_begin_sync, + &drba, 5, ZFS_SPACE_CHECK_NORMAL)); + } else { + return (dsl_sync_task(tofs, + dmu_recv_begin_check, dmu_recv_begin_sync, + &drba, 5, ZFS_SPACE_CHECK_NORMAL)); + } +} + +struct receive_record_arg { + dmu_replay_record_t header; + void *payload; /* Pointer to a buffer containing the payload */ + /* + * If the record is a write, pointer to the arc_buf_t containing the + * payload. + */ + arc_buf_t *write_buf; + int payload_size; + uint64_t bytes_read; /* bytes read from stream when record created */ + boolean_t eos_marker; /* Marks the end of the stream */ + bqueue_node_t node; +}; + +struct receive_writer_arg { + objset_t *os; + boolean_t byteswap; + bqueue_t q; + + /* + * These three args are used to signal to the main thread that we're + * done. + */ + kmutex_t mutex; + kcondvar_t cv; + boolean_t done; + + int err; + /* A map from guid to dataset to help handle dedup'd streams. */ + avl_tree_t *guid_to_ds_map; + boolean_t resumable; + uint64_t last_object; + uint64_t last_offset; + uint64_t max_object; /* highest object ID referenced in stream */ + uint64_t bytes_read; /* bytes read when current record created */ +}; + +struct objlist { + list_t list; /* List of struct receive_objnode. */ + /* + * Last object looked up. Used to assert that objects are being looked + * up in ascending order. + */ + uint64_t last_lookup; +}; + +struct receive_objnode { + list_node_t node; + uint64_t object; +}; + +struct receive_arg { + objset_t *os; + kthread_t *td; + struct file *fp; + uint64_t voff; /* The current offset in the stream */ + uint64_t bytes_read; + /* + * A record that has had its payload read in, but hasn't yet been handed + * off to the worker thread. + */ + struct receive_record_arg *rrd; + /* A record that has had its header read in, but not its payload. */ + struct receive_record_arg *next_rrd; + zio_cksum_t cksum; + zio_cksum_t prev_cksum; + int err; + boolean_t byteswap; + /* Sorted list of objects not to issue prefetches for. 
*/ + struct objlist ignore_objlist; +}; + +typedef struct guid_map_entry { + uint64_t guid; + dsl_dataset_t *gme_ds; + avl_node_t avlnode; +} guid_map_entry_t; + +static int +guid_compare(const void *arg1, const void *arg2) +{ + const guid_map_entry_t *gmep1 = (const guid_map_entry_t *)arg1; + const guid_map_entry_t *gmep2 = (const guid_map_entry_t *)arg2; + + return (AVL_CMP(gmep1->guid, gmep2->guid)); +} + +static void +free_guid_map_onexit(void *arg) +{ + avl_tree_t *ca = arg; + void *cookie = NULL; + guid_map_entry_t *gmep; + + while ((gmep = avl_destroy_nodes(ca, &cookie)) != NULL) { + dsl_dataset_long_rele(gmep->gme_ds, gmep); + dsl_dataset_rele(gmep->gme_ds, gmep); + kmem_free(gmep, sizeof (guid_map_entry_t)); + } + avl_destroy(ca); + kmem_free(ca, sizeof (avl_tree_t)); +} + +static int +restore_bytes(struct receive_arg *ra, void *buf, int len, off_t off, ssize_t *resid) +{ + struct uio auio; + struct iovec aiov; + int error; + + aiov.iov_base = buf; + aiov.iov_len = len; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_resid = len; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_rw = UIO_READ; + auio.uio_offset = off; + auio.uio_td = ra->td; +#ifdef _KERNEL + error = fo_read(ra->fp, &auio, ra->td->td_ucred, FOF_OFFSET, ra->td); +#else + fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__); + error = EOPNOTSUPP; +#endif + *resid = auio.uio_resid; + return (error); +} + +static int +receive_read(struct receive_arg *ra, int len, void *buf) +{ + int done = 0; + + /* + * The code doesn't rely on this (lengths being multiples of 8). See + * comment in dump_bytes. + */ + ASSERT0(len % 8); + + while (done < len) { + ssize_t resid; + + ra->err = restore_bytes(ra, buf + done, + len - done, ra->voff, &resid); + + if (resid == len - done) { + /* + * Note: ECKSUM indicates that the receive + * was interrupted and can potentially be resumed. 
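+ * A short read here (restore_bytes() consumed nothing)
+ * means the stream ended early, e.g. a pipe closed by an
+ * interrupted "zfs send"; reporting it as ECKSUM lets a
+ * resumable receive be restarted later with "zfs send -t".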
+ */ + ra->err = SET_ERROR(ECKSUM); + } + ra->voff += len - done - resid; + done = len - resid; + if (ra->err != 0) + return (ra->err); + } + + ra->bytes_read += len; + + ASSERT3U(done, ==, len); + return (0); +} + +noinline static void +byteswap_record(dmu_replay_record_t *drr) +{ +#define DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X)) +#define DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X)) + drr->drr_type = BSWAP_32(drr->drr_type); + drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen); + + switch (drr->drr_type) { + case DRR_BEGIN: + DO64(drr_begin.drr_magic); + DO64(drr_begin.drr_versioninfo); + DO64(drr_begin.drr_creation_time); + DO32(drr_begin.drr_type); + DO32(drr_begin.drr_flags); + DO64(drr_begin.drr_toguid); + DO64(drr_begin.drr_fromguid); + break; + case DRR_OBJECT: + DO64(drr_object.drr_object); + DO32(drr_object.drr_type); + DO32(drr_object.drr_bonustype); + DO32(drr_object.drr_blksz); + DO32(drr_object.drr_bonuslen); + DO64(drr_object.drr_toguid); + break; + case DRR_FREEOBJECTS: + DO64(drr_freeobjects.drr_firstobj); + DO64(drr_freeobjects.drr_numobjs); + DO64(drr_freeobjects.drr_toguid); + break; + case DRR_WRITE: + DO64(drr_write.drr_object); + DO32(drr_write.drr_type); + DO64(drr_write.drr_offset); + DO64(drr_write.drr_logical_size); + DO64(drr_write.drr_toguid); + ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_write.drr_key.ddk_cksum); + DO64(drr_write.drr_key.ddk_prop); + DO64(drr_write.drr_compressed_size); + break; + case DRR_WRITE_BYREF: + DO64(drr_write_byref.drr_object); + DO64(drr_write_byref.drr_offset); + DO64(drr_write_byref.drr_length); + DO64(drr_write_byref.drr_toguid); + DO64(drr_write_byref.drr_refguid); + DO64(drr_write_byref.drr_refobject); + DO64(drr_write_byref.drr_refoffset); + ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_write_byref. + drr_key.ddk_cksum); + DO64(drr_write_byref.drr_key.ddk_prop); + break; + case DRR_WRITE_EMBEDDED: + DO64(drr_write_embedded.drr_object); + DO64(drr_write_embedded.drr_offset); + DO64(drr_write_embedded.drr_length); + DO64(drr_write_embedded.drr_toguid); + DO32(drr_write_embedded.drr_lsize); + DO32(drr_write_embedded.drr_psize); + break; + case DRR_FREE: + DO64(drr_free.drr_object); + DO64(drr_free.drr_offset); + DO64(drr_free.drr_length); + DO64(drr_free.drr_toguid); + break; + case DRR_SPILL: + DO64(drr_spill.drr_object); + DO64(drr_spill.drr_length); + DO64(drr_spill.drr_toguid); + break; + case DRR_END: + DO64(drr_end.drr_toguid); + ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_end.drr_checksum); + break; + } + + if (drr->drr_type != DRR_BEGIN) { + ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_checksum.drr_checksum); + } + +#undef DO64 +#undef DO32 +} + +static inline uint8_t +deduce_nblkptr(dmu_object_type_t bonus_type, uint64_t bonus_size) +{ + if (bonus_type == DMU_OT_SA) { + return (1); + } else { + return (1 + + ((DN_OLD_MAX_BONUSLEN - + MIN(DN_OLD_MAX_BONUSLEN, bonus_size)) >> SPA_BLKPTRSHIFT)); + } +} + +static void +save_resume_state(struct receive_writer_arg *rwa, + uint64_t object, uint64_t offset, dmu_tx_t *tx) +{ + int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; + + if (!rwa->resumable) + return; + + /* + * We use ds_resume_bytes[] != 0 to indicate that we need to + * update this on disk, so it must not be 0. + */ + ASSERT(rwa->bytes_read != 0); + + /* + * We only resume from write records, which have a valid + * (non-meta-dnode) object number. + */ + ASSERT(object != 0); + + /* + * For resuming to work correctly, we must receive records in order, + * sorted by object,offset. This is checked by the callers, but + * assert it here for good measure. 
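+ *
+ * The ds_resume_{object,offset,bytes}[] values recorded here are
+ * synced out with the dataset; if the receive is interrupted,
+ * they are what a resumed stream is later validated against (see
+ * resume_check() below and dmu_recv_resume_begin_check() above).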
+ */ + ASSERT3U(object, >=, rwa->os->os_dsl_dataset->ds_resume_object[txgoff]); + ASSERT(object != rwa->os->os_dsl_dataset->ds_resume_object[txgoff] || + offset >= rwa->os->os_dsl_dataset->ds_resume_offset[txgoff]); + ASSERT3U(rwa->bytes_read, >=, + rwa->os->os_dsl_dataset->ds_resume_bytes[txgoff]); + + rwa->os->os_dsl_dataset->ds_resume_object[txgoff] = object; + rwa->os->os_dsl_dataset->ds_resume_offset[txgoff] = offset; + rwa->os->os_dsl_dataset->ds_resume_bytes[txgoff] = rwa->bytes_read; +} + +noinline static int +receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, + void *data) +{ + dmu_object_info_t doi; + dmu_tx_t *tx; + uint64_t object; + int err; + + if (drro->drr_type == DMU_OT_NONE || + !DMU_OT_IS_VALID(drro->drr_type) || + !DMU_OT_IS_VALID(drro->drr_bonustype) || + drro->drr_checksumtype >= ZIO_CHECKSUM_FUNCTIONS || + drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS || + P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) || + drro->drr_blksz < SPA_MINBLOCKSIZE || + drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(rwa->os)) || + drro->drr_bonuslen > + DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(rwa->os)))) { + return (SET_ERROR(EINVAL)); + } + + err = dmu_object_info(rwa->os, drro->drr_object, &doi); + + if (err != 0 && err != ENOENT) + return (SET_ERROR(EINVAL)); + object = err == 0 ? drro->drr_object : DMU_NEW_OBJECT; + + if (drro->drr_object > rwa->max_object) + rwa->max_object = drro->drr_object; + + /* + * If we are losing blkptrs or changing the block size this must + * be a new file instance. We must clear out the previous file + * contents before we can change this type of metadata in the dnode. + */ + if (err == 0) { + int nblkptr; + + nblkptr = deduce_nblkptr(drro->drr_bonustype, + drro->drr_bonuslen); + + if (drro->drr_blksz != doi.doi_data_block_size || + nblkptr < doi.doi_nblkptr) { + err = dmu_free_long_range(rwa->os, drro->drr_object, + 0, DMU_OBJECT_END); + if (err != 0) + return (SET_ERROR(EINVAL)); + } + } + + tx = dmu_tx_create(rwa->os); + dmu_tx_hold_bonus(tx, object); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err != 0) { + dmu_tx_abort(tx); + return (err); + } + + if (object == DMU_NEW_OBJECT) { + /* currently free, want to be allocated */ + err = dmu_object_claim_dnsize(rwa->os, drro->drr_object, + drro->drr_type, drro->drr_blksz, + drro->drr_bonustype, drro->drr_bonuslen, + drro->drr_dn_slots << DNODE_SHIFT, tx); + } else if (drro->drr_type != doi.doi_type || + drro->drr_blksz != doi.doi_data_block_size || + drro->drr_bonustype != doi.doi_bonus_type || + drro->drr_bonuslen != doi.doi_bonus_size) { + /* currently allocated, but with different properties */ + err = dmu_object_reclaim(rwa->os, drro->drr_object, + drro->drr_type, drro->drr_blksz, + drro->drr_bonustype, drro->drr_bonuslen, tx); + } + if (err != 0) { + dmu_tx_commit(tx); + return (SET_ERROR(EINVAL)); + } + + dmu_object_set_checksum(rwa->os, drro->drr_object, + drro->drr_checksumtype, tx); + dmu_object_set_compress(rwa->os, drro->drr_object, + drro->drr_compress, tx); + + if (data != NULL) { + dmu_buf_t *db; + + VERIFY0(dmu_bonus_hold(rwa->os, drro->drr_object, FTAG, &db)); + dmu_buf_will_dirty(db, tx); + + ASSERT3U(db->db_size, >=, drro->drr_bonuslen); + bcopy(data, db->db_data, drro->drr_bonuslen); + if (rwa->byteswap) { + dmu_object_byteswap_t byteswap = + DMU_OT_BYTESWAP(drro->drr_bonustype); + dmu_ot_byteswap[byteswap].ob_func(db->db_data, + drro->drr_bonuslen); + } + dmu_buf_rele(db, FTAG); + } + dmu_tx_commit(tx); + + return (0); +} + +/* ARGSUSED */ +noinline static int 
+receive_freeobjects(struct receive_writer_arg *rwa, + struct drr_freeobjects *drrfo) +{ + uint64_t obj; + int next_err = 0; + + if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj) + return (SET_ERROR(EINVAL)); + + for (obj = drrfo->drr_firstobj == 0 ? 1 : drrfo->drr_firstobj; + obj < drrfo->drr_firstobj + drrfo->drr_numobjs && next_err == 0; + next_err = dmu_object_next(rwa->os, &obj, FALSE, 0)) { + dmu_object_info_t doi; + int err; + + err = dmu_object_info(rwa->os, obj, &doi); + if (err == ENOENT) { + obj++; + continue; + } else if (err != 0) { + return (err); + } + + err = dmu_free_long_object(rwa->os, obj); + if (err != 0) + return (err); + + if (obj > rwa->max_object) + rwa->max_object = obj; + } + if (next_err != ESRCH) + return (next_err); + return (0); +} + +noinline static int +receive_write(struct receive_writer_arg *rwa, struct drr_write *drrw, + arc_buf_t *abuf) +{ + dmu_tx_t *tx; + int err; + + if (drrw->drr_offset + drrw->drr_logical_size < drrw->drr_offset || + !DMU_OT_IS_VALID(drrw->drr_type)) + return (SET_ERROR(EINVAL)); + + /* + * For resuming to work, records must be in increasing order + * by (object, offset). + */ + if (drrw->drr_object < rwa->last_object || + (drrw->drr_object == rwa->last_object && + drrw->drr_offset < rwa->last_offset)) { + return (SET_ERROR(EINVAL)); + } + rwa->last_object = drrw->drr_object; + rwa->last_offset = drrw->drr_offset; + + if (rwa->last_object > rwa->max_object) + rwa->max_object = rwa->last_object; + + if (dmu_object_info(rwa->os, drrw->drr_object, NULL) != 0) + return (SET_ERROR(EINVAL)); + + tx = dmu_tx_create(rwa->os); + dmu_tx_hold_write(tx, drrw->drr_object, + drrw->drr_offset, drrw->drr_logical_size); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err != 0) { + dmu_tx_abort(tx); + return (err); + } + if (rwa->byteswap) { + dmu_object_byteswap_t byteswap = + DMU_OT_BYTESWAP(drrw->drr_type); + dmu_ot_byteswap[byteswap].ob_func(abuf->b_data, + DRR_WRITE_PAYLOAD_SIZE(drrw)); + } + + /* use the bonus buf to look up the dnode in dmu_assign_arcbuf */ + dmu_buf_t *bonus; + if (dmu_bonus_hold(rwa->os, drrw->drr_object, FTAG, &bonus) != 0) + return (SET_ERROR(EINVAL)); + dmu_assign_arcbuf(bonus, drrw->drr_offset, abuf, tx); + + /* + * Note: If the receive fails, we want the resume stream to start + * with the same record that we last successfully received (as opposed + * to the next record), so that we can verify that we are + * resuming from the correct location. + */ + save_resume_state(rwa, drrw->drr_object, drrw->drr_offset, tx); + dmu_tx_commit(tx); + dmu_buf_rele(bonus, FTAG); + + return (0); +} + +/* + * Handle a DRR_WRITE_BYREF record. This record is used in dedup'ed + * streams to refer to a copy of the data that is already on the + * system because it came in earlier in the stream. This function + * finds the earlier copy of the data, and uses that copy instead of + * data from the stream to fulfill this write. + */ +static int +receive_write_byref(struct receive_writer_arg *rwa, + struct drr_write_byref *drrwbr) +{ + dmu_tx_t *tx; + int err; + guid_map_entry_t gmesrch; + guid_map_entry_t *gmep; + avl_index_t where; + objset_t *ref_os = NULL; + dmu_buf_t *dbp; + + if (drrwbr->drr_offset + drrwbr->drr_length < drrwbr->drr_offset) + return (SET_ERROR(EINVAL)); + + /* + * If the GUID of the referenced dataset is different from the + * GUID of the target dataset, find the referenced dataset. 
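+ *
+ * In effect, a WRITE_BYREF record says "the data for
+ * (drr_object, drr_offset, drr_length) is identical to what was
+ * already received at (drr_refguid, drr_refobject,
+ * drr_refoffset)", so we re-read that earlier copy locally via
+ * the guid_to_ds_map instead of pulling it off the stream.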
+ */
+ if (drrwbr->drr_toguid != drrwbr->drr_refguid) {
+ gmesrch.guid = drrwbr->drr_refguid;
+ if ((gmep = avl_find(rwa->guid_to_ds_map, &gmesrch,
+ &where)) == NULL) {
+ return (SET_ERROR(EINVAL));
+ }
+ if (dmu_objset_from_ds(gmep->gme_ds, &ref_os))
+ return (SET_ERROR(EINVAL));
+ } else {
+ ref_os = rwa->os;
+ }
+
+ if (drrwbr->drr_object > rwa->max_object)
+ rwa->max_object = drrwbr->drr_object;
+
+ err = dmu_buf_hold(ref_os, drrwbr->drr_refobject,
+ drrwbr->drr_refoffset, FTAG, &dbp, DMU_READ_PREFETCH);
+ if (err != 0)
+ return (err);
+
+ tx = dmu_tx_create(rwa->os);
+
+ dmu_tx_hold_write(tx, drrwbr->drr_object,
+ drrwbr->drr_offset, drrwbr->drr_length);
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err != 0) {
+ dmu_tx_abort(tx);
+ return (err);
+ }
+ dmu_write(rwa->os, drrwbr->drr_object,
+ drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx);
+ dmu_buf_rele(dbp, FTAG);
+
+ /* See comment in receive_write(). */
+ save_resume_state(rwa, drrwbr->drr_object, drrwbr->drr_offset, tx);
+ dmu_tx_commit(tx);
+ return (0);
+}
+
+static int
+receive_write_embedded(struct receive_writer_arg *rwa,
+ struct drr_write_embedded *drrwe, void *data)
+{
+ dmu_tx_t *tx;
+ int err;
+
+ if (drrwe->drr_offset + drrwe->drr_length < drrwe->drr_offset)
+ return (SET_ERROR(EINVAL));
+
+ if (drrwe->drr_psize > BPE_PAYLOAD_SIZE)
+ return (SET_ERROR(EINVAL));
+
+ if (drrwe->drr_etype >= NUM_BP_EMBEDDED_TYPES)
+ return (SET_ERROR(EINVAL));
+ if (drrwe->drr_compression >= ZIO_COMPRESS_FUNCTIONS)
+ return (SET_ERROR(EINVAL));
+
+ if (drrwe->drr_object > rwa->max_object)
+ rwa->max_object = drrwe->drr_object;
+
+ tx = dmu_tx_create(rwa->os);
+
+ dmu_tx_hold_write(tx, drrwe->drr_object,
+ drrwe->drr_offset, drrwe->drr_length);
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err != 0) {
+ dmu_tx_abort(tx);
+ return (err);
+ }
+
+ dmu_write_embedded(rwa->os, drrwe->drr_object,
+ drrwe->drr_offset, data, drrwe->drr_etype,
+ drrwe->drr_compression, drrwe->drr_lsize, drrwe->drr_psize,
+ rwa->byteswap ^ ZFS_HOST_BYTEORDER, tx);
+
+ /* See comment in receive_write().
*/ + save_resume_state(rwa, drrwe->drr_object, drrwe->drr_offset, tx); + dmu_tx_commit(tx); + return (0); +} + +static int +receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs, + void *data) +{ + dmu_tx_t *tx; + dmu_buf_t *db, *db_spill; + int err; + + if (drrs->drr_length < SPA_MINBLOCKSIZE || + drrs->drr_length > spa_maxblocksize(dmu_objset_spa(rwa->os))) + return (SET_ERROR(EINVAL)); + + if (dmu_object_info(rwa->os, drrs->drr_object, NULL) != 0) + return (SET_ERROR(EINVAL)); + + if (drrs->drr_object > rwa->max_object) + rwa->max_object = drrs->drr_object; + + VERIFY0(dmu_bonus_hold(rwa->os, drrs->drr_object, FTAG, &db)); + if ((err = dmu_spill_hold_by_bonus(db, FTAG, &db_spill)) != 0) { + dmu_buf_rele(db, FTAG); + return (err); + } + + tx = dmu_tx_create(rwa->os); + + dmu_tx_hold_spill(tx, db->db_object); + + err = dmu_tx_assign(tx, TXG_WAIT); + if (err != 0) { + dmu_buf_rele(db, FTAG); + dmu_buf_rele(db_spill, FTAG); + dmu_tx_abort(tx); + return (err); + } + dmu_buf_will_dirty(db_spill, tx); + + if (db_spill->db_size < drrs->drr_length) + VERIFY(0 == dbuf_spill_set_blksz(db_spill, + drrs->drr_length, tx)); + bcopy(data, db_spill->db_data, drrs->drr_length); + + dmu_buf_rele(db, FTAG); + dmu_buf_rele(db_spill, FTAG); + + dmu_tx_commit(tx); + return (0); +} + +/* ARGSUSED */ +noinline static int +receive_free(struct receive_writer_arg *rwa, struct drr_free *drrf) +{ + int err; + + if (drrf->drr_length != -1ULL && + drrf->drr_offset + drrf->drr_length < drrf->drr_offset) + return (SET_ERROR(EINVAL)); + + if (dmu_object_info(rwa->os, drrf->drr_object, NULL) != 0) + return (SET_ERROR(EINVAL)); + + if (drrf->drr_object > rwa->max_object) + rwa->max_object = drrf->drr_object; + + err = dmu_free_long_range(rwa->os, drrf->drr_object, + drrf->drr_offset, drrf->drr_length); + + return (err); +} + +/* used to destroy the drc_ds on error */ +static void +dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc) +{ + if (drc->drc_resumable) { + /* wait for our resume state to be written to disk */ + txg_wait_synced(drc->drc_ds->ds_dir->dd_pool, 0); + dsl_dataset_disown(drc->drc_ds, dmu_recv_tag); + } else { + char name[ZFS_MAX_DATASET_NAME_LEN]; + dsl_dataset_name(drc->drc_ds, name); + dsl_dataset_disown(drc->drc_ds, dmu_recv_tag); + (void) dsl_destroy_head(name); + } +} + +static void +receive_cksum(struct receive_arg *ra, int len, void *buf) +{ + if (ra->byteswap) { + (void) fletcher_4_incremental_byteswap(buf, len, &ra->cksum); + } else { + (void) fletcher_4_incremental_native(buf, len, &ra->cksum); + } +} + +/* + * Read the payload into a buffer of size len, and update the current record's + * payload field. + * Allocate ra->next_rrd and read the next record's header into + * ra->next_rrd->header. + * Verify checksum of payload and next record. 
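+ *
+ * Roughly, the stream is framed as:
+ *
+ *	... | header N | payload N | header N+1 | payload N+1 | ...
+ *
+ * where each header carries a fletcher-4 checksum accumulated over
+ * everything in the stream before that header's own checksum field.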
+ */ +static int +receive_read_payload_and_next_header(struct receive_arg *ra, int len, void *buf) +{ + int err; + + if (len != 0) { + ASSERT3U(len, <=, SPA_MAXBLOCKSIZE); + err = receive_read(ra, len, buf); + if (err != 0) + return (err); + receive_cksum(ra, len, buf); + + /* note: rrd is NULL when reading the begin record's payload */ + if (ra->rrd != NULL) { + ra->rrd->payload = buf; + ra->rrd->payload_size = len; + ra->rrd->bytes_read = ra->bytes_read; + } + } + + ra->prev_cksum = ra->cksum; + + ra->next_rrd = kmem_zalloc(sizeof (*ra->next_rrd), KM_SLEEP); + err = receive_read(ra, sizeof (ra->next_rrd->header), + &ra->next_rrd->header); + ra->next_rrd->bytes_read = ra->bytes_read; + if (err != 0) { + kmem_free(ra->next_rrd, sizeof (*ra->next_rrd)); + ra->next_rrd = NULL; + return (err); + } + if (ra->next_rrd->header.drr_type == DRR_BEGIN) { + kmem_free(ra->next_rrd, sizeof (*ra->next_rrd)); + ra->next_rrd = NULL; + return (SET_ERROR(EINVAL)); + } + + /* + * Note: checksum is of everything up to but not including the + * checksum itself. + */ + ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), + ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t)); + receive_cksum(ra, + offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), + &ra->next_rrd->header); + + zio_cksum_t cksum_orig = + ra->next_rrd->header.drr_u.drr_checksum.drr_checksum; + zio_cksum_t *cksump = + &ra->next_rrd->header.drr_u.drr_checksum.drr_checksum; + + if (ra->byteswap) + byteswap_record(&ra->next_rrd->header); + + if ((!ZIO_CHECKSUM_IS_ZERO(cksump)) && + !ZIO_CHECKSUM_EQUAL(ra->cksum, *cksump)) { + kmem_free(ra->next_rrd, sizeof (*ra->next_rrd)); + ra->next_rrd = NULL; + return (SET_ERROR(ECKSUM)); + } + + receive_cksum(ra, sizeof (cksum_orig), &cksum_orig); + + return (0); +} + +static void +objlist_create(struct objlist *list) +{ + list_create(&list->list, sizeof (struct receive_objnode), + offsetof(struct receive_objnode, node)); + list->last_lookup = 0; +} + +static void +objlist_destroy(struct objlist *list) +{ + for (struct receive_objnode *n = list_remove_head(&list->list); + n != NULL; n = list_remove_head(&list->list)) { + kmem_free(n, sizeof (*n)); + } + list_destroy(&list->list); +} + +/* + * This function looks through the objlist to see if the specified object number + * is contained in the objlist. In the process, it will remove all object + * numbers in the list that are smaller than the specified object number. Thus, + * any lookup of an object number smaller than a previously looked up object + * number will always return false; therefore, all lookups should be done in + * ascending order. + */ +static boolean_t +objlist_exists(struct objlist *list, uint64_t object) +{ + struct receive_objnode *node = list_head(&list->list); + ASSERT3U(object, >=, list->last_lookup); + list->last_lookup = object; + while (node != NULL && node->object < object) { + VERIFY3P(node, ==, list_remove_head(&list->list)); + kmem_free(node, sizeof (*node)); + node = list_head(&list->list); + } + return (node != NULL && node->object == object); +} + +/* + * The objlist is a list of object numbers stored in ascending order. However, + * the insertion of new object numbers does not seek out the correct location to + * store a new object number; instead, it appends it to the list for simplicity. + * Thus, any users must take care to only insert new object numbers in ascending + * order. 
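+ *
+ * For example (hypothetical object numbers; lookups must ascend):
+ *
+ *	objlist_insert(&list, 5);
+ *	objlist_insert(&list, 93);
+ *	objlist_exists(&list, 5);	-> B_TRUE, list is {5, 93}
+ *	objlist_exists(&list, 93);	-> B_TRUE, node 5 is trimmed
+ *	objlist_exists(&list, 9);	-> would trip the ascending-
+ *					   order ASSERT, since 9 < 93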
+ */ +static void +objlist_insert(struct objlist *list, uint64_t object) +{ + struct receive_objnode *node = kmem_zalloc(sizeof (*node), KM_SLEEP); + node->object = object; +#ifdef ZFS_DEBUG + struct receive_objnode *last_object = list_tail(&list->list); + uint64_t last_objnum = (last_object != NULL ? last_object->object : 0); + ASSERT3U(node->object, >, last_objnum); +#endif + list_insert_tail(&list->list, node); +} + +/* + * Issue the prefetch reads for any necessary indirect blocks. + * + * We use the object ignore list to tell us whether or not to issue prefetches + * for a given object. We do this for both correctness (in case the blocksize + * of an object has changed) and performance (if the object doesn't exist, don't + * needlessly try to issue prefetches). We also trim the list as we go through + * the stream to prevent it from growing to an unbounded size. + * + * The object numbers within will always be in sorted order, and any write + * records we see will also be in sorted order, but they're not sorted with + * respect to each other (i.e. we can get several object records before + * receiving each object's write records). As a result, once we've reached a + * given object number, we can safely remove any reference to lower object + * numbers in the ignore list. In practice, we receive up to 32 object records + * before receiving write records, so the list can have up to 32 nodes in it. + */ +/* ARGSUSED */ +static void +receive_read_prefetch(struct receive_arg *ra, + uint64_t object, uint64_t offset, uint64_t length) +{ + if (!objlist_exists(&ra->ignore_objlist, object)) { + dmu_prefetch(ra->os, object, 1, offset, length, + ZIO_PRIORITY_SYNC_READ); + } +} + +/* + * Read records off the stream, issuing any necessary prefetches. + */ +static int +receive_read_record(struct receive_arg *ra) +{ + int err; + + switch (ra->rrd->header.drr_type) { + case DRR_OBJECT: + { + struct drr_object *drro = &ra->rrd->header.drr_u.drr_object; + uint32_t size = P2ROUNDUP(drro->drr_bonuslen, 8); + void *buf = kmem_zalloc(size, KM_SLEEP); + dmu_object_info_t doi; + err = receive_read_payload_and_next_header(ra, size, buf); + if (err != 0) { + kmem_free(buf, size); + return (err); + } + err = dmu_object_info(ra->os, drro->drr_object, &doi); + /* + * See receive_read_prefetch for an explanation why we're + * storing this object in the ignore_obj_list. 
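+ * ("ignore_obj_list" is the ra->ignore_objlist field: objects
+ * that don't exist yet, or whose block size is about to change,
+ * are recorded so that no stale prefetches are issued for them.)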
+ */ + if (err == ENOENT || + (err == 0 && doi.doi_data_block_size != drro->drr_blksz)) { + objlist_insert(&ra->ignore_objlist, drro->drr_object); + err = 0; + } + return (err); + } + case DRR_FREEOBJECTS: + { + err = receive_read_payload_and_next_header(ra, 0, NULL); + return (err); + } + case DRR_WRITE: + { + struct drr_write *drrw = &ra->rrd->header.drr_u.drr_write; + arc_buf_t *abuf; + boolean_t is_meta = DMU_OT_IS_METADATA(drrw->drr_type); + if (DRR_WRITE_COMPRESSED(drrw)) { + ASSERT3U(drrw->drr_compressed_size, >, 0); + ASSERT3U(drrw->drr_logical_size, >=, + drrw->drr_compressed_size); + ASSERT(!is_meta); + abuf = arc_loan_compressed_buf( + dmu_objset_spa(ra->os), + drrw->drr_compressed_size, drrw->drr_logical_size, + drrw->drr_compressiontype); + } else { + abuf = arc_loan_buf(dmu_objset_spa(ra->os), + is_meta, drrw->drr_logical_size); + } + + err = receive_read_payload_and_next_header(ra, + DRR_WRITE_PAYLOAD_SIZE(drrw), abuf->b_data); + if (err != 0) { + dmu_return_arcbuf(abuf); + return (err); + } + ra->rrd->write_buf = abuf; + receive_read_prefetch(ra, drrw->drr_object, drrw->drr_offset, + drrw->drr_logical_size); + return (err); + } + case DRR_WRITE_BYREF: + { + struct drr_write_byref *drrwb = + &ra->rrd->header.drr_u.drr_write_byref; + err = receive_read_payload_and_next_header(ra, 0, NULL); + receive_read_prefetch(ra, drrwb->drr_object, drrwb->drr_offset, + drrwb->drr_length); + return (err); + } + case DRR_WRITE_EMBEDDED: + { + struct drr_write_embedded *drrwe = + &ra->rrd->header.drr_u.drr_write_embedded; + uint32_t size = P2ROUNDUP(drrwe->drr_psize, 8); + void *buf = kmem_zalloc(size, KM_SLEEP); + + err = receive_read_payload_and_next_header(ra, size, buf); + if (err != 0) { + kmem_free(buf, size); + return (err); + } + + receive_read_prefetch(ra, drrwe->drr_object, drrwe->drr_offset, + drrwe->drr_length); + return (err); + } + case DRR_FREE: + { + /* + * It might be beneficial to prefetch indirect blocks here, but + * we don't really have the data to decide for sure. + */ + err = receive_read_payload_and_next_header(ra, 0, NULL); + return (err); + } + case DRR_END: + { + struct drr_end *drre = &ra->rrd->header.drr_u.drr_end; + if (!ZIO_CHECKSUM_EQUAL(ra->prev_cksum, drre->drr_checksum)) + return (SET_ERROR(ECKSUM)); + return (0); + } + case DRR_SPILL: + { + struct drr_spill *drrs = &ra->rrd->header.drr_u.drr_spill; + void *buf = kmem_zalloc(drrs->drr_length, KM_SLEEP); + err = receive_read_payload_and_next_header(ra, drrs->drr_length, + buf); + if (err != 0) + kmem_free(buf, drrs->drr_length); + return (err); + } + default: + return (SET_ERROR(EINVAL)); + } +} + +/* + * Commit the records to the pool. + */ +static int +receive_process_record(struct receive_writer_arg *rwa, + struct receive_record_arg *rrd) +{ + int err; + + /* Processing in order, therefore bytes_read should be increasing. 
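+ * The main thread enqueues records in stream order and this
+ * worker is the only consumer, so a decrease would indicate a
+ * bookkeeping bug rather than a reordered stream.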
*/
+ ASSERT3U(rrd->bytes_read, >=, rwa->bytes_read);
+ rwa->bytes_read = rrd->bytes_read;
+
+ switch (rrd->header.drr_type) {
+ case DRR_OBJECT:
+ {
+ struct drr_object *drro = &rrd->header.drr_u.drr_object;
+ err = receive_object(rwa, drro, rrd->payload);
+ kmem_free(rrd->payload, rrd->payload_size);
+ rrd->payload = NULL;
+ return (err);
+ }
+ case DRR_FREEOBJECTS:
+ {
+ struct drr_freeobjects *drrfo =
+ &rrd->header.drr_u.drr_freeobjects;
+ return (receive_freeobjects(rwa, drrfo));
+ }
+ case DRR_WRITE:
+ {
+ struct drr_write *drrw = &rrd->header.drr_u.drr_write;
+ err = receive_write(rwa, drrw, rrd->write_buf);
+ /* if receive_write() is successful, it consumes the arc_buf */
+ if (err != 0)
+ dmu_return_arcbuf(rrd->write_buf);
+ rrd->write_buf = NULL;
+ rrd->payload = NULL;
+ return (err);
+ }
+ case DRR_WRITE_BYREF:
+ {
+ struct drr_write_byref *drrwbr =
+ &rrd->header.drr_u.drr_write_byref;
+ return (receive_write_byref(rwa, drrwbr));
+ }
+ case DRR_WRITE_EMBEDDED:
+ {
+ struct drr_write_embedded *drrwe =
+ &rrd->header.drr_u.drr_write_embedded;
+ err = receive_write_embedded(rwa, drrwe, rrd->payload);
+ kmem_free(rrd->payload, rrd->payload_size);
+ rrd->payload = NULL;
+ return (err);
+ }
+ case DRR_FREE:
+ {
+ struct drr_free *drrf = &rrd->header.drr_u.drr_free;
+ return (receive_free(rwa, drrf));
+ }
+ case DRR_SPILL:
+ {
+ struct drr_spill *drrs = &rrd->header.drr_u.drr_spill;
+ err = receive_spill(rwa, drrs, rrd->payload);
+ kmem_free(rrd->payload, rrd->payload_size);
+ rrd->payload = NULL;
+ return (err);
+ }
+ default:
+ return (SET_ERROR(EINVAL));
+ }
+}
+
+/*
+ * dmu_recv_stream's worker thread; pull records off the queue, and then call
+ * receive_process_record. When we're done, signal the main thread and exit.
+ */
+static void
+receive_writer_thread(void *arg)
+{
+ struct receive_writer_arg *rwa = arg;
+ struct receive_record_arg *rrd;
+ for (rrd = bqueue_dequeue(&rwa->q); !rrd->eos_marker;
+ rrd = bqueue_dequeue(&rwa->q)) {
+ /*
+ * If there's an error, the main thread will stop putting things
+ * on the queue, but we need to clear everything in it before we
+ * can exit.
+ */
+ if (rwa->err == 0) {
+ rwa->err = receive_process_record(rwa, rrd);
+ } else if (rrd->write_buf != NULL) {
+ dmu_return_arcbuf(rrd->write_buf);
+ rrd->write_buf = NULL;
+ rrd->payload = NULL;
+ } else if (rrd->payload != NULL) {
+ kmem_free(rrd->payload, rrd->payload_size);
+ rrd->payload = NULL;
+ }
+ kmem_free(rrd, sizeof (*rrd));
+ }
+ kmem_free(rrd, sizeof (*rrd));
+ mutex_enter(&rwa->mutex);
+ rwa->done = B_TRUE;
+ cv_signal(&rwa->cv);
+ mutex_exit(&rwa->mutex);
+ thread_exit();
+}
+
+static int
+resume_check(struct receive_arg *ra, nvlist_t *begin_nvl)
+{
+ uint64_t val;
+ objset_t *mos = dmu_objset_pool(ra->os)->dp_meta_objset;
+ uint64_t dsobj = dmu_objset_id(ra->os);
+ uint64_t resume_obj, resume_off;
+
+ if (nvlist_lookup_uint64(begin_nvl,
+ "resume_object", &resume_obj) != 0 ||
+ nvlist_lookup_uint64(begin_nvl,
+ "resume_offset", &resume_off) != 0) {
+ return (SET_ERROR(EINVAL));
+ }
+ VERIFY0(zap_lookup(mos, dsobj,
+ DS_FIELD_RESUME_OBJECT, sizeof (val), 1, &val));
+ if (resume_obj != val)
+ return (SET_ERROR(EINVAL));
+ VERIFY0(zap_lookup(mos, dsobj,
+ DS_FIELD_RESUME_OFFSET, sizeof (val), 1, &val));
+ if (resume_off != val)
+ return (SET_ERROR(EINVAL));
+
+ return (0);
+}
+
+/*
+ * Read in the stream's records, one by one, and apply them to the pool.
There
+ * are two threads involved; the thread that calls this function will spin up
+ * a worker thread, read the records off the stream one by one, and issue
+ * prefetches for any necessary indirect blocks. It will then push the records
+ * onto an internal blocking queue. The worker thread will pull the records off
+ * the queue, and actually write the data into the DMU. This way, the worker
+ * thread doesn't have to wait for reads to complete, since everything it needs
+ * (the indirect blocks) will be prefetched.
+ *
+ * NB: callers *must* call dmu_recv_end() if this succeeds.
+ */
+int
+dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp,
+ int cleanup_fd, uint64_t *action_handlep)
+{
+ int err = 0;
+ struct receive_arg ra = { 0 };
+ struct receive_writer_arg rwa = { 0 };
+ int featureflags;
+ nvlist_t *begin_nvl = NULL;
+
+ ra.byteswap = drc->drc_byteswap;
+ ra.cksum = drc->drc_cksum;
+ ra.td = curthread;
+ ra.fp = fp;
+ ra.voff = *voffp;
+
+ if (dsl_dataset_is_zapified(drc->drc_ds)) {
+ (void) zap_lookup(drc->drc_ds->ds_dir->dd_pool->dp_meta_objset,
+ drc->drc_ds->ds_object, DS_FIELD_RESUME_BYTES,
+ sizeof (ra.bytes_read), 1, &ra.bytes_read);
+ }
+
+ objlist_create(&ra.ignore_objlist);
+
+ /* these were verified in dmu_recv_begin */
+ ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==,
+ DMU_SUBSTREAM);
+ ASSERT3U(drc->drc_drrb->drr_type, <, DMU_OST_NUMTYPES);
+
+ /*
+ * Open the objset we are modifying.
+ */
+ VERIFY0(dmu_objset_from_ds(drc->drc_ds, &ra.os));
+
+ ASSERT(dsl_dataset_phys(drc->drc_ds)->ds_flags & DS_FLAG_INCONSISTENT);
+
+ featureflags = DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo);
+
+ /* if this stream is dedup'ed, set up the avl tree for guid mapping */
+ if (featureflags & DMU_BACKUP_FEATURE_DEDUP) {
+ minor_t minor;
+
+ if (cleanup_fd == -1) {
+ err = SET_ERROR(EBADF);
+ goto out;
+ }
+ err = zfs_onexit_fd_hold(cleanup_fd, &minor);
+ if (err != 0) {
+ cleanup_fd = -1;
+ goto out;
+ }
+
+ if (*action_handlep == 0) {
+ rwa.guid_to_ds_map =
+ kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
+ avl_create(rwa.guid_to_ds_map, guid_compare,
+ sizeof (guid_map_entry_t),
+ offsetof(guid_map_entry_t, avlnode));
+ err = zfs_onexit_add_cb(minor,
+ free_guid_map_onexit, rwa.guid_to_ds_map,
+ action_handlep);
+ if (err != 0)
+ goto out;
+ } else {
+ err = zfs_onexit_cb_data(minor, *action_handlep,
+ (void **)&rwa.guid_to_ds_map);
+ if (err != 0)
+ goto out;
+ }
+
+ drc->drc_guid_to_ds_map = rwa.guid_to_ds_map;
+ }
+
+ uint32_t payloadlen = drc->drc_drr_begin->drr_payloadlen;
+ void *payload = NULL;
+ if (payloadlen != 0)
+ payload = kmem_alloc(payloadlen, KM_SLEEP);
+
+ err = receive_read_payload_and_next_header(&ra, payloadlen, payload);
+ if (err != 0) {
+ if (payloadlen != 0)
+ kmem_free(payload, payloadlen);
+ goto out;
+ }
+ if (payloadlen != 0) {
+ err = nvlist_unpack(payload, payloadlen, &begin_nvl, KM_SLEEP);
+ kmem_free(payload, payloadlen);
+ if (err != 0)
+ goto out;
+ }
+
+ if (featureflags & DMU_BACKUP_FEATURE_RESUMING) {
+ err = resume_check(&ra, begin_nvl);
+ if (err != 0)
+ goto out;
+ }
+
+ (void) bqueue_init(&rwa.q, zfs_recv_queue_length,
+ offsetof(struct receive_record_arg, node));
+ cv_init(&rwa.cv, NULL, CV_DEFAULT, NULL);
+ mutex_init(&rwa.mutex, NULL, MUTEX_DEFAULT, NULL);
+ rwa.os = ra.os;
+ rwa.byteswap = drc->drc_byteswap;
+ rwa.resumable = drc->drc_resumable;
+
+ (void) thread_create(NULL, 0, receive_writer_thread, &rwa, 0, &p0,
+ TS_RUN, minclsyspri);
+ /*
+ * We're reading rwa.err without
locks, which is safe since we are the + * only reader, and the worker thread is the only writer. It's ok if we + * miss a write for an iteration or two of the loop, since the writer + * thread will keep freeing records we send it until we send it an eos + * marker. + * + * We can leave this loop in 3 ways: First, if rwa.err is + * non-zero. In that case, the writer thread will free the rrd we just + * pushed. Second, if we're interrupted; in that case, either it's the + * first loop and ra.rrd was never allocated, or it's later, and ra.rrd + * has been handed off to the writer thread who will free it. Finally, + * if receive_read_record fails or we're at the end of the stream, then + * we free ra.rrd and exit. + */ + while (rwa.err == 0) { + if (issig(JUSTLOOKING) && issig(FORREAL)) { + err = SET_ERROR(EINTR); + break; + } + + ASSERT3P(ra.rrd, ==, NULL); + ra.rrd = ra.next_rrd; + ra.next_rrd = NULL; + /* Allocates and loads header into ra.next_rrd */ + err = receive_read_record(&ra); + + if (ra.rrd->header.drr_type == DRR_END || err != 0) { + kmem_free(ra.rrd, sizeof (*ra.rrd)); + ra.rrd = NULL; + break; + } + + bqueue_enqueue(&rwa.q, ra.rrd, + sizeof (struct receive_record_arg) + ra.rrd->payload_size); + ra.rrd = NULL; + } + if (ra.next_rrd == NULL) + ra.next_rrd = kmem_zalloc(sizeof (*ra.next_rrd), KM_SLEEP); + ra.next_rrd->eos_marker = B_TRUE; + bqueue_enqueue(&rwa.q, ra.next_rrd, 1); + + mutex_enter(&rwa.mutex); + while (!rwa.done) { + cv_wait(&rwa.cv, &rwa.mutex); + } + mutex_exit(&rwa.mutex); + + /* + * If we are receiving a full stream as a clone, all object IDs which + * are greater than the maximum ID referenced in the stream are + * by definition unused and must be freed. Note that it's possible that + * we've resumed this send and the first record we received was the END + * record. In that case, max_object would be 0, but we shouldn't start + * freeing all objects from there; instead we should start from the + * resumeobj. + */ + if (drc->drc_clone && drc->drc_drrb->drr_fromguid == 0) { + uint64_t obj; + if (nvlist_lookup_uint64(begin_nvl, "resume_object", &obj) != 0) + obj = 0; + if (rwa.max_object > obj) + obj = rwa.max_object; + obj++; + int free_err = 0; + int next_err = 0; + + while (next_err == 0) { + free_err = dmu_free_long_object(rwa.os, obj); + if (free_err != 0 && free_err != ENOENT) + break; + + next_err = dmu_object_next(rwa.os, &obj, FALSE, 0); + } + + if (err == 0) { + if (free_err != 0 && free_err != ENOENT) + err = free_err; + else if (next_err != ESRCH) + err = next_err; + } + } + + cv_destroy(&rwa.cv); + mutex_destroy(&rwa.mutex); + bqueue_destroy(&rwa.q); + if (err == 0) + err = rwa.err; + +out: + nvlist_free(begin_nvl); + if ((featureflags & DMU_BACKUP_FEATURE_DEDUP) && (cleanup_fd != -1)) + zfs_onexit_fd_rele(cleanup_fd); + + if (err != 0) { + /* + * Clean up references. If receive is not resumable, + * destroy what we created, so we don't leave it in + * the inconsistent state. + */ + dmu_recv_cleanup_ds(drc); + } + + *voffp = ra.voff; + objlist_destroy(&ra.ignore_objlist); + return (err); +} + +static int +dmu_recv_end_check(void *arg, dmu_tx_t *tx) +{ + dmu_recv_cookie_t *drc = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + int error; + + ASSERT3P(drc->drc_ds->ds_owner, ==, dmu_recv_tag); + + if (!drc->drc_newfs) { + dsl_dataset_t *origin_head; + + error = dsl_dataset_hold(dp, drc->drc_tofs, FTAG, &origin_head); + if (error != 0) + return (error); + if (drc->drc_force) { + /* + * We will destroy any snapshots in tofs (i.e. 
before + * origin_head) that are after the origin (which is + * the snap before drc_ds, because drc_ds can not + * have any snaps of its own). + */ + uint64_t obj; + + obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj; + while (obj != + dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) { + dsl_dataset_t *snap; + error = dsl_dataset_hold_obj(dp, obj, FTAG, + &snap); + if (error != 0) + break; + if (snap->ds_dir != origin_head->ds_dir) + error = SET_ERROR(EINVAL); + if (error == 0) { + error = dsl_destroy_snapshot_check_impl( + snap, B_FALSE); + } + obj = dsl_dataset_phys(snap)->ds_prev_snap_obj; + dsl_dataset_rele(snap, FTAG); + if (error != 0) + break; + } + if (error != 0) { + dsl_dataset_rele(origin_head, FTAG); + return (error); + } + } + error = dsl_dataset_clone_swap_check_impl(drc->drc_ds, + origin_head, drc->drc_force, drc->drc_owner, tx); + if (error != 0) { + dsl_dataset_rele(origin_head, FTAG); + return (error); + } + error = dsl_dataset_snapshot_check_impl(origin_head, + drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred); + dsl_dataset_rele(origin_head, FTAG); + if (error != 0) + return (error); + + error = dsl_destroy_head_check_impl(drc->drc_ds, 1); + } else { + error = dsl_dataset_snapshot_check_impl(drc->drc_ds, + drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred); + } + return (error); +} + +static void +dmu_recv_end_sync(void *arg, dmu_tx_t *tx) +{ + dmu_recv_cookie_t *drc = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + + spa_history_log_internal_ds(drc->drc_ds, "finish receiving", + tx, "snap=%s", drc->drc_tosnap); + + if (!drc->drc_newfs) { + dsl_dataset_t *origin_head; + + VERIFY0(dsl_dataset_hold(dp, drc->drc_tofs, FTAG, + &origin_head)); + + if (drc->drc_force) { + /* + * Destroy any snapshots of drc_tofs (origin_head) + * after the origin (the snap before drc_ds). 
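+ *
+ * Sketch of the layout at this point (hypothetical snapshot
+ * names):
+ *
+ *	origin_head:  ... @origin ... @snapA ... @snapB (latest)
+ *	drc_ds:       clone of @origin, holding the received data
+ *
+ * @snapA and @snapB are destroyed here so that origin_head and
+ * drc_ds can be clone-swapped below.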
+ */ + uint64_t obj; + + obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj; + while (obj != + dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) { + dsl_dataset_t *snap; + VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG, + &snap)); + ASSERT3P(snap->ds_dir, ==, origin_head->ds_dir); + obj = dsl_dataset_phys(snap)->ds_prev_snap_obj; + dsl_destroy_snapshot_sync_impl(snap, + B_FALSE, tx); + dsl_dataset_rele(snap, FTAG); + } + } + VERIFY3P(drc->drc_ds->ds_prev, ==, + origin_head->ds_prev); + + dsl_dataset_clone_swap_sync_impl(drc->drc_ds, + origin_head, tx); + dsl_dataset_snapshot_sync_impl(origin_head, + drc->drc_tosnap, tx); + + /* set snapshot's creation time and guid */ + dmu_buf_will_dirty(origin_head->ds_prev->ds_dbuf, tx); + dsl_dataset_phys(origin_head->ds_prev)->ds_creation_time = + drc->drc_drrb->drr_creation_time; + dsl_dataset_phys(origin_head->ds_prev)->ds_guid = + drc->drc_drrb->drr_toguid; + dsl_dataset_phys(origin_head->ds_prev)->ds_flags &= + ~DS_FLAG_INCONSISTENT; + + dmu_buf_will_dirty(origin_head->ds_dbuf, tx); + dsl_dataset_phys(origin_head)->ds_flags &= + ~DS_FLAG_INCONSISTENT; + + drc->drc_newsnapobj = + dsl_dataset_phys(origin_head)->ds_prev_snap_obj; + + dsl_dataset_rele(origin_head, FTAG); + dsl_destroy_head_sync_impl(drc->drc_ds, tx); + + if (drc->drc_owner != NULL) + VERIFY3P(origin_head->ds_owner, ==, drc->drc_owner); + } else { + dsl_dataset_t *ds = drc->drc_ds; + + dsl_dataset_snapshot_sync_impl(ds, drc->drc_tosnap, tx); + + /* set snapshot's creation time and guid */ + dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); + dsl_dataset_phys(ds->ds_prev)->ds_creation_time = + drc->drc_drrb->drr_creation_time; + dsl_dataset_phys(ds->ds_prev)->ds_guid = + drc->drc_drrb->drr_toguid; + dsl_dataset_phys(ds->ds_prev)->ds_flags &= + ~DS_FLAG_INCONSISTENT; + + dmu_buf_will_dirty(ds->ds_dbuf, tx); + dsl_dataset_phys(ds)->ds_flags &= ~DS_FLAG_INCONSISTENT; + if (dsl_dataset_has_resume_receive_state(ds)) { + (void) zap_remove(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_FROMGUID, tx); + (void) zap_remove(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_OBJECT, tx); + (void) zap_remove(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_OFFSET, tx); + (void) zap_remove(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_BYTES, tx); + (void) zap_remove(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_TOGUID, tx); + (void) zap_remove(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_TONAME, tx); + } + drc->drc_newsnapobj = + dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj; + } + /* + * Release the hold from dmu_recv_begin. This must be done before + * we return to open context, so that when we free the dataset's dnode, + * we can evict its bonus buffer. 
+ */ + dsl_dataset_disown(drc->drc_ds, dmu_recv_tag); + drc->drc_ds = NULL; +} + +static int +add_ds_to_guidmap(const char *name, avl_tree_t *guid_map, uint64_t snapobj) +{ + dsl_pool_t *dp; + dsl_dataset_t *snapds; + guid_map_entry_t *gmep; + int err; + + ASSERT(guid_map != NULL); + + err = dsl_pool_hold(name, FTAG, &dp); + if (err != 0) + return (err); + gmep = kmem_alloc(sizeof (*gmep), KM_SLEEP); + err = dsl_dataset_hold_obj(dp, snapobj, gmep, &snapds); + if (err == 0) { + gmep->guid = dsl_dataset_phys(snapds)->ds_guid; + gmep->gme_ds = snapds; + avl_add(guid_map, gmep); + dsl_dataset_long_hold(snapds, gmep); + } else + kmem_free(gmep, sizeof (*gmep)); + + dsl_pool_rele(dp, FTAG); + return (err); +} + +static int dmu_recv_end_modified_blocks = 3; + +static int +dmu_recv_existing_end(dmu_recv_cookie_t *drc) +{ +#ifdef _KERNEL + /* + * We will be destroying the ds; make sure its origin is unmounted if + * necessary. + */ + char name[ZFS_MAX_DATASET_NAME_LEN]; + dsl_dataset_name(drc->drc_ds, name); + zfs_destroy_unmount_origin(name); +#endif + + return (dsl_sync_task(drc->drc_tofs, + dmu_recv_end_check, dmu_recv_end_sync, drc, + dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL)); +} + +static int +dmu_recv_new_end(dmu_recv_cookie_t *drc) +{ + return (dsl_sync_task(drc->drc_tofs, + dmu_recv_end_check, dmu_recv_end_sync, drc, + dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL)); +} + +int +dmu_recv_end(dmu_recv_cookie_t *drc, void *owner) +{ + int error; + + drc->drc_owner = owner; + + if (drc->drc_newfs) + error = dmu_recv_new_end(drc); + else + error = dmu_recv_existing_end(drc); + + if (error != 0) { + dmu_recv_cleanup_ds(drc); + } else if (drc->drc_guid_to_ds_map != NULL) { + (void) add_ds_to_guidmap(drc->drc_tofs, + drc->drc_guid_to_ds_map, + drc->drc_newsnapobj); + } + return (error); +} + +/* + * Return TRUE if this objset is currently being received into. + */ +boolean_t +dmu_objset_is_receiving(objset_t *os) +{ + return (os->os_dsl_dataset != NULL && + os->os_dsl_dataset->ds_owner == dmu_recv_tag); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c new file mode 100644 index 000000000000..8ed53914ceae --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c @@ -0,0 +1,712 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright (c) 2015 Chunwei Chen. All rights reserved. 
+ */ + +#include <sys/zfs_context.h> +#include <sys/dmu_objset.h> +#include <sys/dmu_traverse.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_pool.h> +#include <sys/dnode.h> +#include <sys/spa.h> +#include <sys/spa_impl.h> +#include <sys/zio.h> +#include <sys/dmu_impl.h> +#include <sys/sa.h> +#include <sys/sa_impl.h> +#include <sys/callb.h> +#include <sys/zfeature.h> + +int32_t zfs_pd_bytes_max = 50 * 1024 * 1024; /* 50MB */ +boolean_t send_holes_without_birth_time = B_TRUE; + +#ifdef _KERNEL +SYSCTL_DECL(_vfs_zfs); +SYSCTL_UINT(_vfs_zfs, OID_AUTO, send_holes_without_birth_time, CTLFLAG_RWTUN, + &send_holes_without_birth_time, 0, "Send holes without birth time"); +#endif + +typedef struct prefetch_data { + kmutex_t pd_mtx; + kcondvar_t pd_cv; + int32_t pd_bytes_fetched; + int pd_flags; + boolean_t pd_cancel; + boolean_t pd_exited; + zbookmark_phys_t pd_resume; +} prefetch_data_t; + +typedef struct traverse_data { + spa_t *td_spa; + uint64_t td_objset; + blkptr_t *td_rootbp; + uint64_t td_min_txg; + zbookmark_phys_t *td_resume; + int td_flags; + prefetch_data_t *td_pfd; + boolean_t td_paused; + uint64_t td_hole_birth_enabled_txg; + blkptr_cb_t *td_func; + void *td_arg; + boolean_t td_realloc_possible; +} traverse_data_t; + +static int traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, + uint64_t objset, uint64_t object); +static void prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *, + uint64_t objset, uint64_t object); + +static int +traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) +{ + traverse_data_t *td = arg; + zbookmark_phys_t zb; + + if (BP_IS_HOLE(bp)) + return (0); + + if (claim_txg == 0 && bp->blk_birth >= spa_min_claim_txg(td->td_spa)) + return (-1); + + SET_BOOKMARK(&zb, td->td_objset, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, + bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); + + (void) td->td_func(td->td_spa, zilog, bp, &zb, NULL, td->td_arg); + + return (0); +} + +static int +traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg) +{ + traverse_data_t *td = arg; + + if (lrc->lrc_txtype == TX_WRITE) { + lr_write_t *lr = (lr_write_t *)lrc; + blkptr_t *bp = &lr->lr_blkptr; + zbookmark_phys_t zb; + + if (BP_IS_HOLE(bp)) + return (0); + + if (claim_txg == 0 || bp->blk_birth < claim_txg) + return (0); + + SET_BOOKMARK(&zb, td->td_objset, lr->lr_foid, + ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp)); + + (void) td->td_func(td->td_spa, zilog, bp, &zb, NULL, + td->td_arg); + } + return (0); +} + +static void +traverse_zil(traverse_data_t *td, zil_header_t *zh) +{ + uint64_t claim_txg = zh->zh_claim_txg; + + /* + * We only want to visit blocks that have been claimed but not yet + * replayed; plus blocks that are already stable in read-only mode. + */ + if (claim_txg == 0 && spa_writeable(td->td_spa)) + return; + + zilog_t *zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh); + (void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, td, + claim_txg); + zil_free(zilog); +} + +typedef enum resume_skip { + RESUME_SKIP_ALL, + RESUME_SKIP_NONE, + RESUME_SKIP_CHILDREN +} resume_skip_t; + +/* + * Returns RESUME_SKIP_ALL if td indicates that we are resuming a traversal and + * the block indicated by zb does not need to be visited at all. Returns + * RESUME_SKIP_CHILDREN if we are resuming a post traversal and we reach the + * resume point. This indicates that this block should be visited but not its + * children (since they must have been visited in a previous traversal). 
+ * Otherwise returns RESUME_SKIP_NONE. + */ +static resume_skip_t +resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp, + const zbookmark_phys_t *zb) +{ + if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume)) { + /* + * If we already visited this bp & everything below, + * don't bother doing it again. + */ + if (zbookmark_subtree_completed(dnp, zb, td->td_resume)) + return (RESUME_SKIP_ALL); + + /* + * If we found the block we're trying to resume from, zero + * the bookmark out to indicate that we have resumed. + */ + if (bcmp(zb, td->td_resume, sizeof (*zb)) == 0) { + bzero(td->td_resume, sizeof (*zb)); + if (td->td_flags & TRAVERSE_POST) + return (RESUME_SKIP_CHILDREN); + } + } + return (RESUME_SKIP_NONE); +} + +static void +traverse_prefetch_metadata(traverse_data_t *td, + const blkptr_t *bp, const zbookmark_phys_t *zb) +{ + arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; + + if (!(td->td_flags & TRAVERSE_PREFETCH_METADATA)) + return; + /* + * If we are in the process of resuming, don't prefetch, because + * some children will not be needed (and in fact may have already + * been freed). + */ + if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume)) + return; + if (BP_IS_HOLE(bp) || bp->blk_birth <= td->td_min_txg) + return; + if (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE) + return; + + (void) arc_read(NULL, td->td_spa, bp, NULL, NULL, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); +} + +static boolean_t +prefetch_needed(prefetch_data_t *pfd, const blkptr_t *bp) +{ + ASSERT(pfd->pd_flags & TRAVERSE_PREFETCH_DATA); + if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp) || + BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG) + return (B_FALSE); + return (B_TRUE); +} + +static int +traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, + const blkptr_t *bp, const zbookmark_phys_t *zb) +{ + zbookmark_phys_t czb; + int err = 0; + arc_buf_t *buf = NULL; + prefetch_data_t *pd = td->td_pfd; + boolean_t hard = td->td_flags & TRAVERSE_HARD; + + switch (resume_skip_check(td, dnp, zb)) { + case RESUME_SKIP_ALL: + return (0); + case RESUME_SKIP_CHILDREN: + goto post; + case RESUME_SKIP_NONE: + break; + default: + ASSERT(0); + } + + if (bp->blk_birth == 0) { + /* + * Since this block has a birth time of 0 it must be one of + * two things: a hole created before the + * SPA_FEATURE_HOLE_BIRTH feature was enabled, or a hole + * which has always been a hole in an object. + * + * If a file is written sparsely, then the unwritten parts of + * the file were "always holes" -- that is, they have been + * holes since this object was allocated. However, we (and + * our callers) can not necessarily tell when an object was + * allocated. Therefore, if it's possible that this object + * was freed and then its object number reused, we need to + * visit all the holes with birth==0. + * + * If it isn't possible that the object number was reused, + * then if SPA_FEATURE_HOLE_BIRTH was enabled before we wrote + * all the blocks we will visit as part of this traversal, + * then this hole must have always existed, so we can skip + * it. We visit blocks born after (exclusive) td_min_txg. + * + * Note that the meta-dnode cannot be reallocated. 
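+ *
+ * Putting that together: a birth==0 hole may be skipped only when
+ * holes are not being sent unconditionally
+ * (send_holes_without_birth_time is off), the object cannot have
+ * been reallocated (or is the meta-dnode), and hole birth was
+ * already enabled for every txg this traversal covers.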
+ */ + if (!send_holes_without_birth_time && + (!td->td_realloc_possible || + zb->zb_object == DMU_META_DNODE_OBJECT) && + td->td_hole_birth_enabled_txg <= td->td_min_txg) + return (0); + } else if (bp->blk_birth <= td->td_min_txg) { + return (0); + } + + if (pd != NULL && !pd->pd_exited && prefetch_needed(pd, bp)) { + uint64_t size = BP_GET_LSIZE(bp); + mutex_enter(&pd->pd_mtx); + ASSERT(pd->pd_bytes_fetched >= 0); + while (pd->pd_bytes_fetched < size && !pd->pd_exited) + cv_wait(&pd->pd_cv, &pd->pd_mtx); + pd->pd_bytes_fetched -= size; + cv_broadcast(&pd->pd_cv); + mutex_exit(&pd->pd_mtx); + } + + if (BP_IS_HOLE(bp)) { + err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg); + if (err != 0) + goto post; + return (0); + } + + if (td->td_flags & TRAVERSE_PRE) { + err = td->td_func(td->td_spa, NULL, bp, zb, dnp, + td->td_arg); + if (err == TRAVERSE_VISIT_NO_CHILDREN) + return (0); + if (err != 0) + goto post; + } + + if (BP_GET_LEVEL(bp) > 0) { + arc_flags_t flags = ARC_FLAG_WAIT; + int i; + blkptr_t *cbp; + int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; + + err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); + if (err != 0) + goto post; + cbp = buf->b_data; + + for (i = 0; i < epb; i++) { + SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, + zb->zb_level - 1, + zb->zb_blkid * epb + i); + traverse_prefetch_metadata(td, &cbp[i], &czb); + } + + /* recursively visitbp() blocks below this */ + for (i = 0; i < epb; i++) { + SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, + zb->zb_level - 1, + zb->zb_blkid * epb + i); + err = traverse_visitbp(td, dnp, &cbp[i], &czb); + if (err != 0) + break; + } + } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { + arc_flags_t flags = ARC_FLAG_WAIT; + int i; + int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; + + err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); + if (err != 0) + goto post; + dnode_phys_t *child_dnp = buf->b_data; + + for (i = 0; i < epb; i += child_dnp[i].dn_extra_slots + 1) { + prefetch_dnode_metadata(td, &child_dnp[i], + zb->zb_objset, zb->zb_blkid * epb + i); + } + + /* recursively visitbp() blocks below this */ + for (i = 0; i < epb; i += child_dnp[i].dn_extra_slots + 1) { + err = traverse_dnode(td, &child_dnp[i], + zb->zb_objset, zb->zb_blkid * epb + i); + if (err != 0) + break; + } + } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { + arc_flags_t flags = ARC_FLAG_WAIT; + + err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); + if (err != 0) + goto post; + + objset_phys_t *osp = buf->b_data; + prefetch_dnode_metadata(td, &osp->os_meta_dnode, zb->zb_objset, + DMU_META_DNODE_OBJECT); + /* + * See the block comment above for the goal of this variable. + * If the maxblkid of the meta-dnode is 0, then we know that + * we've never had more than DNODES_PER_BLOCK objects in the + * dataset, which means we can't have reused any object ids. 
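+ * (A single dnode block holds DNODES_PER_BLOCK dnodes, so
+ * maxblkid == 0 bounds how many objects the dataset can ever
+ * have allocated.)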
+ */ + if (osp->os_meta_dnode.dn_maxblkid == 0) + td->td_realloc_possible = B_FALSE; + + if (arc_buf_size(buf) >= sizeof (objset_phys_t)) { + prefetch_dnode_metadata(td, &osp->os_groupused_dnode, + zb->zb_objset, DMU_GROUPUSED_OBJECT); + prefetch_dnode_metadata(td, &osp->os_userused_dnode, + zb->zb_objset, DMU_USERUSED_OBJECT); + } + + err = traverse_dnode(td, &osp->os_meta_dnode, zb->zb_objset, + DMU_META_DNODE_OBJECT); + if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) { + err = traverse_dnode(td, &osp->os_groupused_dnode, + zb->zb_objset, DMU_GROUPUSED_OBJECT); + } + if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) { + err = traverse_dnode(td, &osp->os_userused_dnode, + zb->zb_objset, DMU_USERUSED_OBJECT); + } + } + + if (buf) + arc_buf_destroy(buf, &buf); + +post: + if (err == 0 && (td->td_flags & TRAVERSE_POST)) + err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg); + + if (hard && (err == EIO || err == ECKSUM)) { + /* + * Ignore this disk error as requested by the HARD flag, + * and continue traversal. + */ + err = 0; + } + + /* + * If we are stopping here, set td_resume. + */ + if (td->td_resume != NULL && err != 0 && !td->td_paused) { + td->td_resume->zb_objset = zb->zb_objset; + td->td_resume->zb_object = zb->zb_object; + td->td_resume->zb_level = 0; + /* + * If we have stopped on an indirect block (e.g. due to + * i/o error), we have not visited anything below it. + * Set the bookmark to the first level-0 block that we need + * to visit. This way, the resuming code does not need to + * deal with resuming from indirect blocks. + * + * Note, if zb_level <= 0, dnp may be NULL, so we don't want + * to dereference it. + */ + td->td_resume->zb_blkid = zb->zb_blkid; + if (zb->zb_level > 0) { + td->td_resume->zb_blkid <<= zb->zb_level * + (dnp->dn_indblkshift - SPA_BLKPTRSHIFT); + } + td->td_paused = B_TRUE; + } + + return (err); +} + +static void +prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *dnp, + uint64_t objset, uint64_t object) +{ + int j; + zbookmark_phys_t czb; + + for (j = 0; j < dnp->dn_nblkptr; j++) { + SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j); + traverse_prefetch_metadata(td, &dnp->dn_blkptr[j], &czb); + } + + if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { + SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID); + traverse_prefetch_metadata(td, DN_SPILL_BLKPTR(dnp), &czb); + } +} + +static int +traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, + uint64_t objset, uint64_t object) +{ + int j, err = 0; + zbookmark_phys_t czb; + + if (object != DMU_META_DNODE_OBJECT && td->td_resume != NULL && + object < td->td_resume->zb_object) + return (0); + + if (td->td_flags & TRAVERSE_PRE) { + SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL, + ZB_DNODE_BLKID); + err = td->td_func(td->td_spa, NULL, NULL, &czb, dnp, + td->td_arg); + if (err == TRAVERSE_VISIT_NO_CHILDREN) + return (0); + if (err != 0) + return (err); + } + + for (j = 0; j < dnp->dn_nblkptr; j++) { + SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j); + err = traverse_visitbp(td, dnp, &dnp->dn_blkptr[j], &czb); + if (err != 0) + break; + } + + if (err == 0 && (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) { + SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID); + err = traverse_visitbp(td, dnp, DN_SPILL_BLKPTR(dnp), &czb); + } + + if (err == 0 && (td->td_flags & TRAVERSE_POST)) { + SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL, + ZB_DNODE_BLKID); + err = td->td_func(td->td_spa, NULL, NULL, &czb, dnp, + td->td_arg); + if (err == 
TRAVERSE_VISIT_NO_CHILDREN) + return (0); + if (err != 0) + return (err); + } + return (err); +} + +/* ARGSUSED */ +static int +traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, + const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) +{ + prefetch_data_t *pfd = arg; + arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH | + ARC_FLAG_PRESCIENT_PREFETCH; + + ASSERT(pfd->pd_bytes_fetched >= 0); + if (bp == NULL) + return (0); + if (pfd->pd_cancel) + return (SET_ERROR(EINTR)); + + if (!prefetch_needed(pfd, bp)) + return (0); + + mutex_enter(&pfd->pd_mtx); + while (!pfd->pd_cancel && pfd->pd_bytes_fetched >= zfs_pd_bytes_max) + cv_wait(&pfd->pd_cv, &pfd->pd_mtx); + pfd->pd_bytes_fetched += BP_GET_LSIZE(bp); + cv_broadcast(&pfd->pd_cv); + mutex_exit(&pfd->pd_mtx); + + (void) arc_read(NULL, spa, bp, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &aflags, zb); + + return (0); +} + +static void +traverse_prefetch_thread(void *arg) +{ + traverse_data_t *td_main = arg; + traverse_data_t td = *td_main; + zbookmark_phys_t czb; + + td.td_func = traverse_prefetcher; + td.td_arg = td_main->td_pfd; + td.td_pfd = NULL; + td.td_resume = &td_main->td_pfd->pd_resume; + + SET_BOOKMARK(&czb, td.td_objset, + ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); + (void) traverse_visitbp(&td, NULL, td.td_rootbp, &czb); + + mutex_enter(&td_main->td_pfd->pd_mtx); + td_main->td_pfd->pd_exited = B_TRUE; + cv_broadcast(&td_main->td_pfd->pd_cv); + mutex_exit(&td_main->td_pfd->pd_mtx); +} + +/* + * NB: dataset must not be changing on-disk (eg, is a snapshot or we are + * in syncing context). + */ +static int +traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, + uint64_t txg_start, zbookmark_phys_t *resume, int flags, + blkptr_cb_t func, void *arg) +{ + traverse_data_t td; + prefetch_data_t pd = { 0 }; + zbookmark_phys_t czb; + int err; + + ASSERT(ds == NULL || objset == ds->ds_object); + ASSERT(!(flags & TRAVERSE_PRE) || !(flags & TRAVERSE_POST)); + + td.td_spa = spa; + td.td_objset = objset; + td.td_rootbp = rootbp; + td.td_min_txg = txg_start; + td.td_resume = resume; + td.td_func = func; + td.td_arg = arg; + td.td_pfd = &pd; + td.td_flags = flags; + td.td_paused = B_FALSE; + td.td_realloc_possible = (txg_start == 0 ? B_FALSE : B_TRUE); + + if (spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) { + VERIFY(spa_feature_enabled_txg(spa, + SPA_FEATURE_HOLE_BIRTH, &td.td_hole_birth_enabled_txg)); + } else { + td.td_hole_birth_enabled_txg = UINT64_MAX; + } + + pd.pd_flags = flags; + if (resume != NULL) + pd.pd_resume = *resume; + mutex_init(&pd.pd_mtx, NULL, MUTEX_DEFAULT, NULL); + cv_init(&pd.pd_cv, NULL, CV_DEFAULT, NULL); + + /* See comment on ZIL traversal in dsl_scan_visitds. 
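+ * The check below applies that logic: the ZIL is walked only for
+ * a live (non-snapshot) dataset, and only when the root bp is not
+ * a hole (i.e. there is an objset block to read at all).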
*/ + if (ds != NULL && !ds->ds_is_snapshot && !BP_IS_HOLE(rootbp)) { + arc_flags_t flags = ARC_FLAG_WAIT; + objset_phys_t *osp; + arc_buf_t *buf; + + err = arc_read(NULL, td.td_spa, rootbp, + arc_getbuf_func, &buf, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, NULL); + if (err != 0) + return (err); + + osp = buf->b_data; + traverse_zil(&td, &osp->os_zil_header); + arc_buf_destroy(buf, &buf); + } + + if (!(flags & TRAVERSE_PREFETCH_DATA) || + 0 == taskq_dispatch(system_taskq, traverse_prefetch_thread, + &td, TQ_NOQUEUE)) + pd.pd_exited = B_TRUE; + + SET_BOOKMARK(&czb, td.td_objset, + ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); + err = traverse_visitbp(&td, NULL, rootbp, &czb); + + mutex_enter(&pd.pd_mtx); + pd.pd_cancel = B_TRUE; + cv_broadcast(&pd.pd_cv); + while (!pd.pd_exited) + cv_wait(&pd.pd_cv, &pd.pd_mtx); + mutex_exit(&pd.pd_mtx); + + mutex_destroy(&pd.pd_mtx); + cv_destroy(&pd.pd_cv); + + return (err); +} + +/* + * NB: dataset must not be changing on-disk (eg, is a snapshot or we are + * in syncing context). + */ +int +traverse_dataset_resume(dsl_dataset_t *ds, uint64_t txg_start, + zbookmark_phys_t *resume, + int flags, blkptr_cb_t func, void *arg) +{ + return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds, ds->ds_object, + &dsl_dataset_phys(ds)->ds_bp, txg_start, resume, flags, func, arg)); +} + +int +traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, + int flags, blkptr_cb_t func, void *arg) +{ + return (traverse_dataset_resume(ds, txg_start, NULL, flags, func, arg)); +} + +int +traverse_dataset_destroyed(spa_t *spa, blkptr_t *blkptr, + uint64_t txg_start, zbookmark_phys_t *resume, int flags, + blkptr_cb_t func, void *arg) +{ + return (traverse_impl(spa, NULL, ZB_DESTROYED_OBJSET, + blkptr, txg_start, resume, flags, func, arg)); +} + +/* + * NB: pool must not be changing on-disk (eg, from zdb or sync context). + */ +int +traverse_pool(spa_t *spa, uint64_t txg_start, int flags, + blkptr_cb_t func, void *arg) +{ + int err; + dsl_pool_t *dp = spa_get_dsl(spa); + objset_t *mos = dp->dp_meta_objset; + boolean_t hard = (flags & TRAVERSE_HARD); + + /* visit the MOS */ + err = traverse_impl(spa, NULL, 0, spa_get_rootblkptr(spa), + txg_start, NULL, flags, func, arg); + if (err != 0) + return (err); + + /* visit each dataset */ + for (uint64_t obj = 1; err == 0; + err = dmu_object_next(mos, &obj, B_FALSE, txg_start)) { + dmu_object_info_t doi; + + err = dmu_object_info(mos, obj, &doi); + if (err != 0) { + if (hard) + continue; + break; + } + + if (doi.doi_bonus_type == DMU_OT_DSL_DATASET) { + dsl_dataset_t *ds; + uint64_t txg = txg_start; + + dsl_pool_config_enter(dp, FTAG); + err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds); + dsl_pool_config_exit(dp, FTAG); + if (err != 0) { + if (hard) + continue; + break; + } + if (dsl_dataset_phys(ds)->ds_prev_snap_txg > txg) + txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; + err = traverse_dataset(ds, txg, flags, func, arg); + dsl_dataset_rele(ds, FTAG); + if (err != 0) + break; + } + } + if (err == ESRCH) + err = 0; + return (err); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c new file mode 100644 index 000000000000..e9f1f4ac19c6 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c @@ -0,0 +1,1342 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + */ + +#include <sys/dmu.h> +#include <sys/dmu_impl.h> +#include <sys/dbuf.h> +#include <sys/dmu_tx.h> +#include <sys/dmu_objset.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_pool.h> +#include <sys/zap_impl.h> +#include <sys/spa.h> +#include <sys/sa.h> +#include <sys/sa_impl.h> +#include <sys/zfs_context.h> +#include <sys/varargs.h> + +typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn, + uint64_t arg1, uint64_t arg2); + + +dmu_tx_t * +dmu_tx_create_dd(dsl_dir_t *dd) +{ + dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP); + tx->tx_dir = dd; + if (dd != NULL) + tx->tx_pool = dd->dd_pool; + list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t), + offsetof(dmu_tx_hold_t, txh_node)); + list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t), + offsetof(dmu_tx_callback_t, dcb_node)); + tx->tx_start = gethrtime(); + return (tx); +} + +dmu_tx_t * +dmu_tx_create(objset_t *os) +{ + dmu_tx_t *tx = dmu_tx_create_dd(os->os_dsl_dataset->ds_dir); + tx->tx_objset = os; + return (tx); +} + +dmu_tx_t * +dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg) +{ + dmu_tx_t *tx = dmu_tx_create_dd(NULL); + + txg_verify(dp->dp_spa, txg); + tx->tx_pool = dp; + tx->tx_txg = txg; + tx->tx_anyobj = TRUE; + + return (tx); +} + +int +dmu_tx_is_syncing(dmu_tx_t *tx) +{ + return (tx->tx_anyobj); +} + +int +dmu_tx_private_ok(dmu_tx_t *tx) +{ + return (tx->tx_anyobj); +} + +static dmu_tx_hold_t * +dmu_tx_hold_dnode_impl(dmu_tx_t *tx, dnode_t *dn, enum dmu_tx_hold_type type, + uint64_t arg1, uint64_t arg2) +{ + dmu_tx_hold_t *txh; + + if (dn != NULL) { + (void) refcount_add(&dn->dn_holds, tx); + if (tx->tx_txg != 0) { + mutex_enter(&dn->dn_mtx); + /* + * dn->dn_assigned_txg == tx->tx_txg doesn't pose a + * problem, but there's no way for it to happen (for + * now, at least). 
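+ *
+ * (The dn_holds reference taken above is paired with the
+ * dnode_rele() performed by dmu_tx_destroy() when the tx is
+ * committed or aborted; see below.)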
+ */ + ASSERT(dn->dn_assigned_txg == 0); + dn->dn_assigned_txg = tx->tx_txg; + (void) refcount_add(&dn->dn_tx_holds, tx); + mutex_exit(&dn->dn_mtx); + } + } + + txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP); + txh->txh_tx = tx; + txh->txh_dnode = dn; + refcount_create(&txh->txh_space_towrite); + refcount_create(&txh->txh_memory_tohold); + txh->txh_type = type; + txh->txh_arg1 = arg1; + txh->txh_arg2 = arg2; + list_insert_tail(&tx->tx_holds, txh); + + return (txh); +} + +static dmu_tx_hold_t * +dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object, + enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2) +{ + dnode_t *dn = NULL; + dmu_tx_hold_t *txh; + int err; + + if (object != DMU_NEW_OBJECT) { + err = dnode_hold(os, object, FTAG, &dn); + if (err != 0) { + tx->tx_err = err; + return (NULL); + } + } + txh = dmu_tx_hold_dnode_impl(tx, dn, type, arg1, arg2); + if (dn != NULL) + dnode_rele(dn, FTAG); + return (txh); +} + +void +dmu_tx_add_new_object(dmu_tx_t *tx, dnode_t *dn) +{ + /* + * If we're syncing, they can manipulate any object anyhow, and + * the hold on the dnode_t can cause problems. + */ + if (!dmu_tx_is_syncing(tx)) + (void) dmu_tx_hold_dnode_impl(tx, dn, THT_NEWOBJECT, 0, 0); +} + +/* + * This function reads specified data from disk. The specified data will + * be needed to perform the transaction -- i.e, it will be read after + * we do dmu_tx_assign(). There are two reasons that we read the data now + * (before dmu_tx_assign()): + * + * 1. Reading it now has potentially better performance. The transaction + * has not yet been assigned, so the TXG is not held open, and also the + * caller typically has less locks held when calling dmu_tx_hold_*() than + * after the transaction has been assigned. This reduces the lock (and txg) + * hold times, thus reducing lock contention. + * + * 2. It is easier for callers (primarily the ZPL) to handle i/o errors + * that are detected before they start making changes to the DMU state + * (i.e. now). Once the transaction has been assigned, and some DMU + * state has been changed, it can be difficult to recover from an i/o + * error (e.g. to undo the changes already made in memory at the DMU + * layer). Typically code to do so does not exist in the caller -- it + * assumes that the data has already been cached and thus i/o errors are + * not possible. + * + * It has been observed that the i/o initiated here can be a performance + * problem, and it appears to be optional, because we don't look at the + * data which is read. However, removing this read would only serve to + * move the work elsewhere (after the dmu_tx_assign()), where it may + * have a greater impact on performance (in addition to the impact on + * fault tolerance noted above). 
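+ *
+ * A hypothetical worked example of the read set computed by
+ * dmu_tx_count_write() below: for an object with 128K data blocks,
+ * a hold of off=96K, len=192K covers level-0 blocks 0..2. Block 0
+ * is read because the write starts mid-block (96K is not
+ * 128K-aligned) and block 2 is read because the write ends
+ * mid-block (at 288K), while the fully overwritten interior
+ * block 1 is never read. Any level-1 blocks lying strictly between
+ * those covering the first and last level-0 blocks would be read
+ * as well (none, in this small example).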
+ */ +static int +dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid) +{ + int err; + dmu_buf_impl_t *db; + + rw_enter(&dn->dn_struct_rwlock, RW_READER); + db = dbuf_hold_level(dn, level, blkid, FTAG); + rw_exit(&dn->dn_struct_rwlock); + if (db == NULL) + return (SET_ERROR(EIO)); + err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH); + dbuf_rele(db, FTAG); + return (err); +} + +/* ARGSUSED */ +static void +dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) +{ + dnode_t *dn = txh->txh_dnode; + int err = 0; + + if (len == 0) + return; + + (void) refcount_add_many(&txh->txh_space_towrite, len, FTAG); + + if (refcount_count(&txh->txh_space_towrite) > 2 * DMU_MAX_ACCESS) + err = SET_ERROR(EFBIG); + + if (dn == NULL) + return; + + /* + * For i/o error checking, read the blocks that will be needed + * to perform the write: the first and last level-0 blocks (if + * they are not aligned, i.e. if they are partial-block writes), + * and all the level-1 blocks. + */ + if (dn->dn_maxblkid == 0) { + if (off < dn->dn_datablksz && + (off > 0 || len < dn->dn_datablksz)) { + err = dmu_tx_check_ioerr(NULL, dn, 0, 0); + if (err != 0) { + txh->txh_tx->tx_err = err; + } + } + } else { + zio_t *zio = zio_root(dn->dn_objset->os_spa, + NULL, NULL, ZIO_FLAG_CANFAIL); + + /* first level-0 block */ + uint64_t start = off >> dn->dn_datablkshift; + if (P2PHASE(off, dn->dn_datablksz) || len < dn->dn_datablksz) { + err = dmu_tx_check_ioerr(zio, dn, 0, start); + if (err != 0) { + txh->txh_tx->tx_err = err; + } + } + + /* last level-0 block */ + uint64_t end = (off + len - 1) >> dn->dn_datablkshift; + if (end != start && end <= dn->dn_maxblkid && + P2PHASE(off + len, dn->dn_datablksz)) { + err = dmu_tx_check_ioerr(zio, dn, 0, end); + if (err != 0) { + txh->txh_tx->tx_err = err; + } + } + + /* level-1 blocks */ + if (dn->dn_nlevels > 1) { + int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + for (uint64_t i = (start >> shft) + 1; + i < end >> shft; i++) { + err = dmu_tx_check_ioerr(zio, dn, 1, i); + if (err != 0) { + txh->txh_tx->tx_err = err; + } + } + } + + err = zio_wait(zio); + if (err != 0) { + txh->txh_tx->tx_err = err; + } + } +} + +static void +dmu_tx_count_dnode(dmu_tx_hold_t *txh) +{ + (void) refcount_add_many(&txh->txh_space_towrite, DNODE_MIN_SIZE, FTAG); +} + +void +dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len) +{ + dmu_tx_hold_t *txh; + + ASSERT0(tx->tx_txg); + ASSERT3U(len, <=, DMU_MAX_ACCESS); + ASSERT(len == 0 || UINT64_MAX - off >= len - 1); + + txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, + object, THT_WRITE, off, len); + if (txh != NULL) { + dmu_tx_count_write(txh, off, len); + dmu_tx_count_dnode(txh); + } +} + +void +dmu_tx_hold_remap_l1indirect(dmu_tx_t *tx, uint64_t object) +{ + dmu_tx_hold_t *txh; + + ASSERT(tx->tx_txg == 0); + txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, + object, THT_WRITE, 0, 0); + if (txh == NULL) + return; + + dnode_t *dn = txh->txh_dnode; + (void) refcount_add_many(&txh->txh_space_towrite, + 1ULL << dn->dn_indblkshift, FTAG); + dmu_tx_count_dnode(txh); +} + +void +dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len) +{ + dmu_tx_hold_t *txh; + + ASSERT0(tx->tx_txg); + ASSERT3U(len, <=, DMU_MAX_ACCESS); + ASSERT(len == 0 || UINT64_MAX - off >= len - 1); + + txh = dmu_tx_hold_dnode_impl(tx, dn, THT_WRITE, off, len); + if (txh != NULL) { + dmu_tx_count_write(txh, off, len); + dmu_tx_count_dnode(txh); + } +} + +/* + * This function marks the transaction as being a "net free". 
The end + * result is that refquotas will be disabled for this transaction, and + * this transaction will be able to use half of the pool space overhead + * (see dsl_pool_adjustedsize()). Therefore this function should only + * be called for transactions that we expect will not cause a net increase + * in the amount of space used (but it's OK if that is occasionally not true). + */ +void +dmu_tx_mark_netfree(dmu_tx_t *tx) +{ + tx->tx_netfree = B_TRUE; +} + +static void +dmu_tx_hold_free_impl(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) +{ + dmu_tx_t *tx; + dnode_t *dn; + int err; + + tx = txh->txh_tx; + ASSERT(tx->tx_txg == 0); + + dn = txh->txh_dnode; + dmu_tx_count_dnode(txh); + + if (off >= (dn->dn_maxblkid + 1) * dn->dn_datablksz) + return; + if (len == DMU_OBJECT_END) + len = (dn->dn_maxblkid + 1) * dn->dn_datablksz - off; + + + /* + * For i/o error checking, we read the first and last level-0 + * blocks if they are not aligned, and all the level-1 blocks. + * + * Note: dbuf_free_range() assumes that we have not instantiated + * any level-0 dbufs that will be completely freed. Therefore we must + * exercise care to not read or count the first and last blocks + * if they are blocksize-aligned. + */ + if (dn->dn_datablkshift == 0) { + if (off != 0 || len < dn->dn_datablksz) + dmu_tx_count_write(txh, 0, dn->dn_datablksz); + } else { + /* first block will be modified if it is not aligned */ + if (!IS_P2ALIGNED(off, 1 << dn->dn_datablkshift)) + dmu_tx_count_write(txh, off, 1); + /* last block will be modified if it is not aligned */ + if (!IS_P2ALIGNED(off + len, 1 << dn->dn_datablkshift)) + dmu_tx_count_write(txh, off + len, 1); + } + + /* + * Check level-1 blocks. + */ + if (dn->dn_nlevels > 1) { + int shift = dn->dn_datablkshift + dn->dn_indblkshift - + SPA_BLKPTRSHIFT; + uint64_t start = off >> shift; + uint64_t end = (off + len) >> shift; + + ASSERT(dn->dn_indblkshift != 0); + + /* + * dnode_reallocate() can result in an object with indirect + * blocks having an odd data block size. In this case, + * just check the single block. 
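+ * (An object can only keep a non-power-of-two data block size
+ * while it still consists of a single block, which is why
+ * collapsing the range to [0, 0] below is sufficient.)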
+ */ + if (dn->dn_datablkshift == 0) + start = end = 0; + + zio_t *zio = zio_root(tx->tx_pool->dp_spa, + NULL, NULL, ZIO_FLAG_CANFAIL); + for (uint64_t i = start; i <= end; i++) { + uint64_t ibyte = i << shift; + err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0); + i = ibyte >> shift; + if (err == ESRCH || i > end) + break; + if (err != 0) { + tx->tx_err = err; + (void) zio_wait(zio); + return; + } + + (void) refcount_add_many(&txh->txh_memory_tohold, + 1 << dn->dn_indblkshift, FTAG); + + err = dmu_tx_check_ioerr(zio, dn, 1, i); + if (err != 0) { + tx->tx_err = err; + (void) zio_wait(zio); + return; + } + } + err = zio_wait(zio); + if (err != 0) { + tx->tx_err = err; + return; + } + } +} + +void +dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len) +{ + dmu_tx_hold_t *txh; + + txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, + object, THT_FREE, off, len); + if (txh != NULL) + (void) dmu_tx_hold_free_impl(txh, off, len); +} + +void +dmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len) +{ + dmu_tx_hold_t *txh; + + txh = dmu_tx_hold_dnode_impl(tx, dn, THT_FREE, off, len); + if (txh != NULL) + (void) dmu_tx_hold_free_impl(txh, off, len); +} + +static void +dmu_tx_hold_zap_impl(dmu_tx_hold_t *txh, const char *name) +{ + dmu_tx_t *tx = txh->txh_tx; + dnode_t *dn; + int err; + + ASSERT(tx->tx_txg == 0); + + dn = txh->txh_dnode; + + dmu_tx_count_dnode(txh); + + /* + * Modifying a almost-full microzap is around the worst case (128KB) + * + * If it is a fat zap, the worst case would be 7*16KB=112KB: + * - 3 blocks overwritten: target leaf, ptrtbl block, header block + * - 4 new blocks written if adding: + * - 2 blocks for possibly split leaves, + * - 2 grown ptrtbl blocks + */ + (void) refcount_add_many(&txh->txh_space_towrite, + MZAP_MAX_BLKSZ, FTAG); + + if (dn == NULL) + return; + + ASSERT3P(DMU_OT_BYTESWAP(dn->dn_type), ==, DMU_BSWAP_ZAP); + + if (dn->dn_maxblkid == 0 || name == NULL) { + /* + * This is a microzap (only one block), or we don't know + * the name. Check the first block for i/o errors. + */ + err = dmu_tx_check_ioerr(NULL, dn, 0, 0); + if (err != 0) { + tx->tx_err = err; + } + } else { + /* + * Access the name so that we'll check for i/o errors to + * the leaf blocks, etc. We ignore ENOENT, as this name + * may not yet exist. 
+ */
+ err = zap_lookup_by_dnode(dn, name, 8, 0, NULL);
+ if (err == EIO || err == ECKSUM || err == ENXIO) {
+ tx->tx_err = err;
+ }
+ }
+}
+
+void
+dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name)
+{
+ dmu_tx_hold_t *txh;
+
+ ASSERT0(tx->tx_txg);
+
+ txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
+ object, THT_ZAP, add, (uintptr_t)name);
+ if (txh != NULL)
+ dmu_tx_hold_zap_impl(txh, name);
+}
+
+void
+dmu_tx_hold_zap_by_dnode(dmu_tx_t *tx, dnode_t *dn, int add, const char *name)
+{
+ dmu_tx_hold_t *txh;
+
+ ASSERT0(tx->tx_txg);
+ ASSERT(dn != NULL);
+
+ txh = dmu_tx_hold_dnode_impl(tx, dn, THT_ZAP, add, (uintptr_t)name);
+ if (txh != NULL)
+ dmu_tx_hold_zap_impl(txh, name);
+}
+
+void
+dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object)
+{
+ dmu_tx_hold_t *txh;
+
+ ASSERT(tx->tx_txg == 0);
+
+ txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
+ object, THT_BONUS, 0, 0);
+ if (txh)
+ dmu_tx_count_dnode(txh);
+}
+
+void
+dmu_tx_hold_bonus_by_dnode(dmu_tx_t *tx, dnode_t *dn)
+{
+ dmu_tx_hold_t *txh;
+
+ ASSERT0(tx->tx_txg);
+
+ txh = dmu_tx_hold_dnode_impl(tx, dn, THT_BONUS, 0, 0);
+ if (txh)
+ dmu_tx_count_dnode(txh);
+}
+
+void
+dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space)
+{
+ dmu_tx_hold_t *txh;
+ ASSERT(tx->tx_txg == 0);
+
+ txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
+ DMU_NEW_OBJECT, THT_SPACE, space, 0);
+
+ (void) refcount_add_many(&txh->txh_space_towrite, space, FTAG);
+}
+
+#ifdef ZFS_DEBUG
+void
+dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
+{
+ boolean_t match_object = B_FALSE;
+ boolean_t match_offset = B_FALSE;
+
+ DB_DNODE_ENTER(db);
+ dnode_t *dn = DB_DNODE(db);
+ ASSERT(tx->tx_txg != 0);
+ ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset);
+ ASSERT3U(dn->dn_object, ==, db->db.db_object);
+
+ if (tx->tx_anyobj) {
+ DB_DNODE_EXIT(db);
+ return;
+ }
+
+ /* XXX No checking on the meta dnode for now */
+ if (db->db.db_object == DMU_META_DNODE_OBJECT) {
+ DB_DNODE_EXIT(db);
+ return;
+ }
+
+ for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL;
+ txh = list_next(&tx->tx_holds, txh)) {
+ ASSERT(dn == NULL || dn->dn_assigned_txg == tx->tx_txg);
+ if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT)
+ match_object = TRUE;
+ if (txh->txh_dnode == NULL || txh->txh_dnode == dn) {
+ int datablkshift = dn->dn_datablkshift ?
+ dn->dn_datablkshift : SPA_MAXBLOCKSHIFT;
+ int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+ int shift = datablkshift + epbs * db->db_level;
+ uint64_t beginblk = shift >= 64 ? 0 :
+ (txh->txh_arg1 >> shift);
+ uint64_t endblk = shift >= 64 ? 0 :
+ ((txh->txh_arg1 + txh->txh_arg2 - 1) >> shift);
+ uint64_t blkid = db->db_blkid;
+
+ /* XXX txh_arg2 better not be zero... */
+
+ dprintf("found txh type %x beginblk=%llx endblk=%llx\n",
+ txh->txh_type, beginblk, endblk);
+
+ switch (txh->txh_type) {
+ case THT_WRITE:
+ if (blkid >= beginblk && blkid <= endblk)
+ match_offset = TRUE;
+ /*
+ * We will let this hold work for the bonus
+ * or spill buffer so that we don't need to
+ * hold it when creating a new object.
+ */
+ if (blkid == DMU_BONUS_BLKID ||
+ blkid == DMU_SPILL_BLKID)
+ match_offset = TRUE;
+ /*
+ * They might have to increase nlevels,
+ * thus dirtying the new TLIBs. Or they
+ * might have to change the block size,
+ * thus dirtying the new lvl=0 blk=0.
+ */
+ if (blkid == 0)
+ match_offset = TRUE;
+ break;
+ case THT_FREE:
+ /*
+ * We will dirty all the level 1 blocks in
+ * the free range and perhaps the first and
+ * last level 0 block.
+ */ + if (blkid >= beginblk && (blkid <= endblk || + txh->txh_arg2 == DMU_OBJECT_END)) + match_offset = TRUE; + break; + case THT_SPILL: + if (blkid == DMU_SPILL_BLKID) + match_offset = TRUE; + break; + case THT_BONUS: + if (blkid == DMU_BONUS_BLKID) + match_offset = TRUE; + break; + case THT_ZAP: + match_offset = TRUE; + break; + case THT_NEWOBJECT: + match_object = TRUE; + break; + default: + ASSERT(!"bad txh_type"); + } + } + if (match_object && match_offset) { + DB_DNODE_EXIT(db); + return; + } + } + DB_DNODE_EXIT(db); + panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n", + (u_longlong_t)db->db.db_object, db->db_level, + (u_longlong_t)db->db_blkid); +} +#endif + +/* + * If we can't do 10 iops, something is wrong. Let us go ahead + * and hit zfs_dirty_data_max. + */ +hrtime_t zfs_delay_max_ns = MSEC2NSEC(100); +int zfs_delay_resolution_ns = 100 * 1000; /* 100 microseconds */ + +/* + * We delay transactions when we've determined that the backend storage + * isn't able to accommodate the rate of incoming writes. + * + * If there is already a transaction waiting, we delay relative to when + * that transaction finishes waiting. This way the calculated min_time + * is independent of the number of threads concurrently executing + * transactions. + * + * If we are the only waiter, wait relative to when the transaction + * started, rather than the current time. This credits the transaction for + * "time already served", e.g. reading indirect blocks. + * + * The minimum time for a transaction to take is calculated as: + * min_time = scale * (dirty - min) / (max - dirty) + * min_time is then capped at zfs_delay_max_ns. + * + * The delay has two degrees of freedom that can be adjusted via tunables. + * The percentage of dirty data at which we start to delay is defined by + * zfs_delay_min_dirty_percent. This should typically be at or above + * zfs_vdev_async_write_active_max_dirty_percent so that we only start to + * delay after writing at full speed has failed to keep up with the incoming + * write rate. The scale of the curve is defined by zfs_delay_scale. Roughly + * speaking, this variable determines the amount of delay at the midpoint of + * the curve. + * + * delay + * 10ms +-------------------------------------------------------------*+ + * | *| + * 9ms + *+ + * | *| + * 8ms + *+ + * | * | + * 7ms + * + + * | * | + * 6ms + * + + * | * | + * 5ms + * + + * | * | + * 4ms + * + + * | * | + * 3ms + * + + * | * | + * 2ms + (midpoint) * + + * | | ** | + * 1ms + v *** + + * | zfs_delay_scale ----------> ******** | + * 0 +-------------------------------------*********----------------+ + * 0% <- zfs_dirty_data_max -> 100% + * + * Note that since the delay is added to the outstanding time remaining on the + * most recent transaction, the delay is effectively the inverse of IOPS. + * Here the midpoint of 500us translates to 2000 IOPS. The shape of the curve + * was chosen such that small changes in the amount of accumulated dirty data + * in the first 3/4 of the curve yield relatively small differences in the + * amount of delay. 
+ * + * The effects can be easier to understand when the amount of delay is + * represented on a log scale: + * + * delay + * 100ms +-------------------------------------------------------------++ + * + + + * | | + * + *+ + * 10ms + *+ + * + ** + + * | (midpoint) ** | + * + | ** + + * 1ms + v **** + + * + zfs_delay_scale ----------> ***** + + * | **** | + * + **** + + * 100us + ** + + * + * + + * | * | + * + * + + * 10us + * + + * + + + * | | + * + + + * +--------------------------------------------------------------+ + * 0% <- zfs_dirty_data_max -> 100% + * + * Note here that only as the amount of dirty data approaches its limit does + * the delay start to increase rapidly. The goal of a properly tuned system + * should be to keep the amount of dirty data out of that range by first + * ensuring that the appropriate limits are set for the I/O scheduler to reach + * optimal throughput on the backend storage, and then by changing the value + * of zfs_delay_scale to increase the steepness of the curve. + */ +static void +dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty) +{ + dsl_pool_t *dp = tx->tx_pool; + uint64_t delay_min_bytes = + zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100; + hrtime_t wakeup, min_tx_time, now; + + if (dirty <= delay_min_bytes) + return; + + /* + * The caller has already waited until we are under the max. + * We make them pass us the amount of dirty data so we don't + * have to handle the case of it being >= the max, which could + * cause a divide-by-zero if it's == the max. + */ + ASSERT3U(dirty, <, zfs_dirty_data_max); + + now = gethrtime(); + min_tx_time = zfs_delay_scale * + (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty); + if (now > tx->tx_start + min_tx_time) + return; + + min_tx_time = MIN(min_tx_time, zfs_delay_max_ns); + + DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty, + uint64_t, min_tx_time); + + mutex_enter(&dp->dp_lock); + wakeup = MAX(tx->tx_start + min_tx_time, + dp->dp_last_wakeup + min_tx_time); + dp->dp_last_wakeup = wakeup; + mutex_exit(&dp->dp_lock); + +#ifdef _KERNEL +#ifdef illumos + mutex_enter(&curthread->t_delay_lock); + while (cv_timedwait_hires(&curthread->t_delay_cv, + &curthread->t_delay_lock, wakeup, zfs_delay_resolution_ns, + CALLOUT_FLAG_ABSOLUTE | CALLOUT_FLAG_ROUNDUP) > 0) + continue; + mutex_exit(&curthread->t_delay_lock); +#else + pause_sbt("dmu_tx_delay", nstosbt(wakeup), + nstosbt(zfs_delay_resolution_ns), C_ABSOLUTE); +#endif +#else + hrtime_t delta = wakeup - gethrtime(); + struct timespec ts; + ts.tv_sec = delta / NANOSEC; + ts.tv_nsec = delta % NANOSEC; + (void) nanosleep(&ts, NULL); +#endif +} + +/* + * This routine attempts to assign the transaction to a transaction group. + * To do so, we must determine if there is sufficient free space on disk. + * + * If this is a "netfree" transaction (i.e. we called dmu_tx_mark_netfree() + * on it), then it is assumed that there is sufficient free space, + * unless there's insufficient slop space in the pool (see the comment + * above spa_slop_shift in spa_misc.c). + * + * If it is not a "netfree" transaction, then if the data already on disk + * is over the allowed usage (e.g. quota), this will fail with EDQUOT or + * ENOSPC. 
Otherwise, if the current rough estimate of pending changes, + * plus the rough estimate of this transaction's changes, may exceed the + * allowed usage, then this will fail with ERESTART, which will cause the + * caller to wait for the pending changes to be written to disk (by waiting + * for the next TXG to open), and then check the space usage again. + * + * The rough estimate of pending changes is comprised of the sum of: + * + * - this transaction's holds' txh_space_towrite + * + * - dd_tempreserved[], which is the sum of in-flight transactions' + * holds' txh_space_towrite (i.e. those transactions that have called + * dmu_tx_assign() but not yet called dmu_tx_commit()). + * + * - dd_space_towrite[], which is the amount of dirtied dbufs. + * + * Note that all of these values are inflated by spa_get_worst_case_asize(), + * which means that we may get ERESTART well before we are actually in danger + * of running out of space, but this also mitigates any small inaccuracies + * in the rough estimate (e.g. txh_space_towrite doesn't take into account + * indirect blocks, and dd_space_towrite[] doesn't take into account changes + * to the MOS). + * + * Note that due to this algorithm, it is possible to exceed the allowed + * usage by one transaction. Also, as we approach the allowed usage, + * we will allow a very limited amount of changes into each TXG, thus + * decreasing performance. + */ +static int +dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how) +{ + spa_t *spa = tx->tx_pool->dp_spa; + + ASSERT0(tx->tx_txg); + + if (tx->tx_err) + return (tx->tx_err); + + if (spa_suspended(spa)) { + /* + * If the user has indicated a blocking failure mode + * then return ERESTART which will block in dmu_tx_wait(). + * Otherwise, return EIO so that an error can get + * propagated back to the VOP calls. + * + * Note that we always honor the txg_how flag regardless + * of the failuremode setting. + */ + if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE && + !(txg_how & TXG_WAIT)) + return (SET_ERROR(EIO)); + + return (SET_ERROR(ERESTART)); + } + + if (!tx->tx_dirty_delayed && + dsl_pool_need_dirty_delay(tx->tx_pool)) { + tx->tx_wait_dirty = B_TRUE; + return (SET_ERROR(ERESTART)); + } + + tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh); + tx->tx_needassign_txh = NULL; + + /* + * NB: No error returns are allowed after txg_hold_open, but + * before processing the dnode holds, due to the + * dmu_tx_unassign() logic. 
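+ * Specifically, dmu_tx_unassign() assumes every hold ahead of
+ * tx_needassign_txh had dn_tx_holds bumped by the loop below; an
+ * error return between txg_hold_open() and that loop would make
+ * it drop dnode transaction holds that were never taken.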
+ */
+
+ uint64_t towrite = 0;
+ uint64_t tohold = 0;
+ for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL;
+ txh = list_next(&tx->tx_holds, txh)) {
+ dnode_t *dn = txh->txh_dnode;
+ if (dn != NULL) {
+ mutex_enter(&dn->dn_mtx);
+ if (dn->dn_assigned_txg == tx->tx_txg - 1) {
+ mutex_exit(&dn->dn_mtx);
+ tx->tx_needassign_txh = txh;
+ return (SET_ERROR(ERESTART));
+ }
+ if (dn->dn_assigned_txg == 0)
+ dn->dn_assigned_txg = tx->tx_txg;
+ ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
+ (void) refcount_add(&dn->dn_tx_holds, tx);
+ mutex_exit(&dn->dn_mtx);
+ }
+ towrite += refcount_count(&txh->txh_space_towrite);
+ tohold += refcount_count(&txh->txh_memory_tohold);
+ }
+
+ /* needed allocation: worst-case estimate of write space */
+ uint64_t asize = spa_get_worst_case_asize(tx->tx_pool->dp_spa, towrite);
+ /* calculate memory footprint estimate */
+ uint64_t memory = towrite + tohold;
+
+ if (tx->tx_dir != NULL && asize != 0) {
+ int err = dsl_dir_tempreserve_space(tx->tx_dir, memory,
+ asize, tx->tx_netfree, &tx->tx_tempreserve_cookie, tx);
+ if (err != 0)
+ return (err);
+ }
+
+ return (0);
+}
+
+static void
+dmu_tx_unassign(dmu_tx_t *tx)
+{
+ if (tx->tx_txg == 0)
+ return;
+
+ txg_rele_to_quiesce(&tx->tx_txgh);
+
+ /*
+ * Walk the transaction's hold list, removing the hold on the
+ * associated dnode, and notifying waiters if the refcount drops to 0.
+ */
+ for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds);
+ txh != tx->tx_needassign_txh;
+ txh = list_next(&tx->tx_holds, txh)) {
+ dnode_t *dn = txh->txh_dnode;
+
+ if (dn == NULL)
+ continue;
+ mutex_enter(&dn->dn_mtx);
+ ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
+
+ if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
+ dn->dn_assigned_txg = 0;
+ cv_broadcast(&dn->dn_notxholds);
+ }
+ mutex_exit(&dn->dn_mtx);
+ }
+
+ txg_rele_to_sync(&tx->tx_txgh);
+
+ tx->tx_lasttried_txg = tx->tx_txg;
+ tx->tx_txg = 0;
+}
+
+/*
+ * Assign tx to a transaction group; txg_how is a bitmask:
+ *
+ * If TXG_WAIT is set and the currently open txg is full, this function
+ * will wait until there's a new txg. This should be used when no locks
+ * are being held. With this bit set, this function will only fail if
+ * we're truly out of space (or over quota).
+ *
+ * If TXG_WAIT is *not* set and we can't assign into the currently open
+ * txg without blocking, this function will return immediately with
+ * ERESTART. This should be used whenever locks are being held. On an
+ * ERESTART error, the caller should drop all locks, call dmu_tx_wait(),
+ * and try again.
+ *
+ * If TXG_NOTHROTTLE is set, this indicates that this tx should not be
+ * delayed due to the ZFS Write Throttle (see comments in dsl_pool.c for
+ * details on the throttle). This is used by the VFS operations, after
+ * they have already called dmu_tx_wait() (though most likely on a
+ * different tx).
+ */
+int
+dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how)
+{
+ int err;
+
+ ASSERT(tx->tx_txg == 0);
+ ASSERT0(txg_how & ~(TXG_WAIT | TXG_NOTHROTTLE));
+ ASSERT(!dsl_pool_sync_context(tx->tx_pool));
+
+ /* If we might wait, we must not hold the config lock.
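+ *
+ * As an illustrative sketch (a hypothetical caller; os, object,
+ * off, len and buf are placeholders, and TXG_NOWAIT is the
+ * customary name for passing no flags) of the ERESTART retry
+ * convention documented above:
+ *
+ *	top:
+ *		tx = dmu_tx_create(os);
+ *		dmu_tx_hold_write(tx, object, off, len);
+ *		err = dmu_tx_assign(tx, TXG_NOWAIT);
+ *		if (err == ERESTART) {
+ *			dmu_tx_wait(tx);
+ *			dmu_tx_abort(tx);
+ *			goto top;
+ *		}
+ *		if (err != 0) {
+ *			dmu_tx_abort(tx);
+ *			return (err);
+ *		}
+ *		dmu_write(os, object, off, len, buf, tx);
+ *		dmu_tx_commit(tx);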
*/ + IMPLY((txg_how & TXG_WAIT), !dsl_pool_config_held(tx->tx_pool)); + + if ((txg_how & TXG_NOTHROTTLE)) + tx->tx_dirty_delayed = B_TRUE; + + while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) { + dmu_tx_unassign(tx); + + if (err != ERESTART || !(txg_how & TXG_WAIT)) + return (err); + + dmu_tx_wait(tx); + } + + txg_rele_to_quiesce(&tx->tx_txgh); + + return (0); +} + +void +dmu_tx_wait(dmu_tx_t *tx) +{ + spa_t *spa = tx->tx_pool->dp_spa; + dsl_pool_t *dp = tx->tx_pool; + + ASSERT(tx->tx_txg == 0); + ASSERT(!dsl_pool_config_held(tx->tx_pool)); + + if (tx->tx_wait_dirty) { + /* + * dmu_tx_try_assign() has determined that we need to wait + * because we've consumed much or all of the dirty buffer + * space. + */ + mutex_enter(&dp->dp_lock); + while (dp->dp_dirty_total >= zfs_dirty_data_max) + cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock); + uint64_t dirty = dp->dp_dirty_total; + mutex_exit(&dp->dp_lock); + + dmu_tx_delay(tx, dirty); + + tx->tx_wait_dirty = B_FALSE; + + /* + * Note: setting tx_dirty_delayed only has effect if the + * caller used TX_WAIT. Otherwise they are going to + * destroy this tx and try again. The common case, + * zfs_write(), uses TX_WAIT. + */ + tx->tx_dirty_delayed = B_TRUE; + } else if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) { + /* + * If the pool is suspended we need to wait until it + * is resumed. Note that it's possible that the pool + * has become active after this thread has tried to + * obtain a tx. If that's the case then tx_lasttried_txg + * would not have been set. + */ + txg_wait_synced(dp, spa_last_synced_txg(spa) + 1); + } else if (tx->tx_needassign_txh) { + /* + * A dnode is assigned to the quiescing txg. Wait for its + * transaction to complete. + */ + dnode_t *dn = tx->tx_needassign_txh->txh_dnode; + + mutex_enter(&dn->dn_mtx); + while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1) + cv_wait(&dn->dn_notxholds, &dn->dn_mtx); + mutex_exit(&dn->dn_mtx); + tx->tx_needassign_txh = NULL; + } else { + /* + * If we have a lot of dirty data just wait until we sync + * out a TXG at which point we'll hopefully have synced + * a portion of the changes. + */ + txg_wait_synced(dp, spa_last_synced_txg(spa) + 1); + } +} + +static void +dmu_tx_destroy(dmu_tx_t *tx) +{ + dmu_tx_hold_t *txh; + + while ((txh = list_head(&tx->tx_holds)) != NULL) { + dnode_t *dn = txh->txh_dnode; + + list_remove(&tx->tx_holds, txh); + refcount_destroy_many(&txh->txh_space_towrite, + refcount_count(&txh->txh_space_towrite)); + refcount_destroy_many(&txh->txh_memory_tohold, + refcount_count(&txh->txh_memory_tohold)); + kmem_free(txh, sizeof (dmu_tx_hold_t)); + if (dn != NULL) + dnode_rele(dn, tx); + } + + list_destroy(&tx->tx_callbacks); + list_destroy(&tx->tx_holds); + kmem_free(tx, sizeof (dmu_tx_t)); +} + +void +dmu_tx_commit(dmu_tx_t *tx) +{ + ASSERT(tx->tx_txg != 0); + + /* + * Go through the transaction's hold list and remove holds on + * associated dnodes, notifying waiters if no holds remain. 
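+ * This mirrors the unwind in dmu_tx_unassign(), except that here
+ * the entire hold list is walked. Note that dmu_tx_commit()
+ * finishes by calling dmu_tx_destroy(), so the tx must not be
+ * referenced after it returns.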
+ */ + for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL; + txh = list_next(&tx->tx_holds, txh)) { + dnode_t *dn = txh->txh_dnode; + + if (dn == NULL) + continue; + + mutex_enter(&dn->dn_mtx); + ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); + + if (refcount_remove(&dn->dn_tx_holds, tx) == 0) { + dn->dn_assigned_txg = 0; + cv_broadcast(&dn->dn_notxholds); + } + mutex_exit(&dn->dn_mtx); + } + + if (tx->tx_tempreserve_cookie) + dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx); + + if (!list_is_empty(&tx->tx_callbacks)) + txg_register_callbacks(&tx->tx_txgh, &tx->tx_callbacks); + + if (tx->tx_anyobj == FALSE) + txg_rele_to_sync(&tx->tx_txgh); + + dmu_tx_destroy(tx); +} + +void +dmu_tx_abort(dmu_tx_t *tx) +{ + ASSERT(tx->tx_txg == 0); + + /* + * Call any registered callbacks with an error code. + */ + if (!list_is_empty(&tx->tx_callbacks)) + dmu_tx_do_callbacks(&tx->tx_callbacks, ECANCELED); + + dmu_tx_destroy(tx); +} + +uint64_t +dmu_tx_get_txg(dmu_tx_t *tx) +{ + ASSERT(tx->tx_txg != 0); + return (tx->tx_txg); +} + +dsl_pool_t * +dmu_tx_pool(dmu_tx_t *tx) +{ + ASSERT(tx->tx_pool != NULL); + return (tx->tx_pool); +} + +void +dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data) +{ + dmu_tx_callback_t *dcb; + + dcb = kmem_alloc(sizeof (dmu_tx_callback_t), KM_SLEEP); + + dcb->dcb_func = func; + dcb->dcb_data = data; + + list_insert_tail(&tx->tx_callbacks, dcb); +} + +/* + * Call all the commit callbacks on a list, with a given error code. + */ +void +dmu_tx_do_callbacks(list_t *cb_list, int error) +{ + dmu_tx_callback_t *dcb; + + while ((dcb = list_head(cb_list)) != NULL) { + list_remove(cb_list, dcb); + dcb->dcb_func(dcb->dcb_data, error); + kmem_free(dcb, sizeof (dmu_tx_callback_t)); + } +} + +/* + * Interface to hold a bunch of attributes. + * used for creating new files. + * attrsize is the total size of all attributes + * to be added during object creation + * + * For updating/adding a single attribute dmu_tx_hold_sa() should be used. + */ + +/* + * hold necessary attribute name for attribute registration. + * should be a very rare case where this is needed. If it does + * happen it would only happen on the first write to the file system. 
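+ *
+ * (Consumers do not call this directly; e.g. a hypothetical
+ * ZPL-style attribute update would call
+ *	dmu_tx_hold_sa(tx, hdl, B_FALSE);
+ * and the registration holds are taken on its behalf here.)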
+ */ +static void +dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx) +{ + if (!sa->sa_need_attr_registration) + return; + + for (int i = 0; i != sa->sa_num_attrs; i++) { + if (!sa->sa_attr_table[i].sa_registered) { + if (sa->sa_reg_attr_obj) + dmu_tx_hold_zap(tx, sa->sa_reg_attr_obj, + B_TRUE, sa->sa_attr_table[i].sa_name); + else + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, + B_TRUE, sa->sa_attr_table[i].sa_name); + } + } +} + +void +dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object) +{ + dmu_tx_hold_t *txh = dmu_tx_hold_object_impl(tx, + tx->tx_objset, object, THT_SPILL, 0, 0); + + (void) refcount_add_many(&txh->txh_space_towrite, + SPA_OLD_MAXBLOCKSIZE, FTAG); +} + +void +dmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize) +{ + sa_os_t *sa = tx->tx_objset->os_sa; + + dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); + + if (tx->tx_objset->os_sa->sa_master_obj == 0) + return; + + if (tx->tx_objset->os_sa->sa_layout_attr_obj) { + dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL); + } else { + dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS); + dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); + } + + dmu_tx_sa_registration_hold(sa, tx); + + if (attrsize <= DN_OLD_MAX_BONUSLEN && !sa->sa_force_spill) + return; + + (void) dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT, + THT_SPILL, 0, 0); +} + +/* + * Hold SA attribute + * + * dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *, attribute, add, size) + * + * variable_size is the total size of all variable sized attributes + * passed to this function. It is not the total size of all + * variable size attributes that *may* exist on this object. + */ +void +dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow) +{ + uint64_t object; + sa_os_t *sa = tx->tx_objset->os_sa; + + ASSERT(hdl != NULL); + + object = sa_handle_object(hdl); + + dmu_tx_hold_bonus(tx, object); + + if (tx->tx_objset->os_sa->sa_master_obj == 0) + return; + + if (tx->tx_objset->os_sa->sa_reg_attr_obj == 0 || + tx->tx_objset->os_sa->sa_layout_attr_obj == 0) { + dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS); + dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); + } + + dmu_tx_sa_registration_hold(sa, tx); + + if (may_grow && tx->tx_objset->os_sa->sa_layout_attr_obj) + dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL); + + if (sa->sa_force_spill || may_grow || hdl->sa_spill) { + ASSERT(tx->tx_txg == 0); + dmu_tx_hold_spill(tx, object); + } else { + dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus; + dnode_t *dn; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + if (dn->dn_have_spill) { + ASSERT(tx->tx_txg == 0); + dmu_tx_hold_spill(tx, object); + } + DB_DNODE_EXIT(db); + } +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c new file mode 100644 index 000000000000..174a3120eb5e --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c @@ -0,0 +1,373 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 2013, 2015 by Delphix. All rights reserved. + */ + +#include <sys/zfs_context.h> +#include <sys/dnode.h> +#include <sys/dmu_objset.h> +#include <sys/dmu_zfetch.h> +#include <sys/dmu.h> +#include <sys/dbuf.h> +#include <sys/kstat.h> + +/* + * This tunable disables predictive prefetch. Note that it leaves "prescient" + * prefetch (e.g. prefetch for zfs send) intact. Unlike predictive prefetch, + * prescient prefetch never issues i/os that end up not being needed, + * so it can't hurt performance. + */ +boolean_t zfs_prefetch_disable = B_FALSE; + +/* max # of streams per zfetch */ +uint32_t zfetch_max_streams = 8; +/* min time before stream reclaim */ +uint32_t zfetch_min_sec_reap = 2; +/* max bytes to prefetch per stream (default 8MB) */ +uint32_t zfetch_max_distance = 8 * 1024 * 1024; +/* max bytes to prefetch indirects for per stream (default 64MB) */ +uint32_t zfetch_max_idistance = 64 * 1024 * 1024; +/* max number of bytes in an array_read in which we allow prefetching (1MB) */ +uint64_t zfetch_array_rd_sz = 1024 * 1024; + +SYSCTL_DECL(_vfs_zfs); +SYSCTL_INT(_vfs_zfs, OID_AUTO, prefetch_disable, CTLFLAG_RW, + &zfs_prefetch_disable, 0, "Disable prefetch"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, zfetch, CTLFLAG_RW, 0, "ZFS ZFETCH"); +SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_streams, CTLFLAG_RWTUN, + &zfetch_max_streams, 0, "Max # of streams per zfetch"); +SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, min_sec_reap, CTLFLAG_RWTUN, + &zfetch_min_sec_reap, 0, "Min time before stream reclaim"); +SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_distance, CTLFLAG_RWTUN, + &zfetch_max_distance, 0, "Max bytes to prefetch per stream"); +SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_idistance, CTLFLAG_RWTUN, + &zfetch_max_idistance, 0, "Max bytes to prefetch indirects for per stream"); +SYSCTL_UQUAD(_vfs_zfs_zfetch, OID_AUTO, array_rd_sz, CTLFLAG_RWTUN, + &zfetch_array_rd_sz, 0, + "Number of bytes in a array_read at which we stop prefetching"); + +typedef struct zfetch_stats { + kstat_named_t zfetchstat_hits; + kstat_named_t zfetchstat_misses; + kstat_named_t zfetchstat_max_streams; +} zfetch_stats_t; + +static zfetch_stats_t zfetch_stats = { + { "hits", KSTAT_DATA_UINT64 }, + { "misses", KSTAT_DATA_UINT64 }, + { "max_streams", KSTAT_DATA_UINT64 }, +}; + +#define ZFETCHSTAT_BUMP(stat) \ + atomic_inc_64(&zfetch_stats.stat.value.ui64); + +kstat_t *zfetch_ksp; + +void +zfetch_init(void) +{ + zfetch_ksp = kstat_create("zfs", 0, "zfetchstats", "misc", + KSTAT_TYPE_NAMED, sizeof (zfetch_stats) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + + if (zfetch_ksp != NULL) { + zfetch_ksp->ks_data = &zfetch_stats; + kstat_install(zfetch_ksp); + } +} + +void +zfetch_fini(void) +{ + if (zfetch_ksp != NULL) { + kstat_delete(zfetch_ksp); + zfetch_ksp = NULL; + } +} + +/* + * This takes a pointer to a zfetch structure and a dnode. 
It performs the + * necessary setup for the zfetch structure, grokking data from the + * associated dnode. + */ +void +dmu_zfetch_init(zfetch_t *zf, dnode_t *dno) +{ + if (zf == NULL) + return; + + zf->zf_dnode = dno; + + list_create(&zf->zf_stream, sizeof (zstream_t), + offsetof(zstream_t, zs_node)); + + rw_init(&zf->zf_rwlock, NULL, RW_DEFAULT, NULL); +} + +static void +dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs) +{ + ASSERT(RW_WRITE_HELD(&zf->zf_rwlock)); + list_remove(&zf->zf_stream, zs); + mutex_destroy(&zs->zs_lock); + kmem_free(zs, sizeof (*zs)); +} + +/* + * Clean-up state associated with a zfetch structure (e.g. destroy the + * streams). This doesn't free the zfetch_t itself, that's left to the caller. + */ +void +dmu_zfetch_fini(zfetch_t *zf) +{ + zstream_t *zs; + + ASSERT(!RW_LOCK_HELD(&zf->zf_rwlock)); + + rw_enter(&zf->zf_rwlock, RW_WRITER); + while ((zs = list_head(&zf->zf_stream)) != NULL) + dmu_zfetch_stream_remove(zf, zs); + rw_exit(&zf->zf_rwlock); + list_destroy(&zf->zf_stream); + rw_destroy(&zf->zf_rwlock); + + zf->zf_dnode = NULL; +} + +/* + * If there aren't too many streams already, create a new stream. + * The "blkid" argument is the next block that we expect this stream to access. + * While we're here, clean up old streams (which haven't been + * accessed for at least zfetch_min_sec_reap seconds). + */ +static void +dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid) +{ + zstream_t *zs_next; + int numstreams = 0; + + ASSERT(RW_WRITE_HELD(&zf->zf_rwlock)); + + /* + * Clean up old streams. + */ + for (zstream_t *zs = list_head(&zf->zf_stream); + zs != NULL; zs = zs_next) { + zs_next = list_next(&zf->zf_stream, zs); + if (((gethrtime() - zs->zs_atime) / NANOSEC) > + zfetch_min_sec_reap) + dmu_zfetch_stream_remove(zf, zs); + else + numstreams++; + } + + /* + * The maximum number of streams is normally zfetch_max_streams, + * but for small files we lower it such that it's at least possible + * for all the streams to be non-overlapping. + * + * If we are already at the maximum number of streams for this file, + * even after removing old streams, then don't create this stream. + */ + uint32_t max_streams = MAX(1, MIN(zfetch_max_streams, + zf->zf_dnode->dn_maxblkid * zf->zf_dnode->dn_datablksz / + zfetch_max_distance)); + if (numstreams >= max_streams) { + ZFETCHSTAT_BUMP(zfetchstat_max_streams); + return; + } + + zstream_t *zs = kmem_zalloc(sizeof (*zs), KM_SLEEP); + zs->zs_blkid = blkid; + zs->zs_pf_blkid = blkid; + zs->zs_ipf_blkid = blkid; + zs->zs_atime = gethrtime(); + mutex_init(&zs->zs_lock, NULL, MUTEX_DEFAULT, NULL); + + list_insert_head(&zf->zf_stream, zs); +} + +/* + * This is the predictive prefetch entry point. It associates dnode access + * specified with blkid and nblks arguments with prefetch stream, predicts + * further accesses based on that stats and initiates speculative prefetch. + * fetch_data argument specifies whether actual data blocks should be fetched: + * FALSE -- prefetch only indirect blocks for predicted data blocks; + * TRUE -- prefetch predicted data blocks plus following indirect blocks. 
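+ *
+ * A hypothetical worked example of the doubling behaviour
+ * implemented below: suppose a stream sits at zs_blkid = 10 with
+ * zs_pf_blkid = 20 (10 blocks of data prefetch outstanding), and
+ * the reader now accesses blkid = 10, nblks = 5. Then
+ * end_of_access_blkid = 15, pf_start = MAX(20, 15) = 20, and
+ * pf_ahead_blks = 20 - 10 + 5 = 15, so (distance cap permitting)
+ * blocks 20..34 are prefetched and zs_pf_blkid becomes 35 --
+ * i.e. the prefetch distance ahead of the reader grows from
+ * 10 blocks to 20.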
+ */
+void
+dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data)
+{
+ zstream_t *zs;
+ int64_t pf_start, ipf_start, ipf_istart, ipf_iend;
+ int64_t pf_ahead_blks, max_blks;
+ int epbs, max_dist_blks, pf_nblks, ipf_nblks;
+ uint64_t end_of_access_blkid = blkid + nblks;
+ spa_t *spa = zf->zf_dnode->dn_objset->os_spa;
+
+ if (zfs_prefetch_disable)
+ return;
+
+ /*
+ * If we haven't yet loaded the indirect vdevs' mappings, we
+ * can only read from blocks that we carefully ensure are on
+ * concrete vdevs (or previously-loaded indirect vdevs). So we
+ * can't allow the predictive prefetcher to attempt reads of other
+ * blocks (e.g. of the MOS's dnode object).
+ */
+ if (!spa_indirect_vdevs_loaded(spa))
+ return;
+
+ /*
+ * As a fast path for small (single-block) files, ignore access
+ * to the first block.
+ */
+ if (blkid == 0)
+ return;
+
+ rw_enter(&zf->zf_rwlock, RW_READER);
+
+ /*
+ * Find a matching prefetch stream. Depending on whether the accesses
+ * are block-aligned, the first block of the new access may either
+ * follow the last block of the previous access, or be equal to it.
+ */
+ for (zs = list_head(&zf->zf_stream); zs != NULL;
+ zs = list_next(&zf->zf_stream, zs)) {
+ if (blkid == zs->zs_blkid || blkid + 1 == zs->zs_blkid) {
+ mutex_enter(&zs->zs_lock);
+ /*
+ * zs_blkid could have changed before we
+ * acquired zs_lock; re-check it here.
+ */
+ if (blkid == zs->zs_blkid) {
+ break;
+ } else if (blkid + 1 == zs->zs_blkid) {
+ blkid++;
+ nblks--;
+ if (nblks == 0) {
+ /* Already prefetched this before. */
+ mutex_exit(&zs->zs_lock);
+ rw_exit(&zf->zf_rwlock);
+ return;
+ }
+ break;
+ }
+ mutex_exit(&zs->zs_lock);
+ }
+ }
+
+ if (zs == NULL) {
+ /*
+ * This access is not part of any existing stream. Create
+ * a new stream for it.
+ */
+ ZFETCHSTAT_BUMP(zfetchstat_misses);
+ if (rw_tryupgrade(&zf->zf_rwlock))
+ dmu_zfetch_stream_create(zf, end_of_access_blkid);
+ rw_exit(&zf->zf_rwlock);
+ return;
+ }
+
+ /*
+ * This access was to a block that we issued a prefetch for on
+ * behalf of this stream. Issue further prefetches for this stream.
+ *
+ * Normally, we start prefetching where we stopped
+ * prefetching last (zs_pf_blkid). But when we get our first
+ * hit on this stream, zs_pf_blkid == zs_blkid, we don't
+ * want to prefetch the block we just accessed. In this case,
+ * start just after the block we just accessed.
+ */
+ pf_start = MAX(zs->zs_pf_blkid, end_of_access_blkid);
+
+ /*
+ * Double our amount of prefetched data, but don't let the
+ * prefetch get further ahead than zfetch_max_distance.
+ */
+ if (fetch_data) {
+ max_dist_blks =
+ zfetch_max_distance >> zf->zf_dnode->dn_datablkshift;
+ /*
+ * Previously, we were (zs_pf_blkid - blkid) ahead. We
+ * want to now be double that, so read that amount again,
+ * plus the amount we are catching up by (i.e. the amount
+ * read just now).
+ */
+ pf_ahead_blks = zs->zs_pf_blkid - blkid + nblks;
+ max_blks = max_dist_blks - (pf_start - end_of_access_blkid);
+ pf_nblks = MIN(pf_ahead_blks, max_blks);
+ } else {
+ pf_nblks = 0;
+ }
+
+ zs->zs_pf_blkid = pf_start + pf_nblks;
+
+ /*
+ * Do the same for indirects, starting from where we stopped last,
+ * or where we will stop reading data blocks (and the indirects
+ * that point to them).
+ */
+ ipf_start = MAX(zs->zs_ipf_blkid, zs->zs_pf_blkid);
+ max_dist_blks = zfetch_max_idistance >> zf->zf_dnode->dn_datablkshift;
+ /*
+ * We want to double our distance ahead of the data prefetch
+ * (or reader, if we are not prefetching data).
Previously, we + * were (zs_ipf_blkid - blkid) ahead. To double that, we read + * that amount again, plus the amount we are catching up by + * (i.e. the amount read now + the amount of data prefetched now). + */ + pf_ahead_blks = zs->zs_ipf_blkid - blkid + nblks + pf_nblks; + max_blks = max_dist_blks - (ipf_start - end_of_access_blkid); + ipf_nblks = MIN(pf_ahead_blks, max_blks); + zs->zs_ipf_blkid = ipf_start + ipf_nblks; + + epbs = zf->zf_dnode->dn_indblkshift - SPA_BLKPTRSHIFT; + ipf_istart = P2ROUNDUP(ipf_start, 1 << epbs) >> epbs; + ipf_iend = P2ROUNDUP(zs->zs_ipf_blkid, 1 << epbs) >> epbs; + + zs->zs_atime = gethrtime(); + zs->zs_blkid = end_of_access_blkid; + mutex_exit(&zs->zs_lock); + rw_exit(&zf->zf_rwlock); + + /* + * dbuf_prefetch() is asynchronous (even when it needs to read + * indirect blocks), but we still prefer to drop our locks before + * calling it to reduce the time we hold them. + */ + + for (int i = 0; i < pf_nblks; i++) { + dbuf_prefetch(zf->zf_dnode, 0, pf_start + i, + ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH); + } + for (int64_t iblk = ipf_istart; iblk < ipf_iend; iblk++) { + dbuf_prefetch(zf->zf_dnode, 1, iblk, + ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH); + } + ZFETCHSTAT_BUMP(zfetchstat_hits); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c new file mode 100644 index 000000000000..0c3b09446ae7 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c @@ -0,0 +1,2223 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + * Copyright 2017 RackTop Systems. + */ + +#include <sys/zfs_context.h> +#include <sys/dbuf.h> +#include <sys/dnode.h> +#include <sys/dmu.h> +#include <sys/dmu_impl.h> +#include <sys/dmu_tx.h> +#include <sys/dmu_objset.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_dataset.h> +#include <sys/spa.h> +#include <sys/zio.h> +#include <sys/dmu_zfetch.h> +#include <sys/range_tree.h> + +static kmem_cache_t *dnode_cache; +/* + * Define DNODE_STATS to turn on statistic gathering. By default, it is only + * turned on when DEBUG is also defined. 
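+ * DNODE_STAT_ADD() below expands to a plain, unlocked increment,
+ * so these counters are best-effort debugging aids rather than
+ * exact statistics.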
+ */ +#ifdef DEBUG +#define DNODE_STATS +#endif /* DEBUG */ + +#ifdef DNODE_STATS +#define DNODE_STAT_ADD(stat) ((stat)++) +#else +#define DNODE_STAT_ADD(stat) /* nothing */ +#endif /* DNODE_STATS */ + +static dnode_phys_t dnode_phys_zero; + +int zfs_default_bs = SPA_MINBLOCKSHIFT; +int zfs_default_ibs = DN_MAX_INDBLKSHIFT; + +SYSCTL_DECL(_vfs_zfs); +SYSCTL_INT(_vfs_zfs, OID_AUTO, default_bs, CTLFLAG_RWTUN, + &zfs_default_bs, 0, "Default dnode block shift"); +SYSCTL_INT(_vfs_zfs, OID_AUTO, default_ibs, CTLFLAG_RWTUN, + &zfs_default_ibs, 0, "Default dnode indirect block shift"); + +#ifdef illumos +#ifdef _KERNEL +static kmem_cbrc_t dnode_move(void *, void *, size_t, void *); +#endif /* _KERNEL */ +#endif + +static int +dbuf_compare(const void *x1, const void *x2) +{ + const dmu_buf_impl_t *d1 = x1; + const dmu_buf_impl_t *d2 = x2; + + int cmp = AVL_CMP(d1->db_level, d2->db_level); + if (likely(cmp)) + return (cmp); + + cmp = AVL_CMP(d1->db_blkid, d2->db_blkid); + if (likely(cmp)) + return (cmp); + + if (d1->db_state == DB_SEARCH) { + ASSERT3S(d2->db_state, !=, DB_SEARCH); + return (-1); + } else if (d2->db_state == DB_SEARCH) { + ASSERT3S(d1->db_state, !=, DB_SEARCH); + return (1); + } + + return (AVL_PCMP(d1, d2)); +} + +/* ARGSUSED */ +static int +dnode_cons(void *arg, void *unused, int kmflag) +{ + dnode_t *dn = arg; + int i; + + rw_init(&dn->dn_struct_rwlock, NULL, RW_DEFAULT, NULL); + mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&dn->dn_dbufs_mtx, NULL, MUTEX_DEFAULT, NULL); + cv_init(&dn->dn_notxholds, NULL, CV_DEFAULT, NULL); + + /* + * Every dbuf has a reference, and dropping a tracked reference is + * O(number of references), so don't track dn_holds. + */ + refcount_create_untracked(&dn->dn_holds); + refcount_create(&dn->dn_tx_holds); + list_link_init(&dn->dn_link); + + bzero(&dn->dn_next_nblkptr[0], sizeof (dn->dn_next_nblkptr)); + bzero(&dn->dn_next_nlevels[0], sizeof (dn->dn_next_nlevels)); + bzero(&dn->dn_next_indblkshift[0], sizeof (dn->dn_next_indblkshift)); + bzero(&dn->dn_next_bonustype[0], sizeof (dn->dn_next_bonustype)); + bzero(&dn->dn_rm_spillblk[0], sizeof (dn->dn_rm_spillblk)); + bzero(&dn->dn_next_bonuslen[0], sizeof (dn->dn_next_bonuslen)); + bzero(&dn->dn_next_blksz[0], sizeof (dn->dn_next_blksz)); + + for (i = 0; i < TXG_SIZE; i++) { + list_link_init(&dn->dn_dirty_link[i]); + dn->dn_free_ranges[i] = NULL; + list_create(&dn->dn_dirty_records[i], + sizeof (dbuf_dirty_record_t), + offsetof(dbuf_dirty_record_t, dr_dirty_node)); + } + + dn->dn_allocated_txg = 0; + dn->dn_free_txg = 0; + dn->dn_assigned_txg = 0; + dn->dn_dirtyctx = 0; + dn->dn_dirtyctx_firstset = NULL; + dn->dn_bonus = NULL; + dn->dn_have_spill = B_FALSE; + dn->dn_zio = NULL; + dn->dn_oldused = 0; + dn->dn_oldflags = 0; + dn->dn_olduid = 0; + dn->dn_oldgid = 0; + dn->dn_newuid = 0; + dn->dn_newgid = 0; + dn->dn_id_flags = 0; + + dn->dn_dbufs_count = 0; + avl_create(&dn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t), + offsetof(dmu_buf_impl_t, db_link)); + + dn->dn_moved = 0; + POINTER_INVALIDATE(&dn->dn_objset); + return (0); +} + +/* ARGSUSED */ +static void +dnode_dest(void *arg, void *unused) +{ + int i; + dnode_t *dn = arg; + + rw_destroy(&dn->dn_struct_rwlock); + mutex_destroy(&dn->dn_mtx); + mutex_destroy(&dn->dn_dbufs_mtx); + cv_destroy(&dn->dn_notxholds); + refcount_destroy(&dn->dn_holds); + refcount_destroy(&dn->dn_tx_holds); + ASSERT(!list_link_active(&dn->dn_link)); + + for (i = 0; i < TXG_SIZE; i++) { + ASSERT(!list_link_active(&dn->dn_dirty_link[i])); + 
ASSERT3P(dn->dn_free_ranges[i], ==, NULL); + list_destroy(&dn->dn_dirty_records[i]); + ASSERT0(dn->dn_next_nblkptr[i]); + ASSERT0(dn->dn_next_nlevels[i]); + ASSERT0(dn->dn_next_indblkshift[i]); + ASSERT0(dn->dn_next_bonustype[i]); + ASSERT0(dn->dn_rm_spillblk[i]); + ASSERT0(dn->dn_next_bonuslen[i]); + ASSERT0(dn->dn_next_blksz[i]); + } + + ASSERT0(dn->dn_allocated_txg); + ASSERT0(dn->dn_free_txg); + ASSERT0(dn->dn_assigned_txg); + ASSERT0(dn->dn_dirtyctx); + ASSERT3P(dn->dn_dirtyctx_firstset, ==, NULL); + ASSERT3P(dn->dn_bonus, ==, NULL); + ASSERT(!dn->dn_have_spill); + ASSERT3P(dn->dn_zio, ==, NULL); + ASSERT0(dn->dn_oldused); + ASSERT0(dn->dn_oldflags); + ASSERT0(dn->dn_olduid); + ASSERT0(dn->dn_oldgid); + ASSERT0(dn->dn_newuid); + ASSERT0(dn->dn_newgid); + ASSERT0(dn->dn_id_flags); + + ASSERT0(dn->dn_dbufs_count); + avl_destroy(&dn->dn_dbufs); +} + +void +dnode_init(void) +{ + ASSERT(dnode_cache == NULL); + dnode_cache = kmem_cache_create("dnode_t", + sizeof (dnode_t), + 0, dnode_cons, dnode_dest, NULL, NULL, NULL, 0); +#ifdef _KERNEL + kmem_cache_set_move(dnode_cache, dnode_move); +#endif /* _KERNEL */ +} + +void +dnode_fini(void) +{ + kmem_cache_destroy(dnode_cache); + dnode_cache = NULL; +} + + +#ifdef ZFS_DEBUG +void +dnode_verify(dnode_t *dn) +{ + int drop_struct_lock = FALSE; + + ASSERT(dn->dn_phys); + ASSERT(dn->dn_objset); + ASSERT(dn->dn_handle->dnh_dnode == dn); + + ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type)); + + if (!(zfs_flags & ZFS_DEBUG_DNODE_VERIFY)) + return; + + if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { + rw_enter(&dn->dn_struct_rwlock, RW_READER); + drop_struct_lock = TRUE; + } + if (dn->dn_phys->dn_type != DMU_OT_NONE || dn->dn_allocated_txg != 0) { + int i; + int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots); + ASSERT3U(dn->dn_indblkshift, >=, 0); + ASSERT3U(dn->dn_indblkshift, <=, SPA_MAXBLOCKSHIFT); + if (dn->dn_datablkshift) { + ASSERT3U(dn->dn_datablkshift, >=, SPA_MINBLOCKSHIFT); + ASSERT3U(dn->dn_datablkshift, <=, SPA_MAXBLOCKSHIFT); + ASSERT3U(1<<dn->dn_datablkshift, ==, dn->dn_datablksz); + } + ASSERT3U(dn->dn_nlevels, <=, 30); + ASSERT(DMU_OT_IS_VALID(dn->dn_type)); + ASSERT3U(dn->dn_nblkptr, >=, 1); + ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR); + ASSERT3U(dn->dn_bonuslen, <=, max_bonuslen); + ASSERT3U(dn->dn_datablksz, ==, + dn->dn_datablkszsec << SPA_MINBLOCKSHIFT); + ASSERT3U(ISP2(dn->dn_datablksz), ==, dn->dn_datablkshift != 0); + ASSERT3U((dn->dn_nblkptr - 1) * sizeof (blkptr_t) + + dn->dn_bonuslen, <=, max_bonuslen); + for (i = 0; i < TXG_SIZE; i++) { + ASSERT3U(dn->dn_next_nlevels[i], <=, dn->dn_nlevels); + } + } + if (dn->dn_phys->dn_type != DMU_OT_NONE) + ASSERT3U(dn->dn_phys->dn_nlevels, <=, dn->dn_nlevels); + ASSERT(DMU_OBJECT_IS_SPECIAL(dn->dn_object) || dn->dn_dbuf != NULL); + if (dn->dn_dbuf != NULL) { + ASSERT3P(dn->dn_phys, ==, + (dnode_phys_t *)dn->dn_dbuf->db.db_data + + (dn->dn_object % (dn->dn_dbuf->db.db_size >> DNODE_SHIFT))); + } + if (drop_struct_lock) + rw_exit(&dn->dn_struct_rwlock); +} +#endif + +void +dnode_byteswap(dnode_phys_t *dnp) +{ + uint64_t *buf64 = (void*)&dnp->dn_blkptr; + int i; + + if (dnp->dn_type == DMU_OT_NONE) { + bzero(dnp, sizeof (dnode_phys_t)); + return; + } + + dnp->dn_datablkszsec = BSWAP_16(dnp->dn_datablkszsec); + dnp->dn_bonuslen = BSWAP_16(dnp->dn_bonuslen); + dnp->dn_extra_slots = BSWAP_8(dnp->dn_extra_slots); + dnp->dn_maxblkid = BSWAP_64(dnp->dn_maxblkid); + dnp->dn_used = BSWAP_64(dnp->dn_used); + + /* + * dn_nblkptr is only one byte, so it's OK to read it in either + * byte order. 
We can't read dn_bonuslen. + */ + ASSERT(dnp->dn_indblkshift <= SPA_MAXBLOCKSHIFT); + ASSERT(dnp->dn_nblkptr <= DN_MAX_NBLKPTR); + for (i = 0; i < dnp->dn_nblkptr * sizeof (blkptr_t)/8; i++) + buf64[i] = BSWAP_64(buf64[i]); + + /* + * OK to check dn_bonuslen for zero, because it won't matter if + * we have the wrong byte order. This is necessary because the + * dnode dnode is smaller than a regular dnode. + */ + if (dnp->dn_bonuslen != 0) { + /* + * Note that the bonus length calculated here may be + * longer than the actual bonus buffer. This is because + * we always put the bonus buffer after the last block + * pointer (instead of packing it against the end of the + * dnode buffer). + */ + int off = (dnp->dn_nblkptr-1) * sizeof (blkptr_t); + int slots = dnp->dn_extra_slots + 1; + size_t len = DN_SLOTS_TO_BONUSLEN(slots) - off; + ASSERT(DMU_OT_IS_VALID(dnp->dn_bonustype)); + dmu_object_byteswap_t byteswap = + DMU_OT_BYTESWAP(dnp->dn_bonustype); + dmu_ot_byteswap[byteswap].ob_func(dnp->dn_bonus + off, len); + } + + /* Swap SPILL block if we have one */ + if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) + byteswap_uint64_array(DN_SPILL_BLKPTR(dnp), sizeof (blkptr_t)); +} + +void +dnode_buf_byteswap(void *vbuf, size_t size) +{ + int i = 0; + + ASSERT3U(sizeof (dnode_phys_t), ==, (1<<DNODE_SHIFT)); + ASSERT((size & (sizeof (dnode_phys_t)-1)) == 0); + + while (i < size) { + dnode_phys_t *dnp = vbuf + i; + dnode_byteswap(dnp); + + i += DNODE_MIN_SIZE; + if (dnp->dn_type != DMU_OT_NONE) + i += dnp->dn_extra_slots * DNODE_MIN_SIZE; + } +} + +void +dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx) +{ + ASSERT3U(refcount_count(&dn->dn_holds), >=, 1); + + dnode_setdirty(dn, tx); + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + ASSERT3U(newsize, <=, DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) - + (dn->dn_nblkptr-1) * sizeof (blkptr_t)); + dn->dn_bonuslen = newsize; + if (newsize == 0) + dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = DN_ZERO_BONUSLEN; + else + dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen; + rw_exit(&dn->dn_struct_rwlock); +} + +void +dnode_setbonus_type(dnode_t *dn, dmu_object_type_t newtype, dmu_tx_t *tx) +{ + ASSERT3U(refcount_count(&dn->dn_holds), >=, 1); + dnode_setdirty(dn, tx); + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + dn->dn_bonustype = newtype; + dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype; + rw_exit(&dn->dn_struct_rwlock); +} + +void +dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx) +{ + ASSERT3U(refcount_count(&dn->dn_holds), >=, 1); + ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); + dnode_setdirty(dn, tx); + dn->dn_rm_spillblk[tx->tx_txg&TXG_MASK] = DN_KILL_SPILLBLK; + dn->dn_have_spill = B_FALSE; +} + +static void +dnode_setdblksz(dnode_t *dn, int size) +{ + ASSERT0(P2PHASE(size, SPA_MINBLOCKSIZE)); + ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); + ASSERT3U(size, >=, SPA_MINBLOCKSIZE); + ASSERT3U(size >> SPA_MINBLOCKSHIFT, <, + 1<<(sizeof (dn->dn_phys->dn_datablkszsec) * 8)); + dn->dn_datablksz = size; + dn->dn_datablkszsec = size >> SPA_MINBLOCKSHIFT; + dn->dn_datablkshift = ISP2(size) ? highbit64(size - 1) : 0; +} + +static dnode_t * +dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db, + uint64_t object, dnode_handle_t *dnh) +{ + dnode_t *dn; + + dn = kmem_cache_alloc(dnode_cache, KM_SLEEP); +#ifdef _KERNEL + ASSERT(!POINTER_IS_VALID(dn->dn_objset)); +#endif /* _KERNEL */ + dn->dn_moved = 0; + + /* + * Defer setting dn_objset until the dnode is ready to be a candidate + * for the dnode_move() callback.
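+ * (kmem may invoke that callback at any time once dn_objset looks
+ * valid, so every other field is initialized first and dn_objset is
+ * published last, behind the membar_producer() below.)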
+ */ + dn->dn_object = object; + dn->dn_dbuf = db; + dn->dn_handle = dnh; + dn->dn_phys = dnp; + + if (dnp->dn_datablkszsec) { + dnode_setdblksz(dn, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); + } else { + dn->dn_datablksz = 0; + dn->dn_datablkszsec = 0; + dn->dn_datablkshift = 0; + } + dn->dn_indblkshift = dnp->dn_indblkshift; + dn->dn_nlevels = dnp->dn_nlevels; + dn->dn_type = dnp->dn_type; + dn->dn_nblkptr = dnp->dn_nblkptr; + dn->dn_checksum = dnp->dn_checksum; + dn->dn_compress = dnp->dn_compress; + dn->dn_bonustype = dnp->dn_bonustype; + dn->dn_bonuslen = dnp->dn_bonuslen; + dn->dn_num_slots = dnp->dn_extra_slots + 1; + dn->dn_maxblkid = dnp->dn_maxblkid; + dn->dn_have_spill = ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0); + dn->dn_id_flags = 0; + + dmu_zfetch_init(&dn->dn_zfetch, dn); + + ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type)); + + mutex_enter(&os->os_lock); + if (dnh->dnh_dnode != NULL) { + /* Lost the allocation race. */ + mutex_exit(&os->os_lock); + kmem_cache_free(dnode_cache, dn); + return (dnh->dnh_dnode); + } + + /* + * Exclude special dnodes from os_dnodes so an empty os_dnodes + * signifies that the special dnodes have no references from + * their children (the entries in os_dnodes). This allows + * dnode_destroy() to easily determine if the last child has + * been removed and then complete eviction of the objset. + */ + if (!DMU_OBJECT_IS_SPECIAL(object)) + list_insert_head(&os->os_dnodes, dn); + membar_producer(); + + /* + * Everything else must be valid before assigning dn_objset + * makes the dnode eligible for dnode_move(). + */ + dn->dn_objset = os; + + dnh->dnh_dnode = dn; + mutex_exit(&os->os_lock); + + arc_space_consume(sizeof (dnode_t), ARC_SPACE_DNODE); + return (dn); +} + +/* + * Caller must be holding the dnode handle, which is released upon return. 
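+ *
+ * Both callers in this file follow the same pattern (sketch):
+ *
+ *	zrl_add(&dnh->dnh_zrlock);
+ *	dnode_destroy(dn);		(implicit zrl_remove())
+ *	zrl_destroy(&dnh->dnh_zrlock);
+ *	dnh->dnh_dnode = NULL;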
+ */ +static void +dnode_destroy(dnode_t *dn) +{ + objset_t *os = dn->dn_objset; + boolean_t complete_os_eviction = B_FALSE; + + ASSERT((dn->dn_id_flags & DN_ID_NEW_EXIST) == 0); + + mutex_enter(&os->os_lock); + POINTER_INVALIDATE(&dn->dn_objset); + if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) { + list_remove(&os->os_dnodes, dn); + complete_os_eviction = + list_is_empty(&os->os_dnodes) && + list_link_active(&os->os_evicting_node); + } + mutex_exit(&os->os_lock); + + /* the dnode can no longer move, so we can release the handle */ + zrl_remove(&dn->dn_handle->dnh_zrlock); + + dn->dn_allocated_txg = 0; + dn->dn_free_txg = 0; + dn->dn_assigned_txg = 0; + + dn->dn_dirtyctx = 0; + if (dn->dn_dirtyctx_firstset != NULL) { + kmem_free(dn->dn_dirtyctx_firstset, 1); + dn->dn_dirtyctx_firstset = NULL; + } + if (dn->dn_bonus != NULL) { + mutex_enter(&dn->dn_bonus->db_mtx); + dbuf_destroy(dn->dn_bonus); + dn->dn_bonus = NULL; + } + dn->dn_zio = NULL; + + dn->dn_have_spill = B_FALSE; + dn->dn_oldused = 0; + dn->dn_oldflags = 0; + dn->dn_olduid = 0; + dn->dn_oldgid = 0; + dn->dn_newuid = 0; + dn->dn_newgid = 0; + dn->dn_id_flags = 0; + + dmu_zfetch_fini(&dn->dn_zfetch); + kmem_cache_free(dnode_cache, dn); + arc_space_return(sizeof (dnode_t), ARC_SPACE_DNODE); + + if (complete_os_eviction) + dmu_objset_evict_done(os); +} + +void +dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, + dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx) +{ + int i; + + ASSERT3U(dn_slots, >, 0); + ASSERT3U(dn_slots << DNODE_SHIFT, <=, + spa_maxdnodesize(dmu_objset_spa(dn->dn_objset))); + ASSERT3U(blocksize, <=, + spa_maxblocksize(dmu_objset_spa(dn->dn_objset))); + if (blocksize == 0) + blocksize = 1 << zfs_default_bs; + else + blocksize = P2ROUNDUP(blocksize, SPA_MINBLOCKSIZE); + + if (ibs == 0) + ibs = zfs_default_ibs; + + ibs = MIN(MAX(ibs, DN_MIN_INDBLKSHIFT), DN_MAX_INDBLKSHIFT); + + dprintf("os=%p obj=%llu txg=%llu blocksize=%d ibs=%d dn_slots=%d\n", + dn->dn_objset, dn->dn_object, tx->tx_txg, blocksize, ibs, dn_slots); + + ASSERT(dn->dn_type == DMU_OT_NONE); + ASSERT(bcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)) == 0); + ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE); + ASSERT(ot != DMU_OT_NONE); + ASSERT(DMU_OT_IS_VALID(ot)); + ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) || + (bonustype == DMU_OT_SA && bonuslen == 0) || + (bonustype != DMU_OT_NONE && bonuslen != 0)); + ASSERT(DMU_OT_IS_VALID(bonustype)); + ASSERT3U(bonuslen, <=, DN_SLOTS_TO_BONUSLEN(dn_slots)); + ASSERT(dn->dn_type == DMU_OT_NONE); + ASSERT0(dn->dn_maxblkid); + ASSERT0(dn->dn_allocated_txg); + ASSERT0(dn->dn_assigned_txg); + ASSERT(refcount_is_zero(&dn->dn_tx_holds)); + ASSERT3U(refcount_count(&dn->dn_holds), <=, 1); + ASSERT(avl_is_empty(&dn->dn_dbufs)); + + for (i = 0; i < TXG_SIZE; i++) { + ASSERT0(dn->dn_next_nblkptr[i]); + ASSERT0(dn->dn_next_nlevels[i]); + ASSERT0(dn->dn_next_indblkshift[i]); + ASSERT0(dn->dn_next_bonuslen[i]); + ASSERT0(dn->dn_next_bonustype[i]); + ASSERT0(dn->dn_rm_spillblk[i]); + ASSERT0(dn->dn_next_blksz[i]); + ASSERT(!list_link_active(&dn->dn_dirty_link[i])); + ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL); + ASSERT3P(dn->dn_free_ranges[i], ==, NULL); + } + + dn->dn_type = ot; + dnode_setdblksz(dn, blocksize); + dn->dn_indblkshift = ibs; + dn->dn_nlevels = 1; + dn->dn_num_slots = dn_slots; + if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */ + dn->dn_nblkptr = 1; + else { + dn->dn_nblkptr = MIN(DN_MAX_NBLKPTR, + 1 + ((DN_SLOTS_TO_BONUSLEN(dn_slots) 
- bonuslen) >> + SPA_BLKPTRSHIFT)); + } + + dn->dn_bonustype = bonustype; + dn->dn_bonuslen = bonuslen; + dn->dn_checksum = ZIO_CHECKSUM_INHERIT; + dn->dn_compress = ZIO_COMPRESS_INHERIT; + dn->dn_dirtyctx = 0; + + dn->dn_free_txg = 0; + if (dn->dn_dirtyctx_firstset) { + kmem_free(dn->dn_dirtyctx_firstset, 1); + dn->dn_dirtyctx_firstset = NULL; + } + + dn->dn_allocated_txg = tx->tx_txg; + dn->dn_id_flags = 0; + + dnode_setdirty(dn, tx); + dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs; + dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen; + dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype; + dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = dn->dn_datablksz; +} + +void +dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, + dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx) +{ + int nblkptr; + + ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE); + ASSERT3U(blocksize, <=, + spa_maxblocksize(dmu_objset_spa(dn->dn_objset))); + ASSERT0(blocksize % SPA_MINBLOCKSIZE); + ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx)); + ASSERT(tx->tx_txg != 0); + ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) || + (bonustype != DMU_OT_NONE && bonuslen != 0) || + (bonustype == DMU_OT_SA && bonuslen == 0)); + ASSERT(DMU_OT_IS_VALID(bonustype)); + ASSERT3U(bonuslen, <=, + DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(dn->dn_objset)))); + + dn_slots = dn_slots > 0 ? dn_slots : DNODE_MIN_SLOTS; + + /* clean up any unreferenced dbufs */ + dnode_evict_dbufs(dn); + + dn->dn_id_flags = 0; + + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + dnode_setdirty(dn, tx); + if (dn->dn_datablksz != blocksize) { + /* change blocksize */ + ASSERT(dn->dn_maxblkid == 0 && + (BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) || + dnode_block_freed(dn, 0))); + dnode_setdblksz(dn, blocksize); + dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = blocksize; + } + if (dn->dn_bonuslen != bonuslen) + dn->dn_next_bonuslen[tx->tx_txg&TXG_MASK] = bonuslen; + + if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */ + nblkptr = 1; + else + nblkptr = MIN(DN_MAX_NBLKPTR, + 1 + ((DN_SLOTS_TO_BONUSLEN(dn_slots) - bonuslen) >> + SPA_BLKPTRSHIFT)); + if (dn->dn_bonustype != bonustype) + dn->dn_next_bonustype[tx->tx_txg&TXG_MASK] = bonustype; + if (dn->dn_nblkptr != nblkptr) + dn->dn_next_nblkptr[tx->tx_txg&TXG_MASK] = nblkptr; + if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { + dbuf_rm_spill(dn, tx); + dnode_rm_spill(dn, tx); + } + rw_exit(&dn->dn_struct_rwlock); + + /* change type */ + dn->dn_type = ot; + + /* change bonus size and type */ + mutex_enter(&dn->dn_mtx); + dn->dn_bonustype = bonustype; + dn->dn_bonuslen = bonuslen; + dn->dn_num_slots = dn_slots; + dn->dn_nblkptr = nblkptr; + dn->dn_checksum = ZIO_CHECKSUM_INHERIT; + dn->dn_compress = ZIO_COMPRESS_INHERIT; + ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR); + + /* fix up the bonus db_size */ + if (dn->dn_bonus) { + dn->dn_bonus->db.db_size = + DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) - + (dn->dn_nblkptr-1) * sizeof (blkptr_t); + ASSERT(dn->dn_bonuslen <= dn->dn_bonus->db.db_size); + } + + dn->dn_allocated_txg = tx->tx_txg; + mutex_exit(&dn->dn_mtx); +} + +#ifdef DNODE_STATS +static struct { + uint64_t dms_dnode_invalid; + uint64_t dms_dnode_recheck1; + uint64_t dms_dnode_recheck2; + uint64_t dms_dnode_special; + uint64_t dms_dnode_handle; + uint64_t dms_dnode_rwlock; + uint64_t dms_dnode_active; +} dnode_move_stats; +#endif /* DNODE_STATS */ + +#ifdef _KERNEL +static void +dnode_move_impl(dnode_t *odn, dnode_t *ndn) +{ + int i; 
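+
+	/*
+	 * The caller, dnode_move() below, guarantees exclusive access:
+	 * none of the old dnode's locks may be held (the ASSERTs that
+	 * follow verify this), so plain field-by-field copies are safe.
+	 */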
+ + ASSERT(!RW_LOCK_HELD(&odn->dn_struct_rwlock)); + ASSERT(MUTEX_NOT_HELD(&odn->dn_mtx)); + ASSERT(MUTEX_NOT_HELD(&odn->dn_dbufs_mtx)); + ASSERT(!RW_LOCK_HELD(&odn->dn_zfetch.zf_rwlock)); + + /* Copy fields. */ + ndn->dn_objset = odn->dn_objset; + ndn->dn_object = odn->dn_object; + ndn->dn_dbuf = odn->dn_dbuf; + ndn->dn_handle = odn->dn_handle; + ndn->dn_phys = odn->dn_phys; + ndn->dn_type = odn->dn_type; + ndn->dn_bonuslen = odn->dn_bonuslen; + ndn->dn_bonustype = odn->dn_bonustype; + ndn->dn_nblkptr = odn->dn_nblkptr; + ndn->dn_checksum = odn->dn_checksum; + ndn->dn_compress = odn->dn_compress; + ndn->dn_nlevels = odn->dn_nlevels; + ndn->dn_indblkshift = odn->dn_indblkshift; + ndn->dn_datablkshift = odn->dn_datablkshift; + ndn->dn_datablkszsec = odn->dn_datablkszsec; + ndn->dn_datablksz = odn->dn_datablksz; + ndn->dn_maxblkid = odn->dn_maxblkid; + bcopy(&odn->dn_next_type[0], &ndn->dn_next_type[0], + sizeof (odn->dn_next_type)); + bcopy(&odn->dn_next_nblkptr[0], &ndn->dn_next_nblkptr[0], + sizeof (odn->dn_next_nblkptr)); + bcopy(&odn->dn_next_nlevels[0], &ndn->dn_next_nlevels[0], + sizeof (odn->dn_next_nlevels)); + bcopy(&odn->dn_next_indblkshift[0], &ndn->dn_next_indblkshift[0], + sizeof (odn->dn_next_indblkshift)); + bcopy(&odn->dn_next_bonustype[0], &ndn->dn_next_bonustype[0], + sizeof (odn->dn_next_bonustype)); + bcopy(&odn->dn_rm_spillblk[0], &ndn->dn_rm_spillblk[0], + sizeof (odn->dn_rm_spillblk)); + bcopy(&odn->dn_next_bonuslen[0], &ndn->dn_next_bonuslen[0], + sizeof (odn->dn_next_bonuslen)); + bcopy(&odn->dn_next_blksz[0], &ndn->dn_next_blksz[0], + sizeof (odn->dn_next_blksz)); + for (i = 0; i < TXG_SIZE; i++) { + list_move_tail(&ndn->dn_dirty_records[i], + &odn->dn_dirty_records[i]); + } + bcopy(&odn->dn_free_ranges[0], &ndn->dn_free_ranges[0], + sizeof (odn->dn_free_ranges)); + ndn->dn_allocated_txg = odn->dn_allocated_txg; + ndn->dn_free_txg = odn->dn_free_txg; + ndn->dn_assigned_txg = odn->dn_assigned_txg; + ndn->dn_dirtyctx = odn->dn_dirtyctx; + ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset; + ASSERT(refcount_count(&odn->dn_tx_holds) == 0); + refcount_transfer(&ndn->dn_holds, &odn->dn_holds); + ASSERT(avl_is_empty(&ndn->dn_dbufs)); + avl_swap(&ndn->dn_dbufs, &odn->dn_dbufs); + ndn->dn_dbufs_count = odn->dn_dbufs_count; + ndn->dn_bonus = odn->dn_bonus; + ndn->dn_have_spill = odn->dn_have_spill; + ndn->dn_zio = odn->dn_zio; + ndn->dn_oldused = odn->dn_oldused; + ndn->dn_oldflags = odn->dn_oldflags; + ndn->dn_olduid = odn->dn_olduid; + ndn->dn_oldgid = odn->dn_oldgid; + ndn->dn_newuid = odn->dn_newuid; + ndn->dn_newgid = odn->dn_newgid; + ndn->dn_id_flags = odn->dn_id_flags; + dmu_zfetch_init(&ndn->dn_zfetch, NULL); + list_move_tail(&ndn->dn_zfetch.zf_stream, &odn->dn_zfetch.zf_stream); + ndn->dn_zfetch.zf_dnode = odn->dn_zfetch.zf_dnode; + + /* + * Update back pointers. Updating the handle fixes the back pointer of + * every descendant dbuf as well as the bonus dbuf. + */ + ASSERT(ndn->dn_handle->dnh_dnode == odn); + ndn->dn_handle->dnh_dnode = ndn; + if (ndn->dn_zfetch.zf_dnode == odn) { + ndn->dn_zfetch.zf_dnode = ndn; + } + + /* + * Invalidate the original dnode by clearing all of its back pointers. 
+ */ + odn->dn_dbuf = NULL; + odn->dn_handle = NULL; + avl_create(&odn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t), + offsetof(dmu_buf_impl_t, db_link)); + odn->dn_dbufs_count = 0; + odn->dn_bonus = NULL; + odn->dn_zfetch.zf_dnode = NULL; + + /* + * Set the low bit of the objset pointer to ensure that dnode_move() + * recognizes the dnode as invalid in any subsequent callback. + */ + POINTER_INVALIDATE(&odn->dn_objset); + + /* + * Satisfy the destructor. + */ + for (i = 0; i < TXG_SIZE; i++) { + list_create(&odn->dn_dirty_records[i], + sizeof (dbuf_dirty_record_t), + offsetof(dbuf_dirty_record_t, dr_dirty_node)); + odn->dn_free_ranges[i] = NULL; + odn->dn_next_nlevels[i] = 0; + odn->dn_next_indblkshift[i] = 0; + odn->dn_next_bonustype[i] = 0; + odn->dn_rm_spillblk[i] = 0; + odn->dn_next_bonuslen[i] = 0; + odn->dn_next_blksz[i] = 0; + } + odn->dn_allocated_txg = 0; + odn->dn_free_txg = 0; + odn->dn_assigned_txg = 0; + odn->dn_dirtyctx = 0; + odn->dn_dirtyctx_firstset = NULL; + odn->dn_have_spill = B_FALSE; + odn->dn_zio = NULL; + odn->dn_oldused = 0; + odn->dn_oldflags = 0; + odn->dn_olduid = 0; + odn->dn_oldgid = 0; + odn->dn_newuid = 0; + odn->dn_newgid = 0; + odn->dn_id_flags = 0; + + /* + * Mark the dnode. + */ + ndn->dn_moved = 1; + odn->dn_moved = (uint8_t)-1; +} + +#ifdef illumos +/*ARGSUSED*/ +static kmem_cbrc_t +dnode_move(void *buf, void *newbuf, size_t size, void *arg) +{ + dnode_t *odn = buf, *ndn = newbuf; + objset_t *os; + int64_t refcount; + uint32_t dbufs; + + /* + * The dnode is on the objset's list of known dnodes if the objset + * pointer is valid. We set the low bit of the objset pointer when + * freeing the dnode to invalidate it, and the memory patterns written + * by kmem (baddcafe and deadbeef) set at least one of the two low bits. + * A newly created dnode sets the objset pointer last of all to indicate + * that the dnode is known and in a valid state to be moved by this + * function. + */ + os = odn->dn_objset; + if (!POINTER_IS_VALID(os)) { + DNODE_STAT_ADD(dnode_move_stats.dms_dnode_invalid); + return (KMEM_CBRC_DONT_KNOW); + } + + /* + * Ensure that the objset does not go away during the move. + */ + rw_enter(&os_lock, RW_WRITER); + if (os != odn->dn_objset) { + rw_exit(&os_lock); + DNODE_STAT_ADD(dnode_move_stats.dms_dnode_recheck1); + return (KMEM_CBRC_DONT_KNOW); + } + + /* + * If the dnode is still valid, then so is the objset. We know that no + * valid objset can be freed while we hold os_lock, so we can safely + * ensure that the objset remains in use. + */ + mutex_enter(&os->os_lock); + + /* + * Recheck the objset pointer in case the dnode was removed just before + * acquiring the lock. + */ + if (os != odn->dn_objset) { + mutex_exit(&os->os_lock); + rw_exit(&os_lock); + DNODE_STAT_ADD(dnode_move_stats.dms_dnode_recheck2); + return (KMEM_CBRC_DONT_KNOW); + } + + /* + * At this point we know that as long as we hold os->os_lock, the dnode + * cannot be freed and fields within the dnode can be safely accessed. + * The objset listing this dnode cannot go away as long as this dnode is + * on its list. + */ + rw_exit(&os_lock); + if (DMU_OBJECT_IS_SPECIAL(odn->dn_object)) { + mutex_exit(&os->os_lock); + DNODE_STAT_ADD(dnode_move_stats.dms_dnode_special); + return (KMEM_CBRC_NO); + } + ASSERT(odn->dn_dbuf != NULL); /* only "special" dnodes have no parent */ + + /* + * Lock the dnode handle to prevent the dnode from obtaining any new + * holds. 
This also prevents the descendant dbufs and the bonus dbuf + * from accessing the dnode, so that we can discount their holds. The + * handle is safe to access because we know that while the dnode cannot + * go away, neither can its handle. Once we hold dnh_zrlock, we can + * safely move any dnode referenced only by dbufs. + */ + if (!zrl_tryenter(&odn->dn_handle->dnh_zrlock)) { + mutex_exit(&os->os_lock); + DNODE_STAT_ADD(dnode_move_stats.dms_dnode_handle); + return (KMEM_CBRC_LATER); + } + + /* + * Ensure a consistent view of the dnode's holds and the dnode's dbufs. + * We need to guarantee that there is a hold for every dbuf in order to + * determine whether the dnode is actively referenced. Falsely matching + * a dbuf to an active hold would lead to an unsafe move. It's possible + * that a thread already having an active dnode hold is about to add a + * dbuf, and we can't compare hold and dbuf counts while the add is in + * progress. + */ + if (!rw_tryenter(&odn->dn_struct_rwlock, RW_WRITER)) { + zrl_exit(&odn->dn_handle->dnh_zrlock); + mutex_exit(&os->os_lock); + DNODE_STAT_ADD(dnode_move_stats.dms_dnode_rwlock); + return (KMEM_CBRC_LATER); + } + + /* + * A dbuf may be removed (evicted) without an active dnode hold. In that + * case, the dbuf count is decremented under the handle lock before the + * dbuf's hold is released. This order ensures that if we count the hold + * after the dbuf is removed but before its hold is released, we will + * treat the unmatched hold as active and exit safely. If we count the + * hold before the dbuf is removed, the hold is discounted, and the + * removal is blocked until the move completes. + */ + refcount = refcount_count(&odn->dn_holds); + ASSERT(refcount >= 0); + dbufs = odn->dn_dbufs_count; + + /* We can't have more dbufs than dnode holds. */ + ASSERT3U(dbufs, <=, refcount); + DTRACE_PROBE3(dnode__move, dnode_t *, odn, int64_t, refcount, + uint32_t, dbufs); + + if (refcount > dbufs) { + rw_exit(&odn->dn_struct_rwlock); + zrl_exit(&odn->dn_handle->dnh_zrlock); + mutex_exit(&os->os_lock); + DNODE_STAT_ADD(dnode_move_stats.dms_dnode_active); + return (KMEM_CBRC_LATER); + } + + rw_exit(&odn->dn_struct_rwlock); + + /* + * At this point we know that anyone with a hold on the dnode is not + * actively referencing it. The dnode is known and in a valid state to + * move. We're holding the locks needed to execute the critical section. + */ + dnode_move_impl(odn, ndn); + + list_link_replace(&odn->dn_link, &ndn->dn_link); + /* If the dnode was safe to move, the refcount cannot have changed. */ + ASSERT(refcount == refcount_count(&ndn->dn_holds)); + ASSERT(dbufs == ndn->dn_dbufs_count); + zrl_exit(&ndn->dn_handle->dnh_zrlock); /* handle has moved */ + mutex_exit(&os->os_lock); + + return (KMEM_CBRC_YES); +} +#endif /* illumos */ +#endif /* _KERNEL */ + +void +dnode_special_close(dnode_handle_t *dnh) +{ + dnode_t *dn = dnh->dnh_dnode; + + /* + * Wait for final references to the dnode to clear. This can + * only happen if the arc is asynchronously evicting state that + * has a hold on this dnode while we are trying to evict this + * dnode.
+ */ + while (refcount_count(&dn->dn_holds) > 0) + delay(1); + ASSERT(dn->dn_dbuf == NULL || + dmu_buf_get_user(&dn->dn_dbuf->db) == NULL); + zrl_add(&dnh->dnh_zrlock); + dnode_destroy(dn); /* implicit zrl_remove() */ + zrl_destroy(&dnh->dnh_zrlock); + dnh->dnh_dnode = NULL; +} + +void +dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object, + dnode_handle_t *dnh) +{ + dnode_t *dn; + + dn = dnode_create(os, dnp, NULL, object, dnh); + zrl_init(&dnh->dnh_zrlock); + DNODE_VERIFY(dn); +} + +static void +dnode_buf_evict_async(void *dbu) +{ + dnode_children_t *children_dnodes = dbu; + int i; + + for (i = 0; i < children_dnodes->dnc_count; i++) { + dnode_handle_t *dnh = &children_dnodes->dnc_children[i]; + dnode_t *dn; + + /* + * The dnode handle lock guards against the dnode moving to + * another valid address, so there is no need here to guard + * against changes to or from NULL. + */ + if (dnh->dnh_dnode == NULL) { + zrl_destroy(&dnh->dnh_zrlock); + continue; + } + + zrl_add(&dnh->dnh_zrlock); + dn = dnh->dnh_dnode; + /* + * If there are holds on this dnode, then there should + * be holds on the dnode's containing dbuf as well; thus + * it wouldn't be eligible for eviction and this function + * would not have been called. + */ + ASSERT(refcount_is_zero(&dn->dn_holds)); + ASSERT(refcount_is_zero(&dn->dn_tx_holds)); + + dnode_destroy(dn); /* implicit zrl_remove() */ + zrl_destroy(&dnh->dnh_zrlock); + dnh->dnh_dnode = NULL; + } + kmem_free(children_dnodes, sizeof (dnode_children_t) + + children_dnodes->dnc_count * sizeof (dnode_handle_t)); +} + +/* + * Return true if the given index is interior to a dnode already + * allocated in the block. That is, the index is neither free nor + * allocated, but is consumed by a large dnode. + * + * The dnode_phys_t buffer may not be in sync with the in-core dnode + * structure, so we try to check the dnode structure first and fall back + * to the dnode_phys_t buffer if it doesn't exist. + */ +static boolean_t +dnode_is_consumed(dmu_buf_impl_t *db, int idx) +{ + dnode_handle_t *dnh; + dmu_object_type_t ot; + dnode_children_t *children_dnodes; + dnode_phys_t *dn_block; + int skip; + int i; + + children_dnodes = dmu_buf_get_user(&db->db); + dn_block = (dnode_phys_t *)db->db.db_data; + + for (i = 0; i < idx; i += skip) { + dnh = &children_dnodes->dnc_children[i]; + + zrl_add(&dnh->dnh_zrlock); + if (dnh->dnh_dnode != NULL) { + ot = dnh->dnh_dnode->dn_type; + skip = dnh->dnh_dnode->dn_num_slots; + } else { + ot = dn_block[i].dn_type; + skip = dn_block[i].dn_extra_slots + 1; + } + zrl_remove(&dnh->dnh_zrlock); + + if (ot == DMU_OT_NONE) + skip = 1; + } + + return (i > idx); +} + +/* + * Return true if the given index in the dnode block is a valid + * allocated dnode. That is, the index is not consumed by a large + * dnode and is not free. + * + * The dnode_phys_t buffer may not be in sync with the in-core dnode + * structure, so we try to check the dnode structure first and fall back + * to the dnode_phys_t buffer if it doesn't exist.
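+ *
+ * For example (illustrative layout): if a 2-slot dnode is allocated
+ * at index 4 of a block, then index 5 is "consumed" (neither free
+ * nor itself allocated), so dnode_is_allocated(db, 5) and
+ * dnode_is_free(db, 5, 1) both return B_FALSE.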
+ */ +static boolean_t +dnode_is_allocated(dmu_buf_impl_t *db, int idx) +{ + dnode_handle_t *dnh; + dmu_object_type_t ot; + dnode_children_t *children_dnodes; + dnode_phys_t *dn_block; + + if (dnode_is_consumed(db, idx)) + return (B_FALSE); + + children_dnodes = dmu_buf_get_user(&db->db); + dn_block = (dnode_phys_t *)db->db.db_data; + + dnh = &children_dnodes->dnc_children[idx]; + + zrl_add(&dnh->dnh_zrlock); + if (dnh->dnh_dnode != NULL) + ot = dnh->dnh_dnode->dn_type; + else + ot = dn_block[idx].dn_type; + zrl_remove(&dnh->dnh_zrlock); + + return (ot != DMU_OT_NONE); +} + +/* + * Return true if the given range of indices in the dnode block are + * free. That is, the starting index is not consumed by a large dnode + * and none of the indices are allocated. + * + * The dnode_phys_t buffer may not be in sync with the in-core dnode + * structure, so we try to check the dnode structure first and fall back + * to the dnode_phys_t buffer if it doesn't exist. + */ +static boolean_t +dnode_is_free(dmu_buf_impl_t *db, int idx, int slots) +{ + dnode_handle_t *dnh; + dmu_object_type_t ot; + dnode_children_t *children_dnodes; + dnode_phys_t *dn_block; + int i; + + if (idx + slots > DNODES_PER_BLOCK) + return (B_FALSE); + + children_dnodes = dmu_buf_get_user(&db->db); + dn_block = (dnode_phys_t *)db->db.db_data; + + if (dnode_is_consumed(db, idx)) + return (B_FALSE); + + for (i = idx; i < idx + slots; i++) { + dnh = &children_dnodes->dnc_children[i]; + + zrl_add(&dnh->dnh_zrlock); + if (dnh->dnh_dnode != NULL) + ot = dnh->dnh_dnode->dn_type; + else + ot = dn_block[i].dn_type; + zrl_remove(&dnh->dnh_zrlock); + + if (ot != DMU_OT_NONE) + return (B_FALSE); + } + + return (B_TRUE); +} + +/* + * errors: + * EINVAL - invalid object number. + * ENOSPC - hole too small to fulfill "slots" request + * EIO - i/o error. + * succeeds even for free dnodes. + */ +int +dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots, + void *tag, dnode_t **dnp) +{ + int epb, idx, err, i; + int drop_struct_lock = FALSE; + int type; + uint64_t blk; + dnode_t *mdn, *dn; + dmu_buf_impl_t *db; + dnode_children_t *children_dnodes; + dnode_phys_t *dn_block_begin; + dnode_handle_t *dnh; + + ASSERT(!(flag & DNODE_MUST_BE_ALLOCATED) || (slots == 0)); + ASSERT(!(flag & DNODE_MUST_BE_FREE) || (slots > 0)); + + /* + * If you are holding the spa config lock as writer, you shouldn't + * be asking the DMU to do *anything* unless it's the root pool + * which may require us to read from the root filesystem while + * holding some (not all) of the locks as writer. + */ + ASSERT(spa_config_held(os->os_spa, SCL_ALL, RW_WRITER) == 0 || + (spa_is_root(os->os_spa) && + spa_config_held(os->os_spa, SCL_STATE, RW_WRITER))); + + ASSERT((flag & DNODE_MUST_BE_ALLOCATED) || (flag & DNODE_MUST_BE_FREE)); + + if (object == DMU_USERUSED_OBJECT || object == DMU_GROUPUSED_OBJECT) { + dn = (object == DMU_USERUSED_OBJECT) ?
+ DMU_USERUSED_DNODE(os) : DMU_GROUPUSED_DNODE(os); + if (dn == NULL) + return (SET_ERROR(ENOENT)); + type = dn->dn_type; + if ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) + return (SET_ERROR(ENOENT)); + if ((flag & DNODE_MUST_BE_FREE) && type != DMU_OT_NONE) + return (SET_ERROR(EEXIST)); + DNODE_VERIFY(dn); + (void) refcount_add(&dn->dn_holds, tag); + *dnp = dn; + return (0); + } + + if (object == 0 || object >= DN_MAX_OBJECT) + return (SET_ERROR(EINVAL)); + + mdn = DMU_META_DNODE(os); + ASSERT(mdn->dn_object == DMU_META_DNODE_OBJECT); + + DNODE_VERIFY(mdn); + + if (!RW_WRITE_HELD(&mdn->dn_struct_rwlock)) { + rw_enter(&mdn->dn_struct_rwlock, RW_READER); + drop_struct_lock = TRUE; + } + + blk = dbuf_whichblock(mdn, 0, object * sizeof (dnode_phys_t)); + + db = dbuf_hold(mdn, blk, FTAG); + if (drop_struct_lock) + rw_exit(&mdn->dn_struct_rwlock); + if (db == NULL) + return (SET_ERROR(EIO)); + err = dbuf_read(db, NULL, DB_RF_CANFAIL); + if (err) { + dbuf_rele(db, FTAG); + return (err); + } + + ASSERT3U(db->db.db_size, >=, 1<<DNODE_SHIFT); + epb = db->db.db_size >> DNODE_SHIFT; + + ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE); + children_dnodes = dmu_buf_get_user(&db->db); + if (children_dnodes == NULL) { + dnode_children_t *winner; + children_dnodes = kmem_zalloc(sizeof (dnode_children_t) + + epb * sizeof (dnode_handle_t), KM_SLEEP); + children_dnodes->dnc_count = epb; + dnh = &children_dnodes->dnc_children[0]; + for (i = 0; i < epb; i++) { + zrl_init(&dnh[i].dnh_zrlock); + } + dmu_buf_init_user(&children_dnodes->dnc_dbu, NULL, + dnode_buf_evict_async, NULL); + winner = dmu_buf_set_user(&db->db, &children_dnodes->dnc_dbu); + if (winner != NULL) { + + for (i = 0; i < epb; i++) { + zrl_destroy(&dnh[i].dnh_zrlock); + } + + kmem_free(children_dnodes, sizeof (dnode_children_t) + + epb * sizeof (dnode_handle_t)); + children_dnodes = winner; + } + } + ASSERT(children_dnodes->dnc_count == epb); + + idx = object & (epb - 1); + dn_block_begin = (dnode_phys_t *)db->db.db_data; + + if ((flag & DNODE_MUST_BE_FREE) && !dnode_is_free(db, idx, slots)) { + dbuf_rele(db, FTAG); + return (ENOSPC); + } else if ((flag & DNODE_MUST_BE_ALLOCATED) && + !dnode_is_allocated(db, idx)) { + dbuf_rele(db, FTAG); + return (ENOENT); + } + + dnh = &children_dnodes->dnc_children[idx]; + zrl_add(&dnh->dnh_zrlock); + dn = dnh->dnh_dnode; + if (dn == NULL) + dn = dnode_create(os, dn_block_begin + idx, db, object, dnh); + + mutex_enter(&dn->dn_mtx); + type = dn->dn_type; + if (dn->dn_free_txg || + ((flag & DNODE_MUST_BE_FREE) && !refcount_is_zero(&dn->dn_holds))) { + mutex_exit(&dn->dn_mtx); + zrl_remove(&dnh->dnh_zrlock); + dbuf_rele(db, FTAG); + return ((flag & DNODE_MUST_BE_ALLOCATED) ? ENOENT : EEXIST); + } + if (refcount_add(&dn->dn_holds, tag) == 1) + dbuf_add_ref(db, dnh); + mutex_exit(&dn->dn_mtx); + + /* Now we can rely on the hold to prevent the dnode from moving. */ + zrl_remove(&dnh->dnh_zrlock); + + DNODE_VERIFY(dn); + ASSERT3P(dn->dn_dbuf, ==, db); + ASSERT3U(dn->dn_object, ==, object); + dbuf_rele(db, FTAG); + + *dnp = dn; + return (0); +} + +/* + * Return held dnode if the object is allocated, NULL if not. + */ +int +dnode_hold(objset_t *os, uint64_t object, void *tag, dnode_t **dnp) +{ + return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0, tag, + dnp)); +} + +/* + * Can only add a reference if there is already at least one + * reference on the dnode. Returns FALSE if unable to add a + * new reference. 
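+ *
+ * A caller that cannot already guarantee a hold must therefore be
+ * prepared for failure, e.g. (illustrative):
+ *
+ *	if (!dnode_add_ref(dn, FTAG))
+ *		return (SET_ERROR(ENOENT));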
+ */ +boolean_t +dnode_add_ref(dnode_t *dn, void *tag) +{ + mutex_enter(&dn->dn_mtx); + if (refcount_is_zero(&dn->dn_holds)) { + mutex_exit(&dn->dn_mtx); + return (FALSE); + } + VERIFY(1 < refcount_add(&dn->dn_holds, tag)); + mutex_exit(&dn->dn_mtx); + return (TRUE); +} + +void +dnode_rele(dnode_t *dn, void *tag) +{ + mutex_enter(&dn->dn_mtx); + dnode_rele_and_unlock(dn, tag, B_FALSE); +} + +void +dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting) +{ + uint64_t refs; + /* Get while the hold prevents the dnode from moving. */ + dmu_buf_impl_t *db = dn->dn_dbuf; + dnode_handle_t *dnh = dn->dn_handle; + + refs = refcount_remove(&dn->dn_holds, tag); + mutex_exit(&dn->dn_mtx); + + /* + * It's unsafe to release the last hold on a dnode by dnode_rele() or + * indirectly by dbuf_rele() while relying on the dnode handle to + * prevent the dnode from moving, since releasing the last hold could + * result in the dnode's parent dbuf evicting its dnode handles. For + * that reason anyone calling dnode_rele() or dbuf_rele() without some + * other direct or indirect hold on the dnode must first drop the dnode + * handle. + */ + ASSERT(refs > 0 || dnh->dnh_zrlock.zr_owner != curthread); + + /* NOTE: the DNODE_DNODE does not have a dn_dbuf */ + if (refs == 0 && db != NULL) { + /* + * Another thread could add a hold to the dnode handle in + * dnode_hold_impl() while holding the parent dbuf. Since the + * hold on the parent dbuf prevents the handle from being + * destroyed, the hold on the handle is OK. We can't yet assert + * that the handle has zero references, but that will be + * asserted anyway when the handle gets destroyed. + */ + mutex_enter(&db->db_mtx); + dbuf_rele_and_unlock(db, dnh, evicting); + } +} + +void +dnode_setdirty(dnode_t *dn, dmu_tx_t *tx) +{ + objset_t *os = dn->dn_objset; + uint64_t txg = tx->tx_txg; + + if (DMU_OBJECT_IS_SPECIAL(dn->dn_object)) { + dsl_dataset_dirty(os->os_dsl_dataset, tx); + return; + } + + DNODE_VERIFY(dn); + +#ifdef ZFS_DEBUG + mutex_enter(&dn->dn_mtx); + ASSERT(dn->dn_phys->dn_type || dn->dn_allocated_txg); + ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= txg); + mutex_exit(&dn->dn_mtx); +#endif + + /* + * Determine old uid/gid when necessary + */ + dmu_objset_userquota_get_ids(dn, B_TRUE, tx); + + multilist_t *dirtylist = os->os_dirty_dnodes[txg & TXG_MASK]; + multilist_sublist_t *mls = multilist_sublist_lock_obj(dirtylist, dn); + + /* + * If we are already marked dirty, we're done. + */ + if (list_link_active(&dn->dn_dirty_link[txg & TXG_MASK])) { + multilist_sublist_unlock(mls); + return; + } + + ASSERT(!refcount_is_zero(&dn->dn_holds) || + !avl_is_empty(&dn->dn_dbufs)); + ASSERT(dn->dn_datablksz != 0); + ASSERT0(dn->dn_next_bonuslen[txg&TXG_MASK]); + ASSERT0(dn->dn_next_blksz[txg&TXG_MASK]); + ASSERT0(dn->dn_next_bonustype[txg&TXG_MASK]); + + dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n", + dn->dn_object, txg); + + multilist_sublist_insert_head(mls, dn); + + multilist_sublist_unlock(mls); + + /* + * The dnode maintains a hold on its containing dbuf as + * long as there are holds on it. Each instantiated child + * dbuf maintains a hold on the dnode. When the last child + * drops its hold, the dnode will drop its hold on the + * containing dbuf. We add a "dirty hold" here so that the + * dnode will hang around after we finish processing its + * children. 
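+ * (The hold is tagged with the txg value cast to a pointer; since
+ * dn_holds is created untracked in dnode_cons(), the tag is purely
+ * documentary, and syncing context later drops this hold with a
+ * matching dnode_rele().)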
+ */ + VERIFY(dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg)); + + (void) dbuf_dirty(dn->dn_dbuf, tx); + + dsl_dataset_dirty(os->os_dsl_dataset, tx); +} + +void +dnode_free(dnode_t *dn, dmu_tx_t *tx) +{ + mutex_enter(&dn->dn_mtx); + if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg) { + mutex_exit(&dn->dn_mtx); + return; + } + dn->dn_free_txg = tx->tx_txg; + mutex_exit(&dn->dn_mtx); + + dnode_setdirty(dn, tx); +} + +/* + * Try to change the block size for the indicated dnode. This can only + * succeed if there are no blocks allocated or dirty beyond first block + */ +int +dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db; + int err; + + ASSERT3U(size, <=, spa_maxblocksize(dmu_objset_spa(dn->dn_objset))); + if (size == 0) + size = SPA_MINBLOCKSIZE; + else + size = P2ROUNDUP(size, SPA_MINBLOCKSIZE); + + if (ibs == dn->dn_indblkshift) + ibs = 0; + + if (size >> SPA_MINBLOCKSHIFT == dn->dn_datablkszsec && ibs == 0) + return (0); + + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + + /* Check for any allocated blocks beyond the first */ + if (dn->dn_maxblkid != 0) + goto fail; + + mutex_enter(&dn->dn_dbufs_mtx); + for (db = avl_first(&dn->dn_dbufs); db != NULL; + db = AVL_NEXT(&dn->dn_dbufs, db)) { + if (db->db_blkid != 0 && db->db_blkid != DMU_BONUS_BLKID && + db->db_blkid != DMU_SPILL_BLKID) { + mutex_exit(&dn->dn_dbufs_mtx); + goto fail; + } + } + mutex_exit(&dn->dn_dbufs_mtx); + + if (ibs && dn->dn_nlevels != 1) + goto fail; + + /* resize the old block */ + err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db); + if (err == 0) + dbuf_new_size(db, size, tx); + else if (err != ENOENT) + goto fail; + + dnode_setdblksz(dn, size); + dnode_setdirty(dn, tx); + dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = size; + if (ibs) { + dn->dn_indblkshift = ibs; + dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs; + } + /* rele after we have fixed the blocksize in the dnode */ + if (db) + dbuf_rele(db, FTAG); + + rw_exit(&dn->dn_struct_rwlock); + return (0); + +fail: + rw_exit(&dn->dn_struct_rwlock); + return (SET_ERROR(ENOTSUP)); +} + +/* read-holding callers must not rely on the lock being continuously held */ +void +dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read) +{ + uint64_t txgoff = tx->tx_txg & TXG_MASK; + int epbs, new_nlevels; + uint64_t sz; + + ASSERT(blkid != DMU_BONUS_BLKID); + + ASSERT(have_read ? + RW_READ_HELD(&dn->dn_struct_rwlock) : + RW_WRITE_HELD(&dn->dn_struct_rwlock)); + + /* + * if we have a read-lock, check to see if we need to do any work + * before upgrading to a write-lock. + */ + if (have_read) { + if (blkid <= dn->dn_maxblkid) + return; + + if (!rw_tryupgrade(&dn->dn_struct_rwlock)) { + rw_exit(&dn->dn_struct_rwlock); + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + } + } + + if (blkid <= dn->dn_maxblkid) + goto out; + + dn->dn_maxblkid = blkid; + + /* + * Compute the number of levels necessary to support the new maxblkid. 
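+ * Level 1 reaches dn_nblkptr blocks; each additional level multiplies
+ * the reach by the 1 << epbs block pointers per indirect block. For
+ * example (illustrative values), with dn_nblkptr == 3 and epbs == 10
+ * (128K indirect blocks holding 1024 128-byte blkptrs):
+ *
+ *	blkid 0 .. 2		1 level
+ *	blkid 3 .. 3071		2 levels
+ *	blkid 3072 .. 3145727	3 levels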
+ */ + new_nlevels = 1; + epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + for (sz = dn->dn_nblkptr; + sz <= blkid && sz >= dn->dn_nblkptr; sz <<= epbs) + new_nlevels++; + + if (new_nlevels > dn->dn_nlevels) { + int old_nlevels = dn->dn_nlevels; + dmu_buf_impl_t *db; + list_t *list; + dbuf_dirty_record_t *new, *dr, *dr_next; + + dn->dn_nlevels = new_nlevels; + + ASSERT3U(new_nlevels, >, dn->dn_next_nlevels[txgoff]); + dn->dn_next_nlevels[txgoff] = new_nlevels; + + /* dirty the left indirects */ + db = dbuf_hold_level(dn, old_nlevels, 0, FTAG); + ASSERT(db != NULL); + new = dbuf_dirty(db, tx); + dbuf_rele(db, FTAG); + + /* transfer the dirty records to the new indirect */ + mutex_enter(&dn->dn_mtx); + mutex_enter(&new->dt.di.dr_mtx); + list = &dn->dn_dirty_records[txgoff]; + for (dr = list_head(list); dr; dr = dr_next) { + dr_next = list_next(&dn->dn_dirty_records[txgoff], dr); + if (dr->dr_dbuf->db_level != new_nlevels-1 && + dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID && + dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) { + ASSERT(dr->dr_dbuf->db_level == old_nlevels-1); + list_remove(&dn->dn_dirty_records[txgoff], dr); + list_insert_tail(&new->dt.di.dr_children, dr); + dr->dr_parent = new; + } + } + mutex_exit(&new->dt.di.dr_mtx); + mutex_exit(&dn->dn_mtx); + } + +out: + if (have_read) + rw_downgrade(&dn->dn_struct_rwlock); +} + +static void +dnode_dirty_l1(dnode_t *dn, uint64_t l1blkid, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = dbuf_hold_level(dn, 1, l1blkid, FTAG); + if (db != NULL) { + dmu_buf_will_dirty(&db->db, tx); + dbuf_rele(db, FTAG); + } +} + +/* + * Dirty all the in-core level-1 dbufs in the range specified by start_blkid + * and end_blkid. + */ +static void +dnode_dirty_l1range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, + dmu_tx_t *tx) +{ + dmu_buf_impl_t db_search; + dmu_buf_impl_t *db; + avl_index_t where; + + mutex_enter(&dn->dn_dbufs_mtx); + + db_search.db_level = 1; + db_search.db_blkid = start_blkid + 1; + db_search.db_state = DB_SEARCH; + for (;;) { + + db = avl_find(&dn->dn_dbufs, &db_search, &where); + if (db == NULL) + db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER); + + if (db == NULL || db->db_level != 1 || + db->db_blkid >= end_blkid) { + break; + } + + /* + * Setup the next blkid we want to search for. + */ + db_search.db_blkid = db->db_blkid + 1; + ASSERT3U(db->db_blkid, >=, start_blkid); + + /* + * If the dbuf transitions to DB_EVICTING while we're trying + * to dirty it, then we will be unable to discover it in + * the dbuf hash table. This will result in a call to + * dbuf_create() which needs to acquire the dn_dbufs_mtx + * lock. To avoid a deadlock, we drop the lock before + * dirtying the level-1 dbuf. + */ + mutex_exit(&dn->dn_dbufs_mtx); + dnode_dirty_l1(dn, db->db_blkid, tx); + mutex_enter(&dn->dn_dbufs_mtx); + } + +#ifdef ZFS_DEBUG + /* + * Walk all the in-core level-1 dbufs and verify they have been dirtied. 
+ */ + db_search.db_level = 1; + db_search.db_blkid = start_blkid + 1; + db_search.db_state = DB_SEARCH; + db = avl_find(&dn->dn_dbufs, &db_search, &where); + if (db == NULL) + db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER); + for (; db != NULL; db = AVL_NEXT(&dn->dn_dbufs, db)) { + if (db->db_level != 1 || db->db_blkid >= end_blkid) + break; + ASSERT(db->db_dirtycnt > 0); + } +#endif + mutex_exit(&dn->dn_dbufs_mtx); +} + +void +dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db; + uint64_t blkoff, blkid, nblks; + int blksz, blkshift, head, tail; + int trunc = FALSE; + int epbs; + + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + blksz = dn->dn_datablksz; + blkshift = dn->dn_datablkshift; + epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + + if (len == DMU_OBJECT_END) { + len = UINT64_MAX - off; + trunc = TRUE; + } + + /* + * First, block align the region to free: + */ + if (ISP2(blksz)) { + head = P2NPHASE(off, blksz); + blkoff = P2PHASE(off, blksz); + if ((off >> blkshift) > dn->dn_maxblkid) + goto out; + } else { + ASSERT(dn->dn_maxblkid == 0); + if (off == 0 && len >= blksz) { + /* + * Freeing the whole block; fast-track this request. + */ + blkid = 0; + nblks = 1; + if (dn->dn_nlevels > 1) + dnode_dirty_l1(dn, 0, tx); + goto done; + } else if (off >= blksz) { + /* Freeing past end-of-data */ + goto out; + } else { + /* Freeing part of the block. */ + head = blksz - off; + ASSERT3U(head, >, 0); + } + blkoff = off; + } + /* zero out any partial block data at the start of the range */ + if (head) { + ASSERT3U(blkoff + head, ==, blksz); + if (len < head) + head = len; + if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off), + TRUE, FALSE, FTAG, &db) == 0) { + caddr_t data; + + /* don't dirty if it isn't on disk and isn't dirty */ + if (db->db_last_dirty || + (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) { + rw_exit(&dn->dn_struct_rwlock); + dmu_buf_will_dirty(&db->db, tx); + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + data = db->db.db_data; + bzero(data + blkoff, head); + } + dbuf_rele(db, FTAG); + } + off += head; + len -= head; + } + + /* If the range was less than one block, we're done */ + if (len == 0) + goto out; + + /* If the remaining range is past end of file, we're done */ + if ((off >> blkshift) > dn->dn_maxblkid) + goto out; + + ASSERT(ISP2(blksz)); + if (trunc) + tail = 0; + else + tail = P2PHASE(len, blksz); + + ASSERT0(P2PHASE(off, blksz)); + /* zero out any partial block data at the end of the range */ + if (tail) { + if (len < tail) + tail = len; + if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off+len), + TRUE, FALSE, FTAG, &db) == 0) { + /* don't dirty if not on disk and not dirty */ + if (db->db_last_dirty || + (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) { + rw_exit(&dn->dn_struct_rwlock); + dmu_buf_will_dirty(&db->db, tx); + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + bzero(db->db.db_data, tail); + } + dbuf_rele(db, FTAG); + } + len -= tail; + } + + /* If the range did not include a full block, we are done */ + if (len == 0) + goto out; + + ASSERT(IS_P2ALIGNED(off, blksz)); + ASSERT(trunc || IS_P2ALIGNED(len, blksz)); + blkid = off >> blkshift; + nblks = len >> blkshift; + if (trunc) + nblks += 1; + + /* + * Dirty all the indirect blocks in this range. Note that only + * the first and last indirect blocks can actually be written + * (if they were partially freed) -- they must be dirtied, even if + * they do not exist on disk yet. 
The interior blocks will + * be freed by free_children(), so they will not actually be written. + * Even though these interior blocks will not be written, we + * dirty them for two reasons: + * + * - It ensures that the indirect blocks remain in memory until + * syncing context. (They have already been prefetched by + * dmu_tx_hold_free(), so we don't have to worry about reading + * them serially here.) + * + * - The dirty space accounting will put pressure on the txg sync + * mechanism to begin syncing, and to delay transactions if there + * is a large amount of freeing. Even though these indirect + * blocks will not be written, we could need to write the same + * amount of space if we copy the freed BPs into deadlists. + */ + if (dn->dn_nlevels > 1) { + uint64_t first, last; + + first = blkid >> epbs; + dnode_dirty_l1(dn, first, tx); + if (trunc) + last = dn->dn_maxblkid >> epbs; + else + last = (blkid + nblks - 1) >> epbs; + if (last != first) + dnode_dirty_l1(dn, last, tx); + + dnode_dirty_l1range(dn, first, last, tx); + + int shift = dn->dn_datablkshift + dn->dn_indblkshift - + SPA_BLKPTRSHIFT; + for (uint64_t i = first + 1; i < last; i++) { + /* + * Set i to the blockid of the next non-hole + * level-1 indirect block at or after i. Note + * that dnode_next_offset() operates in terms of + * level-0-equivalent bytes. + */ + uint64_t ibyte = i << shift; + int err = dnode_next_offset(dn, DNODE_FIND_HAVELOCK, + &ibyte, 2, 1, 0); + i = ibyte >> shift; + if (i >= last) + break; + + /* + * Normally we should not see an error, either + * from dnode_next_offset() or dbuf_hold_level() + * (except for ESRCH from dnode_next_offset). + * If there is an i/o error, then when we read + * this block in syncing context, it will use + * ZIO_FLAG_MUSTSUCCEED, and thus hang/panic according + * to the "failmode" property. dnode_next_offset() + * doesn't have a flag to indicate MUSTSUCCEED. + */ + if (err != 0) + break; + + dnode_dirty_l1(dn, i, tx); + } + } + +done: + /* + * Add this range to the dnode range list. + * We will finish up this free operation in the syncing phase. + */ + mutex_enter(&dn->dn_mtx); + int txgoff = tx->tx_txg & TXG_MASK; + if (dn->dn_free_ranges[txgoff] == NULL) { + dn->dn_free_ranges[txgoff] = range_tree_create(NULL, NULL); + } + range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks); + range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks); + dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n", + blkid, nblks, tx->tx_txg); + mutex_exit(&dn->dn_mtx); + + dbuf_free_range(dn, blkid, blkid + nblks - 1, tx); + dnode_setdirty(dn, tx); +out: + + rw_exit(&dn->dn_struct_rwlock); +} + +static boolean_t +dnode_spill_freed(dnode_t *dn) +{ + int i; + + mutex_enter(&dn->dn_mtx); + for (i = 0; i < TXG_SIZE; i++) { + if (dn->dn_rm_spillblk[i] == DN_KILL_SPILLBLK) + break; + } + mutex_exit(&dn->dn_mtx); + return (i < TXG_SIZE); +} + +/* return TRUE if this blkid was freed in a recent txg, or FALSE if it wasn't */ +uint64_t +dnode_block_freed(dnode_t *dn, uint64_t blkid) +{ + void *dp = spa_get_dsl(dn->dn_objset->os_spa); + int i; + + if (blkid == DMU_BONUS_BLKID) + return (FALSE); + + /* + * If we're in the process of opening the pool, dp will not be + * set yet, but there shouldn't be anything dirty. 
+ */ + if (dp == NULL) + return (FALSE); + + if (dn->dn_free_txg) + return (TRUE); + + if (blkid == DMU_SPILL_BLKID) + return (dnode_spill_freed(dn)); + + mutex_enter(&dn->dn_mtx); + for (i = 0; i < TXG_SIZE; i++) { + if (dn->dn_free_ranges[i] != NULL && + range_tree_contains(dn->dn_free_ranges[i], blkid, 1)) + break; + } + mutex_exit(&dn->dn_mtx); + return (i < TXG_SIZE); +} + +/* call from syncing context when we actually write/free space for this dnode */ +void +dnode_diduse_space(dnode_t *dn, int64_t delta) +{ + uint64_t space; + dprintf_dnode(dn, "dn=%p dnp=%p used=%llu delta=%lld\n", + dn, dn->dn_phys, + (u_longlong_t)dn->dn_phys->dn_used, + (longlong_t)delta); + + mutex_enter(&dn->dn_mtx); + space = DN_USED_BYTES(dn->dn_phys); + if (delta > 0) { + ASSERT3U(space + delta, >=, space); /* no overflow */ + } else { + ASSERT3U(space, >=, -delta); /* no underflow */ + } + space += delta; + if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_DNODE_BYTES) { + ASSERT((dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) == 0); + ASSERT0(P2PHASE(space, 1<<DEV_BSHIFT)); + dn->dn_phys->dn_used = space >> DEV_BSHIFT; + } else { + dn->dn_phys->dn_used = space; + dn->dn_phys->dn_flags |= DNODE_FLAG_USED_BYTES; + } + mutex_exit(&dn->dn_mtx); +} + +/* + * Scans a block at the indicated "level" looking for a hole or data, + * depending on 'flags'. + * + * If level > 0, then we are scanning an indirect block looking at its + * pointers. If level == 0, then we are looking at a block of dnodes. + * + * If we don't find what we are looking for in the block, we return ESRCH. + * Otherwise, return with *offset pointing to the beginning (if searching + * forwards) or end (if searching backwards) of the range covered by the + * block pointer we matched on (or dnode). + * + * The basic search algorithm used below by dnode_next_offset() is to + * use this function to search up the block tree (widen the search) until + * we find something (i.e., we don't return ESRCH) and then search back + * down the tree (narrow the search) until we reach our original search + * level. + */ +static int +dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, + int lvl, uint64_t blkfill, uint64_t txg) +{ + dmu_buf_impl_t *db = NULL; + void *data = NULL; + uint64_t epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; + uint64_t epb = 1ULL << epbs; + uint64_t minfill, maxfill; + boolean_t hole; + int i, inc, error, span; + + dprintf("probing object %llu offset %llx level %d of %u\n", + dn->dn_object, *offset, lvl, dn->dn_phys->dn_nlevels); + + hole = ((flags & DNODE_FIND_HOLE) != 0); + inc = (flags & DNODE_FIND_BACKWARDS) ? -1 : 1; + ASSERT(txg == 0 || !hole); + + if (lvl == dn->dn_phys->dn_nlevels) { + error = 0; + epb = dn->dn_phys->dn_nblkptr; + data = dn->dn_phys->dn_blkptr; + } else { + uint64_t blkid = dbuf_whichblock(dn, lvl, *offset); + error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FALSE, FTAG, &db); + if (error) { + if (error != ENOENT) + return (error); + if (hole) + return (0); + /* + * This can only happen when we are searching up + * the block tree for data. We don't really need to + * adjust the offset, as we will just end up looking + * at the pointer to this block in its parent, and it's + * going to be unallocated, so we will skip over it.
+ */
+			return (SET_ERROR(ESRCH));
+		}
+		error = dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_HAVESTRUCT);
+		if (error) {
+			dbuf_rele(db, FTAG);
+			return (error);
+		}
+		data = db->db.db_data;
+	}
+
+
+	if (db != NULL && txg != 0 && (db->db_blkptr == NULL ||
+	    db->db_blkptr->blk_birth <= txg ||
+	    BP_IS_HOLE(db->db_blkptr))) {
+		/*
+		 * This can only happen when we are searching up the tree
+		 * and these conditions mean that we need to keep climbing.
+		 */
+		error = SET_ERROR(ESRCH);
+	} else if (lvl == 0) {
+		dnode_phys_t *dnp = data;
+
+		ASSERT(dn->dn_type == DMU_OT_DNODE);
+		ASSERT(!(flags & DNODE_FIND_BACKWARDS));
+
+		for (i = (*offset >> DNODE_SHIFT) & (blkfill - 1);
+		    i < blkfill; i += dnp[i].dn_extra_slots + 1) {
+			if ((dnp[i].dn_type == DMU_OT_NONE) == hole)
+				break;
+		}
+
+		if (i == blkfill)
+			error = SET_ERROR(ESRCH);
+
+		*offset = (*offset & ~(DNODE_BLOCK_SIZE - 1)) +
+		    (i << DNODE_SHIFT);
+	} else {
+		blkptr_t *bp = data;
+		uint64_t start = *offset;
+		span = (lvl - 1) * epbs + dn->dn_datablkshift;
+		minfill = 0;
+		maxfill = blkfill << ((lvl - 1) * epbs);
+
+		if (hole)
+			maxfill--;
+		else
+			minfill++;
+
+		*offset = *offset >> span;
+		for (i = BF64_GET(*offset, 0, epbs);
+		    i >= 0 && i < epb; i += inc) {
+			if (BP_GET_FILL(&bp[i]) >= minfill &&
+			    BP_GET_FILL(&bp[i]) <= maxfill &&
+			    (hole || bp[i].blk_birth > txg))
+				break;
+			if (inc > 0 || *offset > 0)
+				*offset += inc;
+		}
+		*offset = *offset << span;
+		if (inc < 0) {
+			/* traversing backwards; position offset at the end */
+			ASSERT3U(*offset, <=, start);
+			*offset = MIN(*offset + (1ULL << span) - 1, start);
+		} else if (*offset < start) {
+			*offset = start;
+		}
+		if (i < 0 || i >= epb)
+			error = SET_ERROR(ESRCH);
+	}
+
+	if (db)
+		dbuf_rele(db, FTAG);
+
+	return (error);
+}
+
+/*
+ * Find the next hole, data, or sparse region at or after *offset.
+ * The value 'blkfill' tells us how many items we expect to find
+ * in an L0 data block; this value is 1 for normal objects,
+ * DNODES_PER_BLOCK for the meta dnode, and some fraction of
+ * DNODES_PER_BLOCK when searching for sparse regions thereof.
+ *
+ * Examples:
+ *
+ * dnode_next_offset(dn, flags, offset, 1, 1, 0);
+ *	Finds the next/previous hole/data in a file.
+ *	Used in dmu_offset_next().
+ *
+ * dnode_next_offset(mdn, flags, offset, 0, DNODES_PER_BLOCK, txg);
+ *	Finds the next free/allocated dnode in an objset's meta-dnode.
+ *	Only finds objects that have new contents since txg (i.e.,
+ *	bonus buffer changes and content removal are ignored).
+ *	Used in dmu_object_next().
+ *
+ * dnode_next_offset(mdn, DNODE_FIND_HOLE, offset, 2, DNODES_PER_BLOCK >> 2, 0);
+ *	Finds the next L2 meta-dnode bp that's at most 1/4 full.
+ *	Used in dmu_object_alloc().
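+ *
+ * dnode_next_offset(dn, DNODE_FIND_BACKWARDS, offset, 1, 1, 0);
+ *	Finds the previous data block at or before *offset; an
+ *	illustrative variant of the first example above, not a
+ *	call site taken from this file.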
+ */ +int +dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset, + int minlvl, uint64_t blkfill, uint64_t txg) +{ + uint64_t initial_offset = *offset; + int lvl, maxlvl; + int error = 0; + + if (!(flags & DNODE_FIND_HAVELOCK)) + rw_enter(&dn->dn_struct_rwlock, RW_READER); + + if (dn->dn_phys->dn_nlevels == 0) { + error = SET_ERROR(ESRCH); + goto out; + } + + if (dn->dn_datablkshift == 0) { + if (*offset < dn->dn_datablksz) { + if (flags & DNODE_FIND_HOLE) + *offset = dn->dn_datablksz; + } else { + error = SET_ERROR(ESRCH); + } + goto out; + } + + maxlvl = dn->dn_phys->dn_nlevels; + + for (lvl = minlvl; lvl <= maxlvl; lvl++) { + error = dnode_next_offset_level(dn, + flags, offset, lvl, blkfill, txg); + if (error != ESRCH) + break; + } + + while (error == 0 && --lvl >= minlvl) { + error = dnode_next_offset_level(dn, + flags, offset, lvl, blkfill, txg); + } + + /* + * There's always a "virtual hole" at the end of the object, even + * if all BP's which physically exist are non-holes. + */ + if ((flags & DNODE_FIND_HOLE) && error == ESRCH && txg == 0 && + minlvl == 1 && blkfill == 1 && !(flags & DNODE_FIND_BACKWARDS)) { + error = 0; + } + + if (error == 0 && (flags & DNODE_FIND_BACKWARDS ? + initial_offset < *offset : initial_offset > *offset)) + error = SET_ERROR(ESRCH); +out: + if (!(flags & DNODE_FIND_HAVELOCK)) + rw_exit(&dn->dn_struct_rwlock); + + return (error); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c new file mode 100644 index 000000000000..551c44aa3f28 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c @@ -0,0 +1,777 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 
+ */ + +#include <sys/zfs_context.h> +#include <sys/dbuf.h> +#include <sys/dnode.h> +#include <sys/dmu.h> +#include <sys/dmu_tx.h> +#include <sys/dmu_objset.h> +#include <sys/dsl_dataset.h> +#include <sys/spa.h> +#include <sys/range_tree.h> +#include <sys/zfeature.h> + +static void +dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db; + int txgoff = tx->tx_txg & TXG_MASK; + int nblkptr = dn->dn_phys->dn_nblkptr; + int old_toplvl = dn->dn_phys->dn_nlevels - 1; + int new_level = dn->dn_next_nlevels[txgoff]; + int i; + + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + + /* this dnode can't be paged out because it's dirty */ + ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE); + ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); + ASSERT(new_level > 1 && dn->dn_phys->dn_nlevels > 0); + + db = dbuf_hold_level(dn, dn->dn_phys->dn_nlevels, 0, FTAG); + ASSERT(db != NULL); + + dn->dn_phys->dn_nlevels = new_level; + dprintf("os=%p obj=%llu, increase to %d\n", dn->dn_objset, + dn->dn_object, dn->dn_phys->dn_nlevels); + + /* transfer dnode's block pointers to new indirect block */ + (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED|DB_RF_HAVESTRUCT); + ASSERT(db->db.db_data); + ASSERT(arc_released(db->db_buf)); + ASSERT3U(sizeof (blkptr_t) * nblkptr, <=, db->db.db_size); + bcopy(dn->dn_phys->dn_blkptr, db->db.db_data, + sizeof (blkptr_t) * nblkptr); + arc_buf_freeze(db->db_buf); + + /* set dbuf's parent pointers to new indirect buf */ + for (i = 0; i < nblkptr; i++) { + dmu_buf_impl_t *child = + dbuf_find(dn->dn_objset, dn->dn_object, old_toplvl, i); + + if (child == NULL) + continue; +#ifdef DEBUG + DB_DNODE_ENTER(child); + ASSERT3P(DB_DNODE(child), ==, dn); + DB_DNODE_EXIT(child); +#endif /* DEBUG */ + if (child->db_parent && child->db_parent != dn->dn_dbuf) { + ASSERT(child->db_parent->db_level == db->db_level); + ASSERT(child->db_blkptr != + &dn->dn_phys->dn_blkptr[child->db_blkid]); + mutex_exit(&child->db_mtx); + continue; + } + ASSERT(child->db_parent == NULL || + child->db_parent == dn->dn_dbuf); + + child->db_parent = db; + dbuf_add_ref(db, child); + if (db->db.db_data) + child->db_blkptr = (blkptr_t *)db->db.db_data + i; + else + child->db_blkptr = NULL; + dprintf_dbuf_bp(child, child->db_blkptr, + "changed db_blkptr to new indirect %s", ""); + + mutex_exit(&child->db_mtx); + } + + bzero(dn->dn_phys->dn_blkptr, sizeof (blkptr_t) * nblkptr); + + dbuf_rele(db, FTAG); + + rw_exit(&dn->dn_struct_rwlock); +} + +static void +free_blocks(dnode_t *dn, blkptr_t *bp, int num, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; + uint64_t bytesfreed = 0; + + dprintf("ds=%p obj=%llx num=%d\n", ds, dn->dn_object, num); + + for (int i = 0; i < num; i++, bp++) { + if (BP_IS_HOLE(bp)) + continue; + + bytesfreed += dsl_dataset_block_kill(ds, bp, tx, B_FALSE); + ASSERT3U(bytesfreed, <=, DN_USED_BYTES(dn->dn_phys)); + + /* + * Save some useful information on the holes being + * punched, including logical size, type, and indirection + * level. Retaining birth time enables detection of when + * holes are punched for reducing the number of free + * records transmitted during a zfs send. 
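+		 *
+		 * For example, with birth times retained, an incremental
+		 * send from txg T can skip holes whose birth txg is <= T,
+		 * because the receiving side already has them.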
+ */
+
+		uint64_t lsize = BP_GET_LSIZE(bp);
+		dmu_object_type_t type = BP_GET_TYPE(bp);
+		uint64_t lvl = BP_GET_LEVEL(bp);
+
+		bzero(bp, sizeof (blkptr_t));
+
+		if (spa_feature_is_active(dn->dn_objset->os_spa,
+		    SPA_FEATURE_HOLE_BIRTH)) {
+			BP_SET_LSIZE(bp, lsize);
+			BP_SET_TYPE(bp, type);
+			BP_SET_LEVEL(bp, lvl);
+			BP_SET_BIRTH(bp, dmu_tx_get_txg(tx), 0);
+		}
+	}
+	dnode_diduse_space(dn, -bytesfreed);
+}
+
+#ifdef ZFS_DEBUG
+static void
+free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
+{
+	int off, num;
+	int i, err, epbs;
+	uint64_t txg = tx->tx_txg;
+	dnode_t *dn;
+
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+	epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+	off = start - (db->db_blkid * 1<<epbs);
+	num = end - start + 1;
+
+	ASSERT3U(off, >=, 0);
+	ASSERT3U(num, >=, 0);
+	ASSERT3U(db->db_level, >, 0);
+	ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
+	ASSERT3U(off+num, <=, db->db.db_size >> SPA_BLKPTRSHIFT);
+	ASSERT(db->db_blkptr != NULL);
+
+	for (i = off; i < off+num; i++) {
+		uint64_t *buf;
+		dmu_buf_impl_t *child;
+		dbuf_dirty_record_t *dr;
+		int j;
+
+		ASSERT(db->db_level == 1);
+
+		rw_enter(&dn->dn_struct_rwlock, RW_READER);
+		err = dbuf_hold_impl(dn, db->db_level-1,
+		    (db->db_blkid << epbs) + i, TRUE, FALSE, FTAG, &child);
+		rw_exit(&dn->dn_struct_rwlock);
+		if (err == ENOENT)
+			continue;
+		ASSERT(err == 0);
+		ASSERT(child->db_level == 0);
+		dr = child->db_last_dirty;
+		while (dr && dr->dr_txg > txg)
+			dr = dr->dr_next;
+		ASSERT(dr == NULL || dr->dr_txg == txg);
+
+		/* data_old better be zeroed */
+		if (dr) {
+			buf = dr->dt.dl.dr_data->b_data;
+			for (j = 0; j < child->db.db_size >> 3; j++) {
+				if (buf[j] != 0) {
+					panic("freed data not zero: "
+					    "child=%p i=%d off=%d num=%d\n",
+					    (void *)child, i, off, num);
+				}
+			}
+		}
+
+		/*
+		 * db_data better be zeroed unless it's dirty in a
+		 * future txg.
+		 */
+		mutex_enter(&child->db_mtx);
+		buf = child->db.db_data;
+		if (buf != NULL && child->db_state != DB_FILL &&
+		    child->db_last_dirty == NULL) {
+			for (j = 0; j < child->db.db_size >> 3; j++) {
+				if (buf[j] != 0) {
+					panic("freed data not zero: "
+					    "child=%p i=%d off=%d num=%d\n",
+					    (void *)child, i, off, num);
+				}
+			}
+		}
+		mutex_exit(&child->db_mtx);
+
+		dbuf_rele(child, FTAG);
+	}
+	DB_DNODE_EXIT(db);
+}
+#endif
+
+/*
+ * We don't usually free the indirect blocks here.  If in one txg we have a
+ * free_range and a write to the same indirect block, it's important that we
+ * preserve the hole's birth times.  Therefore, we don't free any indirect
+ * blocks in free_children().  If an indirect block happens to turn into all
+ * holes, it will be freed by dbuf_write_children_ready, which happens at a
+ * point in the syncing process where we know for certain the contents of the
+ * indirect block.
+ *
+ * However, if we're freeing a dnode, its space accounting must go to zero
+ * before we actually try to free the dnode, or we will trip an assertion. In
+ * addition, we know the case described above cannot occur, because the dnode is
+ * being freed. Therefore, we free the indirect blocks immediately in that
+ * case.
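+ *
+ * (Hence the free_indirects argument below: dnode_sync() passes B_TRUE
+ * only when the dnode itself is being freed.)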
+ */ +static void +free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, + boolean_t free_indirects, dmu_tx_t *tx) +{ + dnode_t *dn; + blkptr_t *bp; + dmu_buf_impl_t *subdb; + uint64_t start, end, dbstart, dbend; + unsigned int epbs, shift, i; + + /* + * There is a small possibility that this block will not be cached: + * 1 - if level > 1 and there are no children with level <= 1 + * 2 - if this block was evicted since we read it from + * dmu_tx_hold_free(). + */ + if (db->db_state != DB_CACHED) + (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); + + /* + * If we modify this indirect block, and we are not freeing the + * dnode (!free_indirects), then this indirect block needs to get + * written to disk by dbuf_write(). If it is dirty, we know it will + * be written (otherwise, we would have incorrect on-disk state + * because the space would be freed but still referenced by the BP + * in this indirect block). Therefore we VERIFY that it is + * dirty. + * + * Our VERIFY covers some cases that do not actually have to be + * dirty, but the open-context code happens to dirty. E.g. if the + * blocks we are freeing are all holes, because in that case, we + * are only freeing part of this indirect block, so it is an + * ancestor of the first or last block to be freed. The first and + * last L1 indirect blocks are always dirtied by dnode_free_range(). + */ + VERIFY(BP_GET_FILL(db->db_blkptr) == 0 || db->db_dirtycnt > 0); + + dbuf_release_bp(db); + bp = db->db.db_data; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; + ASSERT3U(epbs, <, 31); + shift = (db->db_level - 1) * epbs; + dbstart = db->db_blkid << epbs; + start = blkid >> shift; + if (dbstart < start) { + bp += start - dbstart; + } else { + start = dbstart; + } + dbend = ((db->db_blkid + 1) << epbs) - 1; + end = (blkid + nblks - 1) >> shift; + if (dbend <= end) + end = dbend; + + ASSERT3U(start, <=, end); + + if (db->db_level == 1) { + FREE_VERIFY(db, start, end, tx); + free_blocks(dn, bp, end-start+1, tx); + } else { + for (uint64_t id = start; id <= end; id++, bp++) { + if (BP_IS_HOLE(bp)) + continue; + rw_enter(&dn->dn_struct_rwlock, RW_READER); + VERIFY0(dbuf_hold_impl(dn, db->db_level - 1, + id, TRUE, FALSE, FTAG, &subdb)); + rw_exit(&dn->dn_struct_rwlock); + ASSERT3P(bp, ==, subdb->db_blkptr); + + free_children(subdb, blkid, nblks, free_indirects, tx); + dbuf_rele(subdb, FTAG); + } + } + + if (free_indirects) { + for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++) + ASSERT(BP_IS_HOLE(bp)); + bzero(db->db.db_data, db->db.db_size); + free_blocks(dn, db->db_blkptr, 1, tx); + } + + DB_DNODE_EXIT(db); + arc_buf_freeze(db->db_buf); +} + +/* + * Traverse the indicated range of the provided file + * and "free" all the blocks contained there. 
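+ * Here "free" means handing each block pointer to free_blocks(), which
+ * either frees the block outright or moves it to a deadlist via
+ * dsl_dataset_block_kill().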
+ */ +static void +dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks, + boolean_t free_indirects, dmu_tx_t *tx) +{ + blkptr_t *bp = dn->dn_phys->dn_blkptr; + int dnlevel = dn->dn_phys->dn_nlevels; + boolean_t trunc = B_FALSE; + + if (blkid > dn->dn_phys->dn_maxblkid) + return; + + ASSERT(dn->dn_phys->dn_maxblkid < UINT64_MAX); + if (blkid + nblks > dn->dn_phys->dn_maxblkid) { + nblks = dn->dn_phys->dn_maxblkid - blkid + 1; + trunc = B_TRUE; + } + + /* There are no indirect blocks in the object */ + if (dnlevel == 1) { + if (blkid >= dn->dn_phys->dn_nblkptr) { + /* this range was never made persistent */ + return; + } + ASSERT3U(blkid + nblks, <=, dn->dn_phys->dn_nblkptr); + free_blocks(dn, bp + blkid, nblks, tx); + } else { + int shift = (dnlevel - 1) * + (dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT); + int start = blkid >> shift; + int end = (blkid + nblks - 1) >> shift; + dmu_buf_impl_t *db; + + ASSERT(start < dn->dn_phys->dn_nblkptr); + bp += start; + for (int i = start; i <= end; i++, bp++) { + if (BP_IS_HOLE(bp)) + continue; + rw_enter(&dn->dn_struct_rwlock, RW_READER); + VERIFY0(dbuf_hold_impl(dn, dnlevel - 1, i, + TRUE, FALSE, FTAG, &db)); + rw_exit(&dn->dn_struct_rwlock); + + free_children(db, blkid, nblks, free_indirects, tx); + dbuf_rele(db, FTAG); + } + } + + if (trunc) { + dn->dn_phys->dn_maxblkid = blkid == 0 ? 0 : blkid - 1; + + uint64_t off = (dn->dn_phys->dn_maxblkid + 1) * + (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT); + ASSERT(off < dn->dn_phys->dn_maxblkid || + dn->dn_phys->dn_maxblkid == 0 || + dnode_next_offset(dn, 0, &off, 1, 1, 0) != 0); + } +} + +typedef struct dnode_sync_free_range_arg { + dnode_t *dsfra_dnode; + dmu_tx_t *dsfra_tx; + boolean_t dsfra_free_indirects; +} dnode_sync_free_range_arg_t; + +static void +dnode_sync_free_range(void *arg, uint64_t blkid, uint64_t nblks) +{ + dnode_sync_free_range_arg_t *dsfra = arg; + dnode_t *dn = dsfra->dsfra_dnode; + + mutex_exit(&dn->dn_mtx); + dnode_sync_free_range_impl(dn, blkid, nblks, + dsfra->dsfra_free_indirects, dsfra->dsfra_tx); + mutex_enter(&dn->dn_mtx); +} + +/* + * Try to kick all the dnode's dbufs out of the cache... + */ +void +dnode_evict_dbufs(dnode_t *dn) +{ + dmu_buf_impl_t db_marker; + dmu_buf_impl_t *db, *db_next; + + mutex_enter(&dn->dn_dbufs_mtx); + for (db = avl_first(&dn->dn_dbufs); db != NULL; db = db_next) { + +#ifdef DEBUG + DB_DNODE_ENTER(db); + ASSERT3P(DB_DNODE(db), ==, dn); + DB_DNODE_EXIT(db); +#endif /* DEBUG */ + + mutex_enter(&db->db_mtx); + if (db->db_state != DB_EVICTING && + refcount_is_zero(&db->db_holds)) { + db_marker.db_level = db->db_level; + db_marker.db_blkid = db->db_blkid; + db_marker.db_state = DB_SEARCH; + avl_insert_here(&dn->dn_dbufs, &db_marker, db, + AVL_BEFORE); + + /* + * We need to use the "marker" dbuf rather than + * simply getting the next dbuf, because + * dbuf_destroy() may actually remove multiple dbufs. + * It can call itself recursively on the parent dbuf, + * which may also be removed from dn_dbufs. 
The code
+			 * flow would look like:
+			 *
+			 * dbuf_destroy():
+			 *   dnode_rele_and_unlock(parent_dbuf, evicting=TRUE):
+			 *	if (!cacheable || pending_evict)
+			 *	  dbuf_destroy()
+			 */
+			dbuf_destroy(db);
+
+			db_next = AVL_NEXT(&dn->dn_dbufs, &db_marker);
+			avl_remove(&dn->dn_dbufs, &db_marker);
+		} else {
+			db->db_pending_evict = TRUE;
+			mutex_exit(&db->db_mtx);
+			db_next = AVL_NEXT(&dn->dn_dbufs, db);
+		}
+	}
+	mutex_exit(&dn->dn_dbufs_mtx);
+
+	dnode_evict_bonus(dn);
+}
+
+void
+dnode_evict_bonus(dnode_t *dn)
+{
+	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+	if (dn->dn_bonus != NULL) {
+		if (refcount_is_zero(&dn->dn_bonus->db_holds)) {
+			mutex_enter(&dn->dn_bonus->db_mtx);
+			dbuf_destroy(dn->dn_bonus);
+			dn->dn_bonus = NULL;
+		} else {
+			dn->dn_bonus->db_pending_evict = TRUE;
+		}
+	}
+	rw_exit(&dn->dn_struct_rwlock);
+}
+
+static void
+dnode_undirty_dbufs(list_t *list)
+{
+	dbuf_dirty_record_t *dr;
+
+	while ((dr = list_head(list)) != NULL) {
+		dmu_buf_impl_t *db = dr->dr_dbuf;
+		uint64_t txg = dr->dr_txg;
+
+		if (db->db_level != 0)
+			dnode_undirty_dbufs(&dr->dt.di.dr_children);
+
+		mutex_enter(&db->db_mtx);
+		/* XXX - use dbuf_undirty()? */
+		list_remove(list, dr);
+		ASSERT(db->db_last_dirty == dr);
+		db->db_last_dirty = NULL;
+		db->db_dirtycnt -= 1;
+		if (db->db_level == 0) {
+			ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
+			    dr->dt.dl.dr_data == db->db_buf);
+			dbuf_unoverride(dr);
+		} else {
+			mutex_destroy(&dr->dt.di.dr_mtx);
+			list_destroy(&dr->dt.di.dr_children);
+		}
+		kmem_free(dr, sizeof (dbuf_dirty_record_t));
+		dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg, B_FALSE);
+	}
+}
+
+static void
+dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
+{
+	int txgoff = tx->tx_txg & TXG_MASK;
+
+	ASSERT(dmu_tx_is_syncing(tx));
+
+	/*
+	 * Our contents should have been freed in dnode_sync() by the
+	 * free range record inserted by the caller of dnode_free().
+	 */
+	ASSERT0(DN_USED_BYTES(dn->dn_phys));
+	ASSERT(BP_IS_HOLE(dn->dn_phys->dn_blkptr));
+
+	dnode_undirty_dbufs(&dn->dn_dirty_records[txgoff]);
+	dnode_evict_dbufs(dn);
+
+	/*
+	 * XXX - It would be nice to assert this, but we may still
+	 * have residual holds from async evictions from the arc...
+	 *
+	 * zfs_obj_to_path() also depends on this being
+	 * commented out.
+	 *
+	 * ASSERT3U(refcount_count(&dn->dn_holds), ==, 1);
+	 */
+
+	/* Undirty next bits */
+	dn->dn_next_nlevels[txgoff] = 0;
+	dn->dn_next_indblkshift[txgoff] = 0;
+	dn->dn_next_blksz[txgoff] = 0;
+
+	/* ASSERT(blkptrs are zero); */
+	ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE);
+	ASSERT(dn->dn_type != DMU_OT_NONE);
+
+	ASSERT(dn->dn_free_txg > 0);
+	if (dn->dn_allocated_txg != dn->dn_free_txg)
+		dmu_buf_will_dirty(&dn->dn_dbuf->db, tx);
+	bzero(dn->dn_phys, sizeof (dnode_phys_t) * dn->dn_num_slots);
+
+	mutex_enter(&dn->dn_mtx);
+	dn->dn_type = DMU_OT_NONE;
+	dn->dn_maxblkid = 0;
+	dn->dn_allocated_txg = 0;
+	dn->dn_free_txg = 0;
+	dn->dn_have_spill = B_FALSE;
+	mutex_exit(&dn->dn_mtx);
+
+	ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
+
+	dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg);
+	/*
+	 * Now that we've released our hold, the dnode may
+	 * be evicted, so we mustn't access it.
+	 */
+}
+
+/*
+ * Write out the dnode's dirty buffers.
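+ * Runs in syncing context; note the dmu_tx_is_syncing() assertion
+ * below.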
+ */ +void +dnode_sync(dnode_t *dn, dmu_tx_t *tx) +{ + dnode_phys_t *dnp = dn->dn_phys; + int txgoff = tx->tx_txg & TXG_MASK; + list_t *list = &dn->dn_dirty_records[txgoff]; + static const dnode_phys_t zerodn = { 0 }; + boolean_t kill_spill = B_FALSE; + + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(dnp->dn_type != DMU_OT_NONE || dn->dn_allocated_txg); + ASSERT(dnp->dn_type != DMU_OT_NONE || + bcmp(dnp, &zerodn, DNODE_MIN_SIZE) == 0); + DNODE_VERIFY(dn); + + ASSERT(dn->dn_dbuf == NULL || arc_released(dn->dn_dbuf->db_buf)); + + if (dmu_objset_userused_enabled(dn->dn_objset) && + !DMU_OBJECT_IS_SPECIAL(dn->dn_object)) { + mutex_enter(&dn->dn_mtx); + dn->dn_oldused = DN_USED_BYTES(dn->dn_phys); + dn->dn_oldflags = dn->dn_phys->dn_flags; + dn->dn_phys->dn_flags |= DNODE_FLAG_USERUSED_ACCOUNTED; + mutex_exit(&dn->dn_mtx); + dmu_objset_userquota_get_ids(dn, B_FALSE, tx); + } else { + /* Once we account for it, we should always account for it. */ + ASSERT(!(dn->dn_phys->dn_flags & + DNODE_FLAG_USERUSED_ACCOUNTED)); + } + + mutex_enter(&dn->dn_mtx); + if (dn->dn_allocated_txg == tx->tx_txg) { + /* The dnode is newly allocated or reallocated */ + if (dnp->dn_type == DMU_OT_NONE) { + /* this is a first alloc, not a realloc */ + dnp->dn_nlevels = 1; + dnp->dn_nblkptr = dn->dn_nblkptr; + } + + dnp->dn_type = dn->dn_type; + dnp->dn_bonustype = dn->dn_bonustype; + dnp->dn_bonuslen = dn->dn_bonuslen; + } + + dnp->dn_extra_slots = dn->dn_num_slots - 1; + + ASSERT(dnp->dn_nlevels > 1 || + BP_IS_HOLE(&dnp->dn_blkptr[0]) || + BP_IS_EMBEDDED(&dnp->dn_blkptr[0]) || + BP_GET_LSIZE(&dnp->dn_blkptr[0]) == + dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); + ASSERT(dnp->dn_nlevels < 2 || + BP_IS_HOLE(&dnp->dn_blkptr[0]) || + BP_GET_LSIZE(&dnp->dn_blkptr[0]) == 1 << dnp->dn_indblkshift); + + if (dn->dn_next_type[txgoff] != 0) { + dnp->dn_type = dn->dn_type; + dn->dn_next_type[txgoff] = 0; + } + + if (dn->dn_next_blksz[txgoff] != 0) { + ASSERT(P2PHASE(dn->dn_next_blksz[txgoff], + SPA_MINBLOCKSIZE) == 0); + ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[0]) || + dn->dn_maxblkid == 0 || list_head(list) != NULL || + dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT == + dnp->dn_datablkszsec || + !range_tree_is_empty(dn->dn_free_ranges[txgoff])); + dnp->dn_datablkszsec = + dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT; + dn->dn_next_blksz[txgoff] = 0; + } + + if (dn->dn_next_bonuslen[txgoff] != 0) { + if (dn->dn_next_bonuslen[txgoff] == DN_ZERO_BONUSLEN) + dnp->dn_bonuslen = 0; + else + dnp->dn_bonuslen = dn->dn_next_bonuslen[txgoff]; + ASSERT(dnp->dn_bonuslen <= + DN_SLOTS_TO_BONUSLEN(dnp->dn_extra_slots + 1)); + dn->dn_next_bonuslen[txgoff] = 0; + } + + if (dn->dn_next_bonustype[txgoff] != 0) { + ASSERT(DMU_OT_IS_VALID(dn->dn_next_bonustype[txgoff])); + dnp->dn_bonustype = dn->dn_next_bonustype[txgoff]; + dn->dn_next_bonustype[txgoff] = 0; + } + + boolean_t freeing_dnode = dn->dn_free_txg > 0 && + dn->dn_free_txg <= tx->tx_txg; + + /* + * Remove the spill block if we have been explicitly asked to + * remove it, or if the object is being removed. + */ + if (dn->dn_rm_spillblk[txgoff] || freeing_dnode) { + if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) + kill_spill = B_TRUE; + dn->dn_rm_spillblk[txgoff] = 0; + } + + if (dn->dn_next_indblkshift[txgoff] != 0) { + ASSERT(dnp->dn_nlevels == 1); + dnp->dn_indblkshift = dn->dn_next_indblkshift[txgoff]; + dn->dn_next_indblkshift[txgoff] = 0; + } + + /* + * Just take the live (open-context) values for checksum and compress. 
+ * Strictly speaking it's a future leak, but nothing bad happens if we + * start using the new checksum or compress algorithm a little early. + */ + dnp->dn_checksum = dn->dn_checksum; + dnp->dn_compress = dn->dn_compress; + + mutex_exit(&dn->dn_mtx); + + if (kill_spill) { + free_blocks(dn, DN_SPILL_BLKPTR(dn->dn_phys), 1, tx); + mutex_enter(&dn->dn_mtx); + dnp->dn_flags &= ~DNODE_FLAG_SPILL_BLKPTR; + mutex_exit(&dn->dn_mtx); + } + + /* process all the "freed" ranges in the file */ + if (dn->dn_free_ranges[txgoff] != NULL) { + dnode_sync_free_range_arg_t dsfra; + dsfra.dsfra_dnode = dn; + dsfra.dsfra_tx = tx; + dsfra.dsfra_free_indirects = freeing_dnode; + if (freeing_dnode) { + ASSERT(range_tree_contains(dn->dn_free_ranges[txgoff], + 0, dn->dn_maxblkid + 1)); + } + mutex_enter(&dn->dn_mtx); + range_tree_vacate(dn->dn_free_ranges[txgoff], + dnode_sync_free_range, &dsfra); + range_tree_destroy(dn->dn_free_ranges[txgoff]); + dn->dn_free_ranges[txgoff] = NULL; + mutex_exit(&dn->dn_mtx); + } + + if (freeing_dnode) { + dn->dn_objset->os_freed_dnodes++; + dnode_sync_free(dn, tx); + return; + } + + if (dn->dn_num_slots > DNODE_MIN_SLOTS) { + dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; + mutex_enter(&ds->ds_lock); + ds->ds_feature_activation_needed[SPA_FEATURE_LARGE_DNODE] = + B_TRUE; + mutex_exit(&ds->ds_lock); + } + + if (dn->dn_next_nlevels[txgoff]) { + dnode_increase_indirection(dn, tx); + dn->dn_next_nlevels[txgoff] = 0; + } + + if (dn->dn_next_nblkptr[txgoff]) { + /* this should only happen on a realloc */ + ASSERT(dn->dn_allocated_txg == tx->tx_txg); + if (dn->dn_next_nblkptr[txgoff] > dnp->dn_nblkptr) { + /* zero the new blkptrs we are gaining */ + bzero(dnp->dn_blkptr + dnp->dn_nblkptr, + sizeof (blkptr_t) * + (dn->dn_next_nblkptr[txgoff] - dnp->dn_nblkptr)); +#ifdef ZFS_DEBUG + } else { + int i; + ASSERT(dn->dn_next_nblkptr[txgoff] < dnp->dn_nblkptr); + /* the blkptrs we are losing better be unallocated */ + for (i = dn->dn_next_nblkptr[txgoff]; + i < dnp->dn_nblkptr; i++) + ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[i])); +#endif + } + mutex_enter(&dn->dn_mtx); + dnp->dn_nblkptr = dn->dn_next_nblkptr[txgoff]; + dn->dn_next_nblkptr[txgoff] = 0; + mutex_exit(&dn->dn_mtx); + } + + dbuf_sync_list(list, dn->dn_phys->dn_nlevels - 1, tx); + + if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) { + ASSERT3P(list_head(list), ==, NULL); + dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg); + } + + /* + * Although we have dropped our reference to the dnode, it + * can't be evicted until its written, and we haven't yet + * initiated the IO for the dnode's dbuf. + */ +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_bookmark.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_bookmark.c new file mode 100644 index 000000000000..0a58115341c7 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_bookmark.c @@ -0,0 +1,461 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2013, 2014 by Delphix. All rights reserved. + * Copyright 2017 Nexenta Systems, Inc. 
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_tx.h>
+#include <sys/arc.h>
+#include <sys/zap.h>
+#include <sys/zfeature.h>
+#include <sys/spa.h>
+#include <sys/dsl_bookmark.h>
+#include <zfs_namecheck.h>
+
+static int
+dsl_bookmark_hold_ds(dsl_pool_t *dp, const char *fullname,
+    dsl_dataset_t **dsp, void *tag, char **shortnamep)
+{
+	char buf[ZFS_MAX_DATASET_NAME_LEN];
+	char *hashp;
+
+	if (strlen(fullname) >= ZFS_MAX_DATASET_NAME_LEN)
+		return (SET_ERROR(ENAMETOOLONG));
+	hashp = strchr(fullname, '#');
+	if (hashp == NULL)
+		return (SET_ERROR(EINVAL));
+
+	*shortnamep = hashp + 1;
+	if (zfs_component_namecheck(*shortnamep, NULL, NULL))
+		return (SET_ERROR(EINVAL));
+	(void) strlcpy(buf, fullname, hashp - fullname + 1);
+	return (dsl_dataset_hold(dp, buf, tag, dsp));
+}
+
+/*
+ * Returns ESRCH if bookmark is not found.
+ */
+static int
+dsl_dataset_bmark_lookup(dsl_dataset_t *ds, const char *shortname,
+    zfs_bookmark_phys_t *bmark_phys)
+{
+	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+	uint64_t bmark_zapobj = ds->ds_bookmarks;
+	matchtype_t mt = 0;
+	int err;
+
+	if (bmark_zapobj == 0)
+		return (SET_ERROR(ESRCH));
+
+	if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)
+		mt = MT_NORMALIZE;
+
+	err = zap_lookup_norm(mos, bmark_zapobj, shortname, sizeof (uint64_t),
+	    sizeof (*bmark_phys) / sizeof (uint64_t), bmark_phys, mt,
+	    NULL, 0, NULL);
+
+	return (err == ENOENT ? ESRCH : err);
+}
+
+/*
+ * If later_ds is non-NULL, this will return EXDEV if the specified bookmark
+ * does not represent an earlier point in later_ds's timeline.
+ *
+ * Returns ENOENT if the dataset containing the bookmark does not exist.
+ * Returns ESRCH if the dataset exists but the bookmark was not found in it.
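+ *
+ * Illustrative call (the names are examples, not from a real caller):
+ * dsl_bookmark_lookup(dp, "pool/fs#mark", NULL, &bm) fills in *bm for
+ * the bookmark "mark" on dataset "pool/fs".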
+ */ +int +dsl_bookmark_lookup(dsl_pool_t *dp, const char *fullname, + dsl_dataset_t *later_ds, zfs_bookmark_phys_t *bmp) +{ + char *shortname; + dsl_dataset_t *ds; + int error; + + error = dsl_bookmark_hold_ds(dp, fullname, &ds, FTAG, &shortname); + if (error != 0) + return (error); + + error = dsl_dataset_bmark_lookup(ds, shortname, bmp); + if (error == 0 && later_ds != NULL) { + if (!dsl_dataset_is_before(later_ds, ds, bmp->zbm_creation_txg)) + error = SET_ERROR(EXDEV); + } + dsl_dataset_rele(ds, FTAG); + return (error); +} + +typedef struct dsl_bookmark_create_arg { + nvlist_t *dbca_bmarks; + nvlist_t *dbca_errors; +} dsl_bookmark_create_arg_t; + +static int +dsl_bookmark_create_check_impl(dsl_dataset_t *snapds, const char *bookmark_name, + dmu_tx_t *tx) +{ + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *bmark_fs; + char *shortname; + int error; + zfs_bookmark_phys_t bmark_phys; + + if (!snapds->ds_is_snapshot) + return (SET_ERROR(EINVAL)); + + error = dsl_bookmark_hold_ds(dp, bookmark_name, + &bmark_fs, FTAG, &shortname); + if (error != 0) + return (error); + + if (!dsl_dataset_is_before(bmark_fs, snapds, 0)) { + dsl_dataset_rele(bmark_fs, FTAG); + return (SET_ERROR(EINVAL)); + } + + error = dsl_dataset_bmark_lookup(bmark_fs, shortname, + &bmark_phys); + dsl_dataset_rele(bmark_fs, FTAG); + if (error == 0) + return (SET_ERROR(EEXIST)); + if (error == ESRCH) + return (0); + return (error); +} + +static int +dsl_bookmark_create_check(void *arg, dmu_tx_t *tx) +{ + dsl_bookmark_create_arg_t *dbca = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + int rv = 0; + + if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_BOOKMARKS)) + return (SET_ERROR(ENOTSUP)); + + for (nvpair_t *pair = nvlist_next_nvpair(dbca->dbca_bmarks, NULL); + pair != NULL; pair = nvlist_next_nvpair(dbca->dbca_bmarks, pair)) { + dsl_dataset_t *snapds; + int error; + + /* note: validity of nvlist checked by ioctl layer */ + error = dsl_dataset_hold(dp, fnvpair_value_string(pair), + FTAG, &snapds); + if (error == 0) { + error = dsl_bookmark_create_check_impl(snapds, + nvpair_name(pair), tx); + dsl_dataset_rele(snapds, FTAG); + } + if (error != 0) { + fnvlist_add_int32(dbca->dbca_errors, + nvpair_name(pair), error); + rv = error; + } + } + + return (rv); +} + +static void +dsl_bookmark_create_sync(void *arg, dmu_tx_t *tx) +{ + dsl_bookmark_create_arg_t *dbca = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + objset_t *mos = dp->dp_meta_objset; + + ASSERT(spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_BOOKMARKS)); + + for (nvpair_t *pair = nvlist_next_nvpair(dbca->dbca_bmarks, NULL); + pair != NULL; pair = nvlist_next_nvpair(dbca->dbca_bmarks, pair)) { + dsl_dataset_t *snapds, *bmark_fs; + zfs_bookmark_phys_t bmark_phys; + char *shortname; + + VERIFY0(dsl_dataset_hold(dp, fnvpair_value_string(pair), + FTAG, &snapds)); + VERIFY0(dsl_bookmark_hold_ds(dp, nvpair_name(pair), + &bmark_fs, FTAG, &shortname)); + if (bmark_fs->ds_bookmarks == 0) { + bmark_fs->ds_bookmarks = + zap_create_norm(mos, U8_TEXTPREP_TOUPPER, + DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx); + spa_feature_incr(dp->dp_spa, SPA_FEATURE_BOOKMARKS, tx); + + dsl_dataset_zapify(bmark_fs, tx); + VERIFY0(zap_add(mos, bmark_fs->ds_object, + DS_FIELD_BOOKMARK_NAMES, + sizeof (bmark_fs->ds_bookmarks), 1, + &bmark_fs->ds_bookmarks, tx)); + } + + bmark_phys.zbm_guid = dsl_dataset_phys(snapds)->ds_guid; + bmark_phys.zbm_creation_txg = + dsl_dataset_phys(snapds)->ds_creation_txg; + bmark_phys.zbm_creation_time = + dsl_dataset_phys(snapds)->ds_creation_time; + + VERIFY0(zap_add(mos, 
bmark_fs->ds_bookmarks, + shortname, sizeof (uint64_t), + sizeof (zfs_bookmark_phys_t) / sizeof (uint64_t), + &bmark_phys, tx)); + + spa_history_log_internal_ds(bmark_fs, "bookmark", tx, + "name=%s creation_txg=%llu target_snap=%llu", + shortname, + (longlong_t)bmark_phys.zbm_creation_txg, + (longlong_t)snapds->ds_object); + + dsl_dataset_rele(bmark_fs, FTAG); + dsl_dataset_rele(snapds, FTAG); + } +} + +/* + * The bookmarks must all be in the same pool. + */ +int +dsl_bookmark_create(nvlist_t *bmarks, nvlist_t *errors) +{ + nvpair_t *pair; + dsl_bookmark_create_arg_t dbca; + + pair = nvlist_next_nvpair(bmarks, NULL); + if (pair == NULL) + return (0); + + dbca.dbca_bmarks = bmarks; + dbca.dbca_errors = errors; + + return (dsl_sync_task(nvpair_name(pair), dsl_bookmark_create_check, + dsl_bookmark_create_sync, &dbca, + fnvlist_num_pairs(bmarks), ZFS_SPACE_CHECK_NORMAL)); +} + +int +dsl_get_bookmarks_impl(dsl_dataset_t *ds, nvlist_t *props, nvlist_t *outnvl) +{ + int err = 0; + zap_cursor_t zc; + zap_attribute_t attr; + dsl_pool_t *dp = ds->ds_dir->dd_pool; + + uint64_t bmark_zapobj = ds->ds_bookmarks; + if (bmark_zapobj == 0) + return (0); + + for (zap_cursor_init(&zc, dp->dp_meta_objset, bmark_zapobj); + zap_cursor_retrieve(&zc, &attr) == 0; + zap_cursor_advance(&zc)) { + char *bmark_name = attr.za_name; + zfs_bookmark_phys_t bmark_phys; + + err = dsl_dataset_bmark_lookup(ds, bmark_name, &bmark_phys); + ASSERT3U(err, !=, ENOENT); + if (err != 0) + break; + + nvlist_t *out_props = fnvlist_alloc(); + if (nvlist_exists(props, + zfs_prop_to_name(ZFS_PROP_GUID))) { + dsl_prop_nvlist_add_uint64(out_props, + ZFS_PROP_GUID, bmark_phys.zbm_guid); + } + if (nvlist_exists(props, + zfs_prop_to_name(ZFS_PROP_CREATETXG))) { + dsl_prop_nvlist_add_uint64(out_props, + ZFS_PROP_CREATETXG, bmark_phys.zbm_creation_txg); + } + if (nvlist_exists(props, + zfs_prop_to_name(ZFS_PROP_CREATION))) { + dsl_prop_nvlist_add_uint64(out_props, + ZFS_PROP_CREATION, bmark_phys.zbm_creation_time); + } + + fnvlist_add_nvlist(outnvl, bmark_name, out_props); + fnvlist_free(out_props); + } + zap_cursor_fini(&zc); + return (err); +} + +/* + * Retrieve the bookmarks that exist in the specified dataset, and the + * requested properties of each bookmark. + * + * The "props" nvlist specifies which properties are requested. + * See lzc_get_bookmarks() for the list of valid properties. 
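+ * (As implemented in dsl_get_bookmarks_impl() above, the recognized
+ * properties are guid, createtxg, and creation.)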
+ */ +int +dsl_get_bookmarks(const char *dsname, nvlist_t *props, nvlist_t *outnvl) +{ + dsl_pool_t *dp; + dsl_dataset_t *ds; + int err; + + err = dsl_pool_hold(dsname, FTAG, &dp); + if (err != 0) + return (err); + err = dsl_dataset_hold(dp, dsname, FTAG, &ds); + if (err != 0) { + dsl_pool_rele(dp, FTAG); + return (err); + } + + err = dsl_get_bookmarks_impl(ds, props, outnvl); + + dsl_dataset_rele(ds, FTAG); + dsl_pool_rele(dp, FTAG); + return (err); +} + +typedef struct dsl_bookmark_destroy_arg { + nvlist_t *dbda_bmarks; + nvlist_t *dbda_success; + nvlist_t *dbda_errors; +} dsl_bookmark_destroy_arg_t; + +static int +dsl_dataset_bookmark_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx) +{ + objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; + uint64_t bmark_zapobj = ds->ds_bookmarks; + matchtype_t mt = 0; + + if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET) + mt = MT_NORMALIZE; + + return (zap_remove_norm(mos, bmark_zapobj, name, mt, tx)); +} + +static int +dsl_bookmark_destroy_check(void *arg, dmu_tx_t *tx) +{ + dsl_bookmark_destroy_arg_t *dbda = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + int rv = 0; + + ASSERT(nvlist_empty(dbda->dbda_success)); + ASSERT(nvlist_empty(dbda->dbda_errors)); + + if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_BOOKMARKS)) + return (0); + + for (nvpair_t *pair = nvlist_next_nvpair(dbda->dbda_bmarks, NULL); + pair != NULL; pair = nvlist_next_nvpair(dbda->dbda_bmarks, pair)) { + const char *fullname = nvpair_name(pair); + dsl_dataset_t *ds; + zfs_bookmark_phys_t bm; + int error; + char *shortname; + + error = dsl_bookmark_hold_ds(dp, fullname, &ds, + FTAG, &shortname); + if (error == ENOENT) { + /* ignore it; the bookmark is "already destroyed" */ + continue; + } + if (error == 0) { + error = dsl_dataset_bmark_lookup(ds, shortname, &bm); + dsl_dataset_rele(ds, FTAG); + if (error == ESRCH) { + /* + * ignore it; the bookmark is + * "already destroyed" + */ + continue; + } + } + if (error == 0) { + if (dmu_tx_is_syncing(tx)) { + fnvlist_add_boolean(dbda->dbda_success, + fullname); + } + } else { + fnvlist_add_int32(dbda->dbda_errors, fullname, error); + rv = error; + } + } + return (rv); +} + +static void +dsl_bookmark_destroy_sync(void *arg, dmu_tx_t *tx) +{ + dsl_bookmark_destroy_arg_t *dbda = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + objset_t *mos = dp->dp_meta_objset; + + for (nvpair_t *pair = nvlist_next_nvpair(dbda->dbda_success, NULL); + pair != NULL; pair = nvlist_next_nvpair(dbda->dbda_success, pair)) { + dsl_dataset_t *ds; + char *shortname; + uint64_t zap_cnt; + + VERIFY0(dsl_bookmark_hold_ds(dp, nvpair_name(pair), + &ds, FTAG, &shortname)); + VERIFY0(dsl_dataset_bookmark_remove(ds, shortname, tx)); + + /* + * If all of this dataset's bookmarks have been destroyed, + * free the zap object and decrement the feature's use count. + */ + VERIFY0(zap_count(mos, ds->ds_bookmarks, + &zap_cnt)); + if (zap_cnt == 0) { + dmu_buf_will_dirty(ds->ds_dbuf, tx); + VERIFY0(zap_destroy(mos, ds->ds_bookmarks, tx)); + ds->ds_bookmarks = 0; + spa_feature_decr(dp->dp_spa, SPA_FEATURE_BOOKMARKS, tx); + VERIFY0(zap_remove(mos, ds->ds_object, + DS_FIELD_BOOKMARK_NAMES, tx)); + } + + spa_history_log_internal_ds(ds, "remove bookmark", tx, + "name=%s", shortname); + + dsl_dataset_rele(ds, FTAG); + } +} + +/* + * The bookmarks must all be in the same pool. 
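+ * (The sync task below is dispatched against the pool of the first
+ * bookmark in the list, and every name is resolved within that pool.)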
+ */ +int +dsl_bookmark_destroy(nvlist_t *bmarks, nvlist_t *errors) +{ + int rv; + dsl_bookmark_destroy_arg_t dbda; + nvpair_t *pair = nvlist_next_nvpair(bmarks, NULL); + if (pair == NULL) + return (0); + + dbda.dbda_bmarks = bmarks; + dbda.dbda_errors = errors; + dbda.dbda_success = fnvlist_alloc(); + + rv = dsl_sync_task(nvpair_name(pair), dsl_bookmark_destroy_check, + dsl_bookmark_destroy_sync, &dbda, fnvlist_num_pairs(bmarks), + ZFS_SPACE_CHECK_RESERVED); + fnvlist_free(dbda.dbda_success); + return (rv); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c new file mode 100644 index 000000000000..cc295a3f366e --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c @@ -0,0 +1,4256 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Portions Copyright (c) 2011 Martin Matuska <mm@FreeBSD.org> + * Copyright (c) 2011, 2017 by Delphix. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright (c) 2014 RackTop Systems. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + * Copyright 2016, OmniTI Computer Consulting, Inc. All rights reserved. + * Copyright 2017 Nexenta Systems, Inc. + */ + +#include <sys/dmu_objset.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_prop.h> +#include <sys/dsl_synctask.h> +#include <sys/dmu_traverse.h> +#include <sys/dmu_impl.h> +#include <sys/dmu_send.h> +#include <sys/dmu_tx.h> +#include <sys/arc.h> +#include <sys/zio.h> +#include <sys/zap.h> +#include <sys/zfeature.h> +#include <sys/unique.h> +#include <sys/zfs_context.h> +#include <sys/zfs_ioctl.h> +#include <sys/spa.h> +#include <sys/spa_impl.h> +#include <sys/vdev.h> +#include <sys/zfs_znode.h> +#include <sys/zfs_onexit.h> +#include <sys/zvol.h> +#include <sys/dsl_scan.h> +#include <sys/dsl_deadlist.h> +#include <sys/dsl_destroy.h> +#include <sys/dsl_userhold.h> +#include <sys/dsl_bookmark.h> +#include <sys/dmu_send.h> +#include <sys/zio_checksum.h> +#include <sys/zio_compress.h> +#include <zfs_fletcher.h> + +SYSCTL_DECL(_vfs_zfs); + +/* + * The SPA supports block sizes up to 16MB. However, very large blocks + * can have an impact on i/o latency (e.g. tying up a spinning disk for + * ~300ms), and also potentially on the memory allocator. Therefore, + * we do not allow the recordsize to be set larger than zfs_max_recordsize + * (default 1MB). Larger blocks can be created by changing this tunable, + * and pools with larger blocks can always be imported and used, regardless + * of this setting. 
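+ *
+ * For example (illustrative), the limit can be raised at runtime on
+ * FreeBSD through the sysctl declared below:
+ *	sysctl vfs.zfs.max_recordsize=16777216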
+ */
+int zfs_max_recordsize = 1 * 1024 * 1024;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, max_recordsize, CTLFLAG_RWTUN,
+    &zfs_max_recordsize, 0,
+    "Maximum block size.  Expect dragons when tuning this.");
+
+#define	SWITCH64(x, y) \
+	{ \
+		uint64_t __tmp = (x); \
+		(x) = (y); \
+		(y) = __tmp; \
+	}
+
+#define	DS_REF_MAX	(1ULL << 62)
+
+extern inline dsl_dataset_phys_t *dsl_dataset_phys(dsl_dataset_t *ds);
+
+static void dsl_dataset_set_remap_deadlist_object(dsl_dataset_t *ds,
+    uint64_t obj, dmu_tx_t *tx);
+static void dsl_dataset_unset_remap_deadlist_object(dsl_dataset_t *ds,
+    dmu_tx_t *tx);
+
+extern int spa_asize_inflation;
+
+static zil_header_t zero_zil;
+
+/*
+ * Figure out how much of this delta should be propagated to the dsl_dir
+ * layer.  If there's a refreservation, that space has already been
+ * partially accounted for in our ancestors.
+ */
+static int64_t
+parent_delta(dsl_dataset_t *ds, int64_t delta)
+{
+	dsl_dataset_phys_t *ds_phys;
+	uint64_t old_bytes, new_bytes;
+
+	if (ds->ds_reserved == 0)
+		return (delta);
+
+	ds_phys = dsl_dataset_phys(ds);
+	old_bytes = MAX(ds_phys->ds_unique_bytes, ds->ds_reserved);
+	new_bytes = MAX(ds_phys->ds_unique_bytes + delta, ds->ds_reserved);
+
+	ASSERT3U(ABS((int64_t)(new_bytes - old_bytes)), <=, ABS(delta));
+	return (new_bytes - old_bytes);
+}
+
+void
+dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
+{
+	int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
+	int compressed = BP_GET_PSIZE(bp);
+	int uncompressed = BP_GET_UCSIZE(bp);
+	int64_t delta;
+
+	dprintf_bp(bp, "ds=%p", ds);
+
+	ASSERT(dmu_tx_is_syncing(tx));
+	/* It could have been compressed away to nothing */
+	if (BP_IS_HOLE(bp))
+		return;
+	ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE);
+	ASSERT(DMU_OT_IS_VALID(BP_GET_TYPE(bp)));
+	if (ds == NULL) {
+		dsl_pool_mos_diduse_space(tx->tx_pool,
+		    used, compressed, uncompressed);
+		return;
+	}
+
+	ASSERT3U(bp->blk_birth, >, dsl_dataset_phys(ds)->ds_prev_snap_txg);
+	dmu_buf_will_dirty(ds->ds_dbuf, tx);
+	mutex_enter(&ds->ds_lock);
+	delta = parent_delta(ds, used);
+	dsl_dataset_phys(ds)->ds_referenced_bytes += used;
+	dsl_dataset_phys(ds)->ds_compressed_bytes += compressed;
+	dsl_dataset_phys(ds)->ds_uncompressed_bytes += uncompressed;
+	dsl_dataset_phys(ds)->ds_unique_bytes += used;
+
+	if (BP_GET_LSIZE(bp) > SPA_OLD_MAXBLOCKSIZE) {
+		ds->ds_feature_activation_needed[SPA_FEATURE_LARGE_BLOCKS] =
+		    B_TRUE;
+	}
+
+	spa_feature_t f = zio_checksum_to_feature(BP_GET_CHECKSUM(bp));
+	if (f != SPA_FEATURE_NONE)
+		ds->ds_feature_activation_needed[f] = B_TRUE;
+
+	mutex_exit(&ds->ds_lock);
+	dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta,
+	    compressed, uncompressed, tx);
+	dsl_dir_transfer_space(ds->ds_dir, used - delta,
+	    DD_USED_REFRSRV, DD_USED_HEAD, NULL);
+}
+
+/*
+ * Called when the specified segment has been remapped, and is thus no
+ * longer referenced in the head dataset.  The vdev must be indirect.
+ *
+ * If the segment is referenced by a snapshot, put it on the remap deadlist.
+ * Otherwise, add this segment to the obsolete spacemap.
+ */ +void +dsl_dataset_block_remapped(dsl_dataset_t *ds, uint64_t vdev, uint64_t offset, + uint64_t size, uint64_t birth, dmu_tx_t *tx) +{ + spa_t *spa = ds->ds_dir->dd_pool->dp_spa; + + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(birth <= tx->tx_txg); + ASSERT(!ds->ds_is_snapshot); + + if (birth > dsl_dataset_phys(ds)->ds_prev_snap_txg) { + spa_vdev_indirect_mark_obsolete(spa, vdev, offset, size, tx); + } else { + blkptr_t fakebp; + dva_t *dva = &fakebp.blk_dva[0]; + + ASSERT(ds != NULL); + + mutex_enter(&ds->ds_remap_deadlist_lock); + if (!dsl_dataset_remap_deadlist_exists(ds)) { + dsl_dataset_create_remap_deadlist(ds, tx); + } + mutex_exit(&ds->ds_remap_deadlist_lock); + + BP_ZERO(&fakebp); + fakebp.blk_birth = birth; + DVA_SET_VDEV(dva, vdev); + DVA_SET_OFFSET(dva, offset); + DVA_SET_ASIZE(dva, size); + + dsl_deadlist_insert(&ds->ds_remap_deadlist, &fakebp, tx); + } +} + +int +dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, + boolean_t async) +{ + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + + int used = bp_get_dsize_sync(spa, bp); + int compressed = BP_GET_PSIZE(bp); + int uncompressed = BP_GET_UCSIZE(bp); + + if (BP_IS_HOLE(bp)) + return (0); + + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(bp->blk_birth <= tx->tx_txg); + + if (ds == NULL) { + dsl_free(tx->tx_pool, tx->tx_txg, bp); + dsl_pool_mos_diduse_space(tx->tx_pool, + -used, -compressed, -uncompressed); + return (used); + } + ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool); + + ASSERT(!ds->ds_is_snapshot); + dmu_buf_will_dirty(ds->ds_dbuf, tx); + + if (bp->blk_birth > dsl_dataset_phys(ds)->ds_prev_snap_txg) { + int64_t delta; + + dprintf_bp(bp, "freeing ds=%llu", ds->ds_object); + dsl_free(tx->tx_pool, tx->tx_txg, bp); + + mutex_enter(&ds->ds_lock); + ASSERT(dsl_dataset_phys(ds)->ds_unique_bytes >= used || + !DS_UNIQUE_IS_ACCURATE(ds)); + delta = parent_delta(ds, -used); + dsl_dataset_phys(ds)->ds_unique_bytes -= used; + mutex_exit(&ds->ds_lock); + dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, + delta, -compressed, -uncompressed, tx); + dsl_dir_transfer_space(ds->ds_dir, -used - delta, + DD_USED_REFRSRV, DD_USED_HEAD, NULL); + } else { + dprintf_bp(bp, "putting on dead list: %s", ""); + if (async) { + /* + * We are here as part of zio's write done callback, + * which means we're a zio interrupt thread. We can't + * call dsl_deadlist_insert() now because it may block + * waiting for I/O. Instead, put bp on the deferred + * queue and let dsl_pool_sync() finish the job. 
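+			 * (The deferred queue is ds_pending_deadlist,
+			 * appended to just below.)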
+ */
+			bplist_append(&ds->ds_pending_deadlist, bp);
+		} else {
+			dsl_deadlist_insert(&ds->ds_deadlist, bp, tx);
+		}
+		ASSERT3U(ds->ds_prev->ds_object, ==,
+		    dsl_dataset_phys(ds)->ds_prev_snap_obj);
+		ASSERT(dsl_dataset_phys(ds->ds_prev)->ds_num_children > 0);
+		/* if (bp->blk_birth > prev prev snap txg) prev unique += bs */
+		if (dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==
+		    ds->ds_object && bp->blk_birth >
+		    dsl_dataset_phys(ds->ds_prev)->ds_prev_snap_txg) {
+			dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
+			mutex_enter(&ds->ds_prev->ds_lock);
+			dsl_dataset_phys(ds->ds_prev)->ds_unique_bytes += used;
+			mutex_exit(&ds->ds_prev->ds_lock);
+		}
+		if (bp->blk_birth > ds->ds_dir->dd_origin_txg) {
+			dsl_dir_transfer_space(ds->ds_dir, used,
+			    DD_USED_HEAD, DD_USED_SNAP, tx);
+		}
+	}
+	mutex_enter(&ds->ds_lock);
+	ASSERT3U(dsl_dataset_phys(ds)->ds_referenced_bytes, >=, used);
+	dsl_dataset_phys(ds)->ds_referenced_bytes -= used;
+	ASSERT3U(dsl_dataset_phys(ds)->ds_compressed_bytes, >=, compressed);
+	dsl_dataset_phys(ds)->ds_compressed_bytes -= compressed;
+	ASSERT3U(dsl_dataset_phys(ds)->ds_uncompressed_bytes, >=, uncompressed);
+	dsl_dataset_phys(ds)->ds_uncompressed_bytes -= uncompressed;
+	mutex_exit(&ds->ds_lock);
+
+	return (used);
+}
+
+/*
+ * We have to release the fsid synchronously or we risk that a subsequent
+ * mount of the same dataset will fail to unique_insert the fsid.  This
+ * failure would manifest itself as the fsid of this dataset changing
+ * between mounts which makes NFS clients quite unhappy.
+ */
+static void
+dsl_dataset_evict_sync(void *dbu)
+{
+	dsl_dataset_t *ds = dbu;
+
+	ASSERT(ds->ds_owner == NULL);
+
+	unique_remove(ds->ds_fsid_guid);
+}
+
+static void
+dsl_dataset_evict_async(void *dbu)
+{
+	dsl_dataset_t *ds = dbu;
+
+	ASSERT(ds->ds_owner == NULL);
+
+	ds->ds_dbuf = NULL;
+
+	if (ds->ds_objset != NULL)
+		dmu_objset_evict(ds->ds_objset);
+
+	if (ds->ds_prev) {
+		dsl_dataset_rele(ds->ds_prev, ds);
+		ds->ds_prev = NULL;
+	}
+
+	bplist_destroy(&ds->ds_pending_deadlist);
+	if (dsl_deadlist_is_open(&ds->ds_deadlist))
+		dsl_deadlist_close(&ds->ds_deadlist);
+	if (dsl_deadlist_is_open(&ds->ds_remap_deadlist))
+		dsl_deadlist_close(&ds->ds_remap_deadlist);
+	if (ds->ds_dir)
+		dsl_dir_async_rele(ds->ds_dir, ds);
+
+	ASSERT(!list_link_active(&ds->ds_synced_link));
+
+	list_destroy(&ds->ds_prop_cbs);
+	if (mutex_owned(&ds->ds_lock))
+		mutex_exit(&ds->ds_lock);
+	mutex_destroy(&ds->ds_lock);
+	if (mutex_owned(&ds->ds_opening_lock))
+		mutex_exit(&ds->ds_opening_lock);
+	mutex_destroy(&ds->ds_opening_lock);
+	mutex_destroy(&ds->ds_sendstream_lock);
+	mutex_destroy(&ds->ds_remap_deadlist_lock);
+	refcount_destroy(&ds->ds_longholds);
+	rrw_destroy(&ds->ds_bp_rwlock);
+
+	kmem_free(ds, sizeof (dsl_dataset_t));
+}
+
+int
+dsl_dataset_get_snapname(dsl_dataset_t *ds)
+{
+	dsl_dataset_phys_t *headphys;
+	int err;
+	dmu_buf_t *headdbuf;
+	dsl_pool_t *dp = ds->ds_dir->dd_pool;
+	objset_t *mos = dp->dp_meta_objset;
+
+	if (ds->ds_snapname[0])
+		return (0);
+	if (dsl_dataset_phys(ds)->ds_next_snap_obj == 0)
+		return (0);
+
+	err = dmu_bonus_hold(mos, dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj,
+	    FTAG, &headdbuf);
+	if (err != 0)
+		return (err);
+	headphys = headdbuf->db_data;
+	err = zap_value_search(dp->dp_meta_objset,
+	    headphys->ds_snapnames_zapobj, ds->ds_object, 0, ds->ds_snapname);
+	dmu_buf_rele(headdbuf, FTAG);
+	return (err);
+}
+
+int
+dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value)
+{
+	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+	
uint64_t snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj; + matchtype_t mt = 0; + int err; + + if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET) + mt = MT_NORMALIZE; + + err = zap_lookup_norm(mos, snapobj, name, 8, 1, + value, mt, NULL, 0, NULL); + if (err == ENOTSUP && (mt & MT_NORMALIZE)) + err = zap_lookup(mos, snapobj, name, 8, 1, value); + return (err); +} + +int +dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx, + boolean_t adj_cnt) +{ + objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; + uint64_t snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj; + matchtype_t mt = 0; + int err; + + dsl_dir_snap_cmtime_update(ds->ds_dir); + + if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET) + mt = MT_NORMALIZE; + + err = zap_remove_norm(mos, snapobj, name, mt, tx); + if (err == ENOTSUP && (mt & MT_NORMALIZE)) + err = zap_remove(mos, snapobj, name, tx); + + if (err == 0 && adj_cnt) + dsl_fs_ss_count_adjust(ds->ds_dir, -1, + DD_FIELD_SNAPSHOT_COUNT, tx); + + return (err); +} + +boolean_t +dsl_dataset_try_add_ref(dsl_pool_t *dp, dsl_dataset_t *ds, void *tag) +{ + dmu_buf_t *dbuf = ds->ds_dbuf; + boolean_t result = B_FALSE; + + if (dbuf != NULL && dmu_buf_try_add_ref(dbuf, dp->dp_meta_objset, + ds->ds_object, DMU_BONUS_BLKID, tag)) { + + if (ds == dmu_buf_get_user(dbuf)) + result = B_TRUE; + else + dmu_buf_rele(dbuf, tag); + } + + return (result); +} + +int +dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, + dsl_dataset_t **dsp) +{ + objset_t *mos = dp->dp_meta_objset; + dmu_buf_t *dbuf; + dsl_dataset_t *ds; + int err; + dmu_object_info_t doi; + + ASSERT(dsl_pool_config_held(dp)); + + err = dmu_bonus_hold(mos, dsobj, tag, &dbuf); + if (err != 0) + return (err); + + /* Make sure dsobj has the correct object type. 
*/ + dmu_object_info_from_db(dbuf, &doi); + if (doi.doi_bonus_type != DMU_OT_DSL_DATASET) { + dmu_buf_rele(dbuf, tag); + return (SET_ERROR(EINVAL)); + } + + ds = dmu_buf_get_user(dbuf); + if (ds == NULL) { + dsl_dataset_t *winner = NULL; + + ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP); + ds->ds_dbuf = dbuf; + ds->ds_object = dsobj; + ds->ds_is_snapshot = dsl_dataset_phys(ds)->ds_num_children != 0; + + err = dsl_dir_hold_obj(dp, dsl_dataset_phys(ds)->ds_dir_obj, + NULL, ds, &ds->ds_dir); + if (err != 0) { + kmem_free(ds, sizeof (dsl_dataset_t)); + dmu_buf_rele(dbuf, tag); + return (err); + } + + mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&ds->ds_sendstream_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&ds->ds_remap_deadlist_lock, + NULL, MUTEX_DEFAULT, NULL); + rrw_init(&ds->ds_bp_rwlock, B_FALSE); + refcount_create(&ds->ds_longholds); + + bplist_create(&ds->ds_pending_deadlist); + + list_create(&ds->ds_sendstreams, sizeof (dmu_sendarg_t), + offsetof(dmu_sendarg_t, dsa_link)); + + list_create(&ds->ds_prop_cbs, sizeof (dsl_prop_cb_record_t), + offsetof(dsl_prop_cb_record_t, cbr_ds_node)); + + if (doi.doi_type == DMU_OTN_ZAP_METADATA) { + for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { + if (!(spa_feature_table[f].fi_flags & + ZFEATURE_FLAG_PER_DATASET)) + continue; + err = zap_contains(mos, dsobj, + spa_feature_table[f].fi_guid); + if (err == 0) { + ds->ds_feature_inuse[f] = B_TRUE; + } else { + ASSERT3U(err, ==, ENOENT); + err = 0; + } + } + } + + if (!ds->ds_is_snapshot) { + ds->ds_snapname[0] = '\0'; + if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) { + err = dsl_dataset_hold_obj(dp, + dsl_dataset_phys(ds)->ds_prev_snap_obj, + ds, &ds->ds_prev); + } + if (doi.doi_type == DMU_OTN_ZAP_METADATA) { + int zaperr = zap_lookup(mos, ds->ds_object, + DS_FIELD_BOOKMARK_NAMES, + sizeof (ds->ds_bookmarks), 1, + &ds->ds_bookmarks); + if (zaperr != ENOENT) + VERIFY0(zaperr); + } + } else { + if (zfs_flags & ZFS_DEBUG_SNAPNAMES) + err = dsl_dataset_get_snapname(ds); + if (err == 0 && + dsl_dataset_phys(ds)->ds_userrefs_obj != 0) { + err = zap_count( + ds->ds_dir->dd_pool->dp_meta_objset, + dsl_dataset_phys(ds)->ds_userrefs_obj, + &ds->ds_userrefs); + } + } + + if (err == 0 && !ds->ds_is_snapshot) { + err = dsl_prop_get_int_ds(ds, + zfs_prop_to_name(ZFS_PROP_REFRESERVATION), + &ds->ds_reserved); + if (err == 0) { + err = dsl_prop_get_int_ds(ds, + zfs_prop_to_name(ZFS_PROP_REFQUOTA), + &ds->ds_quota); + } + } else { + ds->ds_reserved = ds->ds_quota = 0; + } + + dsl_deadlist_open(&ds->ds_deadlist, + mos, dsl_dataset_phys(ds)->ds_deadlist_obj); + uint64_t remap_deadlist_obj = + dsl_dataset_get_remap_deadlist_object(ds); + if (remap_deadlist_obj != 0) { + dsl_deadlist_open(&ds->ds_remap_deadlist, mos, + remap_deadlist_obj); + } + + dmu_buf_init_user(&ds->ds_dbu, dsl_dataset_evict_sync, + dsl_dataset_evict_async, &ds->ds_dbuf); + if (err == 0) + winner = dmu_buf_set_user_ie(dbuf, &ds->ds_dbu); + + if (err != 0 || winner != NULL) { + bplist_destroy(&ds->ds_pending_deadlist); + dsl_deadlist_close(&ds->ds_deadlist); + if (dsl_deadlist_is_open(&ds->ds_remap_deadlist)) + dsl_deadlist_close(&ds->ds_remap_deadlist); + if (ds->ds_prev) + dsl_dataset_rele(ds->ds_prev, ds); + dsl_dir_rele(ds->ds_dir, ds); + mutex_destroy(&ds->ds_lock); + mutex_destroy(&ds->ds_opening_lock); + mutex_destroy(&ds->ds_sendstream_lock); + refcount_destroy(&ds->ds_longholds); + kmem_free(ds, sizeof (dsl_dataset_t)); + if (err != 0) { + 
dmu_buf_rele(dbuf, tag); + return (err); + } + ds = winner; + } else { + ds->ds_fsid_guid = + unique_insert(dsl_dataset_phys(ds)->ds_fsid_guid); + if (ds->ds_fsid_guid != + dsl_dataset_phys(ds)->ds_fsid_guid) { + zfs_dbgmsg("ds_fsid_guid changed from " + "%llx to %llx for pool %s dataset id %llu", + (long long) + dsl_dataset_phys(ds)->ds_fsid_guid, + (long long)ds->ds_fsid_guid, + spa_name(dp->dp_spa), + dsobj); + } + } + } + ASSERT3P(ds->ds_dbuf, ==, dbuf); + ASSERT3P(dsl_dataset_phys(ds), ==, dbuf->db_data); + ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0 || + spa_version(dp->dp_spa) < SPA_VERSION_ORIGIN || + dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap); + *dsp = ds; + return (0); +} + +int +dsl_dataset_hold(dsl_pool_t *dp, const char *name, + void *tag, dsl_dataset_t **dsp) +{ + dsl_dir_t *dd; + const char *snapname; + uint64_t obj; + int err = 0; + dsl_dataset_t *ds; + + err = dsl_dir_hold(dp, name, FTAG, &dd, &snapname); + if (err != 0) + return (err); + + ASSERT(dsl_pool_config_held(dp)); + obj = dsl_dir_phys(dd)->dd_head_dataset_obj; + if (obj != 0) + err = dsl_dataset_hold_obj(dp, obj, tag, &ds); + else + err = SET_ERROR(ENOENT); + + /* we may be looking for a snapshot */ + if (err == 0 && snapname != NULL) { + dsl_dataset_t *snap_ds; + + if (*snapname++ != '@') { + dsl_dataset_rele(ds, tag); + dsl_dir_rele(dd, FTAG); + return (SET_ERROR(ENOENT)); + } + + dprintf("looking for snapshot '%s'\n", snapname); + err = dsl_dataset_snap_lookup(ds, snapname, &obj); + if (err == 0) + err = dsl_dataset_hold_obj(dp, obj, tag, &snap_ds); + dsl_dataset_rele(ds, tag); + + if (err == 0) { + mutex_enter(&snap_ds->ds_lock); + if (snap_ds->ds_snapname[0] == 0) + (void) strlcpy(snap_ds->ds_snapname, snapname, + sizeof (snap_ds->ds_snapname)); + mutex_exit(&snap_ds->ds_lock); + ds = snap_ds; + } + } + if (err == 0) + *dsp = ds; + dsl_dir_rele(dd, FTAG); + return (err); +} + +int +dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, + void *tag, dsl_dataset_t **dsp) +{ + int err = dsl_dataset_hold_obj(dp, dsobj, tag, dsp); + if (err != 0) + return (err); + if (!dsl_dataset_tryown(*dsp, tag)) { + dsl_dataset_rele(*dsp, tag); + *dsp = NULL; + return (SET_ERROR(EBUSY)); + } + return (0); +} + +int +dsl_dataset_own(dsl_pool_t *dp, const char *name, + void *tag, dsl_dataset_t **dsp) +{ + int err = dsl_dataset_hold(dp, name, tag, dsp); + if (err != 0) + return (err); + if (!dsl_dataset_tryown(*dsp, tag)) { + dsl_dataset_rele(*dsp, tag); + return (SET_ERROR(EBUSY)); + } + return (0); +} + +/* + * See the comment above dsl_pool_hold() for details. In summary, a long + * hold is used to prevent destruction of a dataset while the pool hold + * is dropped, allowing other concurrent operations (e.g. spa_sync()). + * + * The dataset and pool must be held when this function is called. After it + * is called, the pool hold may be released while the dataset is still held + * and accessed. + */ +void +dsl_dataset_long_hold(dsl_dataset_t *ds, void *tag) +{ + ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); + (void) refcount_add(&ds->ds_longholds, tag); +} + +void +dsl_dataset_long_rele(dsl_dataset_t *ds, void *tag) +{ + (void) refcount_remove(&ds->ds_longholds, tag); +} + +/* Return B_TRUE if there are any long holds on this dataset. 
*/ +boolean_t +dsl_dataset_long_held(dsl_dataset_t *ds) +{ + return (!refcount_is_zero(&ds->ds_longholds)); +} + +void +dsl_dataset_name(dsl_dataset_t *ds, char *name) +{ + if (ds == NULL) { + (void) strcpy(name, "mos"); + } else { + dsl_dir_name(ds->ds_dir, name); + VERIFY0(dsl_dataset_get_snapname(ds)); + if (ds->ds_snapname[0]) { + VERIFY3U(strlcat(name, "@", ZFS_MAX_DATASET_NAME_LEN), + <, ZFS_MAX_DATASET_NAME_LEN); + /* + * We use a "recursive" mutex so that we + * can call dprintf_ds() with ds_lock held. + */ + if (!MUTEX_HELD(&ds->ds_lock)) { + mutex_enter(&ds->ds_lock); + VERIFY3U(strlcat(name, ds->ds_snapname, + ZFS_MAX_DATASET_NAME_LEN), <, + ZFS_MAX_DATASET_NAME_LEN); + mutex_exit(&ds->ds_lock); + } else { + VERIFY3U(strlcat(name, ds->ds_snapname, + ZFS_MAX_DATASET_NAME_LEN), <, + ZFS_MAX_DATASET_NAME_LEN); + } + } + } +} + +int +dsl_dataset_namelen(dsl_dataset_t *ds) +{ + VERIFY0(dsl_dataset_get_snapname(ds)); + mutex_enter(&ds->ds_lock); + int len = dsl_dir_namelen(ds->ds_dir) + 1 + strlen(ds->ds_snapname); + mutex_exit(&ds->ds_lock); + return (len); +} + +void +dsl_dataset_rele(dsl_dataset_t *ds, void *tag) +{ + dmu_buf_rele(ds->ds_dbuf, tag); +} + +void +dsl_dataset_disown(dsl_dataset_t *ds, void *tag) +{ + ASSERT3P(ds->ds_owner, ==, tag); + ASSERT(ds->ds_dbuf != NULL); + + mutex_enter(&ds->ds_lock); + ds->ds_owner = NULL; + mutex_exit(&ds->ds_lock); + dsl_dataset_long_rele(ds, tag); + dsl_dataset_rele(ds, tag); +} + +boolean_t +dsl_dataset_tryown(dsl_dataset_t *ds, void *tag) +{ + boolean_t gotit = FALSE; + + ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); + mutex_enter(&ds->ds_lock); + if (ds->ds_owner == NULL && !DS_IS_INCONSISTENT(ds)) { + ds->ds_owner = tag; + dsl_dataset_long_hold(ds, tag); + gotit = TRUE; + } + mutex_exit(&ds->ds_lock); + return (gotit); +} + +boolean_t +dsl_dataset_has_owner(dsl_dataset_t *ds) +{ + boolean_t rv; + mutex_enter(&ds->ds_lock); + rv = (ds->ds_owner != NULL); + mutex_exit(&ds->ds_lock); + return (rv); +} + +static void +dsl_dataset_activate_feature(uint64_t dsobj, spa_feature_t f, dmu_tx_t *tx) +{ + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + objset_t *mos = dmu_tx_pool(tx)->dp_meta_objset; + uint64_t zero = 0; + + VERIFY(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET); + + spa_feature_incr(spa, f, tx); + dmu_object_zapify(mos, dsobj, DMU_OT_DSL_DATASET, tx); + + VERIFY0(zap_add(mos, dsobj, spa_feature_table[f].fi_guid, + sizeof (zero), 1, &zero, tx)); +} + +void +dsl_dataset_deactivate_feature(uint64_t dsobj, spa_feature_t f, dmu_tx_t *tx) +{ + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + objset_t *mos = dmu_tx_pool(tx)->dp_meta_objset; + + VERIFY(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET); + + VERIFY0(zap_remove(mos, dsobj, spa_feature_table[f].fi_guid, tx)); + spa_feature_decr(spa, f, tx); +} + +uint64_t +dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, + uint64_t flags, dmu_tx_t *tx) +{ + dsl_pool_t *dp = dd->dd_pool; + dmu_buf_t *dbuf; + dsl_dataset_phys_t *dsphys; + uint64_t dsobj; + objset_t *mos = dp->dp_meta_objset; + + if (origin == NULL) + origin = dp->dp_origin_snap; + + ASSERT(origin == NULL || origin->ds_dir->dd_pool == dp); + ASSERT(origin == NULL || dsl_dataset_phys(origin)->ds_num_children > 0); + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(dsl_dir_phys(dd)->dd_head_dataset_obj == 0); + + dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0, + DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx); + VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); + dmu_buf_will_dirty(dbuf, tx); + dsphys = 
dbuf->db_data; + bzero(dsphys, sizeof (dsl_dataset_phys_t)); + dsphys->ds_dir_obj = dd->dd_object; + dsphys->ds_flags = flags; + dsphys->ds_fsid_guid = unique_create(); + do { + (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid, + sizeof (dsphys->ds_guid)); + } while (dsphys->ds_guid == 0); + dsphys->ds_snapnames_zapobj = + zap_create_norm(mos, U8_TEXTPREP_TOUPPER, DMU_OT_DSL_DS_SNAP_MAP, + DMU_OT_NONE, 0, tx); + dsphys->ds_creation_time = gethrestime_sec(); + dsphys->ds_creation_txg = tx->tx_txg == TXG_INITIAL ? 1 : tx->tx_txg; + + if (origin == NULL) { + dsphys->ds_deadlist_obj = dsl_deadlist_alloc(mos, tx); + } else { + dsl_dataset_t *ohds; /* head of the origin snapshot */ + + dsphys->ds_prev_snap_obj = origin->ds_object; + dsphys->ds_prev_snap_txg = + dsl_dataset_phys(origin)->ds_creation_txg; + dsphys->ds_referenced_bytes = + dsl_dataset_phys(origin)->ds_referenced_bytes; + dsphys->ds_compressed_bytes = + dsl_dataset_phys(origin)->ds_compressed_bytes; + dsphys->ds_uncompressed_bytes = + dsl_dataset_phys(origin)->ds_uncompressed_bytes; + rrw_enter(&origin->ds_bp_rwlock, RW_READER, FTAG); + dsphys->ds_bp = dsl_dataset_phys(origin)->ds_bp; + rrw_exit(&origin->ds_bp_rwlock, FTAG); + + /* + * Inherit flags that describe the dataset's contents + * (INCONSISTENT) or properties (Case Insensitive). + */ + dsphys->ds_flags |= dsl_dataset_phys(origin)->ds_flags & + (DS_FLAG_INCONSISTENT | DS_FLAG_CI_DATASET); + + for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { + if (origin->ds_feature_inuse[f]) + dsl_dataset_activate_feature(dsobj, f, tx); + } + + dmu_buf_will_dirty(origin->ds_dbuf, tx); + dsl_dataset_phys(origin)->ds_num_children++; + + VERIFY0(dsl_dataset_hold_obj(dp, + dsl_dir_phys(origin->ds_dir)->dd_head_dataset_obj, + FTAG, &ohds)); + dsphys->ds_deadlist_obj = dsl_deadlist_clone(&ohds->ds_deadlist, + dsphys->ds_prev_snap_txg, dsphys->ds_prev_snap_obj, tx); + dsl_dataset_rele(ohds, FTAG); + + if (spa_version(dp->dp_spa) >= SPA_VERSION_NEXT_CLONES) { + if (dsl_dataset_phys(origin)->ds_next_clones_obj == 0) { + dsl_dataset_phys(origin)->ds_next_clones_obj = + zap_create(mos, + DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx); + } + VERIFY0(zap_add_int(mos, + dsl_dataset_phys(origin)->ds_next_clones_obj, + dsobj, tx)); + } + + dmu_buf_will_dirty(dd->dd_dbuf, tx); + dsl_dir_phys(dd)->dd_origin_obj = origin->ds_object; + if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { + if (dsl_dir_phys(origin->ds_dir)->dd_clones == 0) { + dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx); + dsl_dir_phys(origin->ds_dir)->dd_clones = + zap_create(mos, + DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx); + } + VERIFY0(zap_add_int(mos, + dsl_dir_phys(origin->ds_dir)->dd_clones, + dsobj, tx)); + } + } + + if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE) + dsphys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; + + dmu_buf_rele(dbuf, FTAG); + + dmu_buf_will_dirty(dd->dd_dbuf, tx); + dsl_dir_phys(dd)->dd_head_dataset_obj = dsobj; + + return (dsobj); +} + +static void +dsl_dataset_zero_zil(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + objset_t *os; + + VERIFY0(dmu_objset_from_ds(ds, &os)); + if (bcmp(&os->os_zil_header, &zero_zil, sizeof (zero_zil)) != 0) { + dsl_pool_t *dp = ds->ds_dir->dd_pool; + zio_t *zio; + + bzero(&os->os_zil_header, sizeof (os->os_zil_header)); + + zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); + dsl_dataset_sync(ds, zio, tx); + VERIFY0(zio_wait(zio)); + + /* dsl_dataset_sync_done will drop this reference. 
 */
+	dmu_buf_add_ref(ds->ds_dbuf, ds);
+	dsl_dataset_sync_done(ds, tx);
+	}
+}
+
+uint64_t
+dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname,
+    dsl_dataset_t *origin, uint64_t flags, cred_t *cr, dmu_tx_t *tx)
+{
+	dsl_pool_t *dp = pdd->dd_pool;
+	uint64_t dsobj, ddobj;
+	dsl_dir_t *dd;
+
+	ASSERT(dmu_tx_is_syncing(tx));
+	ASSERT(lastname[0] != '@');
+
+	ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx);
+	VERIFY0(dsl_dir_hold_obj(dp, ddobj, lastname, FTAG, &dd));
+
+	dsobj = dsl_dataset_create_sync_dd(dd, origin,
+	    flags & ~DS_CREATE_FLAG_NODIRTY, tx);
+
+	dsl_deleg_set_create_perms(dd, tx, cr);
+
+	/*
+	 * Since we're creating a new node we know it's a leaf, so we can
+	 * initialize the counts if the limit feature is active.
+	 */
+	if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) {
+		uint64_t cnt = 0;
+		objset_t *os = dd->dd_pool->dp_meta_objset;
+
+		dsl_dir_zapify(dd, tx);
+		VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT,
+		    sizeof (cnt), 1, &cnt, tx));
+		VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT,
+		    sizeof (cnt), 1, &cnt, tx));
+	}
+
+	dsl_dir_rele(dd, FTAG);
+
+	/*
+	 * If we are creating a clone, make sure we zero out any stale
+	 * data from the origin snapshot's zil header.
+	 */
+	if (origin != NULL && !(flags & DS_CREATE_FLAG_NODIRTY)) {
+		dsl_dataset_t *ds;
+
+		VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
+		dsl_dataset_zero_zil(ds, tx);
+		dsl_dataset_rele(ds, FTAG);
+	}
+
+	return (dsobj);
+}
+
+#ifdef __FreeBSD__
+/* FreeBSD ioctl compat begin */
+struct destroyarg {
+	nvlist_t *nvl;
+	const char *snapname;
+};
+
+static int
+dsl_check_snap_cb(const char *name, void *arg)
+{
+	struct destroyarg *da = arg;
+	dsl_dataset_t *ds;
+	char *dsname;
+
+	dsname = kmem_asprintf("%s@%s", name, da->snapname);
+	fnvlist_add_boolean(da->nvl, dsname);
+	kmem_free(dsname, strlen(dsname) + 1);
+
+	return (0);
+}
+
+int
+dmu_get_recursive_snaps_nvl(char *fsname, const char *snapname,
+    nvlist_t *snaps)
+{
+	struct destroyarg *da;
+	int err;
+
+	da = kmem_zalloc(sizeof (struct destroyarg), KM_SLEEP);
+	da->nvl = snaps;
+	da->snapname = snapname;
+	err = dmu_objset_find(fsname, dsl_check_snap_cb, da,
+	    DS_FIND_CHILDREN);
+	kmem_free(da, sizeof (struct destroyarg));
+
+	return (err);
+}
+/* FreeBSD ioctl compat end */
+#endif /* __FreeBSD__ */
+
+/*
+ * The unique space in the head dataset can be calculated by subtracting
+ * the space used in the most recent snapshot that is still being used
+ * in this file system from the space currently in use.  To figure out
+ * the space in the most recent snapshot still in use, we need to take
+ * the total space used in the snapshot and subtract out the space that
+ * has been freed up since the snapshot was taken.
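+ *
+ * For example, if the head currently references 100G, the most recent
+ * snapshot references 80G, and 30G of that snapshot's blocks have since
+ * been freed from the head (and so sit on the head's deadlist), the
+ * head's unique space is 100G - (80G - 30G) = 50G.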
+ */ +void +dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds) +{ + uint64_t mrs_used; + uint64_t dlused, dlcomp, dluncomp; + + ASSERT(!ds->ds_is_snapshot); + + if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) + mrs_used = dsl_dataset_phys(ds->ds_prev)->ds_referenced_bytes; + else + mrs_used = 0; + + dsl_deadlist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp); + + ASSERT3U(dlused, <=, mrs_used); + dsl_dataset_phys(ds)->ds_unique_bytes = + dsl_dataset_phys(ds)->ds_referenced_bytes - (mrs_used - dlused); + + if (spa_version(ds->ds_dir->dd_pool->dp_spa) >= + SPA_VERSION_UNIQUE_ACCURATE) + dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; +} + +void +dsl_dataset_remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj, + dmu_tx_t *tx) +{ + objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; + uint64_t count; + int err; + + ASSERT(dsl_dataset_phys(ds)->ds_num_children >= 2); + err = zap_remove_int(mos, dsl_dataset_phys(ds)->ds_next_clones_obj, + obj, tx); + /* + * The err should not be ENOENT, but a bug in a previous version + * of the code could cause upgrade_clones_cb() to not set + * ds_next_snap_obj when it should, leading to a missing entry. + * If we knew that the pool was created after + * SPA_VERSION_NEXT_CLONES, we could assert that it isn't + * ENOENT. However, at least we can check that we don't have + * too many entries in the next_clones_obj even after failing to + * remove this one. + */ + if (err != ENOENT) + VERIFY0(err); + ASSERT0(zap_count(mos, dsl_dataset_phys(ds)->ds_next_clones_obj, + &count)); + ASSERT3U(count, <=, dsl_dataset_phys(ds)->ds_num_children - 2); +} + + +blkptr_t * +dsl_dataset_get_blkptr(dsl_dataset_t *ds) +{ + return (&dsl_dataset_phys(ds)->ds_bp); +} + +spa_t * +dsl_dataset_get_spa(dsl_dataset_t *ds) +{ + return (ds->ds_dir->dd_pool->dp_spa); +} + +void +dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + dsl_pool_t *dp; + + if (ds == NULL) /* this is the meta-objset */ + return; + + ASSERT(ds->ds_objset != NULL); + + if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) + panic("dirtying snapshot!"); + + /* Must not dirty a dataset in the same txg where it got snapshotted. */ + ASSERT3U(tx->tx_txg, >, dsl_dataset_phys(ds)->ds_prev_snap_txg); + + dp = ds->ds_dir->dd_pool; + if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg)) { + /* up the hold count until we can be written out */ + dmu_buf_add_ref(ds->ds_dbuf, ds); + } +} + +boolean_t +dsl_dataset_is_dirty(dsl_dataset_t *ds) +{ + for (int t = 0; t < TXG_SIZE; t++) { + if (txg_list_member(&ds->ds_dir->dd_pool->dp_dirty_datasets, + ds, t)) + return (B_TRUE); + } + return (B_FALSE); +} + +static int +dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + uint64_t asize; + + if (!dmu_tx_is_syncing(tx)) + return (0); + + /* + * If there's an fs-only reservation, any blocks that might become + * owned by the snapshot dataset must be accommodated by space + * outside of the reservation. + */ + ASSERT(ds->ds_reserved == 0 || DS_UNIQUE_IS_ACCURATE(ds)); + asize = MIN(dsl_dataset_phys(ds)->ds_unique_bytes, ds->ds_reserved); + if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) + return (SET_ERROR(ENOSPC)); + + /* + * Propagate any reserved space for this snapshot to other + * snapshot checks in this sync group. 
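+	 * Otherwise several snapshot checks running in the same sync
+	 * group could each pass individually while collectively
+	 * overcommitting the space that is actually available.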
+ */
+	if (asize > 0)
+		dsl_dir_willuse_space(ds->ds_dir, asize, tx);
+
+	return (0);
+}
+
+int
+dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname,
+    dmu_tx_t *tx, boolean_t recv, uint64_t cnt, cred_t *cr)
+{
+	int error;
+	uint64_t value;
+
+	ds->ds_trysnap_txg = tx->tx_txg;
+
+	if (!dmu_tx_is_syncing(tx))
+		return (0);
+
+	/*
+	 * We don't allow multiple snapshots of the same txg.  If there
+	 * is already one, try again.
+	 */
+	if (dsl_dataset_phys(ds)->ds_prev_snap_txg >= tx->tx_txg)
+		return (SET_ERROR(EAGAIN));
+
+	/*
+	 * Check for conflicting snapshot name.
+	 */
+	error = dsl_dataset_snap_lookup(ds, snapname, &value);
+	if (error == 0)
+		return (SET_ERROR(EEXIST));
+	if (error != ENOENT)
+		return (error);
+
+	/*
+	 * We don't allow taking snapshots of inconsistent datasets, such as
+	 * those into which we are currently receiving.  However, if we are
+	 * creating this snapshot as part of a receive, this check will be
+	 * executed atomically with respect to the completion of the receive
+	 * itself but prior to the clearing of DS_FLAG_INCONSISTENT; in this
+	 * case we ignore this, knowing it will be fixed up for us shortly in
+	 * dmu_recv_end_sync().
+	 */
+	if (!recv && DS_IS_INCONSISTENT(ds))
+		return (SET_ERROR(EBUSY));
+
+	/*
+	 * Skip the check for temporary snapshots or if we have already
+	 * checked the counts in dsl_dataset_snapshot_check.  This means we
+	 * really only check the count here when we're receiving a stream.
+	 */
+	if (cnt != 0 && cr != NULL) {
+		error = dsl_fs_ss_limit_check(ds->ds_dir, cnt,
+		    ZFS_PROP_SNAPSHOT_LIMIT, NULL, cr);
+		if (error != 0)
+			return (error);
+	}
+
+	error = dsl_dataset_snapshot_reserve_space(ds, tx);
+	if (error != 0)
+		return (error);
+
+	return (0);
+}
+
+int
+dsl_dataset_snapshot_check(void *arg, dmu_tx_t *tx)
+{
+	dsl_dataset_snapshot_arg_t *ddsa = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	nvpair_t *pair;
+	int rv = 0;
+
+	/*
+	 * Pre-compute how many total new snapshots will be created for each
+	 * level in the tree and below.  This is needed for validating the
+	 * snapshot limit when either taking a recursive snapshot or when
+	 * taking multiple snapshots.
+	 *
+	 * The problem is that the counts are not actually adjusted when
+	 * we are checking, only when we finally sync.  For a single snapshot,
+	 * this is easy: the count will increase by 1 at each node up the
+	 * tree, but it's more complicated for the recursive/multiple
+	 * snapshot case.
+	 *
+	 * The dsl_fs_ss_limit_check function does recursively check the count
+	 * at each level up the tree but since it is validating each snapshot
+	 * independently we need to be sure that we are validating the
+	 * complete count for the entire set of snapshots.  We do this by
+	 * rolling up the counts for each component of the name into an
+	 * nvlist and then checking each of those cases with the aggregated
+	 * count.
+	 *
+	 * This approach properly handles not only the recursive snapshot
+	 * case (where we get all of those on the ddsa_snaps list) but also
+	 * the sibling case (e.g. snapshot a/b and a/c so that we will also
+	 * validate the limit on 'a' using a count of 2).
+	 *
+	 * We validate the snapshot names in the third loop and only report
+	 * name errors once.
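+	 *
+	 * For example, requesting snapshots a/b@s and a/c@s in a single
+	 * call rolls up to cnt_track = { "a/b": 1, "a/c": 1, "a": 2 },
+	 * so the limit on 'a' is validated against 2 new snapshots.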
+ */ + if (dmu_tx_is_syncing(tx)) { + nvlist_t *cnt_track = NULL; + cnt_track = fnvlist_alloc(); + + /* Rollup aggregated counts into the cnt_track list */ + for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL); + pair != NULL; + pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) { + char *pdelim; + uint64_t val; + char nm[MAXPATHLEN]; + + (void) strlcpy(nm, nvpair_name(pair), sizeof (nm)); + pdelim = strchr(nm, '@'); + if (pdelim == NULL) + continue; + *pdelim = '\0'; + + do { + if (nvlist_lookup_uint64(cnt_track, nm, + &val) == 0) { + /* update existing entry */ + fnvlist_add_uint64(cnt_track, nm, + val + 1); + } else { + /* add to list */ + fnvlist_add_uint64(cnt_track, nm, 1); + } + + pdelim = strrchr(nm, '/'); + if (pdelim != NULL) + *pdelim = '\0'; + } while (pdelim != NULL); + } + + /* Check aggregated counts at each level */ + for (pair = nvlist_next_nvpair(cnt_track, NULL); + pair != NULL; pair = nvlist_next_nvpair(cnt_track, pair)) { + int error = 0; + char *name; + uint64_t cnt = 0; + dsl_dataset_t *ds; + + name = nvpair_name(pair); + cnt = fnvpair_value_uint64(pair); + ASSERT(cnt > 0); + + error = dsl_dataset_hold(dp, name, FTAG, &ds); + if (error == 0) { + error = dsl_fs_ss_limit_check(ds->ds_dir, cnt, + ZFS_PROP_SNAPSHOT_LIMIT, NULL, + ddsa->ddsa_cr); + dsl_dataset_rele(ds, FTAG); + } + + if (error != 0) { + if (ddsa->ddsa_errors != NULL) + fnvlist_add_int32(ddsa->ddsa_errors, + name, error); + rv = error; + /* only report one error for this check */ + break; + } + } + nvlist_free(cnt_track); + } + + for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL); + pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) { + int error = 0; + dsl_dataset_t *ds; + char *name, *atp; + char dsname[ZFS_MAX_DATASET_NAME_LEN]; + + name = nvpair_name(pair); + if (strlen(name) >= ZFS_MAX_DATASET_NAME_LEN) + error = SET_ERROR(ENAMETOOLONG); + if (error == 0) { + atp = strchr(name, '@'); + if (atp == NULL) + error = SET_ERROR(EINVAL); + if (error == 0) + (void) strlcpy(dsname, name, atp - name + 1); + } + if (error == 0) + error = dsl_dataset_hold(dp, dsname, FTAG, &ds); + if (error == 0) { + /* passing 0/NULL skips dsl_fs_ss_limit_check */ + error = dsl_dataset_snapshot_check_impl(ds, + atp + 1, tx, B_FALSE, 0, NULL); + dsl_dataset_rele(ds, FTAG); + } + + if (error != 0) { + if (ddsa->ddsa_errors != NULL) { + fnvlist_add_int32(ddsa->ddsa_errors, + name, error); + } + rv = error; + } + } + + return (rv); +} + +void +dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname, + dmu_tx_t *tx) +{ + dsl_pool_t *dp = ds->ds_dir->dd_pool; + dmu_buf_t *dbuf; + dsl_dataset_phys_t *dsphys; + uint64_t dsobj, crtxg; + objset_t *mos = dp->dp_meta_objset; + objset_t *os; + + ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); + + /* + * If we are on an old pool, the zil must not be active, in which + * case it will be zeroed. Usually zil_suspend() accomplishes this. + */ + ASSERT(spa_version(dmu_tx_pool(tx)->dp_spa) >= SPA_VERSION_FAST_SNAP || + dmu_objset_from_ds(ds, &os) != 0 || + bcmp(&os->os_phys->os_zil_header, &zero_zil, + sizeof (zero_zil)) == 0); + + /* Should not snapshot a dirty dataset. 
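+	 * It must already have been synced out in this txg, so that the
+	 * block pointer and space accounting copied below are stable.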
*/ + ASSERT(!txg_list_member(&ds->ds_dir->dd_pool->dp_dirty_datasets, + ds, tx->tx_txg)); + + dsl_fs_ss_count_adjust(ds->ds_dir, 1, DD_FIELD_SNAPSHOT_COUNT, tx); + + /* + * The origin's ds_creation_txg has to be < TXG_INITIAL + */ + if (strcmp(snapname, ORIGIN_DIR_NAME) == 0) + crtxg = 1; + else + crtxg = tx->tx_txg; + + dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0, + DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx); + VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); + dmu_buf_will_dirty(dbuf, tx); + dsphys = dbuf->db_data; + bzero(dsphys, sizeof (dsl_dataset_phys_t)); + dsphys->ds_dir_obj = ds->ds_dir->dd_object; + dsphys->ds_fsid_guid = unique_create(); + do { + (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid, + sizeof (dsphys->ds_guid)); + } while (dsphys->ds_guid == 0); + dsphys->ds_prev_snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; + dsphys->ds_prev_snap_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; + dsphys->ds_next_snap_obj = ds->ds_object; + dsphys->ds_num_children = 1; + dsphys->ds_creation_time = gethrestime_sec(); + dsphys->ds_creation_txg = crtxg; + dsphys->ds_deadlist_obj = dsl_dataset_phys(ds)->ds_deadlist_obj; + dsphys->ds_referenced_bytes = dsl_dataset_phys(ds)->ds_referenced_bytes; + dsphys->ds_compressed_bytes = dsl_dataset_phys(ds)->ds_compressed_bytes; + dsphys->ds_uncompressed_bytes = + dsl_dataset_phys(ds)->ds_uncompressed_bytes; + dsphys->ds_flags = dsl_dataset_phys(ds)->ds_flags; + rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); + dsphys->ds_bp = dsl_dataset_phys(ds)->ds_bp; + rrw_exit(&ds->ds_bp_rwlock, FTAG); + dmu_buf_rele(dbuf, FTAG); + + for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { + if (ds->ds_feature_inuse[f]) + dsl_dataset_activate_feature(dsobj, f, tx); + } + + ASSERT3U(ds->ds_prev != 0, ==, + dsl_dataset_phys(ds)->ds_prev_snap_obj != 0); + if (ds->ds_prev) { + uint64_t next_clones_obj = + dsl_dataset_phys(ds->ds_prev)->ds_next_clones_obj; + ASSERT(dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj == + ds->ds_object || + dsl_dataset_phys(ds->ds_prev)->ds_num_children > 1); + if (dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj == + ds->ds_object) { + dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); + ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, ==, + dsl_dataset_phys(ds->ds_prev)->ds_creation_txg); + dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj = dsobj; + } else if (next_clones_obj != 0) { + dsl_dataset_remove_from_next_clones(ds->ds_prev, + dsphys->ds_next_snap_obj, tx); + VERIFY0(zap_add_int(mos, + next_clones_obj, dsobj, tx)); + } + } + + /* + * If we have a reference-reservation on this dataset, we will + * need to increase the amount of refreservation being charged + * since our unique space is going to zero. + */ + if (ds->ds_reserved) { + int64_t delta; + ASSERT(DS_UNIQUE_IS_ACCURATE(ds)); + delta = MIN(dsl_dataset_phys(ds)->ds_unique_bytes, + ds->ds_reserved); + dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, + delta, 0, 0, tx); + } + + dmu_buf_will_dirty(ds->ds_dbuf, tx); + dsl_dataset_phys(ds)->ds_deadlist_obj = + dsl_deadlist_clone(&ds->ds_deadlist, UINT64_MAX, + dsl_dataset_phys(ds)->ds_prev_snap_obj, tx); + dsl_deadlist_close(&ds->ds_deadlist); + dsl_deadlist_open(&ds->ds_deadlist, mos, + dsl_dataset_phys(ds)->ds_deadlist_obj); + dsl_deadlist_add_key(&ds->ds_deadlist, + dsl_dataset_phys(ds)->ds_prev_snap_txg, tx); + + if (dsl_dataset_remap_deadlist_exists(ds)) { + uint64_t remap_deadlist_obj = + dsl_dataset_get_remap_deadlist_object(ds); + /* + * Move the remap_deadlist to the snapshot. 
The head + * will create a new remap deadlist on demand, from + * dsl_dataset_block_remapped(). + */ + dsl_dataset_unset_remap_deadlist_object(ds, tx); + dsl_deadlist_close(&ds->ds_remap_deadlist); + + dmu_object_zapify(mos, dsobj, DMU_OT_DSL_DATASET, tx); + VERIFY0(zap_add(mos, dsobj, DS_FIELD_REMAP_DEADLIST, + sizeof (remap_deadlist_obj), 1, &remap_deadlist_obj, tx)); + } + + ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, <, tx->tx_txg); + dsl_dataset_phys(ds)->ds_prev_snap_obj = dsobj; + dsl_dataset_phys(ds)->ds_prev_snap_txg = crtxg; + dsl_dataset_phys(ds)->ds_unique_bytes = 0; + + if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE) + dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; + + VERIFY0(zap_add(mos, dsl_dataset_phys(ds)->ds_snapnames_zapobj, + snapname, 8, 1, &dsobj, tx)); + + if (ds->ds_prev) + dsl_dataset_rele(ds->ds_prev, ds); + VERIFY0(dsl_dataset_hold_obj(dp, + dsl_dataset_phys(ds)->ds_prev_snap_obj, ds, &ds->ds_prev)); + + dsl_scan_ds_snapshotted(ds, tx); + + dsl_dir_snap_cmtime_update(ds->ds_dir); + + spa_history_log_internal_ds(ds->ds_prev, "snapshot", tx, ""); +} + +void +dsl_dataset_snapshot_sync(void *arg, dmu_tx_t *tx) +{ + dsl_dataset_snapshot_arg_t *ddsa = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + nvpair_t *pair; + + for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL); + pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) { + dsl_dataset_t *ds; + char *name, *atp; + char dsname[ZFS_MAX_DATASET_NAME_LEN]; + + name = nvpair_name(pair); + atp = strchr(name, '@'); + (void) strlcpy(dsname, name, atp - name + 1); + VERIFY0(dsl_dataset_hold(dp, dsname, FTAG, &ds)); + + dsl_dataset_snapshot_sync_impl(ds, atp + 1, tx); + if (ddsa->ddsa_props != NULL) { + dsl_props_set_sync_impl(ds->ds_prev, + ZPROP_SRC_LOCAL, ddsa->ddsa_props, tx); + } + dsl_dataset_rele(ds, FTAG); + } +} + +/* + * The snapshots must all be in the same pool. + * All-or-nothing: if there are any failures, nothing will be modified. 
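+ *
+ * For example, snaps = { "tank/a@s1", "tank/b@s1" } creates both
+ * snapshots in one sync task, or neither if any check fails.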
+ */ +int +dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors) +{ + dsl_dataset_snapshot_arg_t ddsa; + nvpair_t *pair; + boolean_t needsuspend; + int error; + spa_t *spa; + char *firstname; + nvlist_t *suspended = NULL; + + pair = nvlist_next_nvpair(snaps, NULL); + if (pair == NULL) + return (0); + firstname = nvpair_name(pair); + + error = spa_open(firstname, &spa, FTAG); + if (error != 0) + return (error); + needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP); + spa_close(spa, FTAG); + + if (needsuspend) { + suspended = fnvlist_alloc(); + for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; + pair = nvlist_next_nvpair(snaps, pair)) { + char fsname[ZFS_MAX_DATASET_NAME_LEN]; + char *snapname = nvpair_name(pair); + char *atp; + void *cookie; + + atp = strchr(snapname, '@'); + if (atp == NULL) { + error = SET_ERROR(EINVAL); + break; + } + (void) strlcpy(fsname, snapname, atp - snapname + 1); + + error = zil_suspend(fsname, &cookie); + if (error != 0) + break; + fnvlist_add_uint64(suspended, fsname, + (uintptr_t)cookie); + } + } + + ddsa.ddsa_snaps = snaps; + ddsa.ddsa_props = props; + ddsa.ddsa_errors = errors; + ddsa.ddsa_cr = CRED(); + + if (error == 0) { + error = dsl_sync_task(firstname, dsl_dataset_snapshot_check, + dsl_dataset_snapshot_sync, &ddsa, + fnvlist_num_pairs(snaps) * 3, ZFS_SPACE_CHECK_NORMAL); + } + + if (suspended != NULL) { + for (pair = nvlist_next_nvpair(suspended, NULL); pair != NULL; + pair = nvlist_next_nvpair(suspended, pair)) { + zil_resume((void *)(uintptr_t) + fnvpair_value_uint64(pair)); + } + fnvlist_free(suspended); + } + +#ifdef __FreeBSD__ +#ifdef _KERNEL + if (error == 0) { + for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; + pair = nvlist_next_nvpair(snaps, pair)) { + char *snapname = nvpair_name(pair); + zvol_create_minors(snapname); + } + } +#endif +#endif + return (error); +} + +typedef struct dsl_dataset_snapshot_tmp_arg { + const char *ddsta_fsname; + const char *ddsta_snapname; + minor_t ddsta_cleanup_minor; + const char *ddsta_htag; +} dsl_dataset_snapshot_tmp_arg_t; + +static int +dsl_dataset_snapshot_tmp_check(void *arg, dmu_tx_t *tx) +{ + dsl_dataset_snapshot_tmp_arg_t *ddsta = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + int error; + + error = dsl_dataset_hold(dp, ddsta->ddsta_fsname, FTAG, &ds); + if (error != 0) + return (error); + + /* NULL cred means no limit check for tmp snapshot */ + error = dsl_dataset_snapshot_check_impl(ds, ddsta->ddsta_snapname, + tx, B_FALSE, 0, NULL); + if (error != 0) { + dsl_dataset_rele(ds, FTAG); + return (error); + } + + if (spa_version(dp->dp_spa) < SPA_VERSION_USERREFS) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(ENOTSUP)); + } + error = dsl_dataset_user_hold_check_one(NULL, ddsta->ddsta_htag, + B_TRUE, tx); + if (error != 0) { + dsl_dataset_rele(ds, FTAG); + return (error); + } + + dsl_dataset_rele(ds, FTAG); + return (0); +} + +static void +dsl_dataset_snapshot_tmp_sync(void *arg, dmu_tx_t *tx) +{ + dsl_dataset_snapshot_tmp_arg_t *ddsta = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + + VERIFY0(dsl_dataset_hold(dp, ddsta->ddsta_fsname, FTAG, &ds)); + + dsl_dataset_snapshot_sync_impl(ds, ddsta->ddsta_snapname, tx); + dsl_dataset_user_hold_sync_one(ds->ds_prev, ddsta->ddsta_htag, + ddsta->ddsta_cleanup_minor, gethrestime_sec(), tx); + dsl_destroy_snapshot_sync_impl(ds->ds_prev, B_TRUE, tx); + + dsl_dataset_rele(ds, FTAG); +} + +int +dsl_dataset_snapshot_tmp(const char *fsname, const char *snapname, + minor_t cleanup_minor, 
const char *htag) +{ + dsl_dataset_snapshot_tmp_arg_t ddsta; + int error; + spa_t *spa; + boolean_t needsuspend; + void *cookie; + + ddsta.ddsta_fsname = fsname; + ddsta.ddsta_snapname = snapname; + ddsta.ddsta_cleanup_minor = cleanup_minor; + ddsta.ddsta_htag = htag; + + error = spa_open(fsname, &spa, FTAG); + if (error != 0) + return (error); + needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP); + spa_close(spa, FTAG); + + if (needsuspend) { + error = zil_suspend(fsname, &cookie); + if (error != 0) + return (error); + } + + error = dsl_sync_task(fsname, dsl_dataset_snapshot_tmp_check, + dsl_dataset_snapshot_tmp_sync, &ddsta, 3, ZFS_SPACE_CHECK_RESERVED); + + if (needsuspend) + zil_resume(cookie); + return (error); +} + +void +dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx) +{ + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(ds->ds_objset != NULL); + ASSERT(dsl_dataset_phys(ds)->ds_next_snap_obj == 0); + + /* + * in case we had to change ds_fsid_guid when we opened it, + * sync it out now. + */ + dmu_buf_will_dirty(ds->ds_dbuf, tx); + dsl_dataset_phys(ds)->ds_fsid_guid = ds->ds_fsid_guid; + + if (ds->ds_resume_bytes[tx->tx_txg & TXG_MASK] != 0) { + VERIFY0(zap_update(tx->tx_pool->dp_meta_objset, + ds->ds_object, DS_FIELD_RESUME_OBJECT, 8, 1, + &ds->ds_resume_object[tx->tx_txg & TXG_MASK], tx)); + VERIFY0(zap_update(tx->tx_pool->dp_meta_objset, + ds->ds_object, DS_FIELD_RESUME_OFFSET, 8, 1, + &ds->ds_resume_offset[tx->tx_txg & TXG_MASK], tx)); + VERIFY0(zap_update(tx->tx_pool->dp_meta_objset, + ds->ds_object, DS_FIELD_RESUME_BYTES, 8, 1, + &ds->ds_resume_bytes[tx->tx_txg & TXG_MASK], tx)); + ds->ds_resume_object[tx->tx_txg & TXG_MASK] = 0; + ds->ds_resume_offset[tx->tx_txg & TXG_MASK] = 0; + ds->ds_resume_bytes[tx->tx_txg & TXG_MASK] = 0; + } + + dmu_objset_sync(ds->ds_objset, zio, tx); + + for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { + if (ds->ds_feature_activation_needed[f]) { + if (ds->ds_feature_inuse[f]) + continue; + dsl_dataset_activate_feature(ds->ds_object, f, tx); + ds->ds_feature_inuse[f] = B_TRUE; + } + } +} + +static int +deadlist_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +{ + dsl_deadlist_t *dl = arg; + dsl_deadlist_insert(dl, bp, tx); + return (0); +} + +void +dsl_dataset_sync_done(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + objset_t *os = ds->ds_objset; + + bplist_iterate(&ds->ds_pending_deadlist, + deadlist_enqueue_cb, &ds->ds_deadlist, tx); + + if (os->os_synced_dnodes != NULL) { + multilist_destroy(os->os_synced_dnodes); + os->os_synced_dnodes = NULL; + } + + ASSERT(!dmu_objset_is_dirty(os, dmu_tx_get_txg(tx))); + + dmu_buf_rele(ds->ds_dbuf, ds); +} + +int +get_clones_stat_impl(dsl_dataset_t *ds, nvlist_t *val) +{ + uint64_t count = 0; + objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; + zap_cursor_t zc; + zap_attribute_t za; + + ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); + + /* + * There may be missing entries in ds_next_clones_obj + * due to a bug in a previous version of the code. + * Only trust it if it has the right number of entries. 
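+	 * (For a snapshot, ds_num_children counts its one "next" dataset
+	 * plus each clone, so a complete ZAP holds num_children - 1
+	 * entries.)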
+ */
+	if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) {
+		VERIFY0(zap_count(mos, dsl_dataset_phys(ds)->ds_next_clones_obj,
+		    &count));
+	}
+	if (count != dsl_dataset_phys(ds)->ds_num_children - 1) {
+		return (ENOENT);
+	}
+	for (zap_cursor_init(&zc, mos,
+	    dsl_dataset_phys(ds)->ds_next_clones_obj);
+	    zap_cursor_retrieve(&zc, &za) == 0;
+	    zap_cursor_advance(&zc)) {
+		dsl_dataset_t *clone;
+		char buf[ZFS_MAX_DATASET_NAME_LEN];
+		VERIFY0(dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
+		    za.za_first_integer, FTAG, &clone));
+		dsl_dir_name(clone->ds_dir, buf);
+		fnvlist_add_boolean(val, buf);
+		dsl_dataset_rele(clone, FTAG);
+	}
+	zap_cursor_fini(&zc);
+	return (0);
+}
+
+void
+get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv)
+{
+	nvlist_t *propval = fnvlist_alloc();
+	nvlist_t *val;
+
+	/*
+	 * We use nvlist_alloc() instead of fnvlist_alloc() because the
+	 * latter would allocate the list with the NV_UNIQUE_NAME flag.
+	 * As a result, every time a clone name is appended to the list
+	 * it would be (linearly) searched for a duplicate name.
+	 * We already know that all clone names must be unique and we
+	 * want to avoid the quadratic complexity of double-checking that
+	 * because we can have a large number of clones.
+	 */
+	VERIFY0(nvlist_alloc(&val, 0, KM_SLEEP));
+
+	if (get_clones_stat_impl(ds, val) == 0) {
+		fnvlist_add_nvlist(propval, ZPROP_VALUE, val);
+		fnvlist_add_nvlist(nv, zfs_prop_to_name(ZFS_PROP_CLONES),
+		    propval);
+	}
+
+	nvlist_free(val);
+	nvlist_free(propval);
+}
+
+/*
+ * Returns a string that represents the receive resume stats token.  It
+ * should be freed with strfree().
+ */
+char *
+get_receive_resume_stats_impl(dsl_dataset_t *ds)
+{
+	dsl_pool_t *dp = ds->ds_dir->dd_pool;
+
+	if (dsl_dataset_has_resume_receive_state(ds)) {
+		char *str;
+		void *packed;
+		uint8_t *compressed;
+		uint64_t val;
+		nvlist_t *token_nv = fnvlist_alloc();
+		size_t packed_size, compressed_size;
+
+		if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
+		    DS_FIELD_RESUME_FROMGUID, sizeof (val), 1, &val) == 0) {
+			fnvlist_add_uint64(token_nv, "fromguid", val);
+		}
+		if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
+		    DS_FIELD_RESUME_OBJECT, sizeof (val), 1, &val) == 0) {
+			fnvlist_add_uint64(token_nv, "object", val);
+		}
+		if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
+		    DS_FIELD_RESUME_OFFSET, sizeof (val), 1, &val) == 0) {
+			fnvlist_add_uint64(token_nv, "offset", val);
+		}
+		if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
+		    DS_FIELD_RESUME_BYTES, sizeof (val), 1, &val) == 0) {
+			fnvlist_add_uint64(token_nv, "bytes", val);
+		}
+		if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
+		    DS_FIELD_RESUME_TOGUID, sizeof (val), 1, &val) == 0) {
+			fnvlist_add_uint64(token_nv, "toguid", val);
+		}
+		char buf[256];
+		if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
+		    DS_FIELD_RESUME_TONAME, 1, sizeof (buf), buf) == 0) {
+			fnvlist_add_string(token_nv, "toname", buf);
+		}
+		if (zap_contains(dp->dp_meta_objset, ds->ds_object,
+		    DS_FIELD_RESUME_LARGEBLOCK) == 0) {
+			fnvlist_add_boolean(token_nv, "largeblockok");
+		}
+		if (zap_contains(dp->dp_meta_objset, ds->ds_object,
+		    DS_FIELD_RESUME_EMBEDOK) == 0) {
+			fnvlist_add_boolean(token_nv, "embedok");
+		}
+		if (zap_contains(dp->dp_meta_objset, ds->ds_object,
+		    DS_FIELD_RESUME_COMPRESSOK) == 0) {
+			fnvlist_add_boolean(token_nv, "compressok");
+		}
+		packed = fnvlist_pack(token_nv, &packed_size);
+		fnvlist_free(token_nv);
+		compressed = kmem_alloc(packed_size, KM_SLEEP);
+
+		compressed_size = gzip_compress(packed, compressed,
+		    packed_size, packed_size, 6);
+
+		zio_cksum_t cksum;
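+		/*
+		 * The token assembled below is, schematically:
+		 *   <version>-<cksum word 0, hex>-<packed_size, hex>-
+		 *   <hex dump of the gzip-compressed packed nvlist>
+		 * and is what "zfs send -t <token>" later decodes in
+		 * order to resume the stream.
+		 */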
+ fletcher_4_native(compressed, compressed_size, NULL, &cksum); + + str = kmem_alloc(compressed_size * 2 + 1, KM_SLEEP); + for (int i = 0; i < compressed_size; i++) { + (void) sprintf(str + i * 2, "%02x", compressed[i]); + } + str[compressed_size * 2] = '\0'; + char *propval = kmem_asprintf("%u-%llx-%llx-%s", + ZFS_SEND_RESUME_TOKEN_VERSION, + (longlong_t)cksum.zc_word[0], + (longlong_t)packed_size, str); + kmem_free(packed, packed_size); + kmem_free(str, compressed_size * 2 + 1); + kmem_free(compressed, packed_size); + return (propval); + } + return (spa_strdup("")); +} + +/* + * Returns a string that represents the receive resume stats token of the + * dataset's child. It should be freed with strfree(). + */ +char * +get_child_receive_stats(dsl_dataset_t *ds) +{ + char recvname[ZFS_MAX_DATASET_NAME_LEN + 6]; + dsl_dataset_t *recv_ds; + dsl_dataset_name(ds, recvname); + if (strlcat(recvname, "/", sizeof (recvname)) < + sizeof (recvname) && + strlcat(recvname, recv_clone_name, sizeof (recvname)) < + sizeof (recvname) && + dsl_dataset_hold(ds->ds_dir->dd_pool, recvname, FTAG, + &recv_ds) == 0) { + char *propval = get_receive_resume_stats_impl(recv_ds); + dsl_dataset_rele(recv_ds, FTAG); + return (propval); + } + return (spa_strdup("")); +} + +static void +get_receive_resume_stats(dsl_dataset_t *ds, nvlist_t *nv) +{ + char *propval = get_receive_resume_stats_impl(ds); + if (strcmp(propval, "") != 0) { + dsl_prop_nvlist_add_string(nv, + ZFS_PROP_RECEIVE_RESUME_TOKEN, propval); + } else { + char *childval = get_child_receive_stats(ds); + if (strcmp(childval, "") != 0) { + dsl_prop_nvlist_add_string(nv, + ZFS_PROP_RECEIVE_RESUME_TOKEN, childval); + } + strfree(childval); + } + strfree(propval); +} + +uint64_t +dsl_get_refratio(dsl_dataset_t *ds) +{ + uint64_t ratio = dsl_dataset_phys(ds)->ds_compressed_bytes == 0 ? 100 : + (dsl_dataset_phys(ds)->ds_uncompressed_bytes * 100 / + dsl_dataset_phys(ds)->ds_compressed_bytes); + return (ratio); +} + +uint64_t +dsl_get_logicalreferenced(dsl_dataset_t *ds) +{ + return (dsl_dataset_phys(ds)->ds_uncompressed_bytes); +} + +uint64_t +dsl_get_compressratio(dsl_dataset_t *ds) +{ + if (ds->ds_is_snapshot) { + return (dsl_get_refratio(ds)); + } else { + dsl_dir_t *dd = ds->ds_dir; + mutex_enter(&dd->dd_lock); + uint64_t val = dsl_dir_get_compressratio(dd); + mutex_exit(&dd->dd_lock); + return (val); + } +} + +uint64_t +dsl_get_used(dsl_dataset_t *ds) +{ + if (ds->ds_is_snapshot) { + return (dsl_dataset_phys(ds)->ds_unique_bytes); + } else { + dsl_dir_t *dd = ds->ds_dir; + mutex_enter(&dd->dd_lock); + uint64_t val = dsl_dir_get_used(dd); + mutex_exit(&dd->dd_lock); + return (val); + } +} + +uint64_t +dsl_get_creation(dsl_dataset_t *ds) +{ + return (dsl_dataset_phys(ds)->ds_creation_time); +} + +uint64_t +dsl_get_creationtxg(dsl_dataset_t *ds) +{ + return (dsl_dataset_phys(ds)->ds_creation_txg); +} + +uint64_t +dsl_get_refquota(dsl_dataset_t *ds) +{ + return (ds->ds_quota); +} + +uint64_t +dsl_get_refreservation(dsl_dataset_t *ds) +{ + return (ds->ds_reserved); +} + +uint64_t +dsl_get_guid(dsl_dataset_t *ds) +{ + return (dsl_dataset_phys(ds)->ds_guid); +} + +uint64_t +dsl_get_unique(dsl_dataset_t *ds) +{ + return (dsl_dataset_phys(ds)->ds_unique_bytes); +} + +uint64_t +dsl_get_objsetid(dsl_dataset_t *ds) +{ + return (ds->ds_object); +} + +uint64_t +dsl_get_userrefs(dsl_dataset_t *ds) +{ + return (ds->ds_userrefs); +} + +uint64_t +dsl_get_defer_destroy(dsl_dataset_t *ds) +{ + return (DS_IS_DEFER_DESTROY(ds) ? 
1 : 0);
+}
+
+uint64_t
+dsl_get_referenced(dsl_dataset_t *ds)
+{
+	return (dsl_dataset_phys(ds)->ds_referenced_bytes);
+}
+
+uint64_t
+dsl_get_numclones(dsl_dataset_t *ds)
+{
+	ASSERT(ds->ds_is_snapshot);
+	return (dsl_dataset_phys(ds)->ds_num_children - 1);
+}
+
+uint64_t
+dsl_get_inconsistent(dsl_dataset_t *ds)
+{
+	return ((dsl_dataset_phys(ds)->ds_flags & DS_FLAG_INCONSISTENT) ?
+	    1 : 0);
+}
+
+uint64_t
+dsl_get_available(dsl_dataset_t *ds)
+{
+	uint64_t refdbytes = dsl_get_referenced(ds);
+	uint64_t availbytes = dsl_dir_space_available(ds->ds_dir,
+	    NULL, 0, TRUE);
+	if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes) {
+		availbytes +=
+		    ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes;
+	}
+	if (ds->ds_quota != 0) {
+		/*
+		 * Adjust available bytes according to refquota
+		 */
+		if (refdbytes < ds->ds_quota) {
+			availbytes = MIN(availbytes,
+			    ds->ds_quota - refdbytes);
+		} else {
+			availbytes = 0;
+		}
+	}
+	return (availbytes);
+}
+
+int
+dsl_get_written(dsl_dataset_t *ds, uint64_t *written)
+{
+	dsl_pool_t *dp = ds->ds_dir->dd_pool;
+	dsl_dataset_t *prev;
+	int err = dsl_dataset_hold_obj(dp,
+	    dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
+	if (err == 0) {
+		uint64_t comp, uncomp;
+		err = dsl_dataset_space_written(prev, ds, written,
+		    &comp, &uncomp);
+		dsl_dataset_rele(prev, FTAG);
+	}
+	return (err);
+}
+
+/*
+ * 'snap' should be a buffer of size ZFS_MAX_DATASET_NAME_LEN.
+ */
+int
+dsl_get_prev_snap(dsl_dataset_t *ds, char *snap)
+{
+	dsl_pool_t *dp = ds->ds_dir->dd_pool;
+	if (ds->ds_prev != NULL && ds->ds_prev != dp->dp_origin_snap) {
+		dsl_dataset_name(ds->ds_prev, snap);
+		return (0);
+	} else {
+		return (ENOENT);
+	}
+}
+
+/*
+ * Returns the mountpoint property and source for the given dataset in the
+ * value and source buffers.  The value buffer must be at least as large as
+ * MAXPATHLEN and the source buffer at least as large as
+ * ZFS_MAX_DATASET_NAME_LEN.  Returns 0 on success and an error on failure.
+ */
+int
+dsl_get_mountpoint(dsl_dataset_t *ds, const char *dsname, char *value,
+    char *source)
+{
+	int error;
+	dsl_pool_t *dp = ds->ds_dir->dd_pool;
+
+	/* Retrieve the mountpoint value stored in the zap object */
+	error = dsl_prop_get_ds(ds, zfs_prop_to_name(ZFS_PROP_MOUNTPOINT), 1,
+	    ZAP_MAXVALUELEN, value, source);
+	if (error != 0) {
+		return (error);
+	}
+
+	/* Process the dsname and source to find the full mountpoint string */
+	if (value[0] == '/') {
+		char *buf = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP);
+		char *root = buf;
+		const char *relpath;
+
+		/*
+		 * If we inherit the mountpoint, even from a dataset
+		 * with a received value, the source will be the path of
+		 * the dataset we inherit from.  If source is
+		 * ZPROP_SOURCE_VAL_RECVD, the received value is not
+		 * inherited.
+		 */
+		if (strcmp(source, ZPROP_SOURCE_VAL_RECVD) == 0) {
+			relpath = "";
+		} else {
+			ASSERT0(strncmp(dsname, source, strlen(source)));
+			relpath = dsname + strlen(source);
+			if (relpath[0] == '/')
+				relpath++;
+		}
+
+		spa_altroot(dp->dp_spa, root, ZAP_MAXVALUELEN);
+
+		/*
+		 * Special case an alternate root of '/'.  This will
+		 * avoid having multiple leading slashes in the
+		 * mountpoint path.
+		 */
+		if (strcmp(root, "/") == 0)
+			root++;
+
+		/*
+		 * If the mountpoint is '/' then skip over this
+		 * if we are obtaining either an alternate root or
+		 * an inherited mountpoint.
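+		 *
+		 * For example, with dsname "tank/home" inheriting
+		 * mountpoint "/export" from source "tank" and no
+		 * altroot, the result is "/export/home"; with altroot
+		 * "/a" it becomes "/a/export/home".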
+ */
+		char *mnt = value;
+		if (value[1] == '\0' && (root[0] != '\0' ||
+		    relpath[0] != '\0'))
+			mnt = value + 1;
+
+		if (relpath[0] == '\0') {
+			(void) snprintf(value, ZAP_MAXVALUELEN, "%s%s",
+			    root, mnt);
+		} else {
+			(void) snprintf(value, ZAP_MAXVALUELEN, "%s%s%s%s",
+			    root, mnt, relpath[0] == '@' ? "" : "/",
+			    relpath);
+		}
+		kmem_free(buf, ZAP_MAXVALUELEN);
+	} else {
+		/* 'legacy' or 'none'; value already holds the string */
+	}
+	return (0);
+}
+
+void
+dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
+{
+	dsl_pool_t *dp = ds->ds_dir->dd_pool;
+
+	ASSERT(dsl_pool_config_held(dp));
+
+	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRATIO,
+	    dsl_get_refratio(ds));
+	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALREFERENCED,
+	    dsl_get_logicalreferenced(ds));
+	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO,
+	    dsl_get_compressratio(ds));
+	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED,
+	    dsl_get_used(ds));
+
+	if (ds->ds_is_snapshot) {
+		get_clones_stat(ds, nv);
+	} else {
+		char buf[ZFS_MAX_DATASET_NAME_LEN];
+		if (dsl_get_prev_snap(ds, buf) == 0)
+			dsl_prop_nvlist_add_string(nv, ZFS_PROP_PREV_SNAP,
+			    buf);
+		dsl_dir_stats(ds->ds_dir, nv);
+	}
+
+	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE,
+	    dsl_get_available(ds));
+	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED,
+	    dsl_get_referenced(ds));
+	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION,
+	    dsl_get_creation(ds));
+	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG,
+	    dsl_get_creationtxg(ds));
+	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFQUOTA,
+	    dsl_get_refquota(ds));
+	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRESERVATION,
+	    dsl_get_refreservation(ds));
+	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID,
+	    dsl_get_guid(ds));
+	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_UNIQUE,
+	    dsl_get_unique(ds));
+	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_OBJSETID,
+	    dsl_get_objsetid(ds));
+	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERREFS,
+	    dsl_get_userrefs(ds));
+	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY,
+	    dsl_get_defer_destroy(ds));
+
+	if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
+		uint64_t written;
+		if (dsl_get_written(ds, &written) == 0) {
+			dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_WRITTEN,
+			    written);
+		}
+	}
+
+	if (!dsl_dataset_is_snapshot(ds)) {
+		/*
+		 * A failed "newfs" (e.g. full) resumable receive leaves
+		 * the stats set on this dataset.  Check here for the prop.
+		 */
+		get_receive_resume_stats(ds, nv);
+
+		/*
+		 * A failed incremental resumable receive leaves the
+		 * stats set on our child named "%recv".  Check the child
+		 * for the prop.
+ */ + /* 6 extra bytes for /%recv */ + char recvname[ZFS_MAX_DATASET_NAME_LEN + 6]; + dsl_dataset_t *recv_ds; + dsl_dataset_name(ds, recvname); + if (strlcat(recvname, "/", sizeof (recvname)) < + sizeof (recvname) && + strlcat(recvname, recv_clone_name, sizeof (recvname)) < + sizeof (recvname) && + dsl_dataset_hold(dp, recvname, FTAG, &recv_ds) == 0) { + get_receive_resume_stats(recv_ds, nv); + dsl_dataset_rele(recv_ds, FTAG); + } + } +} + +void +dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat) +{ + dsl_pool_t *dp = ds->ds_dir->dd_pool; + ASSERT(dsl_pool_config_held(dp)); + + stat->dds_creation_txg = dsl_get_creationtxg(ds); + stat->dds_inconsistent = dsl_get_inconsistent(ds); + stat->dds_guid = dsl_get_guid(ds); + stat->dds_origin[0] = '\0'; + if (ds->ds_is_snapshot) { + stat->dds_is_snapshot = B_TRUE; + stat->dds_num_clones = dsl_get_numclones(ds); + } else { + stat->dds_is_snapshot = B_FALSE; + stat->dds_num_clones = 0; + + if (dsl_dir_is_clone(ds->ds_dir)) { + dsl_dir_get_origin(ds->ds_dir, stat->dds_origin); + } + } +} + +uint64_t +dsl_dataset_fsid_guid(dsl_dataset_t *ds) +{ + return (ds->ds_fsid_guid); +} + +void +dsl_dataset_space(dsl_dataset_t *ds, + uint64_t *refdbytesp, uint64_t *availbytesp, + uint64_t *usedobjsp, uint64_t *availobjsp) +{ + *refdbytesp = dsl_dataset_phys(ds)->ds_referenced_bytes; + *availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE); + if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes) + *availbytesp += + ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes; + if (ds->ds_quota != 0) { + /* + * Adjust available bytes according to refquota + */ + if (*refdbytesp < ds->ds_quota) + *availbytesp = MIN(*availbytesp, + ds->ds_quota - *refdbytesp); + else + *availbytesp = 0; + } + rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); + *usedobjsp = BP_GET_FILL(&dsl_dataset_phys(ds)->ds_bp); + rrw_exit(&ds->ds_bp_rwlock, FTAG); + *availobjsp = DN_MAX_OBJECT - *usedobjsp; +} + +boolean_t +dsl_dataset_modified_since_snap(dsl_dataset_t *ds, dsl_dataset_t *snap) +{ + dsl_pool_t *dp = ds->ds_dir->dd_pool; + uint64_t birth; + + ASSERT(dsl_pool_config_held(dp)); + if (snap == NULL) + return (B_FALSE); + rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); + birth = dsl_dataset_get_blkptr(ds)->blk_birth; + rrw_exit(&ds->ds_bp_rwlock, FTAG); + if (birth > dsl_dataset_phys(snap)->ds_creation_txg) { + objset_t *os, *os_snap; + /* + * It may be that only the ZIL differs, because it was + * reset in the head. Don't count that as being + * modified. + */ + if (dmu_objset_from_ds(ds, &os) != 0) + return (B_TRUE); + if (dmu_objset_from_ds(snap, &os_snap) != 0) + return (B_TRUE); + return (bcmp(&os->os_phys->os_meta_dnode, + &os_snap->os_phys->os_meta_dnode, + sizeof (os->os_phys->os_meta_dnode)) != 0); + } + return (B_FALSE); +} + +typedef struct dsl_dataset_rename_snapshot_arg { + const char *ddrsa_fsname; + const char *ddrsa_oldsnapname; + const char *ddrsa_newsnapname; + boolean_t ddrsa_recursive; + dmu_tx_t *ddrsa_tx; +} dsl_dataset_rename_snapshot_arg_t; + +/* ARGSUSED */ +static int +dsl_dataset_rename_snapshot_check_impl(dsl_pool_t *dp, + dsl_dataset_t *hds, void *arg) +{ + dsl_dataset_rename_snapshot_arg_t *ddrsa = arg; + int error; + uint64_t val; + + error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_oldsnapname, &val); + if (error != 0) { + /* ignore nonexistent snapshots */ + return (error == ENOENT ? 
0 : error); + } + + /* new name should not exist */ + error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_newsnapname, &val); + if (error == 0) + error = SET_ERROR(EEXIST); + else if (error == ENOENT) + error = 0; + + /* dataset name + 1 for the "@" + the new snapshot name must fit */ + if (dsl_dir_namelen(hds->ds_dir) + 1 + + strlen(ddrsa->ddrsa_newsnapname) >= ZFS_MAX_DATASET_NAME_LEN) + error = SET_ERROR(ENAMETOOLONG); + + return (error); +} + +static int +dsl_dataset_rename_snapshot_check(void *arg, dmu_tx_t *tx) +{ + dsl_dataset_rename_snapshot_arg_t *ddrsa = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *hds; + int error; + + error = dsl_dataset_hold(dp, ddrsa->ddrsa_fsname, FTAG, &hds); + if (error != 0) + return (error); + + if (ddrsa->ddrsa_recursive) { + error = dmu_objset_find_dp(dp, hds->ds_dir->dd_object, + dsl_dataset_rename_snapshot_check_impl, ddrsa, + DS_FIND_CHILDREN); + } else { + error = dsl_dataset_rename_snapshot_check_impl(dp, hds, ddrsa); + } + dsl_dataset_rele(hds, FTAG); + return (error); +} + +static int +dsl_dataset_rename_snapshot_sync_impl(dsl_pool_t *dp, + dsl_dataset_t *hds, void *arg) +{ +#ifdef __FreeBSD__ +#ifdef _KERNEL + char *oldname, *newname; +#endif +#endif + dsl_dataset_rename_snapshot_arg_t *ddrsa = arg; + dsl_dataset_t *ds; + uint64_t val; + dmu_tx_t *tx = ddrsa->ddrsa_tx; + int error; + + error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_oldsnapname, &val); + ASSERT(error == 0 || error == ENOENT); + if (error == ENOENT) { + /* ignore nonexistent snapshots */ + return (0); + } + + VERIFY0(dsl_dataset_hold_obj(dp, val, FTAG, &ds)); + + /* log before we change the name */ + spa_history_log_internal_ds(ds, "rename", tx, + "-> @%s", ddrsa->ddrsa_newsnapname); + + VERIFY0(dsl_dataset_snap_remove(hds, ddrsa->ddrsa_oldsnapname, tx, + B_FALSE)); + mutex_enter(&ds->ds_lock); + (void) strcpy(ds->ds_snapname, ddrsa->ddrsa_newsnapname); + mutex_exit(&ds->ds_lock); + VERIFY0(zap_add(dp->dp_meta_objset, + dsl_dataset_phys(hds)->ds_snapnames_zapobj, + ds->ds_snapname, 8, 1, &ds->ds_object, tx)); + +#ifdef __FreeBSD__ +#ifdef _KERNEL + oldname = kmem_alloc(MAXPATHLEN, KM_SLEEP); + newname = kmem_alloc(MAXPATHLEN, KM_SLEEP); + snprintf(oldname, MAXPATHLEN, "%s@%s", ddrsa->ddrsa_fsname, + ddrsa->ddrsa_oldsnapname); + snprintf(newname, MAXPATHLEN, "%s@%s", ddrsa->ddrsa_fsname, + ddrsa->ddrsa_newsnapname); + zfsvfs_update_fromname(oldname, newname); + zvol_rename_minors(oldname, newname); + kmem_free(newname, MAXPATHLEN); + kmem_free(oldname, MAXPATHLEN); +#endif +#endif + dsl_dataset_rele(ds, FTAG); + + return (0); +} + +static void +dsl_dataset_rename_snapshot_sync(void *arg, dmu_tx_t *tx) +{ + dsl_dataset_rename_snapshot_arg_t *ddrsa = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *hds; + + VERIFY0(dsl_dataset_hold(dp, ddrsa->ddrsa_fsname, FTAG, &hds)); + ddrsa->ddrsa_tx = tx; + if (ddrsa->ddrsa_recursive) { + VERIFY0(dmu_objset_find_dp(dp, hds->ds_dir->dd_object, + dsl_dataset_rename_snapshot_sync_impl, ddrsa, + DS_FIND_CHILDREN)); + } else { + VERIFY0(dsl_dataset_rename_snapshot_sync_impl(dp, hds, ddrsa)); + } + dsl_dataset_rele(hds, FTAG); +} + +int +dsl_dataset_rename_snapshot(const char *fsname, + const char *oldsnapname, const char *newsnapname, boolean_t recursive) +{ + dsl_dataset_rename_snapshot_arg_t ddrsa; + + ddrsa.ddrsa_fsname = fsname; + ddrsa.ddrsa_oldsnapname = oldsnapname; + ddrsa.ddrsa_newsnapname = newsnapname; + ddrsa.ddrsa_recursive = recursive; + + return (dsl_sync_task(fsname, dsl_dataset_rename_snapshot_check, + 
dsl_dataset_rename_snapshot_sync, &ddrsa, + 1, ZFS_SPACE_CHECK_RESERVED)); +} + +/* + * If we're doing an ownership handoff, we need to make sure that there is + * only one long hold on the dataset. We're not allowed to change anything here + * so we don't permanently release the long hold or regular hold here. We want + * to do this only when syncing to avoid the dataset unexpectedly going away + * when we release the long hold. + */ +static int +dsl_dataset_handoff_check(dsl_dataset_t *ds, void *owner, dmu_tx_t *tx) +{ + boolean_t held; + + if (!dmu_tx_is_syncing(tx)) + return (0); + + if (owner != NULL) { + VERIFY3P(ds->ds_owner, ==, owner); + dsl_dataset_long_rele(ds, owner); + } + + held = dsl_dataset_long_held(ds); + + if (owner != NULL) + dsl_dataset_long_hold(ds, owner); + + if (held) + return (SET_ERROR(EBUSY)); + + return (0); +} + +int +dsl_dataset_rollback_check(void *arg, dmu_tx_t *tx) +{ + dsl_dataset_rollback_arg_t *ddra = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + int64_t unused_refres_delta; + int error; + + error = dsl_dataset_hold(dp, ddra->ddra_fsname, FTAG, &ds); + if (error != 0) + return (error); + + /* must not be a snapshot */ + if (ds->ds_is_snapshot) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EINVAL)); + } + + /* must have a most recent snapshot */ + if (dsl_dataset_phys(ds)->ds_prev_snap_txg < TXG_INITIAL) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(ESRCH)); + } + + /* + * No rollback to a snapshot created in the current txg, because + * the rollback may dirty the dataset and create blocks that are + * not reachable from the rootbp while having a birth txg that + * falls into the snapshot's range. + */ + if (dmu_tx_is_syncing(tx) && + dsl_dataset_phys(ds)->ds_prev_snap_txg >= tx->tx_txg) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EAGAIN)); + } + + /* + * If the expected target snapshot is specified, then check that + * the latest snapshot is it. + */ + if (ddra->ddra_tosnap != NULL) { + dsl_dataset_t *snapds; + + /* Check if the target snapshot exists at all. */ + error = dsl_dataset_hold(dp, ddra->ddra_tosnap, FTAG, &snapds); + if (error != 0) { + /* + * ESRCH is used to signal that the target snapshot does + * not exist, while ENOENT is used to report that + * the rolled back dataset does not exist. + * ESRCH is also used to cover other cases where the + * target snapshot is not related to the dataset being + * rolled back such as being in a different pool. + */ + if (error == ENOENT || error == EXDEV) + error = SET_ERROR(ESRCH); + dsl_dataset_rele(ds, FTAG); + return (error); + } + ASSERT(snapds->ds_is_snapshot); + + /* Check if the snapshot is the latest snapshot indeed. */ + if (snapds != ds->ds_prev) { + /* + * Distinguish between the case where the only problem + * is intervening snapshots (EEXIST) vs the snapshot + * not being a valid target for rollback (ESRCH). 
+ */ + if (snapds->ds_dir == ds->ds_dir || + (dsl_dir_is_clone(ds->ds_dir) && + dsl_dir_phys(ds->ds_dir)->dd_origin_obj == + snapds->ds_object)) { + error = SET_ERROR(EEXIST); + } else { + error = SET_ERROR(ESRCH); + } + dsl_dataset_rele(snapds, FTAG); + dsl_dataset_rele(ds, FTAG); + return (error); + } + dsl_dataset_rele(snapds, FTAG); + } + + /* must not have any bookmarks after the most recent snapshot */ + nvlist_t *proprequest = fnvlist_alloc(); + fnvlist_add_boolean(proprequest, zfs_prop_to_name(ZFS_PROP_CREATETXG)); + nvlist_t *bookmarks = fnvlist_alloc(); + error = dsl_get_bookmarks_impl(ds, proprequest, bookmarks); + fnvlist_free(proprequest); + if (error != 0) { + dsl_dataset_rele(ds, FTAG); + return (error); + } + for (nvpair_t *pair = nvlist_next_nvpair(bookmarks, NULL); + pair != NULL; pair = nvlist_next_nvpair(bookmarks, pair)) { + nvlist_t *valuenv = + fnvlist_lookup_nvlist(fnvpair_value_nvlist(pair), + zfs_prop_to_name(ZFS_PROP_CREATETXG)); + uint64_t createtxg = fnvlist_lookup_uint64(valuenv, "value"); + if (createtxg > dsl_dataset_phys(ds)->ds_prev_snap_txg) { + fnvlist_free(bookmarks); + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EEXIST)); + } + } + fnvlist_free(bookmarks); + + error = dsl_dataset_handoff_check(ds, ddra->ddra_owner, tx); + if (error != 0) { + dsl_dataset_rele(ds, FTAG); + return (error); + } + + /* + * Check if the snap we are rolling back to uses more than + * the refquota. + */ + if (ds->ds_quota != 0 && + dsl_dataset_phys(ds->ds_prev)->ds_referenced_bytes > ds->ds_quota) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EDQUOT)); + } + + /* + * When we do the clone swap, we will temporarily use more space + * due to the refreservation (the head will no longer have any + * unique space, so the entire amount of the refreservation will need + * to be free). We will immediately destroy the clone, freeing + * this space, but the freeing happens over many txg's. + */ + unused_refres_delta = (int64_t)MIN(ds->ds_reserved, + dsl_dataset_phys(ds)->ds_unique_bytes); + + if (unused_refres_delta > 0 && + unused_refres_delta > + dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(ENOSPC)); + } + + dsl_dataset_rele(ds, FTAG); + return (0); +} + +void +dsl_dataset_rollback_sync(void *arg, dmu_tx_t *tx) +{ + dsl_dataset_rollback_arg_t *ddra = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds, *clone; + uint64_t cloneobj; + char namebuf[ZFS_MAX_DATASET_NAME_LEN]; + + VERIFY0(dsl_dataset_hold(dp, ddra->ddra_fsname, FTAG, &ds)); + + dsl_dataset_name(ds->ds_prev, namebuf); + fnvlist_add_string(ddra->ddra_result, "target", namebuf); + + cloneobj = dsl_dataset_create_sync(ds->ds_dir, "%rollback", + ds->ds_prev, DS_CREATE_FLAG_NODIRTY, kcred, tx); + + VERIFY0(dsl_dataset_hold_obj(dp, cloneobj, FTAG, &clone)); + + dsl_dataset_clone_swap_sync_impl(clone, ds, tx); + dsl_dataset_zero_zil(ds, tx); + + dsl_destroy_head_sync_impl(clone, tx); + + dsl_dataset_rele(clone, FTAG); + dsl_dataset_rele(ds, FTAG); +} + +/* + * Rolls back the given filesystem or volume to the most recent snapshot. + * The name of the most recent snapshot will be returned under key "target" + * in the result nvlist. + * + * If owner != NULL: + * - The existing dataset MUST be owned by the specified owner at entry + * - Upon return, dataset will still be held by the same owner, whether we + * succeed or not. + * + * This mode is required any time the existing filesystem is mounted. See + * notes above zfs_suspend_fs() for further details. 
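+ *
+ * For example, a successful rollback of "tank/fs" whose most recent
+ * snapshot is "tank/fs@monday" returns with
+ * result = { "target" : "tank/fs@monday" }.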
+ */ +int +dsl_dataset_rollback(const char *fsname, const char *tosnap, void *owner, + nvlist_t *result) +{ + dsl_dataset_rollback_arg_t ddra; + + ddra.ddra_fsname = fsname; + ddra.ddra_tosnap = tosnap; + ddra.ddra_owner = owner; + ddra.ddra_result = result; + + return (dsl_sync_task(fsname, dsl_dataset_rollback_check, + dsl_dataset_rollback_sync, &ddra, + 1, ZFS_SPACE_CHECK_RESERVED)); +} + +struct promotenode { + list_node_t link; + dsl_dataset_t *ds; +}; + +static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep); +static int promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp, + void *tag); +static void promote_rele(dsl_dataset_promote_arg_t *ddpa, void *tag); + +int +dsl_dataset_promote_check(void *arg, dmu_tx_t *tx) +{ + dsl_dataset_promote_arg_t *ddpa = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *hds; + struct promotenode *snap; + dsl_dataset_t *origin_ds; + int err; + uint64_t unused; + uint64_t ss_mv_cnt; + size_t max_snap_len; + boolean_t conflicting_snaps; + + err = promote_hold(ddpa, dp, FTAG); + if (err != 0) + return (err); + + hds = ddpa->ddpa_clone; + snap = list_head(&ddpa->shared_snaps); + origin_ds = snap->ds; + max_snap_len = MAXNAMELEN - strlen(ddpa->ddpa_clonename) - 1; + + snap = list_head(&ddpa->origin_snaps); + + if (dsl_dataset_phys(hds)->ds_flags & DS_FLAG_NOPROMOTE) { + promote_rele(ddpa, FTAG); + return (SET_ERROR(EXDEV)); + } + + /* + * Compute and check the amount of space to transfer. Since this is + * so expensive, don't do the preliminary check. + */ + if (!dmu_tx_is_syncing(tx)) { + promote_rele(ddpa, FTAG); + return (0); + } + + /* compute origin's new unique space */ + snap = list_tail(&ddpa->clone_snaps); + ASSERT3U(dsl_dataset_phys(snap->ds)->ds_prev_snap_obj, ==, + origin_ds->ds_object); + dsl_deadlist_space_range(&snap->ds->ds_deadlist, + dsl_dataset_phys(origin_ds)->ds_prev_snap_txg, UINT64_MAX, + &ddpa->unique, &unused, &unused); + + /* + * Walk the snapshots that we are moving + * + * Compute space to transfer. Consider the incremental changes + * to used by each snapshot: + * (my used) = (prev's used) + (blocks born) - (blocks killed) + * So each snapshot gave birth to: + * (blocks born) = (my used) - (prev's used) + (blocks killed) + * So a sequence would look like: + * (uN - u(N-1) + kN) + ... + (u1 - u0 + k1) + (u0 - 0 + k0) + * Which simplifies to: + * uN + kN + kN-1 + ... + k1 + k0 + * Note however, if we stop before we reach the ORIGIN we get: + * uN + kN + kN-1 + ... + kM - uM-1 + */ + conflicting_snaps = B_FALSE; + ss_mv_cnt = 0; + ddpa->used = dsl_dataset_phys(origin_ds)->ds_referenced_bytes; + ddpa->comp = dsl_dataset_phys(origin_ds)->ds_compressed_bytes; + ddpa->uncomp = dsl_dataset_phys(origin_ds)->ds_uncompressed_bytes; + for (snap = list_head(&ddpa->shared_snaps); snap; + snap = list_next(&ddpa->shared_snaps, snap)) { + uint64_t val, dlused, dlcomp, dluncomp; + dsl_dataset_t *ds = snap->ds; + + ss_mv_cnt++; + + /* + * If there are long holds, we won't be able to evict + * the objset. 
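 + * (Long holds are taken by, e.g., an in-progress zfs send of + * one of these snapshots.)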
+ */ + if (dsl_dataset_long_held(ds)) { + err = SET_ERROR(EBUSY); + goto out; + } + + /* Check that the snapshot name does not conflict */ + VERIFY0(dsl_dataset_get_snapname(ds)); + if (strlen(ds->ds_snapname) >= max_snap_len) { + err = SET_ERROR(ENAMETOOLONG); + goto out; + } + err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val); + if (err == 0) { + fnvlist_add_boolean(ddpa->err_ds, + snap->ds->ds_snapname); + conflicting_snaps = B_TRUE; + } else if (err != ENOENT) { + goto out; + } + + /* The very first snapshot does not have a deadlist */ + if (dsl_dataset_phys(ds)->ds_prev_snap_obj == 0) + continue; + + dsl_deadlist_space(&ds->ds_deadlist, + &dlused, &dlcomp, &dluncomp); + ddpa->used += dlused; + ddpa->comp += dlcomp; + ddpa->uncomp += dluncomp; + } + + /* + * In order to return the full list of conflicting snapshots, we check + * whether there was a conflict after traversing all of them. + */ + if (conflicting_snaps) { + err = SET_ERROR(EEXIST); + goto out; + } + + /* + * If we are a clone of a clone then we never reached ORIGIN, + * so we need to subtract out the clone origin's used space. + */ + if (ddpa->origin_origin) { + ddpa->used -= + dsl_dataset_phys(ddpa->origin_origin)->ds_referenced_bytes; + ddpa->comp -= + dsl_dataset_phys(ddpa->origin_origin)->ds_compressed_bytes; + ddpa->uncomp -= + dsl_dataset_phys(ddpa->origin_origin)-> + ds_uncompressed_bytes; + } + + /* Check that there is enough space and limit headroom here */ + err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir, + 0, ss_mv_cnt, ddpa->used, ddpa->cr); + if (err != 0) + goto out; + + /* + * Compute the amounts of space that will be used by snapshots + * after the promotion (for both origin and clone). For each, + * it is the amount of space that will be on all of their + * deadlists (that was not born before their new origin). + */ + if (dsl_dir_phys(hds->ds_dir)->dd_flags & DD_FLAG_USED_BREAKDOWN) { + uint64_t space; + + /* + * Note, typically this will not be a clone of a clone, + * so dd_origin_txg will be < TXG_INITIAL, so + * these snaplist_space() -> dsl_deadlist_space_range() + * calls will be fast because they do not have to + * iterate over all bps. 
+ */ + snap = list_head(&ddpa->origin_snaps); + err = snaplist_space(&ddpa->shared_snaps, + snap->ds->ds_dir->dd_origin_txg, &ddpa->cloneusedsnap); + if (err != 0) + goto out; + + err = snaplist_space(&ddpa->clone_snaps, + snap->ds->ds_dir->dd_origin_txg, &space); + if (err != 0) + goto out; + ddpa->cloneusedsnap += space; + } + if (dsl_dir_phys(origin_ds->ds_dir)->dd_flags & + DD_FLAG_USED_BREAKDOWN) { + err = snaplist_space(&ddpa->origin_snaps, + dsl_dataset_phys(origin_ds)->ds_creation_txg, + &ddpa->originusedsnap); + if (err != 0) + goto out; + } + +out: + promote_rele(ddpa, FTAG); + return (err); +} + +void +dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx) +{ + dsl_dataset_promote_arg_t *ddpa = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *hds; + struct promotenode *snap; + dsl_dataset_t *origin_ds; + dsl_dataset_t *origin_head; + dsl_dir_t *dd; + dsl_dir_t *odd = NULL; + uint64_t oldnext_obj; + int64_t delta; +#if defined(__FreeBSD__) && defined(_KERNEL) + char *oldname, *newname; +#endif + + VERIFY0(promote_hold(ddpa, dp, FTAG)); + hds = ddpa->ddpa_clone; + + ASSERT0(dsl_dataset_phys(hds)->ds_flags & DS_FLAG_NOPROMOTE); + + snap = list_head(&ddpa->shared_snaps); + origin_ds = snap->ds; + dd = hds->ds_dir; + + snap = list_head(&ddpa->origin_snaps); + origin_head = snap->ds; + + /* + * We need to explicitly open odd, since origin_ds's dd will be + * changing. + */ + VERIFY0(dsl_dir_hold_obj(dp, origin_ds->ds_dir->dd_object, + NULL, FTAG, &odd)); + + /* change origin's next snap */ + dmu_buf_will_dirty(origin_ds->ds_dbuf, tx); + oldnext_obj = dsl_dataset_phys(origin_ds)->ds_next_snap_obj; + snap = list_tail(&ddpa->clone_snaps); + ASSERT3U(dsl_dataset_phys(snap->ds)->ds_prev_snap_obj, ==, + origin_ds->ds_object); + dsl_dataset_phys(origin_ds)->ds_next_snap_obj = snap->ds->ds_object; + + /* change the origin's next clone */ + if (dsl_dataset_phys(origin_ds)->ds_next_clones_obj) { + dsl_dataset_remove_from_next_clones(origin_ds, + snap->ds->ds_object, tx); + VERIFY0(zap_add_int(dp->dp_meta_objset, + dsl_dataset_phys(origin_ds)->ds_next_clones_obj, + oldnext_obj, tx)); + } + + /* change origin */ + dmu_buf_will_dirty(dd->dd_dbuf, tx); + ASSERT3U(dsl_dir_phys(dd)->dd_origin_obj, ==, origin_ds->ds_object); + dsl_dir_phys(dd)->dd_origin_obj = dsl_dir_phys(odd)->dd_origin_obj; + dd->dd_origin_txg = origin_head->ds_dir->dd_origin_txg; + dmu_buf_will_dirty(odd->dd_dbuf, tx); + dsl_dir_phys(odd)->dd_origin_obj = origin_ds->ds_object; + origin_head->ds_dir->dd_origin_txg = + dsl_dataset_phys(origin_ds)->ds_creation_txg; + + /* change dd_clone entries */ + if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { + VERIFY0(zap_remove_int(dp->dp_meta_objset, + dsl_dir_phys(odd)->dd_clones, hds->ds_object, tx)); + VERIFY0(zap_add_int(dp->dp_meta_objset, + dsl_dir_phys(ddpa->origin_origin->ds_dir)->dd_clones, + hds->ds_object, tx)); + + VERIFY0(zap_remove_int(dp->dp_meta_objset, + dsl_dir_phys(ddpa->origin_origin->ds_dir)->dd_clones, + origin_head->ds_object, tx)); + if (dsl_dir_phys(dd)->dd_clones == 0) { + dsl_dir_phys(dd)->dd_clones = + zap_create(dp->dp_meta_objset, DMU_OT_DSL_CLONES, + DMU_OT_NONE, 0, tx); + } + VERIFY0(zap_add_int(dp->dp_meta_objset, + dsl_dir_phys(dd)->dd_clones, origin_head->ds_object, tx)); + } + +#if defined(__FreeBSD__) && defined(_KERNEL) + /* Take the spa_namespace_lock early so zvol renames don't deadlock. 
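 + * (The loop below calls zvol_rename_minors() on every moved + * snapshot while this lock is held.)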
*/ + mutex_enter(&spa_namespace_lock); + + oldname = kmem_alloc(MAXPATHLEN, KM_SLEEP); + newname = kmem_alloc(MAXPATHLEN, KM_SLEEP); +#endif + + /* move snapshots to this dir */ + for (snap = list_head(&ddpa->shared_snaps); snap; + snap = list_next(&ddpa->shared_snaps, snap)) { + dsl_dataset_t *ds = snap->ds; + + /* + * Property callbacks are registered to a particular + * dsl_dir. Since ours is changing, evict the objset + * so that they will be unregistered from the old dsl_dir. + */ + if (ds->ds_objset) { + dmu_objset_evict(ds->ds_objset); + ds->ds_objset = NULL; + } + +#if defined(__FreeBSD__) && defined(_KERNEL) + /* Record the name before the dataset moves to its new dsl_dir. */ + dsl_dataset_name(ds, oldname); +#endif + + /* move snap name entry */ + VERIFY0(dsl_dataset_get_snapname(ds)); + VERIFY0(dsl_dataset_snap_remove(origin_head, + ds->ds_snapname, tx, B_TRUE)); + VERIFY0(zap_add(dp->dp_meta_objset, + dsl_dataset_phys(hds)->ds_snapnames_zapobj, ds->ds_snapname, + 8, 1, &ds->ds_object, tx)); + dsl_fs_ss_count_adjust(hds->ds_dir, 1, + DD_FIELD_SNAPSHOT_COUNT, tx); + + /* change containing dsl_dir */ + dmu_buf_will_dirty(ds->ds_dbuf, tx); + ASSERT3U(dsl_dataset_phys(ds)->ds_dir_obj, ==, odd->dd_object); + dsl_dataset_phys(ds)->ds_dir_obj = dd->dd_object; + ASSERT3P(ds->ds_dir, ==, odd); + dsl_dir_rele(ds->ds_dir, ds); + VERIFY0(dsl_dir_hold_obj(dp, dd->dd_object, + NULL, ds, &ds->ds_dir)); + +#if defined(__FreeBSD__) && defined(_KERNEL) + dsl_dataset_name(ds, newname); + zfsvfs_update_fromname(oldname, newname); + zvol_rename_minors(oldname, newname); +#endif + + /* move any clone references */ + if (dsl_dataset_phys(ds)->ds_next_clones_obj && + spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { + zap_cursor_t zc; + zap_attribute_t za; + + for (zap_cursor_init(&zc, dp->dp_meta_objset, + dsl_dataset_phys(ds)->ds_next_clones_obj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + dsl_dataset_t *cnds; + uint64_t o; + + if (za.za_first_integer == oldnext_obj) { + /* + * We've already moved the + * origin's reference. + */ + continue; + } + + VERIFY0(dsl_dataset_hold_obj(dp, + za.za_first_integer, FTAG, &cnds)); + o = dsl_dir_phys(cnds->ds_dir)-> + dd_head_dataset_obj; + + VERIFY0(zap_remove_int(dp->dp_meta_objset, + dsl_dir_phys(odd)->dd_clones, o, tx)); + VERIFY0(zap_add_int(dp->dp_meta_objset, + dsl_dir_phys(dd)->dd_clones, o, tx)); + dsl_dataset_rele(cnds, FTAG); + } + zap_cursor_fini(&zc); + } + + ASSERT(!dsl_prop_hascb(ds)); + } + +#if defined(__FreeBSD__) && defined(_KERNEL) + mutex_exit(&spa_namespace_lock); + + kmem_free(newname, MAXPATHLEN); + kmem_free(oldname, MAXPATHLEN); +#endif + /* + * Change space accounting. + * Note, ddpa->*usedsnap and dd_used_breakdown[SNAP] will either + * both be valid, or both be 0 (resulting in delta == 0). This + * is true for each of {clone,origin} independently.
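 + * For example (illustrative numbers): if the clone's directory + * will have 7G of snapshot-used space after the promote and had + * 2G in dd_used_breakdown[DD_USED_SNAP], then 5G of the + * transferred space is charged to DD_USED_SNAP below and the + * rest to DD_USED_HEAD.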
+ */ + + delta = ddpa->cloneusedsnap - + dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_SNAP]; + ASSERT3S(delta, >=, 0); + ASSERT3U(ddpa->used, >=, delta); + dsl_dir_diduse_space(dd, DD_USED_SNAP, delta, 0, 0, tx); + dsl_dir_diduse_space(dd, DD_USED_HEAD, + ddpa->used - delta, ddpa->comp, ddpa->uncomp, tx); + + delta = ddpa->originusedsnap - + dsl_dir_phys(odd)->dd_used_breakdown[DD_USED_SNAP]; + ASSERT3S(delta, <=, 0); + ASSERT3U(ddpa->used, >=, -delta); + dsl_dir_diduse_space(odd, DD_USED_SNAP, delta, 0, 0, tx); + dsl_dir_diduse_space(odd, DD_USED_HEAD, + -ddpa->used - delta, -ddpa->comp, -ddpa->uncomp, tx); + + dsl_dataset_phys(origin_ds)->ds_unique_bytes = ddpa->unique; + + /* log history record */ + spa_history_log_internal_ds(hds, "promote", tx, ""); + + dsl_dir_rele(odd, FTAG); + promote_rele(ddpa, FTAG); +} + +/* + * Make a list of dsl_dataset_t's for the snapshots between first_obj + * (exclusive) and last_obj (inclusive). The list will be in reverse + * order (last_obj will be the list_head()). If first_obj == 0, do all + * snapshots back to this dataset's origin. + */ +static int +snaplist_make(dsl_pool_t *dp, + uint64_t first_obj, uint64_t last_obj, list_t *l, void *tag) +{ + uint64_t obj = last_obj; + + list_create(l, sizeof (struct promotenode), + offsetof(struct promotenode, link)); + + while (obj != first_obj) { + dsl_dataset_t *ds; + struct promotenode *snap; + int err; + + err = dsl_dataset_hold_obj(dp, obj, tag, &ds); + ASSERT(err != ENOENT); + if (err != 0) + return (err); + + if (first_obj == 0) + first_obj = dsl_dir_phys(ds->ds_dir)->dd_origin_obj; + + snap = kmem_alloc(sizeof (*snap), KM_SLEEP); + snap->ds = ds; + list_insert_tail(l, snap); + obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; + } + + return (0); +} + +static int +snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep) +{ + struct promotenode *snap; + + *spacep = 0; + for (snap = list_head(l); snap; snap = list_next(l, snap)) { + uint64_t used, comp, uncomp; + dsl_deadlist_space_range(&snap->ds->ds_deadlist, + mintxg, UINT64_MAX, &used, &comp, &uncomp); + *spacep += used; + } + return (0); +} + +static void +snaplist_destroy(list_t *l, void *tag) +{ + struct promotenode *snap; + + if (l == NULL || !list_link_active(&l->list_head)) + return; + + while ((snap = list_tail(l)) != NULL) { + list_remove(l, snap); + dsl_dataset_rele(snap->ds, tag); + kmem_free(snap, sizeof (*snap)); + } + list_destroy(l); +} + +static int +promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp, void *tag) +{ + int error; + dsl_dir_t *dd; + struct promotenode *snap; + + error = dsl_dataset_hold(dp, ddpa->ddpa_clonename, tag, + &ddpa->ddpa_clone); + if (error != 0) + return (error); + dd = ddpa->ddpa_clone->ds_dir; + + if (ddpa->ddpa_clone->ds_is_snapshot || + !dsl_dir_is_clone(dd)) { + dsl_dataset_rele(ddpa->ddpa_clone, tag); + return (SET_ERROR(EINVAL)); + } + + error = snaplist_make(dp, 0, dsl_dir_phys(dd)->dd_origin_obj, + &ddpa->shared_snaps, tag); + if (error != 0) + goto out; + + error = snaplist_make(dp, 0, ddpa->ddpa_clone->ds_object, + &ddpa->clone_snaps, tag); + if (error != 0) + goto out; + + snap = list_head(&ddpa->shared_snaps); + ASSERT3U(snap->ds->ds_object, ==, dsl_dir_phys(dd)->dd_origin_obj); + error = snaplist_make(dp, dsl_dir_phys(dd)->dd_origin_obj, + dsl_dir_phys(snap->ds->ds_dir)->dd_head_dataset_obj, + &ddpa->origin_snaps, tag); + if (error != 0) + goto out; + + if (dsl_dir_phys(snap->ds->ds_dir)->dd_origin_obj != 0) { + error = dsl_dataset_hold_obj(dp, + 
dsl_dir_phys(snap->ds->ds_dir)->dd_origin_obj, + tag, &ddpa->origin_origin); + if (error != 0) + goto out; + } +out: + if (error != 0) + promote_rele(ddpa, tag); + return (error); +} + +static void +promote_rele(dsl_dataset_promote_arg_t *ddpa, void *tag) +{ + snaplist_destroy(&ddpa->shared_snaps, tag); + snaplist_destroy(&ddpa->clone_snaps, tag); + snaplist_destroy(&ddpa->origin_snaps, tag); + if (ddpa->origin_origin != NULL) + dsl_dataset_rele(ddpa->origin_origin, tag); + dsl_dataset_rele(ddpa->ddpa_clone, tag); +} + +/* + * Promote a clone. + * + * If it fails due to a conflicting snapshot name, "conflsnap" will be filled + * in with the name. (It must be at least ZFS_MAX_DATASET_NAME_LEN bytes long.) + */ +int +dsl_dataset_promote(const char *name, char *conflsnap) +{ + dsl_dataset_promote_arg_t ddpa = { 0 }; + uint64_t numsnaps; + int error; + nvpair_t *snap_pair; + objset_t *os; + + /* + * We will modify space proportional to the number of + * snapshots. Compute numsnaps. + */ + error = dmu_objset_hold(name, FTAG, &os); + if (error != 0) + return (error); + error = zap_count(dmu_objset_pool(os)->dp_meta_objset, + dsl_dataset_phys(dmu_objset_ds(os))->ds_snapnames_zapobj, + &numsnaps); + dmu_objset_rele(os, FTAG); + if (error != 0) + return (error); + + ddpa.ddpa_clonename = name; + ddpa.err_ds = fnvlist_alloc(); + ddpa.cr = CRED(); + + error = dsl_sync_task(name, dsl_dataset_promote_check, + dsl_dataset_promote_sync, &ddpa, + 2 + numsnaps, ZFS_SPACE_CHECK_RESERVED); + + /* + * Return the first conflicting snapshot found. + */ + snap_pair = nvlist_next_nvpair(ddpa.err_ds, NULL); + if (snap_pair != NULL && conflsnap != NULL) + (void) strcpy(conflsnap, nvpair_name(snap_pair)); + + fnvlist_free(ddpa.err_ds); + return (error); +} + +int +dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone, + dsl_dataset_t *origin_head, boolean_t force, void *owner, dmu_tx_t *tx) +{ + /* + * "slack" factor for received datasets with refquota set on them. + * See the bottom of this function for details on its use. + */ + uint64_t refquota_slack = DMU_MAX_ACCESS * spa_asize_inflation; + int64_t unused_refres_delta; + + /* they should both be heads */ + if (clone->ds_is_snapshot || + origin_head->ds_is_snapshot) + return (SET_ERROR(EINVAL)); + + /* if we are not forcing, the branch point should be just before them */ + if (!force && clone->ds_prev != origin_head->ds_prev) + return (SET_ERROR(EINVAL)); + + /* clone should be the clone (unless they are unrelated) */ + if (clone->ds_prev != NULL && + clone->ds_prev != clone->ds_dir->dd_pool->dp_origin_snap && + origin_head->ds_dir != clone->ds_prev->ds_dir) + return (SET_ERROR(EINVAL)); + + /* the clone should be a child of the origin */ + if (clone->ds_dir->dd_parent != origin_head->ds_dir) + return (SET_ERROR(EINVAL)); + + /* origin_head shouldn't be modified unless 'force' */ + if (!force && + dsl_dataset_modified_since_snap(origin_head, origin_head->ds_prev)) + return (SET_ERROR(ETXTBSY)); + + /* origin_head should have no long holds (e.g. 
is not mounted) */ + if (dsl_dataset_handoff_check(origin_head, owner, tx)) + return (SET_ERROR(EBUSY)); + + /* check amount of any unconsumed refreservation */ + unused_refres_delta = + (int64_t)MIN(origin_head->ds_reserved, + dsl_dataset_phys(origin_head)->ds_unique_bytes) - + (int64_t)MIN(origin_head->ds_reserved, + dsl_dataset_phys(clone)->ds_unique_bytes); + + if (unused_refres_delta > 0 && + unused_refres_delta > + dsl_dir_space_available(origin_head->ds_dir, NULL, 0, TRUE)) + return (SET_ERROR(ENOSPC)); + + /* + * The clone can't be too much over the head's refquota. + * + * To ensure that the entire refquota can be used, we allow one + * transaction to exceed the refquota. Therefore, this check + * also needs to allow for the space referenced to be more than the + * refquota. The maximum amount of space that one transaction can use + * on disk is DMU_MAX_ACCESS * spa_asize_inflation. Allowing this + * overage ensures that we are able to receive a filesystem that + * exceeds the refquota on the source system. + * + * So that overage is the refquota_slack we use below. + */ + if (origin_head->ds_quota != 0 && + dsl_dataset_phys(clone)->ds_referenced_bytes > + origin_head->ds_quota + refquota_slack) + return (SET_ERROR(EDQUOT)); + + return (0); +} + +static void +dsl_dataset_swap_remap_deadlists(dsl_dataset_t *clone, + dsl_dataset_t *origin, dmu_tx_t *tx) +{ + uint64_t clone_remap_dl_obj, origin_remap_dl_obj; + dsl_pool_t *dp = dmu_tx_pool(tx); + + ASSERT(dsl_pool_sync_context(dp)); + + clone_remap_dl_obj = dsl_dataset_get_remap_deadlist_object(clone); + origin_remap_dl_obj = dsl_dataset_get_remap_deadlist_object(origin); + + if (clone_remap_dl_obj != 0) { + dsl_deadlist_close(&clone->ds_remap_deadlist); + dsl_dataset_unset_remap_deadlist_object(clone, tx); + } + if (origin_remap_dl_obj != 0) { + dsl_deadlist_close(&origin->ds_remap_deadlist); + dsl_dataset_unset_remap_deadlist_object(origin, tx); + } + + if (clone_remap_dl_obj != 0) { + dsl_dataset_set_remap_deadlist_object(origin, + clone_remap_dl_obj, tx); + dsl_deadlist_open(&origin->ds_remap_deadlist, + dp->dp_meta_objset, clone_remap_dl_obj); + } + if (origin_remap_dl_obj != 0) { + dsl_dataset_set_remap_deadlist_object(clone, + origin_remap_dl_obj, tx); + dsl_deadlist_open(&clone->ds_remap_deadlist, + dp->dp_meta_objset, origin_remap_dl_obj); + } +} + +void +dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone, + dsl_dataset_t *origin_head, dmu_tx_t *tx) +{ + dsl_pool_t *dp = dmu_tx_pool(tx); + int64_t unused_refres_delta; + + ASSERT(clone->ds_reserved == 0); + /* + * NOTE: On DEBUG kernels there could be a race between this and + * the check function if spa_asize_inflation is adjusted... + */ + ASSERT(origin_head->ds_quota == 0 || + dsl_dataset_phys(clone)->ds_unique_bytes <= origin_head->ds_quota + + DMU_MAX_ACCESS * spa_asize_inflation); + ASSERT3P(clone->ds_prev, ==, origin_head->ds_prev); + + /* + * Swap per-dataset feature flags.
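 + * For example, if only the clone has large_blocks active, then + * after the swap only origin_head must have it active, because + * the flag has to follow the dataset contents being exchanged + * below.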
+ */ + for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { + if (!(spa_feature_table[f].fi_flags & + ZFEATURE_FLAG_PER_DATASET)) { + ASSERT(!clone->ds_feature_inuse[f]); + ASSERT(!origin_head->ds_feature_inuse[f]); + continue; + } + + boolean_t clone_inuse = clone->ds_feature_inuse[f]; + boolean_t origin_head_inuse = origin_head->ds_feature_inuse[f]; + + if (clone_inuse) { + dsl_dataset_deactivate_feature(clone->ds_object, f, tx); + clone->ds_feature_inuse[f] = B_FALSE; + } + if (origin_head_inuse) { + dsl_dataset_deactivate_feature(origin_head->ds_object, + f, tx); + origin_head->ds_feature_inuse[f] = B_FALSE; + } + if (clone_inuse) { + dsl_dataset_activate_feature(origin_head->ds_object, + f, tx); + origin_head->ds_feature_inuse[f] = B_TRUE; + } + if (origin_head_inuse) { + dsl_dataset_activate_feature(clone->ds_object, f, tx); + clone->ds_feature_inuse[f] = B_TRUE; + } + } + + dmu_buf_will_dirty(clone->ds_dbuf, tx); + dmu_buf_will_dirty(origin_head->ds_dbuf, tx); + + if (clone->ds_objset != NULL) { + dmu_objset_evict(clone->ds_objset); + clone->ds_objset = NULL; + } + + if (origin_head->ds_objset != NULL) { + dmu_objset_evict(origin_head->ds_objset); + origin_head->ds_objset = NULL; + } + + unused_refres_delta = + (int64_t)MIN(origin_head->ds_reserved, + dsl_dataset_phys(origin_head)->ds_unique_bytes) - + (int64_t)MIN(origin_head->ds_reserved, + dsl_dataset_phys(clone)->ds_unique_bytes); + + /* + * Reset origin's unique bytes, if it exists. + */ + if (clone->ds_prev) { + dsl_dataset_t *origin = clone->ds_prev; + uint64_t comp, uncomp; + + dmu_buf_will_dirty(origin->ds_dbuf, tx); + dsl_deadlist_space_range(&clone->ds_deadlist, + dsl_dataset_phys(origin)->ds_prev_snap_txg, UINT64_MAX, + &dsl_dataset_phys(origin)->ds_unique_bytes, &comp, &uncomp); + } + + /* swap blkptrs */ + { + rrw_enter(&clone->ds_bp_rwlock, RW_WRITER, FTAG); + rrw_enter(&origin_head->ds_bp_rwlock, RW_WRITER, FTAG); + blkptr_t tmp; + tmp = dsl_dataset_phys(origin_head)->ds_bp; + dsl_dataset_phys(origin_head)->ds_bp = + dsl_dataset_phys(clone)->ds_bp; + dsl_dataset_phys(clone)->ds_bp = tmp; + rrw_exit(&origin_head->ds_bp_rwlock, FTAG); + rrw_exit(&clone->ds_bp_rwlock, FTAG); + } + + /* set dd_*_bytes */ + { + int64_t dused, dcomp, duncomp; + uint64_t cdl_used, cdl_comp, cdl_uncomp; + uint64_t odl_used, odl_comp, odl_uncomp; + + ASSERT3U(dsl_dir_phys(clone->ds_dir)-> + dd_used_breakdown[DD_USED_SNAP], ==, 0); + + dsl_deadlist_space(&clone->ds_deadlist, + &cdl_used, &cdl_comp, &cdl_uncomp); + dsl_deadlist_space(&origin_head->ds_deadlist, + &odl_used, &odl_comp, &odl_uncomp); + + dused = dsl_dataset_phys(clone)->ds_referenced_bytes + + cdl_used - + (dsl_dataset_phys(origin_head)->ds_referenced_bytes + + odl_used); + dcomp = dsl_dataset_phys(clone)->ds_compressed_bytes + + cdl_comp - + (dsl_dataset_phys(origin_head)->ds_compressed_bytes + + odl_comp); + duncomp = dsl_dataset_phys(clone)->ds_uncompressed_bytes + + cdl_uncomp - + (dsl_dataset_phys(origin_head)->ds_uncompressed_bytes + + odl_uncomp); + + dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_HEAD, + dused, dcomp, duncomp, tx); + dsl_dir_diduse_space(clone->ds_dir, DD_USED_HEAD, + -dused, -dcomp, -duncomp, tx); + + /* + * The difference in the space used by snapshots is the + * difference in snapshot space due to the head's + * deadlist (since that's the only thing that's + * changing that affects the snapused). 
+ */ + dsl_deadlist_space_range(&clone->ds_deadlist, + origin_head->ds_dir->dd_origin_txg, UINT64_MAX, + &cdl_used, &cdl_comp, &cdl_uncomp); + dsl_deadlist_space_range(&origin_head->ds_deadlist, + origin_head->ds_dir->dd_origin_txg, UINT64_MAX, + &odl_used, &odl_comp, &odl_uncomp); + dsl_dir_transfer_space(origin_head->ds_dir, cdl_used - odl_used, + DD_USED_HEAD, DD_USED_SNAP, NULL); + } + + /* swap ds_*_bytes */ + SWITCH64(dsl_dataset_phys(origin_head)->ds_referenced_bytes, + dsl_dataset_phys(clone)->ds_referenced_bytes); + SWITCH64(dsl_dataset_phys(origin_head)->ds_compressed_bytes, + dsl_dataset_phys(clone)->ds_compressed_bytes); + SWITCH64(dsl_dataset_phys(origin_head)->ds_uncompressed_bytes, + dsl_dataset_phys(clone)->ds_uncompressed_bytes); + SWITCH64(dsl_dataset_phys(origin_head)->ds_unique_bytes, + dsl_dataset_phys(clone)->ds_unique_bytes); + + /* apply any parent delta for change in unconsumed refreservation */ + dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_REFRSRV, + unused_refres_delta, 0, 0, tx); + + /* + * Swap deadlists. + */ + dsl_deadlist_close(&clone->ds_deadlist); + dsl_deadlist_close(&origin_head->ds_deadlist); + SWITCH64(dsl_dataset_phys(origin_head)->ds_deadlist_obj, + dsl_dataset_phys(clone)->ds_deadlist_obj); + dsl_deadlist_open(&clone->ds_deadlist, dp->dp_meta_objset, + dsl_dataset_phys(clone)->ds_deadlist_obj); + dsl_deadlist_open(&origin_head->ds_deadlist, dp->dp_meta_objset, + dsl_dataset_phys(origin_head)->ds_deadlist_obj); + dsl_dataset_swap_remap_deadlists(clone, origin_head, tx); + + dsl_scan_ds_clone_swapped(origin_head, clone, tx); + + spa_history_log_internal_ds(clone, "clone swap", tx, + "parent=%s", origin_head->ds_dir->dd_myname); +} + +/* + * Given a pool name and a dataset object number in that pool, + * return the name of that dataset. + */ +int +dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf) +{ + dsl_pool_t *dp; + dsl_dataset_t *ds; + int error; + + error = dsl_pool_hold(pname, FTAG, &dp); + if (error != 0) + return (error); + + error = dsl_dataset_hold_obj(dp, obj, FTAG, &ds); + if (error == 0) { + dsl_dataset_name(ds, buf); + dsl_dataset_rele(ds, FTAG); + } + dsl_pool_rele(dp, FTAG); + + return (error); +} + +int +dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota, + uint64_t asize, uint64_t inflight, uint64_t *used, uint64_t *ref_rsrv) +{ + int error = 0; + + ASSERT3S(asize, >, 0); + + /* + * *ref_rsrv is the portion of asize that will come from any + * unconsumed refreservation space. + */ + *ref_rsrv = 0; + + mutex_enter(&ds->ds_lock); + /* + * Make a space adjustment for reserved bytes. + */ + if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes) { + ASSERT3U(*used, >=, + ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes); + *used -= + (ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes); + *ref_rsrv = + asize - MIN(asize, parent_delta(ds, asize + inflight)); + } + + if (!check_quota || ds->ds_quota == 0) { + mutex_exit(&ds->ds_lock); + return (0); + } + /* + * If they are requesting more space, and our current estimate + * is over quota, they get to try again unless the actual + * on-disk is over quota and there are no pending changes (which + * may free up space for us). 
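 + * For example (hypothetical numbers), with refquota=10G and 10G + * already referenced, a write returns ERESTART while changes are + * still in flight, and EDQUOT only once nothing is pending.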
+ */ + if (dsl_dataset_phys(ds)->ds_referenced_bytes + inflight >= + ds->ds_quota) { + if (inflight > 0 || + dsl_dataset_phys(ds)->ds_referenced_bytes < ds->ds_quota) + error = SET_ERROR(ERESTART); + else + error = SET_ERROR(EDQUOT); + } + mutex_exit(&ds->ds_lock); + + return (error); +} + +typedef struct dsl_dataset_set_qr_arg { + const char *ddsqra_name; + zprop_source_t ddsqra_source; + uint64_t ddsqra_value; +} dsl_dataset_set_qr_arg_t; + + +/* ARGSUSED */ +static int +dsl_dataset_set_refquota_check(void *arg, dmu_tx_t *tx) +{ + dsl_dataset_set_qr_arg_t *ddsqra = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + int error; + uint64_t newval; + + if (spa_version(dp->dp_spa) < SPA_VERSION_REFQUOTA) + return (SET_ERROR(ENOTSUP)); + + error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds); + if (error != 0) + return (error); + + if (ds->ds_is_snapshot) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EINVAL)); + } + + error = dsl_prop_predict(ds->ds_dir, + zfs_prop_to_name(ZFS_PROP_REFQUOTA), + ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval); + if (error != 0) { + dsl_dataset_rele(ds, FTAG); + return (error); + } + + if (newval == 0) { + dsl_dataset_rele(ds, FTAG); + return (0); + } + + if (newval < dsl_dataset_phys(ds)->ds_referenced_bytes || + newval < ds->ds_reserved) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(ENOSPC)); + } + + dsl_dataset_rele(ds, FTAG); + return (0); +} + +static void +dsl_dataset_set_refquota_sync(void *arg, dmu_tx_t *tx) +{ + dsl_dataset_set_qr_arg_t *ddsqra = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + uint64_t newval; + + VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds)); + + dsl_prop_set_sync_impl(ds, + zfs_prop_to_name(ZFS_PROP_REFQUOTA), + ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1, + &ddsqra->ddsqra_value, tx); + + VERIFY0(dsl_prop_get_int_ds(ds, + zfs_prop_to_name(ZFS_PROP_REFQUOTA), &newval)); + + if (ds->ds_quota != newval) { + dmu_buf_will_dirty(ds->ds_dbuf, tx); + ds->ds_quota = newval; + } + dsl_dataset_rele(ds, FTAG); +} + +int +dsl_dataset_set_refquota(const char *dsname, zprop_source_t source, + uint64_t refquota) +{ + dsl_dataset_set_qr_arg_t ddsqra; + + ddsqra.ddsqra_name = dsname; + ddsqra.ddsqra_source = source; + ddsqra.ddsqra_value = refquota; + + return (dsl_sync_task(dsname, dsl_dataset_set_refquota_check, + dsl_dataset_set_refquota_sync, &ddsqra, 0, + ZFS_SPACE_CHECK_EXTRA_RESERVED)); +} + +static int +dsl_dataset_set_refreservation_check(void *arg, dmu_tx_t *tx) +{ + dsl_dataset_set_qr_arg_t *ddsqra = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + int error; + uint64_t newval, unique; + + if (spa_version(dp->dp_spa) < SPA_VERSION_REFRESERVATION) + return (SET_ERROR(ENOTSUP)); + + error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds); + if (error != 0) + return (error); + + if (ds->ds_is_snapshot) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EINVAL)); + } + + error = dsl_prop_predict(ds->ds_dir, + zfs_prop_to_name(ZFS_PROP_REFRESERVATION), + ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval); + if (error != 0) { + dsl_dataset_rele(ds, FTAG); + return (error); + } + + /* + * If we are doing the preliminary check in open context, the + * space estimates may be inaccurate. 
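 + * Just pass the open-context check; the syncing-context run of + * this function repeats the check below with accurate values.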
+ */ + if (!dmu_tx_is_syncing(tx)) { + dsl_dataset_rele(ds, FTAG); + return (0); + } + + mutex_enter(&ds->ds_lock); + if (!DS_UNIQUE_IS_ACCURATE(ds)) + dsl_dataset_recalc_head_uniq(ds); + unique = dsl_dataset_phys(ds)->ds_unique_bytes; + mutex_exit(&ds->ds_lock); + + if (MAX(unique, newval) > MAX(unique, ds->ds_reserved)) { + uint64_t delta = MAX(unique, newval) - + MAX(unique, ds->ds_reserved); + + if (delta > + dsl_dir_space_available(ds->ds_dir, NULL, 0, B_TRUE) || + (ds->ds_quota > 0 && newval > ds->ds_quota)) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(ENOSPC)); + } + } + + dsl_dataset_rele(ds, FTAG); + return (0); +} + +void +dsl_dataset_set_refreservation_sync_impl(dsl_dataset_t *ds, + zprop_source_t source, uint64_t value, dmu_tx_t *tx) +{ + uint64_t newval; + uint64_t unique; + int64_t delta; + + dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_REFRESERVATION), + source, sizeof (value), 1, &value, tx); + + VERIFY0(dsl_prop_get_int_ds(ds, + zfs_prop_to_name(ZFS_PROP_REFRESERVATION), &newval)); + + dmu_buf_will_dirty(ds->ds_dbuf, tx); + mutex_enter(&ds->ds_dir->dd_lock); + mutex_enter(&ds->ds_lock); + ASSERT(DS_UNIQUE_IS_ACCURATE(ds)); + unique = dsl_dataset_phys(ds)->ds_unique_bytes; + delta = MAX(0, (int64_t)(newval - unique)) - + MAX(0, (int64_t)(ds->ds_reserved - unique)); + ds->ds_reserved = newval; + mutex_exit(&ds->ds_lock); + + dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx); + mutex_exit(&ds->ds_dir->dd_lock); +} + +static void +dsl_dataset_set_refreservation_sync(void *arg, dmu_tx_t *tx) +{ + dsl_dataset_set_qr_arg_t *ddsqra = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + + VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds)); + dsl_dataset_set_refreservation_sync_impl(ds, + ddsqra->ddsqra_source, ddsqra->ddsqra_value, tx); + dsl_dataset_rele(ds, FTAG); +} + +int +dsl_dataset_set_refreservation(const char *dsname, zprop_source_t source, + uint64_t refreservation) +{ + dsl_dataset_set_qr_arg_t ddsqra; + + ddsqra.ddsqra_name = dsname; + ddsqra.ddsqra_source = source; + ddsqra.ddsqra_value = refreservation; + + return (dsl_sync_task(dsname, dsl_dataset_set_refreservation_check, + dsl_dataset_set_refreservation_sync, &ddsqra, 0, + ZFS_SPACE_CHECK_EXTRA_RESERVED)); +} + +/* + * Return (in *usedp) the amount of space written in new that is not + * present in oldsnap. New may be a snapshot or the head. Old must be + * a snapshot before new, in new's filesystem (or its origin). If not then + * fail and return EINVAL. + * + * The written space is calculated by considering two components: First, we + * ignore any freed space, and calculate the written as new's used space + * minus old's used space. Next, we add in the amount of space that was freed + * between the two snapshots, thus reducing new's used space relative to old's. + * Specifically, this is the space that was born before old->ds_creation_txg, + * and freed before new (ie. on new's deadlist or a previous deadlist). 
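 + * For example (made-up numbers): if new references 12G, oldsnap + * referenced 10G, and 3G of blocks born before oldsnap were + * freed between them, then written = 12G - 10G + 3G = 5G.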
+ * + * space freed [---------------------] + * snapshots ---O-------O--------O-------O------ + * oldsnap new + */ +int +dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new, + uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) +{ + int err = 0; + uint64_t snapobj; + dsl_pool_t *dp = new->ds_dir->dd_pool; + + ASSERT(dsl_pool_config_held(dp)); + + *usedp = 0; + *usedp += dsl_dataset_phys(new)->ds_referenced_bytes; + *usedp -= dsl_dataset_phys(oldsnap)->ds_referenced_bytes; + + *compp = 0; + *compp += dsl_dataset_phys(new)->ds_compressed_bytes; + *compp -= dsl_dataset_phys(oldsnap)->ds_compressed_bytes; + + *uncompp = 0; + *uncompp += dsl_dataset_phys(new)->ds_uncompressed_bytes; + *uncompp -= dsl_dataset_phys(oldsnap)->ds_uncompressed_bytes; + + snapobj = new->ds_object; + while (snapobj != oldsnap->ds_object) { + dsl_dataset_t *snap; + uint64_t used, comp, uncomp; + + if (snapobj == new->ds_object) { + snap = new; + } else { + err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap); + if (err != 0) + break; + } + + if (dsl_dataset_phys(snap)->ds_prev_snap_txg == + dsl_dataset_phys(oldsnap)->ds_creation_txg) { + /* + * The blocks in the deadlist can not be born after + * ds_prev_snap_txg, so get the whole deadlist space, + * which is more efficient (especially for old-format + * deadlists). Unfortunately the deadlist code + * doesn't have enough information to make this + * optimization itself. + */ + dsl_deadlist_space(&snap->ds_deadlist, + &used, &comp, &uncomp); + } else { + dsl_deadlist_space_range(&snap->ds_deadlist, + 0, dsl_dataset_phys(oldsnap)->ds_creation_txg, + &used, &comp, &uncomp); + } + *usedp += used; + *compp += comp; + *uncompp += uncomp; + + /* + * If we get to the beginning of the chain of snapshots + * (ds_prev_snap_obj == 0) before oldsnap, then oldsnap + * was not a snapshot of/before new. + */ + snapobj = dsl_dataset_phys(snap)->ds_prev_snap_obj; + if (snap != new) + dsl_dataset_rele(snap, FTAG); + if (snapobj == 0) { + err = SET_ERROR(EINVAL); + break; + } + + } + return (err); +} + +/* + * Return (in *usedp) the amount of space that will be reclaimed if firstsnap, + * lastsnap, and all snapshots in between are deleted. + * + * blocks that would be freed [---------------------------] + * snapshots ---O-------O--------O-------O--------O + * firstsnap lastsnap + * + * This is the set of blocks that were born after the snap before firstsnap, + * (birth > firstsnap->prev_snap_txg) and died before the snap after the + * last snap (ie, is on lastsnap->ds_next->ds_deadlist or an earlier deadlist). + * We calculate this by iterating over the relevant deadlists (from the snap + * after lastsnap, backward to the snap after firstsnap), summing up the + * space on the deadlist that was born after the snap before firstsnap. + */ +int +dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap, + dsl_dataset_t *lastsnap, + uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) +{ + int err = 0; + uint64_t snapobj; + dsl_pool_t *dp = firstsnap->ds_dir->dd_pool; + + ASSERT(firstsnap->ds_is_snapshot); + ASSERT(lastsnap->ds_is_snapshot); + + /* + * Check that the snapshots are in the same dsl_dir, and firstsnap + * is before lastsnap. 
+ */ + if (firstsnap->ds_dir != lastsnap->ds_dir || + dsl_dataset_phys(firstsnap)->ds_creation_txg > + dsl_dataset_phys(lastsnap)->ds_creation_txg) + return (SET_ERROR(EINVAL)); + + *usedp = *compp = *uncompp = 0; + + snapobj = dsl_dataset_phys(lastsnap)->ds_next_snap_obj; + while (snapobj != firstsnap->ds_object) { + dsl_dataset_t *ds; + uint64_t used, comp, uncomp; + + err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &ds); + if (err != 0) + break; + + dsl_deadlist_space_range(&ds->ds_deadlist, + dsl_dataset_phys(firstsnap)->ds_prev_snap_txg, UINT64_MAX, + &used, &comp, &uncomp); + *usedp += used; + *compp += comp; + *uncompp += uncomp; + + snapobj = dsl_dataset_phys(ds)->ds_prev_snap_obj; + ASSERT3U(snapobj, !=, 0); + dsl_dataset_rele(ds, FTAG); + } + return (err); +} + +/* + * Return TRUE if 'earlier' is an earlier snapshot in 'later's timeline. + * For example, they could both be snapshots of the same filesystem, and + * 'earlier' is before 'later'. Or 'earlier' could be the origin of + * 'later's filesystem. Or 'earlier' could be an older snapshot in the origin's + * filesystem. Or 'earlier' could be the origin's origin. + * + * If non-zero, earlier_txg is used instead of earlier's ds_creation_txg. + */ +boolean_t +dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier, + uint64_t earlier_txg) +{ + dsl_pool_t *dp = later->ds_dir->dd_pool; + int error; + boolean_t ret; + + ASSERT(dsl_pool_config_held(dp)); + ASSERT(earlier->ds_is_snapshot || earlier_txg != 0); + + if (earlier_txg == 0) + earlier_txg = dsl_dataset_phys(earlier)->ds_creation_txg; + + if (later->ds_is_snapshot && + earlier_txg >= dsl_dataset_phys(later)->ds_creation_txg) + return (B_FALSE); + + if (later->ds_dir == earlier->ds_dir) + return (B_TRUE); + if (!dsl_dir_is_clone(later->ds_dir)) + return (B_FALSE); + + if (dsl_dir_phys(later->ds_dir)->dd_origin_obj == earlier->ds_object) + return (B_TRUE); + dsl_dataset_t *origin; + error = dsl_dataset_hold_obj(dp, + dsl_dir_phys(later->ds_dir)->dd_origin_obj, FTAG, &origin); + if (error != 0) + return (B_FALSE); + ret = dsl_dataset_is_before(origin, earlier, earlier_txg); + dsl_dataset_rele(origin, FTAG); + return (ret); +} + +void +dsl_dataset_zapify(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; + dmu_object_zapify(mos, ds->ds_object, DMU_OT_DSL_DATASET, tx); +} + +boolean_t +dsl_dataset_is_zapified(dsl_dataset_t *ds) +{ + dmu_object_info_t doi; + + dmu_object_info_from_db(ds->ds_dbuf, &doi); + return (doi.doi_type == DMU_OTN_ZAP_METADATA); +} + +boolean_t +dsl_dataset_has_resume_receive_state(dsl_dataset_t *ds) +{ + return (dsl_dataset_is_zapified(ds) && + zap_contains(ds->ds_dir->dd_pool->dp_meta_objset, + ds->ds_object, DS_FIELD_RESUME_TOGUID) == 0); +} + +uint64_t +dsl_dataset_get_remap_deadlist_object(dsl_dataset_t *ds) +{ + uint64_t remap_deadlist_obj; + int err; + + if (!dsl_dataset_is_zapified(ds)) + return (0); + + err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, ds->ds_object, + DS_FIELD_REMAP_DEADLIST, sizeof (remap_deadlist_obj), 1, + &remap_deadlist_obj); + + if (err != 0) { + VERIFY3S(err, ==, ENOENT); + return (0); + } + + ASSERT(remap_deadlist_obj != 0); + return (remap_deadlist_obj); +} + +boolean_t +dsl_dataset_remap_deadlist_exists(dsl_dataset_t *ds) +{ + EQUIV(dsl_deadlist_is_open(&ds->ds_remap_deadlist), + dsl_dataset_get_remap_deadlist_object(ds) != 0); + return (dsl_deadlist_is_open(&ds->ds_remap_deadlist)); +} + +static void +dsl_dataset_set_remap_deadlist_object(dsl_dataset_t *ds, uint64_t 
obj, + dmu_tx_t *tx) +{ + ASSERT(obj != 0); + dsl_dataset_zapify(ds, tx); + VERIFY0(zap_add(ds->ds_dir->dd_pool->dp_meta_objset, ds->ds_object, + DS_FIELD_REMAP_DEADLIST, sizeof (obj), 1, &obj, tx)); +} + +static void +dsl_dataset_unset_remap_deadlist_object(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + VERIFY0(zap_remove(ds->ds_dir->dd_pool->dp_meta_objset, + ds->ds_object, DS_FIELD_REMAP_DEADLIST, tx)); +} + +void +dsl_dataset_destroy_remap_deadlist(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + uint64_t remap_deadlist_object; + spa_t *spa = ds->ds_dir->dd_pool->dp_spa; + + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(dsl_dataset_remap_deadlist_exists(ds)); + + remap_deadlist_object = ds->ds_remap_deadlist.dl_object; + dsl_deadlist_close(&ds->ds_remap_deadlist); + dsl_deadlist_free(spa_meta_objset(spa), remap_deadlist_object, tx); + dsl_dataset_unset_remap_deadlist_object(ds, tx); + spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); +} + +void +dsl_dataset_create_remap_deadlist(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + uint64_t remap_deadlist_obj; + spa_t *spa = ds->ds_dir->dd_pool->dp_spa; + + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(MUTEX_HELD(&ds->ds_remap_deadlist_lock)); + /* + * Currently we only create remap deadlists when there are indirect + * vdevs with referenced mappings. + */ + ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL)); + + remap_deadlist_obj = dsl_deadlist_clone( + &ds->ds_deadlist, UINT64_MAX, + dsl_dataset_phys(ds)->ds_prev_snap_obj, tx); + dsl_dataset_set_remap_deadlist_object(ds, + remap_deadlist_obj, tx); + dsl_deadlist_open(&ds->ds_remap_deadlist, spa_meta_objset(spa), + remap_deadlist_obj); + spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c new file mode 100644 index 000000000000..2f3647bc8e86 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c @@ -0,0 +1,561 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + */ + +#include <sys/dsl_dataset.h> +#include <sys/dmu.h> +#include <sys/refcount.h> +#include <sys/zap.h> +#include <sys/zfs_context.h> +#include <sys/dsl_pool.h> + +/* + * Deadlist concurrency: + * + * Deadlists can only be modified from the syncing thread. + * + * Except for dsl_deadlist_insert(), it can only be modified with the + * dp_config_rwlock held with RW_WRITER. 
+ * + * The accessors (dsl_deadlist_space() and dsl_deadlist_space_range()) can + * be called concurrently, from open context, with the dp_config_rwlock held + * with RW_READER. + * + * Therefore, we only need to provide locking between dsl_deadlist_insert() and + * the accessors, protecting: + * dl_phys->dl_used,comp,uncomp + * and protecting the dl_tree from being loaded. + * The locking is provided by dl_lock. Note that the bpobj_t + * provides its own locking, and dl_oldfmt is immutable. + */ + +static int +dsl_deadlist_compare(const void *arg1, const void *arg2) +{ + const dsl_deadlist_entry_t *dle1 = (const dsl_deadlist_entry_t *)arg1; + const dsl_deadlist_entry_t *dle2 = (const dsl_deadlist_entry_t *)arg2; + + return (AVL_CMP(dle1->dle_mintxg, dle2->dle_mintxg)); +} + +static void +dsl_deadlist_load_tree(dsl_deadlist_t *dl) +{ + zap_cursor_t zc; + zap_attribute_t za; + + ASSERT(MUTEX_HELD(&dl->dl_lock)); + + ASSERT(!dl->dl_oldfmt); + if (dl->dl_havetree) + return; + + avl_create(&dl->dl_tree, dsl_deadlist_compare, + sizeof (dsl_deadlist_entry_t), + offsetof(dsl_deadlist_entry_t, dle_node)); + for (zap_cursor_init(&zc, dl->dl_os, dl->dl_object); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + dsl_deadlist_entry_t *dle = kmem_alloc(sizeof (*dle), KM_SLEEP); + dle->dle_mintxg = zfs_strtonum(za.za_name, NULL); + VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, + za.za_first_integer)); + avl_add(&dl->dl_tree, dle); + } + zap_cursor_fini(&zc); + dl->dl_havetree = B_TRUE; +} + +void +dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object) +{ + dmu_object_info_t doi; + + ASSERT(!dsl_deadlist_is_open(dl)); + + mutex_init(&dl->dl_lock, NULL, MUTEX_DEFAULT, NULL); + dl->dl_os = os; + dl->dl_object = object; + VERIFY3U(0, ==, dmu_bonus_hold(os, object, dl, &dl->dl_dbuf)); + dmu_object_info_from_db(dl->dl_dbuf, &doi); + if (doi.doi_type == DMU_OT_BPOBJ) { + dmu_buf_rele(dl->dl_dbuf, dl); + dl->dl_dbuf = NULL; + dl->dl_oldfmt = B_TRUE; + VERIFY3U(0, ==, bpobj_open(&dl->dl_bpobj, os, object)); + return; + } + + dl->dl_oldfmt = B_FALSE; + dl->dl_phys = dl->dl_dbuf->db_data; + dl->dl_havetree = B_FALSE; +} + +boolean_t +dsl_deadlist_is_open(dsl_deadlist_t *dl) +{ + return (dl->dl_os != NULL); +} + +void +dsl_deadlist_close(dsl_deadlist_t *dl) +{ + void *cookie = NULL; + dsl_deadlist_entry_t *dle; + + ASSERT(dsl_deadlist_is_open(dl)); + + if (dl->dl_oldfmt) { + dl->dl_oldfmt = B_FALSE; + bpobj_close(&dl->dl_bpobj); + dl->dl_os = NULL; + dl->dl_object = 0; + return; + } + + if (dl->dl_havetree) { + while ((dle = avl_destroy_nodes(&dl->dl_tree, &cookie)) + != NULL) { + bpobj_close(&dle->dle_bpobj); + kmem_free(dle, sizeof (*dle)); + } + avl_destroy(&dl->dl_tree); + } + dmu_buf_rele(dl->dl_dbuf, dl); + mutex_destroy(&dl->dl_lock); + dl->dl_dbuf = NULL; + dl->dl_phys = NULL; + dl->dl_os = NULL; + dl->dl_object = 0; +} + +uint64_t +dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx) +{ + if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS) + return (bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx)); + return (zap_create(os, DMU_OT_DEADLIST, DMU_OT_DEADLIST_HDR, + sizeof (dsl_deadlist_phys_t), tx)); +} + +void +dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx) +{ + dmu_object_info_t doi; + zap_cursor_t zc; + zap_attribute_t za; + + VERIFY3U(0, ==, dmu_object_info(os, dlobj, &doi)); + if (doi.doi_type == DMU_OT_BPOBJ) { + bpobj_free(os, dlobj, tx); + return; + } + + for (zap_cursor_init(&zc, os, dlobj); + zap_cursor_retrieve(&zc, &za) == 0; +
zap_cursor_advance(&zc)) { + uint64_t obj = za.za_first_integer; + if (obj == dmu_objset_pool(os)->dp_empty_bpobj) + bpobj_decr_empty(os, tx); + else + bpobj_free(os, obj, tx); + } + zap_cursor_fini(&zc); + VERIFY3U(0, ==, dmu_object_free(os, dlobj, tx)); +} + +static void +dle_enqueue(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle, + const blkptr_t *bp, dmu_tx_t *tx) +{ + ASSERT(MUTEX_HELD(&dl->dl_lock)); + if (dle->dle_bpobj.bpo_object == + dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) { + uint64_t obj = bpobj_alloc(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx); + bpobj_close(&dle->dle_bpobj); + bpobj_decr_empty(dl->dl_os, tx); + VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); + VERIFY3U(0, ==, zap_update_int_key(dl->dl_os, dl->dl_object, + dle->dle_mintxg, obj, tx)); + } + bpobj_enqueue(&dle->dle_bpobj, bp, tx); +} + +static void +dle_enqueue_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle, + uint64_t obj, dmu_tx_t *tx) +{ + ASSERT(MUTEX_HELD(&dl->dl_lock)); + if (dle->dle_bpobj.bpo_object != + dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) { + bpobj_enqueue_subobj(&dle->dle_bpobj, obj, tx); + } else { + bpobj_close(&dle->dle_bpobj); + bpobj_decr_empty(dl->dl_os, tx); + VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); + VERIFY3U(0, ==, zap_update_int_key(dl->dl_os, dl->dl_object, + dle->dle_mintxg, obj, tx)); + } +} + +void +dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx) +{ + dsl_deadlist_entry_t dle_tofind; + dsl_deadlist_entry_t *dle; + avl_index_t where; + + if (dl->dl_oldfmt) { + bpobj_enqueue(&dl->dl_bpobj, bp, tx); + return; + } + + mutex_enter(&dl->dl_lock); + dsl_deadlist_load_tree(dl); + + dmu_buf_will_dirty(dl->dl_dbuf, tx); + dl->dl_phys->dl_used += + bp_get_dsize_sync(dmu_objset_spa(dl->dl_os), bp); + dl->dl_phys->dl_comp += BP_GET_PSIZE(bp); + dl->dl_phys->dl_uncomp += BP_GET_UCSIZE(bp); + + dle_tofind.dle_mintxg = bp->blk_birth; + dle = avl_find(&dl->dl_tree, &dle_tofind, &where); + if (dle == NULL) + dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE); + else + dle = AVL_PREV(&dl->dl_tree, dle); + dle_enqueue(dl, dle, bp, tx); + mutex_exit(&dl->dl_lock); +} + +/* + * Insert new key in deadlist, which must be > all current entries. + * mintxg is not inclusive. + */ +void +dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx) +{ + uint64_t obj; + dsl_deadlist_entry_t *dle; + + if (dl->dl_oldfmt) + return; + + dle = kmem_alloc(sizeof (*dle), KM_SLEEP); + dle->dle_mintxg = mintxg; + + mutex_enter(&dl->dl_lock); + dsl_deadlist_load_tree(dl); + + obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx); + VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); + avl_add(&dl->dl_tree, dle); + + VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, dl->dl_object, + mintxg, obj, tx)); + mutex_exit(&dl->dl_lock); +} + +/* + * Remove this key, merging its entries into the previous key. 
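 + * For example, with keys { 50, 100, 200 }, removing key 100 + * folds its bpobj into the entry for key 50, which then holds + * the dead blocks born in (50, 200].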
+ */ +void +dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx) +{ + dsl_deadlist_entry_t dle_tofind; + dsl_deadlist_entry_t *dle, *dle_prev; + + if (dl->dl_oldfmt) + return; + + mutex_enter(&dl->dl_lock); + dsl_deadlist_load_tree(dl); + + dle_tofind.dle_mintxg = mintxg; + dle = avl_find(&dl->dl_tree, &dle_tofind, NULL); + dle_prev = AVL_PREV(&dl->dl_tree, dle); + + dle_enqueue_subobj(dl, dle_prev, dle->dle_bpobj.bpo_object, tx); + + avl_remove(&dl->dl_tree, dle); + bpobj_close(&dle->dle_bpobj); + kmem_free(dle, sizeof (*dle)); + + VERIFY3U(0, ==, zap_remove_int(dl->dl_os, dl->dl_object, mintxg, tx)); + mutex_exit(&dl->dl_lock); +} + +/* + * Walk ds's snapshots to regenerate the ZAP & AVL. + */ +static void +dsl_deadlist_regenerate(objset_t *os, uint64_t dlobj, + uint64_t mrs_obj, dmu_tx_t *tx) +{ + dsl_deadlist_t dl = { 0 }; + dsl_pool_t *dp = dmu_objset_pool(os); + + dsl_deadlist_open(&dl, os, dlobj); + if (dl.dl_oldfmt) { + dsl_deadlist_close(&dl); + return; + } + + while (mrs_obj != 0) { + dsl_dataset_t *ds; + VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, mrs_obj, FTAG, &ds)); + dsl_deadlist_add_key(&dl, + dsl_dataset_phys(ds)->ds_prev_snap_txg, tx); + mrs_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; + dsl_dataset_rele(ds, FTAG); + } + dsl_deadlist_close(&dl); +} + +uint64_t +dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg, + uint64_t mrs_obj, dmu_tx_t *tx) +{ + dsl_deadlist_entry_t *dle; + uint64_t newobj; + + newobj = dsl_deadlist_alloc(dl->dl_os, tx); + + if (dl->dl_oldfmt) { + dsl_deadlist_regenerate(dl->dl_os, newobj, mrs_obj, tx); + return (newobj); + } + + mutex_enter(&dl->dl_lock); + dsl_deadlist_load_tree(dl); + + for (dle = avl_first(&dl->dl_tree); dle; + dle = AVL_NEXT(&dl->dl_tree, dle)) { + uint64_t obj; + + if (dle->dle_mintxg >= maxtxg) + break; + + obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx); + VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, newobj, + dle->dle_mintxg, obj, tx)); + } + mutex_exit(&dl->dl_lock); + return (newobj); +} + +void +dsl_deadlist_space(dsl_deadlist_t *dl, + uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) +{ + ASSERT(dsl_deadlist_is_open(dl)); + if (dl->dl_oldfmt) { + VERIFY3U(0, ==, bpobj_space(&dl->dl_bpobj, + usedp, compp, uncompp)); + return; + } + + mutex_enter(&dl->dl_lock); + *usedp = dl->dl_phys->dl_used; + *compp = dl->dl_phys->dl_comp; + *uncompp = dl->dl_phys->dl_uncomp; + mutex_exit(&dl->dl_lock); +} + +/* + * Return the space used in the range (mintxg, maxtxg]. + * Includes maxtxg, does not include mintxg. + * mintxg and maxtxg must both be keys in the deadlist (unless maxtxg is + * larger than any bp in the deadlist (eg. UINT64_MAX)). + */ +void +dsl_deadlist_space_range(dsl_deadlist_t *dl, uint64_t mintxg, uint64_t maxtxg, + uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) +{ + dsl_deadlist_entry_t *dle; + dsl_deadlist_entry_t dle_tofind; + avl_index_t where; + + if (dl->dl_oldfmt) { + VERIFY3U(0, ==, bpobj_space_range(&dl->dl_bpobj, + mintxg, maxtxg, usedp, compp, uncompp)); + return; + } + + *usedp = *compp = *uncompp = 0; + + mutex_enter(&dl->dl_lock); + dsl_deadlist_load_tree(dl); + dle_tofind.dle_mintxg = mintxg; + dle = avl_find(&dl->dl_tree, &dle_tofind, &where); + /* + * If we don't find this mintxg, there shouldn't be anything + * after it either.
+ */ + ASSERT(dle != NULL || + avl_nearest(&dl->dl_tree, where, AVL_AFTER) == NULL); + + for (; dle && dle->dle_mintxg < maxtxg; + dle = AVL_NEXT(&dl->dl_tree, dle)) { + uint64_t used, comp, uncomp; + + VERIFY3U(0, ==, bpobj_space(&dle->dle_bpobj, + &used, &comp, &uncomp)); + + *usedp += used; + *compp += comp; + *uncompp += uncomp; + } + mutex_exit(&dl->dl_lock); +} + +static void +dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth, + dmu_tx_t *tx) +{ + dsl_deadlist_entry_t dle_tofind; + dsl_deadlist_entry_t *dle; + avl_index_t where; + uint64_t used, comp, uncomp; + bpobj_t bpo; + + ASSERT(MUTEX_HELD(&dl->dl_lock)); + + VERIFY3U(0, ==, bpobj_open(&bpo, dl->dl_os, obj)); + VERIFY3U(0, ==, bpobj_space(&bpo, &used, &comp, &uncomp)); + bpobj_close(&bpo); + + dsl_deadlist_load_tree(dl); + + dmu_buf_will_dirty(dl->dl_dbuf, tx); + dl->dl_phys->dl_used += used; + dl->dl_phys->dl_comp += comp; + dl->dl_phys->dl_uncomp += uncomp; + + dle_tofind.dle_mintxg = birth; + dle = avl_find(&dl->dl_tree, &dle_tofind, &where); + if (dle == NULL) + dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE); + dle_enqueue_subobj(dl, dle, obj, tx); +} + +static int +dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +{ + dsl_deadlist_t *dl = arg; + dsl_deadlist_insert(dl, bp, tx); + return (0); +} + +/* + * Merge the deadlist pointed to by 'obj' into dl. obj will be left as + * an empty deadlist. + */ +void +dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx) +{ + zap_cursor_t zc; + zap_attribute_t za; + dmu_buf_t *bonus; + dsl_deadlist_phys_t *dlp; + dmu_object_info_t doi; + + VERIFY3U(0, ==, dmu_object_info(dl->dl_os, obj, &doi)); + if (doi.doi_type == DMU_OT_BPOBJ) { + bpobj_t bpo; + VERIFY3U(0, ==, bpobj_open(&bpo, dl->dl_os, obj)); + VERIFY3U(0, ==, bpobj_iterate(&bpo, + dsl_deadlist_insert_cb, dl, tx)); + bpobj_close(&bpo); + return; + } + + mutex_enter(&dl->dl_lock); + for (zap_cursor_init(&zc, dl->dl_os, obj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + uint64_t mintxg = zfs_strtonum(za.za_name, NULL); + dsl_deadlist_insert_bpobj(dl, za.za_first_integer, mintxg, tx); + VERIFY3U(0, ==, zap_remove_int(dl->dl_os, obj, mintxg, tx)); + } + zap_cursor_fini(&zc); + + VERIFY3U(0, ==, dmu_bonus_hold(dl->dl_os, obj, FTAG, &bonus)); + dlp = bonus->db_data; + dmu_buf_will_dirty(bonus, tx); + bzero(dlp, sizeof (*dlp)); + dmu_buf_rele(bonus, FTAG); + mutex_exit(&dl->dl_lock); +} + +/* + * Remove entries on dl that are >= mintxg, and put them on the bpobj. 
+ */ +void +dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg, + dmu_tx_t *tx) +{ + dsl_deadlist_entry_t dle_tofind; + dsl_deadlist_entry_t *dle; + avl_index_t where; + + ASSERT(!dl->dl_oldfmt); + + mutex_enter(&dl->dl_lock); + dmu_buf_will_dirty(dl->dl_dbuf, tx); + dsl_deadlist_load_tree(dl); + + dle_tofind.dle_mintxg = mintxg; + dle = avl_find(&dl->dl_tree, &dle_tofind, &where); + if (dle == NULL) + dle = avl_nearest(&dl->dl_tree, where, AVL_AFTER); + while (dle) { + uint64_t used, comp, uncomp; + dsl_deadlist_entry_t *dle_next; + + bpobj_enqueue_subobj(bpo, dle->dle_bpobj.bpo_object, tx); + + VERIFY3U(0, ==, bpobj_space(&dle->dle_bpobj, + &used, &comp, &uncomp)); + ASSERT3U(dl->dl_phys->dl_used, >=, used); + ASSERT3U(dl->dl_phys->dl_comp, >=, comp); + ASSERT3U(dl->dl_phys->dl_uncomp, >=, uncomp); + dl->dl_phys->dl_used -= used; + dl->dl_phys->dl_comp -= comp; + dl->dl_phys->dl_uncomp -= uncomp; + + VERIFY3U(0, ==, zap_remove_int(dl->dl_os, dl->dl_object, + dle->dle_mintxg, tx)); + + dle_next = AVL_NEXT(&dl->dl_tree, dle); + avl_remove(&dl->dl_tree, dle); + bpobj_close(&dle->dle_bpobj); + kmem_free(dle, sizeof (*dle)); + dle = dle_next; + } + mutex_exit(&dl->dl_lock); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c new file mode 100644 index 000000000000..0ad658f910ec --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c @@ -0,0 +1,760 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2015 by Delphix. All rights reserved. + */ + +/* + * DSL permissions are stored in a two level zap attribute + * mechanism. The first level identifies the "class" of + * entry. The class is identified by the first 2 letters of + * the attribute. The second letter "l" or "d" identifies whether + * it is a local or descendent permission. The first letter + * identifies the type of entry. + * + * ul$<id> identifies permissions granted locally for this userid. + * ud$<id> identifies permissions granted on descendent datasets for + * this userid. + * Ul$<id> identifies permission sets granted locally for this userid. + * Ud$<id> identifies permission sets granted on descendent datasets for + * this userid. + * gl$<id> identifies permissions granted locally for this groupid. + * gd$<id> identifies permissions granted on descendent datasets for + * this groupid. + * Gl$<id> identifies permission sets granted locally for this groupid. + * Gd$<id> identifies permission sets granted on descendent datasets for + * this groupid. + * el$ identifies permissions granted locally for everyone. 
+ * ed$ identifies permissions granted on descendent datasets + * for everyone. + * El$ identifies permission sets granted locally for everyone. + * Ed$ identifies permission sets granted to descendent datasets for + * everyone. + * c-$ identifies permission to create at dataset creation time. + * C-$ identifies permission sets to grant locally at dataset creation + * time. + * s-$@<name> permissions defined in specified set @<name> + * S-$@<name> Sets defined in named set @<name> + * + * Each of the above entities points to another zap attribute that contains one + * attribute for each allowed permission, such as create, destroy,... + * All of the "upper" case class types will specify permission set names + * rather than permissions. + * + * Basically it looks something like this: + * ul$12 -> ZAP OBJ -> permissions... + * + * The ZAP OBJ is referred to as the jump object. + */ + +#include <sys/dmu.h> +#include <sys/dmu_objset.h> +#include <sys/dmu_tx.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_prop.h> +#include <sys/dsl_synctask.h> +#include <sys/dsl_deleg.h> +#include <sys/spa.h> +#include <sys/zap.h> +#include <sys/fs/zfs.h> +#include <sys/cred.h> +#include <sys/sunddi.h> + +#include "zfs_deleg.h" + +/* + * Validate that user is allowed to delegate specified permissions. + * + * In order to delegate "create" you must have "create" + * and "allow". + */ +int +dsl_deleg_can_allow(char *ddname, nvlist_t *nvp, cred_t *cr) +{ + nvpair_t *whopair = NULL; + int error; + + if ((error = dsl_deleg_access(ddname, ZFS_DELEG_PERM_ALLOW, cr)) != 0) + return (error); + + while (whopair = nvlist_next_nvpair(nvp, whopair)) { + nvlist_t *perms; + nvpair_t *permpair = NULL; + + VERIFY(nvpair_value_nvlist(whopair, &perms) == 0); + + while (permpair = nvlist_next_nvpair(perms, permpair)) { + const char *perm = nvpair_name(permpair); + + if (strcmp(perm, ZFS_DELEG_PERM_ALLOW) == 0) + return (SET_ERROR(EPERM)); + + if ((error = dsl_deleg_access(ddname, perm, cr)) != 0) + return (error); + } + } + return (0); +} + +/* + * Validate that user is allowed to unallow specified permissions. They + * must have the 'allow' permission, and even then can only unallow + * perms for their uid. 
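+ *
+ * A whokey is laid out as "<type><l|d>$<id>", so nvpair_name()[0] is the
+ * who type and the id string begins at offset 3.  E.g. a caller with
+ * uid 1001 may only pass entries such as "ul$1001" or "Ul$1001"; an
+ * entry for any other uid, for a group, or for everyone fails with
+ * EPERM.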
+ */ +int +dsl_deleg_can_unallow(char *ddname, nvlist_t *nvp, cred_t *cr) +{ + nvpair_t *whopair = NULL; + int error; + char idstr[32]; + + if ((error = dsl_deleg_access(ddname, ZFS_DELEG_PERM_ALLOW, cr)) != 0) + return (error); + + (void) snprintf(idstr, sizeof (idstr), "%lld", + (longlong_t)crgetuid(cr)); + + while (whopair = nvlist_next_nvpair(nvp, whopair)) { + zfs_deleg_who_type_t type = nvpair_name(whopair)[0]; + + if (type != ZFS_DELEG_USER && + type != ZFS_DELEG_USER_SETS) + return (SET_ERROR(EPERM)); + + if (strcmp(idstr, &nvpair_name(whopair)[3]) != 0) + return (SET_ERROR(EPERM)); + } + return (0); +} + +typedef struct dsl_deleg_arg { + const char *dda_name; + nvlist_t *dda_nvlist; +} dsl_deleg_arg_t; + +static void +dsl_deleg_set_sync(void *arg, dmu_tx_t *tx) +{ + dsl_deleg_arg_t *dda = arg; + dsl_dir_t *dd; + dsl_pool_t *dp = dmu_tx_pool(tx); + objset_t *mos = dp->dp_meta_objset; + nvpair_t *whopair = NULL; + uint64_t zapobj; + + VERIFY0(dsl_dir_hold(dp, dda->dda_name, FTAG, &dd, NULL)); + + zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj; + if (zapobj == 0) { + dmu_buf_will_dirty(dd->dd_dbuf, tx); + zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj = zap_create(mos, + DMU_OT_DSL_PERMS, DMU_OT_NONE, 0, tx); + } + + while (whopair = nvlist_next_nvpair(dda->dda_nvlist, whopair)) { + const char *whokey = nvpair_name(whopair); + nvlist_t *perms; + nvpair_t *permpair = NULL; + uint64_t jumpobj; + + perms = fnvpair_value_nvlist(whopair); + + if (zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj) != 0) { + jumpobj = zap_create_link(mos, DMU_OT_DSL_PERMS, + zapobj, whokey, tx); + } + + while (permpair = nvlist_next_nvpair(perms, permpair)) { + const char *perm = nvpair_name(permpair); + uint64_t n = 0; + + VERIFY(zap_update(mos, jumpobj, + perm, 8, 1, &n, tx) == 0); + spa_history_log_internal_dd(dd, "permission update", tx, + "%s %s", whokey, perm); + } + } + dsl_dir_rele(dd, FTAG); +} + +static void +dsl_deleg_unset_sync(void *arg, dmu_tx_t *tx) +{ + dsl_deleg_arg_t *dda = arg; + dsl_dir_t *dd; + dsl_pool_t *dp = dmu_tx_pool(tx); + objset_t *mos = dp->dp_meta_objset; + nvpair_t *whopair = NULL; + uint64_t zapobj; + + VERIFY0(dsl_dir_hold(dp, dda->dda_name, FTAG, &dd, NULL)); + zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj; + if (zapobj == 0) { + dsl_dir_rele(dd, FTAG); + return; + } + + while (whopair = nvlist_next_nvpair(dda->dda_nvlist, whopair)) { + const char *whokey = nvpair_name(whopair); + nvlist_t *perms; + nvpair_t *permpair = NULL; + uint64_t jumpobj; + + if (nvpair_value_nvlist(whopair, &perms) != 0) { + if (zap_lookup(mos, zapobj, whokey, 8, + 1, &jumpobj) == 0) { + (void) zap_remove(mos, zapobj, whokey, tx); + VERIFY(0 == zap_destroy(mos, jumpobj, tx)); + } + spa_history_log_internal_dd(dd, "permission who remove", + tx, "%s", whokey); + continue; + } + + if (zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj) != 0) + continue; + + while (permpair = nvlist_next_nvpair(perms, permpair)) { + const char *perm = nvpair_name(permpair); + uint64_t n = 0; + + (void) zap_remove(mos, jumpobj, perm, tx); + if (zap_count(mos, jumpobj, &n) == 0 && n == 0) { + (void) zap_remove(mos, zapobj, + whokey, tx); + VERIFY(0 == zap_destroy(mos, + jumpobj, tx)); + } + spa_history_log_internal_dd(dd, "permission remove", tx, + "%s %s", whokey, perm); + } + } + dsl_dir_rele(dd, FTAG); +} + +static int +dsl_deleg_check(void *arg, dmu_tx_t *tx) +{ + dsl_deleg_arg_t *dda = arg; + dsl_dir_t *dd; + int error; + + if (spa_version(dmu_tx_pool(tx)->dp_spa) < + SPA_VERSION_DELEGATED_PERMS) { + return (SET_ERROR(ENOTSUP)); + } 
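+
+	/* Otherwise just verify that the target dir exists. */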
+ + error = dsl_dir_hold(dmu_tx_pool(tx), dda->dda_name, FTAG, &dd, NULL); + if (error == 0) + dsl_dir_rele(dd, FTAG); + return (error); +} + +int +dsl_deleg_set(const char *ddname, nvlist_t *nvp, boolean_t unset) +{ + dsl_deleg_arg_t dda; + + /* nvp must already have been verified to be valid */ + + dda.dda_name = ddname; + dda.dda_nvlist = nvp; + + return (dsl_sync_task(ddname, dsl_deleg_check, + unset ? dsl_deleg_unset_sync : dsl_deleg_set_sync, + &dda, fnvlist_num_pairs(nvp), ZFS_SPACE_CHECK_RESERVED)); +} + +/* + * Find all 'allow' permissions from a given point and then continue + * traversing up to the root. + * + * This function constructs an nvlist of nvlists. + * each setpoint is an nvlist composed of an nvlist of an nvlist + * of the individual * users/groups/everyone/create + * permissions. + * + * The nvlist will look like this. + * + * { source fsname -> { whokeys { permissions,...}, ...}} + * + * The fsname nvpairs will be arranged in a bottom up order. For example, + * if we have the following structure a/b/c then the nvpairs for the fsnames + * will be ordered a/b/c, a/b, a. + */ +int +dsl_deleg_get(const char *ddname, nvlist_t **nvp) +{ + dsl_dir_t *dd, *startdd; + dsl_pool_t *dp; + int error; + objset_t *mos; + + error = dsl_pool_hold(ddname, FTAG, &dp); + if (error != 0) + return (error); + + error = dsl_dir_hold(dp, ddname, FTAG, &startdd, NULL); + if (error != 0) { + dsl_pool_rele(dp, FTAG); + return (error); + } + + dp = startdd->dd_pool; + mos = dp->dp_meta_objset; + + VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + for (dd = startdd; dd != NULL; dd = dd->dd_parent) { + zap_cursor_t basezc; + zap_attribute_t baseza; + nvlist_t *sp_nvp; + uint64_t n; + char source[ZFS_MAX_DATASET_NAME_LEN]; + + if (dsl_dir_phys(dd)->dd_deleg_zapobj == 0 || + zap_count(mos, + dsl_dir_phys(dd)->dd_deleg_zapobj, &n) != 0 || n == 0) + continue; + + sp_nvp = fnvlist_alloc(); + for (zap_cursor_init(&basezc, mos, + dsl_dir_phys(dd)->dd_deleg_zapobj); + zap_cursor_retrieve(&basezc, &baseza) == 0; + zap_cursor_advance(&basezc)) { + zap_cursor_t zc; + zap_attribute_t za; + nvlist_t *perms_nvp; + + ASSERT(baseza.za_integer_length == 8); + ASSERT(baseza.za_num_integers == 1); + + perms_nvp = fnvlist_alloc(); + for (zap_cursor_init(&zc, mos, baseza.za_first_integer); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + fnvlist_add_boolean(perms_nvp, za.za_name); + } + zap_cursor_fini(&zc); + fnvlist_add_nvlist(sp_nvp, baseza.za_name, perms_nvp); + fnvlist_free(perms_nvp); + } + + zap_cursor_fini(&basezc); + + dsl_dir_name(dd, source); + fnvlist_add_nvlist(*nvp, source, sp_nvp); + nvlist_free(sp_nvp); + } + + dsl_dir_rele(startdd, FTAG); + dsl_pool_rele(dp, FTAG); + return (0); +} + +/* + * Routines for dsl_deleg_access() -- access checking. + */ +typedef struct perm_set { + avl_node_t p_node; + boolean_t p_matched; + char p_setname[ZFS_MAX_DELEG_NAME]; +} perm_set_t; + +static int +perm_set_compare(const void *arg1, const void *arg2) +{ + const perm_set_t *node1 = (const perm_set_t *)arg1; + const perm_set_t *node2 = (const perm_set_t *)arg2; + int val; + + val = strcmp(node1->p_setname, node2->p_setname); + + return (AVL_ISIGN(val)); +} + +/* + * Determine whether a specified permission exists. + * + * First the base attribute has to be retrieved. i.e. ul$12 + * Once the base object has been retrieved the actual permission + * is lookup up in the zap object the base object points to. 
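+ *
+ * E.g. checking whether uid 12 holds "destroy" locally means looking up
+ * the whokey "ul$12" in the delegation ZAP and then looking up the
+ * string "destroy" in the jump object it references.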
+ * + * Return 0 if permission exists, ENOENT if there is no whokey, EPERM if + * there is no perm in that jumpobj. + */ +static int +dsl_check_access(objset_t *mos, uint64_t zapobj, + char type, char checkflag, void *valp, const char *perm) +{ + int error; + uint64_t jumpobj, zero; + char whokey[ZFS_MAX_DELEG_NAME]; + + zfs_deleg_whokey(whokey, type, checkflag, valp); + error = zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj); + if (error == 0) { + error = zap_lookup(mos, jumpobj, perm, 8, 1, &zero); + if (error == ENOENT) + error = SET_ERROR(EPERM); + } + return (error); +} + +/* + * check a specified user/group for a requested permission + */ +static int +dsl_check_user_access(objset_t *mos, uint64_t zapobj, const char *perm, + int checkflag, cred_t *cr) +{ + const gid_t *gids; + int ngids; + int i; + uint64_t id; + + /* check for user */ + id = crgetuid(cr); + if (dsl_check_access(mos, zapobj, + ZFS_DELEG_USER, checkflag, &id, perm) == 0) + return (0); + + /* check for users primary group */ + id = crgetgid(cr); + if (dsl_check_access(mos, zapobj, + ZFS_DELEG_GROUP, checkflag, &id, perm) == 0) + return (0); + + /* check for everyone entry */ + id = -1; + if (dsl_check_access(mos, zapobj, + ZFS_DELEG_EVERYONE, checkflag, &id, perm) == 0) + return (0); + + /* check each supplemental group user is a member of */ + ngids = crgetngroups(cr); + gids = crgetgroups(cr); + for (i = 0; i != ngids; i++) { + id = gids[i]; + if (dsl_check_access(mos, zapobj, + ZFS_DELEG_GROUP, checkflag, &id, perm) == 0) + return (0); + } + + return (SET_ERROR(EPERM)); +} + +/* + * Iterate over the sets specified in the specified zapobj + * and load them into the permsets avl tree. + */ +static int +dsl_load_sets(objset_t *mos, uint64_t zapobj, + char type, char checkflag, void *valp, avl_tree_t *avl) +{ + zap_cursor_t zc; + zap_attribute_t za; + perm_set_t *permnode; + avl_index_t idx; + uint64_t jumpobj; + int error; + char whokey[ZFS_MAX_DELEG_NAME]; + + zfs_deleg_whokey(whokey, type, checkflag, valp); + + error = zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj); + if (error != 0) + return (error); + + for (zap_cursor_init(&zc, mos, jumpobj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + permnode = kmem_alloc(sizeof (perm_set_t), KM_SLEEP); + (void) strlcpy(permnode->p_setname, za.za_name, + sizeof (permnode->p_setname)); + permnode->p_matched = B_FALSE; + + if (avl_find(avl, permnode, &idx) == NULL) { + avl_insert(avl, permnode, idx); + } else { + kmem_free(permnode, sizeof (perm_set_t)); + } + } + zap_cursor_fini(&zc); + return (0); +} + +/* + * Load all permissions user based on cred belongs to. + */ +static void +dsl_load_user_sets(objset_t *mos, uint64_t zapobj, avl_tree_t *avl, + char checkflag, cred_t *cr) +{ + const gid_t *gids; + int ngids, i; + uint64_t id; + + id = crgetuid(cr); + (void) dsl_load_sets(mos, zapobj, + ZFS_DELEG_USER_SETS, checkflag, &id, avl); + + id = crgetgid(cr); + (void) dsl_load_sets(mos, zapobj, + ZFS_DELEG_GROUP_SETS, checkflag, &id, avl); + + (void) dsl_load_sets(mos, zapobj, + ZFS_DELEG_EVERYONE_SETS, checkflag, NULL, avl); + + ngids = crgetngroups(cr); + gids = crgetgroups(cr); + for (i = 0; i != ngids; i++) { + id = gids[i]; + (void) dsl_load_sets(mos, zapobj, + ZFS_DELEG_GROUP_SETS, checkflag, &id, avl); + } +} + +/* + * Check if user has requested permission. 
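+ *
+ * Starting at the dataset's dsl_dir and walking up to the root, each
+ * delegation ZAP is consulted: first any permission sets the cred
+ * belongs to (expanded repeatedly, since sets may reference other
+ * sets), then the uid, primary gid, everyone, and supplemental-group
+ * entries.  Local ("l") entries match only at the dataset itself;
+ * above it, and everywhere for snapshots, only descendent ("d")
+ * entries apply.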
+ */ +int +dsl_deleg_access_impl(dsl_dataset_t *ds, const char *perm, cred_t *cr) +{ + dsl_dir_t *dd; + dsl_pool_t *dp; + void *cookie; + int error; + char checkflag; + objset_t *mos; + avl_tree_t permsets; + perm_set_t *setnode; + + dp = ds->ds_dir->dd_pool; + mos = dp->dp_meta_objset; + + if (dsl_delegation_on(mos) == B_FALSE) + return (SET_ERROR(ECANCELED)); + + if (spa_version(dmu_objset_spa(dp->dp_meta_objset)) < + SPA_VERSION_DELEGATED_PERMS) + return (SET_ERROR(EPERM)); + + if (ds->ds_is_snapshot) { + /* + * Snapshots are treated as descendents only, + * local permissions do not apply. + */ + checkflag = ZFS_DELEG_DESCENDENT; + } else { + checkflag = ZFS_DELEG_LOCAL; + } + + avl_create(&permsets, perm_set_compare, sizeof (perm_set_t), + offsetof(perm_set_t, p_node)); + + ASSERT(dsl_pool_config_held(dp)); + for (dd = ds->ds_dir; dd != NULL; dd = dd->dd_parent, + checkflag = ZFS_DELEG_DESCENDENT) { + uint64_t zapobj; + boolean_t expanded; + + /* + * If not in global zone then make sure + * the zoned property is set + */ + if (!INGLOBALZONE(curthread)) { + uint64_t zoned; + + if (dsl_prop_get_dd(dd, + zfs_prop_to_name(ZFS_PROP_ZONED), + 8, 1, &zoned, NULL, B_FALSE) != 0) + break; + if (!zoned) + break; + } + zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj; + + if (zapobj == 0) + continue; + + dsl_load_user_sets(mos, zapobj, &permsets, checkflag, cr); +again: + expanded = B_FALSE; + for (setnode = avl_first(&permsets); setnode; + setnode = AVL_NEXT(&permsets, setnode)) { + if (setnode->p_matched == B_TRUE) + continue; + + /* See if this set directly grants this permission */ + error = dsl_check_access(mos, zapobj, + ZFS_DELEG_NAMED_SET, 0, setnode->p_setname, perm); + if (error == 0) + goto success; + if (error == EPERM) + setnode->p_matched = B_TRUE; + + /* See if this set includes other sets */ + error = dsl_load_sets(mos, zapobj, + ZFS_DELEG_NAMED_SET_SETS, 0, + setnode->p_setname, &permsets); + if (error == 0) + setnode->p_matched = expanded = B_TRUE; + } + /* + * If we expanded any sets, that will define more sets, + * which we need to check. + */ + if (expanded) + goto again; + + error = dsl_check_user_access(mos, zapobj, perm, checkflag, cr); + if (error == 0) + goto success; + } + error = SET_ERROR(EPERM); +success: + + cookie = NULL; + while ((setnode = avl_destroy_nodes(&permsets, &cookie)) != NULL) + kmem_free(setnode, sizeof (perm_set_t)); + + return (error); +} + +int +dsl_deleg_access(const char *dsname, const char *perm, cred_t *cr) +{ + dsl_pool_t *dp; + dsl_dataset_t *ds; + int error; + + error = dsl_pool_hold(dsname, FTAG, &dp); + if (error != 0) + return (error); + error = dsl_dataset_hold(dp, dsname, FTAG, &ds); + if (error == 0) { + error = dsl_deleg_access_impl(ds, perm, cr); + dsl_dataset_rele(ds, FTAG); + } + dsl_pool_rele(dp, FTAG); + + return (error); +} + +/* + * Other routines. + */ + +static void +copy_create_perms(dsl_dir_t *dd, uint64_t pzapobj, + boolean_t dosets, uint64_t uid, dmu_tx_t *tx) +{ + objset_t *mos = dd->dd_pool->dp_meta_objset; + uint64_t jumpobj, pjumpobj; + uint64_t zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj; + zap_cursor_t zc; + zap_attribute_t za; + char whokey[ZFS_MAX_DELEG_NAME]; + + zfs_deleg_whokey(whokey, + dosets ? 
ZFS_DELEG_CREATE_SETS : ZFS_DELEG_CREATE, + ZFS_DELEG_LOCAL, NULL); + if (zap_lookup(mos, pzapobj, whokey, 8, 1, &pjumpobj) != 0) + return; + + if (zapobj == 0) { + dmu_buf_will_dirty(dd->dd_dbuf, tx); + zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj = zap_create(mos, + DMU_OT_DSL_PERMS, DMU_OT_NONE, 0, tx); + } + + zfs_deleg_whokey(whokey, + dosets ? ZFS_DELEG_USER_SETS : ZFS_DELEG_USER, + ZFS_DELEG_LOCAL, &uid); + if (zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj) == ENOENT) { + jumpobj = zap_create(mos, DMU_OT_DSL_PERMS, DMU_OT_NONE, 0, tx); + VERIFY(zap_add(mos, zapobj, whokey, 8, 1, &jumpobj, tx) == 0); + } + + for (zap_cursor_init(&zc, mos, pjumpobj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + uint64_t zero = 0; + ASSERT(za.za_integer_length == 8 && za.za_num_integers == 1); + + VERIFY(zap_update(mos, jumpobj, za.za_name, + 8, 1, &zero, tx) == 0); + } + zap_cursor_fini(&zc); +} + +/* + * set all create time permission on new dataset. + */ +void +dsl_deleg_set_create_perms(dsl_dir_t *sdd, dmu_tx_t *tx, cred_t *cr) +{ + dsl_dir_t *dd; + uint64_t uid = crgetuid(cr); + + if (spa_version(dmu_objset_spa(sdd->dd_pool->dp_meta_objset)) < + SPA_VERSION_DELEGATED_PERMS) + return; + + for (dd = sdd->dd_parent; dd != NULL; dd = dd->dd_parent) { + uint64_t pzapobj = dsl_dir_phys(dd)->dd_deleg_zapobj; + + if (pzapobj == 0) + continue; + + copy_create_perms(sdd, pzapobj, B_FALSE, uid, tx); + copy_create_perms(sdd, pzapobj, B_TRUE, uid, tx); + } +} + +int +dsl_deleg_destroy(objset_t *mos, uint64_t zapobj, dmu_tx_t *tx) +{ + zap_cursor_t zc; + zap_attribute_t za; + + if (zapobj == 0) + return (0); + + for (zap_cursor_init(&zc, mos, zapobj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + ASSERT(za.za_integer_length == 8 && za.za_num_integers == 1); + VERIFY(0 == zap_destroy(mos, za.za_first_integer, tx)); + } + zap_cursor_fini(&zc); + VERIFY(0 == zap_destroy(mos, zapobj, tx)); + return (0); +} + +boolean_t +dsl_delegation_on(objset_t *os) +{ + return (!!spa_delegation(os->os_spa)); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c new file mode 100644 index 000000000000..7183795311aa --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c @@ -0,0 +1,1080 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2013 Steven Hartland. All rights reserved. + * Copyright (c) 2013 by Joyent, Inc. All rights reserved. 
+ * Copyright (c) 2014 Integros [integros.com] + */ + +#include <sys/zfs_context.h> +#include <sys/dsl_userhold.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_synctask.h> +#include <sys/dsl_destroy.h> +#include <sys/dmu_tx.h> +#include <sys/dsl_pool.h> +#include <sys/dsl_dir.h> +#include <sys/dmu_traverse.h> +#include <sys/dsl_scan.h> +#include <sys/dmu_objset.h> +#include <sys/zap.h> +#include <sys/zfeature.h> +#include <sys/zfs_ioctl.h> +#include <sys/dsl_deleg.h> +#include <sys/dmu_impl.h> +#include <sys/zcp.h> + +int +dsl_destroy_snapshot_check_impl(dsl_dataset_t *ds, boolean_t defer) +{ + if (!ds->ds_is_snapshot) + return (SET_ERROR(EINVAL)); + + if (dsl_dataset_long_held(ds)) + return (SET_ERROR(EBUSY)); + + /* + * Only allow deferred destroy on pools that support it. + * NOTE: deferred destroy is only supported on snapshots. + */ + if (defer) { + if (spa_version(ds->ds_dir->dd_pool->dp_spa) < + SPA_VERSION_USERREFS) + return (SET_ERROR(ENOTSUP)); + return (0); + } + + /* + * If this snapshot has an elevated user reference count, + * we can't destroy it yet. + */ + if (ds->ds_userrefs > 0) + return (SET_ERROR(EBUSY)); + + /* + * Can't delete a branch point. + */ + if (dsl_dataset_phys(ds)->ds_num_children > 1) + return (SET_ERROR(EEXIST)); + + return (0); +} + +int +dsl_destroy_snapshot_check(void *arg, dmu_tx_t *tx) +{ + dsl_destroy_snapshot_arg_t *ddsa = arg; + const char *dsname = ddsa->ddsa_name; + boolean_t defer = ddsa->ddsa_defer; + + dsl_pool_t *dp = dmu_tx_pool(tx); + int error = 0; + dsl_dataset_t *ds; + + error = dsl_dataset_hold(dp, dsname, FTAG, &ds); + + /* + * If the snapshot does not exist, silently ignore it, and + * dsl_destroy_snapshot_sync() will be a no-op + * (it's "already destroyed"). + */ + if (error == ENOENT) + return (0); + + if (error == 0) { + error = dsl_destroy_snapshot_check_impl(ds, defer); + dsl_dataset_rele(ds, FTAG); + } + + return (error); +} + +struct process_old_arg { + dsl_dataset_t *ds; + dsl_dataset_t *ds_prev; + boolean_t after_branch_point; + zio_t *pio; + uint64_t used, comp, uncomp; +}; + +static int +process_old_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +{ + struct process_old_arg *poa = arg; + dsl_pool_t *dp = poa->ds->ds_dir->dd_pool; + + ASSERT(!BP_IS_HOLE(bp)); + + if (bp->blk_birth <= dsl_dataset_phys(poa->ds)->ds_prev_snap_txg) { + dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, tx); + if (poa->ds_prev && !poa->after_branch_point && + bp->blk_birth > + dsl_dataset_phys(poa->ds_prev)->ds_prev_snap_txg) { + dsl_dataset_phys(poa->ds_prev)->ds_unique_bytes += + bp_get_dsize_sync(dp->dp_spa, bp); + } + } else { + poa->used += bp_get_dsize_sync(dp->dp_spa, bp); + poa->comp += BP_GET_PSIZE(bp); + poa->uncomp += BP_GET_UCSIZE(bp); + dsl_free_sync(poa->pio, dp, tx->tx_txg, bp); + } + return (0); +} + +static void +process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev, + dsl_dataset_t *ds_next, boolean_t after_branch_point, dmu_tx_t *tx) +{ + struct process_old_arg poa = { 0 }; + dsl_pool_t *dp = ds->ds_dir->dd_pool; + objset_t *mos = dp->dp_meta_objset; + uint64_t deadlist_obj; + + ASSERT(ds->ds_deadlist.dl_oldfmt); + ASSERT(ds_next->ds_deadlist.dl_oldfmt); + + poa.ds = ds; + poa.ds_prev = ds_prev; + poa.after_branch_point = after_branch_point; + poa.pio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); + VERIFY0(bpobj_iterate(&ds_next->ds_deadlist.dl_bpobj, + process_old_cb, &poa, tx)); + VERIFY0(zio_wait(poa.pio)); + ASSERT3U(poa.used, ==, dsl_dataset_phys(ds)->ds_unique_bytes); + + /* change snapused */ + 
dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP, + -poa.used, -poa.comp, -poa.uncomp, tx); + + /* swap next's deadlist to our deadlist */ + dsl_deadlist_close(&ds->ds_deadlist); + dsl_deadlist_close(&ds_next->ds_deadlist); + deadlist_obj = dsl_dataset_phys(ds)->ds_deadlist_obj; + dsl_dataset_phys(ds)->ds_deadlist_obj = + dsl_dataset_phys(ds_next)->ds_deadlist_obj; + dsl_dataset_phys(ds_next)->ds_deadlist_obj = deadlist_obj; + dsl_deadlist_open(&ds->ds_deadlist, mos, + dsl_dataset_phys(ds)->ds_deadlist_obj); + dsl_deadlist_open(&ds_next->ds_deadlist, mos, + dsl_dataset_phys(ds_next)->ds_deadlist_obj); +} + +static void +dsl_dataset_remove_clones_key(dsl_dataset_t *ds, uint64_t mintxg, dmu_tx_t *tx) +{ + objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; + zap_cursor_t zc; + zap_attribute_t za; + + /* + * If it is the old version, dd_clones doesn't exist so we can't + * find the clones, but dsl_deadlist_remove_key() is a no-op so it + * doesn't matter. + */ + if (dsl_dir_phys(ds->ds_dir)->dd_clones == 0) + return; + + for (zap_cursor_init(&zc, mos, dsl_dir_phys(ds->ds_dir)->dd_clones); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + dsl_dataset_t *clone; + + VERIFY0(dsl_dataset_hold_obj(ds->ds_dir->dd_pool, + za.za_first_integer, FTAG, &clone)); + if (clone->ds_dir->dd_origin_txg > mintxg) { + dsl_deadlist_remove_key(&clone->ds_deadlist, + mintxg, tx); + if (dsl_dataset_remap_deadlist_exists(clone)) { + dsl_deadlist_remove_key( + &clone->ds_remap_deadlist, mintxg, tx); + } + dsl_dataset_remove_clones_key(clone, mintxg, tx); + } + dsl_dataset_rele(clone, FTAG); + } + zap_cursor_fini(&zc); +} + +static void +dsl_destroy_snapshot_handle_remaps(dsl_dataset_t *ds, dsl_dataset_t *ds_next, + dmu_tx_t *tx) +{ + dsl_pool_t *dp = ds->ds_dir->dd_pool; + + /* Move blocks to be obsoleted to pool's obsolete list. */ + if (dsl_dataset_remap_deadlist_exists(ds_next)) { + if (!bpobj_is_open(&dp->dp_obsolete_bpobj)) + dsl_pool_create_obsolete_bpobj(dp, tx); + + dsl_deadlist_move_bpobj(&ds_next->ds_remap_deadlist, + &dp->dp_obsolete_bpobj, + dsl_dataset_phys(ds)->ds_prev_snap_txg, tx); + } + + /* Merge our deadlist into next's and free it. 
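+ * The remap deadlist gets the same collapse as the regular deadlist:
+ * the doomed snapshot's entries are folded into ds_next, which may
+ * first need an empty remap deadlist created to receive them.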
*/ + if (dsl_dataset_remap_deadlist_exists(ds)) { + uint64_t remap_deadlist_object = + dsl_dataset_get_remap_deadlist_object(ds); + ASSERT(remap_deadlist_object != 0); + + mutex_enter(&ds_next->ds_remap_deadlist_lock); + if (!dsl_dataset_remap_deadlist_exists(ds_next)) + dsl_dataset_create_remap_deadlist(ds_next, tx); + mutex_exit(&ds_next->ds_remap_deadlist_lock); + + dsl_deadlist_merge(&ds_next->ds_remap_deadlist, + remap_deadlist_object, tx); + dsl_dataset_destroy_remap_deadlist(ds, tx); + } +} + +void +dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx) +{ + int err; + int after_branch_point = FALSE; + dsl_pool_t *dp = ds->ds_dir->dd_pool; + objset_t *mos = dp->dp_meta_objset; + dsl_dataset_t *ds_prev = NULL; + uint64_t obj; + + ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); + rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); + ASSERT3U(dsl_dataset_phys(ds)->ds_bp.blk_birth, <=, tx->tx_txg); + rrw_exit(&ds->ds_bp_rwlock, FTAG); + ASSERT(refcount_is_zero(&ds->ds_longholds)); + + if (defer && + (ds->ds_userrefs > 0 || + dsl_dataset_phys(ds)->ds_num_children > 1)) { + ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); + dmu_buf_will_dirty(ds->ds_dbuf, tx); + dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_DEFER_DESTROY; + spa_history_log_internal_ds(ds, "defer_destroy", tx, ""); + return; + } + + ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1); + + /* We need to log before removing it from the namespace. */ + spa_history_log_internal_ds(ds, "destroy", tx, ""); + + dsl_scan_ds_destroyed(ds, tx); + + obj = ds->ds_object; + + for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { + if (ds->ds_feature_inuse[f]) { + dsl_dataset_deactivate_feature(obj, f, tx); + ds->ds_feature_inuse[f] = B_FALSE; + } + } + if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) { + ASSERT3P(ds->ds_prev, ==, NULL); + VERIFY0(dsl_dataset_hold_obj(dp, + dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &ds_prev)); + after_branch_point = + (dsl_dataset_phys(ds_prev)->ds_next_snap_obj != obj); + + dmu_buf_will_dirty(ds_prev->ds_dbuf, tx); + if (after_branch_point && + dsl_dataset_phys(ds_prev)->ds_next_clones_obj != 0) { + dsl_dataset_remove_from_next_clones(ds_prev, obj, tx); + if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) { + VERIFY0(zap_add_int(mos, + dsl_dataset_phys(ds_prev)-> + ds_next_clones_obj, + dsl_dataset_phys(ds)->ds_next_snap_obj, + tx)); + } + } + if (!after_branch_point) { + dsl_dataset_phys(ds_prev)->ds_next_snap_obj = + dsl_dataset_phys(ds)->ds_next_snap_obj; + } + } + + dsl_dataset_t *ds_next; + uint64_t old_unique; + uint64_t used = 0, comp = 0, uncomp = 0; + + VERIFY0(dsl_dataset_hold_obj(dp, + dsl_dataset_phys(ds)->ds_next_snap_obj, FTAG, &ds_next)); + ASSERT3U(dsl_dataset_phys(ds_next)->ds_prev_snap_obj, ==, obj); + + old_unique = dsl_dataset_phys(ds_next)->ds_unique_bytes; + + dmu_buf_will_dirty(ds_next->ds_dbuf, tx); + dsl_dataset_phys(ds_next)->ds_prev_snap_obj = + dsl_dataset_phys(ds)->ds_prev_snap_obj; + dsl_dataset_phys(ds_next)->ds_prev_snap_txg = + dsl_dataset_phys(ds)->ds_prev_snap_txg; + ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, ==, + ds_prev ? dsl_dataset_phys(ds_prev)->ds_creation_txg : 0); + + if (ds_next->ds_deadlist.dl_oldfmt) { + process_old_deadlist(ds, ds_prev, ds_next, + after_branch_point, tx); + } else { + /* Adjust prev's unique space. 
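+ * Blocks on ds_next's deadlist that were born between prev's previous
+ * snapshot and prev itself were referenced only by prev and ds; once
+ * ds is gone they are unique to prev, so that space is credited to
+ * prev's ds_unique_bytes.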
*/ + if (ds_prev && !after_branch_point) { + dsl_deadlist_space_range(&ds_next->ds_deadlist, + dsl_dataset_phys(ds_prev)->ds_prev_snap_txg, + dsl_dataset_phys(ds)->ds_prev_snap_txg, + &used, &comp, &uncomp); + dsl_dataset_phys(ds_prev)->ds_unique_bytes += used; + } + + /* Adjust snapused. */ + dsl_deadlist_space_range(&ds_next->ds_deadlist, + dsl_dataset_phys(ds)->ds_prev_snap_txg, UINT64_MAX, + &used, &comp, &uncomp); + dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP, + -used, -comp, -uncomp, tx); + + /* Move blocks to be freed to pool's free list. */ + dsl_deadlist_move_bpobj(&ds_next->ds_deadlist, + &dp->dp_free_bpobj, dsl_dataset_phys(ds)->ds_prev_snap_txg, + tx); + dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, + DD_USED_HEAD, used, comp, uncomp, tx); + + /* Merge our deadlist into next's and free it. */ + dsl_deadlist_merge(&ds_next->ds_deadlist, + dsl_dataset_phys(ds)->ds_deadlist_obj, tx); + } + + dsl_deadlist_close(&ds->ds_deadlist); + dsl_deadlist_free(mos, dsl_dataset_phys(ds)->ds_deadlist_obj, tx); + dmu_buf_will_dirty(ds->ds_dbuf, tx); + dsl_dataset_phys(ds)->ds_deadlist_obj = 0; + + dsl_destroy_snapshot_handle_remaps(ds, ds_next, tx); + + /* Collapse range in clone heads */ + dsl_dataset_remove_clones_key(ds, + dsl_dataset_phys(ds)->ds_creation_txg, tx); + + if (ds_next->ds_is_snapshot) { + dsl_dataset_t *ds_nextnext; + + /* + * Update next's unique to include blocks which + * were previously shared by only this snapshot + * and it. Those blocks will be born after the + * prev snap and before this snap, and will have + * died after the next snap and before the one + * after that (ie. be on the snap after next's + * deadlist). + */ + VERIFY0(dsl_dataset_hold_obj(dp, + dsl_dataset_phys(ds_next)->ds_next_snap_obj, + FTAG, &ds_nextnext)); + dsl_deadlist_space_range(&ds_nextnext->ds_deadlist, + dsl_dataset_phys(ds)->ds_prev_snap_txg, + dsl_dataset_phys(ds)->ds_creation_txg, + &used, &comp, &uncomp); + dsl_dataset_phys(ds_next)->ds_unique_bytes += used; + dsl_dataset_rele(ds_nextnext, FTAG); + ASSERT3P(ds_next->ds_prev, ==, NULL); + + /* Collapse range in this head. */ + dsl_dataset_t *hds; + VERIFY0(dsl_dataset_hold_obj(dp, + dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj, FTAG, &hds)); + dsl_deadlist_remove_key(&hds->ds_deadlist, + dsl_dataset_phys(ds)->ds_creation_txg, tx); + if (dsl_dataset_remap_deadlist_exists(hds)) { + dsl_deadlist_remove_key(&hds->ds_remap_deadlist, + dsl_dataset_phys(ds)->ds_creation_txg, tx); + } + dsl_dataset_rele(hds, FTAG); + + } else { + ASSERT3P(ds_next->ds_prev, ==, ds); + dsl_dataset_rele(ds_next->ds_prev, ds_next); + ds_next->ds_prev = NULL; + if (ds_prev) { + VERIFY0(dsl_dataset_hold_obj(dp, + dsl_dataset_phys(ds)->ds_prev_snap_obj, + ds_next, &ds_next->ds_prev)); + } + + dsl_dataset_recalc_head_uniq(ds_next); + + /* + * Reduce the amount of our unconsumed refreservation + * being charged to our parent by the amount of + * new unique data we have gained. + */ + if (old_unique < ds_next->ds_reserved) { + int64_t mrsdelta; + uint64_t new_unique = + dsl_dataset_phys(ds_next)->ds_unique_bytes; + + ASSERT(old_unique <= new_unique); + mrsdelta = MIN(new_unique - old_unique, + ds_next->ds_reserved - old_unique); + dsl_dir_diduse_space(ds->ds_dir, + DD_USED_REFRSRV, -mrsdelta, 0, 0, tx); + } + } + dsl_dataset_rele(ds_next, FTAG); + + /* + * This must be done after the dsl_traverse(), because it will + * re-open the objset. 
+ */ + if (ds->ds_objset) { + dmu_objset_evict(ds->ds_objset); + ds->ds_objset = NULL; + } + + /* remove from snapshot namespace */ + dsl_dataset_t *ds_head; + ASSERT(dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0); + VERIFY0(dsl_dataset_hold_obj(dp, + dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj, FTAG, &ds_head)); + VERIFY0(dsl_dataset_get_snapname(ds)); +#ifdef ZFS_DEBUG + { + uint64_t val; + + err = dsl_dataset_snap_lookup(ds_head, + ds->ds_snapname, &val); + ASSERT0(err); + ASSERT3U(val, ==, obj); + } +#endif + VERIFY0(dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx, B_TRUE)); + dsl_dataset_rele(ds_head, FTAG); + + if (ds_prev != NULL) + dsl_dataset_rele(ds_prev, FTAG); + + spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx); + + if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) { + uint64_t count; + ASSERT0(zap_count(mos, + dsl_dataset_phys(ds)->ds_next_clones_obj, &count) && + count == 0); + VERIFY0(dmu_object_free(mos, + dsl_dataset_phys(ds)->ds_next_clones_obj, tx)); + } + if (dsl_dataset_phys(ds)->ds_props_obj != 0) + VERIFY0(zap_destroy(mos, dsl_dataset_phys(ds)->ds_props_obj, + tx)); + if (dsl_dataset_phys(ds)->ds_userrefs_obj != 0) + VERIFY0(zap_destroy(mos, dsl_dataset_phys(ds)->ds_userrefs_obj, + tx)); + dsl_dir_rele(ds->ds_dir, ds); + ds->ds_dir = NULL; + dmu_object_free_zapified(mos, obj, tx); +} + +void +dsl_destroy_snapshot_sync(void *arg, dmu_tx_t *tx) +{ + dsl_destroy_snapshot_arg_t *ddsa = arg; + const char *dsname = ddsa->ddsa_name; + boolean_t defer = ddsa->ddsa_defer; + + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + + int error = dsl_dataset_hold(dp, dsname, FTAG, &ds); + if (error == ENOENT) + return; + ASSERT0(error); + dsl_destroy_snapshot_sync_impl(ds, defer, tx); + dsl_dataset_rele(ds, FTAG); +} + +/* + * The semantics of this function are described in the comment above + * lzc_destroy_snaps(). To summarize: + * + * The snapshots must all be in the same pool. + * + * Snapshots that don't exist will be silently ignored (considered to be + * "already deleted"). + * + * On success, all snaps will be destroyed and this will return 0. + * On failure, no snaps will be destroyed, the errlist will be filled in, + * and this will return an errno. + */ +int +dsl_destroy_snapshots_nvl(nvlist_t *snaps, boolean_t defer, + nvlist_t *errlist) +{ + if (nvlist_next_nvpair(snaps, NULL) == NULL) + return (0); + + /* + * lzc_destroy_snaps() is documented to take an nvlist whose + * values "don't matter". We need to convert that nvlist to + * one that we know can be converted to LUA. We also don't + * care about any duplicate entries because the nvlist will + * be converted to a LUA table which should take care of this. 
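+ *
+ * The normalized nvlist is handed to the Lua channel program below,
+ * which runs zfs.check.destroy on every snapshot first and calls
+ * zfs.sync.destroy only once no check has failed.  E.g. a request for
+ * { "pool/fs@a", "pool/fs@b" } with one long-held snapshot destroys
+ * nothing and returns an errlist such as { "pool/fs@b" = EBUSY }.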
+ */ + nvlist_t *snaps_normalized; + VERIFY0(nvlist_alloc(&snaps_normalized, 0, KM_SLEEP)); + for (nvpair_t *pair = nvlist_next_nvpair(snaps, NULL); + pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) { + fnvlist_add_boolean_value(snaps_normalized, + nvpair_name(pair), B_TRUE); + } + + nvlist_t *arg; + VERIFY0(nvlist_alloc(&arg, 0, KM_SLEEP)); + fnvlist_add_nvlist(arg, "snaps", snaps_normalized); + fnvlist_free(snaps_normalized); + fnvlist_add_boolean_value(arg, "defer", defer); + + nvlist_t *wrapper; + VERIFY0(nvlist_alloc(&wrapper, 0, KM_SLEEP)); + fnvlist_add_nvlist(wrapper, ZCP_ARG_ARGLIST, arg); + fnvlist_free(arg); + + const char *program = + "arg = ...\n" + "snaps = arg['snaps']\n" + "defer = arg['defer']\n" + "errors = { }\n" + "has_errors = false\n" + "for snap, v in pairs(snaps) do\n" + " errno = zfs.check.destroy{snap, defer=defer}\n" + " zfs.debug('snap: ' .. snap .. ' errno: ' .. errno)\n" + " if errno == ENOENT then\n" + " snaps[snap] = nil\n" + " elseif errno ~= 0 then\n" + " errors[snap] = errno\n" + " has_errors = true\n" + " end\n" + "end\n" + "if has_errors then\n" + " return errors\n" + "end\n" + "for snap, v in pairs(snaps) do\n" + " errno = zfs.sync.destroy{snap, defer=defer}\n" + " assert(errno == 0)\n" + "end\n" + "return { }\n"; + + nvlist_t *result = fnvlist_alloc(); + int error = zcp_eval(nvpair_name(nvlist_next_nvpair(snaps, NULL)), + program, + B_TRUE, + 0, + zfs_lua_max_memlimit, + nvlist_next_nvpair(wrapper, NULL), result); + if (error != 0) { + char *errorstr = NULL; + (void) nvlist_lookup_string(result, ZCP_RET_ERROR, &errorstr); + if (errorstr != NULL) { + zfs_dbgmsg(errorstr); + } + return (error); + } + fnvlist_free(wrapper); + + /* + * lzc_destroy_snaps() is documented to fill the errlist with + * int32 values, so we need to covert the int64 values that are + * returned from LUA. + */ + int rv = 0; + nvlist_t *errlist_raw = fnvlist_lookup_nvlist(result, ZCP_RET_RETURN); + for (nvpair_t *pair = nvlist_next_nvpair(errlist_raw, NULL); + pair != NULL; pair = nvlist_next_nvpair(errlist_raw, pair)) { + int32_t val = (int32_t)fnvpair_value_int64(pair); + if (rv == 0) + rv = val; + fnvlist_add_int32(errlist, nvpair_name(pair), val); + } + fnvlist_free(result); + return (rv); +} + +int +dsl_destroy_snapshot(const char *name, boolean_t defer) +{ + int error; + nvlist_t *nvl = fnvlist_alloc(); + nvlist_t *errlist = fnvlist_alloc(); + + fnvlist_add_boolean(nvl, name); + error = dsl_destroy_snapshots_nvl(nvl, defer, errlist); + fnvlist_free(errlist); + fnvlist_free(nvl); + return (error); +} + +struct killarg { + dsl_dataset_t *ds; + dmu_tx_t *tx; +}; + +/* ARGSUSED */ +static int +kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, + const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) +{ + struct killarg *ka = arg; + dmu_tx_t *tx = ka->tx; + + if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) + return (0); + + if (zb->zb_level == ZB_ZIL_LEVEL) { + ASSERT(zilog != NULL); + /* + * It's a block in the intent log. It has no + * accounting, so just free it. 
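+ * Unlike the dsl_dataset_block_kill() path below, there is no
+ * deadlist or used-space bookkeeping to undo for ZIL blocks.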
+ */ + dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp); + } else { + ASSERT(zilog == NULL); + ASSERT3U(bp->blk_birth, >, + dsl_dataset_phys(ka->ds)->ds_prev_snap_txg); + (void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE); + } + + return (0); +} + +static void +old_synchronous_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + struct killarg ka; + + /* + * Free everything that we point to (that's born after + * the previous snapshot, if we are a clone) + * + * NB: this should be very quick, because we already + * freed all the objects in open context. + */ + ka.ds = ds; + ka.tx = tx; + VERIFY0(traverse_dataset(ds, + dsl_dataset_phys(ds)->ds_prev_snap_txg, TRAVERSE_POST, + kill_blkptr, &ka)); + ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || + dsl_dataset_phys(ds)->ds_unique_bytes == 0); +} + +int +dsl_destroy_head_check_impl(dsl_dataset_t *ds, int expected_holds) +{ + int error; + uint64_t count; + objset_t *mos; + + ASSERT(!ds->ds_is_snapshot); + if (ds->ds_is_snapshot) + return (SET_ERROR(EINVAL)); + + if (refcount_count(&ds->ds_longholds) != expected_holds) + return (SET_ERROR(EBUSY)); + + mos = ds->ds_dir->dd_pool->dp_meta_objset; + + /* + * Can't delete a head dataset if there are snapshots of it. + * (Except if the only snapshots are from the branch we cloned + * from.) + */ + if (ds->ds_prev != NULL && + dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj == ds->ds_object) + return (SET_ERROR(EBUSY)); + + /* + * Can't delete if there are children of this fs. + */ + error = zap_count(mos, + dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, &count); + if (error != 0) + return (error); + if (count != 0) + return (SET_ERROR(EEXIST)); + + if (dsl_dir_is_clone(ds->ds_dir) && DS_IS_DEFER_DESTROY(ds->ds_prev) && + dsl_dataset_phys(ds->ds_prev)->ds_num_children == 2 && + ds->ds_prev->ds_userrefs == 0) { + /* We need to remove the origin snapshot as well. */ + if (!refcount_is_zero(&ds->ds_prev->ds_longholds)) + return (SET_ERROR(EBUSY)); + } + return (0); +} + +int +dsl_destroy_head_check(void *arg, dmu_tx_t *tx) +{ + dsl_destroy_head_arg_t *ddha = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + int error; + + error = dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds); + if (error != 0) + return (error); + + error = dsl_destroy_head_check_impl(ds, 0); + dsl_dataset_rele(ds, FTAG); + return (error); +} + +static void +dsl_dir_destroy_sync(uint64_t ddobj, dmu_tx_t *tx) +{ + dsl_dir_t *dd; + dsl_pool_t *dp = dmu_tx_pool(tx); + objset_t *mos = dp->dp_meta_objset; + dd_used_t t; + + ASSERT(RRW_WRITE_HELD(&dmu_tx_pool(tx)->dp_config_rwlock)); + + VERIFY0(dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd)); + + ASSERT0(dsl_dir_phys(dd)->dd_head_dataset_obj); + + /* + * Decrement the filesystem count for all parent filesystems. + * + * When we receive an incremental stream into a filesystem that already + * exists, a temporary clone is created. We never count this temporary + * clone, whose name begins with a '%'. + */ + if (dd->dd_myname[0] != '%' && dd->dd_parent != NULL) + dsl_fs_ss_count_adjust(dd->dd_parent, -1, + DD_FIELD_FILESYSTEM_COUNT, tx); + + /* + * Remove our reservation. The impl() routine avoids setting the + * actual property, which would require the (already destroyed) ds. 
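+ * Zeroing the reservation here also lets the ASSERTs below confirm
+ * that the dir's space accounting is fully drained before its ZAPs
+ * are destroyed.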
+ */ + dsl_dir_set_reservation_sync_impl(dd, 0, tx); + + ASSERT0(dsl_dir_phys(dd)->dd_used_bytes); + ASSERT0(dsl_dir_phys(dd)->dd_reserved); + for (t = 0; t < DD_USED_NUM; t++) + ASSERT0(dsl_dir_phys(dd)->dd_used_breakdown[t]); + + VERIFY0(zap_destroy(mos, dsl_dir_phys(dd)->dd_child_dir_zapobj, tx)); + VERIFY0(zap_destroy(mos, dsl_dir_phys(dd)->dd_props_zapobj, tx)); + VERIFY0(dsl_deleg_destroy(mos, dsl_dir_phys(dd)->dd_deleg_zapobj, tx)); + VERIFY0(zap_remove(mos, + dsl_dir_phys(dd->dd_parent)->dd_child_dir_zapobj, + dd->dd_myname, tx)); + + dsl_dir_rele(dd, FTAG); + dmu_object_free_zapified(mos, ddobj, tx); +} + +void +dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + dsl_pool_t *dp = dmu_tx_pool(tx); + objset_t *mos = dp->dp_meta_objset; + uint64_t obj, ddobj, prevobj = 0; + boolean_t rmorigin; + + ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1); + ASSERT(ds->ds_prev == NULL || + dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj != ds->ds_object); + rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); + ASSERT3U(dsl_dataset_phys(ds)->ds_bp.blk_birth, <=, tx->tx_txg); + rrw_exit(&ds->ds_bp_rwlock, FTAG); + ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); + + /* We need to log before removing it from the namespace. */ + spa_history_log_internal_ds(ds, "destroy", tx, ""); + + rmorigin = (dsl_dir_is_clone(ds->ds_dir) && + DS_IS_DEFER_DESTROY(ds->ds_prev) && + dsl_dataset_phys(ds->ds_prev)->ds_num_children == 2 && + ds->ds_prev->ds_userrefs == 0); + + /* Remove our reservation. */ + if (ds->ds_reserved != 0) { + dsl_dataset_set_refreservation_sync_impl(ds, + (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED), + 0, tx); + ASSERT0(ds->ds_reserved); + } + + obj = ds->ds_object; + + for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { + if (ds->ds_feature_inuse[f]) { + dsl_dataset_deactivate_feature(obj, f, tx); + ds->ds_feature_inuse[f] = B_FALSE; + } + } + + dsl_scan_ds_destroyed(ds, tx); + + if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) { + /* This is a clone */ + ASSERT(ds->ds_prev != NULL); + ASSERT3U(dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj, !=, + obj); + ASSERT0(dsl_dataset_phys(ds)->ds_next_snap_obj); + + dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); + if (dsl_dataset_phys(ds->ds_prev)->ds_next_clones_obj != 0) { + dsl_dataset_remove_from_next_clones(ds->ds_prev, + obj, tx); + } + + ASSERT3U(dsl_dataset_phys(ds->ds_prev)->ds_num_children, >, 1); + dsl_dataset_phys(ds->ds_prev)->ds_num_children--; + } + + /* + * Destroy the deadlist. Unless it's a clone, the + * deadlist should be empty since the dataset has no snapshots. + * (If it's a clone, it's safe to ignore the deadlist contents + * since they are still referenced by the origin snapshot.) + */ + dsl_deadlist_close(&ds->ds_deadlist); + dsl_deadlist_free(mos, dsl_dataset_phys(ds)->ds_deadlist_obj, tx); + dmu_buf_will_dirty(ds->ds_dbuf, tx); + dsl_dataset_phys(ds)->ds_deadlist_obj = 0; + + if (dsl_dataset_remap_deadlist_exists(ds)) + dsl_dataset_destroy_remap_deadlist(ds, tx); + + objset_t *os; + VERIFY0(dmu_objset_from_ds(ds, &os)); + + if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY)) { + old_synchronous_dataset_destroy(ds, tx); + } else { + /* + * Move the bptree into the pool's list of trees to + * clean up and update space accounting information. 
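+ *
+ * With async_destroy active, nothing is freed here: the dataset's root
+ * bp is appended to the pool's bptree (dp_bptree_obj, created on first
+ * use) and the scan code frees the tree in the background over later
+ * txgs, while the space is shifted from this dir to dp_free_dir.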
+ */ + uint64_t used, comp, uncomp; + + zil_destroy_sync(dmu_objset_zil(os), tx); + + if (!spa_feature_is_active(dp->dp_spa, + SPA_FEATURE_ASYNC_DESTROY)) { + dsl_scan_t *scn = dp->dp_scan; + spa_feature_incr(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY, + tx); + dp->dp_bptree_obj = bptree_alloc(mos, tx); + VERIFY0(zap_add(mos, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1, + &dp->dp_bptree_obj, tx)); + ASSERT(!scn->scn_async_destroying); + scn->scn_async_destroying = B_TRUE; + } + + used = dsl_dir_phys(ds->ds_dir)->dd_used_bytes; + comp = dsl_dir_phys(ds->ds_dir)->dd_compressed_bytes; + uncomp = dsl_dir_phys(ds->ds_dir)->dd_uncompressed_bytes; + + ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || + dsl_dataset_phys(ds)->ds_unique_bytes == used); + + rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); + bptree_add(mos, dp->dp_bptree_obj, + &dsl_dataset_phys(ds)->ds_bp, + dsl_dataset_phys(ds)->ds_prev_snap_txg, + used, comp, uncomp, tx); + rrw_exit(&ds->ds_bp_rwlock, FTAG); + dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, + -used, -comp, -uncomp, tx); + dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD, + used, comp, uncomp, tx); + } + + if (ds->ds_prev != NULL) { + if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { + VERIFY0(zap_remove_int(mos, + dsl_dir_phys(ds->ds_prev->ds_dir)->dd_clones, + ds->ds_object, tx)); + } + prevobj = ds->ds_prev->ds_object; + dsl_dataset_rele(ds->ds_prev, ds); + ds->ds_prev = NULL; + } + + /* + * This must be done after the dsl_traverse(), because it will + * re-open the objset. + */ + if (ds->ds_objset) { + dmu_objset_evict(ds->ds_objset); + ds->ds_objset = NULL; + } + + /* Erase the link in the dir */ + dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); + dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj = 0; + ddobj = ds->ds_dir->dd_object; + ASSERT(dsl_dataset_phys(ds)->ds_snapnames_zapobj != 0); + VERIFY0(zap_destroy(mos, + dsl_dataset_phys(ds)->ds_snapnames_zapobj, tx)); + + if (ds->ds_bookmarks != 0) { + VERIFY0(zap_destroy(mos, ds->ds_bookmarks, tx)); + spa_feature_decr(dp->dp_spa, SPA_FEATURE_BOOKMARKS, tx); + } + + spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx); + + ASSERT0(dsl_dataset_phys(ds)->ds_next_clones_obj); + ASSERT0(dsl_dataset_phys(ds)->ds_props_obj); + ASSERT0(dsl_dataset_phys(ds)->ds_userrefs_obj); + dsl_dir_rele(ds->ds_dir, ds); + ds->ds_dir = NULL; + dmu_object_free_zapified(mos, obj, tx); + + dsl_dir_destroy_sync(ddobj, tx); + + if (rmorigin) { + dsl_dataset_t *prev; + VERIFY0(dsl_dataset_hold_obj(dp, prevobj, FTAG, &prev)); + dsl_destroy_snapshot_sync_impl(prev, B_FALSE, tx); + dsl_dataset_rele(prev, FTAG); + } +} + +void +dsl_destroy_head_sync(void *arg, dmu_tx_t *tx) +{ + dsl_destroy_head_arg_t *ddha = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + + VERIFY0(dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds)); + dsl_destroy_head_sync_impl(ds, tx); + dsl_dataset_rele(ds, FTAG); +} + +static void +dsl_destroy_head_begin_sync(void *arg, dmu_tx_t *tx) +{ + dsl_destroy_head_arg_t *ddha = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + + VERIFY0(dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds)); + + /* Mark it as inconsistent on-disk, in case we crash */ + dmu_buf_will_dirty(ds->ds_dbuf, tx); + dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_INCONSISTENT; + + spa_history_log_internal_ds(ds, "destroy begin", tx, ""); + dsl_dataset_rele(ds, FTAG); +} + +int +dsl_destroy_head(const char *name) +{ + dsl_destroy_head_arg_t ddha; + int error; + spa_t *spa; + boolean_t isenabled; + +#ifdef _KERNEL + 
zfs_destroy_unmount_origin(name); +#endif + + error = spa_open(name, &spa, FTAG); + if (error != 0) + return (error); + isenabled = spa_feature_is_enabled(spa, SPA_FEATURE_ASYNC_DESTROY); + spa_close(spa, FTAG); + + ddha.ddha_name = name; + + if (!isenabled) { + objset_t *os; + + error = dsl_sync_task(name, dsl_destroy_head_check, + dsl_destroy_head_begin_sync, &ddha, + 0, ZFS_SPACE_CHECK_DESTROY); + if (error != 0) + return (error); + + /* + * Head deletion is processed in one txg on old pools; + * remove the objects from open context so that the txg sync + * is not too long. + */ + error = dmu_objset_own(name, DMU_OST_ANY, B_FALSE, FTAG, &os); + if (error == 0) { + uint64_t prev_snap_txg = + dsl_dataset_phys(dmu_objset_ds(os))-> + ds_prev_snap_txg; + for (uint64_t obj = 0; error == 0; + error = dmu_object_next(os, &obj, FALSE, + prev_snap_txg)) + (void) dmu_free_long_object(os, obj); + /* sync out all frees */ + txg_wait_synced(dmu_objset_pool(os), 0); + dmu_objset_disown(os, FTAG); + } + } + + return (dsl_sync_task(name, dsl_destroy_head_check, + dsl_destroy_head_sync, &ddha, 0, ZFS_SPACE_CHECK_DESTROY)); +} + +/* + * Note, this function is used as the callback for dmu_objset_find(). We + * always return 0 so that we will continue to find and process + * inconsistent datasets, even if we encounter an error trying to + * process one of them. + */ +/* ARGSUSED */ +int +dsl_destroy_inconsistent(const char *dsname, void *arg) +{ + objset_t *os; + + if (dmu_objset_hold(dsname, FTAG, &os) == 0) { + boolean_t need_destroy = DS_IS_INCONSISTENT(dmu_objset_ds(os)); + + /* + * If the dataset is inconsistent because a resumable receive + * has failed, then do not destroy it. + */ + if (dsl_dataset_has_resume_receive_state(dmu_objset_ds(os))) + need_destroy = B_FALSE; + + dmu_objset_rele(os, FTAG); + if (need_destroy) + (void) dsl_destroy_head(dsname); + } + return (0); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c new file mode 100644 index 000000000000..1a4194ebf16d --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c @@ -0,0 +1,2158 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>. + * All rights reserved. + * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2014 Joyent, Inc. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright 2015 Nexenta Systems, Inc. All rights reserved. 
+ */ + +#include <sys/dmu.h> +#include <sys/dmu_objset.h> +#include <sys/dmu_tx.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_prop.h> +#include <sys/dsl_synctask.h> +#include <sys/dsl_deleg.h> +#include <sys/dmu_impl.h> +#include <sys/spa.h> +#include <sys/metaslab.h> +#include <sys/zap.h> +#include <sys/zio.h> +#include <sys/arc.h> +#include <sys/sunddi.h> +#include <sys/zvol.h> +#ifdef _KERNEL +#include <sys/zfs_vfsops.h> +#endif +#include <sys/zfeature.h> +#include <sys/policy.h> +#include <sys/zfs_znode.h> +#include "zfs_namecheck.h" +#include "zfs_prop.h" + +/* + * Filesystem and Snapshot Limits + * ------------------------------ + * + * These limits are used to restrict the number of filesystems and/or snapshots + * that can be created at a given level in the tree or below. A typical + * use-case is with a delegated dataset where the administrator wants to ensure + * that a user within the zone is not creating too many additional filesystems + * or snapshots, even though they're not exceeding their space quota. + * + * The filesystem and snapshot counts are stored as extensible properties. This + * capability is controlled by a feature flag and must be enabled to be used. + * Once enabled, the feature is not active until the first limit is set. At + * that point, future operations to create/destroy filesystems or snapshots + * will validate and update the counts. + * + * Because the count properties will not exist before the feature is active, + * the counts are updated when a limit is first set on an uninitialized + * dsl_dir node in the tree (The filesystem/snapshot count on a node includes + * all of the nested filesystems/snapshots. Thus, a new leaf node has a + * filesystem count of 0 and a snapshot count of 0. Non-existent filesystem and + * snapshot count properties on a node indicate uninitialized counts on that + * node.) When first setting a limit on an uninitialized node, the code starts + * at the filesystem with the new limit and descends into all sub-filesystems + * to add the count properties. + * + * In practice this is lightweight since a limit is typically set when the + * filesystem is created and thus has no children. Once valid, changing the + * limit value won't require a re-traversal since the counts are already valid. + * When recursively fixing the counts, if a node with a limit is encountered + * during the descent, the counts are known to be valid and there is no need to + * descend into that filesystem's children. The counts on filesystems above the + * one with the new limit will still be uninitialized, unless a limit is + * eventually set on one of those filesystems. The counts are always recursively + * updated when a limit is set on a dataset, unless there is already a limit. + * When a new limit value is set on a filesystem with an existing limit, it is + * possible for the new limit to be less than the current count at that level + * since a user who can change the limit is also allowed to exceed the limit. + * + * Once the feature is active, then whenever a filesystem or snapshot is + * created, the code recurses up the tree, validating the new count against the + * limit at each initialized level. In practice, most levels will not have a + * limit set. If there is a limit at any initialized level up the tree, the + * check must pass or the creation will fail. Likewise, when a filesystem or + * snapshot is destroyed, the counts are recursively adjusted all the way up + * the initizized nodes in the tree. 
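+ * (E.g. destroying a snapshot decrements the snapshot count on every
+ * ancestor whose count property has been initialized.)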
Renaming a filesystem into different point + * in the tree will first validate, then update the counts on each branch up to + * the common ancestor. A receive will also validate the counts and then update + * them. + * + * An exception to the above behavior is that the limit is not enforced if the + * user has permission to modify the limit. This is primarily so that + * recursive snapshots in the global zone always work. We want to prevent a + * denial-of-service in which a lower level delegated dataset could max out its + * limit and thus block recursive snapshots from being taken in the global zone. + * Because of this, it is possible for the snapshot count to be over the limit + * and snapshots taken in the global zone could cause a lower level dataset to + * hit or exceed its limit. The administrator taking the global zone recursive + * snapshot should be aware of this side-effect and behave accordingly. + * For consistency, the filesystem limit is also not enforced if the user can + * modify the limit. + * + * The filesystem and snapshot limits are validated by dsl_fs_ss_limit_check() + * and updated by dsl_fs_ss_count_adjust(). A new limit value is setup in + * dsl_dir_activate_fs_ss_limit() and the counts are adjusted, if necessary, by + * dsl_dir_init_fs_ss_count(). + * + * There is a special case when we receive a filesystem that already exists. In + * this case a temporary clone name of %X is created (see dmu_recv_begin). We + * never update the filesystem counts for temporary clones. + * + * Likewise, we do not update the snapshot counts for temporary snapshots, + * such as those created by zfs diff. + */ + +extern inline dsl_dir_phys_t *dsl_dir_phys(dsl_dir_t *dd); + +static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd); + +typedef struct ddulrt_arg { + dsl_dir_t *ddulrta_dd; + uint64_t ddlrta_txg; +} ddulrt_arg_t; + +static void +dsl_dir_evict_async(void *dbu) +{ + dsl_dir_t *dd = dbu; + dsl_pool_t *dp = dd->dd_pool; + int t; + + dd->dd_dbuf = NULL; + + for (t = 0; t < TXG_SIZE; t++) { + ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t)); + ASSERT(dd->dd_tempreserved[t] == 0); + ASSERT(dd->dd_space_towrite[t] == 0); + } + + if (dd->dd_parent) + dsl_dir_async_rele(dd->dd_parent, dd); + + spa_async_close(dd->dd_pool->dp_spa, dd); + + dsl_prop_fini(dd); + mutex_destroy(&dd->dd_lock); + kmem_free(dd, sizeof (dsl_dir_t)); +} + +int +dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, + const char *tail, void *tag, dsl_dir_t **ddp) +{ + dmu_buf_t *dbuf; + dsl_dir_t *dd; + int err; + + ASSERT(dsl_pool_config_held(dp)); + + err = dmu_bonus_hold(dp->dp_meta_objset, ddobj, tag, &dbuf); + if (err != 0) + return (err); + dd = dmu_buf_get_user(dbuf); +#ifdef ZFS_DEBUG + { + dmu_object_info_t doi; + dmu_object_info_from_db(dbuf, &doi); + ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_DSL_DIR); + ASSERT3U(doi.doi_bonus_size, >=, sizeof (dsl_dir_phys_t)); + } +#endif + if (dd == NULL) { + dsl_dir_t *winner; + + dd = kmem_zalloc(sizeof (dsl_dir_t), KM_SLEEP); + dd->dd_object = ddobj; + dd->dd_dbuf = dbuf; + dd->dd_pool = dp; + mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL); + dsl_prop_init(dd); + + dsl_dir_snap_cmtime_update(dd); + + if (dsl_dir_phys(dd)->dd_parent_obj) { + err = dsl_dir_hold_obj(dp, + dsl_dir_phys(dd)->dd_parent_obj, NULL, dd, + &dd->dd_parent); + if (err != 0) + goto errout; + if (tail) { +#ifdef ZFS_DEBUG + uint64_t foundobj; + + err = zap_lookup(dp->dp_meta_objset, + dsl_dir_phys(dd->dd_parent)-> + dd_child_dir_zapobj, tail, + sizeof (foundobj), 1, &foundobj); + ASSERT(err || 
foundobj == ddobj); +#endif + (void) strcpy(dd->dd_myname, tail); + } else { + err = zap_value_search(dp->dp_meta_objset, + dsl_dir_phys(dd->dd_parent)-> + dd_child_dir_zapobj, + ddobj, 0, dd->dd_myname); + } + if (err != 0) + goto errout; + } else { + (void) strcpy(dd->dd_myname, spa_name(dp->dp_spa)); + } + + if (dsl_dir_is_clone(dd)) { + dmu_buf_t *origin_bonus; + dsl_dataset_phys_t *origin_phys; + + /* + * We can't open the origin dataset, because + * that would require opening this dsl_dir. + * Just look at its phys directly instead. + */ + err = dmu_bonus_hold(dp->dp_meta_objset, + dsl_dir_phys(dd)->dd_origin_obj, FTAG, + &origin_bonus); + if (err != 0) + goto errout; + origin_phys = origin_bonus->db_data; + dd->dd_origin_txg = + origin_phys->ds_creation_txg; + dmu_buf_rele(origin_bonus, FTAG); + } + + dmu_buf_init_user(&dd->dd_dbu, NULL, dsl_dir_evict_async, + &dd->dd_dbuf); + winner = dmu_buf_set_user_ie(dbuf, &dd->dd_dbu); + if (winner != NULL) { + if (dd->dd_parent) + dsl_dir_rele(dd->dd_parent, dd); + dsl_prop_fini(dd); + mutex_destroy(&dd->dd_lock); + kmem_free(dd, sizeof (dsl_dir_t)); + dd = winner; + } else { + spa_open_ref(dp->dp_spa, dd); + } + } + + /* + * The dsl_dir_t has both open-to-close and instantiate-to-evict + * holds on the spa. We need the open-to-close holds because + * otherwise the spa_refcnt wouldn't change when we open a + * dir which the spa also has open, so we could incorrectly + * think it was OK to unload/export/destroy the pool. We need + * the instantiate-to-evict hold because the dsl_dir_t has a + * pointer to the dd_pool, which has a pointer to the spa_t. + */ + spa_open_ref(dp->dp_spa, tag); + ASSERT3P(dd->dd_pool, ==, dp); + ASSERT3U(dd->dd_object, ==, ddobj); + ASSERT3P(dd->dd_dbuf, ==, dbuf); + *ddp = dd; + return (0); + +errout: + if (dd->dd_parent) + dsl_dir_rele(dd->dd_parent, dd); + dsl_prop_fini(dd); + mutex_destroy(&dd->dd_lock); + kmem_free(dd, sizeof (dsl_dir_t)); + dmu_buf_rele(dbuf, tag); + return (err); +} + +void +dsl_dir_rele(dsl_dir_t *dd, void *tag) +{ + dprintf_dd(dd, "%s\n", ""); + spa_close(dd->dd_pool->dp_spa, tag); + dmu_buf_rele(dd->dd_dbuf, tag); +} + +/* + * Remove a reference to the given dsl dir that is being asynchronously + * released. Async releases occur from a taskq performing eviction of + * dsl datasets and dirs. This process is identical to a normal release + * with the exception of using the async API for releasing the reference on + * the spa. 
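+ *
+ * As an editor-added illustration (not part of the original change):
+ * the eviction callback above is such a caller, releasing its parent
+ * hold with
+ *
+ *	if (dd->dd_parent)
+ *		dsl_dir_async_rele(dd->dd_parent, dd);
+ *
+ * while ordinary open-context code pairs dsl_dir_hold() or
+ * dsl_dir_hold_obj() with dsl_dir_rele().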
+ */ +void +dsl_dir_async_rele(dsl_dir_t *dd, void *tag) +{ + dprintf_dd(dd, "%s\n", ""); + spa_async_close(dd->dd_pool->dp_spa, tag); + dmu_buf_rele(dd->dd_dbuf, tag); +} + +/* buf must be at least ZFS_MAX_DATASET_NAME_LEN bytes */ +void +dsl_dir_name(dsl_dir_t *dd, char *buf) +{ + if (dd->dd_parent) { + dsl_dir_name(dd->dd_parent, buf); + VERIFY3U(strlcat(buf, "/", ZFS_MAX_DATASET_NAME_LEN), <, + ZFS_MAX_DATASET_NAME_LEN); + } else { + buf[0] = '\0'; + } + if (!MUTEX_HELD(&dd->dd_lock)) { + /* + * recursive mutex so that we can use + * dprintf_dd() with dd_lock held + */ + mutex_enter(&dd->dd_lock); + VERIFY3U(strlcat(buf, dd->dd_myname, ZFS_MAX_DATASET_NAME_LEN), + <, ZFS_MAX_DATASET_NAME_LEN); + mutex_exit(&dd->dd_lock); + } else { + VERIFY3U(strlcat(buf, dd->dd_myname, ZFS_MAX_DATASET_NAME_LEN), + <, ZFS_MAX_DATASET_NAME_LEN); + } +} + +/* Calculate name length, avoiding all the strcat calls of dsl_dir_name */ +int +dsl_dir_namelen(dsl_dir_t *dd) +{ + int result = 0; + + if (dd->dd_parent) { + /* parent's name + 1 for the "/" */ + result = dsl_dir_namelen(dd->dd_parent) + 1; + } + + if (!MUTEX_HELD(&dd->dd_lock)) { + /* see dsl_dir_name */ + mutex_enter(&dd->dd_lock); + result += strlen(dd->dd_myname); + mutex_exit(&dd->dd_lock); + } else { + result += strlen(dd->dd_myname); + } + + return (result); +} + +static int +getcomponent(const char *path, char *component, const char **nextp) +{ + char *p; + + if ((path == NULL) || (path[0] == '\0')) + return (SET_ERROR(ENOENT)); + /* This would be a good place to reserve some namespace... */ + p = strpbrk(path, "/@"); + if (p && (p[1] == '/' || p[1] == '@')) { + /* two separators in a row */ + return (SET_ERROR(EINVAL)); + } + if (p == NULL || p == path) { + /* + * if the first thing is an @ or /, it had better be an + * @ and it had better not have any more ats or slashes, + * and it had better have something after the @. + */ + if (p != NULL && + (p[0] != '@' || strpbrk(path+1, "/@") || p[1] == '\0')) + return (SET_ERROR(EINVAL)); + if (strlen(path) >= ZFS_MAX_DATASET_NAME_LEN) + return (SET_ERROR(ENAMETOOLONG)); + (void) strcpy(component, path); + p = NULL; + } else if (p[0] == '/') { + if (p - path >= ZFS_MAX_DATASET_NAME_LEN) + return (SET_ERROR(ENAMETOOLONG)); + (void) strncpy(component, path, p - path); + component[p - path] = '\0'; + p++; + } else if (p[0] == '@') { + /* + * if the next separator is an @, there better not be + * any more slashes. + */ + if (strchr(path, '/')) + return (SET_ERROR(EINVAL)); + if (p - path >= ZFS_MAX_DATASET_NAME_LEN) + return (SET_ERROR(ENAMETOOLONG)); + (void) strncpy(component, path, p - path); + component[p - path] = '\0'; + } else { + panic("invalid p=%p", (void *)p); + } + *nextp = p; + return (0); +} + +/* + * Return the dsl_dir_t, and possibly the last component which couldn't + * be found in *tail. The name must be in the specified dsl_pool_t. This + * thread must hold the dp_config_rwlock for the pool. Returns NULL if the + * path is bogus, or if tail==NULL and we couldn't parse the whole name. + * (*tail)[0] == '@' means that the last component is a snapshot. + */ +int +dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag, + dsl_dir_t **ddp, const char **tailp) +{ + char buf[ZFS_MAX_DATASET_NAME_LEN]; + const char *spaname, *next, *nextnext = NULL; + int err; + dsl_dir_t *dd; + uint64_t ddobj; + + err = getcomponent(name, buf, &next); + if (err != 0) + return (err); + + /* Make sure the name is in the specified pool. 
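+	 * An editor's example: holding "tank/home/user" against a pool
+	 * named "tank" parses the leading component with getcomponent()
+	 * and compares it to spa_name(); any mismatch fails with EXDEV
+	 * below.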
*/ + spaname = spa_name(dp->dp_spa); + if (strcmp(buf, spaname) != 0) + return (SET_ERROR(EXDEV)); + + ASSERT(dsl_pool_config_held(dp)); + + err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj, NULL, tag, &dd); + if (err != 0) { + return (err); + } + + while (next != NULL) { + dsl_dir_t *child_dd; + err = getcomponent(next, buf, &nextnext); + if (err != 0) + break; + ASSERT(next[0] != '\0'); + if (next[0] == '@') + break; + dprintf("looking up %s in obj%lld\n", + buf, dsl_dir_phys(dd)->dd_child_dir_zapobj); + + err = zap_lookup(dp->dp_meta_objset, + dsl_dir_phys(dd)->dd_child_dir_zapobj, + buf, sizeof (ddobj), 1, &ddobj); + if (err != 0) { + if (err == ENOENT) + err = 0; + break; + } + + err = dsl_dir_hold_obj(dp, ddobj, buf, tag, &child_dd); + if (err != 0) + break; + dsl_dir_rele(dd, tag); + dd = child_dd; + next = nextnext; + } + + if (err != 0) { + dsl_dir_rele(dd, tag); + return (err); + } + + /* + * It's an error if there's more than one component left, or + * tailp==NULL and there's any component left. + */ + if (next != NULL && + (tailp == NULL || (nextnext && nextnext[0] != '\0'))) { + /* bad path name */ + dsl_dir_rele(dd, tag); + dprintf("next=%p (%s) tail=%p\n", next, next?next:"", tailp); + err = SET_ERROR(ENOENT); + } + if (tailp != NULL) + *tailp = next; + *ddp = dd; + return (err); +} + +/* + * If the counts are already initialized for this filesystem and its + * descendants then do nothing, otherwise initialize the counts. + * + * The counts on this filesystem, and those below, may be uninitialized due to + * either the use of a pre-existing pool which did not support the + * filesystem/snapshot limit feature, or one in which the feature had not yet + * been enabled. + * + * Recursively descend the filesystem tree and update the filesystem/snapshot + * counts on each filesystem below, then update the cumulative count on the + * current filesystem. If the filesystem already has a count set on it, + * then we know that its counts, and the counts on the filesystems below it, + * are already correct, so we don't have to update this filesystem. + */ +static void +dsl_dir_init_fs_ss_count(dsl_dir_t *dd, dmu_tx_t *tx) +{ + uint64_t my_fs_cnt = 0; + uint64_t my_ss_cnt = 0; + dsl_pool_t *dp = dd->dd_pool; + objset_t *os = dp->dp_meta_objset; + zap_cursor_t *zc; + zap_attribute_t *za; + dsl_dataset_t *ds; + + ASSERT(spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)); + ASSERT(dsl_pool_config_held(dp)); + ASSERT(dmu_tx_is_syncing(tx)); + + dsl_dir_zapify(dd, tx); + + /* + * If the filesystem count has already been initialized then we + * don't need to recurse down any further. + */ + if (zap_contains(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT) == 0) + return; + + zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP); + za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); + + /* Iterate my child dirs */ + for (zap_cursor_init(zc, os, dsl_dir_phys(dd)->dd_child_dir_zapobj); + zap_cursor_retrieve(zc, za) == 0; zap_cursor_advance(zc)) { + dsl_dir_t *chld_dd; + uint64_t count; + + VERIFY0(dsl_dir_hold_obj(dp, za->za_first_integer, NULL, FTAG, + &chld_dd)); + + /* + * Ignore hidden ($FREE, $MOS & $ORIGIN) objsets and + * temporary datasets. 
+ */ + if (chld_dd->dd_myname[0] == '$' || + chld_dd->dd_myname[0] == '%') { + dsl_dir_rele(chld_dd, FTAG); + continue; + } + + my_fs_cnt++; /* count this child */ + + dsl_dir_init_fs_ss_count(chld_dd, tx); + + VERIFY0(zap_lookup(os, chld_dd->dd_object, + DD_FIELD_FILESYSTEM_COUNT, sizeof (count), 1, &count)); + my_fs_cnt += count; + VERIFY0(zap_lookup(os, chld_dd->dd_object, + DD_FIELD_SNAPSHOT_COUNT, sizeof (count), 1, &count)); + my_ss_cnt += count; + + dsl_dir_rele(chld_dd, FTAG); + } + zap_cursor_fini(zc); + /* Count my snapshots (we counted children's snapshots above) */ + VERIFY0(dsl_dataset_hold_obj(dd->dd_pool, + dsl_dir_phys(dd)->dd_head_dataset_obj, FTAG, &ds)); + + for (zap_cursor_init(zc, os, dsl_dataset_phys(ds)->ds_snapnames_zapobj); + zap_cursor_retrieve(zc, za) == 0; + zap_cursor_advance(zc)) { + /* Don't count temporary snapshots */ + if (za->za_name[0] != '%') + my_ss_cnt++; + } + zap_cursor_fini(zc); + + dsl_dataset_rele(ds, FTAG); + + kmem_free(zc, sizeof (zap_cursor_t)); + kmem_free(za, sizeof (zap_attribute_t)); + + /* we're in a sync task, update counts */ + dmu_buf_will_dirty(dd->dd_dbuf, tx); + VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT, + sizeof (my_fs_cnt), 1, &my_fs_cnt, tx)); + VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT, + sizeof (my_ss_cnt), 1, &my_ss_cnt, tx)); +} + +static int +dsl_dir_actv_fs_ss_limit_check(void *arg, dmu_tx_t *tx) +{ + char *ddname = (char *)arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + dsl_dir_t *dd; + int error; + + error = dsl_dataset_hold(dp, ddname, FTAG, &ds); + if (error != 0) + return (error); + + if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(ENOTSUP)); + } + + dd = ds->ds_dir; + if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT) && + dsl_dir_is_zapified(dd) && + zap_contains(dp->dp_meta_objset, dd->dd_object, + DD_FIELD_FILESYSTEM_COUNT) == 0) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EALREADY)); + } + + dsl_dataset_rele(ds, FTAG); + return (0); +} + +static void +dsl_dir_actv_fs_ss_limit_sync(void *arg, dmu_tx_t *tx) +{ + char *ddname = (char *)arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + spa_t *spa; + + VERIFY0(dsl_dataset_hold(dp, ddname, FTAG, &ds)); + + spa = dsl_dataset_get_spa(ds); + + if (!spa_feature_is_active(spa, SPA_FEATURE_FS_SS_LIMIT)) { + /* + * Since the feature was not active and we're now setting a + * limit, increment the feature-active counter so that the + * feature becomes active for the first time. + * + * We are already in a sync task so we can update the MOS. + */ + spa_feature_incr(spa, SPA_FEATURE_FS_SS_LIMIT, tx); + } + + /* + * Since we are now setting a non-UINT64_MAX limit on the filesystem, + * we need to ensure the counts are correct. Descend down the tree from + * this point and update all of the counts to be accurate. + */ + dsl_dir_init_fs_ss_count(ds->ds_dir, tx); + + dsl_dataset_rele(ds, FTAG); +} + +/* + * Make sure the feature is enabled and activate it if necessary. + * Since we're setting a limit, ensure the on-disk counts are valid. + * This is only called by the ioctl path when setting a limit value. + * + * We do not need to validate the new limit, since users who can change the + * limit are also allowed to exceed the limit. 
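+ *
+ * A hedged usage sketch (editor-added; error handling abbreviated):
+ *
+ *	error = dsl_dir_activate_fs_ss_limit(ddname);
+ *	if (error != 0)
+ *		return (error);
+ *
+ * After this returns 0, the counts at and below ddname are valid and
+ * dsl_fs_ss_limit_check() can enforce the new limit.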
+ */ +int +dsl_dir_activate_fs_ss_limit(const char *ddname) +{ + int error; + + error = dsl_sync_task(ddname, dsl_dir_actv_fs_ss_limit_check, + dsl_dir_actv_fs_ss_limit_sync, (void *)ddname, 0, + ZFS_SPACE_CHECK_RESERVED); + + if (error == EALREADY) + error = 0; + + return (error); +} + +/* + * Used to determine if the filesystem_limit or snapshot_limit should be + * enforced. We allow the limit to be exceeded if the user has permission to + * write the property value. We pass in the creds that we got in the open + * context since we will always be the GZ root in syncing context. We also have + * to handle the case where we are allowed to change the limit on the current + * dataset, but there may be another limit in the tree above. + * + * We can never modify these two properties within a non-global zone. In + * addition, the other checks are modeled on zfs_secpolicy_write_perms. We + * can't use that function since we are already holding the dp_config_rwlock. + * In addition, we already have the dd and dealing with snapshots is simplified + * in this code. + */ + +typedef enum { + ENFORCE_ALWAYS, + ENFORCE_NEVER, + ENFORCE_ABOVE +} enforce_res_t; + +static enforce_res_t +dsl_enforce_ds_ss_limits(dsl_dir_t *dd, zfs_prop_t prop, cred_t *cr) +{ + enforce_res_t enforce = ENFORCE_ALWAYS; + uint64_t obj; + dsl_dataset_t *ds; + uint64_t zoned; + + ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT || + prop == ZFS_PROP_SNAPSHOT_LIMIT); + +#ifdef _KERNEL +#ifdef __FreeBSD__ + if (jailed(cr)) +#else + if (crgetzoneid(cr) != GLOBAL_ZONEID) +#endif + return (ENFORCE_ALWAYS); + + if (secpolicy_zfs(cr) == 0) + return (ENFORCE_NEVER); +#endif + + if ((obj = dsl_dir_phys(dd)->dd_head_dataset_obj) == 0) + return (ENFORCE_ALWAYS); + + ASSERT(dsl_pool_config_held(dd->dd_pool)); + + if (dsl_dataset_hold_obj(dd->dd_pool, obj, FTAG, &ds) != 0) + return (ENFORCE_ALWAYS); + + if (dsl_prop_get_ds(ds, "zoned", 8, 1, &zoned, NULL) || zoned) { + /* Only root can access zoned fs's from the GZ */ + enforce = ENFORCE_ALWAYS; + } else { + if (dsl_deleg_access_impl(ds, zfs_prop_to_name(prop), cr) == 0) + enforce = ENFORCE_ABOVE; + } + + dsl_dataset_rele(ds, FTAG); + return (enforce); +} + +static void +dsl_dir_update_last_remap_txg_sync(void *varg, dmu_tx_t *tx) +{ + ddulrt_arg_t *arg = varg; + uint64_t last_remap_txg; + dsl_dir_t *dd = arg->ddulrta_dd; + objset_t *mos = dd->dd_pool->dp_meta_objset; + + dsl_dir_zapify(dd, tx); + if (zap_lookup(mos, dd->dd_object, DD_FIELD_LAST_REMAP_TXG, + sizeof (last_remap_txg), 1, &last_remap_txg) != 0 || + last_remap_txg < arg->ddlrta_txg) { + VERIFY0(zap_update(mos, dd->dd_object, DD_FIELD_LAST_REMAP_TXG, + sizeof (arg->ddlrta_txg), 1, &arg->ddlrta_txg, tx)); + } +} + +int +dsl_dir_update_last_remap_txg(dsl_dir_t *dd, uint64_t txg) +{ + ddulrt_arg_t arg; + arg.ddulrta_dd = dd; + arg.ddlrta_txg = txg; + + return (dsl_sync_task(spa_name(dd->dd_pool->dp_spa), + NULL, dsl_dir_update_last_remap_txg_sync, &arg, + 1, ZFS_SPACE_CHECK_RESERVED)); +} + +/* + * Check if adding additional child filesystem(s) would exceed any filesystem + * limits or adding additional snapshot(s) would exceed any snapshot limits. + * The prop argument indicates which limit to check. + * + * Note that all filesystem limits up to the root (or the highest + * initialized) filesystem or the given ancestor must be satisfied. 
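+ *
+ * For instance (editor's illustration), creating one snapshot of
+ * pool/a/b checks the snapshot limit on pool/a/b, then pool/a, then
+ * pool, stopping early at the given ancestor or at the first
+ * uninitialized count:
+ *
+ *	err = dsl_fs_ss_limit_check(ds->ds_dir, 1,
+ *	    ZFS_PROP_SNAPSHOT_LIMIT, NULL, cr);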
+ */ +int +dsl_fs_ss_limit_check(dsl_dir_t *dd, uint64_t delta, zfs_prop_t prop, + dsl_dir_t *ancestor, cred_t *cr) +{ + objset_t *os = dd->dd_pool->dp_meta_objset; + uint64_t limit, count; + char *count_prop; + enforce_res_t enforce; + int err = 0; + + ASSERT(dsl_pool_config_held(dd->dd_pool)); + ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT || + prop == ZFS_PROP_SNAPSHOT_LIMIT); + + /* + * If we're allowed to change the limit, don't enforce the limit + * e.g. this can happen if a snapshot is taken by an administrative + * user in the global zone (i.e. a recursive snapshot by root). + * However, we must handle the case of delegated permissions where we + * are allowed to change the limit on the current dataset, but there + * is another limit in the tree above. + */ + enforce = dsl_enforce_ds_ss_limits(dd, prop, cr); + if (enforce == ENFORCE_NEVER) + return (0); + + /* + * e.g. if renaming a dataset with no snapshots, count adjustment + * is 0. + */ + if (delta == 0) + return (0); + + if (prop == ZFS_PROP_SNAPSHOT_LIMIT) { + /* + * We don't enforce the limit for temporary snapshots. This is + * indicated by a NULL cred_t argument. + */ + if (cr == NULL) + return (0); + + count_prop = DD_FIELD_SNAPSHOT_COUNT; + } else { + count_prop = DD_FIELD_FILESYSTEM_COUNT; + } + + /* + * If an ancestor has been provided, stop checking the limit once we + * hit that dir. We need this during rename so that we don't overcount + * the check once we recurse up to the common ancestor. + */ + if (ancestor == dd) + return (0); + + /* + * If we hit an uninitialized node while recursing up the tree, we can + * stop since we know there is no limit here (or above). The counts are + * not valid on this node and we know we won't touch this node's counts. + */ + if (!dsl_dir_is_zapified(dd) || zap_lookup(os, dd->dd_object, + count_prop, sizeof (count), 1, &count) == ENOENT) + return (0); + + err = dsl_prop_get_dd(dd, zfs_prop_to_name(prop), 8, 1, &limit, NULL, + B_FALSE); + if (err != 0) + return (err); + + /* Is there a limit which we've hit? */ + if (enforce == ENFORCE_ALWAYS && (count + delta) > limit) + return (SET_ERROR(EDQUOT)); + + if (dd->dd_parent != NULL) + err = dsl_fs_ss_limit_check(dd->dd_parent, delta, prop, + ancestor, cr); + + return (err); +} + +/* + * Adjust the filesystem or snapshot count for the specified dsl_dir_t and all + * parents. When a new filesystem/snapshot is created, increment the count on + * all parents, and when a filesystem/snapshot is destroyed, decrement the + * count. + */ +void +dsl_fs_ss_count_adjust(dsl_dir_t *dd, int64_t delta, const char *prop, + dmu_tx_t *tx) +{ + int err; + objset_t *os = dd->dd_pool->dp_meta_objset; + uint64_t count; + + ASSERT(dsl_pool_config_held(dd->dd_pool)); + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(strcmp(prop, DD_FIELD_FILESYSTEM_COUNT) == 0 || + strcmp(prop, DD_FIELD_SNAPSHOT_COUNT) == 0); + + /* + * When we receive an incremental stream into a filesystem that already + * exists, a temporary clone is created. We don't count this temporary + * clone, whose name begins with a '%'. We also ignore hidden ($FREE, + * $MOS & $ORIGIN) objsets. + */ + if ((dd->dd_myname[0] == '%' || dd->dd_myname[0] == '$') && + strcmp(prop, DD_FIELD_FILESYSTEM_COUNT) == 0) + return; + + /* + * e.g. 
if renaming a dataset with no snapshots, count adjustment is 0 + */ + if (delta == 0) + return; + + /* + * If we hit an uninitialized node while recursing up the tree, we can + * stop since we know the counts are not valid on this node and we + * know we shouldn't touch this node's counts. An uninitialized count + * on the node indicates that either the feature has not yet been + * activated or there are no limits on this part of the tree. + */ + if (!dsl_dir_is_zapified(dd) || (err = zap_lookup(os, dd->dd_object, + prop, sizeof (count), 1, &count)) == ENOENT) + return; + VERIFY0(err); + + count += delta; + /* Use a signed verify to make sure we're not neg. */ + VERIFY3S(count, >=, 0); + + VERIFY0(zap_update(os, dd->dd_object, prop, sizeof (count), 1, &count, + tx)); + + /* Roll up this additional count into our ancestors */ + if (dd->dd_parent != NULL) + dsl_fs_ss_count_adjust(dd->dd_parent, delta, prop, tx); +} + +uint64_t +dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name, + dmu_tx_t *tx) +{ + objset_t *mos = dp->dp_meta_objset; + uint64_t ddobj; + dsl_dir_phys_t *ddphys; + dmu_buf_t *dbuf; + + ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0, + DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx); + if (pds) { + VERIFY0(zap_add(mos, dsl_dir_phys(pds)->dd_child_dir_zapobj, + name, sizeof (uint64_t), 1, &ddobj, tx)); + } else { + /* it's the root dir */ + VERIFY0(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, &ddobj, tx)); + } + VERIFY0(dmu_bonus_hold(mos, ddobj, FTAG, &dbuf)); + dmu_buf_will_dirty(dbuf, tx); + ddphys = dbuf->db_data; + + ddphys->dd_creation_time = gethrestime_sec(); + if (pds) { + ddphys->dd_parent_obj = pds->dd_object; + + /* update the filesystem counts */ + dsl_fs_ss_count_adjust(pds, 1, DD_FIELD_FILESYSTEM_COUNT, tx); + } + ddphys->dd_props_zapobj = zap_create(mos, + DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx); + ddphys->dd_child_dir_zapobj = zap_create(mos, + DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx); + if (spa_version(dp->dp_spa) >= SPA_VERSION_USED_BREAKDOWN) + ddphys->dd_flags |= DD_FLAG_USED_BREAKDOWN; + dmu_buf_rele(dbuf, FTAG); + + return (ddobj); +} + +boolean_t +dsl_dir_is_clone(dsl_dir_t *dd) +{ + return (dsl_dir_phys(dd)->dd_origin_obj && + (dd->dd_pool->dp_origin_snap == NULL || + dsl_dir_phys(dd)->dd_origin_obj != + dd->dd_pool->dp_origin_snap->ds_object)); +} + + +uint64_t +dsl_dir_get_used(dsl_dir_t *dd) +{ + return (dsl_dir_phys(dd)->dd_used_bytes); +} + +uint64_t +dsl_dir_get_compressed(dsl_dir_t *dd) +{ + return (dsl_dir_phys(dd)->dd_compressed_bytes); +} + +uint64_t +dsl_dir_get_quota(dsl_dir_t *dd) +{ + return (dsl_dir_phys(dd)->dd_quota); +} + +uint64_t +dsl_dir_get_reservation(dsl_dir_t *dd) +{ + return (dsl_dir_phys(dd)->dd_reserved); +} + +uint64_t +dsl_dir_get_compressratio(dsl_dir_t *dd) +{ + /* a fixed point number, 100x the ratio */ + return (dsl_dir_phys(dd)->dd_compressed_bytes == 0 ? 
100 : + (dsl_dir_phys(dd)->dd_uncompressed_bytes * 100 / + dsl_dir_phys(dd)->dd_compressed_bytes)); +} + +uint64_t +dsl_dir_get_logicalused(dsl_dir_t *dd) +{ + return (dsl_dir_phys(dd)->dd_uncompressed_bytes); +} + +uint64_t +dsl_dir_get_usedsnap(dsl_dir_t *dd) +{ + return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_SNAP]); +} + +uint64_t +dsl_dir_get_usedds(dsl_dir_t *dd) +{ + return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_HEAD]); +} + +uint64_t +dsl_dir_get_usedrefreserv(dsl_dir_t *dd) +{ + return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_REFRSRV]); +} + +uint64_t +dsl_dir_get_usedchild(dsl_dir_t *dd) +{ + return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_CHILD] + + dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_CHILD_RSRV]); +} + +void +dsl_dir_get_origin(dsl_dir_t *dd, char *buf) +{ + dsl_dataset_t *ds; + VERIFY0(dsl_dataset_hold_obj(dd->dd_pool, + dsl_dir_phys(dd)->dd_origin_obj, FTAG, &ds)); + + dsl_dataset_name(ds, buf); + + dsl_dataset_rele(ds, FTAG); +} + +int +dsl_dir_get_filesystem_count(dsl_dir_t *dd, uint64_t *count) +{ + if (dsl_dir_is_zapified(dd)) { + objset_t *os = dd->dd_pool->dp_meta_objset; + return (zap_lookup(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT, + sizeof (*count), 1, count)); + } else { + return (ENOENT); + } +} + +int +dsl_dir_get_snapshot_count(dsl_dir_t *dd, uint64_t *count) +{ + if (dsl_dir_is_zapified(dd)) { + objset_t *os = dd->dd_pool->dp_meta_objset; + return (zap_lookup(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT, + sizeof (*count), 1, count)); + } else { + return (ENOENT); + } +} + +int +dsl_dir_get_remaptxg(dsl_dir_t *dd, uint64_t *count) +{ + if (dsl_dir_is_zapified(dd)) { + objset_t *os = dd->dd_pool->dp_meta_objset; + return (zap_lookup(os, dd->dd_object, DD_FIELD_LAST_REMAP_TXG, + sizeof (*count), 1, count)); + } else { + return (ENOENT); + } +} + +void +dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv) +{ + mutex_enter(&dd->dd_lock); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA, + dsl_dir_get_quota(dd)); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_RESERVATION, + dsl_dir_get_reservation(dd)); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALUSED, + dsl_dir_get_logicalused(dd)); + if (dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN) { + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDSNAP, + dsl_dir_get_usedsnap(dd)); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDDS, + dsl_dir_get_usedds(dd)); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDREFRESERV, + dsl_dir_get_usedrefreserv(dd)); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDCHILD, + dsl_dir_get_usedchild(dd)); + } + mutex_exit(&dd->dd_lock); + + uint64_t count; + if (dsl_dir_get_filesystem_count(dd, &count) == 0) { + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_FILESYSTEM_COUNT, + count); + } + if (dsl_dir_get_snapshot_count(dd, &count) == 0) { + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_SNAPSHOT_COUNT, + count); + } + if (dsl_dir_get_remaptxg(dd, &count) == 0) { + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REMAPTXG, + count); + } + + if (dsl_dir_is_clone(dd)) { + char buf[ZFS_MAX_DATASET_NAME_LEN]; + dsl_dir_get_origin(dd, buf); + dsl_prop_nvlist_add_string(nv, ZFS_PROP_ORIGIN, buf); + } + +} + +void +dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx) +{ + dsl_pool_t *dp = dd->dd_pool; + + ASSERT(dsl_dir_phys(dd)); + + if (txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg)) { + /* up the hold count until we can be written out */ + dmu_buf_add_ref(dd->dd_dbuf, dd); + } +} + +static int64_t +parent_delta(dsl_dir_t *dd, uint64_t used, int64_t delta) +{ + uint64_t old_accounted = MAX(used, 
dsl_dir_phys(dd)->dd_reserved); + uint64_t new_accounted = + MAX(used + delta, dsl_dir_phys(dd)->dd_reserved); + return (new_accounted - old_accounted); +} + +void +dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx) +{ + ASSERT(dmu_tx_is_syncing(tx)); + + mutex_enter(&dd->dd_lock); + ASSERT0(dd->dd_tempreserved[tx->tx_txg&TXG_MASK]); + dprintf_dd(dd, "txg=%llu towrite=%lluK\n", tx->tx_txg, + dd->dd_space_towrite[tx->tx_txg&TXG_MASK] / 1024); + dd->dd_space_towrite[tx->tx_txg&TXG_MASK] = 0; + mutex_exit(&dd->dd_lock); + + /* release the hold from dsl_dir_dirty */ + dmu_buf_rele(dd->dd_dbuf, dd); +} + +static uint64_t +dsl_dir_space_towrite(dsl_dir_t *dd) +{ + uint64_t space = 0; + + ASSERT(MUTEX_HELD(&dd->dd_lock)); + + for (int i = 0; i < TXG_SIZE; i++) { + space += dd->dd_space_towrite[i & TXG_MASK]; + ASSERT3U(dd->dd_space_towrite[i & TXG_MASK], >=, 0); + } + return (space); +} + +/* + * How much space would dd have available if ancestor had delta applied + * to it? If ondiskonly is set, we're only interested in what's + * on-disk, not estimated pending changes. + */ +uint64_t +dsl_dir_space_available(dsl_dir_t *dd, + dsl_dir_t *ancestor, int64_t delta, int ondiskonly) +{ + uint64_t parentspace, myspace, quota, used; + + /* + * If there are no restrictions otherwise, assume we have + * unlimited space available. + */ + quota = UINT64_MAX; + parentspace = UINT64_MAX; + + if (dd->dd_parent != NULL) { + parentspace = dsl_dir_space_available(dd->dd_parent, + ancestor, delta, ondiskonly); + } + + mutex_enter(&dd->dd_lock); + if (dsl_dir_phys(dd)->dd_quota != 0) + quota = dsl_dir_phys(dd)->dd_quota; + used = dsl_dir_phys(dd)->dd_used_bytes; + if (!ondiskonly) + used += dsl_dir_space_towrite(dd); + + if (dd->dd_parent == NULL) { + uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, + ZFS_SPACE_CHECK_NORMAL); + quota = MIN(quota, poolsize); + } + + if (dsl_dir_phys(dd)->dd_reserved > used && parentspace != UINT64_MAX) { + /* + * We have some space reserved, in addition to what our + * parent gave us. + */ + parentspace += dsl_dir_phys(dd)->dd_reserved - used; + } + + if (dd == ancestor) { + ASSERT(delta <= 0); + ASSERT(used >= -delta); + used += delta; + if (parentspace != UINT64_MAX) + parentspace -= delta; + } + + if (used > quota) { + /* over quota */ + myspace = 0; + } else { + /* + * the lesser of the space provided by our parent and + * the space left in our quota + */ + myspace = MIN(parentspace, quota - used); + } + + mutex_exit(&dd->dd_lock); + + return (myspace); +} + +struct tempreserve { + list_node_t tr_node; + dsl_dir_t *tr_ds; + uint64_t tr_size; +}; + +static int +dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree, + boolean_t ignorequota, list_t *tr_list, + dmu_tx_t *tx, boolean_t first) +{ + uint64_t txg = tx->tx_txg; + uint64_t quota; + struct tempreserve *tr; + int retval = EDQUOT; + uint64_t ref_rsrv = 0; + + ASSERT3U(txg, !=, 0); + ASSERT3S(asize, >, 0); + + mutex_enter(&dd->dd_lock); + + /* + * Check against the dsl_dir's quota. We don't add in the delta + * when checking for over-quota because they get one free hit. + */ + uint64_t est_inflight = dsl_dir_space_towrite(dd); + for (int i = 0; i < TXG_SIZE; i++) + est_inflight += dd->dd_tempreserved[i]; + uint64_t used_on_disk = dsl_dir_phys(dd)->dd_used_bytes; + + /* + * On the first iteration, fetch the dataset's used-on-disk and + * refreservation values. Also, if checkrefquota is set, test if + * allocating this space would exceed the dataset's refquota. 
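+	 * (Editor's note: "checkrefquota" here corresponds to the
+	 * !netfree value passed to dsl_dataset_check_quota() below.)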
+ */ + if (first && tx->tx_objset) { + int error; + dsl_dataset_t *ds = tx->tx_objset->os_dsl_dataset; + + error = dsl_dataset_check_quota(ds, !netfree, + asize, est_inflight, &used_on_disk, &ref_rsrv); + if (error != 0) { + mutex_exit(&dd->dd_lock); + return (error); + } + } + + /* + * If this transaction will result in a net free of space, + * we want to let it through. + */ + if (ignorequota || netfree || dsl_dir_phys(dd)->dd_quota == 0) + quota = UINT64_MAX; + else + quota = dsl_dir_phys(dd)->dd_quota; + + /* + * Adjust the quota against the actual pool size at the root + * minus any outstanding deferred frees. + * To ensure that it's possible to remove files from a full + * pool without inducing transient overcommits, we throttle + * netfree transactions against a quota that is slightly larger, + * but still within the pool's allocation slop. In cases where + * we're very close to full, this will allow a steady trickle of + * removes to get through. + */ + uint64_t deferred = 0; + if (dd->dd_parent == NULL) { + uint64_t avail = dsl_pool_unreserved_space(dd->dd_pool, + (netfree) ? + ZFS_SPACE_CHECK_RESERVED : ZFS_SPACE_CHECK_NORMAL); + + if (avail < quota) { + quota = avail; + retval = ENOSPC; + } + } + + /* + * If they are requesting more space, and our current estimate + * is over quota, they get to try again unless the actual + * on-disk is over quota and there are no pending changes (which + * may free up space for us). + */ + if (used_on_disk + est_inflight >= quota) { + if (est_inflight > 0 || used_on_disk < quota || + (retval == ENOSPC && used_on_disk < quota + deferred)) + retval = ERESTART; + dprintf_dd(dd, "failing: used=%lluK inflight = %lluK " + "quota=%lluK tr=%lluK err=%d\n", + used_on_disk>>10, est_inflight>>10, + quota>>10, asize>>10, retval); + mutex_exit(&dd->dd_lock); + return (SET_ERROR(retval)); + } + + /* We need to up our estimated delta before dropping dd_lock */ + dd->dd_tempreserved[txg & TXG_MASK] += asize; + + uint64_t parent_rsrv = parent_delta(dd, used_on_disk + est_inflight, + asize - ref_rsrv); + mutex_exit(&dd->dd_lock); + + tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP); + tr->tr_ds = dd; + tr->tr_size = asize; + list_insert_tail(tr_list, tr); + + /* see if it's OK with our parent */ + if (dd->dd_parent != NULL && parent_rsrv != 0) { + boolean_t ismos = (dsl_dir_phys(dd)->dd_head_dataset_obj == 0); + + return (dsl_dir_tempreserve_impl(dd->dd_parent, + parent_rsrv, netfree, ismos, tr_list, tx, B_FALSE)); + } else { + return (0); + } +} + +/* + * Reserve space in this dsl_dir, to be used in this tx's txg. + * After the space has been dirtied (and dsl_dir_willuse_space() + * has been called), the reservation should be canceled, using + * dsl_dir_tempreserve_clear(). 
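+ *
+ * A minimal usage sketch (editor-added; assumes the caller already
+ * has a dirtying tx):
+ *
+ *	void *cookie;
+ *
+ *	err = dsl_dir_tempreserve_space(dd, lsize, asize, B_FALSE,
+ *	    &cookie, tx);
+ *	if (err == 0) {
+ *		... dirty the space ...
+ *		dsl_dir_tempreserve_clear(cookie, tx);
+ *	}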
+ */ +int +dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize, + boolean_t netfree, void **tr_cookiep, dmu_tx_t *tx) +{ + int err; + list_t *tr_list; + + if (asize == 0) { + *tr_cookiep = NULL; + return (0); + } + + tr_list = kmem_alloc(sizeof (list_t), KM_SLEEP); + list_create(tr_list, sizeof (struct tempreserve), + offsetof(struct tempreserve, tr_node)); + ASSERT3S(asize, >, 0); + + err = arc_tempreserve_space(dd->dd_pool->dp_spa, lsize, tx->tx_txg); + if (err == 0) { + struct tempreserve *tr; + + tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP); + tr->tr_size = lsize; + list_insert_tail(tr_list, tr); + } else { + if (err == EAGAIN) { + /* + * If arc_memory_throttle() detected that pageout + * is running and we are low on memory, we delay new + * non-pageout transactions to give pageout an + * advantage. + * + * It is unfortunate to be delaying while the caller's + * locks are held. + */ + txg_delay(dd->dd_pool, tx->tx_txg, + MSEC2NSEC(10), MSEC2NSEC(10)); + err = SET_ERROR(ERESTART); + } + } + + if (err == 0) { + err = dsl_dir_tempreserve_impl(dd, asize, netfree, + B_FALSE, tr_list, tx, B_TRUE); + } + + if (err != 0) + dsl_dir_tempreserve_clear(tr_list, tx); + else + *tr_cookiep = tr_list; + + return (err); +} + +/* + * Clear a temporary reservation that we previously made with + * dsl_dir_tempreserve_space(). + */ +void +dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx) +{ + int txgidx = tx->tx_txg & TXG_MASK; + list_t *tr_list = tr_cookie; + struct tempreserve *tr; + + ASSERT3U(tx->tx_txg, !=, 0); + + if (tr_cookie == NULL) + return; + + while ((tr = list_head(tr_list)) != NULL) { + if (tr->tr_ds) { + mutex_enter(&tr->tr_ds->dd_lock); + ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=, + tr->tr_size); + tr->tr_ds->dd_tempreserved[txgidx] -= tr->tr_size; + mutex_exit(&tr->tr_ds->dd_lock); + } else { + arc_tempreserve_clear(tr->tr_size); + } + list_remove(tr_list, tr); + kmem_free(tr, sizeof (struct tempreserve)); + } + + kmem_free(tr_list, sizeof (list_t)); +} + +/* + * This should be called from open context when we think we're going to write + * or free space, for example when dirtying data. Be conservative; it's okay + * to write less space or free more, but we don't want to write more or free + * less than the amount specified. + */ +void +dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx) +{ + int64_t parent_space; + uint64_t est_used; + + mutex_enter(&dd->dd_lock); + if (space > 0) + dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space; + + est_used = dsl_dir_space_towrite(dd) + dsl_dir_phys(dd)->dd_used_bytes; + parent_space = parent_delta(dd, est_used, space); + mutex_exit(&dd->dd_lock); + + /* Make sure that we clean up dd_space_to* */ + dsl_dir_dirty(dd, tx); + + /* XXX this is potentially expensive and unnecessary... */ + if (parent_space && dd->dd_parent) + dsl_dir_willuse_space(dd->dd_parent, parent_space, tx); +} + +/* call from syncing context when we actually write/free space for this dd */ +void +dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type, + int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx) +{ + int64_t accounted_delta; + + /* + * dsl_dataset_set_refreservation_sync_impl() calls this with + * dd_lock held, so that it can atomically update + * ds->ds_reserved and the dsl_dir accounting, so that + * dsl_dataset_check_quota() can see dataset and dir accounting + * consistently. 
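+	 * (Editor's note: this is why the needlock test below avoids
+	 * re-entering dd_lock when the caller already holds it.)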
+ */ + boolean_t needlock = !MUTEX_HELD(&dd->dd_lock); + + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(type < DD_USED_NUM); + + dmu_buf_will_dirty(dd->dd_dbuf, tx); + + if (needlock) + mutex_enter(&dd->dd_lock); + accounted_delta = + parent_delta(dd, dsl_dir_phys(dd)->dd_used_bytes, used); + ASSERT(used >= 0 || dsl_dir_phys(dd)->dd_used_bytes >= -used); + ASSERT(compressed >= 0 || + dsl_dir_phys(dd)->dd_compressed_bytes >= -compressed); + ASSERT(uncompressed >= 0 || + dsl_dir_phys(dd)->dd_uncompressed_bytes >= -uncompressed); + dsl_dir_phys(dd)->dd_used_bytes += used; + dsl_dir_phys(dd)->dd_uncompressed_bytes += uncompressed; + dsl_dir_phys(dd)->dd_compressed_bytes += compressed; + + if (dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN) { + ASSERT(used > 0 || + dsl_dir_phys(dd)->dd_used_breakdown[type] >= -used); + dsl_dir_phys(dd)->dd_used_breakdown[type] += used; +#ifdef DEBUG + dd_used_t t; + uint64_t u = 0; + for (t = 0; t < DD_USED_NUM; t++) + u += dsl_dir_phys(dd)->dd_used_breakdown[t]; + ASSERT3U(u, ==, dsl_dir_phys(dd)->dd_used_bytes); +#endif + } + if (needlock) + mutex_exit(&dd->dd_lock); + + if (dd->dd_parent != NULL) { + dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD, + accounted_delta, compressed, uncompressed, tx); + dsl_dir_transfer_space(dd->dd_parent, + used - accounted_delta, + DD_USED_CHILD_RSRV, DD_USED_CHILD, NULL); + } +} + +void +dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta, + dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx) +{ + ASSERT(tx == NULL || dmu_tx_is_syncing(tx)); + ASSERT(oldtype < DD_USED_NUM); + ASSERT(newtype < DD_USED_NUM); + + if (delta == 0 || + !(dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN)) + return; + + if (tx != NULL) + dmu_buf_will_dirty(dd->dd_dbuf, tx); + mutex_enter(&dd->dd_lock); + ASSERT(delta > 0 ? + dsl_dir_phys(dd)->dd_used_breakdown[oldtype] >= delta : + dsl_dir_phys(dd)->dd_used_breakdown[newtype] >= -delta); + ASSERT(dsl_dir_phys(dd)->dd_used_bytes >= ABS(delta)); + dsl_dir_phys(dd)->dd_used_breakdown[oldtype] -= delta; + dsl_dir_phys(dd)->dd_used_breakdown[newtype] += delta; + mutex_exit(&dd->dd_lock); +} + +typedef struct dsl_dir_set_qr_arg { + const char *ddsqra_name; + zprop_source_t ddsqra_source; + uint64_t ddsqra_value; +} dsl_dir_set_qr_arg_t; + +static int +dsl_dir_set_quota_check(void *arg, dmu_tx_t *tx) +{ + dsl_dir_set_qr_arg_t *ddsqra = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + int error; + uint64_t towrite, newval; + + error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds); + if (error != 0) + return (error); + + error = dsl_prop_predict(ds->ds_dir, "quota", + ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval); + if (error != 0) { + dsl_dataset_rele(ds, FTAG); + return (error); + } + + if (newval == 0) { + dsl_dataset_rele(ds, FTAG); + return (0); + } + + mutex_enter(&ds->ds_dir->dd_lock); + /* + * If we are doing the preliminary check in open context, and + * there are pending changes, then don't fail it, since the + * pending changes could under-estimate the amount of space to be + * freed up. 
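+	 * For example (editor's illustration): lowering the quota below
+	 * the current usage while a large destroy is pending should not
+	 * fail here in open context; the syncing-context evaluation of
+	 * this check is the authoritative one.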
+ */ + towrite = dsl_dir_space_towrite(ds->ds_dir); + if ((dmu_tx_is_syncing(tx) || towrite == 0) && + (newval < dsl_dir_phys(ds->ds_dir)->dd_reserved || + newval < dsl_dir_phys(ds->ds_dir)->dd_used_bytes + towrite)) { + error = SET_ERROR(ENOSPC); + } + mutex_exit(&ds->ds_dir->dd_lock); + dsl_dataset_rele(ds, FTAG); + return (error); +} + +static void +dsl_dir_set_quota_sync(void *arg, dmu_tx_t *tx) +{ + dsl_dir_set_qr_arg_t *ddsqra = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + uint64_t newval; + + VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds)); + + if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) { + dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_QUOTA), + ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1, + &ddsqra->ddsqra_value, tx); + + VERIFY0(dsl_prop_get_int_ds(ds, + zfs_prop_to_name(ZFS_PROP_QUOTA), &newval)); + } else { + newval = ddsqra->ddsqra_value; + spa_history_log_internal_ds(ds, "set", tx, "%s=%lld", + zfs_prop_to_name(ZFS_PROP_QUOTA), (longlong_t)newval); + } + + dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); + mutex_enter(&ds->ds_dir->dd_lock); + dsl_dir_phys(ds->ds_dir)->dd_quota = newval; + mutex_exit(&ds->ds_dir->dd_lock); + dsl_dataset_rele(ds, FTAG); +} + +int +dsl_dir_set_quota(const char *ddname, zprop_source_t source, uint64_t quota) +{ + dsl_dir_set_qr_arg_t ddsqra; + + ddsqra.ddsqra_name = ddname; + ddsqra.ddsqra_source = source; + ddsqra.ddsqra_value = quota; + + return (dsl_sync_task(ddname, dsl_dir_set_quota_check, + dsl_dir_set_quota_sync, &ddsqra, 0, + ZFS_SPACE_CHECK_EXTRA_RESERVED)); +} + +int +dsl_dir_set_reservation_check(void *arg, dmu_tx_t *tx) +{ + dsl_dir_set_qr_arg_t *ddsqra = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + dsl_dir_t *dd; + uint64_t newval, used, avail; + int error; + + error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds); + if (error != 0) + return (error); + dd = ds->ds_dir; + + /* + * If we are doing the preliminary check in open context, the + * space estimates may be inaccurate. 
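+	 * (Editor's note: hence the early return just below when
+	 * !dmu_tx_is_syncing(tx); the real check runs in syncing
+	 * context.)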
+ */ + if (!dmu_tx_is_syncing(tx)) { + dsl_dataset_rele(ds, FTAG); + return (0); + } + + error = dsl_prop_predict(ds->ds_dir, + zfs_prop_to_name(ZFS_PROP_RESERVATION), + ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval); + if (error != 0) { + dsl_dataset_rele(ds, FTAG); + return (error); + } + + mutex_enter(&dd->dd_lock); + used = dsl_dir_phys(dd)->dd_used_bytes; + mutex_exit(&dd->dd_lock); + + if (dd->dd_parent) { + avail = dsl_dir_space_available(dd->dd_parent, + NULL, 0, FALSE); + } else { + avail = dsl_pool_adjustedsize(dd->dd_pool, + ZFS_SPACE_CHECK_NORMAL) - used; + } + + if (MAX(used, newval) > MAX(used, dsl_dir_phys(dd)->dd_reserved)) { + uint64_t delta = MAX(used, newval) - + MAX(used, dsl_dir_phys(dd)->dd_reserved); + + if (delta > avail || + (dsl_dir_phys(dd)->dd_quota > 0 && + newval > dsl_dir_phys(dd)->dd_quota)) + error = SET_ERROR(ENOSPC); + } + + dsl_dataset_rele(ds, FTAG); + return (error); +} + +void +dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value, dmu_tx_t *tx) +{ + uint64_t used; + int64_t delta; + + dmu_buf_will_dirty(dd->dd_dbuf, tx); + + mutex_enter(&dd->dd_lock); + used = dsl_dir_phys(dd)->dd_used_bytes; + delta = MAX(used, value) - MAX(used, dsl_dir_phys(dd)->dd_reserved); + dsl_dir_phys(dd)->dd_reserved = value; + + if (dd->dd_parent != NULL) { + /* Roll up this additional usage into our ancestors */ + dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV, + delta, 0, 0, tx); + } + mutex_exit(&dd->dd_lock); +} + +static void +dsl_dir_set_reservation_sync(void *arg, dmu_tx_t *tx) +{ + dsl_dir_set_qr_arg_t *ddsqra = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + uint64_t newval; + + VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds)); + + if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) { + dsl_prop_set_sync_impl(ds, + zfs_prop_to_name(ZFS_PROP_RESERVATION), + ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1, + &ddsqra->ddsqra_value, tx); + + VERIFY0(dsl_prop_get_int_ds(ds, + zfs_prop_to_name(ZFS_PROP_RESERVATION), &newval)); + } else { + newval = ddsqra->ddsqra_value; + spa_history_log_internal_ds(ds, "set", tx, "%s=%lld", + zfs_prop_to_name(ZFS_PROP_RESERVATION), + (longlong_t)newval); + } + + dsl_dir_set_reservation_sync_impl(ds->ds_dir, newval, tx); + dsl_dataset_rele(ds, FTAG); +} + +int +dsl_dir_set_reservation(const char *ddname, zprop_source_t source, + uint64_t reservation) +{ + dsl_dir_set_qr_arg_t ddsqra; + + ddsqra.ddsqra_name = ddname; + ddsqra.ddsqra_source = source; + ddsqra.ddsqra_value = reservation; + + return (dsl_sync_task(ddname, dsl_dir_set_reservation_check, + dsl_dir_set_reservation_sync, &ddsqra, 0, + ZFS_SPACE_CHECK_EXTRA_RESERVED)); +} + +static dsl_dir_t * +closest_common_ancestor(dsl_dir_t *ds1, dsl_dir_t *ds2) +{ + for (; ds1; ds1 = ds1->dd_parent) { + dsl_dir_t *dd; + for (dd = ds2; dd; dd = dd->dd_parent) { + if (ds1 == dd) + return (dd); + } + } + return (NULL); +} + +/* + * If delta is applied to dd, how much of that delta would be applied to + * ancestor? Syncing context only. 
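+ *
+ * Editor's worked example: with dd_reserved = 10M and
+ * dd_used_bytes = 4M on dd, freeing 2M (delta = -2M) propagates no
+ * change to dd's parent, because parent_delta() accounts for
+ * MAX(used, reserved), which remains 10M either way.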
+ */ +static int64_t +would_change(dsl_dir_t *dd, int64_t delta, dsl_dir_t *ancestor) +{ + if (dd == ancestor) + return (delta); + + mutex_enter(&dd->dd_lock); + delta = parent_delta(dd, dsl_dir_phys(dd)->dd_used_bytes, delta); + mutex_exit(&dd->dd_lock); + return (would_change(dd->dd_parent, delta, ancestor)); +} + +typedef struct dsl_dir_rename_arg { + const char *ddra_oldname; + const char *ddra_newname; + cred_t *ddra_cred; +} dsl_dir_rename_arg_t; + +typedef struct dsl_valid_rename_arg { + int char_delta; + int nest_delta; +} dsl_valid_rename_arg_t; + +/* ARGSUSED */ +static int +dsl_valid_rename(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) +{ + dsl_valid_rename_arg_t *dvra = arg; + char namebuf[ZFS_MAX_DATASET_NAME_LEN]; + + dsl_dataset_name(ds, namebuf); + + ASSERT3U(strnlen(namebuf, ZFS_MAX_DATASET_NAME_LEN), + <, ZFS_MAX_DATASET_NAME_LEN); + int namelen = strlen(namebuf) + dvra->char_delta; + int depth = get_dataset_depth(namebuf) + dvra->nest_delta; + + if (namelen >= ZFS_MAX_DATASET_NAME_LEN) + return (SET_ERROR(ENAMETOOLONG)); + if (dvra->nest_delta > 0 && depth >= zfs_max_dataset_nesting) + return (SET_ERROR(ENAMETOOLONG)); + return (0); +} + +static int +dsl_dir_rename_check(void *arg, dmu_tx_t *tx) +{ + dsl_dir_rename_arg_t *ddra = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dir_t *dd, *newparent; + dsl_valid_rename_arg_t dvra; + const char *mynewname; + int error; + + /* target dir should exist */ + error = dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL); + if (error != 0) + return (error); + + /* new parent should exist */ + error = dsl_dir_hold(dp, ddra->ddra_newname, FTAG, + &newparent, &mynewname); + if (error != 0) { + dsl_dir_rele(dd, FTAG); + return (error); + } + + /* can't rename to different pool */ + if (dd->dd_pool != newparent->dd_pool) { + dsl_dir_rele(newparent, FTAG); + dsl_dir_rele(dd, FTAG); + return (SET_ERROR(EXDEV)); + } + + /* new name should not already exist */ + if (mynewname == NULL) { + dsl_dir_rele(newparent, FTAG); + dsl_dir_rele(dd, FTAG); + return (SET_ERROR(EEXIST)); + } + + ASSERT3U(strnlen(ddra->ddra_newname, ZFS_MAX_DATASET_NAME_LEN), + <, ZFS_MAX_DATASET_NAME_LEN); + ASSERT3U(strnlen(ddra->ddra_oldname, ZFS_MAX_DATASET_NAME_LEN), + <, ZFS_MAX_DATASET_NAME_LEN); + dvra.char_delta = strlen(ddra->ddra_newname) + - strlen(ddra->ddra_oldname); + dvra.nest_delta = get_dataset_depth(ddra->ddra_newname) + - get_dataset_depth(ddra->ddra_oldname); + + /* if the name length is growing, validate child name lengths */ + if (dvra.char_delta > 0 || dvra.nest_delta > 0) { + error = dmu_objset_find_dp(dp, dd->dd_object, dsl_valid_rename, + &dvra, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); + if (error != 0) { + dsl_dir_rele(newparent, FTAG); + dsl_dir_rele(dd, FTAG); + return (error); + } + } + + if (dmu_tx_is_syncing(tx)) { + if (spa_feature_is_active(dp->dp_spa, + SPA_FEATURE_FS_SS_LIMIT)) { + /* + * Although this is the check function and we don't + * normally make on-disk changes in check functions, + * we need to do that here. + * + * Ensure this portion of the tree's counts have been + * initialized in case the new parent has limits set. + */ + dsl_dir_init_fs_ss_count(dd, tx); + } + } + + if (newparent != dd->dd_parent) { + /* is there enough space? 
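+		 * (Editor's note: "space" is the larger of the bytes
+		 * used and the reservation, computed below, since an
+		 * unused reservation moves with the renamed dir.)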
*/ + uint64_t myspace = + MAX(dsl_dir_phys(dd)->dd_used_bytes, + dsl_dir_phys(dd)->dd_reserved); + objset_t *os = dd->dd_pool->dp_meta_objset; + uint64_t fs_cnt = 0; + uint64_t ss_cnt = 0; + + if (dsl_dir_is_zapified(dd)) { + int err; + + err = zap_lookup(os, dd->dd_object, + DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1, + &fs_cnt); + if (err != ENOENT && err != 0) { + dsl_dir_rele(newparent, FTAG); + dsl_dir_rele(dd, FTAG); + return (err); + } + + /* + * have to add 1 for the filesystem itself that we're + * moving + */ + fs_cnt++; + + err = zap_lookup(os, dd->dd_object, + DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1, + &ss_cnt); + if (err != ENOENT && err != 0) { + dsl_dir_rele(newparent, FTAG); + dsl_dir_rele(dd, FTAG); + return (err); + } + } + + /* no rename into our descendant */ + if (closest_common_ancestor(dd, newparent) == dd) { + dsl_dir_rele(newparent, FTAG); + dsl_dir_rele(dd, FTAG); + return (SET_ERROR(EINVAL)); + } + + error = dsl_dir_transfer_possible(dd->dd_parent, + newparent, fs_cnt, ss_cnt, myspace, ddra->ddra_cred); + if (error != 0) { + dsl_dir_rele(newparent, FTAG); + dsl_dir_rele(dd, FTAG); + return (error); + } + } + + dsl_dir_rele(newparent, FTAG); + dsl_dir_rele(dd, FTAG); + return (0); +} + +static void +dsl_dir_rename_sync(void *arg, dmu_tx_t *tx) +{ + dsl_dir_rename_arg_t *ddra = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dir_t *dd, *newparent; + const char *mynewname; + int error; + objset_t *mos = dp->dp_meta_objset; + + VERIFY0(dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL)); + VERIFY0(dsl_dir_hold(dp, ddra->ddra_newname, FTAG, &newparent, + &mynewname)); + + /* Log this before we change the name. */ + spa_history_log_internal_dd(dd, "rename", tx, + "-> %s", ddra->ddra_newname); + + if (newparent != dd->dd_parent) { + objset_t *os = dd->dd_pool->dp_meta_objset; + uint64_t fs_cnt = 0; + uint64_t ss_cnt = 0; + + /* + * We already made sure the dd counts were initialized in the + * check function. 
+ */ + if (spa_feature_is_active(dp->dp_spa, + SPA_FEATURE_FS_SS_LIMIT)) { + VERIFY0(zap_lookup(os, dd->dd_object, + DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1, + &fs_cnt)); + /* add 1 for the filesystem itself that we're moving */ + fs_cnt++; + + VERIFY0(zap_lookup(os, dd->dd_object, + DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1, + &ss_cnt)); + } + + dsl_fs_ss_count_adjust(dd->dd_parent, -fs_cnt, + DD_FIELD_FILESYSTEM_COUNT, tx); + dsl_fs_ss_count_adjust(newparent, fs_cnt, + DD_FIELD_FILESYSTEM_COUNT, tx); + + dsl_fs_ss_count_adjust(dd->dd_parent, -ss_cnt, + DD_FIELD_SNAPSHOT_COUNT, tx); + dsl_fs_ss_count_adjust(newparent, ss_cnt, + DD_FIELD_SNAPSHOT_COUNT, tx); + + dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD, + -dsl_dir_phys(dd)->dd_used_bytes, + -dsl_dir_phys(dd)->dd_compressed_bytes, + -dsl_dir_phys(dd)->dd_uncompressed_bytes, tx); + dsl_dir_diduse_space(newparent, DD_USED_CHILD, + dsl_dir_phys(dd)->dd_used_bytes, + dsl_dir_phys(dd)->dd_compressed_bytes, + dsl_dir_phys(dd)->dd_uncompressed_bytes, tx); + + if (dsl_dir_phys(dd)->dd_reserved > + dsl_dir_phys(dd)->dd_used_bytes) { + uint64_t unused_rsrv = dsl_dir_phys(dd)->dd_reserved - + dsl_dir_phys(dd)->dd_used_bytes; + + dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV, + -unused_rsrv, 0, 0, tx); + dsl_dir_diduse_space(newparent, DD_USED_CHILD_RSRV, + unused_rsrv, 0, 0, tx); + } + } + + dmu_buf_will_dirty(dd->dd_dbuf, tx); + + /* remove from old parent zapobj */ + error = zap_remove(mos, + dsl_dir_phys(dd->dd_parent)->dd_child_dir_zapobj, + dd->dd_myname, tx); + ASSERT0(error); + + (void) strcpy(dd->dd_myname, mynewname); + dsl_dir_rele(dd->dd_parent, dd); + dsl_dir_phys(dd)->dd_parent_obj = newparent->dd_object; + VERIFY0(dsl_dir_hold_obj(dp, + newparent->dd_object, NULL, dd, &dd->dd_parent)); + + /* add to new parent zapobj */ + VERIFY0(zap_add(mos, dsl_dir_phys(newparent)->dd_child_dir_zapobj, + dd->dd_myname, 8, 1, &dd->dd_object, tx)); + +#ifdef __FreeBSD__ +#ifdef _KERNEL + zfsvfs_update_fromname(ddra->ddra_oldname, ddra->ddra_newname); + zvol_rename_minors(ddra->ddra_oldname, ddra->ddra_newname); +#endif +#endif + + dsl_prop_notify_all(dd); + + dsl_dir_rele(newparent, FTAG); + dsl_dir_rele(dd, FTAG); +} + +int +dsl_dir_rename(const char *oldname, const char *newname) +{ + dsl_dir_rename_arg_t ddra; + + ddra.ddra_oldname = oldname; + ddra.ddra_newname = newname; + ddra.ddra_cred = CRED(); + + return (dsl_sync_task(oldname, + dsl_dir_rename_check, dsl_dir_rename_sync, &ddra, + 3, ZFS_SPACE_CHECK_RESERVED)); +} + +int +dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, + uint64_t fs_cnt, uint64_t ss_cnt, uint64_t space, cred_t *cr) +{ + dsl_dir_t *ancestor; + int64_t adelta; + uint64_t avail; + int err; + + ancestor = closest_common_ancestor(sdd, tdd); + adelta = would_change(sdd, -space, ancestor); + avail = dsl_dir_space_available(tdd, ancestor, adelta, FALSE); + if (avail < space) + return (SET_ERROR(ENOSPC)); + + err = dsl_fs_ss_limit_check(tdd, fs_cnt, ZFS_PROP_FILESYSTEM_LIMIT, + ancestor, cr); + if (err != 0) + return (err); + err = dsl_fs_ss_limit_check(tdd, ss_cnt, ZFS_PROP_SNAPSHOT_LIMIT, + ancestor, cr); + if (err != 0) + return (err); + + return (0); +} + +timestruc_t +dsl_dir_snap_cmtime(dsl_dir_t *dd) +{ + timestruc_t t; + + mutex_enter(&dd->dd_lock); + t = dd->dd_snap_cmtime; + mutex_exit(&dd->dd_lock); + + return (t); +} + +void +dsl_dir_snap_cmtime_update(dsl_dir_t *dd) +{ + timestruc_t t; + + gethrestime(&t); + mutex_enter(&dd->dd_lock); + dd->dd_snap_cmtime = t; + mutex_exit(&dd->dd_lock); 
+} + +void +dsl_dir_zapify(dsl_dir_t *dd, dmu_tx_t *tx) +{ + objset_t *mos = dd->dd_pool->dp_meta_objset; + dmu_object_zapify(mos, dd->dd_object, DMU_OT_DSL_DIR, tx); +} + +boolean_t +dsl_dir_is_zapified(dsl_dir_t *dd) +{ + dmu_object_info_t doi; + + dmu_object_info_from_db(dd->dd_dbuf, &doi); + return (doi.doi_type == DMU_OTN_ZAP_METADATA); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c new file mode 100644 index 000000000000..2d18fe065a42 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c @@ -0,0 +1,1366 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2017 by Delphix. All rights reserved. + * Copyright (c) 2013 Steven Hartland. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + * Copyright 2016 Nexenta Systems, Inc. All rights reserved. + */ + +#include <sys/dsl_pool.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_prop.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_synctask.h> +#include <sys/dsl_scan.h> +#include <sys/dnode.h> +#include <sys/dmu_tx.h> +#include <sys/dmu_objset.h> +#include <sys/arc.h> +#include <sys/zap.h> +#include <sys/zio.h> +#include <sys/zfs_context.h> +#include <sys/fs/zfs.h> +#include <sys/zfs_znode.h> +#include <sys/spa_impl.h> +#include <sys/dsl_deadlist.h> +#include <sys/vdev_impl.h> +#include <sys/metaslab_impl.h> +#include <sys/bptree.h> +#include <sys/zfeature.h> +#include <sys/zil_impl.h> +#include <sys/dsl_userhold.h> + +#if defined(__FreeBSD__) && defined(_KERNEL) +#include <sys/types.h> +#include <sys/sysctl.h> +#endif + +/* + * ZFS Write Throttle + * ------------------ + * + * ZFS must limit the rate of incoming writes to the rate at which it is able + * to sync data modifications to the backend storage. Throttling by too much + * creates an artificial limit; throttling by too little can only be sustained + * for short periods and would lead to highly lumpy performance. On a per-pool + * basis, ZFS tracks the amount of modified (dirty) data. As operations change + * data, the amount of dirty data increases; as ZFS syncs out data, the amount + * of dirty data decreases. When the amount of dirty data exceeds a + * predetermined threshold further modifications are blocked until the amount + * of dirty data decreases (as data is synced out). + * + * The limit on dirty data is tunable, and should be adjusted according to + * both the IO capacity and available memory of the system. 
The larger the + * window, the more ZFS is able to aggregate and amortize metadata (and data) + * changes. However, memory is a limited resource, and allowing for more dirty + * data comes at the cost of keeping other useful data in memory (for example + * ZFS data cached by the ARC). + * + * Implementation + * + * As buffers are modified dsl_pool_willuse_space() increments both the per- + * txg (dp_dirty_pertxg[]) and poolwide (dp_dirty_total) accounting of + * dirty space used; dsl_pool_dirty_space() decrements those values as data + * is synced out from dsl_pool_sync(). While only the poolwide value is + * relevant, the per-txg value is useful for debugging. The tunable + * zfs_dirty_data_max determines the dirty space limit. Once that value is + * exceeded, new writes are halted until space frees up. + * + * The zfs_dirty_data_sync tunable dictates the threshold at which we + * ensure that there is a txg syncing (see the comment in txg.c for a full + * description of transaction group stages). + * + * The IO scheduler uses both the dirty space limit and current amount of + * dirty data as inputs. Those values affect the number of concurrent IOs ZFS + * issues. See the comment in vdev_queue.c for details of the IO scheduler. + * + * The delay is also calculated based on the amount of dirty data. See the + * comment above dmu_tx_delay() for details. + */ + +/* + * zfs_dirty_data_max will be set to zfs_dirty_data_max_percent% of all memory, + * capped at zfs_dirty_data_max_max. It can also be overridden in /etc/system. + */ +uint64_t zfs_dirty_data_max; +uint64_t zfs_dirty_data_max_max = 4ULL * 1024 * 1024 * 1024; +int zfs_dirty_data_max_percent = 10; + +/* + * If there is at least this much dirty data, push out a txg. + */ +uint64_t zfs_dirty_data_sync = 64 * 1024 * 1024; + +/* + * Once there is this amount of dirty data, the dmu_tx_delay() will kick in + * and delay each transaction. + * This value should be >= zfs_vdev_async_write_active_max_dirty_percent. + */ +int zfs_delay_min_dirty_percent = 60; + +/* + * This controls how quickly the delay approaches infinity. + * Larger values cause it to delay more for a given amount of dirty data. + * Therefore larger values will cause there to be less dirty data for a + * given throughput. + * + * For the smoothest delay, this value should be about 1 billion divided + * by the maximum number of operations per second. This will smoothly + * handle between 10x and 1/10th this number. + * + * Note: zfs_delay_scale * zfs_dirty_data_max must be < 2^64, due to the + * multiply in dmu_tx_delay(). + */ +uint64_t zfs_delay_scale = 1000 * 1000 * 1000 / 2000; + +/* + * This determines the number of threads used by the dp_sync_taskq. + */ +int zfs_sync_taskq_batch_pct = 75; + +/* + * These tunables determine the behavior of how zil_itxg_clean() is + * called via zil_clean() in the context of spa_sync(). When an itxg + * list needs to be cleaned, TQ_NOSLEEP will be used when dispatching. + * If the dispatch fails, the call to zil_itxg_clean() will occur + * synchronously in the context of spa_sync(), which can negatively + * impact the performance of spa_sync() (e.g. in the case of the itxg + * list having a large number of itxs that needs to be cleaned). 
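+ *
+ * As an editor-added sketch of the dispatch pattern being described
+ * (simplified; see zil_clean() for the actual code):
+ *
+ *	if (taskq_dispatch(dp->dp_zil_clean_taskq,
+ *	    (void (*)(void *))zil_itxg_clean, itxs, TQ_NOSLEEP) == 0)
+ *		zil_itxg_clean(itxs);
+ *
+ * i.e. a failed TQ_NOSLEEP dispatch falls back to a synchronous call.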
+ * + * Thus, these tunables can be used to manipulate the behavior of the + * taskq used by zil_clean(); they determine the number of taskq entries + * that are pre-populated when the taskq is first created (via the + * "zfs_zil_clean_taskq_minalloc" tunable) and the maximum number of + * taskq entries that are cached after an on-demand allocation (via the + * "zfs_zil_clean_taskq_maxalloc" tunable). + * + * The idea is that we want to try reasonably hard to ensure there will + * already be a taskq entry pre-allocated by the time that it is needed + * by zil_clean(). This way, we can avoid the possibility of an + * on-demand allocation of a new taskq entry failing, which would + * result in zil_itxg_clean() being called synchronously from zil_clean() + * (which can adversely affect performance of spa_sync()). + * + * Additionally, the number of threads used by the taskq can be + * configured via the "zfs_zil_clean_taskq_nthr_pct" tunable. + */ +int zfs_zil_clean_taskq_nthr_pct = 100; +int zfs_zil_clean_taskq_minalloc = 1024; +int zfs_zil_clean_taskq_maxalloc = 1024 * 1024; + +#if defined(__FreeBSD__) && defined(_KERNEL) + +extern int zfs_vdev_async_write_active_max_dirty_percent; + +SYSCTL_DECL(_vfs_zfs); + +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, dirty_data_max, CTLFLAG_RWTUN, + &zfs_dirty_data_max, 0, + "The maximum amount of dirty data in bytes after which new writes are " + "halted until space becomes available"); + +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, dirty_data_max_max, CTLFLAG_RDTUN, + &zfs_dirty_data_max_max, 0, + "The absolute cap on dirty_data_max when auto calculating"); + +static int sysctl_zfs_dirty_data_max_percent(SYSCTL_HANDLER_ARGS); +SYSCTL_PROC(_vfs_zfs, OID_AUTO, dirty_data_max_percent, + CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, 0, sizeof(int), + sysctl_zfs_dirty_data_max_percent, "I", + "The percent of physical memory used to auto calculate dirty_data_max"); + +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, dirty_data_sync, CTLFLAG_RWTUN, + &zfs_dirty_data_sync, 0, + "Force a txg if the number of dirty buffer bytes exceeds this value"); + +static int sysctl_zfs_delay_min_dirty_percent(SYSCTL_HANDLER_ARGS); +/* No zfs_delay_min_dirty_percent tunable due to limit requirements */ +SYSCTL_PROC(_vfs_zfs, OID_AUTO, delay_min_dirty_percent, + CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(int), + sysctl_zfs_delay_min_dirty_percent, "I", + "The limit of outstanding dirty data before transactions are delayed"); + +static int sysctl_zfs_delay_scale(SYSCTL_HANDLER_ARGS); +/* No zfs_delay_scale tunable due to limit requirements */ +SYSCTL_PROC(_vfs_zfs, OID_AUTO, delay_scale, + CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t), + sysctl_zfs_delay_scale, "QU", + "Controls how quickly the delay approaches infinity"); + +static int +sysctl_zfs_dirty_data_max_percent(SYSCTL_HANDLER_ARGS) +{ + int val, err; + + val = zfs_dirty_data_max_percent; + err = sysctl_handle_int(oidp, &val, 0, req); + if (err != 0 || req->newptr == NULL) + return (err); + + if (val < 0 || val > 100) + return (EINVAL); + + zfs_dirty_data_max_percent = val; + + return (0); +} + +static int +sysctl_zfs_delay_min_dirty_percent(SYSCTL_HANDLER_ARGS) +{ + int val, err; + + val = zfs_delay_min_dirty_percent; + err = sysctl_handle_int(oidp, &val, 0, req); + if (err != 0 || req->newptr == NULL) + return (err); + + if (val < zfs_vdev_async_write_active_max_dirty_percent) + return (EINVAL); + + zfs_delay_min_dirty_percent = val; + + return (0); +} + +static int +sysctl_zfs_delay_scale(SYSCTL_HANDLER_ARGS) +{ + uint64_t val; +
int err; + + val = zfs_delay_scale; + err = sysctl_handle_64(oidp, &val, 0, req); + if (err != 0 || req->newptr == NULL) + return (err); + + if (val > UINT64_MAX / zfs_dirty_data_max) + return (EINVAL); + + zfs_delay_scale = val; + + return (0); +} +#endif + +int +dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp) +{ + uint64_t obj; + int err; + + err = zap_lookup(dp->dp_meta_objset, + dsl_dir_phys(dp->dp_root_dir)->dd_child_dir_zapobj, + name, sizeof (obj), 1, &obj); + if (err) + return (err); + + return (dsl_dir_hold_obj(dp, obj, name, dp, ddp)); +} + +static dsl_pool_t * +dsl_pool_open_impl(spa_t *spa, uint64_t txg) +{ + dsl_pool_t *dp; + blkptr_t *bp = spa_get_rootblkptr(spa); + + dp = kmem_zalloc(sizeof (dsl_pool_t), KM_SLEEP); + dp->dp_spa = spa; + dp->dp_meta_rootbp = *bp; + rrw_init(&dp->dp_config_rwlock, B_TRUE); + txg_init(dp, txg); + + txg_list_create(&dp->dp_dirty_datasets, spa, + offsetof(dsl_dataset_t, ds_dirty_link)); + txg_list_create(&dp->dp_dirty_zilogs, spa, + offsetof(zilog_t, zl_dirty_link)); + txg_list_create(&dp->dp_dirty_dirs, spa, + offsetof(dsl_dir_t, dd_dirty_link)); + txg_list_create(&dp->dp_sync_tasks, spa, + offsetof(dsl_sync_task_t, dst_node)); + txg_list_create(&dp->dp_early_sync_tasks, spa, + offsetof(dsl_sync_task_t, dst_node)); + + dp->dp_sync_taskq = taskq_create("dp_sync_taskq", + zfs_sync_taskq_batch_pct, minclsyspri, 1, INT_MAX, + TASKQ_THREADS_CPU_PCT); + + dp->dp_zil_clean_taskq = taskq_create("dp_zil_clean_taskq", + zfs_zil_clean_taskq_nthr_pct, minclsyspri, + zfs_zil_clean_taskq_minalloc, + zfs_zil_clean_taskq_maxalloc, + TASKQ_PREPOPULATE | TASKQ_THREADS_CPU_PCT); + + mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&dp->dp_spaceavail_cv, NULL, CV_DEFAULT, NULL); + + dp->dp_vnrele_taskq = taskq_create("zfs_vn_rele_taskq", 1, minclsyspri, + 1, 4, 0); + + return (dp); +} + +int +dsl_pool_init(spa_t *spa, uint64_t txg, dsl_pool_t **dpp) +{ + int err; + dsl_pool_t *dp = dsl_pool_open_impl(spa, txg); + + err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp, + &dp->dp_meta_objset); + if (err != 0) + dsl_pool_close(dp); + else + *dpp = dp; + + return (err); +} + +int +dsl_pool_open(dsl_pool_t *dp) +{ + int err; + dsl_dir_t *dd; + dsl_dataset_t *ds; + uint64_t obj; + + rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, + &dp->dp_root_dir_obj); + if (err) + goto out; + + err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj, + NULL, dp, &dp->dp_root_dir); + if (err) + goto out; + + err = dsl_pool_open_special_dir(dp, MOS_DIR_NAME, &dp->dp_mos_dir); + if (err) + goto out; + + if (spa_version(dp->dp_spa) >= SPA_VERSION_ORIGIN) { + err = dsl_pool_open_special_dir(dp, ORIGIN_DIR_NAME, &dd); + if (err) + goto out; + err = dsl_dataset_hold_obj(dp, + dsl_dir_phys(dd)->dd_head_dataset_obj, FTAG, &ds); + if (err == 0) { + err = dsl_dataset_hold_obj(dp, + dsl_dataset_phys(ds)->ds_prev_snap_obj, dp, + &dp->dp_origin_snap); + dsl_dataset_rele(ds, FTAG); + } + dsl_dir_rele(dd, dp); + if (err) + goto out; + } + + if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) { + err = dsl_pool_open_special_dir(dp, FREE_DIR_NAME, + &dp->dp_free_dir); + if (err) + goto out; + + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj); + if (err) + goto out; + VERIFY0(bpobj_open(&dp->dp_free_bpobj, + dp->dp_meta_objset, obj)); + } + + if (spa_feature_is_active(dp->dp_spa, 
SPA_FEATURE_OBSOLETE_COUNTS)) { + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_OBSOLETE_BPOBJ, sizeof (uint64_t), 1, &obj); + if (err == 0) { + VERIFY0(bpobj_open(&dp->dp_obsolete_bpobj, + dp->dp_meta_objset, obj)); + } else if (err == ENOENT) { + /* + * We might not have created the remap bpobj yet. + */ + err = 0; + } else { + goto out; + } + } + + /* + * Note: errors ignored, because these special dirs, used for + * space accounting, are only created on demand. + */ + (void) dsl_pool_open_special_dir(dp, LEAK_DIR_NAME, + &dp->dp_leak_dir); + + if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY)) { + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1, + &dp->dp_bptree_obj); + if (err != 0) + goto out; + } + + if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMPTY_BPOBJ)) { + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1, + &dp->dp_empty_bpobj); + if (err != 0) + goto out; + } + + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1, + &dp->dp_tmp_userrefs_obj); + if (err == ENOENT) + err = 0; + if (err) + goto out; + + err = dsl_scan_init(dp, dp->dp_tx.tx_open_txg); + +out: + rrw_exit(&dp->dp_config_rwlock, FTAG); + return (err); +} + +void +dsl_pool_close(dsl_pool_t *dp) +{ + /* + * Drop our references from dsl_pool_open(). + * + * Since we held the origin_snap from "syncing" context (which + * includes pool-opening context), it actually only got a "ref" + * and not a hold, so just drop that here. + */ + if (dp->dp_origin_snap != NULL) + dsl_dataset_rele(dp->dp_origin_snap, dp); + if (dp->dp_mos_dir != NULL) + dsl_dir_rele(dp->dp_mos_dir, dp); + if (dp->dp_free_dir != NULL) + dsl_dir_rele(dp->dp_free_dir, dp); + if (dp->dp_leak_dir != NULL) + dsl_dir_rele(dp->dp_leak_dir, dp); + if (dp->dp_root_dir != NULL) + dsl_dir_rele(dp->dp_root_dir, dp); + + bpobj_close(&dp->dp_free_bpobj); + bpobj_close(&dp->dp_obsolete_bpobj); + + /* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */ + if (dp->dp_meta_objset != NULL) + dmu_objset_evict(dp->dp_meta_objset); + + txg_list_destroy(&dp->dp_dirty_datasets); + txg_list_destroy(&dp->dp_dirty_zilogs); + txg_list_destroy(&dp->dp_sync_tasks); + txg_list_destroy(&dp->dp_early_sync_tasks); + txg_list_destroy(&dp->dp_dirty_dirs); + + taskq_destroy(dp->dp_zil_clean_taskq); + taskq_destroy(dp->dp_sync_taskq); + + /* + * We can't set retry to TRUE since we're explicitly specifying + * a spa to flush. This is good enough; any missed buffers for + * this spa won't cause trouble, and they'll eventually fall + * out of the ARC just like any other unused buffer. + */ + arc_flush(dp->dp_spa, FALSE); + + txg_fini(dp); + dsl_scan_fini(dp); + dmu_buf_user_evict_wait(); + + rrw_destroy(&dp->dp_config_rwlock); + mutex_destroy(&dp->dp_lock); + taskq_destroy(dp->dp_vnrele_taskq); + if (dp->dp_blkstats != NULL) { + mutex_destroy(&dp->dp_blkstats->zab_lock); + kmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t)); + } + kmem_free(dp, sizeof (dsl_pool_t)); +} + +void +dsl_pool_create_obsolete_bpobj(dsl_pool_t *dp, dmu_tx_t *tx) +{ + uint64_t obj; + /* + * Currently, we only create the obsolete_bpobj where there are + * indirect vdevs with referenced mappings.
+ */ + ASSERT(spa_feature_is_active(dp->dp_spa, SPA_FEATURE_DEVICE_REMOVAL)); + /* create and open the obsolete_bpobj */ + obj = bpobj_alloc(dp->dp_meta_objset, SPA_OLD_MAXBLOCKSIZE, tx); + VERIFY0(bpobj_open(&dp->dp_obsolete_bpobj, dp->dp_meta_objset, obj)); + VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_OBSOLETE_BPOBJ, sizeof (uint64_t), 1, &obj, tx)); + spa_feature_incr(dp->dp_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); +} + +void +dsl_pool_destroy_obsolete_bpobj(dsl_pool_t *dp, dmu_tx_t *tx) +{ + spa_feature_decr(dp->dp_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); + VERIFY0(zap_remove(dp->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_OBSOLETE_BPOBJ, tx)); + bpobj_free(dp->dp_meta_objset, + dp->dp_obsolete_bpobj.bpo_object, tx); + bpobj_close(&dp->dp_obsolete_bpobj); +} + +dsl_pool_t * +dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg) +{ + int err; + dsl_pool_t *dp = dsl_pool_open_impl(spa, txg); + dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg); + dsl_dataset_t *ds; + uint64_t obj; + + rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); + + /* create and open the MOS (meta-objset) */ + dp->dp_meta_objset = dmu_objset_create_impl(spa, + NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx); + + /* create the pool directory */ + err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_OT_OBJECT_DIRECTORY, DMU_OT_NONE, 0, tx); + ASSERT0(err); + + /* Initialize scan structures */ + VERIFY0(dsl_scan_init(dp, txg)); + + /* create and open the root dir */ + dp->dp_root_dir_obj = dsl_dir_create_sync(dp, NULL, NULL, tx); + VERIFY0(dsl_dir_hold_obj(dp, dp->dp_root_dir_obj, + NULL, dp, &dp->dp_root_dir)); + + /* create and open the meta-objset dir */ + (void) dsl_dir_create_sync(dp, dp->dp_root_dir, MOS_DIR_NAME, tx); + VERIFY0(dsl_pool_open_special_dir(dp, + MOS_DIR_NAME, &dp->dp_mos_dir)); + + if (spa_version(spa) >= SPA_VERSION_DEADLISTS) { + /* create and open the free dir */ + (void) dsl_dir_create_sync(dp, dp->dp_root_dir, + FREE_DIR_NAME, tx); + VERIFY0(dsl_pool_open_special_dir(dp, + FREE_DIR_NAME, &dp->dp_free_dir)); + + /* create and open the free_bplist */ + obj = bpobj_alloc(dp->dp_meta_objset, SPA_OLD_MAXBLOCKSIZE, tx); + VERIFY(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx) == 0); + VERIFY0(bpobj_open(&dp->dp_free_bpobj, + dp->dp_meta_objset, obj)); + } + + if (spa_version(spa) >= SPA_VERSION_DSL_SCRUB) + dsl_pool_create_origin(dp, tx); + + /* create the root dataset */ + obj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, 0, tx); + + /* create the root objset */ + VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG, &ds)); +#ifdef _KERNEL + { + objset_t *os; + rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); + os = dmu_objset_create_impl(dp->dp_spa, ds, + dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx); + rrw_exit(&ds->ds_bp_rwlock, FTAG); + zfs_create_fs(os, kcred, zplprops, tx); + } +#endif + dsl_dataset_rele(ds, FTAG); + + dmu_tx_commit(tx); + + rrw_exit(&dp->dp_config_rwlock, FTAG); + + return (dp); +} + +/* + * Account for the meta-objset space in its placeholder dsl_dir. 
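+ * The deltas recorded here are accumulated under dp_lock and folded
+ * into dp_mos_dir later by dsl_pool_sync(), since the MOS cannot be
+ * modified while it is itself being synced.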
+ */ +void +dsl_pool_mos_diduse_space(dsl_pool_t *dp, + int64_t used, int64_t comp, int64_t uncomp) +{ + ASSERT3U(comp, ==, uncomp); /* it's all metadata */ + mutex_enter(&dp->dp_lock); + dp->dp_mos_used_delta += used; + dp->dp_mos_compressed_delta += comp; + dp->dp_mos_uncompressed_delta += uncomp; + mutex_exit(&dp->dp_lock); +} + +static void +dsl_pool_sync_mos(dsl_pool_t *dp, dmu_tx_t *tx) +{ + zio_t *zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); + dmu_objset_sync(dp->dp_meta_objset, zio, tx); + VERIFY0(zio_wait(zio)); + dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", ""); + spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp); +} + +static void +dsl_pool_dirty_delta(dsl_pool_t *dp, int64_t delta) +{ + ASSERT(MUTEX_HELD(&dp->dp_lock)); + + if (delta < 0) + ASSERT3U(-delta, <=, dp->dp_dirty_total); + + dp->dp_dirty_total += delta; + + /* + * Note: we signal even when increasing dp_dirty_total. + * This ensures forward progress -- each thread wakes the next waiter. + */ + if (dp->dp_dirty_total < zfs_dirty_data_max) + cv_signal(&dp->dp_spaceavail_cv); +} + +static boolean_t +dsl_early_sync_task_verify(dsl_pool_t *dp, uint64_t txg) +{ + spa_t *spa = dp->dp_spa; + vdev_t *rvd = spa->spa_root_vdev; + + for (uint64_t c = 0; c < rvd->vdev_children; c++) { + vdev_t *vd = rvd->vdev_child[c]; + txg_list_t *tl = &vd->vdev_ms_list; + metaslab_t *ms; + + for (ms = txg_list_head(tl, TXG_CLEAN(txg)); ms; + ms = txg_list_next(tl, ms, TXG_CLEAN(txg))) { + VERIFY(range_tree_is_empty(ms->ms_freeing)); + VERIFY(range_tree_is_empty(ms->ms_checkpointing)); + } + } + + return (B_TRUE); +} + +void +dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) +{ + zio_t *zio; + dmu_tx_t *tx; + dsl_dir_t *dd; + dsl_dataset_t *ds; + objset_t *mos = dp->dp_meta_objset; + list_t synced_datasets; + + list_create(&synced_datasets, sizeof (dsl_dataset_t), + offsetof(dsl_dataset_t, ds_synced_link)); + + tx = dmu_tx_create_assigned(dp, txg); + + /* + * Run all early sync tasks before writing out any dirty blocks. + * For more info on early sync tasks see block comment in + * dsl_early_sync_task(). + */ + if (!txg_list_empty(&dp->dp_early_sync_tasks, txg)) { + dsl_sync_task_t *dst; + + ASSERT3U(spa_sync_pass(dp->dp_spa), ==, 1); + while ((dst = + txg_list_remove(&dp->dp_early_sync_tasks, txg)) != NULL) { + ASSERT(dsl_early_sync_task_verify(dp, txg)); + dsl_sync_task_sync(dst, tx); + } + ASSERT(dsl_early_sync_task_verify(dp, txg)); + } + + /* + * Write out all dirty blocks of dirty datasets. + */ + zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); + while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) { + /* + * We must not sync any non-MOS datasets twice, because + * we may have taken a snapshot of them. However, we + * may sync newly-created datasets on pass 2. + */ + ASSERT(!list_link_active(&ds->ds_synced_link)); + list_insert_tail(&synced_datasets, ds); + dsl_dataset_sync(ds, zio, tx); + } + VERIFY0(zio_wait(zio)); + + /* + * We have written all of the accounted dirty data, so our + * dp_space_towrite should now be zero. However, some seldom-used + * code paths do not adhere to this (e.g. dbuf_undirty(), also + * rounding error in dbuf_write_physdone). + * Shore up the accounting of any dirtied space now. 
+ */ + dsl_pool_undirty_space(dp, dp->dp_dirty_pertxg[txg & TXG_MASK], txg); + + /* + * Update the long range free counter after + * we're done syncing user data + */ + mutex_enter(&dp->dp_lock); + ASSERT(spa_sync_pass(dp->dp_spa) == 1 || + dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] == 0); + dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] = 0; + mutex_exit(&dp->dp_lock); + + /* + * After the data blocks have been written (ensured by the zio_wait() + * above), update the user/group space accounting. This happens + * in tasks dispatched to dp_sync_taskq, so wait for them before + * continuing. + */ + for (ds = list_head(&synced_datasets); ds != NULL; + ds = list_next(&synced_datasets, ds)) { + dmu_objset_do_userquota_updates(ds->ds_objset, tx); + } + taskq_wait(dp->dp_sync_taskq); + + /* + * Sync the datasets again to push out the changes due to + * userspace updates. This must be done before we process the + * sync tasks, so that any snapshots will have the correct + * user accounting information (and we won't get confused + * about which blocks are part of the snapshot). + */ + zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); + while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) { + ASSERT(list_link_active(&ds->ds_synced_link)); + dmu_buf_rele(ds->ds_dbuf, ds); + dsl_dataset_sync(ds, zio, tx); + } + VERIFY0(zio_wait(zio)); + + /* + * Now that the datasets have been completely synced, we can + * clean up our in-memory structures accumulated while syncing: + * + * - move dead blocks from the pending deadlist to the on-disk deadlist + * - release hold from dsl_dataset_dirty() + */ + while ((ds = list_remove_head(&synced_datasets)) != NULL) { + dsl_dataset_sync_done(ds, tx); + } + while ((dd = txg_list_remove(&dp->dp_dirty_dirs, txg)) != NULL) { + dsl_dir_sync(dd, tx); + } + + /* + * The MOS's space is accounted for in the pool/$MOS + * (dp_mos_dir). We can't modify the mos while we're syncing + * it, so we remember the deltas and apply them here. + */ + if (dp->dp_mos_used_delta != 0 || dp->dp_mos_compressed_delta != 0 || + dp->dp_mos_uncompressed_delta != 0) { + dsl_dir_diduse_space(dp->dp_mos_dir, DD_USED_HEAD, + dp->dp_mos_used_delta, + dp->dp_mos_compressed_delta, + dp->dp_mos_uncompressed_delta, tx); + dp->dp_mos_used_delta = 0; + dp->dp_mos_compressed_delta = 0; + dp->dp_mos_uncompressed_delta = 0; + } + + if (!multilist_is_empty(mos->os_dirty_dnodes[txg & TXG_MASK])) { + dsl_pool_sync_mos(dp, tx); + } + + /* + * If we modify a dataset in the same txg that we want to destroy it, + * its dsl_dir's dd_dbuf will be dirty, and thus have a hold on it. + * dsl_dir_destroy_check() will fail if there are unexpected holds. + * Therefore, we want to sync the MOS (thus syncing the dd_dbuf + * and clearing the hold on it) before we process the sync_tasks. + * The MOS data dirtied by the sync_tasks will be synced on the next + * pass. + */ + if (!txg_list_empty(&dp->dp_sync_tasks, txg)) { + dsl_sync_task_t *dst; + /* + * No more sync tasks should have been added while we + * were syncing. 
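+ * (That is, we must still be in sync pass 1, which the assertion
+ * below verifies.)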
+ */ + ASSERT3U(spa_sync_pass(dp->dp_spa), ==, 1); + while ((dst = txg_list_remove(&dp->dp_sync_tasks, txg)) != NULL) + dsl_sync_task_sync(dst, tx); + } + + dmu_tx_commit(tx); + + DTRACE_PROBE2(dsl_pool_sync__done, dsl_pool_t *dp, dp, uint64_t, txg); +} + +void +dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg) +{ + zilog_t *zilog; + + while ((zilog = txg_list_head(&dp->dp_dirty_zilogs, txg)) != NULL) { + dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os); + /* + * We don't remove the zilog from the dp_dirty_zilogs + * list until after we've cleaned it. This ensures that + * callers of zilog_is_dirty() receive an accurate + * answer when they are racing with the spa sync thread. + */ + zil_clean(zilog, txg); + (void) txg_list_remove_this(&dp->dp_dirty_zilogs, zilog, txg); + ASSERT(!dmu_objset_is_dirty(zilog->zl_os, txg)); + dmu_buf_rele(ds->ds_dbuf, zilog); + } + ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg)); +} + +/* + * TRUE if the current thread is the tx_sync_thread or if we + * are being called from SPA context during pool initialization. + */ +int +dsl_pool_sync_context(dsl_pool_t *dp) +{ + return (curthread == dp->dp_tx.tx_sync_thread || + spa_is_initializing(dp->dp_spa) || + taskq_member(dp->dp_sync_taskq, curthread)); +} + +/* + * This function returns the amount of allocatable space in the pool + * minus whatever space is currently reserved by ZFS for specific + * purposes. Specifically: + * + * 1] Any reserved SLOP space + * 2] Any space used by the checkpoint + * 3] Any space used for deferred frees + * + * The latter 2 are especially important because they are needed to + * rectify the SPA's and DMU's different understanding of how much space + * is used. Now the DMU is aware of that extra space tracked by the SPA + * without having to maintain a separate special dir (e.g. similar to + * $MOS, $FREEING, and $LEAKED). + * + * Note: By deferred frees here, we mean the frees that were deferred + * in spa_sync() after sync pass 1 (spa_deferred_bpobj), and not the + * segments placed in ms_defer trees during metaslab_sync_done(). + */ +uint64_t +dsl_pool_adjustedsize(dsl_pool_t *dp, zfs_space_check_t slop_policy) +{ + spa_t *spa = dp->dp_spa; + uint64_t space, resv, adjustedsize; + uint64_t spa_deferred_frees = + spa->spa_deferred_bpobj.bpo_phys->bpo_bytes; + + space = spa_get_dspace(spa) + - spa_get_checkpoint_space(spa) - spa_deferred_frees; + resv = spa_get_slop_space(spa); + + switch (slop_policy) { + case ZFS_SPACE_CHECK_NORMAL: + break; + case ZFS_SPACE_CHECK_RESERVED: + resv >>= 1; + break; + case ZFS_SPACE_CHECK_EXTRA_RESERVED: + resv >>= 2; + break; + case ZFS_SPACE_CHECK_NONE: + resv = 0; + break; + default: + panic("invalid slop policy value: %d", slop_policy); + break; + } + adjustedsize = (space >= resv) ? (space - resv) : 0; + + return (adjustedsize); +} + +uint64_t +dsl_pool_unreserved_space(dsl_pool_t *dp, zfs_space_check_t slop_policy) +{ + uint64_t poolsize = dsl_pool_adjustedsize(dp, slop_policy); + uint64_t deferred = + metaslab_class_get_deferred(spa_normal_class(dp->dp_spa)); + uint64_t quota = (poolsize >= deferred) ?
(poolsize - deferred) : 0; + return (quota); +} + +boolean_t +dsl_pool_need_dirty_delay(dsl_pool_t *dp) +{ + uint64_t delay_min_bytes = + zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100; + boolean_t rv; + + mutex_enter(&dp->dp_lock); + if (dp->dp_dirty_total > zfs_dirty_data_sync) + txg_kick(dp); + rv = (dp->dp_dirty_total > delay_min_bytes); + mutex_exit(&dp->dp_lock); + return (rv); +} + +void +dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx) +{ + if (space > 0) { + mutex_enter(&dp->dp_lock); + dp->dp_dirty_pertxg[tx->tx_txg & TXG_MASK] += space; + dsl_pool_dirty_delta(dp, space); + mutex_exit(&dp->dp_lock); + } +} + +void +dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg) +{ + ASSERT3S(space, >=, 0); + if (space == 0) + return; + mutex_enter(&dp->dp_lock); + if (dp->dp_dirty_pertxg[txg & TXG_MASK] < space) { + /* XXX writing something we didn't dirty? */ + space = dp->dp_dirty_pertxg[txg & TXG_MASK]; + } + ASSERT3U(dp->dp_dirty_pertxg[txg & TXG_MASK], >=, space); + dp->dp_dirty_pertxg[txg & TXG_MASK] -= space; + ASSERT3U(dp->dp_dirty_total, >=, space); + dsl_pool_dirty_delta(dp, -space); + mutex_exit(&dp->dp_lock); +} + +/* ARGSUSED */ +static int +upgrade_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) +{ + dmu_tx_t *tx = arg; + dsl_dataset_t *ds, *prev = NULL; + int err; + + err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds); + if (err) + return (err); + + while (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) { + err = dsl_dataset_hold_obj(dp, + dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev); + if (err) { + dsl_dataset_rele(ds, FTAG); + return (err); + } + + if (dsl_dataset_phys(prev)->ds_next_snap_obj != ds->ds_object) + break; + dsl_dataset_rele(ds, FTAG); + ds = prev; + prev = NULL; + } + + if (prev == NULL) { + prev = dp->dp_origin_snap; + + /* + * The $ORIGIN can't have any data, or the accounting + * will be wrong. 
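+ * (The assertion below checks that the $ORIGIN snapshot's block
+ * pointer was never born, i.e. that nothing was ever written to it.)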
+ */ + rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); + ASSERT0(dsl_dataset_phys(prev)->ds_bp.blk_birth); + rrw_exit(&ds->ds_bp_rwlock, FTAG); + + /* The origin doesn't get attached to itself */ + if (ds->ds_object == prev->ds_object) { + dsl_dataset_rele(ds, FTAG); + return (0); + } + + dmu_buf_will_dirty(ds->ds_dbuf, tx); + dsl_dataset_phys(ds)->ds_prev_snap_obj = prev->ds_object; + dsl_dataset_phys(ds)->ds_prev_snap_txg = + dsl_dataset_phys(prev)->ds_creation_txg; + + dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); + dsl_dir_phys(ds->ds_dir)->dd_origin_obj = prev->ds_object; + + dmu_buf_will_dirty(prev->ds_dbuf, tx); + dsl_dataset_phys(prev)->ds_num_children++; + + if (dsl_dataset_phys(ds)->ds_next_snap_obj == 0) { + ASSERT(ds->ds_prev == NULL); + VERIFY0(dsl_dataset_hold_obj(dp, + dsl_dataset_phys(ds)->ds_prev_snap_obj, + ds, &ds->ds_prev)); + } + } + + ASSERT3U(dsl_dir_phys(ds->ds_dir)->dd_origin_obj, ==, prev->ds_object); + ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_obj, ==, prev->ds_object); + + if (dsl_dataset_phys(prev)->ds_next_clones_obj == 0) { + dmu_buf_will_dirty(prev->ds_dbuf, tx); + dsl_dataset_phys(prev)->ds_next_clones_obj = + zap_create(dp->dp_meta_objset, + DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx); + } + VERIFY0(zap_add_int(dp->dp_meta_objset, + dsl_dataset_phys(prev)->ds_next_clones_obj, ds->ds_object, tx)); + + dsl_dataset_rele(ds, FTAG); + if (prev != dp->dp_origin_snap) + dsl_dataset_rele(prev, FTAG); + return (0); +} + +void +dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx) +{ + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(dp->dp_origin_snap != NULL); + + VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, upgrade_clones_cb, + tx, DS_FIND_CHILDREN | DS_FIND_SERIALIZE)); +} + +/* ARGSUSED */ +static int +upgrade_dir_clones_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) +{ + dmu_tx_t *tx = arg; + objset_t *mos = dp->dp_meta_objset; + + if (dsl_dir_phys(ds->ds_dir)->dd_origin_obj != 0) { + dsl_dataset_t *origin; + + VERIFY0(dsl_dataset_hold_obj(dp, + dsl_dir_phys(ds->ds_dir)->dd_origin_obj, FTAG, &origin)); + + if (dsl_dir_phys(origin->ds_dir)->dd_clones == 0) { + dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx); + dsl_dir_phys(origin->ds_dir)->dd_clones = + zap_create(mos, DMU_OT_DSL_CLONES, DMU_OT_NONE, + 0, tx); + } + + VERIFY0(zap_add_int(dp->dp_meta_objset, + dsl_dir_phys(origin->ds_dir)->dd_clones, + ds->ds_object, tx)); + + dsl_dataset_rele(origin, FTAG); + } + return (0); +} + +void +dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx) +{ + ASSERT(dmu_tx_is_syncing(tx)); + uint64_t obj; + + (void) dsl_dir_create_sync(dp, dp->dp_root_dir, FREE_DIR_NAME, tx); + VERIFY0(dsl_pool_open_special_dir(dp, + FREE_DIR_NAME, &dp->dp_free_dir)); + + /* + * We can't use bpobj_alloc(), because spa_version() still + * returns the old version, and we need a new-version bpobj with + * subobj support. So call dmu_object_alloc() directly. 
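+ * (Allocating the full sizeof (bpobj_phys_t) bonus buffer below is
+ * what gives the bpobj its subobj capability; an old-version bpobj
+ * would have been created with a shorter bonus buffer.)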
+ */ + obj = dmu_object_alloc(dp->dp_meta_objset, DMU_OT_BPOBJ, + SPA_OLD_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx); + VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx)); + VERIFY0(bpobj_open(&dp->dp_free_bpobj, dp->dp_meta_objset, obj)); + + VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, + upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN | DS_FIND_SERIALIZE)); +} + +void +dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx) +{ + uint64_t dsobj; + dsl_dataset_t *ds; + + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(dp->dp_origin_snap == NULL); + ASSERT(rrw_held(&dp->dp_config_rwlock, RW_WRITER)); + + /* create the origin dir, ds, & snap-ds */ + dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME, + NULL, 0, kcred, tx); + VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); + dsl_dataset_snapshot_sync_impl(ds, ORIGIN_DIR_NAME, tx); + VERIFY0(dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj, + dp, &dp->dp_origin_snap)); + dsl_dataset_rele(ds, FTAG); +} + +taskq_t * +dsl_pool_vnrele_taskq(dsl_pool_t *dp) +{ + return (dp->dp_vnrele_taskq); +} + +/* + * Walk through the pool-wide zap object of temporary snapshot user holds + * and release them. + */ +void +dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp) +{ + zap_attribute_t za; + zap_cursor_t zc; + objset_t *mos = dp->dp_meta_objset; + uint64_t zapobj = dp->dp_tmp_userrefs_obj; + nvlist_t *holds; + + if (zapobj == 0) + return; + ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); + + holds = fnvlist_alloc(); + + for (zap_cursor_init(&zc, mos, zapobj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + char *htag; + nvlist_t *tags; + + htag = strchr(za.za_name, '-'); + *htag = '\0'; + ++htag; + if (nvlist_lookup_nvlist(holds, za.za_name, &tags) != 0) { + tags = fnvlist_alloc(); + fnvlist_add_boolean(tags, htag); + fnvlist_add_nvlist(holds, za.za_name, tags); + fnvlist_free(tags); + } else { + fnvlist_add_boolean(tags, htag); + } + } + dsl_dataset_user_release_tmp(dp, holds); + fnvlist_free(holds); + zap_cursor_fini(&zc); +} + +/* + * Create the pool-wide zap object for storing temporary snapshot holds. + */ +void +dsl_pool_user_hold_create_obj(dsl_pool_t *dp, dmu_tx_t *tx) +{ + objset_t *mos = dp->dp_meta_objset; + + ASSERT(dp->dp_tmp_userrefs_obj == 0); + ASSERT(dmu_tx_is_syncing(tx)); + + dp->dp_tmp_userrefs_obj = zap_create_link(mos, DMU_OT_USERREFS, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS, tx); +} + +static int +dsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj, + const char *tag, uint64_t now, dmu_tx_t *tx, boolean_t holding) +{ + objset_t *mos = dp->dp_meta_objset; + uint64_t zapobj = dp->dp_tmp_userrefs_obj; + char *name; + int error; + + ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); + ASSERT(dmu_tx_is_syncing(tx)); + + /* + * If the pool was created prior to SPA_VERSION_USERREFS, the + * zap object for temporary holds might not exist yet. + */ + if (zapobj == 0) { + if (holding) { + dsl_pool_user_hold_create_obj(dp, tx); + zapobj = dp->dp_tmp_userrefs_obj; + } else { + return (SET_ERROR(ENOENT)); + } + } + + name = kmem_asprintf("%llx-%s", (u_longlong_t)dsobj, tag); + if (holding) + error = zap_add(mos, zapobj, name, 8, 1, &now, tx); + else + error = zap_remove(mos, zapobj, name, tx); + strfree(name); + + return (error); +} + +/* + * Add a temporary hold for the given dataset object and tag. 
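+ * The hold is recorded in the MOS zap object as an entry whose name
+ * is "<dsobj in hex>-<tag>" (see the "%llx-%s" format above); purely
+ * as an illustration, a hold with tag "mytag" on dataset object 0x36
+ * would be stored under the name "36-mytag".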
+ */ +int +dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, const char *tag, + uint64_t now, dmu_tx_t *tx) +{ + return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, now, tx, B_TRUE)); +} + +/* + * Release a temporary hold for the given dataset object and tag. + */ +int +dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag, + dmu_tx_t *tx) +{ + return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, 0, + tx, B_FALSE)); +} + +/* + * DSL Pool Configuration Lock + * + * The dp_config_rwlock protects against changes to DSL state (e.g. dataset + * creation / destruction / rename / property setting). It must be held for + * read to hold a dataset or dsl_dir. I.e. you must call + * dsl_pool_config_enter() or dsl_pool_hold() before calling + * dsl_{dataset,dir}_hold{_obj}. In most circumstances, the dp_config_rwlock + * must be held continuously until all datasets and dsl_dirs are released. + * + * The only exception to this rule is that if a "long hold" is placed on + * a dataset, then the dp_config_rwlock may be dropped while the dataset + * is still held. The long hold will prevent the dataset from being + * destroyed -- the destroy will fail with EBUSY. A long hold can be + * obtained by calling dsl_dataset_long_hold(), or by "owning" a dataset + * (by calling dsl_{dataset,objset}_{try}own{_obj}). + * + * Legitimate long-holders (including owners) should be long-running, cancelable + * tasks that should cause "zfs destroy" to fail. This includes DMU + * consumers (i.e. a ZPL filesystem being mounted or ZVOL being open), + * "zfs send", and "zfs diff". There are several other long-holders whose + * uses are suboptimal (e.g. "zfs promote", and zil_suspend()). + * + * The usual formula for long-holding would be: + * dsl_pool_hold() + * dsl_dataset_hold() + * ... perform checks ... + * dsl_dataset_long_hold() + * dsl_pool_rele() + * ... perform long-running task ... + * dsl_dataset_long_rele() + * dsl_dataset_rele() + * + * Note that when the long hold is released, the dataset is still held but + * the pool is not held. The dataset may change arbitrarily during this time + * (e.g. it could be destroyed). Therefore you shouldn't do anything to the + * dataset except release it. + * + * User-initiated operations (e.g. ioctls, zfs_ioc_*()) are either read-only + * or modifying operations. + * + * Modifying operations should generally use dsl_sync_task(). The synctask + * infrastructure enforces proper locking strategy with respect to the + * dp_config_rwlock. See the comment above dsl_sync_task() for details. + * + * Read-only operations will manually hold the pool, then the dataset, obtain + * information from the dataset, then release the pool and dataset. + * dmu_objset_{hold,rele}() are convenience routines that also do the pool + * hold/rele. + */ + +int +dsl_pool_hold(const char *name, void *tag, dsl_pool_t **dp) +{ + spa_t *spa; + int error; + + error = spa_open(name, &spa, tag); + if (error == 0) { + *dp = spa_get_dsl(spa); + dsl_pool_config_enter(*dp, tag); + } + return (error); +} + +void +dsl_pool_rele(dsl_pool_t *dp, void *tag) +{ + dsl_pool_config_exit(dp, tag); + spa_close(dp->dp_spa, tag); +} + +void +dsl_pool_config_enter(dsl_pool_t *dp, void *tag) +{ + /* + * We use a "reentrant" reader-writer lock, but not reentrantly. + * + * The rrwlock can (with the track_all flag) track all reading threads, + * which is very useful for debugging which code path failed to release + * the lock, and for verifying that the *current* thread does hold + * the lock. 
+ * + * (Unlike a rwlock, which knows that N threads hold it for + * read, but not *which* threads, so rw_held(RW_READER) returns TRUE + * if any thread holds it for read, even if this thread doesn't). + */ + ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER)); + rrw_enter(&dp->dp_config_rwlock, RW_READER, tag); +} + +void +dsl_pool_config_enter_prio(dsl_pool_t *dp, void *tag) +{ + ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER)); + rrw_enter_read_prio(&dp->dp_config_rwlock, tag); +} + +void +dsl_pool_config_exit(dsl_pool_t *dp, void *tag) +{ + rrw_exit(&dp->dp_config_rwlock, tag); +} + +boolean_t +dsl_pool_config_held(dsl_pool_t *dp) +{ + return (RRW_LOCK_HELD(&dp->dp_config_rwlock)); +} + +boolean_t +dsl_pool_config_held_writer(dsl_pool_t *dp) +{ + return (RRW_WRITE_HELD(&dp->dp_config_rwlock)); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c new file mode 100644 index 000000000000..50aef5b618f9 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c @@ -0,0 +1,1211 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright 2015, Joyent, Inc. + */ + +#include <sys/zfs_context.h> +#include <sys/dmu.h> +#include <sys/dmu_objset.h> +#include <sys/dmu_tx.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_prop.h> +#include <sys/dsl_synctask.h> +#include <sys/spa.h> +#include <sys/zap.h> +#include <sys/fs/zfs.h> + +#include "zfs_prop.h" + +#define ZPROP_INHERIT_SUFFIX "$inherit" +#define ZPROP_RECVD_SUFFIX "$recvd" + +static int +dodefault(zfs_prop_t prop, int intsz, int numints, void *buf) +{ + /* + * The setonce properties are read-only, BUT they still + * have a default value that can be used as the initial + * value. 
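+ * (Examples of setonce properties are "casesensitivity" and
+ * "normalization": they are read-only after creation, yet dodefault()
+ * must still be able to report their initial defaults.)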
+ */ + if (prop == ZPROP_INVAL || + (zfs_prop_readonly(prop) && !zfs_prop_setonce(prop))) + return (SET_ERROR(ENOENT)); + + if (zfs_prop_get_type(prop) == PROP_TYPE_STRING) { + if (zfs_prop_default_string(prop) == NULL) + return (SET_ERROR(ENOENT)); + if (intsz != 1) + return (SET_ERROR(EOVERFLOW)); + (void) strncpy(buf, zfs_prop_default_string(prop), + numints); + } else { + if (intsz != 8 || numints < 1) + return (SET_ERROR(EOVERFLOW)); + + *(uint64_t *)buf = zfs_prop_default_numeric(prop); + } + + return (0); +} + +int +dsl_prop_get_dd(dsl_dir_t *dd, const char *propname, + int intsz, int numints, void *buf, char *setpoint, boolean_t snapshot) +{ + int err = ENOENT; + dsl_dir_t *target = dd; + objset_t *mos = dd->dd_pool->dp_meta_objset; + zfs_prop_t prop; + boolean_t inheritable; + boolean_t inheriting = B_FALSE; + char *inheritstr; + char *recvdstr; + + ASSERT(dsl_pool_config_held(dd->dd_pool)); + + if (setpoint) + setpoint[0] = '\0'; + + prop = zfs_name_to_prop(propname); + inheritable = (prop == ZPROP_INVAL || zfs_prop_inheritable(prop)); + inheritstr = kmem_asprintf("%s%s", propname, ZPROP_INHERIT_SUFFIX); + recvdstr = kmem_asprintf("%s%s", propname, ZPROP_RECVD_SUFFIX); + + /* + * Note: dd may become NULL, therefore we shouldn't dereference it + * after this loop. + */ + for (; dd != NULL; dd = dd->dd_parent) { + if (dd != target || snapshot) { + if (!inheritable) + break; + inheriting = B_TRUE; + } + + /* Check for a local value. */ + err = zap_lookup(mos, dsl_dir_phys(dd)->dd_props_zapobj, + propname, intsz, numints, buf); + if (err != ENOENT) { + if (setpoint != NULL && err == 0) + dsl_dir_name(dd, setpoint); + break; + } + + /* + * Skip the check for a received value if there is an explicit + * inheritance entry. + */ + err = zap_contains(mos, dsl_dir_phys(dd)->dd_props_zapobj, + inheritstr); + if (err != 0 && err != ENOENT) + break; + + if (err == ENOENT) { + /* Check for a received value. */ + err = zap_lookup(mos, dsl_dir_phys(dd)->dd_props_zapobj, + recvdstr, intsz, numints, buf); + if (err != ENOENT) { + if (setpoint != NULL && err == 0) { + if (inheriting) { + dsl_dir_name(dd, setpoint); + } else { + (void) strcpy(setpoint, + ZPROP_SOURCE_VAL_RECVD); + } + } + break; + } + } + + /* + * If we found an explicit inheritance entry, err is zero even + * though we haven't yet found the value, so reinitializing err + * at the end of the loop (instead of at the beginning) ensures + * that err has a valid post-loop value. + */ + err = SET_ERROR(ENOENT); + } + + if (err == ENOENT) + err = dodefault(prop, intsz, numints, buf); + + strfree(inheritstr); + strfree(recvdstr); + + return (err); +} + +int +dsl_prop_get_ds(dsl_dataset_t *ds, const char *propname, + int intsz, int numints, void *buf, char *setpoint) +{ + zfs_prop_t prop = zfs_name_to_prop(propname); + boolean_t inheritable; + uint64_t zapobj; + + ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); + inheritable = (prop == ZPROP_INVAL || zfs_prop_inheritable(prop)); + zapobj = dsl_dataset_phys(ds)->ds_props_obj; + + if (zapobj != 0) { + objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; + int err; + + ASSERT(ds->ds_is_snapshot); + + /* Check for a local value. */ + err = zap_lookup(mos, zapobj, propname, intsz, numints, buf); + if (err != ENOENT) { + if (setpoint != NULL && err == 0) + dsl_dataset_name(ds, setpoint); + return (err); + } + + /* + * Skip the check for a received value if there is an explicit + * inheritance entry. 
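+ * ("<prop>$inherit" takes precedence over "<prop>$recvd": the received
+ * value is consulted below only when no inheritance entry exists.)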
+ */ + if (inheritable) { + char *inheritstr = kmem_asprintf("%s%s", propname, + ZPROP_INHERIT_SUFFIX); + err = zap_contains(mos, zapobj, inheritstr); + strfree(inheritstr); + if (err != 0 && err != ENOENT) + return (err); + } + + if (err == ENOENT) { + /* Check for a received value. */ + char *recvdstr = kmem_asprintf("%s%s", propname, + ZPROP_RECVD_SUFFIX); + err = zap_lookup(mos, zapobj, recvdstr, + intsz, numints, buf); + strfree(recvdstr); + if (err != ENOENT) { + if (setpoint != NULL && err == 0) + (void) strcpy(setpoint, + ZPROP_SOURCE_VAL_RECVD); + return (err); + } + } + } + + return (dsl_prop_get_dd(ds->ds_dir, propname, + intsz, numints, buf, setpoint, ds->ds_is_snapshot)); +} + +static dsl_prop_record_t * +dsl_prop_record_find(dsl_dir_t *dd, const char *propname) +{ + dsl_prop_record_t *pr = NULL; + + ASSERT(MUTEX_HELD(&dd->dd_lock)); + + for (pr = list_head(&dd->dd_props); + pr != NULL; pr = list_next(&dd->dd_props, pr)) { + if (strcmp(pr->pr_propname, propname) == 0) + break; + } + + return (pr); +} + +static dsl_prop_record_t * +dsl_prop_record_create(dsl_dir_t *dd, const char *propname) +{ + dsl_prop_record_t *pr; + + ASSERT(MUTEX_HELD(&dd->dd_lock)); + + pr = kmem_alloc(sizeof (dsl_prop_record_t), KM_SLEEP); + pr->pr_propname = spa_strdup(propname); + list_create(&pr->pr_cbs, sizeof (dsl_prop_cb_record_t), + offsetof(dsl_prop_cb_record_t, cbr_pr_node)); + list_insert_head(&dd->dd_props, pr); + + return (pr); +} + +void +dsl_prop_init(dsl_dir_t *dd) +{ + list_create(&dd->dd_props, sizeof (dsl_prop_record_t), + offsetof(dsl_prop_record_t, pr_node)); +} + +void +dsl_prop_fini(dsl_dir_t *dd) +{ + dsl_prop_record_t *pr; + + while ((pr = list_remove_head(&dd->dd_props)) != NULL) { + list_destroy(&pr->pr_cbs); + strfree((char *)pr->pr_propname); + kmem_free(pr, sizeof (dsl_prop_record_t)); + } + list_destroy(&dd->dd_props); +} + +/* + * Register interest in the named property. We'll call the callback + * once to notify it of the current property value, and again each time + * the property changes, until this callback is unregistered. + * + * Return 0 on success, errno if the prop is not an integer value. + */ +int +dsl_prop_register(dsl_dataset_t *ds, const char *propname, + dsl_prop_changed_cb_t *callback, void *cbarg) +{ + dsl_dir_t *dd = ds->ds_dir; + dsl_pool_t *dp = dd->dd_pool; + uint64_t value; + dsl_prop_record_t *pr; + dsl_prop_cb_record_t *cbr; + int err; + + ASSERT(dsl_pool_config_held(dp)); + + err = dsl_prop_get_int_ds(ds, propname, &value); + if (err != 0) + return (err); + + cbr = kmem_alloc(sizeof (dsl_prop_cb_record_t), KM_SLEEP); + cbr->cbr_ds = ds; + cbr->cbr_func = callback; + cbr->cbr_arg = cbarg; + + mutex_enter(&dd->dd_lock); + pr = dsl_prop_record_find(dd, propname); + if (pr == NULL) + pr = dsl_prop_record_create(dd, propname); + cbr->cbr_pr = pr; + list_insert_head(&pr->pr_cbs, cbr); + list_insert_head(&ds->ds_prop_cbs, cbr); + mutex_exit(&dd->dd_lock); + + cbr->cbr_func(cbr->cbr_arg, value); + return (0); +} + +int +dsl_prop_get(const char *dsname, const char *propname, + int intsz, int numints, void *buf, char *setpoint) +{ + objset_t *os; + int error; + + error = dmu_objset_hold(dsname, FTAG, &os); + if (error != 0) + return (error); + + error = dsl_prop_get_ds(dmu_objset_ds(os), propname, + intsz, numints, buf, setpoint); + + dmu_objset_rele(os, FTAG); + return (error); +} + +/* + * Get the current property value. 
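+ * (Illustrative use: dsl_prop_get_integer("tank/fs", "recordsize",
+ * &val, setpoint) would fetch the effective recordsize and, if
+ * setpoint is non-NULL, the name of the dataset it is set on.)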
It may have changed by the time this + * function returns, so it is NOT safe to follow up with + * dsl_prop_register() and assume that the value has not changed in + * between. + * + * Return 0 on success, ENOENT if ddname is invalid. + */ +int +dsl_prop_get_integer(const char *ddname, const char *propname, + uint64_t *valuep, char *setpoint) +{ + return (dsl_prop_get(ddname, propname, 8, 1, valuep, setpoint)); +} + +int +dsl_prop_get_int_ds(dsl_dataset_t *ds, const char *propname, + uint64_t *valuep) +{ + return (dsl_prop_get_ds(ds, propname, 8, 1, valuep, NULL)); +} + +/* + * Predict the effective value of the given special property if it were set with + * the given value and source. This is not a general purpose function. It exists + * only to handle the special requirements of the quota and reservation + * properties. The fact that these properties are non-inheritable greatly + * simplifies the prediction logic. + * + * Returns 0 on success, a positive error code on failure, or -1 if called with + * a property not handled by this function. + */ +int +dsl_prop_predict(dsl_dir_t *dd, const char *propname, + zprop_source_t source, uint64_t value, uint64_t *newvalp) +{ + zfs_prop_t prop = zfs_name_to_prop(propname); + objset_t *mos; + uint64_t zapobj; + uint64_t version; + char *recvdstr; + int err = 0; + + switch (prop) { + case ZFS_PROP_QUOTA: + case ZFS_PROP_RESERVATION: + case ZFS_PROP_REFQUOTA: + case ZFS_PROP_REFRESERVATION: + break; + default: + return (-1); + } + + mos = dd->dd_pool->dp_meta_objset; + zapobj = dsl_dir_phys(dd)->dd_props_zapobj; + recvdstr = kmem_asprintf("%s%s", propname, ZPROP_RECVD_SUFFIX); + + version = spa_version(dd->dd_pool->dp_spa); + if (version < SPA_VERSION_RECVD_PROPS) { + if (source & ZPROP_SRC_NONE) + source = ZPROP_SRC_NONE; + else if (source & ZPROP_SRC_RECEIVED) + source = ZPROP_SRC_LOCAL; + } + + switch (source) { + case ZPROP_SRC_NONE: + /* Revert to the received value, if any. */ + err = zap_lookup(mos, zapobj, recvdstr, 8, 1, newvalp); + if (err == ENOENT) + *newvalp = 0; + break; + case ZPROP_SRC_LOCAL: + *newvalp = value; + break; + case ZPROP_SRC_RECEIVED: + /* + * If there's no local setting, then the new received value will + * be the effective value. + */ + err = zap_lookup(mos, zapobj, propname, 8, 1, newvalp); + if (err == ENOENT) + *newvalp = value; + break; + case (ZPROP_SRC_NONE | ZPROP_SRC_RECEIVED): + /* + * We're clearing the received value, so the local setting (if + * it exists) remains the effective value. + */ + err = zap_lookup(mos, zapobj, propname, 8, 1, newvalp); + if (err == ENOENT) + *newvalp = 0; + break; + default: + panic("unexpected property source: %d", source); + } + + strfree(recvdstr); + + if (err == ENOENT) + return (0); + + return (err); +} + +/* + * Unregister all callbacks that are registered with the + * given callback argument. 
+ */ +void +dsl_prop_unregister_all(dsl_dataset_t *ds, void *cbarg) +{ + dsl_prop_cb_record_t *cbr, *next_cbr; + + dsl_dir_t *dd = ds->ds_dir; + + mutex_enter(&dd->dd_lock); + next_cbr = list_head(&ds->ds_prop_cbs); + while (next_cbr != NULL) { + cbr = next_cbr; + next_cbr = list_next(&ds->ds_prop_cbs, cbr); + if (cbr->cbr_arg == cbarg) { + list_remove(&ds->ds_prop_cbs, cbr); + list_remove(&cbr->cbr_pr->pr_cbs, cbr); + kmem_free(cbr, sizeof (dsl_prop_cb_record_t)); + } + } + mutex_exit(&dd->dd_lock); +} + +boolean_t +dsl_prop_hascb(dsl_dataset_t *ds) +{ + return (!list_is_empty(&ds->ds_prop_cbs)); +} + +/* ARGSUSED */ +static int +dsl_prop_notify_all_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) +{ + dsl_dir_t *dd = ds->ds_dir; + dsl_prop_record_t *pr; + dsl_prop_cb_record_t *cbr; + + mutex_enter(&dd->dd_lock); + for (pr = list_head(&dd->dd_props); + pr; pr = list_next(&dd->dd_props, pr)) { + for (cbr = list_head(&pr->pr_cbs); cbr; + cbr = list_next(&pr->pr_cbs, cbr)) { + uint64_t value; + + /* + * Callback entries do not have holds on their + * datasets so that datasets with registered + * callbacks are still eligible for eviction. + * Unlike operations to update properties on a + * single dataset, we are performing a recursive + * descent of related head datasets. The caller + * of this function only has a dataset hold on + * the passed in head dataset, not the snapshots + * associated with this dataset. Without a hold, + * the dataset pointer within callback records + * for snapshots can be invalidated by eviction + * at any time. + * + * Use dsl_dataset_try_add_ref() to verify + * that the dataset for a snapshot has not + * begun eviction processing and to prevent + * eviction from occurring for the duration of + * the callback. If the hold attempt fails, + * this object is already being evicted and the + * callback can be safely ignored. + */ + if (ds != cbr->cbr_ds && + !dsl_dataset_try_add_ref(dp, cbr->cbr_ds, FTAG)) + continue; + + if (dsl_prop_get_ds(cbr->cbr_ds, + cbr->cbr_pr->pr_propname, sizeof (value), 1, + &value, NULL) == 0) + cbr->cbr_func(cbr->cbr_arg, value); + + if (ds != cbr->cbr_ds) + dsl_dataset_rele(cbr->cbr_ds, FTAG); + } + } + mutex_exit(&dd->dd_lock); + + return (0); +} + +/* + * Update all property values for ddobj & its descendants. This is used + * when renaming the dir. + */ +void +dsl_prop_notify_all(dsl_dir_t *dd) +{ + dsl_pool_t *dp = dd->dd_pool; + ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); + (void) dmu_objset_find_dp(dp, dd->dd_object, dsl_prop_notify_all_cb, + NULL, DS_FIND_CHILDREN); +} + +static void +dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj, + const char *propname, uint64_t value, int first) +{ + dsl_dir_t *dd; + dsl_prop_record_t *pr; + dsl_prop_cb_record_t *cbr; + objset_t *mos = dp->dp_meta_objset; + zap_cursor_t zc; + zap_attribute_t *za; + int err; + + ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); + err = dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd); + if (err) + return; + + if (!first) { + /* + * If the prop is set here, then this change is not + * being inherited here or below; stop the recursion. 
+ */ + err = zap_contains(mos, dsl_dir_phys(dd)->dd_props_zapobj, + propname); + if (err == 0) { + dsl_dir_rele(dd, FTAG); + return; + } + ASSERT3U(err, ==, ENOENT); + } + + mutex_enter(&dd->dd_lock); + pr = dsl_prop_record_find(dd, propname); + if (pr != NULL) { + for (cbr = list_head(&pr->pr_cbs); cbr; + cbr = list_next(&pr->pr_cbs, cbr)) { + uint64_t propobj; + + /* + * cbr->cbr_ds may be invalidated due to eviction, + * requiring the use of dsl_dataset_try_add_ref(). + * See comment block in dsl_prop_notify_all_cb() + * for details. + */ + if (!dsl_dataset_try_add_ref(dp, cbr->cbr_ds, FTAG)) + continue; + + propobj = dsl_dataset_phys(cbr->cbr_ds)->ds_props_obj; + + /* + * If the property is not set on this ds, then it is + * inherited here; call the callback. + */ + if (propobj == 0 || + zap_contains(mos, propobj, propname) != 0) + cbr->cbr_func(cbr->cbr_arg, value); + + dsl_dataset_rele(cbr->cbr_ds, FTAG); + } + } + mutex_exit(&dd->dd_lock); + + za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); + for (zap_cursor_init(&zc, mos, + dsl_dir_phys(dd)->dd_child_dir_zapobj); + zap_cursor_retrieve(&zc, za) == 0; + zap_cursor_advance(&zc)) { + dsl_prop_changed_notify(dp, za->za_first_integer, + propname, value, FALSE); + } + kmem_free(za, sizeof (zap_attribute_t)); + zap_cursor_fini(&zc); + dsl_dir_rele(dd, FTAG); +} + +void +dsl_prop_set_sync_impl(dsl_dataset_t *ds, const char *propname, + zprop_source_t source, int intsz, int numints, const void *value, + dmu_tx_t *tx) +{ + objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; + uint64_t zapobj, intval, dummy; + int isint; + char valbuf[32]; + const char *valstr = NULL; + char *inheritstr; + char *recvdstr; + char *tbuf = NULL; + int err; + uint64_t version = spa_version(ds->ds_dir->dd_pool->dp_spa); + + isint = (dodefault(zfs_name_to_prop(propname), 8, 1, &intval) == 0); + + if (ds->ds_is_snapshot) { + ASSERT(version >= SPA_VERSION_SNAP_PROPS); + if (dsl_dataset_phys(ds)->ds_props_obj == 0) { + dmu_buf_will_dirty(ds->ds_dbuf, tx); + dsl_dataset_phys(ds)->ds_props_obj = + zap_create(mos, + DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx); + } + zapobj = dsl_dataset_phys(ds)->ds_props_obj; + } else { + zapobj = dsl_dir_phys(ds->ds_dir)->dd_props_zapobj; + } + + if (version < SPA_VERSION_RECVD_PROPS) { + if (source & ZPROP_SRC_NONE) + source = ZPROP_SRC_NONE; + else if (source & ZPROP_SRC_RECEIVED) + source = ZPROP_SRC_LOCAL; + } + + inheritstr = kmem_asprintf("%s%s", propname, ZPROP_INHERIT_SUFFIX); + recvdstr = kmem_asprintf("%s%s", propname, ZPROP_RECVD_SUFFIX); + + switch (source) { + case ZPROP_SRC_NONE: + /* + * revert to received value, if any (inherit -S) + * - remove propname + * - remove propname$inherit + */ + err = zap_remove(mos, zapobj, propname, tx); + ASSERT(err == 0 || err == ENOENT); + err = zap_remove(mos, zapobj, inheritstr, tx); + ASSERT(err == 0 || err == ENOENT); + break; + case ZPROP_SRC_LOCAL: + /* + * remove propname$inherit + * set propname -> value + */ + err = zap_remove(mos, zapobj, inheritstr, tx); + ASSERT(err == 0 || err == ENOENT); + VERIFY0(zap_update(mos, zapobj, propname, + intsz, numints, value, tx)); + break; + case ZPROP_SRC_INHERITED: + /* + * explicitly inherit + * - remove propname + * - set propname$inherit + */ + err = zap_remove(mos, zapobj, propname, tx); + ASSERT(err == 0 || err == ENOENT); + if (version >= SPA_VERSION_RECVD_PROPS && + dsl_prop_get_int_ds(ds, ZPROP_HAS_RECVD, &dummy) == 0) { + dummy = 0; + VERIFY0(zap_update(mos, zapobj, inheritstr, + 8, 1, &dummy, tx)); + } + break; + case 
ZPROP_SRC_RECEIVED: + /* + * set propname$recvd -> value + */ + err = zap_update(mos, zapobj, recvdstr, + intsz, numints, value, tx); + ASSERT(err == 0); + break; + case (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED): + /* + * clear local and received settings + * - remove propname + * - remove propname$inherit + * - remove propname$recvd + */ + err = zap_remove(mos, zapobj, propname, tx); + ASSERT(err == 0 || err == ENOENT); + err = zap_remove(mos, zapobj, inheritstr, tx); + ASSERT(err == 0 || err == ENOENT); + /* FALLTHRU */ + case (ZPROP_SRC_NONE | ZPROP_SRC_RECEIVED): + /* + * remove propname$recvd + */ + err = zap_remove(mos, zapobj, recvdstr, tx); + ASSERT(err == 0 || err == ENOENT); + break; + default: + cmn_err(CE_PANIC, "unexpected property source: %d", source); + } + + strfree(inheritstr); + strfree(recvdstr); + + if (isint) { + VERIFY0(dsl_prop_get_int_ds(ds, propname, &intval)); + + if (ds->ds_is_snapshot) { + dsl_prop_cb_record_t *cbr; + /* + * It's a snapshot; nothing can inherit this + * property, so just look for callbacks on this + * ds here. + */ + mutex_enter(&ds->ds_dir->dd_lock); + for (cbr = list_head(&ds->ds_prop_cbs); cbr; + cbr = list_next(&ds->ds_prop_cbs, cbr)) { + if (strcmp(cbr->cbr_pr->pr_propname, + propname) == 0) + cbr->cbr_func(cbr->cbr_arg, intval); + } + mutex_exit(&ds->ds_dir->dd_lock); + } else { + dsl_prop_changed_notify(ds->ds_dir->dd_pool, + ds->ds_dir->dd_object, propname, intval, TRUE); + } + + (void) snprintf(valbuf, sizeof (valbuf), + "%lld", (longlong_t)intval); + valstr = valbuf; + } else { + if (source == ZPROP_SRC_LOCAL) { + valstr = value; + } else { + tbuf = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP); + if (dsl_prop_get_ds(ds, propname, 1, + ZAP_MAXVALUELEN, tbuf, NULL) == 0) + valstr = tbuf; + } + } + + spa_history_log_internal_ds(ds, (source == ZPROP_SRC_NONE || + source == ZPROP_SRC_INHERITED) ? "inherit" : "set", tx, + "%s=%s", propname, (valstr == NULL ? 
"" : valstr)); + + if (tbuf != NULL) + kmem_free(tbuf, ZAP_MAXVALUELEN); +} + +int +dsl_prop_set_int(const char *dsname, const char *propname, + zprop_source_t source, uint64_t value) +{ + nvlist_t *nvl = fnvlist_alloc(); + int error; + + fnvlist_add_uint64(nvl, propname, value); + error = dsl_props_set(dsname, source, nvl); + fnvlist_free(nvl); + return (error); +} + +int +dsl_prop_set_string(const char *dsname, const char *propname, + zprop_source_t source, const char *value) +{ + nvlist_t *nvl = fnvlist_alloc(); + int error; + + fnvlist_add_string(nvl, propname, value); + error = dsl_props_set(dsname, source, nvl); + fnvlist_free(nvl); + return (error); +} + +int +dsl_prop_inherit(const char *dsname, const char *propname, + zprop_source_t source) +{ + nvlist_t *nvl = fnvlist_alloc(); + int error; + + fnvlist_add_boolean(nvl, propname); + error = dsl_props_set(dsname, source, nvl); + fnvlist_free(nvl); + return (error); +} + +typedef struct dsl_props_set_arg { + const char *dpsa_dsname; + zprop_source_t dpsa_source; + nvlist_t *dpsa_props; +} dsl_props_set_arg_t; + +static int +dsl_props_set_check(void *arg, dmu_tx_t *tx) +{ + dsl_props_set_arg_t *dpsa = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + uint64_t version; + nvpair_t *elem = NULL; + int err; + + err = dsl_dataset_hold(dp, dpsa->dpsa_dsname, FTAG, &ds); + if (err != 0) + return (err); + + version = spa_version(ds->ds_dir->dd_pool->dp_spa); + while ((elem = nvlist_next_nvpair(dpsa->dpsa_props, elem)) != NULL) { + if (strlen(nvpair_name(elem)) >= ZAP_MAXNAMELEN) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(ENAMETOOLONG)); + } + if (nvpair_type(elem) == DATA_TYPE_STRING) { + char *valstr = fnvpair_value_string(elem); + if (strlen(valstr) >= (version < + SPA_VERSION_STMF_PROP ? + ZAP_OLDMAXVALUELEN : ZAP_MAXVALUELEN)) { + dsl_dataset_rele(ds, FTAG); + return (E2BIG); + } + } + } + + if (ds->ds_is_snapshot && version < SPA_VERSION_SNAP_PROPS) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(ENOTSUP)); + } + dsl_dataset_rele(ds, FTAG); + return (0); +} + +void +dsl_props_set_sync_impl(dsl_dataset_t *ds, zprop_source_t source, + nvlist_t *props, dmu_tx_t *tx) +{ + nvpair_t *elem = NULL; + + while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { + nvpair_t *pair = elem; + const char *name = nvpair_name(pair); + + if (nvpair_type(pair) == DATA_TYPE_NVLIST) { + /* + * This usually happens when we reuse the nvlist_t data + * returned by the counterpart dsl_prop_get_all_impl(). + * For instance we do this to restore the original + * received properties when an error occurs in the + * zfs_ioc_recv() codepath. 
+ */ + nvlist_t *attrs = fnvpair_value_nvlist(pair); + pair = fnvlist_lookup_nvpair(attrs, ZPROP_VALUE); + } + + if (nvpair_type(pair) == DATA_TYPE_STRING) { + const char *value = fnvpair_value_string(pair); + dsl_prop_set_sync_impl(ds, name, + source, 1, strlen(value) + 1, value, tx); + } else if (nvpair_type(pair) == DATA_TYPE_UINT64) { + uint64_t intval = fnvpair_value_uint64(pair); + dsl_prop_set_sync_impl(ds, name, + source, sizeof (intval), 1, &intval, tx); + } else if (nvpair_type(pair) == DATA_TYPE_BOOLEAN) { + dsl_prop_set_sync_impl(ds, name, + source, 0, 0, NULL, tx); + } else { + panic("invalid nvpair type"); + } + } +} + +static void +dsl_props_set_sync(void *arg, dmu_tx_t *tx) +{ + dsl_props_set_arg_t *dpsa = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + + VERIFY0(dsl_dataset_hold(dp, dpsa->dpsa_dsname, FTAG, &ds)); + dsl_props_set_sync_impl(ds, dpsa->dpsa_source, dpsa->dpsa_props, tx); + dsl_dataset_rele(ds, FTAG); +} + +/* + * All-or-nothing; if any prop can't be set, nothing will be modified. + */ +int +dsl_props_set(const char *dsname, zprop_source_t source, nvlist_t *props) +{ + dsl_props_set_arg_t dpsa; + int nblks = 0; + + dpsa.dpsa_dsname = dsname; + dpsa.dpsa_source = source; + dpsa.dpsa_props = props; + + /* + * If the source includes NONE, then we will only be removing entries + * from the ZAP object. In that case don't check for ENOSPC. + */ + if ((source & ZPROP_SRC_NONE) == 0) + nblks = 2 * fnvlist_num_pairs(props); + + return (dsl_sync_task(dsname, dsl_props_set_check, dsl_props_set_sync, + &dpsa, nblks, ZFS_SPACE_CHECK_RESERVED)); +} + +typedef enum dsl_prop_getflags { + DSL_PROP_GET_INHERITING = 0x1, /* searching parent of target ds */ + DSL_PROP_GET_SNAPSHOT = 0x2, /* snapshot dataset */ + DSL_PROP_GET_LOCAL = 0x4, /* local properties */ + DSL_PROP_GET_RECEIVED = 0x8 /* received properties */ +} dsl_prop_getflags_t; + +static int +dsl_prop_get_all_impl(objset_t *mos, uint64_t propobj, + const char *setpoint, dsl_prop_getflags_t flags, nvlist_t *nv) +{ + zap_cursor_t zc; + zap_attribute_t za; + int err = 0; + + for (zap_cursor_init(&zc, mos, propobj); + (err = zap_cursor_retrieve(&zc, &za)) == 0; + zap_cursor_advance(&zc)) { + nvlist_t *propval; + zfs_prop_t prop; + char buf[ZAP_MAXNAMELEN]; + char *valstr; + const char *suffix; + const char *propname; + const char *source; + + suffix = strchr(za.za_name, '$'); + + if (suffix == NULL) { + /* + * Skip local properties if we only want received + * properties. + */ + if (flags & DSL_PROP_GET_RECEIVED) + continue; + + propname = za.za_name; + source = setpoint; + } else if (strcmp(suffix, ZPROP_INHERIT_SUFFIX) == 0) { + /* Skip explicitly inherited entries. */ + continue; + } else if (strcmp(suffix, ZPROP_RECVD_SUFFIX) == 0) { + if (flags & DSL_PROP_GET_LOCAL) + continue; + + (void) strncpy(buf, za.za_name, (suffix - za.za_name)); + buf[suffix - za.za_name] = '\0'; + propname = buf; + + if (!(flags & DSL_PROP_GET_RECEIVED)) { + /* Skip if locally overridden. */ + err = zap_contains(mos, propobj, propname); + if (err == 0) + continue; + if (err != ENOENT) + break; + + /* Skip if explicitly inherited. */ + valstr = kmem_asprintf("%s%s", propname, + ZPROP_INHERIT_SUFFIX); + err = zap_contains(mos, propobj, valstr); + strfree(valstr); + if (err == 0) + continue; + if (err != ENOENT) + break; + } + + source = ((flags & DSL_PROP_GET_INHERITING) ? + setpoint : ZPROP_SOURCE_VAL_RECVD); + } else { + /* + * For backward compatibility, skip suffixes we don't + * recognize. 
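+			 * The suffixes recognized above are
+			 * ZPROP_INHERIT_SUFFIX ("$inherit") and
+			 * ZPROP_RECVD_SUFFIX ("$recvd"), so a property "foo"
+			 * (a hypothetical example) can appear under up to
+			 * three ZAP keys: "foo", "foo$inherit" and
+			 * "foo$recvd".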
+ */ + continue; + } + + prop = zfs_name_to_prop(propname); + + /* Skip non-inheritable properties. */ + if ((flags & DSL_PROP_GET_INHERITING) && prop != ZPROP_INVAL && + !zfs_prop_inheritable(prop)) + continue; + + /* Skip properties not valid for this type. */ + if ((flags & DSL_PROP_GET_SNAPSHOT) && prop != ZPROP_INVAL && + !zfs_prop_valid_for_type(prop, ZFS_TYPE_SNAPSHOT)) + continue; + + /* Skip properties already defined. */ + if (nvlist_exists(nv, propname)) + continue; + + VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); + if (za.za_integer_length == 1) { + /* + * String property + */ + char *tmp = kmem_alloc(za.za_num_integers, + KM_SLEEP); + err = zap_lookup(mos, propobj, + za.za_name, 1, za.za_num_integers, tmp); + if (err != 0) { + kmem_free(tmp, za.za_num_integers); + break; + } + VERIFY(nvlist_add_string(propval, ZPROP_VALUE, + tmp) == 0); + kmem_free(tmp, za.za_num_integers); + } else { + /* + * Integer property + */ + ASSERT(za.za_integer_length == 8); + (void) nvlist_add_uint64(propval, ZPROP_VALUE, + za.za_first_integer); + } + + VERIFY(nvlist_add_string(propval, ZPROP_SOURCE, source) == 0); + VERIFY(nvlist_add_nvlist(nv, propname, propval) == 0); + nvlist_free(propval); + } + zap_cursor_fini(&zc); + if (err == ENOENT) + err = 0; + return (err); +} + +/* + * Iterate over all properties for this dataset and return them in an nvlist. + */ +static int +dsl_prop_get_all_ds(dsl_dataset_t *ds, nvlist_t **nvp, + dsl_prop_getflags_t flags) +{ + dsl_dir_t *dd = ds->ds_dir; + dsl_pool_t *dp = dd->dd_pool; + objset_t *mos = dp->dp_meta_objset; + int err = 0; + char setpoint[ZFS_MAX_DATASET_NAME_LEN]; + + VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + if (ds->ds_is_snapshot) + flags |= DSL_PROP_GET_SNAPSHOT; + + ASSERT(dsl_pool_config_held(dp)); + + if (dsl_dataset_phys(ds)->ds_props_obj != 0) { + ASSERT(flags & DSL_PROP_GET_SNAPSHOT); + dsl_dataset_name(ds, setpoint); + err = dsl_prop_get_all_impl(mos, + dsl_dataset_phys(ds)->ds_props_obj, setpoint, flags, *nvp); + if (err) + goto out; + } + + for (; dd != NULL; dd = dd->dd_parent) { + if (dd != ds->ds_dir || (flags & DSL_PROP_GET_SNAPSHOT)) { + if (flags & (DSL_PROP_GET_LOCAL | + DSL_PROP_GET_RECEIVED)) + break; + flags |= DSL_PROP_GET_INHERITING; + } + dsl_dir_name(dd, setpoint); + err = dsl_prop_get_all_impl(mos, + dsl_dir_phys(dd)->dd_props_zapobj, setpoint, flags, *nvp); + if (err) + break; + } +out: + return (err); +} + +boolean_t +dsl_prop_get_hasrecvd(const char *dsname) +{ + uint64_t dummy; + + return (0 == + dsl_prop_get_integer(dsname, ZPROP_HAS_RECVD, &dummy, NULL)); +} + +static int +dsl_prop_set_hasrecvd_impl(const char *dsname, zprop_source_t source) +{ + uint64_t version; + spa_t *spa; + int error = 0; + + VERIFY0(spa_open(dsname, &spa, FTAG)); + version = spa_version(spa); + spa_close(spa, FTAG); + + if (version >= SPA_VERSION_RECVD_PROPS) + error = dsl_prop_set_int(dsname, ZPROP_HAS_RECVD, source, 0); + return (error); +} + +/* + * Call after successfully receiving properties to ensure that only the first + * receive on or after SPA_VERSION_RECVD_PROPS blows away local properties. 
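+ * The flag is itself stored as a local property named by ZPROP_HAS_RECVD
+ * (see dsl_prop_set_hasrecvd_impl() above), so it persists in the MOS
+ * alongside the rest of the dataset's properties.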
+ */ +int +dsl_prop_set_hasrecvd(const char *dsname) +{ + int error = 0; + if (!dsl_prop_get_hasrecvd(dsname)) + error = dsl_prop_set_hasrecvd_impl(dsname, ZPROP_SRC_LOCAL); + return (error); +} + +void +dsl_prop_unset_hasrecvd(const char *dsname) +{ + VERIFY0(dsl_prop_set_hasrecvd_impl(dsname, ZPROP_SRC_NONE)); +} + +int +dsl_prop_get_all(objset_t *os, nvlist_t **nvp) +{ + return (dsl_prop_get_all_ds(os->os_dsl_dataset, nvp, 0)); +} + +int +dsl_prop_get_received(const char *dsname, nvlist_t **nvp) +{ + objset_t *os; + int error; + + /* + * Received properties are not distinguishable from local properties + * until the dataset has received properties on or after + * SPA_VERSION_RECVD_PROPS. + */ + dsl_prop_getflags_t flags = (dsl_prop_get_hasrecvd(dsname) ? + DSL_PROP_GET_RECEIVED : DSL_PROP_GET_LOCAL); + + error = dmu_objset_hold(dsname, FTAG, &os); + if (error != 0) + return (error); + error = dsl_prop_get_all_ds(os->os_dsl_dataset, nvp, flags); + dmu_objset_rele(os, FTAG); + return (error); +} + +void +dsl_prop_nvlist_add_uint64(nvlist_t *nv, zfs_prop_t prop, uint64_t value) +{ + nvlist_t *propval; + const char *propname = zfs_prop_to_name(prop); + uint64_t default_value; + + if (nvlist_lookup_nvlist(nv, propname, &propval) == 0) { + VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, value) == 0); + return; + } + + VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, value) == 0); + /* Indicate the default source if we can. */ + if (dodefault(prop, 8, 1, &default_value) == 0 && + value == default_value) { + VERIFY(nvlist_add_string(propval, ZPROP_SOURCE, "") == 0); + } + VERIFY(nvlist_add_nvlist(nv, propname, propval) == 0); + nvlist_free(propval); +} + +void +dsl_prop_nvlist_add_string(nvlist_t *nv, zfs_prop_t prop, const char *value) +{ + nvlist_t *propval; + const char *propname = zfs_prop_to_name(prop); + + if (nvlist_lookup_nvlist(nv, propname, &propval) == 0) { + VERIFY(nvlist_add_string(propval, ZPROP_VALUE, value) == 0); + return; + } + + VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_string(propval, ZPROP_VALUE, value) == 0); + VERIFY(nvlist_add_nvlist(nv, propname, propval) == 0); + nvlist_free(propval); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c new file mode 100644 index 000000000000..a32df14e7507 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c @@ -0,0 +1,3939 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. 
+ * Copyright 2016 Gary Mills
+ * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
+ * Copyright 2017 Joyent, Inc.
+ * Copyright (c) 2017 Datto Inc.
+ */
+
+#include <sys/dsl_scan.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dnode.h>
+#include <sys/dmu_tx.h>
+#include <sys/dmu_objset.h>
+#include <sys/arc.h>
+#include <sys/zap.h>
+#include <sys/zio.h>
+#include <sys/zfs_context.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_znode.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/zil_impl.h>
+#include <sys/zio_checksum.h>
+#include <sys/ddt.h>
+#include <sys/sa.h>
+#include <sys/sa_impl.h>
+#include <sys/zfeature.h>
+#include <sys/abd.h>
+#include <sys/range_tree.h>
+#ifdef _KERNEL
+#include <sys/zfs_vfsops.h>
+#endif
+
+/*
+ * Grand theory statement on scan queue sorting
+ *
+ * Scanning is implemented by recursively traversing all indirection levels
+ * in an object and reading all blocks referenced from said objects. This
+ * results in us approximately traversing the object from lowest logical
+ * offset to the highest. For best performance, we would want the logical
+ * blocks to be physically contiguous. However, this is frequently not the
+ * case with pools given the allocation patterns of copy-on-write filesystems.
+ * So instead, we put the I/Os into a reordering queue and issue them in a
+ * way that will most benefit physical disks (LBA-order).
+ *
+ * Queue management:
+ *
+ * Ideally, we would want to scan all metadata and queue up all block I/O
+ * prior to starting to issue it, because that allows us to do an optimal
+ * sorting job. This can however consume large amounts of memory. Therefore
+ * we continuously monitor the size of the queues and constrain them to 5%
+ * (zfs_scan_mem_lim_fact) of physmem. If the queues grow larger than this
+ * limit, we clear out a few of the largest extents at the head of the queues
+ * to make room for more scanning. Hopefully, these extents will be fairly
+ * large and contiguous, allowing us to approach sequential I/O throughput
+ * even without a fully sorted tree.
+ *
+ * Metadata scanning takes place in dsl_scan_visit(), which is called from
+ * dsl_scan_sync() every spa_sync(). If we have either fully scanned all
+ * metadata on the pool, or we need to make room in memory because our
+ * queues are too large, dsl_scan_visit() is postponed and
+ * scan_io_queues_run() is called from dsl_scan_sync() instead. This implies
+ * that metadata scanning and queued I/O issuing are mutually exclusive. This
+ * allows us to provide maximum sequential I/O throughput for the majority of
+ * I/Os issued since sequential I/O performance is significantly negatively
+ * impacted if it is interleaved with random I/O.
+ *
+ * Implementation Notes
+ *
+ * One side effect of the queued scanning algorithm is that the scanning code
+ * needs to be notified whenever a block is freed. This is needed to allow
+ * the scanning code to remove these I/Os from the issuing queue. Additionally,
+ * we do not attempt to queue gang blocks to be issued sequentially since this
+ * is very hard to do and would have an extremely limited performance benefit.
+ * Instead, we simply issue gang I/Os as soon as we find them using the legacy
+ * algorithm.
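+ *
+ * As a rough sketch of the data structures involved (defined later in
+ * this file), each top-level vdev owns one reordering queue:
+ *
+ *	vdev_t *tvd = spa->spa_root_vdev->vdev_child[i];
+ *	dsl_scan_io_queue_t *q = tvd->vdev_scan_io_queue;
+ *
+ * where q->q_exts_by_addr holds the queued extents in LBA order (the
+ * order in which they are issued), q->q_exts_by_size holds the same
+ * extents sorted by size (so the largest can be flushed first when the
+ * memory limit is hit), and q->q_sios_by_addr indexes the individual
+ * scan_io_t's by offset.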
+ *
+ * Backwards compatibility
+ *
+ * This new algorithm is backwards compatible with the legacy on-disk data
+ * structures (and therefore does not require a new feature flag).
+ * Periodically during scanning (see zfs_scan_checkpoint_intval), the scan
+ * will stop scanning metadata (in logical order) and wait for all outstanding
+ * sorted I/O to complete. Once this is done, we write out a checkpoint
+ * bookmark, indicating that we have scanned everything logically before it.
+ * If the pool is imported on a machine without the new sorting algorithm,
+ * the scan simply resumes from the last checkpoint using the legacy algorithm.
+ */
+
+typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *,
+    const zbookmark_phys_t *);
+
+static scan_cb_t dsl_scan_scrub_cb;
+
+static int scan_ds_queue_compare(const void *a, const void *b);
+static int scan_prefetch_queue_compare(const void *a, const void *b);
+static void scan_ds_queue_clear(dsl_scan_t *scn);
+static boolean_t scan_ds_queue_contains(dsl_scan_t *scn, uint64_t dsobj,
+    uint64_t *txg);
+static void scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg);
+static void scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj);
+static void scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx);
+
+extern int zfs_vdev_async_write_active_min_dirty_percent;
+
+/*
+ * By default zfs will check to ensure it is not over the hard memory
+ * limit before each txg. If finer-grained control of this is needed
+ * this value can be set to 1 to enable checking before scanning each
+ * block.
+ */
+int zfs_scan_strict_mem_lim = B_FALSE;
+
+/*
+ * Maximum number of concurrently executing I/Os per top-level vdev.
+ * Tune with care. Very high settings (hundreds) are known to trigger
+ * some firmware bugs and resets on certain SSDs.
+ */
+int zfs_top_maxinflight = 32;		/* maximum I/Os per top-level */
+unsigned int zfs_resilver_delay = 2;	/* number of ticks to delay resilver -- 2 is a good number */
+unsigned int zfs_scrub_delay = 4;	/* number of ticks to delay scrub -- 4 is a good number */
+unsigned int zfs_scan_idle = 50;	/* idle window in clock ticks */
+
+/*
+ * Maximum number of bytes concurrently in flight per leaf vdev. We attempt
+ * to strike a balance here between keeping the vdev queues full of I/Os
+ * at all times and not overflowing the queues to cause long latency,
+ * which would cause long txg sync times. No matter what, we will not
+ * overload the drives with I/O, since that is protected by
+ * zfs_vdev_scrub_max_active.
+ */
+unsigned long zfs_scan_vdev_limit = 4 << 20;
+
+int zfs_scan_issue_strategy = 0;
+int zfs_scan_legacy = B_FALSE;	/* don't queue & sort zios, go direct */
+uint64_t zfs_scan_max_ext_gap = 2 << 20;	/* in bytes */
+
+unsigned int zfs_scan_checkpoint_intval = 7200;	/* seconds */
+#define	ZFS_SCAN_CHECKPOINT_INTVAL	SEC_TO_TICK(zfs_scan_checkpoint_intval)
+
+/*
+ * fill_weight is non-tunable at runtime, so we copy it at module init from
+ * zfs_scan_fill_weight. Runtime adjustments to zfs_scan_fill_weight would
+ * break queue sorting.
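+ * (ext_size_compare() uses fill_weight to favor extents that are densely
+ * populated with queued I/O over ones that are merely large.)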
+ */ +uint64_t zfs_scan_fill_weight = 3; +static uint64_t fill_weight; + +/* See dsl_scan_should_clear() for details on the memory limit tunables */ +uint64_t zfs_scan_mem_lim_min = 16 << 20; /* bytes */ +uint64_t zfs_scan_mem_lim_soft_max = 128 << 20; /* bytes */ +int zfs_scan_mem_lim_fact = 20; /* fraction of physmem */ +int zfs_scan_mem_lim_soft_fact = 20; /* fraction of mem lim above */ + +unsigned int zfs_scrub_min_time_ms = 1000; /* min millisecs to scrub per txg */ +unsigned int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */ +unsigned int zfs_obsolete_min_time_ms = 500; /* min millisecs to obsolete per txg */ +unsigned int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */ +boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */ +boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetch */ + +SYSCTL_DECL(_vfs_zfs); +SYSCTL_UINT(_vfs_zfs, OID_AUTO, top_maxinflight, CTLFLAG_RWTUN, + &zfs_top_maxinflight, 0, "Maximum I/Os per top-level vdev"); +SYSCTL_UINT(_vfs_zfs, OID_AUTO, resilver_delay, CTLFLAG_RWTUN, + &zfs_resilver_delay, 0, "Number of ticks to delay resilver"); +SYSCTL_UINT(_vfs_zfs, OID_AUTO, scrub_delay, CTLFLAG_RWTUN, + &zfs_scrub_delay, 0, "Number of ticks to delay scrub"); +SYSCTL_UINT(_vfs_zfs, OID_AUTO, scan_idle, CTLFLAG_RWTUN, + &zfs_scan_idle, 0, "Idle scan window in clock ticks"); +SYSCTL_UINT(_vfs_zfs, OID_AUTO, scan_min_time_ms, CTLFLAG_RWTUN, + &zfs_scrub_min_time_ms, 0, "Min millisecs to scrub per txg"); +SYSCTL_UINT(_vfs_zfs, OID_AUTO, free_min_time_ms, CTLFLAG_RWTUN, + &zfs_free_min_time_ms, 0, "Min millisecs to free per txg"); +SYSCTL_UINT(_vfs_zfs, OID_AUTO, resilver_min_time_ms, CTLFLAG_RWTUN, + &zfs_resilver_min_time_ms, 0, "Min millisecs to resilver per txg"); +SYSCTL_INT(_vfs_zfs, OID_AUTO, no_scrub_io, CTLFLAG_RWTUN, + &zfs_no_scrub_io, 0, "Disable scrub I/O"); +SYSCTL_INT(_vfs_zfs, OID_AUTO, no_scrub_prefetch, CTLFLAG_RWTUN, + &zfs_no_scrub_prefetch, 0, "Disable scrub prefetching"); +SYSCTL_UINT(_vfs_zfs, OID_AUTO, zfs_scan_legacy, CTLFLAG_RWTUN, + &zfs_scan_legacy, 0, "Scrub using legacy non-sequential method"); +SYSCTL_UINT(_vfs_zfs, OID_AUTO, zfs_scan_checkpoint_interval, CTLFLAG_RWTUN, + &zfs_scan_checkpoint_intval, 0, "Scan progress on-disk checkpointing interval"); + +enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE; +/* max number of blocks to free in a single TXG */ +uint64_t zfs_async_block_max_blocks = UINT64_MAX; +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, free_max_blocks, CTLFLAG_RWTUN, + &zfs_async_block_max_blocks, 0, "Maximum number of blocks to free in one TXG"); + +/* + * We wait a few txgs after importing a pool to begin scanning so that + * the import / mounting code isn't held up by scrub / resilver IO. + * Unfortunately, it is a bit difficult to determine exactly how long + * this will take since userspace will trigger fs mounts asynchronously + * and the kernel will create zvol minors asynchronously. As a result, + * the value provided here is a bit arbitrary, but represents a + * reasonable estimate of how many txgs it will take to finish fully + * importing a pool + */ +#define SCAN_IMPORT_WAIT_TXGS 5 + + +#define DSL_SCAN_IS_SCRUB_RESILVER(scn) \ + ((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB || \ + (scn)->scn_phys.scn_func == POOL_SCAN_RESILVER) + +extern int zfs_txg_timeout; + +/* + * Enable/disable the processing of the free_bpobj object. 
+ */
+boolean_t zfs_free_bpobj_enabled = B_TRUE;
+
+SYSCTL_INT(_vfs_zfs, OID_AUTO, free_bpobj_enabled, CTLFLAG_RWTUN,
+    &zfs_free_bpobj_enabled, 0, "Enable free_bpobj processing");
+
+/* the order has to match pool_scan_type */
+static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = {
+	NULL,
+	dsl_scan_scrub_cb,	/* POOL_SCAN_SCRUB */
+	dsl_scan_scrub_cb,	/* POOL_SCAN_RESILVER */
+};
+
+/* In-core node for the scn->scn_queue. Represents a dataset to be scanned */
+typedef struct {
+	uint64_t	sds_dsobj;
+	uint64_t	sds_txg;
+	avl_node_t	sds_node;
+} scan_ds_t;
+
+/*
+ * This controls what conditions are placed on dsl_scan_sync_state():
+ * SYNC_OPTIONAL) write out scn_phys iff scn_bytes_pending == 0
+ * SYNC_MANDATORY) write out scn_phys always. scn_bytes_pending must be 0.
+ * SYNC_CACHED) if scn_bytes_pending == 0, write out scn_phys. Otherwise
+ *	write out the scn_phys_cached version.
+ * See dsl_scan_sync_state for details.
+ */
+typedef enum {
+	SYNC_OPTIONAL,
+	SYNC_MANDATORY,
+	SYNC_CACHED
+} state_sync_type_t;
+
+/*
+ * This struct represents the minimum information needed to reconstruct a
+ * zio for sequential scanning. This is useful because many of these will
+ * accumulate in the sequential IO queues before being issued, so saving
+ * memory matters here.
+ */
+typedef struct scan_io {
+	/* fields from blkptr_t */
+	uint64_t		sio_offset;
+	uint64_t		sio_blk_prop;
+	uint64_t		sio_phys_birth;
+	uint64_t		sio_birth;
+	zio_cksum_t		sio_cksum;
+	uint32_t		sio_asize;
+
+	/* fields from zio_t */
+	int			sio_flags;
+	zbookmark_phys_t	sio_zb;
+
+	/* members for queue sorting */
+	union {
+		avl_node_t	sio_addr_node;	/* link into issuing queue */
+		list_node_t	sio_list_node;	/* link for issuing to disk */
+	} sio_nodes;
+} scan_io_t;
+
+struct dsl_scan_io_queue {
+	dsl_scan_t	*q_scn; /* associated dsl_scan_t */
+	vdev_t		*q_vd; /* top-level vdev that this queue represents */
+
+	/* trees used for sorting I/Os and extents of I/Os */
+	range_tree_t	*q_exts_by_addr;
+	avl_tree_t	q_exts_by_size;
+	avl_tree_t	q_sios_by_addr;
+
+	/* members for zio rate limiting */
+	uint64_t	q_maxinflight_bytes;
+	uint64_t	q_inflight_bytes;
+	kcondvar_t	q_zio_cv; /* used under vd->vdev_scan_io_queue_lock */
+
+	/* per txg statistics */
+	uint64_t	q_total_seg_size_this_txg;
+	uint64_t	q_segs_this_txg;
+	uint64_t	q_total_zio_size_this_txg;
+	uint64_t	q_zios_this_txg;
+};
+
+/* private data for dsl_scan_prefetch_cb() */
+typedef struct scan_prefetch_ctx {
+	refcount_t spc_refcnt;		/* refcount for memory management */
+	dsl_scan_t *spc_scn;		/* dsl_scan_t for the pool */
+	boolean_t spc_root;		/* is this prefetch for an objset? */
+	uint8_t spc_indblkshift;	/* dn_indblkshift of current dnode */
+	uint16_t spc_datablkszsec;	/* dn_datablkszsec of current dnode */
+} scan_prefetch_ctx_t;
+
+/* private data for dsl_scan_prefetch() */
+typedef struct scan_prefetch_issue_ctx {
+	avl_node_t spic_avl_node;	/* link into scn->scn_prefetch_queue */
+	scan_prefetch_ctx_t *spic_spc;	/* spc for the callback */
+	blkptr_t spic_bp;		/* bp to prefetch */
+	zbookmark_phys_t spic_zb;	/* bookmark to prefetch */
+} scan_prefetch_issue_ctx_t;
+
+static void scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags,
+    const zbookmark_phys_t *zb, dsl_scan_io_queue_t *queue);
+static void scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue,
+    scan_io_t *sio);
+
+static dsl_scan_io_queue_t *scan_io_queue_create(vdev_t *vd);
+static void scan_io_queues_destroy(dsl_scan_t *scn);
+
+static kmem_cache_t *sio_cache;
+
+void
+scan_init(void)
+{
+	/*
+	 * This is used in ext_size_compare() to weight segments
+	 * based on how sparse they are. This cannot be changed
+	 * mid-scan and the tree comparison functions don't currently
+	 * have a mechanism for passing additional context to the
+	 * compare functions. Thus we store this value globally and
+	 * we only allow it to be set at module initialization time.
+	 */
+	fill_weight = zfs_scan_fill_weight;
+
+	sio_cache = kmem_cache_create("sio_cache",
+	    sizeof (scan_io_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+}
+
+void
+scan_fini(void)
+{
+	kmem_cache_destroy(sio_cache);
+}
+
+static inline boolean_t
+dsl_scan_is_running(const dsl_scan_t *scn)
+{
+	return (scn->scn_phys.scn_state == DSS_SCANNING);
+}
+
+boolean_t
+dsl_scan_resilvering(dsl_pool_t *dp)
+{
+	return (dsl_scan_is_running(dp->dp_scan) &&
+	    dp->dp_scan->scn_phys.scn_func == POOL_SCAN_RESILVER);
+}
+
+static inline void
+sio2bp(const scan_io_t *sio, blkptr_t *bp, uint64_t vdev_id)
+{
+	bzero(bp, sizeof (*bp));
+	DVA_SET_ASIZE(&bp->blk_dva[0], sio->sio_asize);
+	DVA_SET_VDEV(&bp->blk_dva[0], vdev_id);
+	DVA_SET_OFFSET(&bp->blk_dva[0], sio->sio_offset);
+	bp->blk_prop = sio->sio_blk_prop;
+	bp->blk_phys_birth = sio->sio_phys_birth;
+	bp->blk_birth = sio->sio_birth;
+	bp->blk_fill = 1;	/* we always only work with data pointers */
+	bp->blk_cksum = sio->sio_cksum;
+}
+
+static inline void
+bp2sio(const blkptr_t *bp, scan_io_t *sio, int dva_i)
+{
+	/* we discard the vdev id, since we can deduce it from the queue */
+	sio->sio_offset = DVA_GET_OFFSET(&bp->blk_dva[dva_i]);
+	sio->sio_asize = DVA_GET_ASIZE(&bp->blk_dva[dva_i]);
+	sio->sio_blk_prop = bp->blk_prop;
+	sio->sio_phys_birth = bp->blk_phys_birth;
+	sio->sio_birth = bp->blk_birth;
+	sio->sio_cksum = bp->blk_cksum;
+}
+
+void
+dsl_scan_global_init(void)
+{
+	/*
+	 * This is used in ext_size_compare() to weight segments
+	 * based on how sparse they are. This cannot be changed
+	 * mid-scan and the tree comparison functions don't currently
+	 * have a mechanism for passing additional context to the
+	 * compare functions. Thus we store this value globally and
+	 * we only allow it to be set at module initialization time.
+	 */
+	fill_weight = zfs_scan_fill_weight;
+}
+
+int
+dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
+{
+	int err;
+	dsl_scan_t *scn;
+	spa_t *spa = dp->dp_spa;
+	uint64_t f;
+
+	scn = dp->dp_scan = kmem_zalloc(sizeof (dsl_scan_t), KM_SLEEP);
+	scn->scn_dp = dp;
+
+	/*
+	 * It's possible that we're resuming a scan after a reboot so
+	 * make sure that the scan_async_destroying flag is initialized
+	 * appropriately.
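+	 * scn_async_destroying mirrors the on-disk
+	 * SPA_FEATURE_ASYNC_DESTROY state, so a partially completed
+	 * async destroy can pick up where it left off after an import.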
+ */ + ASSERT(!scn->scn_async_destroying); + scn->scn_async_destroying = spa_feature_is_active(dp->dp_spa, + SPA_FEATURE_ASYNC_DESTROY); + + bcopy(&scn->scn_phys, &scn->scn_phys_cached, sizeof (scn->scn_phys)); + avl_create(&scn->scn_queue, scan_ds_queue_compare, sizeof (scan_ds_t), + offsetof(scan_ds_t, sds_node)); + avl_create(&scn->scn_prefetch_queue, scan_prefetch_queue_compare, + sizeof (scan_prefetch_issue_ctx_t), + offsetof(scan_prefetch_issue_ctx_t, spic_avl_node)); + + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + "scrub_func", sizeof (uint64_t), 1, &f); + if (err == 0) { + /* + * There was an old-style scrub in progress. Restart a + * new-style scrub from the beginning. + */ + scn->scn_restart_txg = txg; + zfs_dbgmsg("old-style scrub was in progress; " + "restarting new-style scrub in txg %llu", + (longlong_t)scn->scn_restart_txg); + + /* + * Load the queue obj from the old location so that it + * can be freed by dsl_scan_done(). + */ + (void) zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + "scrub_queue", sizeof (uint64_t), 1, + &scn->scn_phys.scn_queue_obj); + } else { + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS, + &scn->scn_phys); + if (err == ENOENT) + return (0); + else if (err) + return (err); + + /* + * We might be restarting after a reboot, so jump the issued + * counter to how far we've scanned. We know we're consistent + * up to here. + */ + scn->scn_issued_before_pass = scn->scn_phys.scn_examined; + + if (dsl_scan_is_running(scn) && + spa_prev_software_version(dp->dp_spa) < SPA_VERSION_SCAN) { + /* + * A new-type scrub was in progress on an old + * pool, and the pool was accessed by old + * software. Restart from the beginning, since + * the old software may have changed the pool in + * the meantime. + */ + scn->scn_restart_txg = txg; + zfs_dbgmsg("new-style scrub was modified " + "by old software; restarting in txg %llu", + (longlong_t)scn->scn_restart_txg); + } + } + + /* reload the queue into the in-core state */ + if (scn->scn_phys.scn_queue_obj != 0) { + zap_cursor_t zc; + zap_attribute_t za; + + for (zap_cursor_init(&zc, dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj); + zap_cursor_retrieve(&zc, &za) == 0; + (void) zap_cursor_advance(&zc)) { + scan_ds_queue_insert(scn, + zfs_strtonum(za.za_name, NULL), + za.za_first_integer); + } + zap_cursor_fini(&zc); + } + + spa_scan_stat_init(spa); + return (0); +} + +void +dsl_scan_fini(dsl_pool_t *dp) +{ + if (dp->dp_scan != NULL) { + dsl_scan_t *scn = dp->dp_scan; + + if (scn->scn_taskq != NULL) + taskq_destroy(scn->scn_taskq); + scan_ds_queue_clear(scn); + avl_destroy(&scn->scn_queue); + avl_destroy(&scn->scn_prefetch_queue); + + kmem_free(dp->dp_scan, sizeof (dsl_scan_t)); + dp->dp_scan = NULL; + } +} + +static boolean_t +dsl_scan_restarting(dsl_scan_t *scn, dmu_tx_t *tx) +{ + return (scn->scn_restart_txg != 0 && + scn->scn_restart_txg <= tx->tx_txg); +} + +boolean_t +dsl_scan_scrubbing(const dsl_pool_t *dp) +{ + dsl_scan_phys_t *scn_phys = &dp->dp_scan->scn_phys; + + return (scn_phys->scn_state == DSS_SCANNING && + scn_phys->scn_func == POOL_SCAN_SCRUB); +} + +boolean_t +dsl_scan_is_paused_scrub(const dsl_scan_t *scn) +{ + return (dsl_scan_scrubbing(scn->scn_dp) && + scn->scn_phys.scn_flags & DSF_SCRUB_PAUSED); +} + +/* + * Writes out a persistent dsl_scan_phys_t record to the pool directory. 
+ * Because we can be running in the block sorting algorithm, we do not always + * want to write out the record, only when it is "safe" to do so. This safety + * condition is achieved by making sure that the sorting queues are empty + * (scn_bytes_pending == 0). When this condition is not true, the sync'd state + * is inconsistent with how much actual scanning progress has been made. The + * kind of sync to be performed is specified by the sync_type argument. If the + * sync is optional, we only sync if the queues are empty. If the sync is + * mandatory, we do a hard ASSERT to make sure that the queues are empty. The + * third possible state is a "cached" sync. This is done in response to: + * 1) The dataset that was in the last sync'd dsl_scan_phys_t having been + * destroyed, so we wouldn't be able to restart scanning from it. + * 2) The snapshot that was in the last sync'd dsl_scan_phys_t having been + * superseded by a newer snapshot. + * 3) The dataset that was in the last sync'd dsl_scan_phys_t having been + * swapped with its clone. + * In all cases, a cached sync simply rewrites the last record we've written, + * just slightly modified. For the modifications that are performed to the + * last written dsl_scan_phys_t, see dsl_scan_ds_destroyed, + * dsl_scan_ds_snapshotted and dsl_scan_ds_clone_swapped. + */ +static void +dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx, state_sync_type_t sync_type) +{ + int i; + spa_t *spa = scn->scn_dp->dp_spa; + + ASSERT(sync_type != SYNC_MANDATORY || scn->scn_bytes_pending == 0); + if (scn->scn_bytes_pending == 0) { + for (i = 0; i < spa->spa_root_vdev->vdev_children; i++) { + vdev_t *vd = spa->spa_root_vdev->vdev_child[i]; + dsl_scan_io_queue_t *q = vd->vdev_scan_io_queue; + + if (q == NULL) + continue; + + mutex_enter(&vd->vdev_scan_io_queue_lock); + ASSERT3P(avl_first(&q->q_sios_by_addr), ==, NULL); + ASSERT3P(avl_first(&q->q_exts_by_size), ==, NULL); + ASSERT3P(range_tree_first(q->q_exts_by_addr), ==, NULL); + mutex_exit(&vd->vdev_scan_io_queue_lock); + } + + if (scn->scn_phys.scn_queue_obj != 0) + scan_ds_queue_sync(scn, tx); + VERIFY0(zap_update(scn->scn_dp->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS, + &scn->scn_phys, tx)); + bcopy(&scn->scn_phys, &scn->scn_phys_cached, + sizeof (scn->scn_phys)); + + if (scn->scn_checkpointing) + zfs_dbgmsg("finish scan checkpoint"); + + scn->scn_checkpointing = B_FALSE; + scn->scn_last_checkpoint = ddi_get_lbolt(); + } else if (sync_type == SYNC_CACHED) { + VERIFY0(zap_update(scn->scn_dp->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS, + &scn->scn_phys_cached, tx)); + } +} + +/* ARGSUSED */ +static int +dsl_scan_setup_check(void *arg, dmu_tx_t *tx) +{ + dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; + + if (dsl_scan_is_running(scn)) + return (SET_ERROR(EBUSY)); + + return (0); +} + +static void +dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) +{ + dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; + pool_scan_func_t *funcp = arg; + dmu_object_type_t ot = 0; + dsl_pool_t *dp = scn->scn_dp; + spa_t *spa = dp->dp_spa; + + ASSERT(!dsl_scan_is_running(scn)); + ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS); + bzero(&scn->scn_phys, sizeof (scn->scn_phys)); + scn->scn_phys.scn_func = *funcp; + scn->scn_phys.scn_state = DSS_SCANNING; + scn->scn_phys.scn_min_txg = 0; + scn->scn_phys.scn_max_txg = tx->tx_txg; + scn->scn_phys.scn_ddt_class_max = DDT_CLASSES - 1; /* the entire DDT */ + 
scn->scn_phys.scn_start_time = gethrestime_sec(); + scn->scn_phys.scn_errors = 0; + scn->scn_phys.scn_to_examine = spa->spa_root_vdev->vdev_stat.vs_alloc; + scn->scn_issued_before_pass = 0; + scn->scn_restart_txg = 0; + scn->scn_done_txg = 0; + scn->scn_last_checkpoint = 0; + scn->scn_checkpointing = B_FALSE; + spa_scan_stat_init(spa); + + if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { + scn->scn_phys.scn_ddt_class_max = zfs_scrub_ddt_class_max; + + /* rewrite all disk labels */ + vdev_config_dirty(spa->spa_root_vdev); + + if (vdev_resilver_needed(spa->spa_root_vdev, + &scn->scn_phys.scn_min_txg, &scn->scn_phys.scn_max_txg)) { + spa_event_notify(spa, NULL, NULL, + ESC_ZFS_RESILVER_START); + } else { + spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_START); + } + + spa->spa_scrub_started = B_TRUE; + /* + * If this is an incremental scrub, limit the DDT scrub phase + * to just the auto-ditto class (for correctness); the rest + * of the scrub should go faster using top-down pruning. + */ + if (scn->scn_phys.scn_min_txg > TXG_INITIAL) + scn->scn_phys.scn_ddt_class_max = DDT_CLASS_DITTO; + + } + + /* back to the generic stuff */ + + if (dp->dp_blkstats == NULL) { + dp->dp_blkstats = + kmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP); + mutex_init(&dp->dp_blkstats->zab_lock, NULL, + MUTEX_DEFAULT, NULL); + } + bzero(&dp->dp_blkstats->zab_type, sizeof (dp->dp_blkstats->zab_type)); + + if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) + ot = DMU_OT_ZAP_OTHER; + + scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset, + ot ? ot : DMU_OT_SCAN_QUEUE, DMU_OT_NONE, 0, tx); + + bcopy(&scn->scn_phys, &scn->scn_phys_cached, sizeof (scn->scn_phys)); + + dsl_scan_sync_state(scn, tx, SYNC_MANDATORY); + + spa_history_log_internal(spa, "scan setup", tx, + "func=%u mintxg=%llu maxtxg=%llu", + *funcp, scn->scn_phys.scn_min_txg, scn->scn_phys.scn_max_txg); +} + +/* + * Called by the ZFS_IOC_POOL_SCAN ioctl to start a scrub or resilver. + * Can also be called to resume a paused scrub. + */ +int +dsl_scan(dsl_pool_t *dp, pool_scan_func_t func) +{ + spa_t *spa = dp->dp_spa; + dsl_scan_t *scn = dp->dp_scan; + + /* + * Purge all vdev caches and probe all devices. We do this here + * rather than in sync context because this requires a writer lock + * on the spa_config lock, which we can't do from sync context. The + * spa_scrub_reopen flag indicates that vdev_open() should not + * attempt to start another scrub. + */ + spa_vdev_state_enter(spa, SCL_NONE); + spa->spa_scrub_reopen = B_TRUE; + vdev_reopen(spa->spa_root_vdev); + spa->spa_scrub_reopen = B_FALSE; + (void) spa_vdev_state_exit(spa, NULL, 0); + + if (func == POOL_SCAN_SCRUB && dsl_scan_is_paused_scrub(scn)) { + /* got scrub start cmd, resume paused scrub */ + int err = dsl_scrub_set_pause_resume(scn->scn_dp, + POOL_SCRUB_NORMAL); + if (err == 0) { + spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_RESUME); + return (ECANCELED); + } + return (SET_ERROR(err)); + } + + return (dsl_sync_task(spa_name(spa), dsl_scan_setup_check, + dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED)); +} + +/* ARGSUSED */ +static void +dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) +{ + static const char *old_names[] = { + "scrub_bookmark", + "scrub_ddt_bookmark", + "scrub_ddt_class_max", + "scrub_queue", + "scrub_min_txg", + "scrub_max_txg", + "scrub_func", + "scrub_errors", + NULL + }; + + dsl_pool_t *dp = scn->scn_dp; + spa_t *spa = dp->dp_spa; + int i; + + /* Remove any remnants of an old-style scrub. 
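+	 * These are the per-key entries in the MOS directory listed in
+	 * old_names[] above; removal is best-effort, since a pool that
+	 * never ran the old scrub code won't have them.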
*/ + for (i = 0; old_names[i]; i++) { + (void) zap_remove(dp->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, old_names[i], tx); + } + + if (scn->scn_phys.scn_queue_obj != 0) { + VERIFY0(dmu_object_free(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, tx)); + scn->scn_phys.scn_queue_obj = 0; + } + scan_ds_queue_clear(scn); + + scn->scn_phys.scn_flags &= ~DSF_SCRUB_PAUSED; + + /* + * If we were "restarted" from a stopped state, don't bother + * with anything else. + */ + if (!dsl_scan_is_running(scn)) { + ASSERT(!scn->scn_is_sorted); + return; + } + + if (scn->scn_is_sorted) { + scan_io_queues_destroy(scn); + scn->scn_is_sorted = B_FALSE; + + if (scn->scn_taskq != NULL) { + taskq_destroy(scn->scn_taskq); + scn->scn_taskq = NULL; + } + } + + scn->scn_phys.scn_state = complete ? DSS_FINISHED : DSS_CANCELED; + + if (dsl_scan_restarting(scn, tx)) + spa_history_log_internal(spa, "scan aborted, restarting", tx, + "errors=%llu", spa_get_errlog_size(spa)); + else if (!complete) + spa_history_log_internal(spa, "scan cancelled", tx, + "errors=%llu", spa_get_errlog_size(spa)); + else + spa_history_log_internal(spa, "scan done", tx, + "errors=%llu", spa_get_errlog_size(spa)); + + if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { + spa->spa_scrub_started = B_FALSE; + spa->spa_scrub_active = B_FALSE; + + /* + * If the scrub/resilver completed, update all DTLs to + * reflect this. Whether it succeeded or not, vacate + * all temporary scrub DTLs. + * + * As the scrub does not currently support traversing + * data that have been freed but are part of a checkpoint, + * we don't mark the scrub as done in the DTLs as faults + * may still exist in those vdevs. + */ + if (complete && + !spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { + vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg, + scn->scn_phys.scn_max_txg, B_TRUE); + + spa_event_notify(spa, NULL, NULL, + scn->scn_phys.scn_min_txg ? + ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH); + } else { + vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg, + 0, B_TRUE); + } + spa_errlog_rotate(spa); + + /* + * We may have finished replacing a device. + * Let the async thread assess this and handle the detach. 
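+		 * (The async thread is expected to react by calling
+		 * spa_vdev_resilver_done(), detaching any spare or replacing
+		 * vdev that has finished resilvering.)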
+		 */
+		spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
+	}
+
+	scn->scn_phys.scn_end_time = gethrestime_sec();
+
+	ASSERT(!dsl_scan_is_running(scn));
+}
+
+/* ARGSUSED */
+static int
+dsl_scan_cancel_check(void *arg, dmu_tx_t *tx)
+{
+	dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
+
+	if (!dsl_scan_is_running(scn))
+		return (SET_ERROR(ENOENT));
+	return (0);
+}
+
+/* ARGSUSED */
+static void
+dsl_scan_cancel_sync(void *arg, dmu_tx_t *tx)
+{
+	dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
+
+	dsl_scan_done(scn, B_FALSE, tx);
+	dsl_scan_sync_state(scn, tx, SYNC_MANDATORY);
+	spa_event_notify(scn->scn_dp->dp_spa, NULL, NULL, ESC_ZFS_SCRUB_ABORT);
+}
+
+int
+dsl_scan_cancel(dsl_pool_t *dp)
+{
+	return (dsl_sync_task(spa_name(dp->dp_spa), dsl_scan_cancel_check,
+	    dsl_scan_cancel_sync, NULL, 3, ZFS_SPACE_CHECK_RESERVED));
+}
+
+static int
+dsl_scrub_pause_resume_check(void *arg, dmu_tx_t *tx)
+{
+	pool_scrub_cmd_t *cmd = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	dsl_scan_t *scn = dp->dp_scan;
+
+	if (*cmd == POOL_SCRUB_PAUSE) {
+		/* can't pause a scrub when there is no in-progress scrub */
+		if (!dsl_scan_scrubbing(dp))
+			return (SET_ERROR(ENOENT));
+
+		/* can't pause a paused scrub */
+		if (dsl_scan_is_paused_scrub(scn))
+			return (SET_ERROR(EBUSY));
+	} else if (*cmd != POOL_SCRUB_NORMAL) {
+		return (SET_ERROR(ENOTSUP));
+	}
+
+	return (0);
+}
+
+static void
+dsl_scrub_pause_resume_sync(void *arg, dmu_tx_t *tx)
+{
+	pool_scrub_cmd_t *cmd = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	spa_t *spa = dp->dp_spa;
+	dsl_scan_t *scn = dp->dp_scan;
+
+	if (*cmd == POOL_SCRUB_PAUSE) {
+		/*
+		 * Record when this pause began; validity was already
+		 * established in dsl_scrub_pause_resume_check().
+		 */
+		spa->spa_scan_pass_scrub_pause = gethrestime_sec();
+		scn->scn_phys.scn_flags |= DSF_SCRUB_PAUSED;
+		dsl_scan_sync_state(scn, tx, SYNC_CACHED);
+		spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_PAUSED);
+	} else {
+		ASSERT3U(*cmd, ==, POOL_SCRUB_NORMAL);
+		if (dsl_scan_is_paused_scrub(scn)) {
+			/*
+			 * We need to keep track of how much time we spend
+			 * paused per pass so that we can adjust the scrub rate
+			 * shown in the output of 'zpool status'.
+			 */
+			spa->spa_scan_pass_scrub_spent_paused +=
+			    gethrestime_sec() - spa->spa_scan_pass_scrub_pause;
+			spa->spa_scan_pass_scrub_pause = 0;
+			scn->scn_phys.scn_flags &= ~DSF_SCRUB_PAUSED;
+			dsl_scan_sync_state(scn, tx, SYNC_CACHED);
+		}
+	}
+}
+
+/*
+ * Set scrub pause/resume state if it makes sense to do so
+ */
+int
+dsl_scrub_set_pause_resume(const dsl_pool_t *dp, pool_scrub_cmd_t cmd)
+{
+	return (dsl_sync_task(spa_name(dp->dp_spa),
+	    dsl_scrub_pause_resume_check, dsl_scrub_pause_resume_sync, &cmd, 3,
+	    ZFS_SPACE_CHECK_RESERVED));
+}
+
+
+/* start a new scan, or restart an existing one.
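+ * A txg of 0 means "as soon as possible": in that case we assign a
+ * throwaway tx just to learn the currently open txg and restart there.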
*/ +void +dsl_resilver_restart(dsl_pool_t *dp, uint64_t txg) +{ + if (txg == 0) { + dmu_tx_t *tx; + tx = dmu_tx_create_dd(dp->dp_mos_dir); + VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT)); + + txg = dmu_tx_get_txg(tx); + dp->dp_scan->scn_restart_txg = txg; + dmu_tx_commit(tx); + } else { + dp->dp_scan->scn_restart_txg = txg; + } + zfs_dbgmsg("restarting resilver txg=%llu", txg); +} + +void +dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bp) +{ + zio_free(dp->dp_spa, txg, bp); +} + +void +dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp) +{ + ASSERT(dsl_pool_sync_context(dp)); + zio_nowait(zio_free_sync(pio, dp->dp_spa, txg, bpp, BP_GET_PSIZE(bpp), + pio->io_flags)); +} + +static int +scan_ds_queue_compare(const void *a, const void *b) +{ + const scan_ds_t *sds_a = a, *sds_b = b; + + if (sds_a->sds_dsobj < sds_b->sds_dsobj) + return (-1); + if (sds_a->sds_dsobj == sds_b->sds_dsobj) + return (0); + return (1); +} + +static void +scan_ds_queue_clear(dsl_scan_t *scn) +{ + void *cookie = NULL; + scan_ds_t *sds; + while ((sds = avl_destroy_nodes(&scn->scn_queue, &cookie)) != NULL) { + kmem_free(sds, sizeof (*sds)); + } +} + +static boolean_t +scan_ds_queue_contains(dsl_scan_t *scn, uint64_t dsobj, uint64_t *txg) +{ + scan_ds_t srch, *sds; + + srch.sds_dsobj = dsobj; + sds = avl_find(&scn->scn_queue, &srch, NULL); + if (sds != NULL && txg != NULL) + *txg = sds->sds_txg; + return (sds != NULL); +} + +static void +scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg) +{ + scan_ds_t *sds; + avl_index_t where; + + sds = kmem_zalloc(sizeof (*sds), KM_SLEEP); + sds->sds_dsobj = dsobj; + sds->sds_txg = txg; + + VERIFY3P(avl_find(&scn->scn_queue, sds, &where), ==, NULL); + avl_insert(&scn->scn_queue, sds, where); +} + +static void +scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj) +{ + scan_ds_t srch, *sds; + + srch.sds_dsobj = dsobj; + + sds = avl_find(&scn->scn_queue, &srch, NULL); + VERIFY(sds != NULL); + avl_remove(&scn->scn_queue, sds); + kmem_free(sds, sizeof (*sds)); +} + +static void +scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx) +{ + dsl_pool_t *dp = scn->scn_dp; + spa_t *spa = dp->dp_spa; + dmu_object_type_t ot = (spa_version(spa) >= SPA_VERSION_DSL_SCRUB) ? + DMU_OT_SCAN_QUEUE : DMU_OT_ZAP_OTHER; + + ASSERT0(scn->scn_bytes_pending); + ASSERT(scn->scn_phys.scn_queue_obj != 0); + + VERIFY0(dmu_object_free(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, tx)); + scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset, ot, + DMU_OT_NONE, 0, tx); + for (scan_ds_t *sds = avl_first(&scn->scn_queue); + sds != NULL; sds = AVL_NEXT(&scn->scn_queue, sds)) { + VERIFY0(zap_add_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, sds->sds_dsobj, + sds->sds_txg, tx)); + } +} + +/* + * Computes the memory limit state that we're currently in. A sorted scan + * needs quite a bit of memory to hold the sorting queue, so we need to + * reasonably constrain the size so it doesn't impact overall system + * performance. We compute two limits: + * 1) Hard memory limit: if the amount of memory used by the sorting + * queues on a pool gets above this value, we stop the metadata + * scanning portion and start issuing the queued up and sorted + * I/Os to reduce memory usage. + * This limit is calculated as a fraction of physmem (by default 5%). + * We constrain the lower bound of the hard limit to an absolute + * minimum of zfs_scan_mem_lim_min (default: 16 MiB). 
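+ * (As a worked example: with 8 GiB of physmem and the default
+ * zfs_scan_mem_lim_fact of 20, the hard limit comes to 8 GiB / 20,
+ * i.e. roughly 410 MiB, comfortably above the 16 MiB floor.)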
We also constrain + * the upper bound to 5% of the total pool size - no chance we'll + * ever need that much memory, but just to keep the value in check. + * 2) Soft memory limit: once we hit the hard memory limit, we start + * issuing I/O to reduce queue memory usage, but we don't want to + * completely empty out the queues, since we might be able to find I/Os + * that will fill in the gaps of our non-sequential IOs at some point + * in the future. So we stop the issuing of I/Os once the amount of + * memory used drops below the soft limit (at which point we stop issuing + * I/O and start scanning metadata again). + * + * This limit is calculated by subtracting a fraction of the hard + * limit from the hard limit. By default this fraction is 5%, so + * the soft limit is 95% of the hard limit. We cap the size of the + * difference between the hard and soft limits at an absolute + * maximum of zfs_scan_mem_lim_soft_max (default: 128 MiB) - this is + * sufficient to not cause too frequent switching between the + * metadata scan and I/O issue (even at 2k recordsize, 128 MiB's + * worth of queues is about 1.2 GiB of on-pool data, so scanning + * that should take at least a decent fraction of a second). + */ +static boolean_t +dsl_scan_should_clear(dsl_scan_t *scn) +{ + vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev; + uint64_t mlim_hard, mlim_soft, mused; + uint64_t alloc = metaslab_class_get_alloc(spa_normal_class( + scn->scn_dp->dp_spa)); + + mlim_hard = MAX((physmem / zfs_scan_mem_lim_fact) * PAGESIZE, + zfs_scan_mem_lim_min); + mlim_hard = MIN(mlim_hard, alloc / 20); + mlim_soft = mlim_hard - MIN(mlim_hard / zfs_scan_mem_lim_soft_fact, + zfs_scan_mem_lim_soft_max); + mused = 0; + for (uint64_t i = 0; i < rvd->vdev_children; i++) { + vdev_t *tvd = rvd->vdev_child[i]; + dsl_scan_io_queue_t *queue; + + mutex_enter(&tvd->vdev_scan_io_queue_lock); + queue = tvd->vdev_scan_io_queue; + if (queue != NULL) { + /* #extents in exts_by_size = # in exts_by_addr */ + mused += avl_numnodes(&queue->q_exts_by_size) * + sizeof (range_seg_t) + + avl_numnodes(&queue->q_sios_by_addr) * + sizeof (scan_io_t); + } + mutex_exit(&tvd->vdev_scan_io_queue_lock); + } + + dprintf("current scan memory usage: %llu bytes\n", (longlong_t)mused); + + if (mused == 0) + ASSERT0(scn->scn_bytes_pending); + + /* + * If we are above our hard limit, we need to clear out memory. + * If we are below our soft limit, we need to accumulate sequential IOs. + * Otherwise, we should keep doing whatever we are currently doing. + */ + if (mused >= mlim_hard) + return (B_TRUE); + else if (mused < mlim_soft) + return (B_FALSE); + else + return (scn->scn_clearing); +} + +static boolean_t +dsl_scan_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb) +{ + /* we never skip user/group accounting objects */ + if (zb && (int64_t)zb->zb_object < 0) + return (B_FALSE); + + if (scn->scn_suspending) + return (B_TRUE); /* we're already suspending */ + + if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark)) + return (B_FALSE); /* we're resuming */ + + /* We only know how to resume from level-0 blocks. */ + if (zb && zb->zb_level != 0) + return (B_FALSE); + + /* + * We suspend if: + * - we have scanned for at least the minimum time (default 1 sec + * for scrub, 3 sec for resilver), and either we have sufficient + * dirty data that we are starting to write more quickly + * (default 30%), or someone is explicitly waiting for this txg + * to complete. + * or + * - the spa is shutting down because this pool is being exported + * or the machine is rebooting. 
+	 * or
+	 * - the scan queue has reached its memory use limit
+	 */
+	uint64_t curr_time_ns = gethrtime();
+	uint64_t scan_time_ns = curr_time_ns - scn->scn_sync_start_time;
+	uint64_t sync_time_ns = curr_time_ns -
+	    scn->scn_dp->dp_spa->spa_sync_starttime;
+
+	int dirty_pct = scn->scn_dp->dp_dirty_total * 100 / zfs_dirty_data_max;
+	int mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ?
+	    zfs_resilver_min_time_ms : zfs_scrub_min_time_ms;
+
+	if ((NSEC2MSEC(scan_time_ns) > mintime &&
+	    (dirty_pct >= zfs_vdev_async_write_active_min_dirty_percent ||
+	    txg_sync_waiting(scn->scn_dp) ||
+	    NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) ||
+	    spa_shutting_down(scn->scn_dp->dp_spa) ||
+	    (zfs_scan_strict_mem_lim && dsl_scan_should_clear(scn))) {
+		if (zb) {
+			dprintf("suspending at bookmark %llx/%llx/%llx/%llx\n",
+			    (longlong_t)zb->zb_objset,
+			    (longlong_t)zb->zb_object,
+			    (longlong_t)zb->zb_level,
+			    (longlong_t)zb->zb_blkid);
+			scn->scn_phys.scn_bookmark = *zb;
+		} else {
+			dsl_scan_phys_t *scnp = &scn->scn_phys;
+
+			dprintf("suspending at DDT bookmark "
+			    "%llx/%llx/%llx/%llx\n",
+			    (longlong_t)scnp->scn_ddt_bookmark.ddb_class,
+			    (longlong_t)scnp->scn_ddt_bookmark.ddb_type,
+			    (longlong_t)scnp->scn_ddt_bookmark.ddb_checksum,
+			    (longlong_t)scnp->scn_ddt_bookmark.ddb_cursor);
+		}
+		scn->scn_suspending = B_TRUE;
+		return (B_TRUE);
+	}
+	return (B_FALSE);
+}
+
+typedef struct zil_scan_arg {
+	dsl_pool_t	*zsa_dp;
+	zil_header_t	*zsa_zh;
+} zil_scan_arg_t;
+
+/* ARGSUSED */
+static int
+dsl_scan_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
+{
+	zil_scan_arg_t *zsa = arg;
+	dsl_pool_t *dp = zsa->zsa_dp;
+	dsl_scan_t *scn = dp->dp_scan;
+	zil_header_t *zh = zsa->zsa_zh;
+	zbookmark_phys_t zb;
+
+	if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
+		return (0);
+
+	/*
+	 * One block ("stubby") can be allocated a long time ago; we
+	 * want to visit that one because it has been allocated
+	 * (on-disk) even if it hasn't been claimed (even though for
+	 * scrub there's nothing to do to it).
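+	 * Conversely, when claim_txg is 0 (the log has not been claimed
+	 * yet), blocks born at or after spa_min_claim_txg() will still
+	 * be claimed and visited later, so they are skipped below.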
+ */ + if (claim_txg == 0 && bp->blk_birth >= spa_min_claim_txg(dp->dp_spa)) + return (0); + + SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET], + ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); + + VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb)); + return (0); +} + +/* ARGSUSED */ +static int +dsl_scan_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg) +{ + if (lrc->lrc_txtype == TX_WRITE) { + zil_scan_arg_t *zsa = arg; + dsl_pool_t *dp = zsa->zsa_dp; + dsl_scan_t *scn = dp->dp_scan; + zil_header_t *zh = zsa->zsa_zh; + lr_write_t *lr = (lr_write_t *)lrc; + blkptr_t *bp = &lr->lr_blkptr; + zbookmark_phys_t zb; + + if (BP_IS_HOLE(bp) || + bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) + return (0); + + /* + * birth can be < claim_txg if this record's txg is + * already txg sync'ed (but this log block contains + * other records that are not synced) + */ + if (claim_txg == 0 || bp->blk_birth < claim_txg) + return (0); + + SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET], + lr->lr_foid, ZB_ZIL_LEVEL, + lr->lr_offset / BP_GET_LSIZE(bp)); + + VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb)); + } + return (0); +} + +static void +dsl_scan_zil(dsl_pool_t *dp, zil_header_t *zh) +{ + uint64_t claim_txg = zh->zh_claim_txg; + zil_scan_arg_t zsa = { dp, zh }; + zilog_t *zilog; + + ASSERT(spa_writeable(dp->dp_spa)); + + /* + * We only want to visit blocks that have been claimed + * but not yet replayed. + */ + if (claim_txg == 0) + return; + + zilog = zil_alloc(dp->dp_meta_objset, zh); + + (void) zil_parse(zilog, dsl_scan_zil_block, dsl_scan_zil_record, &zsa, + claim_txg); + + zil_free(zilog); +} + +/* + * We compare scan_prefetch_issue_ctx_t's based on their bookmarks. The idea + * here is to sort the AVL tree by the order each block will be needed. + */ +static int +scan_prefetch_queue_compare(const void *a, const void *b) +{ + const scan_prefetch_issue_ctx_t *spic_a = a, *spic_b = b; + const scan_prefetch_ctx_t *spc_a = spic_a->spic_spc; + const scan_prefetch_ctx_t *spc_b = spic_b->spic_spc; + + return (zbookmark_compare(spc_a->spc_datablkszsec, + spc_a->spc_indblkshift, spc_b->spc_datablkszsec, + spc_b->spc_indblkshift, &spic_a->spic_zb, &spic_b->spic_zb)); +} + +static void +scan_prefetch_ctx_rele(scan_prefetch_ctx_t *spc, void *tag) +{ + if (refcount_remove(&spc->spc_refcnt, tag) == 0) { + refcount_destroy(&spc->spc_refcnt); + kmem_free(spc, sizeof (scan_prefetch_ctx_t)); + } +} + +static scan_prefetch_ctx_t * +scan_prefetch_ctx_create(dsl_scan_t *scn, dnode_phys_t *dnp, void *tag) +{ + scan_prefetch_ctx_t *spc; + + spc = kmem_alloc(sizeof (scan_prefetch_ctx_t), KM_SLEEP); + refcount_create(&spc->spc_refcnt); + refcount_add(&spc->spc_refcnt, tag); + spc->spc_scn = scn; + if (dnp != NULL) { + spc->spc_datablkszsec = dnp->dn_datablkszsec; + spc->spc_indblkshift = dnp->dn_indblkshift; + spc->spc_root = B_FALSE; + } else { + spc->spc_datablkszsec = 0; + spc->spc_indblkshift = 0; + spc->spc_root = B_TRUE; + } + + return (spc); +} + +static void +scan_prefetch_ctx_add_ref(scan_prefetch_ctx_t *spc, void *tag) +{ + refcount_add(&spc->spc_refcnt, tag); +} + +static boolean_t +dsl_scan_check_prefetch_resume(scan_prefetch_ctx_t *spc, + const zbookmark_phys_t *zb) +{ + zbookmark_phys_t *last_zb = &spc->spc_scn->scn_prefetch_bookmark; + dnode_phys_t tmp_dnp; + dnode_phys_t *dnp = (spc->spc_root) ? 
+	    NULL : &tmp_dnp;
+
+	if (zb->zb_objset != last_zb->zb_objset)
+		return (B_TRUE);
+	if ((int64_t)zb->zb_object < 0)
+		return (B_FALSE);
+
+	tmp_dnp.dn_datablkszsec = spc->spc_datablkszsec;
+	tmp_dnp.dn_indblkshift = spc->spc_indblkshift;
+
+	if (zbookmark_subtree_completed(dnp, zb, last_zb))
+		return (B_TRUE);
+
+	return (B_FALSE);
+}
+
+static void
+dsl_scan_prefetch(scan_prefetch_ctx_t *spc, blkptr_t *bp, zbookmark_phys_t *zb)
+{
+	avl_index_t idx;
+	dsl_scan_t *scn = spc->spc_scn;
+	spa_t *spa = scn->scn_dp->dp_spa;
+	scan_prefetch_issue_ctx_t *spic;
+
+	if (zfs_no_scrub_prefetch)
+		return;
+
+	if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg ||
+	    (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE &&
+	    BP_GET_TYPE(bp) != DMU_OT_OBJSET))
+		return;
+
+	if (dsl_scan_check_prefetch_resume(spc, zb))
+		return;
+
+	scan_prefetch_ctx_add_ref(spc, scn);
+	spic = kmem_alloc(sizeof (scan_prefetch_issue_ctx_t), KM_SLEEP);
+	spic->spic_spc = spc;
+	spic->spic_bp = *bp;
+	spic->spic_zb = *zb;
+
+	/*
+	 * Add the IO to the queue of blocks to prefetch. This allows us to
+	 * prioritize blocks that we will need first for the main traversal
+	 * thread.
+	 */
+	mutex_enter(&spa->spa_scrub_lock);
+	if (avl_find(&scn->scn_prefetch_queue, spic, &idx) != NULL) {
+		/* this block is already queued for prefetch */
+		kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t));
+		scan_prefetch_ctx_rele(spc, scn);
+		mutex_exit(&spa->spa_scrub_lock);
+		return;
+	}
+
+	avl_insert(&scn->scn_prefetch_queue, spic, idx);
+	cv_broadcast(&spa->spa_scrub_io_cv);
+	mutex_exit(&spa->spa_scrub_lock);
+}
+
+static void
+dsl_scan_prefetch_dnode(dsl_scan_t *scn, dnode_phys_t *dnp,
+    uint64_t objset, uint64_t object)
+{
+	int i;
+	zbookmark_phys_t zb;
+	scan_prefetch_ctx_t *spc;
+
+	if (dnp->dn_nblkptr == 0 && !(dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
+		return;
+
+	SET_BOOKMARK(&zb, objset, object, 0, 0);
+
+	spc = scan_prefetch_ctx_create(scn, dnp, FTAG);
+
+	for (i = 0; i < dnp->dn_nblkptr; i++) {
+		zb.zb_level = BP_GET_LEVEL(&dnp->dn_blkptr[i]);
+		zb.zb_blkid = i;
+		dsl_scan_prefetch(spc, &dnp->dn_blkptr[i], &zb);
+	}
+
+	if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+		zb.zb_level = 0;
+		zb.zb_blkid = DMU_SPILL_BLKID;
+		dsl_scan_prefetch(spc, &dnp->dn_spill, &zb);
+	}
+
+	scan_prefetch_ctx_rele(spc, FTAG);
+}
+
+void
+dsl_scan_prefetch_cb(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
+    arc_buf_t *buf, void *private)
+{
+	scan_prefetch_ctx_t *spc = private;
+	dsl_scan_t *scn = spc->spc_scn;
+	spa_t *spa = scn->scn_dp->dp_spa;
+
+	/* broadcast that the IO has completed for rate limiting purposes */
+	mutex_enter(&spa->spa_scrub_lock);
+	ASSERT3U(spa->spa_scrub_inflight, >=, BP_GET_PSIZE(bp));
+	spa->spa_scrub_inflight -= BP_GET_PSIZE(bp);
+	cv_broadcast(&spa->spa_scrub_io_cv);
+	mutex_exit(&spa->spa_scrub_lock);
+
+	/* if there was an error or we are done prefetching, just cleanup */
+	if (buf == NULL || scn->scn_suspending)
+		goto out;
+
+	if (BP_GET_LEVEL(bp) > 0) {
+		int i;
+		blkptr_t *cbp;
+		int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
+		zbookmark_phys_t czb;
+
+		for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) {
+			SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
+			    zb->zb_level - 1, zb->zb_blkid * epb + i);
+			dsl_scan_prefetch(spc, cbp, &czb);
+		}
+	} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
+		dnode_phys_t *cdnp = buf->b_data;
+		int i;
+		int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
+
+		for (i = 0, cdnp = buf->b_data; i < epb;
+		    i += cdnp->dn_extra_slots + 1,
+		    cdnp +=
cdnp->dn_extra_slots + 1) { + dsl_scan_prefetch_dnode(scn, cdnp, + zb->zb_objset, zb->zb_blkid * epb + i); + } + } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { + objset_phys_t *osp = buf->b_data; + + dsl_scan_prefetch_dnode(scn, &osp->os_meta_dnode, + zb->zb_objset, DMU_META_DNODE_OBJECT); + + if (OBJSET_BUF_HAS_USERUSED(buf)) { + dsl_scan_prefetch_dnode(scn, + &osp->os_groupused_dnode, zb->zb_objset, + DMU_GROUPUSED_OBJECT); + dsl_scan_prefetch_dnode(scn, + &osp->os_userused_dnode, zb->zb_objset, + DMU_USERUSED_OBJECT); + } + } + +out: + if (buf != NULL) + arc_buf_destroy(buf, private); + scan_prefetch_ctx_rele(spc, scn); +} + +/* ARGSUSED */ +static void +dsl_scan_prefetch_thread(void *arg) +{ + dsl_scan_t *scn = arg; + spa_t *spa = scn->scn_dp->dp_spa; + vdev_t *rvd = spa->spa_root_vdev; + uint64_t maxinflight = rvd->vdev_children * zfs_top_maxinflight; + scan_prefetch_issue_ctx_t *spic; + + /* loop until we are told to stop */ + while (!scn->scn_prefetch_stop) { + arc_flags_t flags = ARC_FLAG_NOWAIT | + ARC_FLAG_PRESCIENT_PREFETCH | ARC_FLAG_PREFETCH; + int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD; + + mutex_enter(&spa->spa_scrub_lock); + + /* + * Wait until we have an IO to issue and are not above our + * maximum in flight limit. + */ + while (!scn->scn_prefetch_stop && + (avl_numnodes(&scn->scn_prefetch_queue) == 0 || + spa->spa_scrub_inflight >= scn->scn_maxinflight_bytes)) { + cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); + } + + /* recheck if we should stop since we waited for the cv */ + if (scn->scn_prefetch_stop) { + mutex_exit(&spa->spa_scrub_lock); + break; + } + + /* remove the prefetch IO from the tree */ + spic = avl_first(&scn->scn_prefetch_queue); + spa->spa_scrub_inflight += BP_GET_PSIZE(&spic->spic_bp); + avl_remove(&scn->scn_prefetch_queue, spic); + + mutex_exit(&spa->spa_scrub_lock); + + /* issue the prefetch asynchronously */ + (void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa, + &spic->spic_bp, dsl_scan_prefetch_cb, spic->spic_spc, + ZIO_PRIORITY_SCRUB, zio_flags, &flags, &spic->spic_zb); + + kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t)); + } + + ASSERT(scn->scn_prefetch_stop); + + /* free any prefetches we didn't get to complete */ + mutex_enter(&spa->spa_scrub_lock); + while ((spic = avl_first(&scn->scn_prefetch_queue)) != NULL) { + avl_remove(&scn->scn_prefetch_queue, spic); + scan_prefetch_ctx_rele(spic->spic_spc, scn); + kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t)); + } + ASSERT0(avl_numnodes(&scn->scn_prefetch_queue)); + mutex_exit(&spa->spa_scrub_lock); +} + +static boolean_t +dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp, + const zbookmark_phys_t *zb) +{ + /* + * We never skip over user/group accounting objects (obj<0) + */ + if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark) && + (int64_t)zb->zb_object >= 0) { + /* + * If we already visited this bp & everything below (in + * a prior txg sync), don't bother doing it again. + */ + if (zbookmark_subtree_completed(dnp, zb, + &scn->scn_phys.scn_bookmark)) + return (B_TRUE); + + /* + * If we found the block we're trying to resume from, or + * we went past it to a different object, zero it out to + * indicate that it's OK to start checking for suspending + * again. 
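+ *
+ * For example (values hypothetical): with a saved bookmark of
+ * <objset 21, object 10, level 0, blkid 3>, visiting exactly that
+ * block, or any block with a higher object number, clears the
+ * bookmark roughly like so:
+ *
+ * bzero(&scn->scn_phys.scn_bookmark, sizeof (zbookmark_phys_t));
+ *
+ * after which dsl_scan_check_suspend() is free to record a new one.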
+ */
+ if (bcmp(zb, &scn->scn_phys.scn_bookmark, sizeof (*zb)) == 0 ||
+ zb->zb_object > scn->scn_phys.scn_bookmark.zb_object) {
+ dprintf("resuming at %llx/%llx/%llx/%llx\n",
+ (longlong_t)zb->zb_objset,
+ (longlong_t)zb->zb_object,
+ (longlong_t)zb->zb_level,
+ (longlong_t)zb->zb_blkid);
+ bzero(&scn->scn_phys.scn_bookmark, sizeof (*zb));
+ }
+ }
+ return (B_FALSE);
+}
+
+static void dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb,
+ dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn,
+ dmu_objset_type_t ostype, dmu_tx_t *tx);
+static void dsl_scan_visitdnode(
+ dsl_scan_t *, dsl_dataset_t *ds, dmu_objset_type_t ostype,
+ dnode_phys_t *dnp, uint64_t object, dmu_tx_t *tx);
+
+/*
+ * Return nonzero on i/o error.
+ */
+static int
+dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
+ dnode_phys_t *dnp, const blkptr_t *bp,
+ const zbookmark_phys_t *zb, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = scn->scn_dp;
+ int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD;
+ int err;
+
+ if (BP_GET_LEVEL(bp) > 0) {
+ arc_flags_t flags = ARC_FLAG_WAIT;
+ int i;
+ blkptr_t *cbp;
+ int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
+ arc_buf_t *buf;
+
+ err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
+ ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb);
+ if (err) {
+ scn->scn_phys.scn_errors++;
+ return (err);
+ }
+ for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) {
+ zbookmark_phys_t czb;
+
+ SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
+ zb->zb_level - 1,
+ zb->zb_blkid * epb + i);
+ dsl_scan_visitbp(cbp, &czb, dnp,
+ ds, scn, ostype, tx);
+ }
+ arc_buf_destroy(buf, &buf);
+ } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
+ arc_flags_t flags = ARC_FLAG_WAIT;
+ dnode_phys_t *cdnp;
+ int i;
+ int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
+ arc_buf_t *buf;
+
+ err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
+ ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb);
+ if (err) {
+ scn->scn_phys.scn_errors++;
+ return (err);
+ }
+ for (i = 0, cdnp = buf->b_data; i < epb;
+ i += cdnp->dn_extra_slots + 1,
+ cdnp += cdnp->dn_extra_slots + 1) {
+ dsl_scan_visitdnode(scn, ds, ostype,
+ cdnp, zb->zb_blkid * epb + i, tx);
+ }
+
+ arc_buf_destroy(buf, &buf);
+ } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
+ arc_flags_t flags = ARC_FLAG_WAIT;
+ objset_phys_t *osp;
+ arc_buf_t *buf;
+
+ err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
+ ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb);
+ if (err) {
+ scn->scn_phys.scn_errors++;
+ return (err);
+ }
+
+ osp = buf->b_data;
+
+ dsl_scan_visitdnode(scn, ds, osp->os_type,
+ &osp->os_meta_dnode, DMU_META_DNODE_OBJECT, tx);
+
+ if (OBJSET_BUF_HAS_USERUSED(buf)) {
+ /*
+ * We also always visit user/group accounting
+ * objects, and never skip them, even if we are
+ * suspending. This is necessary so that the space
+ * deltas from this txg get integrated.
+ */
+ dsl_scan_visitdnode(scn, ds, osp->os_type,
+ &osp->os_groupused_dnode,
+ DMU_GROUPUSED_OBJECT, tx);
+ dsl_scan_visitdnode(scn, ds, osp->os_type,
+ &osp->os_userused_dnode,
+ DMU_USERUSED_OBJECT, tx);
+ }
+ arc_buf_destroy(buf, &buf);
+ }
+
+ return (0);
+}
+
+static void
+dsl_scan_visitdnode(dsl_scan_t *scn, dsl_dataset_t *ds,
+ dmu_objset_type_t ostype, dnode_phys_t *dnp,
+ uint64_t object, dmu_tx_t *tx)
+{
+ int j;
+
+ for (j = 0; j < dnp->dn_nblkptr; j++) {
+ zbookmark_phys_t czb;
+
+ SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object,
+ dnp->dn_nlevels - 1, j);
+ dsl_scan_visitbp(&dnp->dn_blkptr[j],
+ &czb, dnp, ds, scn, ostype, tx);
+ }
+
+ if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+ zbookmark_phys_t czb;
+ SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object,
+ 0, DMU_SPILL_BLKID);
+ dsl_scan_visitbp(DN_SPILL_BLKPTR(dnp),
+ &czb, dnp, ds, scn, ostype, tx);
+ }
+}
+
+/*
+ * The arguments are in this order because mdb can only print the
+ * first 5; we want them to be useful.
+ */
+static void
+dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb,
+ dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn,
+ dmu_objset_type_t ostype, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = scn->scn_dp;
+ blkptr_t *bp_toread = NULL;
+
+ if (dsl_scan_check_suspend(scn, zb))
+ return;
+
+ if (dsl_scan_check_resume(scn, dnp, zb))
+ return;
+
+ scn->scn_visited_this_txg++;
+
+ dprintf_bp(bp,
+ "visiting ds=%p/%llu zb=%llx/%llx/%llx/%llx bp=%p",
+ ds, ds ? ds->ds_object : 0,
+ zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid,
+ bp);
+
+ if (BP_IS_HOLE(bp)) {
+ scn->scn_holes_this_txg++;
+ return;
+ }
+
+ if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) {
+ scn->scn_lt_min_this_txg++;
+ return;
+ }
+
+ bp_toread = kmem_alloc(sizeof (blkptr_t), KM_SLEEP);
+ *bp_toread = *bp;
+
+ if (dsl_scan_recurse(scn, ds, ostype, dnp, bp_toread, zb, tx) != 0)
+ goto out;
+
+ /*
+ * If dsl_scan_ddt() has already visited this block, it will have
+ * already done any translations or scrubbing, so don't call the
+ * callback again.
+ */
+ if (ddt_class_contains(dp->dp_spa,
+ scn->scn_phys.scn_ddt_class_max, bp)) {
+ scn->scn_ddt_contained_this_txg++;
+ goto out;
+ }
+
+ /*
+ * If this block is from the future (after cur_max_txg), then we
+ * are doing this on behalf of a deleted snapshot, and we will
+ * revisit the future block on the next pass of this dataset.
+ * Don't scan it now unless we need to because something
+ * under it was modified.
+ */
+ if (BP_PHYSICAL_BIRTH(bp) > scn->scn_phys.scn_cur_max_txg) {
+ scn->scn_gt_max_this_txg++;
+ goto out;
+ }
+
+ scan_funcs[scn->scn_phys.scn_func](dp, bp, zb);
+out:
+ kmem_free(bp_toread, sizeof (blkptr_t));
+}
+
+static void
+dsl_scan_visit_rootbp(dsl_scan_t *scn, dsl_dataset_t *ds, blkptr_t *bp,
+ dmu_tx_t *tx)
+{
+ zbookmark_phys_t zb;
+ scan_prefetch_ctx_t *spc;
+
+ SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
+ ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
+
+ if (ZB_IS_ZERO(&scn->scn_phys.scn_bookmark)) {
+ SET_BOOKMARK(&scn->scn_prefetch_bookmark,
+ zb.zb_objset, 0, 0, 0);
+ } else {
+ scn->scn_prefetch_bookmark = scn->scn_phys.scn_bookmark;
+ }
+
+ scn->scn_objsets_visited_this_txg++;
+
+ spc = scan_prefetch_ctx_create(scn, NULL, FTAG);
+ dsl_scan_prefetch(spc, bp, &zb);
+ scan_prefetch_ctx_rele(spc, FTAG);
+
+ dsl_scan_visitbp(bp, &zb, NULL, ds, scn, DMU_OST_NONE, tx);
+
+ dprintf_ds(ds, "finished scan%s", "");
+}
+
+static void
+ds_destroyed_scn_phys(dsl_dataset_t *ds, dsl_scan_phys_t *scn_phys)
+{
+ if (scn_phys->scn_bookmark.zb_objset == ds->ds_object) {
+ if (ds->ds_is_snapshot) {
+ /*
+ * Note:
+ * - scn_cur_{min,max}_txg stays the same.
+ * - Setting the flag is not really necessary if
+ * scn_cur_max_txg == scn_max_txg, because there
+ * is nothing after this snapshot that we care
+ * about. However, we set it anyway and then
+ * ignore it when we retraverse it in
+ * dsl_scan_visitds().
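+ *
+ * A sketch of the effect (object numbers hypothetical):
+ * destroying snapshot 50 whose ds_next_snap_obj is 60
+ * while zb_objset == 50 amounts to:
+ *
+ * scn_bookmark.zb_objset = 60;
+ * scn_flags |= DSF_VISIT_DS_AGAIN;
+ *
+ * so traversal resumes in the dataset that inherited
+ * the blocks.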
+ */
+ scn_phys->scn_bookmark.zb_objset =
+ dsl_dataset_phys(ds)->ds_next_snap_obj;
+ zfs_dbgmsg("destroying ds %llu; currently traversing; "
+ "reset zb_objset to %llu",
+ (u_longlong_t)ds->ds_object,
+ (u_longlong_t)dsl_dataset_phys(ds)->
+ ds_next_snap_obj);
+ scn_phys->scn_flags |= DSF_VISIT_DS_AGAIN;
+ } else {
+ SET_BOOKMARK(&scn_phys->scn_bookmark,
+ ZB_DESTROYED_OBJSET, 0, 0, 0);
+ zfs_dbgmsg("destroying ds %llu; currently traversing; "
+ "reset bookmark to -1,0,0,0",
+ (u_longlong_t)ds->ds_object);
+ }
+ }
+}
+
+/*
+ * Invoked when a dataset is destroyed. We need to make sure that:
+ *
+ * 1) If it is the dataset that was currently being scanned, we write
+ * a new dsl_scan_phys_t, marking the objset reference in it
+ * as destroyed.
+ * 2) Remove it from the work queue, if it was present.
+ *
+ * If the dataset was actually a snapshot, instead of marking the dataset
+ * as destroyed, we substitute the next snapshot in line.
+ */
+void
+dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ dsl_scan_t *scn = dp->dp_scan;
+ uint64_t mintxg;
+
+ if (!dsl_scan_is_running(scn))
+ return;
+
+ ds_destroyed_scn_phys(ds, &scn->scn_phys);
+ ds_destroyed_scn_phys(ds, &scn->scn_phys_cached);
+
+ if (scan_ds_queue_contains(scn, ds->ds_object, &mintxg)) {
+ scan_ds_queue_remove(scn, ds->ds_object);
+ if (ds->ds_is_snapshot)
+ scan_ds_queue_insert(scn,
+ dsl_dataset_phys(ds)->ds_next_snap_obj, mintxg);
+ }
+
+ if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
+ ds->ds_object, &mintxg) == 0) {
+ ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1);
+ VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
+ if (ds->ds_is_snapshot) {
+ /*
+ * We keep the same mintxg; it could be >
+ * ds_creation_txg if the previous snapshot was
+ * deleted too.
+ */
+ VERIFY(zap_add_int_key(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj,
+ dsl_dataset_phys(ds)->ds_next_snap_obj,
+ mintxg, tx) == 0);
+ zfs_dbgmsg("destroying ds %llu; in queue; "
+ "replacing with %llu",
+ (u_longlong_t)ds->ds_object,
+ (u_longlong_t)dsl_dataset_phys(ds)->
+ ds_next_snap_obj);
+ } else {
+ zfs_dbgmsg("destroying ds %llu; in queue; removing",
+ (u_longlong_t)ds->ds_object);
+ }
+ }
+
+ /*
+ * dsl_scan_sync() should be called after this, and should sync
+ * out our changed state, but just to be safe, do it here.
+ */
+ dsl_scan_sync_state(scn, tx, SYNC_CACHED);
+}
+
+static void
+ds_snapshotted_bookmark(dsl_dataset_t *ds, zbookmark_phys_t *scn_bookmark)
+{
+ if (scn_bookmark->zb_objset == ds->ds_object) {
+ scn_bookmark->zb_objset =
+ dsl_dataset_phys(ds)->ds_prev_snap_obj;
+ zfs_dbgmsg("snapshotting ds %llu; currently traversing; "
+ "reset zb_objset to %llu",
+ (u_longlong_t)ds->ds_object,
+ (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj);
+ }
+}
+
+/*
+ * Called when a dataset is snapshotted. If we were currently traversing
+ * this snapshot, we reset our bookmark to point at the newly created
+ * snapshot. We also modify our work queue to remove the old snapshot and
+ * replace it with the new one.
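+ *
+ * Both the bookmark and the queue substitution preserve mintxg;
+ * e.g. (object numbers hypothetical) a queue entry <dsobj 30,
+ * mintxg 100> becomes <29, mintxg 100> when dataset 30 is
+ * snapshotted and 29 becomes its ds_prev_snap_obj, since the new
+ * snapshot now owns the blocks we still need to visit.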
+ */ +void +dsl_scan_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + dsl_pool_t *dp = ds->ds_dir->dd_pool; + dsl_scan_t *scn = dp->dp_scan; + uint64_t mintxg; + + if (!dsl_scan_is_running(scn)) + return; + + ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0); + + ds_snapshotted_bookmark(ds, &scn->scn_phys.scn_bookmark); + ds_snapshotted_bookmark(ds, &scn->scn_phys_cached.scn_bookmark); + + if (scan_ds_queue_contains(scn, ds->ds_object, &mintxg)) { + scan_ds_queue_remove(scn, ds->ds_object); + scan_ds_queue_insert(scn, + dsl_dataset_phys(ds)->ds_prev_snap_obj, mintxg); + } + + if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, + ds->ds_object, &mintxg) == 0) { + VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds->ds_object, tx)); + VERIFY(zap_add_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, + dsl_dataset_phys(ds)->ds_prev_snap_obj, mintxg, tx) == 0); + zfs_dbgmsg("snapshotting ds %llu; in queue; " + "replacing with %llu", + (u_longlong_t)ds->ds_object, + (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj); + } + + dsl_scan_sync_state(scn, tx, SYNC_CACHED); +} + +static void +ds_clone_swapped_bookmark(dsl_dataset_t *ds1, dsl_dataset_t *ds2, + zbookmark_phys_t *scn_bookmark) +{ + if (scn_bookmark->zb_objset == ds1->ds_object) { + scn_bookmark->zb_objset = ds2->ds_object; + zfs_dbgmsg("clone_swap ds %llu; currently traversing; " + "reset zb_objset to %llu", + (u_longlong_t)ds1->ds_object, + (u_longlong_t)ds2->ds_object); + } else if (scn_bookmark->zb_objset == ds2->ds_object) { + scn_bookmark->zb_objset = ds1->ds_object; + zfs_dbgmsg("clone_swap ds %llu; currently traversing; " + "reset zb_objset to %llu", + (u_longlong_t)ds2->ds_object, + (u_longlong_t)ds1->ds_object); + } +} + +/* + * Called when a parent dataset and its clone are swapped. If we were + * currently traversing the dataset, we need to switch to traversing the + * newly promoted parent. 
+ */ +void +dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx) +{ + dsl_pool_t *dp = ds1->ds_dir->dd_pool; + dsl_scan_t *scn = dp->dp_scan; + uint64_t mintxg; + + if (!dsl_scan_is_running(scn)) + return; + + ds_clone_swapped_bookmark(ds1, ds2, &scn->scn_phys.scn_bookmark); + ds_clone_swapped_bookmark(ds1, ds2, &scn->scn_phys_cached.scn_bookmark); + + if (scan_ds_queue_contains(scn, ds1->ds_object, &mintxg)) { + scan_ds_queue_remove(scn, ds1->ds_object); + scan_ds_queue_insert(scn, ds2->ds_object, mintxg); + } + if (scan_ds_queue_contains(scn, ds2->ds_object, &mintxg)) { + scan_ds_queue_remove(scn, ds2->ds_object); + scan_ds_queue_insert(scn, ds1->ds_object, mintxg); + } + + if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, + ds1->ds_object, &mintxg) == 0) { + int err; + ASSERT3U(mintxg, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg); + ASSERT3U(mintxg, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg); + VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds1->ds_object, tx)); + err = zap_add_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds2->ds_object, mintxg, tx); + VERIFY(err == 0 || err == EEXIST); + if (err == EEXIST) { + /* Both were there to begin with */ + VERIFY(0 == zap_add_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, + ds1->ds_object, mintxg, tx)); + } + zfs_dbgmsg("clone_swap ds %llu; in queue; " + "replacing with %llu", + (u_longlong_t)ds1->ds_object, + (u_longlong_t)ds2->ds_object); + } + if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, + ds2->ds_object, &mintxg) == 0) { + ASSERT3U(mintxg, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg); + ASSERT3U(mintxg, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg); + VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds2->ds_object, tx)); + VERIFY(0 == zap_add_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds1->ds_object, mintxg, tx)); + zfs_dbgmsg("clone_swap ds %llu; in queue; " + "replacing with %llu", + (u_longlong_t)ds2->ds_object, + (u_longlong_t)ds1->ds_object); + } + + dsl_scan_sync_state(scn, tx, SYNC_CACHED); +} + +/* ARGSUSED */ +static int +enqueue_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) +{ + uint64_t originobj = *(uint64_t *)arg; + dsl_dataset_t *ds; + int err; + dsl_scan_t *scn = dp->dp_scan; + + if (dsl_dir_phys(hds->ds_dir)->dd_origin_obj != originobj) + return (0); + + err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds); + if (err) + return (err); + + while (dsl_dataset_phys(ds)->ds_prev_snap_obj != originobj) { + dsl_dataset_t *prev; + err = dsl_dataset_hold_obj(dp, + dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev); + + dsl_dataset_rele(ds, FTAG); + if (err) + return (err); + ds = prev; + } + scan_ds_queue_insert(scn, ds->ds_object, + dsl_dataset_phys(ds)->ds_prev_snap_txg); + dsl_dataset_rele(ds, FTAG); + return (0); +} + +static void +dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx) +{ + dsl_pool_t *dp = scn->scn_dp; + dsl_dataset_t *ds; + + VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); + + if (scn->scn_phys.scn_cur_min_txg >= + scn->scn_phys.scn_max_txg) { + /* + * This can happen if this snapshot was created after the + * scan started, and we already completed a previous snapshot + * that was created after the scan started. This snapshot + * only references blocks with: + * + * birth < our ds_creation_txg + * cur_min_txg is no less than ds_creation_txg. + * We have already visited these blocks. 
+ * or
+ * birth > scn_max_txg
+ * The scan requested not to visit these blocks.
+ *
+ * Subsequent snapshots (and clones) can reference our
+ * blocks, or blocks with even higher birth times.
+ * Therefore we do not need to visit them either,
+ * so we do not add them to the work queue.
+ *
+ * Note that checking for cur_min_txg >= cur_max_txg
+ * is not sufficient, because in that case we may need to
+ * visit subsequent snapshots. This happens when min_txg > 0,
+ * which raises cur_min_txg. In this case we will visit
+ * this dataset but skip all of its blocks, because the
+ * rootbp's birth time is < cur_min_txg. Then we will
+ * add the next snapshots/clones to the work queue.
+ */
+ char *dsname = kmem_alloc(MAXNAMELEN, KM_SLEEP);
+ dsl_dataset_name(ds, dsname);
+ zfs_dbgmsg("scanning dataset %llu (%s) is unnecessary because "
+ "cur_min_txg (%llu) >= max_txg (%llu)",
+ (longlong_t)dsobj, dsname,
+ (longlong_t)scn->scn_phys.scn_cur_min_txg,
+ (longlong_t)scn->scn_phys.scn_max_txg);
+ kmem_free(dsname, MAXNAMELEN);
+
+ goto out;
+ }
+
+ /*
+ * Only the ZIL in the head (non-snapshot) is valid. Even though
+ * snapshots can have ZIL block pointers (which may be the same
+ * BP as in the head), they must be ignored. In addition, $ORIGIN
+ * doesn't have an objset (i.e. its ds_bp is a hole) so we don't
+ * need to look for a ZIL in it either. So we traverse the ZIL here,
+ * rather than in dsl_scan_recurse(), because the regular snapshot
+ * block-sharing rules don't apply to it.
+ */
+ if (DSL_SCAN_IS_SCRUB_RESILVER(scn) && !dsl_dataset_is_snapshot(ds) &&
+ (dp->dp_origin_snap == NULL ||
+ ds->ds_dir != dp->dp_origin_snap->ds_dir)) {
+ objset_t *os;
+ if (dmu_objset_from_ds(ds, &os) != 0) {
+ goto out;
+ }
+ dsl_scan_zil(dp, &os->os_zil_header);
+ }
+
+ /*
+ * Iterate over the bps in this ds.
+ */
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+ dsl_scan_visit_rootbp(scn, ds, &dsl_dataset_phys(ds)->ds_bp, tx);
+ rrw_exit(&ds->ds_bp_rwlock, FTAG);
+
+ char *dsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
+ dsl_dataset_name(ds, dsname);
+ zfs_dbgmsg("scanned dataset %llu (%s) with min=%llu max=%llu; "
+ "suspending=%u",
+ (longlong_t)dsobj, dsname,
+ (longlong_t)scn->scn_phys.scn_cur_min_txg,
+ (longlong_t)scn->scn_phys.scn_cur_max_txg,
+ (int)scn->scn_suspending);
+ kmem_free(dsname, ZFS_MAX_DATASET_NAME_LEN);
+
+ if (scn->scn_suspending)
+ goto out;
+
+ /*
+ * We've finished this pass over this dataset.
+ */
+
+ /*
+ * If we did not completely visit this dataset, do another pass.
+ */
+ if (scn->scn_phys.scn_flags & DSF_VISIT_DS_AGAIN) {
+ zfs_dbgmsg("incomplete pass; visiting again");
+ scn->scn_phys.scn_flags &= ~DSF_VISIT_DS_AGAIN;
+ scan_ds_queue_insert(scn, ds->ds_object,
+ scn->scn_phys.scn_cur_max_txg);
+ goto out;
+ }
+
+ /*
+ * Add descendant datasets to the work queue.
+ */
+ if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) {
+ scan_ds_queue_insert(scn,
+ dsl_dataset_phys(ds)->ds_next_snap_obj,
+ dsl_dataset_phys(ds)->ds_creation_txg);
+ }
+ if (dsl_dataset_phys(ds)->ds_num_children > 1) {
+ boolean_t usenext = B_FALSE;
+ if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) {
+ uint64_t count;
+ /*
+ * A bug in a previous version of the code could
+ * cause upgrade_clones_cb() to not set
+ * ds_next_snap_obj when it should, leading to a
+ * missing entry. Therefore we can only use the
+ * next_clones_obj when its count is correct.
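+ *
+ * Concretely: a dataset with ds_num_children == N must have
+ * exactly N - 1 entries in next_clones_obj (the next
+ * snapshot itself is not listed there). If zap_count()
+ * reports anything else, we fall back to enumerating every
+ * dataset via enqueue_clones_cb() below.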
+ */ + int err = zap_count(dp->dp_meta_objset, + dsl_dataset_phys(ds)->ds_next_clones_obj, &count); + if (err == 0 && + count == dsl_dataset_phys(ds)->ds_num_children - 1) + usenext = B_TRUE; + } + + if (usenext) { + zap_cursor_t zc; + zap_attribute_t za; + for (zap_cursor_init(&zc, dp->dp_meta_objset, + dsl_dataset_phys(ds)->ds_next_clones_obj); + zap_cursor_retrieve(&zc, &za) == 0; + (void) zap_cursor_advance(&zc)) { + scan_ds_queue_insert(scn, + zfs_strtonum(za.za_name, NULL), + dsl_dataset_phys(ds)->ds_creation_txg); + } + zap_cursor_fini(&zc); + } else { + VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, + enqueue_clones_cb, &ds->ds_object, + DS_FIND_CHILDREN)); + } + } + +out: + dsl_dataset_rele(ds, FTAG); +} + +/* ARGSUSED */ +static int +enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) +{ + dsl_dataset_t *ds; + int err; + dsl_scan_t *scn = dp->dp_scan; + + err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds); + if (err) + return (err); + + while (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) { + dsl_dataset_t *prev; + err = dsl_dataset_hold_obj(dp, + dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev); + if (err) { + dsl_dataset_rele(ds, FTAG); + return (err); + } + + /* + * If this is a clone, we don't need to worry about it for now. + */ + if (dsl_dataset_phys(prev)->ds_next_snap_obj != ds->ds_object) { + dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele(prev, FTAG); + return (0); + } + dsl_dataset_rele(ds, FTAG); + ds = prev; + } + + scan_ds_queue_insert(scn, ds->ds_object, + dsl_dataset_phys(ds)->ds_prev_snap_txg); + dsl_dataset_rele(ds, FTAG); + return (0); +} + +/* ARGSUSED */ +void +dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, + ddt_entry_t *dde, dmu_tx_t *tx) +{ + const ddt_key_t *ddk = &dde->dde_key; + ddt_phys_t *ddp = dde->dde_phys; + blkptr_t bp; + zbookmark_phys_t zb = { 0 }; + int p; + + if (scn->scn_phys.scn_state != DSS_SCANNING) + return; + + for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { + if (ddp->ddp_phys_birth == 0 || + ddp->ddp_phys_birth > scn->scn_phys.scn_max_txg) + continue; + ddt_bp_create(checksum, ddk, ddp, &bp); + + scn->scn_visited_this_txg++; + scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb); + } +} + +/* + * Scrub/dedup interaction. + * + * If there are N references to a deduped block, we don't want to scrub it + * N times -- ideally, we should scrub it exactly once. + * + * We leverage the fact that the dde's replication class (enum ddt_class) + * is ordered from highest replication class (DDT_CLASS_DITTO) to lowest + * (DDT_CLASS_UNIQUE) so that we may walk the DDT in that order. + * + * To prevent excess scrubbing, the scrub begins by walking the DDT + * to find all blocks with refcnt > 1, and scrubs each of these once. + * Since there are two replication classes which contain blocks with + * refcnt > 1, we scrub the highest replication class (DDT_CLASS_DITTO) first. + * Finally the top-down scrub begins, only visiting blocks with refcnt == 1. + * + * There would be nothing more to say if a block's refcnt couldn't change + * during a scrub, but of course it can so we must account for changes + * in a block's replication class. + * + * Here's an example of what can occur: + * + * If a block has refcnt > 1 during the DDT scrub phase, but has refcnt == 1 + * when visited during the top-down scrub phase, it will be scrubbed twice. + * This negates our scrub optimization, but is otherwise harmless. 
+ *
+ * If a block has refcnt == 1 during the DDT scrub phase, but has refcnt > 1
+ * on each visit during the top-down scrub phase, it will never be scrubbed.
+ * To catch this, ddt_sync_entry() notifies the scrub code whenever a block's
+ * reference class transitions to a higher level (i.e. DDT_CLASS_UNIQUE to
+ * DDT_CLASS_DUPLICATE); if it transitions from refcnt == 1 to refcnt > 1
+ * while a scrub is in progress, it scrubs the block right then.
+ */
+static void
+dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx)
+{
+ ddt_bookmark_t *ddb = &scn->scn_phys.scn_ddt_bookmark;
+ ddt_entry_t dde = { 0 };
+ int error;
+ uint64_t n = 0;
+
+ while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &dde)) == 0) {
+ ddt_t *ddt;
+
+ if (ddb->ddb_class > scn->scn_phys.scn_ddt_class_max)
+ break;
+ dprintf("visiting ddb=%llu/%llu/%llu/%llx\n",
+ (longlong_t)ddb->ddb_class,
+ (longlong_t)ddb->ddb_type,
+ (longlong_t)ddb->ddb_checksum,
+ (longlong_t)ddb->ddb_cursor);
+
+ /* There should be no pending changes to the dedup table */
+ ddt = scn->scn_dp->dp_spa->spa_ddt[ddb->ddb_checksum];
+ ASSERT(avl_first(&ddt->ddt_tree) == NULL);
+
+ dsl_scan_ddt_entry(scn, ddb->ddb_checksum, &dde, tx);
+ n++;
+
+ if (dsl_scan_check_suspend(scn, NULL))
+ break;
+ }
+
+ zfs_dbgmsg("scanned %llu ddt entries with class_max = %u; "
+ "suspending=%u", (longlong_t)n,
+ (int)scn->scn_phys.scn_ddt_class_max, (int)scn->scn_suspending);
+
+ ASSERT(error == 0 || error == ENOENT);
+ ASSERT(error != ENOENT ||
+ ddb->ddb_class > scn->scn_phys.scn_ddt_class_max);
+}
+
+static uint64_t
+dsl_scan_ds_maxtxg(dsl_dataset_t *ds)
+{
+ uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg;
+ if (ds->ds_is_snapshot)
+ return (MIN(smt, dsl_dataset_phys(ds)->ds_creation_txg));
+ return (smt);
+}
+
+static void
+dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx)
+{
+ scan_ds_t *sds;
+ dsl_pool_t *dp = scn->scn_dp;
+
+ if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
+ scn->scn_phys.scn_ddt_class_max) {
+ scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg;
+ scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg;
+ dsl_scan_ddt(scn, tx);
+ if (scn->scn_suspending)
+ return;
+ }
+
+ if (scn->scn_phys.scn_bookmark.zb_objset == DMU_META_OBJSET) {
+ /* First do the MOS & ORIGIN */
+
+ scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg;
+ scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg;
+ dsl_scan_visit_rootbp(scn, NULL,
+ &dp->dp_meta_rootbp, tx);
+ spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
+ if (scn->scn_suspending)
+ return;
+
+ if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) {
+ VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
+ enqueue_cb, NULL, DS_FIND_CHILDREN));
+ } else {
+ dsl_scan_visitds(scn,
+ dp->dp_origin_snap->ds_object, tx);
+ }
+ ASSERT(!scn->scn_suspending);
+ } else if (scn->scn_phys.scn_bookmark.zb_objset !=
+ ZB_DESTROYED_OBJSET) {
+ uint64_t dsobj = scn->scn_phys.scn_bookmark.zb_objset;
+ /*
+ * If we were suspended, continue from here. Note if the
+ * ds we were suspended on was deleted, the zb_objset may
+ * be -1, so we will skip this and find a new objset
+ * below.
+ */
+ dsl_scan_visitds(scn, dsobj, tx);
+ if (scn->scn_suspending)
+ return;
+ }
+
+ /*
+ * In case we suspended right at the end of the ds, zero the
+ * bookmark so we don't think that we're still trying to resume.
+ */
+ bzero(&scn->scn_phys.scn_bookmark, sizeof (zbookmark_phys_t));
+
+ /*
+ * Keep pulling things out of the dataset avl queue. Updates to the
+ * persistent zap-object-as-queue happen only at checkpoints.
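+ *
+ * That is, scan_ds_queue_remove() and scan_ds_queue_insert() touch
+ * only the in-memory AVL tree here; the on-disk scn_queue_obj ZAP
+ * is brought back in sync when dsl_scan_sync_state() checkpoints.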
+ */ + while ((sds = avl_first(&scn->scn_queue)) != NULL) { + dsl_dataset_t *ds; + uint64_t dsobj = sds->sds_dsobj; + uint64_t txg = sds->sds_txg; + + /* dequeue and free the ds from the queue */ + scan_ds_queue_remove(scn, dsobj); + sds = NULL; /* must not be touched after removal */ + + /* Set up min / max txg */ + VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); + if (txg != 0) { + scn->scn_phys.scn_cur_min_txg = + MAX(scn->scn_phys.scn_min_txg, txg); + } else { + scn->scn_phys.scn_cur_min_txg = + MAX(scn->scn_phys.scn_min_txg, + dsl_dataset_phys(ds)->ds_prev_snap_txg); + } + scn->scn_phys.scn_cur_max_txg = dsl_scan_ds_maxtxg(ds); + dsl_dataset_rele(ds, FTAG); + + dsl_scan_visitds(scn, dsobj, tx); + if (scn->scn_suspending) + return; + } + /* No more objsets to fetch, we're done */ + scn->scn_phys.scn_bookmark.zb_objset = ZB_DESTROYED_OBJSET; + ASSERT0(scn->scn_suspending); +} + +static uint64_t +dsl_scan_count_leaves(vdev_t *vd) +{ + uint64_t i, leaves = 0; + + /* we only count leaves that belong to the main pool and are readable */ + if (vd->vdev_islog || vd->vdev_isspare || + vd->vdev_isl2cache || !vdev_readable(vd)) + return (0); + + if (vd->vdev_ops->vdev_op_leaf) + return (1); + + for (i = 0; i < vd->vdev_children; i++) { + leaves += dsl_scan_count_leaves(vd->vdev_child[i]); + } + + return (leaves); +} + + +static void +scan_io_queues_update_zio_stats(dsl_scan_io_queue_t *q, const blkptr_t *bp) +{ + int i; + uint64_t cur_size = 0; + + for (i = 0; i < BP_GET_NDVAS(bp); i++) { + cur_size += DVA_GET_ASIZE(&bp->blk_dva[i]); + } + + q->q_total_zio_size_this_txg += cur_size; + q->q_zios_this_txg++; +} + +static void +scan_io_queues_update_seg_stats(dsl_scan_io_queue_t *q, uint64_t start, + uint64_t end) +{ + q->q_total_seg_size_this_txg += end - start; + q->q_segs_this_txg++; +} + +static boolean_t +scan_io_queue_check_suspend(dsl_scan_t *scn) +{ + /* See comment in dsl_scan_check_suspend() */ + uint64_t curr_time_ns = gethrtime(); + uint64_t scan_time_ns = curr_time_ns - scn->scn_sync_start_time; + uint64_t sync_time_ns = curr_time_ns - + scn->scn_dp->dp_spa->spa_sync_starttime; + int dirty_pct = scn->scn_dp->dp_dirty_total * 100 / zfs_dirty_data_max; + int mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ? + zfs_resilver_min_time_ms : zfs_scrub_min_time_ms; + + return ((NSEC2MSEC(scan_time_ns) > mintime && + (dirty_pct >= zfs_vdev_async_write_active_min_dirty_percent || + txg_sync_waiting(scn->scn_dp) || + NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) || + spa_shutting_down(scn->scn_dp->dp_spa)); +} + +/* + * Given a list of scan_io_t's in io_list, this issues the io's out to + * disk. This consumes the io_list and frees the scan_io_t's. This is + * called when emptying queues, either when we're up against the memory + * limit or when we have finished scanning. Returns B_TRUE if we stopped + * processing the list before we finished. Any zios that were not issued + * will remain in the io_list. 
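+ *
+ * A minimal sketch of the intended calling pattern (this mirrors
+ * the inner loop of scan_io_queues_run_one() below):
+ *
+ * do {
+ * more = scan_io_queue_gather(queue, rs, &list);
+ * suspended = scan_io_queue_issue(queue, &list);
+ * } while (more && !suspended);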
+ */ +static boolean_t +scan_io_queue_issue(dsl_scan_io_queue_t *queue, list_t *io_list) +{ + dsl_scan_t *scn = queue->q_scn; + scan_io_t *sio; + int64_t bytes_issued = 0; + boolean_t suspended = B_FALSE; + + while ((sio = list_head(io_list)) != NULL) { + blkptr_t bp; + + if (scan_io_queue_check_suspend(scn)) { + suspended = B_TRUE; + break; + } + + sio2bp(sio, &bp, queue->q_vd->vdev_id); + bytes_issued += sio->sio_asize; + scan_exec_io(scn->scn_dp, &bp, sio->sio_flags, + &sio->sio_zb, queue); + (void) list_remove_head(io_list); + scan_io_queues_update_zio_stats(queue, &bp); + kmem_free(sio, sizeof (*sio)); + } + + atomic_add_64(&scn->scn_bytes_pending, -bytes_issued); + + return (suspended); +} + +/* + * Given a range_seg_t (extent) and a list, this function passes over a + * scan queue and gathers up the appropriate ios which fit into that + * scan seg (starting from lowest LBA). At the end, we remove the segment + * from the q_exts_by_addr range tree. + */ +static boolean_t +scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list) +{ + scan_io_t srch_sio, *sio, *next_sio; + avl_index_t idx; + uint_t num_sios = 0; + int64_t bytes_issued = 0; + + ASSERT(rs != NULL); + ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); + + srch_sio.sio_offset = rs->rs_start; + + /* + * The exact start of the extent might not contain any matching zios, + * so if that's the case, examine the next one in the tree. + */ + sio = avl_find(&queue->q_sios_by_addr, &srch_sio, &idx); + if (sio == NULL) + sio = avl_nearest(&queue->q_sios_by_addr, idx, AVL_AFTER); + + while (sio != NULL && sio->sio_offset < rs->rs_end && num_sios <= 32) { + ASSERT3U(sio->sio_offset, >=, rs->rs_start); + ASSERT3U(sio->sio_offset + sio->sio_asize, <=, rs->rs_end); + + next_sio = AVL_NEXT(&queue->q_sios_by_addr, sio); + avl_remove(&queue->q_sios_by_addr, sio); + + bytes_issued += sio->sio_asize; + num_sios++; + list_insert_tail(list, sio); + sio = next_sio; + } + + /* + * We limit the number of sios we process at once to 32 to avoid + * biting off more than we can chew. If we didn't take everything + * in the segment we update it to reflect the work we were able to + * complete. Otherwise, we remove it from the range tree entirely. + */ + if (sio != NULL && sio->sio_offset < rs->rs_end) { + range_tree_adjust_fill(queue->q_exts_by_addr, rs, + -bytes_issued); + range_tree_resize_segment(queue->q_exts_by_addr, rs, + sio->sio_offset, rs->rs_end - sio->sio_offset); + + return (B_TRUE); + } else { + range_tree_remove(queue->q_exts_by_addr, rs->rs_start, + rs->rs_end - rs->rs_start); + return (B_FALSE); + } +} + + +/* + * This is called from the queue emptying thread and selects the next + * extent from which we are to issue io's. The behavior of this function + * depends on the state of the scan, the current memory consumption and + * whether or not we are performing a scan shutdown. + * 1) We select extents in an elevator algorithm (LBA-order) if the scan + * needs to perform a checkpoint + * 2) We select the largest available extent if we are up against the + * memory limit. + * 3) Otherwise we don't select any extents. 
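+ *
+ * The zfs_scan_issue_strategy tunable can force a single behavior
+ * while extents are being selected:
+ * 1 => always LBA order (range_tree_first of q_exts_by_addr),
+ * 2 => always largest-first (avl_first of q_exts_by_size),
+ * any other value => choose based on the scan state, as above.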
+ */ +static const range_seg_t * +scan_io_queue_fetch_ext(dsl_scan_io_queue_t *queue) +{ + dsl_scan_t *scn = queue->q_scn; + + ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); + ASSERT(scn->scn_is_sorted); + + /* handle tunable overrides */ + if (scn->scn_checkpointing || scn->scn_clearing) { + if (zfs_scan_issue_strategy == 1) { + return (range_tree_first(queue->q_exts_by_addr)); + } else if (zfs_scan_issue_strategy == 2) { + return (avl_first(&queue->q_exts_by_size)); + } + } + + /* + * During normal clearing, we want to issue our largest segments + * first, keeping IO as sequential as possible, and leaving the + * smaller extents for later with the hope that they might eventually + * grow to larger sequential segments. However, when the scan is + * checkpointing, no new extents will be added to the sorting queue, + * so the way we are sorted now is as good as it will ever get. + * In this case, we instead switch to issuing extents in LBA order. + */ + if (scn->scn_checkpointing) { + return (range_tree_first(queue->q_exts_by_addr)); + } else if (scn->scn_clearing) { + return (avl_first(&queue->q_exts_by_size)); + } else { + return (NULL); + } +} + +static void +scan_io_queues_run_one(void *arg) +{ + dsl_scan_io_queue_t *queue = arg; + kmutex_t *q_lock = &queue->q_vd->vdev_scan_io_queue_lock; + boolean_t suspended = B_FALSE; + range_seg_t *rs = NULL; + scan_io_t *sio = NULL; + list_t sio_list; + uint64_t bytes_per_leaf = zfs_scan_vdev_limit; + uint64_t nr_leaves = dsl_scan_count_leaves(queue->q_vd); + + ASSERT(queue->q_scn->scn_is_sorted); + + list_create(&sio_list, sizeof (scan_io_t), + offsetof(scan_io_t, sio_nodes.sio_list_node)); + mutex_enter(q_lock); + + /* calculate maximum in-flight bytes for this txg (min 1MB) */ + queue->q_maxinflight_bytes = + MAX(nr_leaves * bytes_per_leaf, 1ULL << 20); + + /* reset per-queue scan statistics for this txg */ + queue->q_total_seg_size_this_txg = 0; + queue->q_segs_this_txg = 0; + queue->q_total_zio_size_this_txg = 0; + queue->q_zios_this_txg = 0; + + /* loop until we have run out of time or sios */ + while ((rs = (range_seg_t*)scan_io_queue_fetch_ext(queue)) != NULL) { + uint64_t seg_start = 0, seg_end = 0; + boolean_t more_left = B_TRUE; + + ASSERT(list_is_empty(&sio_list)); + + /* loop while we still have sios left to process in this rs */ + while (more_left) { + scan_io_t *first_sio, *last_sio; + + /* + * We have selected which extent needs to be + * processed next. Gather up the corresponding sios. + */ + more_left = scan_io_queue_gather(queue, rs, &sio_list); + ASSERT(!list_is_empty(&sio_list)); + first_sio = list_head(&sio_list); + last_sio = list_tail(&sio_list); + + seg_end = last_sio->sio_offset + last_sio->sio_asize; + if (seg_start == 0) + seg_start = first_sio->sio_offset; + + /* + * Issuing sios can take a long time so drop the + * queue lock. The sio queue won't be updated by + * other threads since we're in syncing context so + * we can be sure that our trees will remain exactly + * as we left them. + */ + mutex_exit(q_lock); + suspended = scan_io_queue_issue(queue, &sio_list); + mutex_enter(q_lock); + + if (suspended) + break; + } + /* update statistics for debugging purposes */ + scan_io_queues_update_seg_stats(queue, seg_start, seg_end); + + if (suspended) + break; + } + + + /* If we were suspended in the middle of processing, + * requeue any unfinished sios and exit. 
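+ *
+ * Requeued sios go back through scan_io_queue_insert_impl(), which
+ * restores both the q_sios_by_addr entry and the extent in
+ * q_exts_by_addr, so a later run resumes exactly where we stopped.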
+ */
+ while ((sio = list_head(&sio_list)) != NULL) {
+ list_remove(&sio_list, sio);
+ scan_io_queue_insert_impl(queue, sio);
+ }
+
+ mutex_exit(q_lock);
+ list_destroy(&sio_list);
+}
+
+/*
+ * Performs an emptying run on all scan queues in the pool. This just
+ * punches out one thread per top-level vdev, each of which processes
+ * only that vdev's scan queue. We can parallelize the I/O here because
+ * we know that each queue's io's only affect its own top-level vdev.
+ *
+ * This function waits for the queue runs to complete, and must be
+ * called from dsl_scan_sync (or in general, syncing context).
+ */
+static void
+scan_io_queues_run(dsl_scan_t *scn)
+{
+ spa_t *spa = scn->scn_dp->dp_spa;
+
+ ASSERT(scn->scn_is_sorted);
+ ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
+
+ if (scn->scn_bytes_pending == 0)
+ return;
+
+ if (scn->scn_taskq == NULL) {
+ char *tq_name = kmem_zalloc(ZFS_MAX_DATASET_NAME_LEN + 16,
+ KM_SLEEP);
+ int nthreads = spa->spa_root_vdev->vdev_children;
+
+ /*
+ * We need to make this taskq *always* execute as many
+ * threads in parallel as we have top-level vdevs and no
+ * less, otherwise strange serialization of the calls to
+ * scan_io_queues_run_one can occur during spa_sync runs
+ * and that significantly impacts performance.
+ */
+ (void) snprintf(tq_name, ZFS_MAX_DATASET_NAME_LEN + 16,
+ "dsl_scan_tq_%s", spa->spa_name);
+ scn->scn_taskq = taskq_create(tq_name, nthreads, minclsyspri,
+ nthreads, nthreads, TASKQ_PREPOPULATE);
+ kmem_free(tq_name, ZFS_MAX_DATASET_NAME_LEN + 16);
+ }
+
+ for (uint64_t i = 0; i < spa->spa_root_vdev->vdev_children; i++) {
+ vdev_t *vd = spa->spa_root_vdev->vdev_child[i];
+
+ mutex_enter(&vd->vdev_scan_io_queue_lock);
+ if (vd->vdev_scan_io_queue != NULL) {
+ VERIFY(taskq_dispatch(scn->scn_taskq,
+ scan_io_queues_run_one, vd->vdev_scan_io_queue,
+ TQ_SLEEP) != TASKQID_INVALID);
+ }
+ mutex_exit(&vd->vdev_scan_io_queue_lock);
+ }
+
+ /*
+ * Wait for the queues to finish issuing their IOs for this run
+ * before we return. There may still be IOs in flight at this
+ * point.
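+ *
+ * Note that taskq_wait() only guarantees that every dispatched
+ * scan_io_queues_run_one() call has returned; the zios they issued
+ * are reaped later by the zio_wait(scn->scn_zio_root) in
+ * dsl_scan_sync().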
+ */ + taskq_wait(scn->scn_taskq); +} + +static boolean_t +dsl_scan_async_block_should_pause(dsl_scan_t *scn) +{ + uint64_t elapsed_nanosecs; + + if (zfs_recover) + return (B_FALSE); + + if (scn->scn_visited_this_txg >= zfs_async_block_max_blocks) + return (B_TRUE); + + elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time; + return (elapsed_nanosecs / NANOSEC > zfs_txg_timeout || + (NSEC2MSEC(elapsed_nanosecs) > scn->scn_async_block_min_time_ms && + txg_sync_waiting(scn->scn_dp)) || + spa_shutting_down(scn->scn_dp->dp_spa)); +} + +static int +dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +{ + dsl_scan_t *scn = arg; + + if (!scn->scn_is_bptree || + (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)) { + if (dsl_scan_async_block_should_pause(scn)) + return (SET_ERROR(ERESTART)); + } + + zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa, + dmu_tx_get_txg(tx), bp, BP_GET_PSIZE(bp), 0)); + dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD, + -bp_get_dsize_sync(scn->scn_dp->dp_spa, bp), + -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx); + scn->scn_visited_this_txg++; + return (0); +} + +static void +dsl_scan_update_stats(dsl_scan_t *scn) +{ + spa_t *spa = scn->scn_dp->dp_spa; + uint64_t i; + uint64_t seg_size_total = 0, zio_size_total = 0; + uint64_t seg_count_total = 0, zio_count_total = 0; + + for (i = 0; i < spa->spa_root_vdev->vdev_children; i++) { + vdev_t *vd = spa->spa_root_vdev->vdev_child[i]; + dsl_scan_io_queue_t *queue = vd->vdev_scan_io_queue; + + if (queue == NULL) + continue; + + seg_size_total += queue->q_total_seg_size_this_txg; + zio_size_total += queue->q_total_zio_size_this_txg; + seg_count_total += queue->q_segs_this_txg; + zio_count_total += queue->q_zios_this_txg; + } + + if (seg_count_total == 0 || zio_count_total == 0) { + scn->scn_avg_seg_size_this_txg = 0; + scn->scn_avg_zio_size_this_txg = 0; + scn->scn_segs_this_txg = 0; + scn->scn_zios_this_txg = 0; + return; + } + + scn->scn_avg_seg_size_this_txg = seg_size_total / seg_count_total; + scn->scn_avg_zio_size_this_txg = zio_size_total / zio_count_total; + scn->scn_segs_this_txg = seg_count_total; + scn->scn_zios_this_txg = zio_count_total; +} + +static int +dsl_scan_obsolete_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +{ + dsl_scan_t *scn = arg; + const dva_t *dva = &bp->blk_dva[0]; + + if (dsl_scan_async_block_should_pause(scn)) + return (SET_ERROR(ERESTART)); + + spa_vdev_indirect_mark_obsolete(scn->scn_dp->dp_spa, + DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), + DVA_GET_ASIZE(dva), tx); + scn->scn_visited_this_txg++; + return (0); +} + +boolean_t +dsl_scan_active(dsl_scan_t *scn) +{ + spa_t *spa = scn->scn_dp->dp_spa; + uint64_t used = 0, comp, uncomp; + + if (spa->spa_load_state != SPA_LOAD_NONE) + return (B_FALSE); + if (spa_shutting_down(spa)) + return (B_FALSE); + if ((dsl_scan_is_running(scn) && !dsl_scan_is_paused_scrub(scn)) || + (scn->scn_async_destroying && !scn->scn_async_stalled)) + return (B_TRUE); + + if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) { + (void) bpobj_space(&scn->scn_dp->dp_free_bpobj, + &used, &comp, &uncomp); + } + return (used != 0); +} + +static boolean_t +dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, size_t psize, + uint64_t phys_birth) +{ + vdev_t *vd; + + vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva)); + + if (vd->vdev_ops == &vdev_indirect_ops) { + /* + * The indirect vdev can point to multiple + * vdevs. For simplicity, always create + * the resilver zio_t. 
zio_vdev_io_start() + * will bypass the child resilver i/o's if + * they are on vdevs that don't have DTL's. + */ + return (B_TRUE); + } + + if (DVA_GET_GANG(dva)) { + /* + * Gang members may be spread across multiple + * vdevs, so the best estimate we have is the + * scrub range, which has already been checked. + * XXX -- it would be better to change our + * allocation policy to ensure that all + * gang members reside on the same vdev. + */ + return (B_TRUE); + } + + /* + * Check if the txg falls within the range which must be + * resilvered. DVAs outside this range can always be skipped. + */ + if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1)) + return (B_FALSE); + + /* + * Check if the top-level vdev must resilver this offset. + * When the offset does not intersect with a dirty leaf DTL + * then it may be possible to skip the resilver IO. The psize + * is provided instead of asize to simplify the check for RAIDZ. + */ + if (!vdev_dtl_need_resilver(vd, DVA_GET_OFFSET(dva), psize)) + return (B_FALSE); + + return (B_TRUE); +} + +static int +dsl_process_async_destroys(dsl_pool_t *dp, dmu_tx_t *tx) +{ + int err = 0; + dsl_scan_t *scn = dp->dp_scan; + spa_t *spa = dp->dp_spa; + + if (spa_suspend_async_destroy(spa)) + return (0); + + if (zfs_free_bpobj_enabled && + spa_version(spa) >= SPA_VERSION_DEADLISTS) { + scn->scn_is_bptree = B_FALSE; + scn->scn_async_block_min_time_ms = zfs_free_min_time_ms; + scn->scn_zio_root = zio_root(spa, NULL, + NULL, ZIO_FLAG_MUSTSUCCEED); + err = bpobj_iterate(&dp->dp_free_bpobj, + dsl_scan_free_block_cb, scn, tx); + VERIFY0(zio_wait(scn->scn_zio_root)); + scn->scn_zio_root = NULL; + + if (err != 0 && err != ERESTART) + zfs_panic_recover("error %u from bpobj_iterate()", err); + } + + if (err == 0 && spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) { + ASSERT(scn->scn_async_destroying); + scn->scn_is_bptree = B_TRUE; + scn->scn_zio_root = zio_root(spa, NULL, + NULL, ZIO_FLAG_MUSTSUCCEED); + err = bptree_iterate(dp->dp_meta_objset, + dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb, scn, tx); + VERIFY0(zio_wait(scn->scn_zio_root)); + scn->scn_zio_root = NULL; + + if (err == EIO || err == ECKSUM) { + err = 0; + } else if (err != 0 && err != ERESTART) { + zfs_panic_recover("error %u from " + "traverse_dataset_destroyed()", err); + } + + if (bptree_is_empty(dp->dp_meta_objset, dp->dp_bptree_obj)) { + /* finished; deactivate async destroy feature */ + spa_feature_decr(spa, SPA_FEATURE_ASYNC_DESTROY, tx); + ASSERT(!spa_feature_is_active(spa, + SPA_FEATURE_ASYNC_DESTROY)); + VERIFY0(zap_remove(dp->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_BPTREE_OBJ, tx)); + VERIFY0(bptree_free(dp->dp_meta_objset, + dp->dp_bptree_obj, tx)); + dp->dp_bptree_obj = 0; + scn->scn_async_destroying = B_FALSE; + scn->scn_async_stalled = B_FALSE; + } else { + /* + * If we didn't make progress, mark the async + * destroy as stalled, so that we will not initiate + * a spa_sync() on its behalf. Note that we only + * check this if we are not finished, because if the + * bptree had no blocks for us to visit, we can + * finish without "making progress". 
+ */
+ scn->scn_async_stalled =
+ (scn->scn_visited_this_txg == 0);
+ }
+ }
+ if (scn->scn_visited_this_txg) {
+ zfs_dbgmsg("freed %llu blocks in %llums from "
+ "free_bpobj/bptree txg %llu; err=%d",
+ (longlong_t)scn->scn_visited_this_txg,
+ (longlong_t)
+ NSEC2MSEC(gethrtime() - scn->scn_sync_start_time),
+ (longlong_t)tx->tx_txg, err);
+ scn->scn_visited_this_txg = 0;
+
+ /*
+ * Write out changes to the DDT that may be required as a
+ * result of the blocks freed. This ensures that the DDT
+ * is clean when a scrub/resilver runs.
+ */
+ ddt_sync(spa, tx->tx_txg);
+ }
+ if (err != 0)
+ return (err);
+ if (dp->dp_free_dir != NULL && !scn->scn_async_destroying &&
+ zfs_free_leak_on_eio &&
+ (dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes != 0 ||
+ dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes != 0 ||
+ dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes != 0)) {
+ /*
+ * We have finished background destroying, but there is still
+ * some space left in the dp_free_dir. Transfer this leaked
+ * space to the dp_leak_dir.
+ */
+ if (dp->dp_leak_dir == NULL) {
+ rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
+ (void) dsl_dir_create_sync(dp, dp->dp_root_dir,
+ LEAK_DIR_NAME, tx);
+ VERIFY0(dsl_pool_open_special_dir(dp,
+ LEAK_DIR_NAME, &dp->dp_leak_dir));
+ rrw_exit(&dp->dp_config_rwlock, FTAG);
+ }
+ dsl_dir_diduse_space(dp->dp_leak_dir, DD_USED_HEAD,
+ dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes,
+ dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes,
+ dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx);
+ dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
+ -dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes,
+ -dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes,
+ -dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx);
+ }
+
+ if (dp->dp_free_dir != NULL && !scn->scn_async_destroying) {
+ /* finished; verify that space accounting went to zero */
+ ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes);
+ ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes);
+ ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes);
+ }
+
+ EQUIV(bpobj_is_open(&dp->dp_obsolete_bpobj),
+ 0 == zap_contains(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_OBSOLETE_BPOBJ));
+ if (err == 0 && bpobj_is_open(&dp->dp_obsolete_bpobj)) {
+ ASSERT(spa_feature_is_active(dp->dp_spa,
+ SPA_FEATURE_OBSOLETE_COUNTS));
+
+ scn->scn_is_bptree = B_FALSE;
+ scn->scn_async_block_min_time_ms = zfs_obsolete_min_time_ms;
+ err = bpobj_iterate(&dp->dp_obsolete_bpobj,
+ dsl_scan_obsolete_block_cb, scn, tx);
+ if (err != 0 && err != ERESTART)
+ zfs_panic_recover("error %u from bpobj_iterate()", err);
+
+ if (bpobj_is_empty(&dp->dp_obsolete_bpobj))
+ dsl_pool_destroy_obsolete_bpobj(dp, tx);
+ }
+
+ return (0);
+}
+
+/*
+ * This is the primary entry point for scans that is called from syncing
+ * context. Scans must happen entirely during syncing context so that we
+ * can guarantee that blocks we are currently scanning will not change out
+ * from under us. While a scan is active, this function controls how quickly
+ * transaction groups proceed, instead of the normal handling provided by
+ * txg_sync_thread().
+ */
+void
+dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
+{
+ dsl_scan_t *scn = dp->dp_scan;
+ spa_t *spa = dp->dp_spa;
+ int err = 0;
+ state_sync_type_t sync_type = SYNC_OPTIONAL;
+
+ /*
+ * Check for scn_restart_txg before checking spa_load_state, so
+ * that we can restart an old-style scan while the pool is being
+ * imported (see dsl_scan_init).
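+ *
+ * The restart path, roughly: anything that sets scn_restart_txg
+ * causes this function to tear down the current scan state via
+ * dsl_scan_done(B_FALSE) and immediately start a fresh scrub (or a
+ * resilver, if vdev_resilver_needed()) through dsl_scan_setup_sync().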
+ */
+ if (dsl_scan_restarting(scn, tx)) {
+ pool_scan_func_t func = POOL_SCAN_SCRUB;
+ dsl_scan_done(scn, B_FALSE, tx);
+ if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL))
+ func = POOL_SCAN_RESILVER;
+ zfs_dbgmsg("restarting scan func=%u txg=%llu",
+ func, (longlong_t)tx->tx_txg);
+ dsl_scan_setup_sync(&func, tx);
+ }
+
+ /*
+ * Only process scans in sync pass 1.
+ */
+ if (spa_sync_pass(dp->dp_spa) > 1)
+ return;
+
+ /*
+ * If the spa is shutting down, then stop scanning. This will
+ * ensure that the scan does not dirty any new data during the
+ * shutdown phase.
+ */
+ if (spa_shutting_down(spa))
+ return;
+
+ /*
+ * If the scan is inactive due to a stalled async destroy, try again.
+ */
+ if (!scn->scn_async_stalled && !dsl_scan_active(scn))
+ return;
+
+ /* reset scan statistics */
+ scn->scn_visited_this_txg = 0;
+ scn->scn_holes_this_txg = 0;
+ scn->scn_lt_min_this_txg = 0;
+ scn->scn_gt_max_this_txg = 0;
+ scn->scn_ddt_contained_this_txg = 0;
+ scn->scn_objsets_visited_this_txg = 0;
+ scn->scn_avg_seg_size_this_txg = 0;
+ scn->scn_segs_this_txg = 0;
+ scn->scn_avg_zio_size_this_txg = 0;
+ scn->scn_zios_this_txg = 0;
+ scn->scn_suspending = B_FALSE;
+ scn->scn_sync_start_time = gethrtime();
+ spa->spa_scrub_active = B_TRUE;
+
+ /*
+ * First process the async destroys. If we pause, don't do
+ * any scrubbing or resilvering. This ensures that there are no
+ * async destroys while we are scanning, so the scan code doesn't
+ * have to worry about traversing it. It is also faster to free the
+ * blocks than to scrub them.
+ */
+ err = dsl_process_async_destroys(dp, tx);
+ if (err != 0)
+ return;
+
+ if (!dsl_scan_is_running(scn) || dsl_scan_is_paused_scrub(scn))
+ return;
+
+ /*
+ * Wait a few txgs after importing to begin scanning so that
+ * we can get the pool imported quickly.
+ */
+ if (spa->spa_syncing_txg < spa->spa_first_txg + SCAN_IMPORT_WAIT_TXGS)
+ return;
+
+ /*
+ * It is possible to switch from unsorted to sorted at any time,
+ * but afterwards the scan will remain sorted unless reloaded from
+ * a checkpoint after a reboot.
+ */
+ if (!zfs_scan_legacy) {
+ scn->scn_is_sorted = B_TRUE;
+ if (scn->scn_last_checkpoint == 0)
+ scn->scn_last_checkpoint = ddi_get_lbolt();
+ }
+
+ /*
+ * For sorted scans, determine what kind of work we will be doing
+ * this txg based on our memory limitations and whether or not we
+ * need to perform a checkpoint.
+ */
+ if (scn->scn_is_sorted) {
+ /*
+ * If we are over our checkpoint interval, set scn_clearing
+ * so that we can begin checkpointing immediately. The
+ * checkpoint allows us to save a consistent bookmark
+ * representing how much data we have scrubbed so far.
+ * Otherwise, use the memory limit to determine if we should
+ * scan for metadata or start issuing scrub IOs. We accumulate
+ * metadata until we hit our hard memory limit at which point
+ * we issue scrub IOs until we are at our soft memory limit.
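+ *
+ * As a worked example (interval value hypothetical): with a
+ * checkpoint interval of 7200s, any txg that syncs more than
+ * two hours after scn_last_checkpoint forces scn_checkpointing
+ * and scn_clearing on; otherwise dsl_scan_should_clear() turns
+ * scn_clearing on once metadata crosses the hard limit and off
+ * again once we drop back under the soft limit.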
+ */ + if (scn->scn_checkpointing || + ddi_get_lbolt() - scn->scn_last_checkpoint > + SEC_TO_TICK(zfs_scan_checkpoint_intval)) { + if (!scn->scn_checkpointing) + zfs_dbgmsg("begin scan checkpoint"); + + scn->scn_checkpointing = B_TRUE; + scn->scn_clearing = B_TRUE; + } else { + boolean_t should_clear = dsl_scan_should_clear(scn); + if (should_clear && !scn->scn_clearing) { + zfs_dbgmsg("begin scan clearing"); + scn->scn_clearing = B_TRUE; + } else if (!should_clear && scn->scn_clearing) { + zfs_dbgmsg("finish scan clearing"); + scn->scn_clearing = B_FALSE; + } + } + } else { + ASSERT0(scn->scn_checkpointing); + ASSERT0(scn->scn_clearing); + } + + if (!scn->scn_clearing && scn->scn_done_txg == 0) { + /* Need to scan metadata for more blocks to scrub */ + dsl_scan_phys_t *scnp = &scn->scn_phys; + taskqid_t prefetch_tqid; + uint64_t bytes_per_leaf = zfs_scan_vdev_limit; + uint64_t nr_leaves = dsl_scan_count_leaves(spa->spa_root_vdev); + + /* + * Calculate the max number of in-flight bytes for pool-wide + * scanning operations (minimum 1MB). Limits for the issuing + * phase are done per top-level vdev and are handled separately. + */ + scn->scn_maxinflight_bytes = + MAX(nr_leaves * bytes_per_leaf, 1ULL << 20); + + if (scnp->scn_ddt_bookmark.ddb_class <= + scnp->scn_ddt_class_max) { + ASSERT(ZB_IS_ZERO(&scnp->scn_bookmark)); + zfs_dbgmsg("doing scan sync txg %llu; " + "ddt bm=%llu/%llu/%llu/%llx", + (longlong_t)tx->tx_txg, + (longlong_t)scnp->scn_ddt_bookmark.ddb_class, + (longlong_t)scnp->scn_ddt_bookmark.ddb_type, + (longlong_t)scnp->scn_ddt_bookmark.ddb_checksum, + (longlong_t)scnp->scn_ddt_bookmark.ddb_cursor); + } else { + zfs_dbgmsg("doing scan sync txg %llu; " + "bm=%llu/%llu/%llu/%llu", + (longlong_t)tx->tx_txg, + (longlong_t)scnp->scn_bookmark.zb_objset, + (longlong_t)scnp->scn_bookmark.zb_object, + (longlong_t)scnp->scn_bookmark.zb_level, + (longlong_t)scnp->scn_bookmark.zb_blkid); + } + + scn->scn_zio_root = zio_root(dp->dp_spa, NULL, + NULL, ZIO_FLAG_CANFAIL); + + scn->scn_prefetch_stop = B_FALSE; + prefetch_tqid = taskq_dispatch(dp->dp_sync_taskq, + dsl_scan_prefetch_thread, scn, TQ_SLEEP); + ASSERT(prefetch_tqid != TASKQID_INVALID); + + dsl_pool_config_enter(dp, FTAG); + dsl_scan_visit(scn, tx); + dsl_pool_config_exit(dp, FTAG); + + mutex_enter(&dp->dp_spa->spa_scrub_lock); + scn->scn_prefetch_stop = B_TRUE; + cv_broadcast(&spa->spa_scrub_io_cv); + mutex_exit(&dp->dp_spa->spa_scrub_lock); + + taskq_wait_id(dp->dp_sync_taskq, prefetch_tqid); + (void) zio_wait(scn->scn_zio_root); + scn->scn_zio_root = NULL; + + zfs_dbgmsg("scan visited %llu blocks in %llums " + "(%llu os's, %llu holes, %llu < mintxg, " + "%llu in ddt, %llu > maxtxg)", + (longlong_t)scn->scn_visited_this_txg, + (longlong_t)NSEC2MSEC(gethrtime() - + scn->scn_sync_start_time), + (longlong_t)scn->scn_objsets_visited_this_txg, + (longlong_t)scn->scn_holes_this_txg, + (longlong_t)scn->scn_lt_min_this_txg, + (longlong_t)scn->scn_ddt_contained_this_txg, + (longlong_t)scn->scn_gt_max_this_txg); + + if (!scn->scn_suspending) { + ASSERT0(avl_numnodes(&scn->scn_queue)); + scn->scn_done_txg = tx->tx_txg + 1; + if (scn->scn_is_sorted) { + scn->scn_checkpointing = B_TRUE; + scn->scn_clearing = B_TRUE; + } + zfs_dbgmsg("scan complete txg %llu", + (longlong_t)tx->tx_txg); + } + } else if (scn->scn_is_sorted && scn->scn_bytes_pending != 0) { + /* need to issue scrubbing IOs from per-vdev queues */ + scn->scn_zio_root = zio_root(dp->dp_spa, NULL, + NULL, ZIO_FLAG_CANFAIL); + scan_io_queues_run(scn); + (void) 
zio_wait(scn->scn_zio_root); + scn->scn_zio_root = NULL; + + /* calculate and dprintf the current memory usage */ + (void) dsl_scan_should_clear(scn); + dsl_scan_update_stats(scn); + + zfs_dbgmsg("scrubbed %llu blocks (%llu segs) in %llums " + "(avg_block_size = %llu, avg_seg_size = %llu)", + (longlong_t)scn->scn_zios_this_txg, + (longlong_t)scn->scn_segs_this_txg, + (longlong_t)NSEC2MSEC(gethrtime() - + scn->scn_sync_start_time), + (longlong_t)scn->scn_avg_zio_size_this_txg, + (longlong_t)scn->scn_avg_seg_size_this_txg); + } else if (scn->scn_done_txg != 0 && scn->scn_done_txg <= tx->tx_txg) { + /* Finished with everything. Mark the scrub as complete */ + zfs_dbgmsg("scan issuing complete txg %llu", + (longlong_t)tx->tx_txg); + ASSERT3U(scn->scn_done_txg, !=, 0); + ASSERT0(spa->spa_scrub_inflight); + ASSERT0(scn->scn_bytes_pending); + dsl_scan_done(scn, B_TRUE, tx); + sync_type = SYNC_MANDATORY; + } + + dsl_scan_sync_state(scn, tx, sync_type); +} + +static void +count_block(dsl_scan_t *scn, zfs_all_blkstats_t *zab, const blkptr_t *bp) +{ + int i; + + /* update the spa's stats on how many bytes we have issued */ + for (i = 0; i < BP_GET_NDVAS(bp); i++) { + atomic_add_64(&scn->scn_dp->dp_spa->spa_scan_pass_issued, + DVA_GET_ASIZE(&bp->blk_dva[i])); + } + + /* + * If we resume after a reboot, zab will be NULL; don't record + * incomplete stats in that case. + */ + if (zab == NULL) + return; + + mutex_enter(&zab->zab_lock); + + for (i = 0; i < 4; i++) { + int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS; + int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL; + if (t & DMU_OT_NEWTYPE) + t = DMU_OT_OTHER; + zfs_blkstat_t *zb = &zab->zab_type[l][t]; + int equal; + + zb->zb_count++; + zb->zb_asize += BP_GET_ASIZE(bp); + zb->zb_lsize += BP_GET_LSIZE(bp); + zb->zb_psize += BP_GET_PSIZE(bp); + zb->zb_gangs += BP_COUNT_GANG(bp); + + switch (BP_GET_NDVAS(bp)) { + case 2: + if (DVA_GET_VDEV(&bp->blk_dva[0]) == + DVA_GET_VDEV(&bp->blk_dva[1])) + zb->zb_ditto_2_of_2_samevdev++; + break; + case 3: + equal = (DVA_GET_VDEV(&bp->blk_dva[0]) == + DVA_GET_VDEV(&bp->blk_dva[1])) + + (DVA_GET_VDEV(&bp->blk_dva[0]) == + DVA_GET_VDEV(&bp->blk_dva[2])) + + (DVA_GET_VDEV(&bp->blk_dva[1]) == + DVA_GET_VDEV(&bp->blk_dva[2])); + if (equal == 1) + zb->zb_ditto_2_of_3_samevdev++; + else if (equal == 3) + zb->zb_ditto_3_of_3_samevdev++; + break; + } + } + + mutex_exit(&zab->zab_lock); +} + +static void +scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue, scan_io_t *sio) +{ + avl_index_t idx; + int64_t asize = sio->sio_asize; + dsl_scan_t *scn = queue->q_scn; + + ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); + + if (avl_find(&queue->q_sios_by_addr, sio, &idx) != NULL) { + /* block is already scheduled for reading */ + atomic_add_64(&scn->scn_bytes_pending, -asize); + kmem_free(sio, sizeof (*sio)); + return; + } + avl_insert(&queue->q_sios_by_addr, sio, idx); + range_tree_add(queue->q_exts_by_addr, sio->sio_offset, asize); +} + +/* + * Given all the info we got from our metadata scanning process, we + * construct a scan_io_t and insert it into the scan sorting queue. The + * I/O must already be suitable for us to process. This is controlled + * by dsl_scan_enqueue(). 
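[Editor's aside on count_block() earlier in this hunk: for a three-DVA block, `equal` sums three pairwise vdev comparisons, so its value can only be 0 (all DVAs on different vdevs), 1 (exactly two share a vdev), or 3 (all three on one vdev). A value of 2 is impossible because equality of vdev IDs is transitive, which is why only the `== 1` and `== 3` cases feed the ditto counters. A standalone check of that claim:]

    /* Standalone sanity check (not ZFS code): the pairwise sum is never 2. */
    #include <assert.h>
    #include <stdio.h>

    int
    main(void)
    {
        for (int a = 0; a < 3; a++)
            for (int b = 0; b < 3; b++)
                for (int c = 0; c < 3; c++) {
                    int equal = (a == b) + (a == c) + (b == c);
                    assert(equal == 0 || equal == 1 || equal == 3);
                }
        (void) printf("equal is always 0, 1, or 3\n");
        return (0);
    }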
+ */ +static void +scan_io_queue_insert(dsl_scan_io_queue_t *queue, const blkptr_t *bp, int dva_i, + int zio_flags, const zbookmark_phys_t *zb) +{ + dsl_scan_t *scn = queue->q_scn; + scan_io_t *sio = kmem_zalloc(sizeof (*sio), KM_SLEEP); + + ASSERT0(BP_IS_GANG(bp)); + ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); + + bp2sio(bp, sio, dva_i); + sio->sio_flags = zio_flags; + sio->sio_zb = *zb; + + /* + * Increment the bytes pending counter now so that we can't + * get an integer underflow in case the worker processes the + * zio before we get to incrementing this counter. + */ + atomic_add_64(&scn->scn_bytes_pending, sio->sio_asize); + + scan_io_queue_insert_impl(queue, sio); +} + +/* + * Given a set of I/O parameters as discovered by the metadata traversal + * process, attempts to place the I/O into the sorted queues (if allowed), + * or immediately executes the I/O. + */ +static void +dsl_scan_enqueue(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags, + const zbookmark_phys_t *zb) +{ + spa_t *spa = dp->dp_spa; + + ASSERT(!BP_IS_EMBEDDED(bp)); + + /* + * Gang blocks are hard to issue sequentially, so we just issue them + * here immediately instead of queuing them. + */ + if (!dp->dp_scan->scn_is_sorted || BP_IS_GANG(bp)) { + scan_exec_io(dp, bp, zio_flags, zb, NULL); + return; + } + for (int i = 0; i < BP_GET_NDVAS(bp); i++) { + dva_t dva; + vdev_t *vdev; + + dva = bp->blk_dva[i]; + vdev = vdev_lookup_top(spa, DVA_GET_VDEV(&dva)); + ASSERT(vdev != NULL); + + mutex_enter(&vdev->vdev_scan_io_queue_lock); + if (vdev->vdev_scan_io_queue == NULL) + vdev->vdev_scan_io_queue = scan_io_queue_create(vdev); + ASSERT(dp->dp_scan != NULL); + scan_io_queue_insert(vdev->vdev_scan_io_queue, bp, + i, zio_flags, zb); + mutex_exit(&vdev->vdev_scan_io_queue_lock); + } +} + +static int +dsl_scan_scrub_cb(dsl_pool_t *dp, + const blkptr_t *bp, const zbookmark_phys_t *zb) +{ + dsl_scan_t *scn = dp->dp_scan; + spa_t *spa = dp->dp_spa; + uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp); + size_t psize = BP_GET_PSIZE(bp); + boolean_t needs_io; + int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL; + int d; + + if (phys_birth <= scn->scn_phys.scn_min_txg || + phys_birth >= scn->scn_phys.scn_max_txg) { + count_block(scn, dp->dp_blkstats, bp); + return (0); + } + + /* Embedded BP's have phys_birth==0, so we reject them above. */ + ASSERT(!BP_IS_EMBEDDED(bp)); + + ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn)); + if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) { + zio_flags |= ZIO_FLAG_SCRUB; + needs_io = B_TRUE; + } else { + ASSERT3U(scn->scn_phys.scn_func, ==, POOL_SCAN_RESILVER); + zio_flags |= ZIO_FLAG_RESILVER; + needs_io = B_FALSE; + } + + /* If it's an intent log block, failure is expected. */ + if (zb->zb_level == ZB_ZIL_LEVEL) + zio_flags |= ZIO_FLAG_SPECULATIVE; + + for (d = 0; d < BP_GET_NDVAS(bp); d++) { + const dva_t *dva = &bp->blk_dva[d]; + + /* + * Keep track of how much data we've examined so that + * zpool(1M) status can make useful progress reports. 
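[Editor's aside: the examined counters named in the comment above grow by the allocated size of every DVA the scan has looked at, whether or not a read is ultimately issued, and userland derives its progress figure from them. A hypothetical sketch of that computation; names are invented, and the real zpool status report also folds in pass timing and issue rate:]

    /* Illustrative only: percent-done from examined vs. total-to-examine. */
    #include <stdint.h>
    #include <stdio.h>

    static void
    print_scan_progress(uint64_t examined, uint64_t to_examine)
    {
        if (to_examine == 0)
            return;
        /*
         * Scale by 10000 for two decimal places without floating point;
         * assumes examined * 10000 does not overflow 64 bits.
         */
        uint64_t pct100 = examined * 10000 / to_examine;
        (void) printf("scanned %ju of %ju bytes (%ju.%02ju%%)\n",
            (uintmax_t)examined, (uintmax_t)to_examine,
            (uintmax_t)(pct100 / 100), (uintmax_t)(pct100 % 100));
    }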
+ */
+ scn->scn_phys.scn_examined += DVA_GET_ASIZE(dva);
+ spa->spa_scan_pass_exam += DVA_GET_ASIZE(dva);
+
+ /* if it's a resilver, this may not be in the target range */
+ if (!needs_io)
+ needs_io = dsl_scan_need_resilver(spa, dva, psize,
+ phys_birth);
+ }
+
+ if (needs_io && !zfs_no_scrub_io) {
+ dsl_scan_enqueue(dp, bp, zio_flags, zb);
+ } else {
+ count_block(scn, dp->dp_blkstats, bp);
+ }
+
+ /* do not relocate this block */
+ return (0);
+}
+
+static void
+dsl_scan_scrub_done(zio_t *zio)
+{
+ spa_t *spa = zio->io_spa;
+ blkptr_t *bp = zio->io_bp;
+ dsl_scan_io_queue_t *queue = zio->io_private;
+
+ abd_free(zio->io_abd);
+
+ if (queue == NULL) {
+ mutex_enter(&spa->spa_scrub_lock);
+ ASSERT3U(spa->spa_scrub_inflight, >=, BP_GET_PSIZE(bp));
+ spa->spa_scrub_inflight -= BP_GET_PSIZE(bp);
+ cv_broadcast(&spa->spa_scrub_io_cv);
+ mutex_exit(&spa->spa_scrub_lock);
+ } else {
+ mutex_enter(&queue->q_vd->vdev_scan_io_queue_lock);
+ ASSERT3U(queue->q_inflight_bytes, >=, BP_GET_PSIZE(bp));
+ queue->q_inflight_bytes -= BP_GET_PSIZE(bp);
+ cv_broadcast(&queue->q_zio_cv);
+ mutex_exit(&queue->q_vd->vdev_scan_io_queue_lock);
+ }
+
+ if (zio->io_error && (zio->io_error != ECKSUM ||
+ !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) {
+ atomic_inc_64(&spa->spa_dsl_pool->dp_scan->scn_phys.scn_errors);
+ }
+}
+
+/*
+ * Given a scanning zio's information, executes the zio. The zio need
+ * not be sortable; this function simply executes the zio, no matter
+ * what it is. The optional queue argument allows the caller to specify
+ * that they want per top level vdev IO rate limiting instead of the
+ * legacy global limiting.
+ */
+static void
+scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags,
+ const zbookmark_phys_t *zb, dsl_scan_io_queue_t *queue)
+{
+ spa_t *spa = dp->dp_spa;
+ dsl_scan_t *scn = dp->dp_scan;
+ size_t size = BP_GET_PSIZE(bp);
+ abd_t *data = abd_alloc_for_io(size, B_FALSE);
+ unsigned int scan_delay = 0;
+
+ if (queue == NULL) {
+ mutex_enter(&spa->spa_scrub_lock);
+ while (spa->spa_scrub_inflight >= scn->scn_maxinflight_bytes)
+ cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
+ spa->spa_scrub_inflight += BP_GET_PSIZE(bp);
+ mutex_exit(&spa->spa_scrub_lock);
+ } else {
+ kmutex_t *q_lock = &queue->q_vd->vdev_scan_io_queue_lock;
+
+ mutex_enter(q_lock);
+ while (queue->q_inflight_bytes >= queue->q_maxinflight_bytes)
+ cv_wait(&queue->q_zio_cv, q_lock);
+ queue->q_inflight_bytes += BP_GET_PSIZE(bp);
+ mutex_exit(q_lock);
+ }
+
+ if (zio_flags & ZIO_FLAG_RESILVER)
+ scan_delay = zfs_resilver_delay;
+ else {
+ ASSERT(zio_flags & ZIO_FLAG_SCRUB);
+ scan_delay = zfs_scrub_delay;
+ }
+
+ if (scan_delay && (ddi_get_lbolt64() - spa->spa_last_io <= zfs_scan_idle))
+ delay(MAX((int)scan_delay, 0));
+
+ count_block(dp->dp_scan, dp->dp_blkstats, bp);
+ zio_nowait(zio_read(dp->dp_scan->scn_zio_root, spa, bp, data, size,
+ dsl_scan_scrub_done, queue, ZIO_PRIORITY_SCRUB, zio_flags, zb));
+}
+
+/*
+ * This is the primary extent sorting algorithm. We balance two parameters:
+ * 1) how many bytes of I/O are in an extent
+ * 2) how well the extent is filled with I/O (as a fraction of its total size)
+ * Since we allow extents to have gaps between their constituent I/Os, it's
+ * possible to have a fairly large extent that contains the same amount of
+ * I/O bytes as a much smaller extent, which just packs the I/O more tightly.
+ * The algorithm sorts based on a score calculated from the extent's size,
+ * the relative fill volume (in %) and a "fill weight" parameter that controls
+ * the split between whether we prefer larger extents or more well populated
+ * extents:
+ *
+ * SCORE = FILL_IN_BYTES + (FILL_IN_PERCENT * FILL_IN_BYTES * FILL_WEIGHT)
+ *
+ * Example:
+ * 1) assume extsz = 64 MiB
+ * 2) assume fill = 32 MiB (extent is half full)
+ * 3) assume fill_weight = 3
+ * 4) SCORE = 32M + (((32M * 100) / 64M) * 3 * 32M) / 100
+ * SCORE = 32M + (50 * 3 * 32M) / 100
+ * SCORE = 32M + (4800M / 100)
+ * SCORE = 32M + 48M
+ * ^ ^
+ * | +--- final total relative fill-based score
+ * +--------- final total fill-based score
+ * SCORE = 80M
+ *
+ * As can be seen, at fill_weight = 3, the algorithm is slightly biased towards
+ * extents that are more completely filled (in a 3:2 ratio) vs just larger.
+ * Note that as an optimization, we replace multiplication and division by
+ * 100 with bitshifting by 7 (which effectively multiplies and divides by 128).
+ */
+static int
+ext_size_compare(const void *x, const void *y)
+{
+ const range_seg_t *rsa = x, *rsb = y;
+ uint64_t sa = rsa->rs_end - rsa->rs_start,
+ sb = rsb->rs_end - rsb->rs_start;
+ uint64_t score_a, score_b;
+
+ score_a = rsa->rs_fill + ((((rsa->rs_fill << 7) / sa) *
+ fill_weight * rsa->rs_fill) >> 7);
+ score_b = rsb->rs_fill + ((((rsb->rs_fill << 7) / sb) *
+ fill_weight * rsb->rs_fill) >> 7);
+
+ if (score_a > score_b)
+ return (-1);
+ if (score_a == score_b) {
+ if (rsa->rs_start < rsb->rs_start)
+ return (-1);
+ if (rsa->rs_start == rsb->rs_start)
+ return (0);
+ return (1);
+ }
+ return (1);
+}
+
+/*
+ * Comparator for the q_sios_by_addr tree. Sorting is simply performed
+ * based on LBA-order (from lowest to highest).
+ */
+static int
+io_addr_compare(const void *x, const void *y)
+{
+ const scan_io_t *a = x, *b = y;
+
+ if (a->sio_offset < b->sio_offset)
+ return (-1);
+ if (a->sio_offset == b->sio_offset)
+ return (0);
+ return (1);
+}
+
+/* IO queues are created on demand when they are needed. */
+static dsl_scan_io_queue_t *
+scan_io_queue_create(vdev_t *vd)
+{
+ dsl_scan_t *scn = vd->vdev_spa->spa_dsl_pool->dp_scan;
+ dsl_scan_io_queue_t *q = kmem_zalloc(sizeof (*q), KM_SLEEP);
+
+ q->q_scn = scn;
+ q->q_vd = vd;
+ cv_init(&q->q_zio_cv, NULL, CV_DEFAULT, NULL);
+ q->q_exts_by_addr = range_tree_create_impl(&rt_avl_ops,
+ &q->q_exts_by_size, ext_size_compare, zfs_scan_max_ext_gap);
+ avl_create(&q->q_sios_by_addr, io_addr_compare,
+ sizeof (scan_io_t), offsetof(scan_io_t, sio_nodes.sio_addr_node));
+
+ return (q);
+}
+
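[Editor's aside: to double-check the worked example in the ext_size_compare() comment above, here is a standalone program computing the score both ways -- the exact percent-based formula and the shift-by-7 approximation the comparator actually uses. With the example inputs (64 MiB extent, 32 MiB of fill, fill_weight = 3) both yield 80 MiB, because a 50% fill is exactly representable in 128ths:]

    /* Standalone check of the extent-scoring arithmetic described above. */
    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
        uint64_t extsz = 64ULL << 20;   /* extent size: 64 MiB */
        uint64_t fill = 32ULL << 20;    /* queued I/O in the extent: 32 MiB */
        uint64_t fill_weight = 3;

        /* exact formula from the comment (multiply/divide by 100) */
        uint64_t exact = fill +
            (((fill * 100) / extsz) * fill_weight * fill) / 100;

        /* shift-based approximation used by ext_size_compare() */
        uint64_t approx = fill +
            ((((fill << 7) / extsz) * fill_weight * fill) >> 7);

        (void) printf("exact=%ju approx=%ju\n",
            (uintmax_t)exact, (uintmax_t)approx); /* both 83886080 (80 MiB) */
        return (0);
    }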
+/*
+ * Destroys a scan queue and all segments and scan_io_t's contained in it.
+ * No further execution of I/O occurs, anything pending in the queue is
+ * simply freed without being executed.
+ */
+void
+dsl_scan_io_queue_destroy(dsl_scan_io_queue_t *queue)
+{
+ dsl_scan_t *scn = queue->q_scn;
+ scan_io_t *sio;
+ void *cookie = NULL;
+ int64_t bytes_dequeued = 0;
+
+ ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
+
+ while ((sio = avl_destroy_nodes(&queue->q_sios_by_addr, &cookie)) !=
+ NULL) {
+ ASSERT(range_tree_contains(queue->q_exts_by_addr,
+ sio->sio_offset, sio->sio_asize));
+ bytes_dequeued += sio->sio_asize;
+ kmem_free(sio, sizeof (*sio));
+ }
+
+ atomic_add_64(&scn->scn_bytes_pending, -bytes_dequeued);
+ range_tree_vacate(queue->q_exts_by_addr, NULL, queue);
+ range_tree_destroy(queue->q_exts_by_addr);
+ avl_destroy(&queue->q_sios_by_addr);
+ cv_destroy(&queue->q_zio_cv);
+
+ kmem_free(queue, sizeof (*queue));
+}
+
+/*
+ * Properly transfers a dsl_scan_io_queue_t from `svd' to `tvd'. This is
+ * called on behalf of vdev_top_transfer when creating or destroying
+ * a mirror vdev due to zpool attach/detach.
+ */
+void
+dsl_scan_io_queue_vdev_xfer(vdev_t *svd, vdev_t *tvd)
+{
+ mutex_enter(&svd->vdev_scan_io_queue_lock);
+ mutex_enter(&tvd->vdev_scan_io_queue_lock);
+
+ VERIFY3P(tvd->vdev_scan_io_queue, ==, NULL);
+ tvd->vdev_scan_io_queue = svd->vdev_scan_io_queue;
+ svd->vdev_scan_io_queue = NULL;
+ if (tvd->vdev_scan_io_queue != NULL)
+ tvd->vdev_scan_io_queue->q_vd = tvd;
+
+ mutex_exit(&tvd->vdev_scan_io_queue_lock);
+ mutex_exit(&svd->vdev_scan_io_queue_lock);
+}
+
+static void
+scan_io_queues_destroy(dsl_scan_t *scn)
+{
+ vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev;
+
+ for (uint64_t i = 0; i < rvd->vdev_children; i++) {
+ vdev_t *tvd = rvd->vdev_child[i];
+
+ mutex_enter(&tvd->vdev_scan_io_queue_lock);
+ if (tvd->vdev_scan_io_queue != NULL)
+ dsl_scan_io_queue_destroy(tvd->vdev_scan_io_queue);
+ tvd->vdev_scan_io_queue = NULL;
+ mutex_exit(&tvd->vdev_scan_io_queue_lock);
+ }
+}
+
+static void
+dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i)
+{
+ dsl_pool_t *dp = spa->spa_dsl_pool;
+ dsl_scan_t *scn = dp->dp_scan;
+ vdev_t *vdev;
+ kmutex_t *q_lock;
+ dsl_scan_io_queue_t *queue;
+ scan_io_t srch, *sio;
+ avl_index_t idx;
+ uint64_t start, size;
+
+ vdev = vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[dva_i]));
+ ASSERT(vdev != NULL);
+ q_lock = &vdev->vdev_scan_io_queue_lock;
+ queue = vdev->vdev_scan_io_queue;
+
+ mutex_enter(q_lock);
+ if (queue == NULL) {
+ mutex_exit(q_lock);
+ return;
+ }
+
+ bp2sio(bp, &srch, dva_i);
+ start = srch.sio_offset;
+ size = srch.sio_asize;
+
+ /*
+ * We can find the zio in two states:
+ * 1) Cold, just sitting in the queue of zio's to be issued at
+ * some point in the future. In this case, all we do is
+ * remove the zio from the q_sios_by_addr tree, decrement
+ * its data volume from the containing range_seg_t and
+ * resort the q_exts_by_size tree to reflect that the
+ * range_seg_t has lost some of its 'fill'. We don't shorten
+ * the range_seg_t - this is usually rare enough not to be
+ * worth the extra hassle of trying to keep track of precise
+ * extent boundaries.
+ * 2) Hot, where the zio is currently in-flight in
+ * dsl_scan_issue_ios. In this case, we can't simply
+ * reach in and stop the in-flight zio's, so we instead
+ * block the caller. Eventually, dsl_scan_issue_ios will
+ * be done with issuing the zio's it gathered and will
+ * signal us.
+ */
+ sio = avl_find(&queue->q_sios_by_addr, &srch, &idx);
+ if (sio != NULL) {
+ int64_t asize = sio->sio_asize;
+ blkptr_t tmpbp;
+
+ /* Got it while it was cold in the queue */
+ ASSERT3U(start, ==, sio->sio_offset);
+ ASSERT3U(size, ==, asize);
+ avl_remove(&queue->q_sios_by_addr, sio);
+
+ ASSERT(range_tree_contains(queue->q_exts_by_addr, start, size));
+ range_tree_remove_fill(queue->q_exts_by_addr, start, size);
+
+ /*
+ * We only update scn_bytes_pending in the cold path,
+ * otherwise it will already have been accounted for as
+ * part of the zio's execution.
+ */
+ atomic_add_64(&scn->scn_bytes_pending, -asize);
+
+ /* count the block as though we issued it */
+ sio2bp(sio, &tmpbp, dva_i);
+ count_block(scn, dp->dp_blkstats, &tmpbp);
+
+ kmem_free(sio, sizeof (*sio));
+ }
+ mutex_exit(q_lock);
+}
+
+/*
+ * Callback invoked when a zio_free() zio is executing. This needs to be
+ * intercepted to prevent the zio from deallocating a particular portion
+ * of disk space that could then be reallocated and written to while we
+ * still have it queued up for processing.
+ */
+void
+dsl_scan_freed(spa_t *spa, const blkptr_t *bp)
+{
+ dsl_pool_t *dp = spa->spa_dsl_pool;
+ dsl_scan_t *scn = dp->dp_scan;
+
+ ASSERT(!BP_IS_EMBEDDED(bp));
+ ASSERT(scn != NULL);
+ if (!dsl_scan_is_running(scn))
+ return;
+
+ for (int i = 0; i < BP_GET_NDVAS(bp); i++)
+ dsl_scan_freed_dva(spa, bp, i);
+} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c new file mode 100644 index 000000000000..7014581d0e47 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c @@ -0,0 +1,238 @@ +/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ */
+
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_synctask.h>
+#include <sys/metaslab.h>
+
+#define DST_AVG_BLKSHIFT 14
+
+/* ARGSUSED */
+static int
+dsl_null_checkfunc(void *arg, dmu_tx_t *tx)
+{
+ return (0);
+}
+
+static int
+dsl_sync_task_common(const char *pool, dsl_checkfunc_t *checkfunc,
+ dsl_syncfunc_t *syncfunc, void *arg,
+ int blocks_modified, zfs_space_check_t space_check, boolean_t early)
+{
+ spa_t *spa;
+ dmu_tx_t *tx;
+ int err;
+ dsl_sync_task_t dst = { 0 };
+ dsl_pool_t *dp;
+
+ err = spa_open(pool, &spa, FTAG);
+ if (err != 0)
+ return (err);
+ dp = spa_get_dsl(spa);
+
+top:
+ tx = dmu_tx_create_dd(dp->dp_mos_dir);
+ VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
+
+ dst.dst_pool = dp;
+ dst.dst_txg = dmu_tx_get_txg(tx);
+ dst.dst_space = blocks_modified << DST_AVG_BLKSHIFT;
+ dst.dst_space_check = space_check;
+ dst.dst_checkfunc = checkfunc != NULL ? checkfunc : dsl_null_checkfunc;
+ dst.dst_syncfunc = syncfunc;
+ dst.dst_arg = arg;
+ dst.dst_error = 0;
+ dst.dst_nowaiter = B_FALSE;
+
+ dsl_pool_config_enter(dp, FTAG);
+ err = dst.dst_checkfunc(arg, tx);
+ dsl_pool_config_exit(dp, FTAG);
+
+ if (err != 0) {
+ dmu_tx_commit(tx);
+ spa_close(spa, FTAG);
+ return (err);
+ }
+
+ txg_list_t *task_list = (early) ?
+ &dp->dp_early_sync_tasks : &dp->dp_sync_tasks;
+ VERIFY(txg_list_add_tail(task_list, &dst, dst.dst_txg));
+
+ dmu_tx_commit(tx);
+
+ txg_wait_synced(dp, dst.dst_txg);
+
+ if (dst.dst_error == EAGAIN) {
+ txg_wait_synced(dp, dst.dst_txg + TXG_DEFER_SIZE);
+ goto top;
+ }
+
+ spa_close(spa, FTAG);
+ return (dst.dst_error);
+}
+
+/*
+ * Called from open context to perform a callback in syncing context. Waits
+ * for the operation to complete.
+ *
+ * The checkfunc will be called from open context as a preliminary check
+ * which can quickly fail. If it succeeds, it will be called again from
+ * syncing context. The checkfunc should generally be designed to work
+ * properly in either context, but if necessary it can check
+ * dmu_tx_is_syncing(tx).
+ *
+ * The synctask infrastructure enforces proper locking strategy with respect
+ * to the dp_config_rwlock -- the lock will always be held when the callbacks
+ * are called. It will be held for read during the open-context (preliminary)
+ * call to the checkfunc, and then held for write from syncing context during
+ * the calls to the check and sync funcs.
+ *
+ * A dataset or pool name can be passed as the first argument. Typically,
+ * the check func will hold, check the return value of the hold, and then
+ * release the dataset. The sync func will VERIFY0(hold()) the dataset.
+ * This is safe because no changes can be made between the check and sync funcs,
+ * and the sync func will only be called if the check func successfully opened
+ * the dataset.
+ */
+int
+dsl_sync_task(const char *pool, dsl_checkfunc_t *checkfunc,
+ dsl_syncfunc_t *syncfunc, void *arg,
+ int blocks_modified, zfs_space_check_t space_check)
+{
+ return (dsl_sync_task_common(pool, checkfunc, syncfunc, arg,
+ blocks_modified, space_check, B_FALSE));
+}
+
+/*
+ * An early synctask works exactly as a standard synctask with one important
+ * difference on the way it is handled during syncing context. Standard
+ * synctasks run after we've written out all the dirty blocks of dirty
+ * datasets. Early synctasks are executed before writing out any dirty data,
+ * and thus before standard synctasks.
+ */
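[Editor's aside: a hypothetical usage sketch of the dsl_sync_task() interface documented above. The argument struct, names and the (elided) on-disk change are invented, and the fragment only compiles in the ZFS kernel context; it follows the hold/check/release pattern the comment recommends.]

    /* Hypothetical synctask; only the calling convention is real. */
    typedef struct my_task_arg {
        const char *mta_dsname;
    } my_task_arg_t;

    static int
    my_task_check(void *arg, dmu_tx_t *tx)
    {
        my_task_arg_t *mta = arg;
        dsl_dataset_t *ds;
        int error;

        /* runs cheaply in open context, then again in syncing context */
        error = dsl_dataset_hold(dmu_tx_pool(tx), mta->mta_dsname, FTAG, &ds);
        if (error != 0)
            return (error);
        dsl_dataset_rele(ds, FTAG);
        return (0);
    }

    static void
    my_task_sync(void *arg, dmu_tx_t *tx)
    {
        my_task_arg_t *mta = arg;
        dsl_dataset_t *ds;

        /* safe: the check func just verified the dataset exists */
        VERIFY0(dsl_dataset_hold(dmu_tx_pool(tx), mta->mta_dsname, FTAG, &ds));
        /* ... perform the on-disk change here ... */
        dsl_dataset_rele(ds, FTAG);
    }

[A caller would then dispatch it with something like dsl_sync_task("tank", my_task_check, my_task_sync, &arg, 1, ZFS_SPACE_CHECK_NORMAL), blocking until the txg carrying the change has synced; the pool name here is likewise hypothetical.]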
+/*
+ * For that reason, early synctasks can affect the process of writing dirty
+ * changes to disk for the txg that they run and should be used with caution.
+ * In addition, early synctasks should not dirty any metaslabs as this would
+ * invalidate the precondition/invariant for subsequent early synctasks.
+ * [see dsl_pool_sync() and dsl_early_sync_task_verify()]
+ */
+int
+dsl_early_sync_task(const char *pool, dsl_checkfunc_t *checkfunc,
+ dsl_syncfunc_t *syncfunc, void *arg,
+ int blocks_modified, zfs_space_check_t space_check)
+{
+ return (dsl_sync_task_common(pool, checkfunc, syncfunc, arg,
+ blocks_modified, space_check, B_TRUE));
+}
+
+static void
+dsl_sync_task_nowait_common(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg,
+ int blocks_modified, zfs_space_check_t space_check, dmu_tx_t *tx,
+ boolean_t early)
+{
+ dsl_sync_task_t *dst = kmem_zalloc(sizeof (*dst), KM_SLEEP);
+
+ dst->dst_pool = dp;
+ dst->dst_txg = dmu_tx_get_txg(tx);
+ dst->dst_space = blocks_modified << DST_AVG_BLKSHIFT;
+ dst->dst_space_check = space_check;
+ dst->dst_checkfunc = dsl_null_checkfunc;
+ dst->dst_syncfunc = syncfunc;
+ dst->dst_arg = arg;
+ dst->dst_error = 0;
+ dst->dst_nowaiter = B_TRUE;
+
+ txg_list_t *task_list = (early) ?
+ &dp->dp_early_sync_tasks : &dp->dp_sync_tasks;
+ VERIFY(txg_list_add_tail(task_list, dst, dst->dst_txg));
+}
+
+void
+dsl_sync_task_nowait(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg,
+ int blocks_modified, zfs_space_check_t space_check, dmu_tx_t *tx)
+{
+ dsl_sync_task_nowait_common(dp, syncfunc, arg,
+ blocks_modified, space_check, tx, B_FALSE);
+}
+
+void
+dsl_early_sync_task_nowait(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg,
+ int blocks_modified, zfs_space_check_t space_check, dmu_tx_t *tx)
+{
+ dsl_sync_task_nowait_common(dp, syncfunc, arg,
+ blocks_modified, space_check, tx, B_TRUE);
+}
+
+/*
+ * Called in syncing context to execute the synctask.
+ */
+void
+dsl_sync_task_sync(dsl_sync_task_t *dst, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = dst->dst_pool;
+
+ ASSERT0(dst->dst_error);
+
+ /*
+ * Check for sufficient space.
+ *
+ * When the sync task was created, the caller specified the
+ * type of space checking required. See the comment in
+ * zfs_space_check_t for details on the semantics of each
+ * type of space checking.
+ *
+ * We just check against what's on-disk; we don't want any
+ * in-flight accounting to get in our way, because open context
+ * may have already used up various in-core limits
+ * (arc_tempreserve, dsl_pool_tempreserve).
+ */
+ if (dst->dst_space_check != ZFS_SPACE_CHECK_NONE) {
+ uint64_t quota = dsl_pool_unreserved_space(dp,
+ dst->dst_space_check);
+ uint64_t used = dsl_dir_phys(dp->dp_root_dir)->dd_used_bytes;
+
+ /* MOS space is triple-dittoed, so we multiply by 3. */
+ if (used + dst->dst_space * 3 > quota) {
+ dst->dst_error = SET_ERROR(ENOSPC);
+ if (dst->dst_nowaiter)
+ kmem_free(dst, sizeof (*dst));
+ return;
+ }
+ }
+
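[Editor's aside: for a feel of the numbers in the space check above, dst_space estimates blocks_modified average-sized MOS blocks at 1 << DST_AVG_BLKSHIFT = 16 KiB each, and the factor of 3 covers the triple ditto copies of MOS metadata; a synctask declaring 4 modified blocks therefore needs about 4 * 16 KiB * 3 = 192 KiB of headroom under the quota. A back-of-the-envelope restatement:]

    /* Back-of-the-envelope version of the ENOSPC test above. */
    #include <stdint.h>

    #define DST_AVG_BLKSHIFT 14 /* assumed average modified block: 16 KiB */

    static int
    would_enospc(uint64_t used, uint64_t quota, int blocks_modified)
    {
        uint64_t space = (uint64_t)blocks_modified << DST_AVG_BLKSHIFT;

        /* MOS data is triple-dittoed, hence the factor of 3 */
        return (used + space * 3 > quota);
    }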
+ /*
+ * Check for errors by calling checkfunc.
+ */
+ rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
+ dst->dst_error = dst->dst_checkfunc(dst->dst_arg, tx);
+ if (dst->dst_error == 0)
+ dst->dst_syncfunc(dst->dst_arg, tx);
+ rrw_exit(&dp->dp_config_rwlock, FTAG);
+ if (dst->dst_nowaiter)
+ kmem_free(dst, sizeof (*dst));
+} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_userhold.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_userhold.c new file mode 100644 index 000000000000..d0274dc4ce39 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_userhold.c @@ -0,0 +1,667 @@ +/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2013 Steven Hartland. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/dsl_userhold.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_destroy.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dmu_tx.h>
+#include <sys/zfs_onexit.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_dir.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zap.h>
+
+typedef struct dsl_dataset_user_hold_arg {
+ nvlist_t *dduha_holds;
+ nvlist_t *dduha_chkholds;
+ nvlist_t *dduha_errlist;
+ minor_t dduha_minor;
+} dsl_dataset_user_hold_arg_t;
+
+/*
+ * If you add new checks here, you may need to add additional checks to the
+ * "temporary" case in snapshot_check() in dmu_objset.c.
+ */ +int +dsl_dataset_user_hold_check_one(dsl_dataset_t *ds, const char *htag, + boolean_t temphold, dmu_tx_t *tx) +{ + dsl_pool_t *dp = dmu_tx_pool(tx); + objset_t *mos = dp->dp_meta_objset; + int error = 0; + + ASSERT(dsl_pool_config_held(dp)); + + if (strlen(htag) > MAXNAMELEN) + return (SET_ERROR(E2BIG)); + /* Tempholds have a more restricted length */ + if (temphold && strlen(htag) + MAX_TAG_PREFIX_LEN >= MAXNAMELEN) + return (SET_ERROR(E2BIG)); + + /* tags must be unique (if ds already exists) */ + if (ds != NULL && dsl_dataset_phys(ds)->ds_userrefs_obj != 0) { + uint64_t value; + + error = zap_lookup(mos, dsl_dataset_phys(ds)->ds_userrefs_obj, + htag, 8, 1, &value); + if (error == 0) + error = SET_ERROR(EEXIST); + else if (error == ENOENT) + error = 0; + } + + return (error); +} + +static int +dsl_dataset_user_hold_check(void *arg, dmu_tx_t *tx) +{ + dsl_dataset_user_hold_arg_t *dduha = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + + if (spa_version(dp->dp_spa) < SPA_VERSION_USERREFS) + return (SET_ERROR(ENOTSUP)); + + if (!dmu_tx_is_syncing(tx)) + return (0); + + for (nvpair_t *pair = nvlist_next_nvpair(dduha->dduha_holds, NULL); + pair != NULL; pair = nvlist_next_nvpair(dduha->dduha_holds, pair)) { + dsl_dataset_t *ds; + int error = 0; + char *htag, *name; + + /* must be a snapshot */ + name = nvpair_name(pair); + if (strchr(name, '@') == NULL) + error = SET_ERROR(EINVAL); + + if (error == 0) + error = nvpair_value_string(pair, &htag); + + if (error == 0) + error = dsl_dataset_hold(dp, name, FTAG, &ds); + + if (error == 0) { + error = dsl_dataset_user_hold_check_one(ds, htag, + dduha->dduha_minor != 0, tx); + dsl_dataset_rele(ds, FTAG); + } + + if (error == 0) { + fnvlist_add_string(dduha->dduha_chkholds, name, htag); + } else { + /* + * We register ENOENT errors so they can be correctly + * reported if needed, such as when all holds fail. + */ + fnvlist_add_int32(dduha->dduha_errlist, name, error); + if (error != ENOENT) + return (error); + } + } + + return (0); +} + + +static void +dsl_dataset_user_hold_sync_one_impl(nvlist_t *tmpholds, dsl_dataset_t *ds, + const char *htag, minor_t minor, uint64_t now, dmu_tx_t *tx) +{ + dsl_pool_t *dp = ds->ds_dir->dd_pool; + objset_t *mos = dp->dp_meta_objset; + uint64_t zapobj; + + ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); + + if (dsl_dataset_phys(ds)->ds_userrefs_obj == 0) { + /* + * This is the first user hold for this dataset. Create + * the userrefs zap object. 
+ */ + dmu_buf_will_dirty(ds->ds_dbuf, tx); + zapobj = dsl_dataset_phys(ds)->ds_userrefs_obj = + zap_create(mos, DMU_OT_USERREFS, DMU_OT_NONE, 0, tx); + } else { + zapobj = dsl_dataset_phys(ds)->ds_userrefs_obj; + } + ds->ds_userrefs++; + + VERIFY0(zap_add(mos, zapobj, htag, 8, 1, &now, tx)); + + if (minor != 0) { + char name[MAXNAMELEN]; + nvlist_t *tags; + + VERIFY0(dsl_pool_user_hold(dp, ds->ds_object, + htag, now, tx)); + (void) snprintf(name, sizeof (name), "%llx", + (u_longlong_t)ds->ds_object); + + if (nvlist_lookup_nvlist(tmpholds, name, &tags) != 0) { + tags = fnvlist_alloc(); + fnvlist_add_boolean(tags, htag); + fnvlist_add_nvlist(tmpholds, name, tags); + fnvlist_free(tags); + } else { + fnvlist_add_boolean(tags, htag); + } + } + + spa_history_log_internal_ds(ds, "hold", tx, + "tag=%s temp=%d refs=%llu", + htag, minor != 0, ds->ds_userrefs); +} + +typedef struct zfs_hold_cleanup_arg { + char zhca_spaname[ZFS_MAX_DATASET_NAME_LEN]; + uint64_t zhca_spa_load_guid; + nvlist_t *zhca_holds; +} zfs_hold_cleanup_arg_t; + +static void +dsl_dataset_user_release_onexit(void *arg) +{ + zfs_hold_cleanup_arg_t *ca = arg; + spa_t *spa; + int error; + + error = spa_open(ca->zhca_spaname, &spa, FTAG); + if (error != 0) { + zfs_dbgmsg("couldn't release holds on pool=%s " + "because pool is no longer loaded", + ca->zhca_spaname); + return; + } + if (spa_load_guid(spa) != ca->zhca_spa_load_guid) { + zfs_dbgmsg("couldn't release holds on pool=%s " + "because pool is no longer loaded (guid doesn't match)", + ca->zhca_spaname); + spa_close(spa, FTAG); + return; + } + + (void) dsl_dataset_user_release_tmp(spa_get_dsl(spa), ca->zhca_holds); + fnvlist_free(ca->zhca_holds); + kmem_free(ca, sizeof (zfs_hold_cleanup_arg_t)); + spa_close(spa, FTAG); +} + +static void +dsl_onexit_hold_cleanup(spa_t *spa, nvlist_t *holds, minor_t minor) +{ + zfs_hold_cleanup_arg_t *ca; + + if (minor == 0 || nvlist_empty(holds)) { + fnvlist_free(holds); + return; + } + + ASSERT(spa != NULL); + ca = kmem_alloc(sizeof (*ca), KM_SLEEP); + + (void) strlcpy(ca->zhca_spaname, spa_name(spa), + sizeof (ca->zhca_spaname)); + ca->zhca_spa_load_guid = spa_load_guid(spa); + ca->zhca_holds = holds; + VERIFY0(zfs_onexit_add_cb(minor, + dsl_dataset_user_release_onexit, ca, NULL)); +} + +void +dsl_dataset_user_hold_sync_one(dsl_dataset_t *ds, const char *htag, + minor_t minor, uint64_t now, dmu_tx_t *tx) +{ + nvlist_t *tmpholds; + + if (minor != 0) + tmpholds = fnvlist_alloc(); + else + tmpholds = NULL; + dsl_dataset_user_hold_sync_one_impl(tmpholds, ds, htag, minor, now, tx); + dsl_onexit_hold_cleanup(dsl_dataset_get_spa(ds), tmpholds, minor); +} + +static void +dsl_dataset_user_hold_sync(void *arg, dmu_tx_t *tx) +{ + dsl_dataset_user_hold_arg_t *dduha = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + nvlist_t *tmpholds; + uint64_t now = gethrestime_sec(); + + if (dduha->dduha_minor != 0) + tmpholds = fnvlist_alloc(); + else + tmpholds = NULL; + for (nvpair_t *pair = nvlist_next_nvpair(dduha->dduha_chkholds, NULL); + pair != NULL; + pair = nvlist_next_nvpair(dduha->dduha_chkholds, pair)) { + dsl_dataset_t *ds; + + VERIFY0(dsl_dataset_hold(dp, nvpair_name(pair), FTAG, &ds)); + dsl_dataset_user_hold_sync_one_impl(tmpholds, ds, + fnvpair_value_string(pair), dduha->dduha_minor, now, tx); + dsl_dataset_rele(ds, FTAG); + } + dsl_onexit_hold_cleanup(dp->dp_spa, tmpholds, dduha->dduha_minor); +} + +/* + * The full semantics of this function are described in the comment above + * lzc_hold(). 
+ *
+ * To summarize:
+ * holds is nvl of snapname -> holdname
+ * errlist will be filled in with snapname -> error
+ *
+ * The snapshots must all be in the same pool.
+ *
+ * Holds for snapshots that don't exist will be skipped.
+ *
+ * If none of the snapshots for requested holds exist then ENOENT will be
+ * returned.
+ *
+ * If cleanup_minor is not 0, the holds will be temporary, and they will be
+ * cleaned up when the process exits.
+ *
+ * On success all the holds, for snapshots that existed, will be created and 0
+ * will be returned.
+ *
+ * On failure no holds will be created, the errlist will be filled in,
+ * and an errno will be returned.
+ *
+ * In all cases the errlist will contain entries for holds where the snapshot
+ * didn't exist.
+ */
+int
+dsl_dataset_user_hold(nvlist_t *holds, minor_t cleanup_minor, nvlist_t *errlist)
+{
+ dsl_dataset_user_hold_arg_t dduha;
+ nvpair_t *pair;
+ int ret;
+
+ pair = nvlist_next_nvpair(holds, NULL);
+ if (pair == NULL)
+ return (0);
+
+ dduha.dduha_holds = holds;
+ dduha.dduha_chkholds = fnvlist_alloc();
+ dduha.dduha_errlist = errlist;
+ dduha.dduha_minor = cleanup_minor;
+
+ ret = dsl_sync_task(nvpair_name(pair), dsl_dataset_user_hold_check,
+ dsl_dataset_user_hold_sync, &dduha,
+ fnvlist_num_pairs(holds), ZFS_SPACE_CHECK_RESERVED);
+ fnvlist_free(dduha.dduha_chkholds);
+
+ return (ret);
+}
+
+typedef int (dsl_holdfunc_t)(dsl_pool_t *dp, const char *name, void *tag,
+ dsl_dataset_t **dsp);
+
+typedef struct dsl_dataset_user_release_arg {
+ dsl_holdfunc_t *ddura_holdfunc;
+ nvlist_t *ddura_holds;
+ nvlist_t *ddura_todelete;
+ nvlist_t *ddura_errlist;
+ nvlist_t *ddura_chkholds;
+} dsl_dataset_user_release_arg_t;
+
+/* Place a dataset hold on the snapshot identified by passed dsobj string */
+static int
+dsl_dataset_hold_obj_string(dsl_pool_t *dp, const char *dsobj, void *tag,
+ dsl_dataset_t **dsp)
+{
+ return (dsl_dataset_hold_obj(dp, zfs_strtonum(dsobj, NULL), tag, dsp));
+}
+
+static int
+dsl_dataset_user_release_check_one(dsl_dataset_user_release_arg_t *ddura,
+ dsl_dataset_t *ds, nvlist_t *holds, const char *snapname)
+{
+ uint64_t zapobj;
+ nvlist_t *holds_found;
+ objset_t *mos;
+ int numholds;
+
+ if (!ds->ds_is_snapshot)
+ return (SET_ERROR(EINVAL));
+
+ if (nvlist_empty(holds))
+ return (0);
+
+ numholds = 0;
+ mos = ds->ds_dir->dd_pool->dp_meta_objset;
+ zapobj = dsl_dataset_phys(ds)->ds_userrefs_obj;
+ holds_found = fnvlist_alloc();
+
+ for (nvpair_t *pair = nvlist_next_nvpair(holds, NULL); pair != NULL;
+ pair = nvlist_next_nvpair(holds, pair)) {
+ uint64_t tmp;
+ int error;
+ const char *holdname = nvpair_name(pair);
+
+ if (zapobj != 0)
+ error = zap_lookup(mos, zapobj, holdname, 8, 1, &tmp);
+ else
+ error = SET_ERROR(ENOENT);
+
+ /*
+ * Non-existent holds are put on the errlist, but don't
+ * cause an overall failure.
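[Editor's aside: tying the nvlist shape from the summary above to code, a hypothetical kernel-side caller of dsl_dataset_user_hold() builds a flat snapshot-name-to-tag list and inspects the errlist afterwards. Pool, snapshot and tag names are invented; the fragment compiles only in the ZFS kernel context.]

    /* Hypothetical caller of dsl_dataset_user_hold(); names invented. */
    static int
    hold_two_snapshots_example(void)
    {
        nvlist_t *holds = fnvlist_alloc();
        nvlist_t *errlist = fnvlist_alloc();
        int error;

        /* holds: snapshot name -> hold tag */
        fnvlist_add_string(holds, "tank/fs@snap1", "mytag");
        fnvlist_add_string(holds, "tank/fs@snap2", "mytag");

        /* cleanup_minor == 0: permanent holds, no onexit cleanup */
        error = dsl_dataset_user_hold(holds, 0, errlist);

        /* errlist maps snapshot name -> errno for any failed hold */
        fnvlist_free(errlist);
        fnvlist_free(holds);
        return (error);
    }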
+ */ + if (error == ENOENT) { + if (ddura->ddura_errlist != NULL) { + char *errtag = kmem_asprintf("%s#%s", + snapname, holdname); + fnvlist_add_int32(ddura->ddura_errlist, errtag, + ENOENT); + strfree(errtag); + } + continue; + } + + if (error != 0) { + fnvlist_free(holds_found); + return (error); + } + + fnvlist_add_boolean(holds_found, holdname); + numholds++; + } + + if (DS_IS_DEFER_DESTROY(ds) && + dsl_dataset_phys(ds)->ds_num_children == 1 && + ds->ds_userrefs == numholds) { + /* we need to destroy the snapshot as well */ + if (dsl_dataset_long_held(ds)) { + fnvlist_free(holds_found); + return (SET_ERROR(EBUSY)); + } + fnvlist_add_boolean(ddura->ddura_todelete, snapname); + } + + if (numholds != 0) { + fnvlist_add_nvlist(ddura->ddura_chkholds, snapname, + holds_found); + } + fnvlist_free(holds_found); + + return (0); +} + +static int +dsl_dataset_user_release_check(void *arg, dmu_tx_t *tx) +{ + dsl_dataset_user_release_arg_t *ddura; + dsl_holdfunc_t *holdfunc; + dsl_pool_t *dp; + + if (!dmu_tx_is_syncing(tx)) + return (0); + + dp = dmu_tx_pool(tx); + + ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); + + ddura = arg; + holdfunc = ddura->ddura_holdfunc; + + for (nvpair_t *pair = nvlist_next_nvpair(ddura->ddura_holds, NULL); + pair != NULL; pair = nvlist_next_nvpair(ddura->ddura_holds, pair)) { + int error; + dsl_dataset_t *ds; + nvlist_t *holds; + const char *snapname = nvpair_name(pair); + + error = nvpair_value_nvlist(pair, &holds); + if (error != 0) + error = (SET_ERROR(EINVAL)); + else + error = holdfunc(dp, snapname, FTAG, &ds); + if (error == 0) { + error = dsl_dataset_user_release_check_one(ddura, ds, + holds, snapname); + dsl_dataset_rele(ds, FTAG); + } + if (error != 0) { + if (ddura->ddura_errlist != NULL) { + fnvlist_add_int32(ddura->ddura_errlist, + snapname, error); + } + /* + * Non-existent snapshots are put on the errlist, + * but don't cause an overall failure. + */ + if (error != ENOENT) + return (error); + } + } + + return (0); +} + +static void +dsl_dataset_user_release_sync_one(dsl_dataset_t *ds, nvlist_t *holds, + dmu_tx_t *tx) +{ + dsl_pool_t *dp = ds->ds_dir->dd_pool; + objset_t *mos = dp->dp_meta_objset; + + for (nvpair_t *pair = nvlist_next_nvpair(holds, NULL); pair != NULL; + pair = nvlist_next_nvpair(holds, pair)) { + int error; + const char *holdname = nvpair_name(pair); + + /* Remove temporary hold if one exists. 
*/
+ error = dsl_pool_user_release(dp, ds->ds_object, holdname, tx);
+ VERIFY(error == 0 || error == ENOENT);
+
+ VERIFY0(zap_remove(mos, dsl_dataset_phys(ds)->ds_userrefs_obj,
+ holdname, tx));
+ ds->ds_userrefs--;
+
+ spa_history_log_internal_ds(ds, "release", tx,
+ "tag=%s refs=%lld", holdname, (longlong_t)ds->ds_userrefs);
+ }
+}
+
+static void
+dsl_dataset_user_release_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_dataset_user_release_arg_t *ddura = arg;
+ dsl_holdfunc_t *holdfunc = ddura->ddura_holdfunc;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+
+ ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
+
+ for (nvpair_t *pair = nvlist_next_nvpair(ddura->ddura_chkholds, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(ddura->ddura_chkholds,
+ pair)) {
+ dsl_dataset_t *ds;
+ const char *name = nvpair_name(pair);
+
+ VERIFY0(holdfunc(dp, name, FTAG, &ds));
+
+ dsl_dataset_user_release_sync_one(ds,
+ fnvpair_value_nvlist(pair), tx);
+ if (nvlist_exists(ddura->ddura_todelete, name)) {
+ ASSERT(ds->ds_userrefs == 0 &&
+ dsl_dataset_phys(ds)->ds_num_children == 1 &&
+ DS_IS_DEFER_DESTROY(ds));
+ dsl_destroy_snapshot_sync_impl(ds, B_FALSE, tx);
+ }
+ dsl_dataset_rele(ds, FTAG);
+ }
+}
+
+/*
+ * The full semantics of this function are described in the comment above
+ * lzc_release().
+ *
+ * To summarize:
+ * Releases holds specified in the nvl holds.
+ *
+ * holds is nvl of snapname -> { holdname, ... }
+ * errlist will be filled in with snapname -> error
+ *
+ * If tmpdp is not NULL, the names for holds should be the dsobj's of
+ * snapshots, otherwise they should be the names of snapshots.
+ *
+ * As a release may cause snapshots to be destroyed, this tries to ensure
+ * they aren't mounted.
+ *
+ * Releases of non-existent holds are skipped.
+ *
+ * At least one hold must have been released for this function to succeed
+ * and return 0.
+ */
+static int
+dsl_dataset_user_release_impl(nvlist_t *holds, nvlist_t *errlist,
+ dsl_pool_t *tmpdp)
+{
+ dsl_dataset_user_release_arg_t ddura;
+ nvpair_t *pair;
+ char *pool;
+ int error;
+
+ pair = nvlist_next_nvpair(holds, NULL);
+ if (pair == NULL)
+ return (0);
+
+ /*
+ * The release may cause snapshots to be destroyed; make sure they
+ * are not mounted.
+ */
+ if (tmpdp != NULL) {
+ /* Temporary holds are specified by dsobj string. */
+ ddura.ddura_holdfunc = dsl_dataset_hold_obj_string;
+ pool = spa_name(tmpdp->dp_spa);
+#ifdef _KERNEL
+ for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL;
+ pair = nvlist_next_nvpair(holds, pair)) {
+ dsl_dataset_t *ds;
+
+ dsl_pool_config_enter(tmpdp, FTAG);
+ error = dsl_dataset_hold_obj_string(tmpdp,
+ nvpair_name(pair), FTAG, &ds);
+ if (error == 0) {
+ char name[ZFS_MAX_DATASET_NAME_LEN];
+ dsl_dataset_name(ds, name);
+ dsl_pool_config_exit(tmpdp, FTAG);
+ dsl_dataset_rele(ds, FTAG);
+ (void) zfs_unmount_snap(name);
+ } else {
+ dsl_pool_config_exit(tmpdp, FTAG);
+ }
+ }
+#endif
+ } else {
+ /* Non-temporary holds are specified by name.
*/ + ddura.ddura_holdfunc = dsl_dataset_hold; + pool = nvpair_name(pair); +#ifdef _KERNEL + for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; + pair = nvlist_next_nvpair(holds, pair)) { + (void) zfs_unmount_snap(nvpair_name(pair)); + } +#endif + } + + ddura.ddura_holds = holds; + ddura.ddura_errlist = errlist; + ddura.ddura_todelete = fnvlist_alloc(); + ddura.ddura_chkholds = fnvlist_alloc(); + + error = dsl_sync_task(pool, dsl_dataset_user_release_check, + dsl_dataset_user_release_sync, &ddura, 0, + ZFS_SPACE_CHECK_EXTRA_RESERVED); + fnvlist_free(ddura.ddura_todelete); + fnvlist_free(ddura.ddura_chkholds); + + return (error); +} + +/* + * holds is nvl of snapname -> { holdname, ... } + * errlist will be filled in with snapname -> error + */ +int +dsl_dataset_user_release(nvlist_t *holds, nvlist_t *errlist) +{ + return (dsl_dataset_user_release_impl(holds, errlist, NULL)); +} + +/* + * holds is nvl of snapdsobj -> { holdname, ... } + */ +void +dsl_dataset_user_release_tmp(struct dsl_pool *dp, nvlist_t *holds) +{ + ASSERT(dp != NULL); + (void) dsl_dataset_user_release_impl(holds, NULL, dp); +} + +int +dsl_dataset_get_holds(const char *dsname, nvlist_t *nvl) +{ + dsl_pool_t *dp; + dsl_dataset_t *ds; + int err; + + err = dsl_pool_hold(dsname, FTAG, &dp); + if (err != 0) + return (err); + err = dsl_dataset_hold(dp, dsname, FTAG, &ds); + if (err != 0) { + dsl_pool_rele(dp, FTAG); + return (err); + } + + if (dsl_dataset_phys(ds)->ds_userrefs_obj != 0) { + zap_attribute_t *za; + zap_cursor_t zc; + + za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); + for (zap_cursor_init(&zc, ds->ds_dir->dd_pool->dp_meta_objset, + dsl_dataset_phys(ds)->ds_userrefs_obj); + zap_cursor_retrieve(&zc, za) == 0; + zap_cursor_advance(&zc)) { + fnvlist_add_uint64(nvl, za->za_name, + za->za_first_integer); + } + zap_cursor_fini(&zc); + kmem_free(za, sizeof (zap_attribute_t)); + } + dsl_dataset_rele(ds, FTAG); + dsl_pool_rele(dp, FTAG); + return (0); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/edonr_zfs.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/edonr_zfs.c new file mode 100644 index 000000000000..9a3430d94668 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/edonr_zfs.c @@ -0,0 +1,114 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2013 Saso Kiselkov. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright (c) 2016 by Delphix. All rights reserved. 
+ */
+#include <sys/zfs_context.h>
+#include <sys/zio.h>
+#include <sys/edonr.h>
+#include <sys/abd.h>
+
+#define EDONR_MODE 512
+#define EDONR_BLOCK_SIZE EdonR512_BLOCK_SIZE
+
+static int
+edonr_incremental(void *buf, size_t size, void *arg)
+{
+ EdonRState *ctx = arg;
+ EdonRUpdate(ctx, buf, size * 8);
+ return (0);
+}
+
+/*
+ * Native zio_checksum interface for the Edon-R hash function.
+ */
+/*ARGSUSED*/
+void
+abd_checksum_edonr_native(abd_t *abd, uint64_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
+{
+ uint8_t digest[EDONR_MODE / 8];
+ EdonRState ctx;
+
+ ASSERT(ctx_template != NULL);
+ bcopy(ctx_template, &ctx, sizeof (ctx));
+ (void) abd_iterate_func(abd, 0, size, edonr_incremental, &ctx);
+ EdonRFinal(&ctx, digest);
+ bcopy(digest, zcp->zc_word, sizeof (zcp->zc_word));
+}
+
+/*
+ * Byteswapped zio_checksum interface for the Edon-R hash function.
+ */
+void
+abd_checksum_edonr_byteswap(abd_t *abd, uint64_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
+{
+ zio_cksum_t tmp;
+
+ abd_checksum_edonr_native(abd, size, ctx_template, &tmp);
+ zcp->zc_word[0] = BSWAP_64(tmp.zc_word[0]);
+ zcp->zc_word[1] = BSWAP_64(tmp.zc_word[1]);
+ zcp->zc_word[2] = BSWAP_64(tmp.zc_word[2]);
+ zcp->zc_word[3] = BSWAP_64(tmp.zc_word[3]);
+}
+
+void *
+abd_checksum_edonr_tmpl_init(const zio_cksum_salt_t *salt)
+{
+ EdonRState *ctx;
+ uint8_t salt_block[EDONR_BLOCK_SIZE];
+
+ /*
+ * Edon-R needs all but the last hash invocation to be on full-size
+ * blocks, but the salt is too small. Rather than simply padding it
+ * with zeros, we expand the salt into a new salt block of proper
+ * size by double-hashing it (the new salt block will be composed of
+ * H(salt) || H(H(salt))).
+ */
+ CTASSERT(EDONR_BLOCK_SIZE == 2 * (EDONR_MODE / 8));
+ EdonRHash(EDONR_MODE, salt->zcs_bytes, sizeof (salt->zcs_bytes) * 8,
+ salt_block);
+ EdonRHash(EDONR_MODE, salt_block, EDONR_MODE, salt_block +
+ EDONR_MODE / 8);
+
+ /*
+ * Feed the new salt block into the hash function - this will serve
+ * as our MAC key.
+ */
+ ctx = kmem_zalloc(sizeof (*ctx), KM_SLEEP);
+ EdonRInit(ctx, EDONR_MODE);
+ EdonRUpdate(ctx, salt_block, sizeof (salt_block) * 8);
+ return (ctx);
+}
+
+void
+abd_checksum_edonr_tmpl_free(void *ctx_template)
+{
+ EdonRState *ctx = ctx_template;
+
+ bzero(ctx, sizeof (*ctx));
+ kmem_free(ctx, sizeof (*ctx));
+} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/gzip.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/gzip.c new file mode 100644 index 000000000000..b257d4af753c --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/gzip.c @@ -0,0 +1,69 @@ +/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/debug.h> +#include <sys/types.h> +#include <sys/zmod.h> + +#ifdef _KERNEL +#include <sys/systm.h> +#else +#include <strings.h> +#endif + +size_t +gzip_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) +{ + size_t dstlen = d_len; + + ASSERT(d_len <= s_len); + + if (z_compress_level(d_start, &dstlen, s_start, s_len, n) != Z_OK) { + if (d_len != s_len) + return (s_len); + + bcopy(s_start, d_start, s_len); + return (s_len); + } + + return (dstlen); +} + +/*ARGSUSED*/ +int +gzip_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) +{ + size_t dstlen = d_len; + + ASSERT(d_len >= s_len); + + if (z_uncompress(d_start, &dstlen, s_start, s_len) != Z_OK) + return (-1); + + return (0); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/README.zfs b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/README.zfs new file mode 100644 index 000000000000..0e22de7a4a18 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/README.zfs @@ -0,0 +1,80 @@ +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2017 by Delphix. All rights reserved. +# + +Introduction +------------ + +This README describes the Lua interpreter source code that lives in the ZFS +source tree to enable execution of ZFS channel programs, including its +maintenance policy, the modifications that have been made to it, and how it +should (and should not) be used. + +For a description of the Lua language and features exposed by ZFS channel +programs, please refer to the zfs-program(1m) man page instead. + + +Maintenance policy +------------------ + +The Lua runtime is considered stable software. Channel programs don't need much +complicated logic, so updates to the Lua runtime from upstream are viewed as +nice-to-have, but not required for channel programs to be well-supported. As +such, the Lua runtime in ZFS should be updated on an as-needed basis for +security vulnerabilities, but not much else. + + +Modifications to Lua +-------------------- + +The version of the Lua runtime we're using in ZFS has been modified in a variety +of ways to make it more useful for the specific purpose of running channel +programs. These changes include: + +1. "Normal" Lua uses floating point for all numbers it stores, but those aren't + useful inside ZFS / the kernel. We have changed the runtime to use int64_t + throughout for all numbers. +2. Some of the Lua standard libraries do file I/O or spawn processes, but + neither of these make sense from inside channel programs. We have removed + those libraries rather than reimplementing them using kernel APIs. +3. The "normal" Lua runtime handles errors by failing fatally, but since this + version of Lua runs inside the kernel we must handle these failures and + return meaningful error codes to userland. We have customized the Lua + failure paths so that they aren't fatal. +4. 
Running poorly-vetted code inside the kernel is always a risk; even if the
+ ability to do so is restricted to the root user, it's still possible to write
+ an incorrect program that results in an infinite loop or massive memory use.
+ We've added new protections into the Lua interpreter to limit the runtime
+ (measured in number of Lua instructions run) and memory overhead of running
+ a channel program.
+5. The Lua bytecode is not designed to be secure / safe, so it would be easy to
+ pass invalid bytecode which can panic the kernel. By comparison, the parser
+ is hardened and fails gracefully on invalid input. Therefore, we only accept
+ Lua source code at the ioctl level and then interpret it inside the kernel.
+
+Each of these modifications has been tested in the zfs-test suite. If / when
+new modifications are made, new tests should be added to the suite located in
+zfs-tests/tests/functional/channel_program/lua_core.
+
+
+How to use this Lua interpreter
+-------------------------------
+
+From the above, it should be clear that this is not a general-purpose Lua
+interpreter. Additional work would be required to extricate this custom version
+of Lua from ZFS and make it usable by other areas of the kernel. diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lapi.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lapi.c new file mode 100644 index 000000000000..34820a2d8b44 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lapi.c @@ -0,0 +1,1283 @@ +/*
+** $Id: lapi.c,v 2.171.1.1 2013/04/12 18:48:47 roberto Exp $
+** Lua API
+** See Copyright Notice in lua.h
+*/
+
+
+#include <sys/zfs_context.h>
+
+#define lapi_c
+#define LUA_CORE
+
+#include "lua.h"
+
+#include "lapi.h"
+#include "ldebug.h"
+#include "ldo.h"
+#include "lfunc.h"
+#include "lgc.h"
+#include "lmem.h"
+#include "lobject.h"
+#include "lstate.h"
+#include "lstring.h"
+#include "ltable.h"
+#include "ltm.h"
+#include "lundump.h"
+#include "lvm.h"
+
+
+
+const char lua_ident[] =
+ "$LuaVersion: " LUA_COPYRIGHT " $"
+ "$LuaAuthors: " LUA_AUTHORS " $";
+
+
+/* value at a non-valid index */
+#define NONVALIDVALUE cast(TValue *, luaO_nilobject)
+
+/* corresponding test */
+#define isvalid(o) ((o) != luaO_nilobject)
+
+/* test for pseudo index */
+#define ispseudo(i) ((i) <= LUA_REGISTRYINDEX)
+
+/* test for valid but not pseudo index */
+#define isstackindex(i, o) (isvalid(o) && !ispseudo(i))
+
+#define api_checkvalidindex(L, o) api_check(L, isvalid(o), "invalid index")
+
+#define api_checkstackindex(L, i, o) \
+ api_check(L, isstackindex(i, o), "index not in the stack")
+
+
+static TValue *index2addr (lua_State *L, int idx) {
+ CallInfo *ci = L->ci;
+ if (idx > 0) {
+ TValue *o = ci->func + idx;
+ api_check(L, idx <= ci->top - (ci->func + 1), "unacceptable index");
+ if (o >= L->top) return NONVALIDVALUE;
+ else return o;
+ }
+ else if (!ispseudo(idx)) { /* negative index */
+ api_check(L, idx != 0 && -idx <= L->top - (ci->func + 1), "invalid index");
+ return L->top + idx;
+ }
+ else if (idx == LUA_REGISTRYINDEX)
+ return &G(L)->l_registry;
+ else { /* upvalues */
+ idx = LUA_REGISTRYINDEX - idx;
+ api_check(L, idx <= MAXUPVAL + 1, "upvalue index too large");
+ if (ttislcf(ci->func)) /* light C function? */
+ return NONVALIDVALUE; /* it has no upvalues */
+ else {
+ CClosure *func = clCvalue(ci->func);
+ return (idx <= func->nupvalues) ?
&func->upvalue[idx-1] : NONVALIDVALUE; + } + } +} + + +/* +** to be called by 'lua_checkstack' in protected mode, to grow stack +** capturing memory errors +*/ +static void growstack (lua_State *L, void *ud) { + int size = *(int *)ud; + luaD_growstack(L, size); +} + + +LUA_API int lua_checkstack (lua_State *L, int size) { + int res; + CallInfo *ci = L->ci; + lua_lock(L); + if (L->stack_last - L->top > size) /* stack large enough? */ + res = 1; /* yes; check is OK */ + else { /* no; need to grow stack */ + int inuse = cast_int(L->top - L->stack) + EXTRA_STACK; + if (inuse > LUAI_MAXSTACK - size) /* can grow without overflow? */ + res = 0; /* no */ + else /* try to grow stack */ + res = (luaD_rawrunprotected(L, &growstack, &size) == LUA_OK); + } + if (res && ci->top < L->top + size) + ci->top = L->top + size; /* adjust frame top */ + lua_unlock(L); + return res; +} + + +LUA_API void lua_xmove (lua_State *from, lua_State *to, int n) { + int i; + if (from == to) return; + lua_lock(to); + api_checknelems(from, n); + api_check(from, G(from) == G(to), "moving among independent states"); + api_check(from, to->ci->top - to->top >= n, "not enough elements to move"); + from->top -= n; + for (i = 0; i < n; i++) { + setobj2s(to, to->top++, from->top + i); + } + lua_unlock(to); +} + + +LUA_API lua_CFunction lua_atpanic (lua_State *L, lua_CFunction panicf) { + lua_CFunction old; + lua_lock(L); + old = G(L)->panic; + G(L)->panic = panicf; + lua_unlock(L); + return old; +} + + +LUA_API const lua_Number *lua_version (lua_State *L) { + static const lua_Number version = LUA_VERSION_NUM; + if (L == NULL) return &version; + else return G(L)->version; +} + + + +/* +** basic stack manipulation +*/ + + +/* +** convert an acceptable stack index into an absolute index +*/ +LUA_API int lua_absindex (lua_State *L, int idx) { + return (idx > 0 || ispseudo(idx)) + ? idx + : cast_int(L->top - L->ci->func + idx); +} + + +LUA_API int lua_gettop (lua_State *L) { + return cast_int(L->top - (L->ci->func + 1)); +} + + +LUA_API void lua_settop (lua_State *L, int idx) { + StkId func = L->ci->func; + lua_lock(L); + if (idx >= 0) { + api_check(L, idx <= L->stack_last - (func + 1), "new top too large"); + while (L->top < (func + 1) + idx) + setnilvalue(L->top++); + L->top = (func + 1) + idx; + } + else { + api_check(L, -(idx+1) <= (L->top - (func + 1)), "invalid new top"); + L->top += idx+1; /* `subtract' index (index is negative) */ + } + lua_unlock(L); +} + + +LUA_API void lua_remove (lua_State *L, int idx) { + StkId p; + lua_lock(L); + p = index2addr(L, idx); + api_checkstackindex(L, idx, p); + while (++p < L->top) setobjs2s(L, p-1, p); + L->top--; + lua_unlock(L); +} + + +LUA_API void lua_insert (lua_State *L, int idx) { + StkId p; + StkId q; + lua_lock(L); + p = index2addr(L, idx); + api_checkstackindex(L, idx, p); + for (q = L->top; q > p; q--) /* use L->top as a temporary */ + setobjs2s(L, q, q - 1); + setobjs2s(L, p, L->top); + lua_unlock(L); +} + + +static void moveto (lua_State *L, TValue *fr, int idx) { + TValue *to = index2addr(L, idx); + api_checkvalidindex(L, to); + setobj(L, to, fr); + if (idx < LUA_REGISTRYINDEX) /* function upvalue? 
*/ + luaC_barrier(L, clCvalue(L->ci->func), fr); + /* LUA_REGISTRYINDEX does not need gc barrier + (collector revisits it before finishing collection) */ +} + + +LUA_API void lua_replace (lua_State *L, int idx) { + lua_lock(L); + api_checknelems(L, 1); + moveto(L, L->top - 1, idx); + L->top--; + lua_unlock(L); +} + + +LUA_API void lua_copy (lua_State *L, int fromidx, int toidx) { + TValue *fr; + lua_lock(L); + fr = index2addr(L, fromidx); + moveto(L, fr, toidx); + lua_unlock(L); +} + + +LUA_API void lua_pushvalue (lua_State *L, int idx) { + lua_lock(L); + setobj2s(L, L->top, index2addr(L, idx)); + api_incr_top(L); + lua_unlock(L); +} + + + +/* +** access functions (stack -> C) +*/ + + +LUA_API int lua_type (lua_State *L, int idx) { + StkId o = index2addr(L, idx); + return (isvalid(o) ? ttypenv(o) : LUA_TNONE); +} + + +LUA_API const char *lua_typename (lua_State *L, int t) { + UNUSED(L); + return ttypename(t); +} + + +LUA_API int lua_iscfunction (lua_State *L, int idx) { + StkId o = index2addr(L, idx); + return (ttislcf(o) || (ttisCclosure(o))); +} + + +LUA_API int lua_isnumber (lua_State *L, int idx) { + TValue n; + const TValue *o = index2addr(L, idx); + return tonumber(o, &n); +} + + +LUA_API int lua_isstring (lua_State *L, int idx) { + int t = lua_type(L, idx); + return (t == LUA_TSTRING || t == LUA_TNUMBER); +} + + +LUA_API int lua_isuserdata (lua_State *L, int idx) { + const TValue *o = index2addr(L, idx); + return (ttisuserdata(o) || ttislightuserdata(o)); +} + + +LUA_API int lua_rawequal (lua_State *L, int index1, int index2) { + StkId o1 = index2addr(L, index1); + StkId o2 = index2addr(L, index2); + return (isvalid(o1) && isvalid(o2)) ? luaV_rawequalobj(o1, o2) : 0; +} + + +LUA_API void lua_arith (lua_State *L, int op) { + StkId o1; /* 1st operand */ + StkId o2; /* 2nd operand */ + lua_lock(L); + if (op != LUA_OPUNM) /* all other operations expect two operands */ + api_checknelems(L, 2); + else { /* for unary minus, add fake 2nd operand */ + api_checknelems(L, 1); + setobjs2s(L, L->top, L->top - 1); + L->top++; + } + o1 = L->top - 2; + o2 = L->top - 1; + if (ttisnumber(o1) && ttisnumber(o2)) { + setnvalue(o1, luaO_arith(op, nvalue(o1), nvalue(o2))); + } + else + luaV_arith(L, o1, o1, o2, cast(TMS, op - LUA_OPADD + TM_ADD)); + L->top--; + lua_unlock(L); +} + + +LUA_API int lua_compare (lua_State *L, int index1, int index2, int op) { + StkId o1, o2; + int i = 0; + lua_lock(L); /* may call tag method */ + o1 = index2addr(L, index1); + o2 = index2addr(L, index2); + if (isvalid(o1) && isvalid(o2)) { + switch (op) { + case LUA_OPEQ: i = equalobj(L, o1, o2); break; + case LUA_OPLT: i = luaV_lessthan(L, o1, o2); break; + case LUA_OPLE: i = luaV_lessequal(L, o1, o2); break; + default: api_check(L, 0, "invalid option"); + } + } + lua_unlock(L); + return i; +} + + +LUA_API lua_Number lua_tonumberx (lua_State *L, int idx, int *isnum) { + TValue n; + const TValue *o = index2addr(L, idx); + if (tonumber(o, &n)) { + if (isnum) *isnum = 1; + return nvalue(o); + } + else { + if (isnum) *isnum = 0; + return 0; + } +} + + +LUA_API lua_Integer lua_tointegerx (lua_State *L, int idx, int *isnum) { + TValue n; + const TValue *o = index2addr(L, idx); + if (tonumber(o, &n)) { + lua_Integer res; + lua_Number num = nvalue(o); + lua_number2integer(res, num); + if (isnum) *isnum = 1; + return res; + } + else { + if (isnum) *isnum = 0; + return 0; + } +} + + +LUA_API lua_Unsigned lua_tounsignedx (lua_State *L, int idx, int *isnum) { + TValue n; + const TValue *o = index2addr(L, idx); + if (tonumber(o, &n)) { + 
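/* note: the 'tonumber' macro may redirect 'o' to the converted value in 'n' */ +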
lua_Unsigned res; + lua_Number num = nvalue(o); + lua_number2unsigned(res, num); + if (isnum) *isnum = 1; + return res; + } + else { + if (isnum) *isnum = 0; + return 0; + } +} + + +LUA_API int lua_toboolean (lua_State *L, int idx) { + const TValue *o = index2addr(L, idx); + return !l_isfalse(o); +} + + +LUA_API const char *lua_tolstring (lua_State *L, int idx, size_t *len) { + StkId o = index2addr(L, idx); + if (!ttisstring(o)) { + lua_lock(L); /* `luaV_tostring' may create a new string */ + if (!luaV_tostring(L, o)) { /* conversion failed? */ + if (len != NULL) *len = 0; + lua_unlock(L); + return NULL; + } + luaC_checkGC(L); + o = index2addr(L, idx); /* previous call may reallocate the stack */ + lua_unlock(L); + } + if (len != NULL) *len = tsvalue(o)->len; + return svalue(o); +} + + +LUA_API size_t lua_rawlen (lua_State *L, int idx) { + StkId o = index2addr(L, idx); + switch (ttypenv(o)) { + case LUA_TSTRING: return tsvalue(o)->len; + case LUA_TUSERDATA: return uvalue(o)->len; + case LUA_TTABLE: return luaH_getn(hvalue(o)); + default: return 0; + } +} + + +LUA_API lua_CFunction lua_tocfunction (lua_State *L, int idx) { + StkId o = index2addr(L, idx); + if (ttislcf(o)) return fvalue(o); + else if (ttisCclosure(o)) + return clCvalue(o)->f; + else return NULL; /* not a C function */ +} + + +LUA_API void *lua_touserdata (lua_State *L, int idx) { + StkId o = index2addr(L, idx); + switch (ttypenv(o)) { + case LUA_TUSERDATA: return (rawuvalue(o) + 1); + case LUA_TLIGHTUSERDATA: return pvalue(o); + default: return NULL; + } +} + + +LUA_API lua_State *lua_tothread (lua_State *L, int idx) { + StkId o = index2addr(L, idx); + return (!ttisthread(o)) ? NULL : thvalue(o); +} + + +LUA_API const void *lua_topointer (lua_State *L, int idx) { + StkId o = index2addr(L, idx); + switch (ttype(o)) { + case LUA_TTABLE: return hvalue(o); + case LUA_TLCL: return clLvalue(o); + case LUA_TCCL: return clCvalue(o); + case LUA_TLCF: return cast(void *, cast(size_t, fvalue(o))); + case LUA_TTHREAD: return thvalue(o); + case LUA_TUSERDATA: + case LUA_TLIGHTUSERDATA: + return lua_touserdata(L, idx); + default: return NULL; + } +} + + + +/* +** push functions (C -> stack) +*/ + + +LUA_API void lua_pushnil (lua_State *L) { + lua_lock(L); + setnilvalue(L->top); + api_incr_top(L); + lua_unlock(L); +} + + +LUA_API void lua_pushnumber (lua_State *L, lua_Number n) { + lua_lock(L); + setnvalue(L->top, n); + luai_checknum(L, L->top, + luaG_runerror(L, "C API - attempt to push a signaling NaN")); + api_incr_top(L); + lua_unlock(L); +} + + +LUA_API void lua_pushinteger (lua_State *L, lua_Integer n) { + lua_lock(L); + setnvalue(L->top, cast_num(n)); + api_incr_top(L); + lua_unlock(L); +} + + +LUA_API void lua_pushunsigned (lua_State *L, lua_Unsigned u) { + lua_Number n; + lua_lock(L); + n = lua_unsigned2number(u); + setnvalue(L->top, n); + api_incr_top(L); + lua_unlock(L); +} + + +LUA_API const char *lua_pushlstring (lua_State *L, const char *s, size_t len) { + TString *ts; + lua_lock(L); + luaC_checkGC(L); + ts = luaS_newlstr(L, s, len); + setsvalue2s(L, L->top, ts); + api_incr_top(L); + lua_unlock(L); + return getstr(ts); +} + + +LUA_API const char *lua_pushstring (lua_State *L, const char *s) { + if (s == NULL) { + lua_pushnil(L); + return NULL; + } + else { + TString *ts; + lua_lock(L); + luaC_checkGC(L); + ts = luaS_new(L, s); + setsvalue2s(L, L->top, ts); + api_incr_top(L); + lua_unlock(L); + return getstr(ts); + } +} + + +LUA_API const char *lua_pushvfstring (lua_State *L, const char *fmt, + va_list argp) { + const char 
*ret; + lua_lock(L); + luaC_checkGC(L); + ret = luaO_pushvfstring(L, fmt, argp); + lua_unlock(L); + return ret; +} + + +LUA_API const char *lua_pushfstring (lua_State *L, const char *fmt, ...) { + const char *ret; + va_list argp; + lua_lock(L); + luaC_checkGC(L); + va_start(argp, fmt); + ret = luaO_pushvfstring(L, fmt, argp); + va_end(argp); + lua_unlock(L); + return ret; +} + + +LUA_API void lua_pushcclosure (lua_State *L, lua_CFunction fn, int n) { + lua_lock(L); + if (n == 0) { + setfvalue(L->top, fn); + } + else { + Closure *cl; + api_checknelems(L, n); + api_check(L, n <= MAXUPVAL, "upvalue index too large"); + luaC_checkGC(L); + cl = luaF_newCclosure(L, n); + cl->c.f = fn; + L->top -= n; + while (n--) + setobj2n(L, &cl->c.upvalue[n], L->top + n); + setclCvalue(L, L->top, cl); + } + api_incr_top(L); + lua_unlock(L); +} + + +LUA_API void lua_pushboolean (lua_State *L, int b) { + lua_lock(L); + setbvalue(L->top, (b != 0)); /* ensure that true is 1 */ + api_incr_top(L); + lua_unlock(L); +} + + +LUA_API void lua_pushlightuserdata (lua_State *L, void *p) { + lua_lock(L); + setpvalue(L->top, p); + api_incr_top(L); + lua_unlock(L); +} + + +LUA_API int lua_pushthread (lua_State *L) { + lua_lock(L); + setthvalue(L, L->top, L); + api_incr_top(L); + lua_unlock(L); + return (G(L)->mainthread == L); +} + + + +/* +** get functions (Lua -> stack) +*/ + + +LUA_API void lua_getglobal (lua_State *L, const char *var) { + Table *reg = hvalue(&G(L)->l_registry); + const TValue *gt; /* global table */ + lua_lock(L); + gt = luaH_getint(reg, LUA_RIDX_GLOBALS); + setsvalue2s(L, L->top++, luaS_new(L, var)); + luaV_gettable(L, gt, L->top - 1, L->top - 1); + lua_unlock(L); +} + + +LUA_API void lua_gettable (lua_State *L, int idx) { + StkId t; + lua_lock(L); + t = index2addr(L, idx); + luaV_gettable(L, t, L->top - 1, L->top - 1); + lua_unlock(L); +} + + +LUA_API void lua_getfield (lua_State *L, int idx, const char *k) { + StkId t; + lua_lock(L); + t = index2addr(L, idx); + setsvalue2s(L, L->top, luaS_new(L, k)); + api_incr_top(L); + luaV_gettable(L, t, L->top - 1, L->top - 1); + lua_unlock(L); +} + + +LUA_API void lua_rawget (lua_State *L, int idx) { + StkId t; + lua_lock(L); + t = index2addr(L, idx); + api_check(L, ttistable(t), "table expected"); + setobj2s(L, L->top - 1, luaH_get(hvalue(t), L->top - 1)); + lua_unlock(L); +} + + +LUA_API void lua_rawgeti (lua_State *L, int idx, int n) { + StkId t; + lua_lock(L); + t = index2addr(L, idx); + api_check(L, ttistable(t), "table expected"); + setobj2s(L, L->top, luaH_getint(hvalue(t), n)); + api_incr_top(L); + lua_unlock(L); +} + + +LUA_API void lua_rawgetp (lua_State *L, int idx, const void *p) { + StkId t; + TValue k; + lua_lock(L); + t = index2addr(L, idx); + api_check(L, ttistable(t), "table expected"); + setpvalue(&k, cast(void *, p)); + setobj2s(L, L->top, luaH_get(hvalue(t), &k)); + api_incr_top(L); + lua_unlock(L); +} + + +LUA_API void lua_createtable (lua_State *L, int narray, int nrec) { + Table *t; + lua_lock(L); + luaC_checkGC(L); + t = luaH_new(L); + sethvalue(L, L->top, t); + api_incr_top(L); + if (narray > 0 || nrec > 0) + luaH_resize(L, t, narray, nrec); + lua_unlock(L); +} + + +LUA_API int lua_getmetatable (lua_State *L, int objindex) { + const TValue *obj; + Table *mt = NULL; + int res; + lua_lock(L); + obj = index2addr(L, objindex); + switch (ttypenv(obj)) { + case LUA_TTABLE: + mt = hvalue(obj)->metatable; + break; + case LUA_TUSERDATA: + mt = uvalue(obj)->metatable; + break; + default: + mt = G(L)->mt[ttypenv(obj)]; + break; + } + if (mt == NULL) 
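+ /* no metatable: nothing is pushed */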
+ res = 0; + else { + sethvalue(L, L->top, mt); + api_incr_top(L); + res = 1; + } + lua_unlock(L); + return res; +} + + +LUA_API void lua_getuservalue (lua_State *L, int idx) { + StkId o; + lua_lock(L); + o = index2addr(L, idx); + api_check(L, ttisuserdata(o), "userdata expected"); + if (uvalue(o)->env) { + sethvalue(L, L->top, uvalue(o)->env); + } else + setnilvalue(L->top); + api_incr_top(L); + lua_unlock(L); +} + + +/* +** set functions (stack -> Lua) +*/ + + +LUA_API void lua_setglobal (lua_State *L, const char *var) { + Table *reg = hvalue(&G(L)->l_registry); + const TValue *gt; /* global table */ + lua_lock(L); + api_checknelems(L, 1); + gt = luaH_getint(reg, LUA_RIDX_GLOBALS); + setsvalue2s(L, L->top++, luaS_new(L, var)); + luaV_settable(L, gt, L->top - 1, L->top - 2); + L->top -= 2; /* pop value and key */ + lua_unlock(L); +} + + +LUA_API void lua_settable (lua_State *L, int idx) { + StkId t; + lua_lock(L); + api_checknelems(L, 2); + t = index2addr(L, idx); + luaV_settable(L, t, L->top - 2, L->top - 1); + L->top -= 2; /* pop index and value */ + lua_unlock(L); +} + + +LUA_API void lua_setfield (lua_State *L, int idx, const char *k) { + StkId t; + lua_lock(L); + api_checknelems(L, 1); + t = index2addr(L, idx); + setsvalue2s(L, L->top++, luaS_new(L, k)); + luaV_settable(L, t, L->top - 1, L->top - 2); + L->top -= 2; /* pop value and key */ + lua_unlock(L); +} + + +LUA_API void lua_rawset (lua_State *L, int idx) { + StkId t; + lua_lock(L); + api_checknelems(L, 2); + t = index2addr(L, idx); + api_check(L, ttistable(t), "table expected"); + setobj2t(L, luaH_set(L, hvalue(t), L->top-2), L->top-1); + invalidateTMcache(hvalue(t)); + luaC_barrierback(L, gcvalue(t), L->top-1); + L->top -= 2; + lua_unlock(L); +} + + +LUA_API void lua_rawseti (lua_State *L, int idx, int n) { + StkId t; + lua_lock(L); + api_checknelems(L, 1); + t = index2addr(L, idx); + api_check(L, ttistable(t), "table expected"); + luaH_setint(L, hvalue(t), n, L->top - 1); + luaC_barrierback(L, gcvalue(t), L->top-1); + L->top--; + lua_unlock(L); +} + + +LUA_API void lua_rawsetp (lua_State *L, int idx, const void *p) { + StkId t; + TValue k; + lua_lock(L); + api_checknelems(L, 1); + t = index2addr(L, idx); + api_check(L, ttistable(t), "table expected"); + setpvalue(&k, cast(void *, p)); + setobj2t(L, luaH_set(L, hvalue(t), &k), L->top - 1); + luaC_barrierback(L, gcvalue(t), L->top - 1); + L->top--; + lua_unlock(L); +} + + +LUA_API int lua_setmetatable (lua_State *L, int objindex) { + TValue *obj; + Table *mt; + lua_lock(L); + api_checknelems(L, 1); + obj = index2addr(L, objindex); + if (ttisnil(L->top - 1)) + mt = NULL; + else { + api_check(L, ttistable(L->top - 1), "table expected"); + mt = hvalue(L->top - 1); + } + switch (ttypenv(obj)) { + case LUA_TTABLE: { + hvalue(obj)->metatable = mt; + if (mt) { + luaC_objbarrierback(L, gcvalue(obj), mt); + luaC_checkfinalizer(L, gcvalue(obj), mt); + } + break; + } + case LUA_TUSERDATA: { + uvalue(obj)->metatable = mt; + if (mt) { + luaC_objbarrier(L, rawuvalue(obj), mt); + luaC_checkfinalizer(L, gcvalue(obj), mt); + } + break; + } + default: { + G(L)->mt[ttypenv(obj)] = mt; + break; + } + } + L->top--; + lua_unlock(L); + return 1; +} + + +LUA_API void lua_setuservalue (lua_State *L, int idx) { + StkId o; + lua_lock(L); + api_checknelems(L, 1); + o = index2addr(L, idx); + api_check(L, ttisuserdata(o), "userdata expected"); + if (ttisnil(L->top - 1)) + uvalue(o)->env = NULL; + else { + api_check(L, ttistable(L->top - 1), "table expected"); + uvalue(o)->env = hvalue(L->top - 1); + 
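/* GC write barrier: the userdata may be black while the new env table is white */ +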
luaC_objbarrier(L, gcvalue(o), hvalue(L->top - 1)); + } + L->top--; + lua_unlock(L); +} + + +/* +** `load' and `call' functions (run Lua code) +*/ + + +#define checkresults(L,na,nr) \ + api_check(L, (nr) == LUA_MULTRET || (L->ci->top - L->top >= (nr) - (na)), \ + "results from function overflow current stack size") + + +LUA_API int lua_getctx (lua_State *L, int *ctx) { + if (L->ci->callstatus & CIST_YIELDED) { + if (ctx) *ctx = L->ci->u.c.ctx; + return L->ci->u.c.status; + } + else return LUA_OK; +} + + +LUA_API void lua_callk (lua_State *L, int nargs, int nresults, int ctx, + lua_CFunction k) { + StkId func; + lua_lock(L); + api_check(L, k == NULL || !isLua(L->ci), + "cannot use continuations inside hooks"); + api_checknelems(L, nargs+1); + api_check(L, L->status == LUA_OK, "cannot do calls on non-normal thread"); + checkresults(L, nargs, nresults); + func = L->top - (nargs+1); + if (k != NULL && L->nny == 0) { /* need to prepare continuation? */ + L->ci->u.c.k = k; /* save continuation */ + L->ci->u.c.ctx = ctx; /* save context */ + luaD_call(L, func, nresults, 1); /* do the call */ + } + else /* no continuation or no yieldable */ + luaD_call(L, func, nresults, 0); /* just do the call */ + adjustresults(L, nresults); + lua_unlock(L); +} + + + +/* +** Execute a protected call. +*/ +struct CallS { /* data to `f_call' */ + StkId func; + int nresults; +}; + + +static void f_call (lua_State *L, void *ud) { + struct CallS *c = cast(struct CallS *, ud); + luaD_call(L, c->func, c->nresults, 0); +} + + + +LUA_API int lua_pcallk (lua_State *L, int nargs, int nresults, int errfunc, + int ctx, lua_CFunction k) { + struct CallS c; + int status; + ptrdiff_t func; + lua_lock(L); + api_check(L, k == NULL || !isLua(L->ci), + "cannot use continuations inside hooks"); + api_checknelems(L, nargs+1); + api_check(L, L->status == LUA_OK, "cannot do calls on non-normal thread"); + checkresults(L, nargs, nresults); + if (errfunc == 0) + func = 0; + else { + StkId o = index2addr(L, errfunc); + api_checkstackindex(L, errfunc, o); + func = savestack(L, o); + } + c.func = L->top - (nargs+1); /* function to be called */ + if (k == NULL || L->nny > 0) { /* no continuation or no yieldable? */ + c.nresults = nresults; /* do a 'conventional' protected call */ + status = luaD_pcall(L, f_call, &c, savestack(L, c.func), func); + } + else { /* prepare continuation (call is already protected by 'resume') */ + CallInfo *ci = L->ci; + ci->u.c.k = k; /* save continuation */ + ci->u.c.ctx = ctx; /* save context */ + /* save information for error recovery */ + ci->extra = savestack(L, c.func); + ci->u.c.old_allowhook = L->allowhook; + ci->u.c.old_errfunc = L->errfunc; + L->errfunc = func; + /* mark that function may do error recovery */ + ci->callstatus |= CIST_YPCALL; + luaD_call(L, c.func, nresults, 1); /* do the call */ + ci->callstatus &= ~CIST_YPCALL; + L->errfunc = ci->u.c.old_errfunc; + status = LUA_OK; /* if it is here, there were no errors */ + } + adjustresults(L, nresults); + lua_unlock(L); + return status; +} + + +LUA_API int lua_load (lua_State *L, lua_Reader reader, void *data, + const char *chunkname, const char *mode) { + ZIO z; + int status; + lua_lock(L); + if (!chunkname) chunkname = "?"; + luaZ_init(L, &z, reader, data); + status = luaD_protectedparser(L, &z, chunkname, mode); + if (status == LUA_OK) { /* no errors? */ + LClosure *f = clLvalue(L->top - 1); /* get newly created function */ + if (f->nupvalues == 1) { /* does it have one upvalue? 
*/ + /* get global table from registry */ + Table *reg = hvalue(&G(L)->l_registry); + const TValue *gt = luaH_getint(reg, LUA_RIDX_GLOBALS); + /* set global table as 1st upvalue of 'f' (may be LUA_ENV) */ + setobj(L, f->upvals[0]->v, gt); + luaC_barrier(L, f->upvals[0], gt); + } + } + lua_unlock(L); + return status; +} + + +LUA_API int lua_dump (lua_State *L, lua_Writer writer, void *data) { + int status; + TValue *o; + lua_lock(L); + api_checknelems(L, 1); + o = L->top - 1; + if (isLfunction(o)) + status = luaU_dump(L, getproto(o), writer, data, 0); + else + status = 1; + lua_unlock(L); + return status; +} + + +LUA_API int lua_status (lua_State *L) { + return L->status; +} + + +/* +** Garbage-collection function +*/ + +LUA_API int lua_gc (lua_State *L, int what, int data) { + int res = 0; + global_State *g; + lua_lock(L); + g = G(L); + switch (what) { + case LUA_GCSTOP: { + g->gcrunning = 0; + break; + } + case LUA_GCRESTART: { + luaE_setdebt(g, 0); + g->gcrunning = 1; + break; + } + case LUA_GCCOLLECT: { + luaC_fullgc(L, 0); + break; + } + case LUA_GCCOUNT: { + /* GC values are expressed in Kbytes: #bytes/2^10 */ + res = cast_int(gettotalbytes(g) >> 10); + break; + } + case LUA_GCCOUNTB: { + res = cast_int(gettotalbytes(g) & 0x3ff); + break; + } + case LUA_GCSTEP: { + if (g->gckind == KGC_GEN) { /* generational mode? */ + res = (g->GCestimate == 0); /* true if it will do major collection */ + luaC_forcestep(L); /* do a single step */ + } + else { + lu_mem debt = cast(lu_mem, data) * 1024 - GCSTEPSIZE; + if (g->gcrunning) + debt += g->GCdebt; /* include current debt */ + luaE_setdebt(g, debt); + luaC_forcestep(L); + if (g->gcstate == GCSpause) /* end of cycle? */ + res = 1; /* signal it */ + } + break; + } + case LUA_GCSETPAUSE: { + res = g->gcpause; + g->gcpause = data; + break; + } + case LUA_GCSETMAJORINC: { + res = g->gcmajorinc; + g->gcmajorinc = data; + break; + } + case LUA_GCSETSTEPMUL: { + res = g->gcstepmul; + g->gcstepmul = data; + break; + } + case LUA_GCISRUNNING: { + res = g->gcrunning; + break; + } + case LUA_GCGEN: { /* change collector to generational mode */ + luaC_changemode(L, KGC_GEN); + break; + } + case LUA_GCINC: { /* change collector to incremental mode */ + luaC_changemode(L, KGC_NORMAL); + break; + } + default: res = -1; /* invalid option */ + } + lua_unlock(L); + return res; +} + + + +/* +** miscellaneous functions +*/ + + +LUA_API int lua_error (lua_State *L) { + lua_lock(L); + api_checknelems(L, 1); + luaG_errormsg(L); + /* code unreachable; will unlock when control actually leaves the kernel */ + return 0; /* to avoid warnings */ +} + + +LUA_API int lua_next (lua_State *L, int idx) { + StkId t; + int more; + lua_lock(L); + t = index2addr(L, idx); + api_check(L, ttistable(t), "table expected"); + more = luaH_next(L, hvalue(t), L->top - 1); + if (more) { + api_incr_top(L); + } + else /* no more elements */ + L->top -= 1; /* remove key */ + lua_unlock(L); + return more; +} + + +LUA_API void lua_concat (lua_State *L, int n) { + lua_lock(L); + api_checknelems(L, n); + if (n >= 2) { + luaC_checkGC(L); + luaV_concat(L, n); + } + else if (n == 0) { /* push empty string */ + setsvalue2s(L, L->top, luaS_newlstr(L, "", 0)); + api_incr_top(L); + } + /* else n == 1; nothing to do */ + lua_unlock(L); +} + + +LUA_API void lua_len (lua_State *L, int idx) { + StkId t; + lua_lock(L); + t = index2addr(L, idx); + luaV_objlen(L, L->top, t); + api_incr_top(L); + lua_unlock(L); +} + + +LUA_API lua_Alloc lua_getallocf (lua_State *L, void **ud) { + lua_Alloc f; + lua_lock(L); + if 
(ud) *ud = G(L)->ud; + f = G(L)->frealloc; + lua_unlock(L); + return f; +} + + +LUA_API void lua_setallocf (lua_State *L, lua_Alloc f, void *ud) { + lua_lock(L); + G(L)->ud = ud; + G(L)->frealloc = f; + lua_unlock(L); +} + + +LUA_API void *lua_newuserdata (lua_State *L, size_t size) { + Udata *u; + lua_lock(L); + luaC_checkGC(L); + u = luaS_newudata(L, size, NULL); + setuvalue(L, L->top, u); + api_incr_top(L); + lua_unlock(L); + return u + 1; +} + + + +static const char *aux_upvalue (StkId fi, int n, TValue **val, + GCObject **owner) { + switch (ttype(fi)) { + case LUA_TCCL: { /* C closure */ + CClosure *f = clCvalue(fi); + if (!(1 <= n && n <= f->nupvalues)) return NULL; + *val = &f->upvalue[n-1]; + if (owner) *owner = obj2gco(f); + return ""; + } + case LUA_TLCL: { /* Lua closure */ + LClosure *f = clLvalue(fi); + TString *name; + Proto *p = f->p; + if (!(1 <= n && n <= p->sizeupvalues)) return NULL; + *val = f->upvals[n-1]->v; + if (owner) *owner = obj2gco(f->upvals[n - 1]); + name = p->upvalues[n-1].name; + return (name == NULL) ? "" : getstr(name); + } + default: return NULL; /* not a closure */ + } +} + + +LUA_API const char *lua_getupvalue (lua_State *L, int funcindex, int n) { + const char *name; + TValue *val = NULL; /* to avoid warnings */ + lua_lock(L); + name = aux_upvalue(index2addr(L, funcindex), n, &val, NULL); + if (name) { + setobj2s(L, L->top, val); + api_incr_top(L); + } + lua_unlock(L); + return name; +} + + +LUA_API const char *lua_setupvalue (lua_State *L, int funcindex, int n) { + const char *name; + TValue *val = NULL; /* to avoid warnings */ + GCObject *owner = NULL; /* to avoid warnings */ + StkId fi; + lua_lock(L); + fi = index2addr(L, funcindex); + api_checknelems(L, 1); + name = aux_upvalue(fi, n, &val, &owner); + if (name) { + L->top--; + setobj(L, val, L->top); + luaC_barrier(L, owner, L->top); + } + lua_unlock(L); + return name; +} + + +static UpVal **getupvalref (lua_State *L, int fidx, int n, LClosure **pf) { + LClosure *f; + StkId fi = index2addr(L, fidx); + api_check(L, ttisLclosure(fi), "Lua function expected"); + f = clLvalue(fi); + api_check(L, (1 <= n && n <= f->p->sizeupvalues), "invalid upvalue index"); + if (pf) *pf = f; + return &f->upvals[n - 1]; /* get its upvalue pointer */ +} + + +LUA_API void *lua_upvalueid (lua_State *L, int fidx, int n) { + StkId fi = index2addr(L, fidx); + switch (ttype(fi)) { + case LUA_TLCL: { /* lua closure */ + return *getupvalref(L, fidx, n, NULL); + } + case LUA_TCCL: { /* C closure */ + CClosure *f = clCvalue(fi); + api_check(L, 1 <= n && n <= f->nupvalues, "invalid upvalue index"); + return &f->upvalue[n - 1]; + } + default: { + api_check(L, 0, "closure expected"); + return NULL; + } + } +} + + +LUA_API void lua_upvaluejoin (lua_State *L, int fidx1, int n1, + int fidx2, int n2) { + LClosure *f1; + UpVal **up1 = getupvalref(L, fidx1, n1, &f1); + UpVal **up2 = getupvalref(L, fidx2, n2, NULL); + *up1 = *up2; + luaC_objbarrier(L, f1, *up2); +} + diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lapi.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lapi.h new file mode 100644 index 000000000000..c7d34ad84866 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lapi.h @@ -0,0 +1,24 @@ +/* +** $Id: lapi.h,v 2.7.1.1 2013/04/12 18:48:47 roberto Exp $ +** Auxiliary functions from Lua API +** See Copyright Notice in lua.h +*/ + +#ifndef lapi_h +#define lapi_h + + +#include "llimits.h" +#include "lstate.h" + +#define api_incr_top(L) {L->top++; api_check(L, L->top <= L->ci->top, \ + "stack 
overflow");} + +#define adjustresults(L,nres) \ + { if ((nres) == LUA_MULTRET && L->ci->top < L->top) L->ci->top = L->top; } + +#define api_checknelems(L,n) api_check(L, (n) < (L->top - L->ci->func), \ + "not enough elements in the stack") + + +#endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lauxlib.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lauxlib.c new file mode 100644 index 000000000000..4bd13788b459 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lauxlib.c @@ -0,0 +1,791 @@ +/* +** $Id: lauxlib.c,v 1.248.1.1 2013/04/12 18:48:47 roberto Exp $ +** Auxiliary functions for building Lua libraries +** See Copyright Notice in lua.h +*/ + + +#include <sys/zfs_context.h> + +/* This file uses only the official API of Lua. +** Any function declared here could be written as an application function. +*/ + +#define lauxlib_c +#define LUA_LIB + +#include "lua.h" + +#include "lauxlib.h" + + +/* +** {====================================================== +** Traceback +** ======================================================= +*/ + + +#define LEVELS1 12 /* size of the first part of the stack */ +#define LEVELS2 10 /* size of the second part of the stack */ + + + +/* +** search for 'objidx' in table at index -1. +** return 1 + string at top if find a good name. +*/ +static int findfield (lua_State *L, int objidx, int level) { + if (level == 0 || !lua_istable(L, -1)) + return 0; /* not found */ + lua_pushnil(L); /* start 'next' loop */ + while (lua_next(L, -2)) { /* for each pair in table */ + if (lua_type(L, -2) == LUA_TSTRING) { /* ignore non-string keys */ + if (lua_rawequal(L, objidx, -1)) { /* found object? */ + lua_pop(L, 1); /* remove value (but keep name) */ + return 1; + } + else if (findfield(L, objidx, level - 1)) { /* try recursively */ + lua_remove(L, -2); /* remove table (but keep name) */ + lua_pushliteral(L, "."); + lua_insert(L, -2); /* place '.' between the two names */ + lua_concat(L, 3); + return 1; + } + } + lua_pop(L, 1); /* remove value */ + } + return 0; /* not found */ +} + + +static int pushglobalfuncname (lua_State *L, lua_Debug *ar) { + int top = lua_gettop(L); + lua_getinfo(L, "f", ar); /* push function */ + lua_pushglobaltable(L); + if (findfield(L, top + 1, 2)) { + lua_copy(L, -1, top + 1); /* move name to proper place */ + lua_pop(L, 2); /* remove pushed values */ + return 1; + } + else { + lua_settop(L, top); /* remove function and global table */ + return 0; + } +} + + +static void pushfuncname (lua_State *L, lua_Debug *ar) { + if (*ar->namewhat != '\0') /* is there a name? */ + lua_pushfstring(L, "function " LUA_QS, ar->name); + else if (*ar->what == 'm') /* main? 
*/ + lua_pushliteral(L, "main chunk"); + else if (*ar->what == 'C') { + if (pushglobalfuncname(L, ar)) { + lua_pushfstring(L, "function " LUA_QS, lua_tostring(L, -1)); + lua_remove(L, -2); /* remove name */ + } + else + lua_pushliteral(L, "?"); + } + else + lua_pushfstring(L, "function <%s:%d>", ar->short_src, ar->linedefined); +} + + +static int countlevels (lua_State *L) { + lua_Debug ar; + int li = 1, le = 1; + /* find an upper bound */ + while (lua_getstack(L, le, &ar)) { li = le; le *= 2; } + /* do a binary search */ + while (li < le) { + int m = (li + le)/2; + if (lua_getstack(L, m, &ar)) li = m + 1; + else le = m; + } + return le - 1; +} + + +LUALIB_API void luaL_traceback (lua_State *L, lua_State *L1, + const char *msg, int level) { + lua_Debug ar; + int top = lua_gettop(L); + int numlevels = countlevels(L1); + int mark = (numlevels > LEVELS1 + LEVELS2) ? LEVELS1 : 0; + if (msg) lua_pushfstring(L, "%s\n", msg); + lua_pushliteral(L, "stack traceback:"); + while (lua_getstack(L1, level++, &ar)) { + if (level == mark) { /* too many levels? */ + lua_pushliteral(L, "\n\t..."); /* add a '...' */ + level = numlevels - LEVELS2; /* and skip to last ones */ + } + else { + lua_getinfo(L1, "Slnt", &ar); + lua_pushfstring(L, "\n\t%s:", ar.short_src); + if (ar.currentline > 0) + lua_pushfstring(L, "%d:", ar.currentline); + lua_pushliteral(L, " in "); + pushfuncname(L, &ar); + if (ar.istailcall) + lua_pushliteral(L, "\n\t(...tail calls...)"); + lua_concat(L, lua_gettop(L) - top); + } + } + lua_concat(L, lua_gettop(L) - top); +} + +/* }====================================================== */ + + +/* +** {====================================================== +** Error-report functions +** ======================================================= +*/ + +LUALIB_API int luaL_argerror (lua_State *L, int narg, const char *extramsg) { + lua_Debug ar; + if (!lua_getstack(L, 0, &ar)) /* no stack frame? */ + return luaL_error(L, "bad argument #%d (%s)", narg, extramsg); + lua_getinfo(L, "n", &ar); + if (strcmp(ar.namewhat, "method") == 0) { + narg--; /* do not count `self' */ + if (narg == 0) /* error is in the self argument itself? */ + return luaL_error(L, "calling " LUA_QS " on bad self (%s)", + ar.name, extramsg); + } + if (ar.name == NULL) + ar.name = (pushglobalfuncname(L, &ar)) ? lua_tostring(L, -1) : "?"; + return luaL_error(L, "bad argument #%d to " LUA_QS " (%s)", + narg, ar.name, extramsg); +} + + +static int typeerror (lua_State *L, int narg, const char *tname) { + const char *msg = lua_pushfstring(L, "%s expected, got %s", + tname, luaL_typename(L, narg)); + return luaL_argerror(L, narg, msg); +} + + +static void tag_error (lua_State *L, int narg, int tag) { + typeerror(L, narg, lua_typename(L, tag)); +} + + +LUALIB_API void luaL_where (lua_State *L, int level) { + lua_Debug ar; + if (lua_getstack(L, level, &ar)) { /* check function at level */ + lua_getinfo(L, "Sl", &ar); /* get info about it */ + if (ar.currentline > 0) { /* is there info? */ + lua_pushfstring(L, "%s:%d: ", ar.short_src, ar.currentline); + return; + } + } + lua_pushliteral(L, ""); /* else, no information available... */ +} + + +LUALIB_API int luaL_error (lua_State *L, const char *fmt, ...) 
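/* prepends the caller's source location to the message and raises it; does not return */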
{ + va_list argp; + va_start(argp, fmt); + luaL_where(L, 1); + lua_pushvfstring(L, fmt, argp); + va_end(argp); + lua_concat(L, 2); + return lua_error(L); +} + + +#if !defined(inspectstat) /* { */ + +#if defined(LUA_USE_POSIX) + +#include <sys/wait.h> + +/* +** use appropriate macros to interpret 'pclose' return status +*/ +#define inspectstat(stat,what) \ + if (WIFEXITED(stat)) { stat = WEXITSTATUS(stat); } \ + else if (WIFSIGNALED(stat)) { stat = WTERMSIG(stat); what = "signal"; } + +#else + +#define inspectstat(stat,what) /* no op */ + +#endif + +#endif /* } */ + + +/* }====================================================== */ + + +/* +** {====================================================== +** Userdata's metatable manipulation +** ======================================================= +*/ + +LUALIB_API int luaL_newmetatable (lua_State *L, const char *tname) { + luaL_getmetatable(L, tname); /* try to get metatable */ + if (!lua_isnil(L, -1)) /* name already in use? */ + return 0; /* leave previous value on top, but return 0 */ + lua_pop(L, 1); + lua_newtable(L); /* create metatable */ + lua_pushvalue(L, -1); + lua_setfield(L, LUA_REGISTRYINDEX, tname); /* registry.name = metatable */ + return 1; +} + + +LUALIB_API void luaL_setmetatable (lua_State *L, const char *tname) { + luaL_getmetatable(L, tname); + lua_setmetatable(L, -2); +} + + +LUALIB_API void *luaL_testudata (lua_State *L, int ud, const char *tname) { + void *p = lua_touserdata(L, ud); + if (p != NULL) { /* value is a userdata? */ + if (lua_getmetatable(L, ud)) { /* does it have a metatable? */ + luaL_getmetatable(L, tname); /* get correct metatable */ + if (!lua_rawequal(L, -1, -2)) /* not the same? */ + p = NULL; /* value is a userdata with wrong metatable */ + lua_pop(L, 2); /* remove both metatables */ + return p; + } + } + return NULL; /* value is not a userdata with a metatable */ +} + + +LUALIB_API void *luaL_checkudata (lua_State *L, int ud, const char *tname) { + void *p = luaL_testudata(L, ud, tname); + if (p == NULL) typeerror(L, ud, tname); + return p; +} + +/* }====================================================== */ + + +/* +** {====================================================== +** Argument check functions +** ======================================================= +*/ + +LUALIB_API int luaL_checkoption (lua_State *L, int narg, const char *def, + const char *const lst[]) { + const char *name = (def) ? 
luaL_optstring(L, narg, def) : + luaL_checkstring(L, narg); + int i; + for (i=0; lst[i]; i++) + if (strcmp(lst[i], name) == 0) + return i; + return luaL_argerror(L, narg, + lua_pushfstring(L, "invalid option " LUA_QS, name)); +} + + +LUALIB_API void luaL_checkstack (lua_State *L, int space, const char *msg) { + /* keep some extra space to run error routines, if needed */ + const int extra = LUA_MINSTACK; + if (!lua_checkstack(L, space + extra)) { + if (msg) + luaL_error(L, "stack overflow (%s)", msg); + else + luaL_error(L, "stack overflow"); + } +} + + +LUALIB_API void luaL_checktype (lua_State *L, int narg, int t) { + if (lua_type(L, narg) != t) + tag_error(L, narg, t); +} + + +LUALIB_API void luaL_checkany (lua_State *L, int narg) { + if (lua_type(L, narg) == LUA_TNONE) + luaL_argerror(L, narg, "value expected"); +} + + +LUALIB_API const char *luaL_checklstring (lua_State *L, int narg, size_t *len) { + const char *s = lua_tolstring(L, narg, len); + if (!s) tag_error(L, narg, LUA_TSTRING); + return s; +} + + +LUALIB_API const char *luaL_optlstring (lua_State *L, int narg, + const char *def, size_t *len) { + if (lua_isnoneornil(L, narg)) { + if (len) + *len = (def ? strlen(def) : 0); + return def; + } + else return luaL_checklstring(L, narg, len); +} + + +LUALIB_API lua_Number luaL_checknumber (lua_State *L, int narg) { + int isnum; + lua_Number d = lua_tonumberx(L, narg, &isnum); + if (!isnum) + tag_error(L, narg, LUA_TNUMBER); + return d; +} + + +LUALIB_API lua_Number luaL_optnumber (lua_State *L, int narg, lua_Number def) { + return luaL_opt(L, luaL_checknumber, narg, def); +} + + +LUALIB_API lua_Integer luaL_checkinteger (lua_State *L, int narg) { + int isnum; + lua_Integer d = lua_tointegerx(L, narg, &isnum); + if (!isnum) + tag_error(L, narg, LUA_TNUMBER); + return d; +} + + +LUALIB_API lua_Unsigned luaL_checkunsigned (lua_State *L, int narg) { + int isnum; + lua_Unsigned d = lua_tounsignedx(L, narg, &isnum); + if (!isnum) + tag_error(L, narg, LUA_TNUMBER); + return d; +} + + +LUALIB_API lua_Integer luaL_optinteger (lua_State *L, int narg, + lua_Integer def) { + return luaL_opt(L, luaL_checkinteger, narg, def); +} + + +LUALIB_API lua_Unsigned luaL_optunsigned (lua_State *L, int narg, + lua_Unsigned def) { + return luaL_opt(L, luaL_checkunsigned, narg, def); +} + +/* }====================================================== */ + + +/* +** {====================================================== +** Generic Buffer manipulation +** ======================================================= +*/ + +/* +** check whether buffer is using a userdata on the stack as a temporary +** buffer +*/ +#define buffonstack(B) ((B)->b != (B)->initb) + + +/* +** returns a pointer to a free area with at least 'sz' bytes +*/ +LUALIB_API char *luaL_prepbuffsize (luaL_Buffer *B, size_t sz) { + lua_State *L = B->L; + if (B->size - B->n < sz) { /* not enough space? */ + char *newbuff; + size_t newsize = B->size * 2; /* double buffer size */ + if (newsize - B->n < sz) /* not big enough? 
*/ + newsize = B->n + sz; + if (newsize < B->n || newsize - B->n < sz) + luaL_error(L, "buffer too large"); + /* create larger buffer */ + newbuff = (char *)lua_newuserdata(L, newsize * sizeof(char)); + /* move content to new buffer */ + memcpy(newbuff, B->b, B->n * sizeof(char)); + if (buffonstack(B)) + lua_remove(L, -2); /* remove old buffer */ + B->b = newbuff; + B->size = newsize; + } + return &B->b[B->n]; +} + + +LUALIB_API void luaL_addlstring (luaL_Buffer *B, const char *s, size_t l) { + char *b = luaL_prepbuffsize(B, l); + memcpy(b, s, l * sizeof(char)); + luaL_addsize(B, l); +} + + +LUALIB_API void luaL_addstring (luaL_Buffer *B, const char *s) { + luaL_addlstring(B, s, strlen(s)); +} + + +LUALIB_API void luaL_pushresult (luaL_Buffer *B) { + lua_State *L = B->L; + lua_pushlstring(L, B->b, B->n); + if (buffonstack(B)) + lua_remove(L, -2); /* remove old buffer */ +} + + +LUALIB_API void luaL_pushresultsize (luaL_Buffer *B, size_t sz) { + luaL_addsize(B, sz); + luaL_pushresult(B); +} + + +LUALIB_API void luaL_addvalue (luaL_Buffer *B) { + lua_State *L = B->L; + size_t l; + const char *s = lua_tolstring(L, -1, &l); + if (buffonstack(B)) + lua_insert(L, -2); /* put value below buffer */ + luaL_addlstring(B, s, l); + lua_remove(L, (buffonstack(B)) ? -2 : -1); /* remove value */ +} + + +LUALIB_API void luaL_buffinit (lua_State *L, luaL_Buffer *B) { + B->L = L; + B->b = B->initb; + B->n = 0; + B->size = LUAL_BUFFERSIZE; +} + + +LUALIB_API char *luaL_buffinitsize (lua_State *L, luaL_Buffer *B, size_t sz) { + luaL_buffinit(L, B); + return luaL_prepbuffsize(B, sz); +} + +/* }====================================================== */ + + +/* +** {====================================================== +** Reference system +** ======================================================= +*/ + +/* index of free-list header */ +#define freelist 0 + + +LUALIB_API int luaL_ref (lua_State *L, int t) { + int ref; + if (lua_isnil(L, -1)) { + lua_pop(L, 1); /* remove from stack */ + return LUA_REFNIL; /* `nil' has a unique fixed reference */ + } + t = lua_absindex(L, t); + lua_rawgeti(L, t, freelist); /* get first free element */ + ref = (int)lua_tointeger(L, -1); /* ref = t[freelist] */ + lua_pop(L, 1); /* remove it from stack */ + if (ref != 0) { /* any free element? 
*/ + lua_rawgeti(L, t, ref); /* remove it from list */ + lua_rawseti(L, t, freelist); /* (t[freelist] = t[ref]) */ + } + else /* no free elements */ + ref = (int)lua_rawlen(L, t) + 1; /* get a new reference */ + lua_rawseti(L, t, ref); + return ref; +} + + +LUALIB_API void luaL_unref (lua_State *L, int t, int ref) { + if (ref >= 0) { + t = lua_absindex(L, t); + lua_rawgeti(L, t, freelist); + lua_rawseti(L, t, ref); /* t[ref] = t[freelist] */ + lua_pushinteger(L, ref); + lua_rawseti(L, t, freelist); /* t[freelist] = ref */ + } +} + +/* }====================================================== */ + + +/* +** {====================================================== +** Load functions +** ======================================================= +*/ + +typedef struct LoadS { + const char *s; + size_t size; +} LoadS; + + +static const char *getS (lua_State *L, void *ud, size_t *size) { + LoadS *ls = (LoadS *)ud; + (void)L; /* not used */ + if (ls->size == 0) return NULL; + *size = ls->size; + ls->size = 0; + return ls->s; +} + + +LUALIB_API int luaL_loadbufferx (lua_State *L, const char *buff, size_t size, + const char *name, const char *mode) { + LoadS ls; + ls.s = buff; + ls.size = size; + return lua_load(L, getS, &ls, name, mode); +} + + +LUALIB_API int luaL_loadstring (lua_State *L, const char *s) { + return luaL_loadbuffer(L, s, strlen(s), s); +} + +/* }====================================================== */ + + + +LUALIB_API int luaL_getmetafield (lua_State *L, int obj, const char *event) { + if (!lua_getmetatable(L, obj)) /* no metatable? */ + return 0; + lua_pushstring(L, event); + lua_rawget(L, -2); + if (lua_isnil(L, -1)) { + lua_pop(L, 2); /* remove metatable and metafield */ + return 0; + } + else { + lua_remove(L, -2); /* remove only metatable */ + return 1; + } +} + + +LUALIB_API int luaL_callmeta (lua_State *L, int obj, const char *event) { + obj = lua_absindex(L, obj); + if (!luaL_getmetafield(L, obj, event)) /* no metafield? */ + return 0; + lua_pushvalue(L, obj); + lua_call(L, 1, 1); + return 1; +} + + +LUALIB_API int luaL_len (lua_State *L, int idx) { + int l; + int isnum; + lua_len(L, idx); + l = (int)lua_tointegerx(L, -1, &isnum); + if (!isnum) + luaL_error(L, "object length is not a number"); + lua_pop(L, 1); /* remove object */ + return l; +} + + +LUALIB_API const char *luaL_tolstring (lua_State *L, int idx, size_t *len) { + if (!luaL_callmeta(L, idx, "__tostring")) { /* no metafield? */ + switch (lua_type(L, idx)) { + case LUA_TNUMBER: + case LUA_TSTRING: + lua_pushvalue(L, idx); + break; + case LUA_TBOOLEAN: + lua_pushstring(L, (lua_toboolean(L, idx) ? "true" : "false")); + break; + case LUA_TNIL: + lua_pushliteral(L, "nil"); + break; + default: + lua_pushfstring(L, "%s: %p", luaL_typename(L, idx), + lua_topointer(L, idx)); + break; + } + } + return lua_tolstring(L, -1, len); +} + + +/* +** {====================================================== +** Compatibility with 5.1 module functions +** ======================================================= +*/ +#if defined(LUA_COMPAT_MODULE) + +static const char *luaL_findtable (lua_State *L, int idx, + const char *fname, int szhint) { + const char *e; + if (idx) lua_pushvalue(L, idx); + do { + e = strchr(fname, '.'); + if (e == NULL) e = fname + strlen(fname); + lua_pushlstring(L, fname, e - fname); + lua_rawget(L, -2); + if (lua_isnil(L, -1)) { /* no such field? */ + lua_pop(L, 1); /* remove this nil */ + lua_createtable(L, 0, (*e == '.' ? 
1 : szhint)); /* new table for field */ + lua_pushlstring(L, fname, e - fname); + lua_pushvalue(L, -2); + lua_settable(L, -4); /* set new table into field */ + } + else if (!lua_istable(L, -1)) { /* field has a non-table value? */ + lua_pop(L, 2); /* remove table and value */ + return fname; /* return problematic part of the name */ + } + lua_remove(L, -2); /* remove previous table */ + fname = e + 1; + } while (*e == '.'); + return NULL; +} + + +/* +** Count number of elements in a luaL_Reg list. +*/ +static int libsize (const luaL_Reg *l) { + int size = 0; + for (; l && l->name; l++) size++; + return size; +} + + +/* +** Find or create a module table with a given name. The function +** first looks at the _LOADED table and, if that fails, try a +** global variable with that name. In any case, leaves on the stack +** the module table. +*/ +LUALIB_API void luaL_pushmodule (lua_State *L, const char *modname, + int sizehint) { + luaL_findtable(L, LUA_REGISTRYINDEX, "_LOADED", 1); /* get _LOADED table */ + lua_getfield(L, -1, modname); /* get _LOADED[modname] */ + if (!lua_istable(L, -1)) { /* not found? */ + lua_pop(L, 1); /* remove previous result */ + /* try global variable (and create one if it does not exist) */ + lua_pushglobaltable(L); + if (luaL_findtable(L, 0, modname, sizehint) != NULL) + luaL_error(L, "name conflict for module " LUA_QS, modname); + lua_pushvalue(L, -1); + lua_setfield(L, -3, modname); /* _LOADED[modname] = new table */ + } + lua_remove(L, -2); /* remove _LOADED table */ +} + + +LUALIB_API void luaL_openlib (lua_State *L, const char *libname, + const luaL_Reg *l, int nup) { + luaL_checkversion(L); + if (libname) { + luaL_pushmodule(L, libname, libsize(l)); /* get/create library table */ + lua_insert(L, -(nup + 1)); /* move library table to below upvalues */ + } + if (l) + luaL_setfuncs(L, l, nup); + else + lua_pop(L, nup); /* remove upvalues */ +} + +#endif +/* }====================================================== */ + +/* +** set functions from list 'l' into table at top - 'nup'; each +** function gets the 'nup' elements at the top as upvalues. +** Returns with only the table at the stack. +*/ +LUALIB_API void luaL_setfuncs (lua_State *L, const luaL_Reg *l, int nup) { + luaL_checkversion(L); + luaL_checkstack(L, nup, "too many upvalues"); + for (; l->name != NULL; l++) { /* fill the table with given functions */ + int i; + for (i = 0; i < nup; i++) /* copy upvalues to the top */ + lua_pushvalue(L, -nup); + lua_pushcclosure(L, l->func, nup); /* closure with those upvalues */ + lua_setfield(L, -(nup + 2), l->name); + } + lua_pop(L, nup); /* remove upvalues */ +} + + +/* +** ensure that stack[idx][fname] has a table and push that table +** into the stack +*/ +LUALIB_API int luaL_getsubtable (lua_State *L, int idx, const char *fname) { + lua_getfield(L, idx, fname); + if (lua_istable(L, -1)) return 1; /* table already there */ + else { + lua_pop(L, 1); /* remove previous result */ + idx = lua_absindex(L, idx); + lua_newtable(L); + lua_pushvalue(L, -1); /* copy to be left at top */ + lua_setfield(L, idx, fname); /* assign new table to field */ + return 0; /* false, because did not find table there */ + } +} + + +/* +** stripped-down 'require'. Calls 'openf' to open a module, +** registers the result in 'package.loaded' table and, if 'glb' +** is true, also registers the result in the global table. +** Leaves resulting module on the top. 
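+** (Callers that do not need the module on the stack can simply pop it.)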
+*/ +LUALIB_API void luaL_requiref (lua_State *L, const char *modname, + lua_CFunction openf, int glb) { + lua_pushcfunction(L, openf); + lua_pushstring(L, modname); /* argument to open function */ + lua_call(L, 1, 1); /* open module */ + luaL_getsubtable(L, LUA_REGISTRYINDEX, "_LOADED"); + lua_pushvalue(L, -2); /* make copy of module (call result) */ + lua_setfield(L, -2, modname); /* _LOADED[modname] = module */ + lua_pop(L, 1); /* remove _LOADED table */ + if (glb) { + lua_pushvalue(L, -1); /* copy of 'mod' */ + lua_setglobal(L, modname); /* _G[modname] = module */ + } +} + + +LUALIB_API const char *luaL_gsub (lua_State *L, const char *s, const char *p, + const char *r) { + const char *wild; + size_t l = strlen(p); + luaL_Buffer b; + luaL_buffinit(L, &b); + while ((wild = strstr(s, p)) != NULL) { + luaL_addlstring(&b, s, wild - s); /* push prefix */ + luaL_addstring(&b, r); /* push replacement in place of pattern */ + s = wild + l; /* continue after `p' */ + } + luaL_addstring(&b, s); /* push last suffix */ + luaL_pushresult(&b); + return lua_tostring(L, -1); +} + + +LUALIB_API void luaL_checkversion_ (lua_State *L, lua_Number ver) { + const lua_Number *v = lua_version(L); + if (v != lua_version(NULL)) + luaL_error(L, "multiple Lua VMs detected"); + else if (*v != ver) + luaL_error(L, "version mismatch: app. needs %f, Lua core provides %f", + ver, *v); + /* check conversions number -> integer types */ + lua_pushnumber(L, -(lua_Number)0x1234); + if (lua_tointeger(L, -1) != -0x1234 || + lua_tounsigned(L, -1) != (lua_Unsigned)-0x1234) + luaL_error(L, "bad conversion number->int;" + " must recompile Lua with proper settings"); + lua_pop(L, 1); +} + diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lauxlib.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lauxlib.h new file mode 100644 index 000000000000..f6fdac14f50b --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lauxlib.h @@ -0,0 +1,176 @@ +/* +** $Id: lauxlib.h,v 1.120.1.1 2013/04/12 18:48:47 roberto Exp $ +** Auxiliary functions for building Lua libraries +** See Copyright Notice in lua.h +*/ + + +#ifndef lauxlib_h +#define lauxlib_h + + +#include <sys/zfs_context.h> + +#include "lua.h" + + + +/* extra error code for `luaL_load' */ +#define LUA_ERRFILE (LUA_ERRERR+1) + + +typedef struct luaL_Reg { + const char *name; + lua_CFunction func; +} luaL_Reg; + + +LUALIB_API void (luaL_checkversion_) (lua_State *L, lua_Number ver); +#define luaL_checkversion(L) luaL_checkversion_(L, LUA_VERSION_NUM) + +LUALIB_API int (luaL_getmetafield) (lua_State *L, int obj, const char *e); +LUALIB_API int (luaL_callmeta) (lua_State *L, int obj, const char *e); +LUALIB_API const char *(luaL_tolstring) (lua_State *L, int idx, size_t *len); +LUALIB_API int (luaL_argerror) (lua_State *L, int numarg, const char *extramsg); +LUALIB_API const char *(luaL_checklstring) (lua_State *L, int numArg, + size_t *l); +LUALIB_API const char *(luaL_optlstring) (lua_State *L, int numArg, + const char *def, size_t *l); +LUALIB_API lua_Number (luaL_checknumber) (lua_State *L, int numArg); +LUALIB_API lua_Number (luaL_optnumber) (lua_State *L, int nArg, lua_Number def); + +LUALIB_API lua_Integer (luaL_checkinteger) (lua_State *L, int numArg); +LUALIB_API lua_Integer (luaL_optinteger) (lua_State *L, int nArg, + lua_Integer def); +LUALIB_API lua_Unsigned (luaL_checkunsigned) (lua_State *L, int numArg); +LUALIB_API lua_Unsigned (luaL_optunsigned) (lua_State *L, int numArg, + lua_Unsigned def); + +LUALIB_API void (luaL_checkstack) (lua_State 
*L, int sz, const char *msg); +LUALIB_API void (luaL_checktype) (lua_State *L, int narg, int t); +LUALIB_API void (luaL_checkany) (lua_State *L, int narg); + +LUALIB_API int (luaL_newmetatable) (lua_State *L, const char *tname); +LUALIB_API void (luaL_setmetatable) (lua_State *L, const char *tname); +LUALIB_API void *(luaL_testudata) (lua_State *L, int ud, const char *tname); +LUALIB_API void *(luaL_checkudata) (lua_State *L, int ud, const char *tname); + +LUALIB_API void (luaL_where) (lua_State *L, int lvl); +LUALIB_API int (luaL_error) (lua_State *L, const char *fmt, ...); + +LUALIB_API int (luaL_checkoption) (lua_State *L, int narg, const char *def, + const char *const lst[]); + +/* pre-defined references */ +#define LUA_NOREF (-2) +#define LUA_REFNIL (-1) + +LUALIB_API int (luaL_ref) (lua_State *L, int t); +LUALIB_API void (luaL_unref) (lua_State *L, int t, int ref); + +LUALIB_API int (luaL_loadbufferx) (lua_State *L, const char *buff, size_t sz, + const char *name, const char *mode); +LUALIB_API int (luaL_loadstring) (lua_State *L, const char *s); + +LUALIB_API int (luaL_len) (lua_State *L, int idx); + +LUALIB_API const char *(luaL_gsub) (lua_State *L, const char *s, const char *p, + const char *r); + +LUALIB_API void (luaL_setfuncs) (lua_State *L, const luaL_Reg *l, int nup); + +LUALIB_API int (luaL_getsubtable) (lua_State *L, int idx, const char *fname); + +LUALIB_API void (luaL_traceback) (lua_State *L, lua_State *L1, + const char *msg, int level); + +LUALIB_API void (luaL_requiref) (lua_State *L, const char *modname, + lua_CFunction openf, int glb); + +/* +** =============================================================== +** some useful macros +** =============================================================== +*/ + + +#define luaL_newlibtable(L,l) \ + lua_createtable(L, 0, sizeof(l)/sizeof((l)[0]) - 1) + +#define luaL_newlib(L,l) (luaL_newlibtable(L,l), luaL_setfuncs(L,l,0)) + +#define luaL_argcheck(L, cond,numarg,extramsg) \ + ((void)((cond) || luaL_argerror(L, (numarg), (extramsg)))) +#define luaL_checkstring(L,n) (luaL_checklstring(L, (n), NULL)) +#define luaL_optstring(L,n,d) (luaL_optlstring(L, (n), (d), NULL)) +#define luaL_checkint(L,n) ((int)luaL_checkinteger(L, (n))) +#define luaL_optint(L,n,d) ((int)luaL_optinteger(L, (n), (d))) +#define luaL_checklong(L,n) ((long)luaL_checkinteger(L, (n))) +#define luaL_optlong(L,n,d) ((long)luaL_optinteger(L, (n), (d))) + +#define luaL_typename(L,i) lua_typename(L, lua_type(L,(i))) + +#define luaL_dofile(L, fn) \ + (luaL_loadfile(L, fn) || lua_pcall(L, 0, LUA_MULTRET, 0)) + +#define luaL_dostring(L, s) \ + (luaL_loadstring(L, s) || lua_pcall(L, 0, LUA_MULTRET, 0)) + +#define luaL_getmetatable(L,n) (lua_getfield(L, LUA_REGISTRYINDEX, (n))) + +#define luaL_opt(L,f,n,d) (lua_isnoneornil(L,(n)) ? 
(d) : f(L,(n))) + +#define luaL_loadbuffer(L,s,sz,n) luaL_loadbufferx(L,s,sz,n,NULL) + + +/* +** {====================================================== +** Generic Buffer manipulation +** ======================================================= +*/ + +typedef struct luaL_Buffer { + char *b; /* buffer address */ + size_t size; /* buffer size */ + size_t n; /* number of characters in buffer */ + lua_State *L; + char initb[LUAL_BUFFERSIZE]; /* initial buffer */ +} luaL_Buffer; + + +#define luaL_addchar(B,c) \ + ((void)((B)->n < (B)->size || luaL_prepbuffsize((B), 1)), \ + ((B)->b[(B)->n++] = (c))) + +#define luaL_addsize(B,s) ((B)->n += (s)) + +LUALIB_API void (luaL_buffinit) (lua_State *L, luaL_Buffer *B); +LUALIB_API char *(luaL_prepbuffsize) (luaL_Buffer *B, size_t sz); +LUALIB_API void (luaL_addlstring) (luaL_Buffer *B, const char *s, size_t l); +LUALIB_API void (luaL_addstring) (luaL_Buffer *B, const char *s); +LUALIB_API void (luaL_addvalue) (luaL_Buffer *B); +LUALIB_API void (luaL_pushresult) (luaL_Buffer *B); +LUALIB_API void (luaL_pushresultsize) (luaL_Buffer *B, size_t sz); +LUALIB_API char *(luaL_buffinitsize) (lua_State *L, luaL_Buffer *B, size_t sz); + +#define luaL_prepbuffer(B) luaL_prepbuffsize(B, LUAL_BUFFERSIZE) + +/* }====================================================== */ + + +/* compatibility with old module system */ +#if defined(LUA_COMPAT_MODULE) + +LUALIB_API void (luaL_pushmodule) (lua_State *L, const char *modname, + int sizehint); +LUALIB_API void (luaL_openlib) (lua_State *L, const char *libname, + const luaL_Reg *l, int nup); + +#define luaL_register(L,n,l) (luaL_openlib(L,(n),(l),0)) + +#endif + + +#endif + + diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lbaselib.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lbaselib.c new file mode 100644 index 000000000000..b580cee1f955 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lbaselib.c @@ -0,0 +1,296 @@ +/* +** $Id: lbaselib.c,v 1.276.1.1 2013/04/12 18:48:47 roberto Exp $ +** Basic library +** See Copyright Notice in lua.h +*/ + +/* The following built-in lua functions have been removed and are not available + * for use in ZFS channel programs: + * + * dofile + * loadfile + * load + * pcall + * print + * xpcall + */ + +#include <sys/zfs_context.h> +#include <sys/ctype.h> +#ifdef illumos +#define toupper(C) (((C) >= 'a' && (C) <= 'z')? (C) - 'a' + 'A': (C)) +#else +#define isalnum(C) (isalpha(C) || isdigit(C)) +#endif + +#define lbaselib_c +#define LUA_LIB + +#include "lua.h" + +#include "lauxlib.h" +#include "lualib.h" + +#define SPACECHARS " \f\n\r\t\v" + +static int luaB_tonumber (lua_State *L) { + if (lua_isnoneornil(L, 2)) { /* standard conversion */ + int isnum; + lua_Number n = lua_tonumberx(L, 1, &isnum); + if (isnum) { + lua_pushnumber(L, n); + return 1; + } /* else not a number; must be something */ + luaL_checkany(L, 1); + } + else { + size_t l; + const char *s = luaL_checklstring(L, 1, &l); + const char *e = s + l; /* end point for 's' */ + int base = luaL_checkint(L, 2); + int neg = 0; + luaL_argcheck(L, 2 <= base && base <= 36, 2, "base out of range"); + s += strspn(s, SPACECHARS); /* skip initial spaces */ + if (*s == '-') { s++; neg = 1; } /* handle signal */ + else if (*s == '+') s++; + if (isalnum((unsigned char)*s)) { + lua_Number n = 0; + do { + int digit = (isdigit((unsigned char)*s)) ? 
*s - '0' + : toupper((unsigned char)*s) - 'A' + 10; + if (digit >= base) break; /* invalid numeral; force a fail */ + n = n * (lua_Number)base + (lua_Number)digit; + s++; + } while (isalnum((unsigned char)*s)); + s += strspn(s, SPACECHARS); /* skip trailing spaces */ + if (s == e) { /* no invalid trailing characters? */ + lua_pushnumber(L, (neg) ? -n : n); + return 1; + } /* else not a number */ + } /* else not a number */ + } + lua_pushnil(L); /* not a number */ + return 1; +} + + +static int luaB_error (lua_State *L) { + int level = luaL_optint(L, 2, 1); + lua_settop(L, 1); + if (lua_isstring(L, 1) && level > 0) { /* add extra information? */ + luaL_where(L, level); + lua_pushvalue(L, 1); + lua_concat(L, 2); + } + return lua_error(L); +} + + +static int luaB_getmetatable (lua_State *L) { + luaL_checkany(L, 1); + if (!lua_getmetatable(L, 1)) { + lua_pushnil(L); + return 1; /* no metatable */ + } + luaL_getmetafield(L, 1, "__metatable"); + return 1; /* returns either __metatable field (if present) or metatable */ +} + + +static int luaB_setmetatable (lua_State *L) { + int t = lua_type(L, 2); + luaL_checktype(L, 1, LUA_TTABLE); + luaL_argcheck(L, t == LUA_TNIL || t == LUA_TTABLE, 2, + "nil or table expected"); + if (luaL_getmetafield(L, 1, "__metatable")) + return luaL_error(L, "cannot change a protected metatable"); + lua_settop(L, 2); + lua_setmetatable(L, 1); + return 1; +} + + +static int luaB_rawequal (lua_State *L) { + luaL_checkany(L, 1); + luaL_checkany(L, 2); + lua_pushboolean(L, lua_rawequal(L, 1, 2)); + return 1; +} + + +static int luaB_rawlen (lua_State *L) { + int t = lua_type(L, 1); + luaL_argcheck(L, t == LUA_TTABLE || t == LUA_TSTRING, 1, + "table or string expected"); + lua_pushinteger(L, lua_rawlen(L, 1)); + return 1; +} + + +static int luaB_rawget (lua_State *L) { + luaL_checktype(L, 1, LUA_TTABLE); + luaL_checkany(L, 2); + lua_settop(L, 2); + lua_rawget(L, 1); + return 1; +} + +static int luaB_rawset (lua_State *L) { + luaL_checktype(L, 1, LUA_TTABLE); + luaL_checkany(L, 2); + luaL_checkany(L, 3); + lua_settop(L, 3); + lua_rawset(L, 1); + return 1; +} + + +static int luaB_collectgarbage (lua_State *L) { + static const char *const opts[] = {"stop", "restart", "collect", + "count", "step", "setpause", "setstepmul", + "setmajorinc", "isrunning", "generational", "incremental", NULL}; + static const int optsnum[] = {LUA_GCSTOP, LUA_GCRESTART, LUA_GCCOLLECT, + LUA_GCCOUNT, LUA_GCSTEP, LUA_GCSETPAUSE, LUA_GCSETSTEPMUL, + LUA_GCSETMAJORINC, LUA_GCISRUNNING, LUA_GCGEN, LUA_GCINC}; + int o = optsnum[luaL_checkoption(L, 1, "collect", opts)]; + int ex = luaL_optint(L, 2, 0); + int res = lua_gc(L, o, ex); + switch (o) { + case LUA_GCCOUNT: { + int b = lua_gc(L, LUA_GCCOUNTB, 0); + lua_pushnumber(L, res + ((lua_Number)b/1024)); + lua_pushinteger(L, b); + return 2; + } + case LUA_GCSTEP: case LUA_GCISRUNNING: { + lua_pushboolean(L, res); + return 1; + } + default: { + lua_pushinteger(L, res); + return 1; + } + } +} + + +static int luaB_type (lua_State *L) { + luaL_checkany(L, 1); + lua_pushstring(L, luaL_typename(L, 1)); + return 1; +} + + +static int pairsmeta (lua_State *L, const char *method, int iszero, + lua_CFunction iter) { + if (!luaL_getmetafield(L, 1, method)) { /* no metamethod? 
*/ + luaL_checktype(L, 1, LUA_TTABLE); /* argument must be a table */ + lua_pushcfunction(L, iter); /* will return generator, */ + lua_pushvalue(L, 1); /* state, */ + if (iszero) lua_pushinteger(L, 0); /* and initial value */ + else lua_pushnil(L); + } + else { + lua_pushvalue(L, 1); /* argument 'self' to metamethod */ + lua_call(L, 1, 3); /* get 3 values from metamethod */ + } + return 3; +} + + +static int luaB_next (lua_State *L) { + luaL_checktype(L, 1, LUA_TTABLE); + lua_settop(L, 2); /* create a 2nd argument if there isn't one */ + if (lua_next(L, 1)) + return 2; + else { + lua_pushnil(L); + return 1; + } +} + + +static int luaB_pairs (lua_State *L) { + return pairsmeta(L, "__pairs", 0, luaB_next); +} + + +static int ipairsaux (lua_State *L) { + int i = luaL_checkint(L, 2); + luaL_checktype(L, 1, LUA_TTABLE); + i++; /* next value */ + lua_pushinteger(L, i); + lua_rawgeti(L, 1, i); + return (lua_isnil(L, -1)) ? 1 : 2; +} + + +static int luaB_ipairs (lua_State *L) { + return pairsmeta(L, "__ipairs", 1, ipairsaux); +} + + +static int luaB_assert (lua_State *L) { + if (!lua_toboolean(L, 1)) + return luaL_error(L, "%s", luaL_optstring(L, 2, "assertion failed!")); + return lua_gettop(L); +} + + +static int luaB_select (lua_State *L) { + int n = lua_gettop(L); + if (lua_type(L, 1) == LUA_TSTRING && *lua_tostring(L, 1) == '#') { + lua_pushinteger(L, n-1); + return 1; + } + else { + int i = luaL_checkint(L, 1); + if (i < 0) i = n + i; + else if (i > n) i = n; + luaL_argcheck(L, 1 <= i, 1, "index out of range"); + return n - i; + } +} + +static int luaB_tostring (lua_State *L) { + luaL_checkany(L, 1); + luaL_tolstring(L, 1, NULL); + return 1; +} + +static const luaL_Reg base_funcs[] = { + {"assert", luaB_assert}, + {"collectgarbage", luaB_collectgarbage}, + {"error", luaB_error}, + {"getmetatable", luaB_getmetatable}, + {"ipairs", luaB_ipairs}, +#if defined(LUA_COMPAT_LOADSTRING) + {"loadstring", luaB_load}, +#endif + {"next", luaB_next}, + {"pairs", luaB_pairs}, + {"rawequal", luaB_rawequal}, + {"rawlen", luaB_rawlen}, + {"rawget", luaB_rawget}, + {"rawset", luaB_rawset}, + {"select", luaB_select}, + {"setmetatable", luaB_setmetatable}, + {"tonumber", luaB_tonumber}, + {"tostring", luaB_tostring}, + {"type", luaB_type}, + {NULL, NULL} +}; + + +LUAMOD_API int luaopen_base (lua_State *L) { + /* set global _G */ + lua_pushglobaltable(L); + lua_pushglobaltable(L); + lua_setfield(L, -2, "_G"); + /* open lib into global table */ + luaL_setfuncs(L, base_funcs, 0); + lua_pushliteral(L, LUA_VERSION); + lua_setfield(L, -2, "_VERSION"); /* set global _VERSION */ + return 1; +} + diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lbitlib.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lbitlib.c new file mode 100644 index 000000000000..31c7b66f1290 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lbitlib.c @@ -0,0 +1,212 @@ +/* +** $Id: lbitlib.c,v 1.18.1.2 2013/07/09 18:01:41 roberto Exp $ +** Standard library for bitwise operations +** See Copyright Notice in lua.h +*/ + +#define lbitlib_c +#define LUA_LIB + +#include "lua.h" + +#include "lauxlib.h" +#include "lualib.h" + + +/* number of bits to consider in a number */ +#if !defined(LUA_NBITS) +#define LUA_NBITS 32 +#endif + + +#define ALLONES (~(((~(lua_Unsigned)0) << (LUA_NBITS - 1)) << 1)) + +/* macro to trim extra bits */ +#define trim(x) ((x) & ALLONES) + + +/* builds a number with 'n' ones (1 <= n <= LUA_NBITS) */ +#define mask(n) (~((ALLONES << 1) << ((n) - 1))) + + +typedef lua_Unsigned b_uint; + + + 
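/*
** Editor's annotation, not part of the upstream import: the
** shift-by-one-then-shift-by-(n-1) pattern in ALLONES and mask() keeps
** every shift count strictly below the width of lua_Unsigned.  With
** LUA_NBITS == 32 and a 32-bit lua_Unsigned, the naive forms
** ~((~(lua_Unsigned)0) << 32) and ~(ALLONES << n) would shift by the
** full type width for n == LUA_NBITS, which is undefined behavior in C.
** As defined:
**
**   ALLONES  -- the low LUA_NBITS bits set (0xffffffff for 32 bits)
**   trim(x)  -- x reduced to its low LUA_NBITS bits
**   mask(n)  -- the low n of those bits set, e.g. mask(8) selects
**               0xff when lua_Unsigned is exactly 32 bits wide
**               (valid for 1 <= n <= LUA_NBITS)
*/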
+static b_uint andaux (lua_State *L) { + int i, n = lua_gettop(L); + b_uint r = ~(b_uint)0; + for (i = 1; i <= n; i++) + r &= luaL_checkunsigned(L, i); + return trim(r); +} + + +static int b_and (lua_State *L) { + b_uint r = andaux(L); + lua_pushunsigned(L, r); + return 1; +} + + +static int b_test (lua_State *L) { + b_uint r = andaux(L); + lua_pushboolean(L, r != 0); + return 1; +} + + +static int b_or (lua_State *L) { + int i, n = lua_gettop(L); + b_uint r = 0; + for (i = 1; i <= n; i++) + r |= luaL_checkunsigned(L, i); + lua_pushunsigned(L, trim(r)); + return 1; +} + + +static int b_xor (lua_State *L) { + int i, n = lua_gettop(L); + b_uint r = 0; + for (i = 1; i <= n; i++) + r ^= luaL_checkunsigned(L, i); + lua_pushunsigned(L, trim(r)); + return 1; +} + + +static int b_not (lua_State *L) { + b_uint r = ~luaL_checkunsigned(L, 1); + lua_pushunsigned(L, trim(r)); + return 1; +} + + +static int b_shift (lua_State *L, b_uint r, int i) { + if (i < 0) { /* shift right? */ + i = -i; + r = trim(r); + if (i >= LUA_NBITS) r = 0; + else r >>= i; + } + else { /* shift left */ + if (i >= LUA_NBITS) r = 0; + else r <<= i; + r = trim(r); + } + lua_pushunsigned(L, r); + return 1; +} + + +static int b_lshift (lua_State *L) { + return b_shift(L, luaL_checkunsigned(L, 1), luaL_checkint(L, 2)); +} + + +static int b_rshift (lua_State *L) { + return b_shift(L, luaL_checkunsigned(L, 1), -luaL_checkint(L, 2)); +} + + +static int b_arshift (lua_State *L) { + b_uint r = luaL_checkunsigned(L, 1); + int i = luaL_checkint(L, 2); + if (i < 0 || !(r & ((b_uint)1 << (LUA_NBITS - 1)))) + return b_shift(L, r, -i); + else { /* arithmetic shift for 'negative' number */ + if (i >= LUA_NBITS) r = ALLONES; + else + r = trim((r >> i) | ~(~(b_uint)0 >> i)); /* add signal bit */ + lua_pushunsigned(L, r); + return 1; + } +} + + +static int b_rot (lua_State *L, int i) { + b_uint r = luaL_checkunsigned(L, 1); + i &= (LUA_NBITS - 1); /* i = i % NBITS */ + r = trim(r); + if (i != 0) /* avoid undefined shift of LUA_NBITS when i == 0 */ + r = (r << i) | (r >> (LUA_NBITS - i)); + lua_pushunsigned(L, trim(r)); + return 1; +} + + +static int b_lrot (lua_State *L) { + return b_rot(L, luaL_checkint(L, 2)); +} + + +static int b_rrot (lua_State *L) { + return b_rot(L, -luaL_checkint(L, 2)); +} + + +/* +** get field and width arguments for field-manipulation functions, +** checking whether they are valid. +** ('luaL_error' called without 'return' to avoid later warnings about +** 'width' being used uninitialized.) 
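(editor's annotation, not in the upstream source: fieldargs validates the field position f and width w shared by bit32.extract and bit32.replace; for example, extract(0xabcd, 4, 8) shifts right by f = 4 and masks with mask(8), yielding 0xbc)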
+*/ +static int fieldargs (lua_State *L, int farg, int *width) { + int f = luaL_checkint(L, farg); + int w = luaL_optint(L, farg + 1, 1); + luaL_argcheck(L, 0 <= f, farg, "field cannot be negative"); + luaL_argcheck(L, 0 < w, farg + 1, "width must be positive"); + if (f + w > LUA_NBITS) + luaL_error(L, "trying to access non-existent bits"); + *width = w; + return f; +} + + +static int b_extract (lua_State *L) { + int w; + b_uint r = luaL_checkunsigned(L, 1); + int f = fieldargs(L, 2, &w); + r = (r >> f) & mask(w); + lua_pushunsigned(L, r); + return 1; +} + + +static int b_replace (lua_State *L) { + int w; + b_uint r = luaL_checkunsigned(L, 1); + b_uint v = luaL_checkunsigned(L, 2); + int f = fieldargs(L, 3, &w); + int m = mask(w); + v &= m; /* erase bits outside given width */ + r = (r & ~(m << f)) | (v << f); + lua_pushunsigned(L, r); + return 1; +} + + +static const luaL_Reg bitlib[] = { + {"arshift", b_arshift}, + {"band", b_and}, + {"bnot", b_not}, + {"bor", b_or}, + {"bxor", b_xor}, + {"btest", b_test}, + {"extract", b_extract}, + {"lrotate", b_lrot}, + {"lshift", b_lshift}, + {"replace", b_replace}, + {"rrotate", b_rrot}, + {"rshift", b_rshift}, + {NULL, NULL} +}; + + + +LUAMOD_API int luaopen_bit32 (lua_State *L) { + luaL_newlib(L, bitlib); + return 1; +} + diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcode.c new file mode 100644 index 000000000000..f155014d12c4 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcode.c @@ -0,0 +1,885 @@ +/* +** $Id: lcode.c,v 2.62.1.1 2013/04/12 18:48:47 roberto Exp $ +** Code generator for Lua +** See Copyright Notice in lua.h +*/ + +#include <sys/zfs_context.h> + +#define lcode_c +#define LUA_CORE + +#include "lua.h" + +#include "lcode.h" +#include "ldebug.h" +#include "ldo.h" +#include "lgc.h" +#include "llex.h" +#include "lmem.h" +#include "lobject.h" +#include "lopcodes.h" +#include "lparser.h" +#include "lstring.h" +#include "ltable.h" +#include "lvm.h" + + +#define hasjumps(e) ((e)->t != (e)->f) + + +static int isnumeral(expdesc *e) { + return (e->k == VKNUM && e->t == NO_JUMP && e->f == NO_JUMP); +} + + +void luaK_nil (FuncState *fs, int from, int n) { + Instruction *previous; + int l = from + n - 1; /* last register to set nil */ + if (fs->pc > fs->lasttarget) { /* no jumps to current position? */ + previous = &fs->f->code[fs->pc-1]; + if (GET_OPCODE(*previous) == OP_LOADNIL) { + int pfrom = GETARG_A(*previous); + int pl = pfrom + GETARG_B(*previous); + if ((pfrom <= from && from <= pl + 1) || + (from <= pfrom && pfrom <= l + 1)) { /* can connect both? 
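(editor's annotation, not in the upstream source: this is a peephole merge of consecutive LOADNIL instructions; for example, if the previous instruction was LOADNIL 2 2, setting R2..R4 to nil, and the new request is from = 5, n = 2, then the ranges R2..R4 and R5..R6 connect, so the existing instruction is widened to LOADNIL 2 4 covering R2..R6 instead of emitting a second opcode)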
*/ + if (pfrom < from) from = pfrom; /* from = min(from, pfrom) */ + if (pl > l) l = pl; /* l = max(l, pl) */ + SETARG_A(*previous, from); + SETARG_B(*previous, l - from); + return; + } + } /* else go through */ + } + luaK_codeABC(fs, OP_LOADNIL, from, n - 1, 0); /* else no optimization */ +} + + +int luaK_jump (FuncState *fs) { + int jpc = fs->jpc; /* save list of jumps to here */ + int j; + fs->jpc = NO_JUMP; + j = luaK_codeAsBx(fs, OP_JMP, 0, NO_JUMP); + luaK_concat(fs, &j, jpc); /* keep them on hold */ + return j; +} + + +void luaK_ret (FuncState *fs, int first, int nret) { + luaK_codeABC(fs, OP_RETURN, first, nret+1, 0); +} + + +static int condjump (FuncState *fs, OpCode op, int A, int B, int C) { + luaK_codeABC(fs, op, A, B, C); + return luaK_jump(fs); +} + + +static void fixjump (FuncState *fs, int pc, int dest) { + Instruction *jmp = &fs->f->code[pc]; + int offset = dest-(pc+1); + lua_assert(dest != NO_JUMP); + if (abs(offset) > MAXARG_sBx) + luaX_syntaxerror(fs->ls, "control structure too long"); + SETARG_sBx(*jmp, offset); +} + + +/* +** returns current `pc' and marks it as a jump target (to avoid wrong +** optimizations with consecutive instructions not in the same basic block). +*/ +int luaK_getlabel (FuncState *fs) { + fs->lasttarget = fs->pc; + return fs->pc; +} + + +static int getjump (FuncState *fs, int pc) { + int offset = GETARG_sBx(fs->f->code[pc]); + if (offset == NO_JUMP) /* point to itself represents end of list */ + return NO_JUMP; /* end of list */ + else + return (pc+1)+offset; /* turn offset into absolute position */ +} + + +static Instruction *getjumpcontrol (FuncState *fs, int pc) { + Instruction *pi = &fs->f->code[pc]; + if (pc >= 1 && testTMode(GET_OPCODE(*(pi-1)))) + return pi-1; + else + return pi; +} + + +/* +** check whether list has any jump that do not produce a value +** (or produce an inverted value) +*/ +static int need_value (FuncState *fs, int list) { + for (; list != NO_JUMP; list = getjump(fs, list)) { + Instruction i = *getjumpcontrol(fs, list); + if (GET_OPCODE(i) != OP_TESTSET) return 1; + } + return 0; /* not found */ +} + + +static int patchtestreg (FuncState *fs, int node, int reg) { + Instruction *i = getjumpcontrol(fs, node); + if (GET_OPCODE(*i) != OP_TESTSET) + return 0; /* cannot patch other instructions */ + if (reg != NO_REG && reg != GETARG_B(*i)) + SETARG_A(*i, reg); + else /* no register to put value or register already has the value */ + *i = CREATE_ABC(OP_TEST, GETARG_B(*i), 0, GETARG_C(*i)); + + return 1; +} + + +static void removevalues (FuncState *fs, int list) { + for (; list != NO_JUMP; list = getjump(fs, list)) + patchtestreg(fs, list, NO_REG); +} + + +static void patchlistaux (FuncState *fs, int list, int vtarget, int reg, + int dtarget) { + while (list != NO_JUMP) { + int next = getjump(fs, list); + if (patchtestreg(fs, list, reg)) + fixjump(fs, list, vtarget); + else + fixjump(fs, list, dtarget); /* jump to default target */ + list = next; + } +} + + +static void dischargejpc (FuncState *fs) { + patchlistaux(fs, fs->jpc, fs->pc, NO_REG, fs->pc); + fs->jpc = NO_JUMP; +} + + +void luaK_patchlist (FuncState *fs, int list, int target) { + if (target == fs->pc) + luaK_patchtohere(fs, list); + else { + lua_assert(target < fs->pc); + patchlistaux(fs, list, target, NO_REG, target); + } +} + + +LUAI_FUNC void luaK_patchclose (FuncState *fs, int list, int level) { + level++; /* argument is +1 to reserve 0 as non-op */ + while (list != NO_JUMP) { + int next = getjump(fs, list); + lua_assert(GET_OPCODE(fs->f->code[list]) == OP_JMP 
&& + (GETARG_A(fs->f->code[list]) == 0 || + GETARG_A(fs->f->code[list]) >= level)); + SETARG_A(fs->f->code[list], level); + list = next; + } +} + + +void luaK_patchtohere (FuncState *fs, int list) { + luaK_getlabel(fs); + luaK_concat(fs, &fs->jpc, list); +} + + +void luaK_concat (FuncState *fs, int *l1, int l2) { + if (l2 == NO_JUMP) return; + else if (*l1 == NO_JUMP) + *l1 = l2; + else { + int list = *l1; + int next; + while ((next = getjump(fs, list)) != NO_JUMP) /* find last element */ + list = next; + fixjump(fs, list, l2); + } +} + + +static int luaK_code (FuncState *fs, Instruction i) { + Proto *f = fs->f; + dischargejpc(fs); /* `pc' will change */ + /* put new instruction in code array */ + luaM_growvector(fs->ls->L, f->code, fs->pc, f->sizecode, Instruction, + MAX_INT, "opcodes"); + f->code[fs->pc] = i; + /* save corresponding line information */ + luaM_growvector(fs->ls->L, f->lineinfo, fs->pc, f->sizelineinfo, int, + MAX_INT, "opcodes"); + f->lineinfo[fs->pc] = fs->ls->lastline; + return fs->pc++; +} + + +int luaK_codeABC (FuncState *fs, OpCode o, int a, int b, int c) { + lua_assert(getOpMode(o) == iABC); + lua_assert(getBMode(o) != OpArgN || b == 0); + lua_assert(getCMode(o) != OpArgN || c == 0); + lua_assert(a <= MAXARG_A && b <= MAXARG_B && c <= MAXARG_C); + return luaK_code(fs, CREATE_ABC(o, a, b, c)); +} + + +int luaK_codeABx (FuncState *fs, OpCode o, int a, unsigned int bc) { + lua_assert(getOpMode(o) == iABx || getOpMode(o) == iAsBx); + lua_assert(getCMode(o) == OpArgN); + lua_assert(a <= MAXARG_A && bc <= MAXARG_Bx); + return luaK_code(fs, CREATE_ABx(o, a, bc)); +} + + +static int codeextraarg (FuncState *fs, int a) { + lua_assert(a <= MAXARG_Ax); + return luaK_code(fs, CREATE_Ax(OP_EXTRAARG, a)); +} + + +int luaK_codek (FuncState *fs, int reg, int k) { + if (k <= MAXARG_Bx) + return luaK_codeABx(fs, OP_LOADK, reg, k); + else { + int p = luaK_codeABx(fs, OP_LOADKX, reg, 0); + codeextraarg(fs, k); + return p; + } +} + + +void luaK_checkstack (FuncState *fs, int n) { + int newstack = fs->freereg + n; + if (newstack > fs->f->maxstacksize) { + if (newstack >= MAXSTACK) + luaX_syntaxerror(fs->ls, "function or expression too complex"); + fs->f->maxstacksize = cast_byte(newstack); + } +} + + +void luaK_reserveregs (FuncState *fs, int n) { + luaK_checkstack(fs, n); + fs->freereg += n; +} + + +static void freereg (FuncState *fs, int reg) { + if (!ISK(reg) && reg >= fs->nactvar) { + fs->freereg--; + lua_assert(reg == fs->freereg); + } +} + + +static void freeexp (FuncState *fs, expdesc *e) { + if (e->k == VNONRELOC) + freereg(fs, e->u.info); +} + + +static int addk (FuncState *fs, TValue *key, TValue *v) { + lua_State *L = fs->ls->L; + TValue *idx = luaH_set(L, fs->h, key); + Proto *f = fs->f; + int k, oldsize; + if (ttisnumber(idx)) { + lua_Number n = nvalue(idx); + lua_number2int(k, n); + if (luaV_rawequalobj(&f->k[k], v)) + return k; + /* else may be a collision (e.g., between 0.0 and "\0\0\0\0\0\0\0\0"); + go through and create a new entry for this value */ + } + /* constant not found; create a new entry */ + oldsize = f->sizek; + k = fs->nk; + /* numerical value does not need GC barrier; + table has no metatable, so it does not need to invalidate cache */ + setnvalue(idx, cast_num(k)); + luaM_growvector(L, f->k, k, f->sizek, TValue, MAXARG_Ax, "constants"); + while (oldsize < f->sizek) setnilvalue(&f->k[oldsize++]); + setobj(L, &f->k[k], v); + fs->nk++; + luaC_barrier(L, f, v); + return k; +} + + +int luaK_stringK (FuncState *fs, TString *s) { + TValue o; + 
setsvalue(fs->ls->L, &o, s); + return addk(fs, &o, &o); +} + + +int luaK_numberK (FuncState *fs, lua_Number r) { + int n; + lua_State *L = fs->ls->L; + TValue o; + setnvalue(&o, r); + if (r == 0 || luai_numisnan(NULL, r)) { /* handle -0 and NaN */ + /* use raw representation as key to avoid numeric problems */ + setsvalue(L, L->top++, luaS_newlstr(L, (char *)&r, sizeof(r))); + n = addk(fs, L->top - 1, &o); + L->top--; + } + else + n = addk(fs, &o, &o); /* regular case */ + return n; +} + + +static int boolK (FuncState *fs, int b) { + TValue o; + setbvalue(&o, b); + return addk(fs, &o, &o); +} + + +static int nilK (FuncState *fs) { + TValue k, v; + setnilvalue(&v); + /* cannot use nil as key; instead use table itself to represent nil */ + sethvalue(fs->ls->L, &k, fs->h); + return addk(fs, &k, &v); +} + + +void luaK_setreturns (FuncState *fs, expdesc *e, int nresults) { + if (e->k == VCALL) { /* expression is an open function call? */ + SETARG_C(getcode(fs, e), nresults+1); + } + else if (e->k == VVARARG) { + SETARG_B(getcode(fs, e), nresults+1); + SETARG_A(getcode(fs, e), fs->freereg); + luaK_reserveregs(fs, 1); + } +} + + +void luaK_setoneret (FuncState *fs, expdesc *e) { + if (e->k == VCALL) { /* expression is an open function call? */ + e->k = VNONRELOC; + e->u.info = GETARG_A(getcode(fs, e)); + } + else if (e->k == VVARARG) { + SETARG_B(getcode(fs, e), 2); + e->k = VRELOCABLE; /* can relocate its simple result */ + } +} + + +void luaK_dischargevars (FuncState *fs, expdesc *e) { + switch (e->k) { + case VLOCAL: { + e->k = VNONRELOC; + break; + } + case VUPVAL: { + e->u.info = luaK_codeABC(fs, OP_GETUPVAL, 0, e->u.info, 0); + e->k = VRELOCABLE; + break; + } + case VINDEXED: { + OpCode op = OP_GETTABUP; /* assume 't' is in an upvalue */ + freereg(fs, e->u.ind.idx); + if (e->u.ind.vt == VLOCAL) { /* 't' is in a register? */ + freereg(fs, e->u.ind.t); + op = OP_GETTABLE; + } + e->u.info = luaK_codeABC(fs, op, 0, e->u.ind.t, e->u.ind.idx); + e->k = VRELOCABLE; + break; + } + case VVARARG: + case VCALL: { + luaK_setoneret(fs, e); + break; + } + default: break; /* there is one value available (somewhere) */ + } +} + + +static int code_label (FuncState *fs, int A, int b, int jump) { + luaK_getlabel(fs); /* those instructions may be jump targets */ + return luaK_codeABC(fs, OP_LOADBOOL, A, b, jump); +} + + +static void discharge2reg (FuncState *fs, expdesc *e, int reg) { + luaK_dischargevars(fs, e); + switch (e->k) { + case VNIL: { + luaK_nil(fs, reg, 1); + break; + } + case VFALSE: case VTRUE: { + luaK_codeABC(fs, OP_LOADBOOL, reg, e->k == VTRUE, 0); + break; + } + case VK: { + luaK_codek(fs, reg, e->u.info); + break; + } + case VKNUM: { + luaK_codek(fs, reg, luaK_numberK(fs, e->u.nval)); + break; + } + case VRELOCABLE: { + Instruction *pc = &getcode(fs, e); + SETARG_A(*pc, reg); + break; + } + case VNONRELOC: { + if (reg != e->u.info) + luaK_codeABC(fs, OP_MOVE, reg, e->u.info, 0); + break; + } + default: { + lua_assert(e->k == VVOID || e->k == VJMP); + return; /* nothing to do... 
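(editor's annotation, not in the upstream source: VVOID denotes an expression with no value at all, and VJMP, a pending test, is materialized separately by exp2reg through LOADBOOL instructions, so neither has a concrete value that discharge2reg could move into 'reg' here)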
*/ + } + } + e->u.info = reg; + e->k = VNONRELOC; +} + + +static void discharge2anyreg (FuncState *fs, expdesc *e) { + if (e->k != VNONRELOC) { + luaK_reserveregs(fs, 1); + discharge2reg(fs, e, fs->freereg-1); + } +} + + +static void exp2reg (FuncState *fs, expdesc *e, int reg) { + discharge2reg(fs, e, reg); + if (e->k == VJMP) + luaK_concat(fs, &e->t, e->u.info); /* put this jump in `t' list */ + if (hasjumps(e)) { + int final; /* position after whole expression */ + int p_f = NO_JUMP; /* position of an eventual LOAD false */ + int p_t = NO_JUMP; /* position of an eventual LOAD true */ + if (need_value(fs, e->t) || need_value(fs, e->f)) { + int fj = (e->k == VJMP) ? NO_JUMP : luaK_jump(fs); + p_f = code_label(fs, reg, 0, 1); + p_t = code_label(fs, reg, 1, 0); + luaK_patchtohere(fs, fj); + } + final = luaK_getlabel(fs); + patchlistaux(fs, e->f, final, reg, p_f); + patchlistaux(fs, e->t, final, reg, p_t); + } + e->f = e->t = NO_JUMP; + e->u.info = reg; + e->k = VNONRELOC; +} + + +void luaK_exp2nextreg (FuncState *fs, expdesc *e) { + luaK_dischargevars(fs, e); + freeexp(fs, e); + luaK_reserveregs(fs, 1); + exp2reg(fs, e, fs->freereg - 1); +} + + +int luaK_exp2anyreg (FuncState *fs, expdesc *e) { + luaK_dischargevars(fs, e); + if (e->k == VNONRELOC) { + if (!hasjumps(e)) return e->u.info; /* exp is already in a register */ + if (e->u.info >= fs->nactvar) { /* reg. is not a local? */ + exp2reg(fs, e, e->u.info); /* put value on it */ + return e->u.info; + } + } + luaK_exp2nextreg(fs, e); /* default */ + return e->u.info; +} + + +void luaK_exp2anyregup (FuncState *fs, expdesc *e) { + if (e->k != VUPVAL || hasjumps(e)) + luaK_exp2anyreg(fs, e); +} + + +void luaK_exp2val (FuncState *fs, expdesc *e) { + if (hasjumps(e)) + luaK_exp2anyreg(fs, e); + else + luaK_dischargevars(fs, e); +} + + +int luaK_exp2RK (FuncState *fs, expdesc *e) { + luaK_exp2val(fs, e); + switch (e->k) { + case VTRUE: + case VFALSE: + case VNIL: { + if (fs->nk <= MAXINDEXRK) { /* constant fits in RK operand? */ + e->u.info = (e->k == VNIL) ? nilK(fs) : boolK(fs, (e->k == VTRUE)); + e->k = VK; + return RKASK(e->u.info); + } + else break; + } + case VKNUM: { + e->u.info = luaK_numberK(fs, e->u.nval); + e->k = VK; + /* go through */ + } + case VK: { + if (e->u.info <= MAXINDEXRK) /* constant fits in argC? */ + return RKASK(e->u.info); + else break; + } + default: break; + } + /* not a constant in the right range: put it in a register */ + return luaK_exp2anyreg(fs, e); +} + + +void luaK_storevar (FuncState *fs, expdesc *var, expdesc *ex) { + switch (var->k) { + case VLOCAL: { + freeexp(fs, ex); + exp2reg(fs, ex, var->u.info); + return; + } + case VUPVAL: { + int e = luaK_exp2anyreg(fs, ex); + luaK_codeABC(fs, OP_SETUPVAL, e, var->u.info, 0); + break; + } + case VINDEXED: { + OpCode op = (var->u.ind.vt == VLOCAL) ? 
OP_SETTABLE : OP_SETTABUP; + int e = luaK_exp2RK(fs, ex); + luaK_codeABC(fs, op, var->u.ind.t, var->u.ind.idx, e); + break; + } + default: { + lua_assert(0); /* invalid var kind to store */ + break; + } + } + freeexp(fs, ex); +} + + +void luaK_self (FuncState *fs, expdesc *e, expdesc *key) { + int ereg; + luaK_exp2anyreg(fs, e); + ereg = e->u.info; /* register where 'e' was placed */ + freeexp(fs, e); + e->u.info = fs->freereg; /* base register for op_self */ + e->k = VNONRELOC; + luaK_reserveregs(fs, 2); /* function and 'self' produced by op_self */ + luaK_codeABC(fs, OP_SELF, e->u.info, ereg, luaK_exp2RK(fs, key)); + freeexp(fs, key); +} + + +static void invertjump (FuncState *fs, expdesc *e) { + Instruction *pc = getjumpcontrol(fs, e->u.info); + lua_assert(testTMode(GET_OPCODE(*pc)) && GET_OPCODE(*pc) != OP_TESTSET && + GET_OPCODE(*pc) != OP_TEST); + SETARG_A(*pc, !(GETARG_A(*pc))); +} + + +static int jumponcond (FuncState *fs, expdesc *e, int cond) { + if (e->k == VRELOCABLE) { + Instruction ie = getcode(fs, e); + if (GET_OPCODE(ie) == OP_NOT) { + fs->pc--; /* remove previous OP_NOT */ + return condjump(fs, OP_TEST, GETARG_B(ie), 0, !cond); + } + /* else go through */ + } + discharge2anyreg(fs, e); + freeexp(fs, e); + return condjump(fs, OP_TESTSET, NO_REG, e->u.info, cond); +} + + +void luaK_goiftrue (FuncState *fs, expdesc *e) { + int pc; /* pc of last jump */ + luaK_dischargevars(fs, e); + switch (e->k) { + case VJMP: { + invertjump(fs, e); + pc = e->u.info; + break; + } + case VK: case VKNUM: case VTRUE: { + pc = NO_JUMP; /* always true; do nothing */ + break; + } + default: { + pc = jumponcond(fs, e, 0); + break; + } + } + luaK_concat(fs, &e->f, pc); /* insert last jump in `f' list */ + luaK_patchtohere(fs, e->t); + e->t = NO_JUMP; +} + + +void luaK_goiffalse (FuncState *fs, expdesc *e) { + int pc; /* pc of last jump */ + luaK_dischargevars(fs, e); + switch (e->k) { + case VJMP: { + pc = e->u.info; + break; + } + case VNIL: case VFALSE: { + pc = NO_JUMP; /* always false; do nothing */ + break; + } + default: { + pc = jumponcond(fs, e, 1); + break; + } + } + luaK_concat(fs, &e->t, pc); /* insert last jump in `t' list */ + luaK_patchtohere(fs, e->f); + e->f = NO_JUMP; +} + + +static void codenot (FuncState *fs, expdesc *e) { + luaK_dischargevars(fs, e); + switch (e->k) { + case VNIL: case VFALSE: { + e->k = VTRUE; + break; + } + case VK: case VKNUM: case VTRUE: { + e->k = VFALSE; + break; + } + case VJMP: { + invertjump(fs, e); + break; + } + case VRELOCABLE: + case VNONRELOC: { + discharge2anyreg(fs, e); + freeexp(fs, e); + e->u.info = luaK_codeABC(fs, OP_NOT, 0, e->u.info, 0); + e->k = VRELOCABLE; + break; + } + default: { + lua_assert(0); /* cannot happen */ + break; + } + } + /* interchange true and false lists */ + { int temp = e->f; e->f = e->t; e->t = temp; } + removevalues(fs, e->f); + removevalues(fs, e->t); +} + + +void luaK_indexed (FuncState *fs, expdesc *t, expdesc *k) { + lua_assert(!hasjumps(t)); + t->u.ind.t = t->u.info; + t->u.ind.idx = luaK_exp2RK(fs, k); + t->u.ind.vt = (t->k == VUPVAL) ? 
VUPVAL + : check_exp(vkisinreg(t->k), VLOCAL); + t->k = VINDEXED; +} + + +static int constfolding (OpCode op, expdesc *e1, expdesc *e2) { + lua_Number r; + if (!isnumeral(e1) || !isnumeral(e2)) return 0; + if ((op == OP_DIV || op == OP_MOD) && e2->u.nval == 0) + return 0; /* do not attempt to divide by 0 */ + /* + * Patched: check for MIN_INT / -1 + */ + if (op == OP_DIV && e1->u.nval == INT64_MIN && e2->u.nval == -1) + return 0; + r = luaO_arith(op - OP_ADD + LUA_OPADD, e1->u.nval, e2->u.nval); + e1->u.nval = r; + return 1; +} + + +static void codearith (FuncState *fs, OpCode op, + expdesc *e1, expdesc *e2, int line) { + if (constfolding(op, e1, e2)) + return; + else { + int o2 = (op != OP_UNM && op != OP_LEN) ? luaK_exp2RK(fs, e2) : 0; + int o1 = luaK_exp2RK(fs, e1); + if (o1 > o2) { + freeexp(fs, e1); + freeexp(fs, e2); + } + else { + freeexp(fs, e2); + freeexp(fs, e1); + } + e1->u.info = luaK_codeABC(fs, op, 0, o1, o2); + e1->k = VRELOCABLE; + luaK_fixline(fs, line); + } +} + + +static void codecomp (FuncState *fs, OpCode op, int cond, expdesc *e1, + expdesc *e2) { + int o1 = luaK_exp2RK(fs, e1); + int o2 = luaK_exp2RK(fs, e2); + freeexp(fs, e2); + freeexp(fs, e1); + if (cond == 0 && op != OP_EQ) { + int temp; /* exchange args to replace by `<' or `<=' */ + temp = o1; o1 = o2; o2 = temp; /* o1 <==> o2 */ + cond = 1; + } + e1->u.info = condjump(fs, op, cond, o1, o2); + e1->k = VJMP; +} + + +void luaK_prefix (FuncState *fs, UnOpr op, expdesc *e, int line) { + expdesc e2; + e2.t = e2.f = NO_JUMP; e2.k = VKNUM; e2.u.nval = 0; + switch (op) { + case OPR_MINUS: { + if (isnumeral(e)) /* minus constant? */ + e->u.nval = luai_numunm(NULL, e->u.nval); /* fold it */ + else { + luaK_exp2anyreg(fs, e); + codearith(fs, OP_UNM, e, &e2, line); + } + break; + } + case OPR_NOT: codenot(fs, e); break; + case OPR_LEN: { + luaK_exp2anyreg(fs, e); /* cannot operate on constants */ + codearith(fs, OP_LEN, e, &e2, line); + break; + } + default: lua_assert(0); + } +} + + +void luaK_infix (FuncState *fs, BinOpr op, expdesc *v) { + switch (op) { + case OPR_AND: { + luaK_goiftrue(fs, v); + break; + } + case OPR_OR: { + luaK_goiffalse(fs, v); + break; + } + case OPR_CONCAT: { + luaK_exp2nextreg(fs, v); /* operand must be on the `stack' */ + break; + } + case OPR_ADD: case OPR_SUB: case OPR_MUL: case OPR_DIV: + case OPR_MOD: case OPR_POW: { + if (!isnumeral(v)) luaK_exp2RK(fs, v); + break; + } + default: { + luaK_exp2RK(fs, v); + break; + } + } +} + + +void luaK_posfix (FuncState *fs, BinOpr op, + expdesc *e1, expdesc *e2, int line) { + switch (op) { + case OPR_AND: { + lua_assert(e1->t == NO_JUMP); /* list must be closed */ + luaK_dischargevars(fs, e2); + luaK_concat(fs, &e2->f, e1->f); + *e1 = *e2; + break; + } + case OPR_OR: { + lua_assert(e1->f == NO_JUMP); /* list must be closed */ + luaK_dischargevars(fs, e2); + luaK_concat(fs, &e2->t, e1->t); + *e1 = *e2; + break; + } + case OPR_CONCAT: { + luaK_exp2val(fs, e2); + if (e2->k == VRELOCABLE && GET_OPCODE(getcode(fs, e2)) == OP_CONCAT) { + lua_assert(e1->u.info == GETARG_B(getcode(fs, e2))-1); + freeexp(fs, e1); + SETARG_B(getcode(fs, e2), e1->u.info); + e1->k = VRELOCABLE; e1->u.info = e2->u.info; + } + else { + luaK_exp2nextreg(fs, e2); /* operand must be on the 'stack' */ + codearith(fs, OP_CONCAT, e1, e2, line); + } + break; + } + case OPR_ADD: case OPR_SUB: case OPR_MUL: case OPR_DIV: + case OPR_MOD: case OPR_POW: { + codearith(fs, cast(OpCode, op - OPR_ADD + OP_ADD), e1, e2, line); + break; + } + case OPR_EQ: case OPR_LT: case OPR_LE: { + codecomp(fs, 
cast(OpCode, op - OPR_EQ + OP_EQ), 1, e1, e2); + break; + } + case OPR_NE: case OPR_GT: case OPR_GE: { + codecomp(fs, cast(OpCode, op - OPR_NE + OP_EQ), 0, e1, e2); + break; + } + default: lua_assert(0); + } +} + + +void luaK_fixline (FuncState *fs, int line) { + fs->f->lineinfo[fs->pc - 1] = line; +} + + +void luaK_setlist (FuncState *fs, int base, int nelems, int tostore) { + int c = (nelems - 1)/LFIELDS_PER_FLUSH + 1; + int b = (tostore == LUA_MULTRET) ? 0 : tostore; + lua_assert(tostore != 0); + if (c <= MAXARG_C) + luaK_codeABC(fs, OP_SETLIST, base, b, c); + else if (c <= MAXARG_Ax) { + luaK_codeABC(fs, OP_SETLIST, base, b, 0); + codeextraarg(fs, c); + } + else + luaX_syntaxerror(fs->ls, "constructor too long"); + fs->freereg = base + 1; /* free registers with list values */ +} + diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcode.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcode.h new file mode 100644 index 000000000000..6a1424cf5a73 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcode.h @@ -0,0 +1,83 @@ +/* +** $Id: lcode.h,v 1.58.1.1 2013/04/12 18:48:47 roberto Exp $ +** Code generator for Lua +** See Copyright Notice in lua.h +*/ + +#ifndef lcode_h +#define lcode_h + +#include "llex.h" +#include "lobject.h" +#include "lopcodes.h" +#include "lparser.h" + + +/* +** Marks the end of a patch list. It is an invalid value both as an absolute +** address, and as a list link (would link an element to itself). +*/ +#define NO_JUMP (-1) + + +/* +** grep "ORDER OPR" if you change these enums (ORDER OP) +*/ +typedef enum BinOpr { + OPR_ADD, OPR_SUB, OPR_MUL, OPR_DIV, OPR_MOD, OPR_POW, + OPR_CONCAT, + OPR_EQ, OPR_LT, OPR_LE, + OPR_NE, OPR_GT, OPR_GE, + OPR_AND, OPR_OR, + OPR_NOBINOPR +} BinOpr; + + +typedef enum UnOpr { OPR_MINUS, OPR_NOT, OPR_LEN, OPR_NOUNOPR } UnOpr; + + +#define getcode(fs,e) ((fs)->f->code[(e)->u.info]) + +#define luaK_codeAsBx(fs,o,A,sBx) luaK_codeABx(fs,o,A,(sBx)+MAXARG_sBx) + +#define luaK_setmultret(fs,e) luaK_setreturns(fs, e, LUA_MULTRET) + +#define luaK_jumpto(fs,t) luaK_patchlist(fs, luaK_jump(fs), t) + +LUAI_FUNC int luaK_codeABx (FuncState *fs, OpCode o, int A, unsigned int Bx); +LUAI_FUNC int luaK_codeABC (FuncState *fs, OpCode o, int A, int B, int C); +LUAI_FUNC int luaK_codek (FuncState *fs, int reg, int k); +LUAI_FUNC void luaK_fixline (FuncState *fs, int line); +LUAI_FUNC void luaK_nil (FuncState *fs, int from, int n); +LUAI_FUNC void luaK_reserveregs (FuncState *fs, int n); +LUAI_FUNC void luaK_checkstack (FuncState *fs, int n); +LUAI_FUNC int luaK_stringK (FuncState *fs, TString *s); +LUAI_FUNC int luaK_numberK (FuncState *fs, lua_Number r); +LUAI_FUNC void luaK_dischargevars (FuncState *fs, expdesc *e); +LUAI_FUNC int luaK_exp2anyreg (FuncState *fs, expdesc *e); +LUAI_FUNC void luaK_exp2anyregup (FuncState *fs, expdesc *e); +LUAI_FUNC void luaK_exp2nextreg (FuncState *fs, expdesc *e); +LUAI_FUNC void luaK_exp2val (FuncState *fs, expdesc *e); +LUAI_FUNC int luaK_exp2RK (FuncState *fs, expdesc *e); +LUAI_FUNC void luaK_self (FuncState *fs, expdesc *e, expdesc *key); +LUAI_FUNC void luaK_indexed (FuncState *fs, expdesc *t, expdesc *k); +LUAI_FUNC void luaK_goiftrue (FuncState *fs, expdesc *e); +LUAI_FUNC void luaK_goiffalse (FuncState *fs, expdesc *e); +LUAI_FUNC void luaK_storevar (FuncState *fs, expdesc *var, expdesc *e); +LUAI_FUNC void luaK_setreturns (FuncState *fs, expdesc *e, int nresults); +LUAI_FUNC void luaK_setoneret (FuncState *fs, expdesc *e); +LUAI_FUNC int luaK_jump (FuncState *fs); 
+LUAI_FUNC void luaK_ret (FuncState *fs, int first, int nret); +LUAI_FUNC void luaK_patchlist (FuncState *fs, int list, int target); +LUAI_FUNC void luaK_patchtohere (FuncState *fs, int list); +LUAI_FUNC void luaK_patchclose (FuncState *fs, int list, int level); +LUAI_FUNC void luaK_concat (FuncState *fs, int *l1, int l2); +LUAI_FUNC int luaK_getlabel (FuncState *fs); +LUAI_FUNC void luaK_prefix (FuncState *fs, UnOpr op, expdesc *v, int line); +LUAI_FUNC void luaK_infix (FuncState *fs, BinOpr op, expdesc *v); +LUAI_FUNC void luaK_posfix (FuncState *fs, BinOpr op, expdesc *v1, + expdesc *v2, int line); +LUAI_FUNC void luaK_setlist (FuncState *fs, int base, int nelems, int tostore); + + +#endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcompat.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcompat.c new file mode 100644 index 000000000000..55564ddbd9fd --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcompat.c @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2016 by Delphix. All rights reserved. + */ + +#include "lua.h" + +#include <sys/zfs_context.h> + +ssize_t +lcompat_sprintf(char *buf, const char *fmt, ...) +{ + ssize_t res; + va_list args; + + va_start(args, fmt); + res = vsnprintf(buf, INT_MAX, fmt, args); + va_end(args); + + return (res); +} + +int64_t +lcompat_strtoll(const char *str, char **ptr) +{ + int base; + const char *cp; + int digits; + int64_t value; + boolean_t is_negative; + + cp = str; + while (*cp == ' ' || *cp == '\t' || *cp == '\n') { + cp++; + } + is_negative = (*cp == '-'); + if (is_negative) { + cp++; + } + base = 10; + + if (*cp == '0') { + base = 8; + cp++; + if (*cp == 'x' || *cp == 'X') { + base = 16; + cp++; + } + } + + value = 0; + for (; *cp != '\0'; cp++) { + if (*cp >= '0' && *cp <= '9') { + digits = *cp - '0'; + } else if (*cp >= 'a' && *cp <= 'f') { + digits = *cp - 'a' + 10; + } else if (*cp >= 'A' && *cp <= 'F') { + digits = *cp - 'A' + 10; + } else { + break; + } + if (digits >= base) { + break; + } + value = (value * base) + digits; + } + + if (ptr != NULL) { + *ptr = (char *)cp; + } + if (is_negative) { + value = -value; + } + return (value); +} + +int64_t +lcompat_pow(int64_t x, int64_t y) +{ + int64_t result = 1; + if (y < 0) + return (0); + + while (y) { + if (y & 1) + result *= x; + y >>= 1; + x *= x; + } + return (result); +} + +int +lcompat_hashnum(int64_t x) +{ + x = (~x) + (x << 18); + x = x ^ (x >> 31); + x = x * 21; + x = x ^ (x >> 11); + x = x + (x << 6); + x = x ^ (x >> 22); + return ((int)x); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcorolib.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcorolib.c new file mode 100644 index 000000000000..405350bb145b --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcorolib.c @@ -0,0 +1,154 @@ +/* +** $Id: lcorolib.c,v 1.5.1.1 2013/04/12 18:48:47 roberto Exp $ +** Coroutine Library +** See Copyright Notice in lua.h +*/ + + +#include <sys/zfs_context.h> + +#define lcorolib_c +#define LUA_LIB + +#include "lua.h" + +#include "lauxlib.h" +#include "lualib.h" + + +static int auxresume (lua_State *L, lua_State *co, int narg) { + int status; + if (!lua_checkstack(co, narg)) { + lua_pushliteral(L, "too many arguments to resume"); + return -1; /* error flag */ + } + if (lua_status(co) == LUA_OK && lua_gettop(co) == 0) { + lua_pushliteral(L, "cannot resume dead coroutine"); + return -1; /* error flag */ + } + lua_xmove(L, co, narg); + status = lua_resume(co, L, narg); + if (status == LUA_OK || status == LUA_YIELD) { 
+ int nres = lua_gettop(co); + if (!lua_checkstack(L, nres + 1)) { + lua_pop(co, nres); /* remove results anyway */ + lua_pushliteral(L, "too many results to resume"); + return -1; /* error flag */ + } + lua_xmove(co, L, nres); /* move yielded values */ + return nres; + } + else { + lua_xmove(co, L, 1); /* move error message */ + return -1; /* error flag */ + } +} + + +static int luaB_coresume (lua_State *L) { + lua_State *co = lua_tothread(L, 1); + int r; + luaL_argcheck(L, co, 1, "coroutine expected"); + r = auxresume(L, co, lua_gettop(L) - 1); + if (r < 0) { + lua_pushboolean(L, 0); + lua_insert(L, -2); + return 2; /* return false + error message */ + } + else { + lua_pushboolean(L, 1); + lua_insert(L, -(r + 1)); + return r + 1; /* return true + `resume' returns */ + } +} + + +static int luaB_auxwrap (lua_State *L) { + lua_State *co = lua_tothread(L, lua_upvalueindex(1)); + int r = auxresume(L, co, lua_gettop(L)); + if (r < 0) { + if (lua_isstring(L, -1)) { /* error object is a string? */ + luaL_where(L, 1); /* add extra info */ + lua_insert(L, -2); + lua_concat(L, 2); + } + return lua_error(L); /* propagate error */ + } + return r; +} + + +static int luaB_cocreate (lua_State *L) { + lua_State *NL; + luaL_checktype(L, 1, LUA_TFUNCTION); + NL = lua_newthread(L); + lua_pushvalue(L, 1); /* move function to top */ + lua_xmove(L, NL, 1); /* move function from L to NL */ + return 1; +} + + +static int luaB_cowrap (lua_State *L) { + luaB_cocreate(L); + lua_pushcclosure(L, luaB_auxwrap, 1); + return 1; +} + + +static int luaB_yield (lua_State *L) { + return lua_yield(L, lua_gettop(L)); +} + + +static int luaB_costatus (lua_State *L) { + lua_State *co = lua_tothread(L, 1); + luaL_argcheck(L, co, 1, "coroutine expected"); + if (L == co) lua_pushliteral(L, "running"); + else { + switch (lua_status(co)) { + case LUA_YIELD: + lua_pushliteral(L, "suspended"); + break; + case LUA_OK: { + lua_Debug ar; + if (lua_getstack(co, 0, &ar) > 0) /* does it have frames? */ + lua_pushliteral(L, "normal"); /* it is running */ + else if (lua_gettop(co) == 0) + lua_pushliteral(L, "dead"); + else + lua_pushliteral(L, "suspended"); /* initial state */ + break; + } + default: /* some error occurred */ + lua_pushliteral(L, "dead"); + break; + } + } + return 1; +} + + +static int luaB_corunning (lua_State *L) { + int ismain = lua_pushthread(L); + lua_pushboolean(L, ismain); + return 2; +} + + +static const luaL_Reg co_funcs[] = { + {"create", luaB_cocreate}, + {"resume", luaB_coresume}, + {"running", luaB_corunning}, + {"status", luaB_costatus}, + {"wrap", luaB_cowrap}, + {"yield", luaB_yield}, + {NULL, NULL} +}; + + + +LUAMOD_API int luaopen_coroutine (lua_State *L) { + luaL_newlib(L, co_funcs); + return 1; +} + diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lctype.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lctype.c new file mode 100644 index 000000000000..107859811bfc --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lctype.c @@ -0,0 +1,52 @@ +/* +** $Id: lctype.c,v 1.11.1.1 2013/04/12 18:48:47 roberto Exp $ +** 'ctype' functions for Lua +** See Copyright Notice in lua.h +*/ + +#define lctype_c +#define LUA_CORE + +#include "lctype.h" + +#if !LUA_USE_CTYPE /* { */ + +#include <sys/zfs_context.h> + +LUAI_DDEF const lu_byte luai_ctype_[UCHAR_MAX + 2] = { + 0x00, /* EOZ */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0. */ + 0x00, 0x08, 0x08, 0x08, 0x08, 0x08, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 1. 
*/ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x0c, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, /* 2. */ + 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, + 0x16, 0x16, 0x16, 0x16, 0x16, 0x16, 0x16, 0x16, /* 3. */ + 0x16, 0x16, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, + 0x04, 0x15, 0x15, 0x15, 0x15, 0x15, 0x15, 0x05, /* 4. */ + 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, + 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, /* 5. */ + 0x05, 0x05, 0x05, 0x04, 0x04, 0x04, 0x04, 0x05, + 0x04, 0x15, 0x15, 0x15, 0x15, 0x15, 0x15, 0x05, /* 6. */ + 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, + 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, /* 7. */ + 0x05, 0x05, 0x05, 0x04, 0x04, 0x04, 0x04, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 8. */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 9. */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* a. */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* b. */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* c. */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* d. */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* e. */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* f. */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +#endif /* } */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lctype.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lctype.h new file mode 100644 index 000000000000..299a59b92e2c --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lctype.h @@ -0,0 +1,93 @@ +/* +** $Id: lctype.h,v 1.12.1.1 2013/04/12 18:48:47 roberto Exp $ +** 'ctype' functions for Lua +** See Copyright Notice in lua.h +*/ + +#ifndef lctype_h +#define lctype_h + +#include "lua.h" + + +/* +** WARNING: the functions defined here do not necessarily correspond +** to the similar functions in the standard C ctype.h. 
They are +** optimized for the specific needs of Lua +*/ + +#if !defined(LUA_USE_CTYPE) + +#if 'A' == 65 && '0' == 48 +/* ASCII case: can use its own tables; faster and fixed */ +#define LUA_USE_CTYPE 0 +#else +/* must use standard C ctype */ +#define LUA_USE_CTYPE 1 +#endif + +#endif + + +#if !LUA_USE_CTYPE /* { */ + +#include "llimits.h" + + +#define ALPHABIT 0 +#define DIGITBIT 1 +#define PRINTBIT 2 +#define SPACEBIT 3 +#define XDIGITBIT 4 + + +#define MASK(B) (1 << (B)) + + +/* +** add 1 to char to allow index -1 (EOZ) +*/ +#define testprop(c,p) (luai_ctype_[(c)+1] & (p)) + +/* +** 'lalpha' (Lua alphabetic) and 'lalnum' (Lua alphanumeric) both include '_' +*/ +#define lislalpha(c) testprop(c, MASK(ALPHABIT)) +#define lislalnum(c) testprop(c, (MASK(ALPHABIT) | MASK(DIGITBIT))) +#define lisdigit(c) testprop(c, MASK(DIGITBIT)) +#define lisspace(c) testprop(c, MASK(SPACEBIT)) +#define lisprint(c) testprop(c, MASK(PRINTBIT)) +#define lisxdigit(c) testprop(c, MASK(XDIGITBIT)) + +/* +** this 'ltolower' only works for alphabetic characters +*/ +#define ltolower(c) ((c) | ('A' ^ 'a')) + + +/* two more entries for 0 and -1 (EOZ) */ +LUAI_DDEC const lu_byte luai_ctype_[UCHAR_MAX + 2]; + + +#else /* }{ */ + +/* +** use standard C ctypes +*/ + +#include <ctype.h> + + +#define lislalpha(c) (isalpha(c) || (c) == '_') +#define lislalnum(c) (isalnum(c) || (c) == '_') +#define lisdigit(c) (isdigit(c)) +#define lisspace(c) (isspace(c)) +#define lisprint(c) (isprint(c)) +#define lisxdigit(c) (isxdigit(c)) + +#define ltolower(c) (tolower(c)) + +#endif /* } */ + +#endif + diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldebug.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldebug.c new file mode 100644 index 000000000000..b8ddcff3c6bb --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldebug.c @@ -0,0 +1,607 @@ +/* +** $Id: ldebug.c,v 2.90.1.4 2015/02/19 17:05:13 roberto Exp $ +** Debug Interface +** See Copyright Notice in lua.h +*/ + + +#include <sys/zfs_context.h> + +#define ldebug_c +#define LUA_CORE + +#include "lua.h" + +#include "lapi.h" +#include "lcode.h" +#include "ldebug.h" +#include "ldo.h" +#include "lfunc.h" +#include "lobject.h" +#include "lopcodes.h" +#include "lstate.h" +#include "lstring.h" +#include "ltable.h" +#include "ltm.h" +#include "lvm.h" + + + +#define noLuaClosure(f) ((f) == NULL || (f)->c.tt == LUA_TCCL) + + +static const char *getfuncname (lua_State *L, CallInfo *ci, const char **name); + + +static int currentpc (CallInfo *ci) { + lua_assert(isLua(ci)); + return pcRel(ci->u.l.savedpc, ci_func(ci)->p); +} + + +static int currentline (CallInfo *ci) { + return getfuncline(ci_func(ci)->p, currentpc(ci)); +} + + +static void swapextra (lua_State *L) { + if (L->status == LUA_YIELD) { + CallInfo *ci = L->ci; /* get function that yielded */ + StkId temp = ci->func; /* exchange its 'func' and 'extra' values */ + ci->func = restorestack(L, ci->extra); + ci->extra = savestack(L, temp); + } +} + + +/* +** this function can be called asynchronous (e.g. during a signal) +*/ +LUA_API int lua_sethook (lua_State *L, lua_Hook func, int mask, int count) { + if (func == NULL || mask == 0) { /* turn off hooks? 
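(editor's annotation, not in the upstream source: a NULL hook function and an empty mask each disable hooking, and the two fields are normalized to the off state together so the interpreter's fast hookmask checks never observe a mask without a function)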
*/ + mask = 0; + func = NULL; + } + if (isLua(L->ci)) + L->oldpc = L->ci->u.l.savedpc; + L->hook = func; + L->basehookcount = count; + resethookcount(L); + L->hookmask = cast_byte(mask); + return 1; +} + + +LUA_API lua_Hook lua_gethook (lua_State *L) { + return L->hook; +} + + +LUA_API int lua_gethookmask (lua_State *L) { + return L->hookmask; +} + + +LUA_API int lua_gethookcount (lua_State *L) { + return L->basehookcount; +} + + +LUA_API int lua_getstack (lua_State *L, int level, lua_Debug *ar) { + int status; + CallInfo *ci; + if (level < 0) return 0; /* invalid (negative) level */ + lua_lock(L); + for (ci = L->ci; level > 0 && ci != &L->base_ci; ci = ci->previous) + level--; + if (level == 0 && ci != &L->base_ci) { /* level found? */ + status = 1; + ar->i_ci = ci; + } + else status = 0; /* no such level */ + lua_unlock(L); + return status; +} + + +static const char *upvalname (Proto *p, int uv) { + TString *s = check_exp(uv < p->sizeupvalues, p->upvalues[uv].name); + if (s == NULL) return "?"; + else return getstr(s); +} + + +static const char *findvararg (CallInfo *ci, int n, StkId *pos) { + int nparams = clLvalue(ci->func)->p->numparams; + if (n >= ci->u.l.base - ci->func - nparams) + return NULL; /* no such vararg */ + else { + *pos = ci->func + nparams + n; + return "(*vararg)"; /* generic name for any vararg */ + } +} + + +static const char *findlocal (lua_State *L, CallInfo *ci, int n, + StkId *pos) { + const char *name = NULL; + StkId base; + if (isLua(ci)) { + if (n < 0) /* access to vararg values? */ + return findvararg(ci, -n, pos); + else { + base = ci->u.l.base; + name = luaF_getlocalname(ci_func(ci)->p, n, currentpc(ci)); + } + } + else + base = ci->func + 1; + if (name == NULL) { /* no 'standard' name? */ + StkId limit = (ci == L->ci) ? L->top : ci->next->func; + if (limit - base >= n && n > 0) /* is 'n' inside 'ci' stack? */ + name = "(*temporary)"; /* generic name for any valid slot */ + else + return NULL; /* no name */ + } + *pos = base + (n - 1); + return name; +} + + +LUA_API const char *lua_getlocal (lua_State *L, const lua_Debug *ar, int n) { + const char *name; + lua_lock(L); + swapextra(L); + if (ar == NULL) { /* information about non-active function? */ + if (!isLfunction(L->top - 1)) /* not a Lua function? */ + name = NULL; + else /* consider live variables at function start (parameters) */ + name = luaF_getlocalname(clLvalue(L->top - 1)->p, n, 0); + } + else { /* active function; get information through 'ar' */ + StkId pos = 0; /* to avoid warnings */ + name = findlocal(L, ar->i_ci, n, &pos); + if (name) { + setobj2s(L, L->top, pos); + api_incr_top(L); + } + } + swapextra(L); + lua_unlock(L); + return name; +} + + +LUA_API const char *lua_setlocal (lua_State *L, const lua_Debug *ar, int n) { + StkId pos = 0; /* to avoid warnings */ + const char *name; + lua_lock(L); + swapextra(L); + name = findlocal(L, ar->i_ci, n, &pos); + if (name) + setobjs2s(L, pos, L->top - 1); + L->top--; /* pop value */ + swapextra(L); + lua_unlock(L); + return name; +} + + +static void funcinfo (lua_Debug *ar, Closure *cl) { + if (noLuaClosure(cl)) { + ar->source = "=[C]"; + ar->linedefined = -1; + ar->lastlinedefined = -1; + ar->what = "C"; + } + else { + Proto *p = cl->l.p; + ar->source = p->source ? getstr(p->source) : "=?"; + ar->linedefined = p->linedefined; + ar->lastlinedefined = p->lastlinedefined; + ar->what = (ar->linedefined == 0) ? 
"main" : "Lua"; + } + luaO_chunkid(ar->short_src, ar->source, LUA_IDSIZE); +} + + +static void collectvalidlines (lua_State *L, Closure *f) { + if (noLuaClosure(f)) { + setnilvalue(L->top); + api_incr_top(L); + } + else { + int i; + TValue v; + int *lineinfo = f->l.p->lineinfo; + Table *t = luaH_new(L); /* new table to store active lines */ + sethvalue(L, L->top, t); /* push it on stack */ + api_incr_top(L); + setbvalue(&v, 1); /* boolean 'true' to be the value of all indices */ + for (i = 0; i < f->l.p->sizelineinfo; i++) /* for all lines with code */ + luaH_setint(L, t, lineinfo[i], &v); /* table[line] = true */ + } +} + + +static int auxgetinfo (lua_State *L, const char *what, lua_Debug *ar, + Closure *f, CallInfo *ci) { + int status = 1; + for (; *what; what++) { + switch (*what) { + case 'S': { + funcinfo(ar, f); + break; + } + case 'l': { + ar->currentline = (ci && isLua(ci)) ? currentline(ci) : -1; + break; + } + case 'u': { + ar->nups = (f == NULL) ? 0 : f->c.nupvalues; + if (noLuaClosure(f)) { + ar->isvararg = 1; + ar->nparams = 0; + } + else { + ar->isvararg = f->l.p->is_vararg; + ar->nparams = f->l.p->numparams; + } + break; + } + case 't': { + ar->istailcall = (ci) ? ci->callstatus & CIST_TAIL : 0; + break; + } + case 'n': { + /* calling function is a known Lua function? */ + if (ci && !(ci->callstatus & CIST_TAIL) && isLua(ci->previous)) + ar->namewhat = getfuncname(L, ci->previous, &ar->name); + else + ar->namewhat = NULL; + if (ar->namewhat == NULL) { + ar->namewhat = ""; /* not found */ + ar->name = NULL; + } + break; + } + case 'L': + case 'f': /* handled by lua_getinfo */ + break; + default: status = 0; /* invalid option */ + } + } + return status; +} + + +LUA_API int lua_getinfo (lua_State *L, const char *what, lua_Debug *ar) { + int status; + Closure *cl; + CallInfo *ci; + StkId func; + lua_lock(L); + swapextra(L); + if (*what == '>') { + ci = NULL; + func = L->top - 1; + api_check(L, ttisfunction(func), "function expected"); + what++; /* skip the '>' */ + L->top--; /* pop function */ + } + else { + ci = ar->i_ci; + func = ci->func; + lua_assert(ttisfunction(ci->func)); + } + cl = ttisclosure(func) ? clvalue(func) : NULL; + status = auxgetinfo(L, what, ar, cl, ci); + if (strchr(what, 'f')) { + setobjs2s(L, L->top, func); + api_incr_top(L); + } + swapextra(L); + if (strchr(what, 'L')) + collectvalidlines(L, cl); + lua_unlock(L); + return status; +} + + +/* +** {====================================================== +** Symbolic Execution +** ======================================================= +*/ + +static const char *getobjname (Proto *p, int lastpc, int reg, + const char **name); + + +/* +** find a "name" for the RK value 'c' +*/ +static void kname (Proto *p, int pc, int c, const char **name) { + if (ISK(c)) { /* is 'c' a constant? */ + TValue *kvalue = &p->k[INDEXK(c)]; + if (ttisstring(kvalue)) { /* literal constant? */ + *name = svalue(kvalue); /* it is its own name */ + return; + } + /* else no reasonable name found */ + } + else { /* 'c' is a register */ + const char *what = getobjname(p, pc, c, name); /* search for 'c' */ + if (what && *what == 'c') { /* found a constant name? */ + return; /* 'name' already filled */ + } + /* else no reasonable name found */ + } + *name = "?"; /* no reasonable name found */ +} + + +static int filterpc (int pc, int jmptarget) { + if (pc < jmptarget) /* is code conditional (inside a jump)? 
*/ + return -1; /* cannot know who sets that register */ + else return pc; /* current position sets that register */ +} + + +/* +** try to find last instruction before 'lastpc' that modified register 'reg' +*/ +static int findsetreg (Proto *p, int lastpc, int reg) { + int pc; + int setreg = -1; /* keep last instruction that changed 'reg' */ + int jmptarget = 0; /* any code before this address is conditional */ + for (pc = 0; pc < lastpc; pc++) { + Instruction i = p->code[pc]; + OpCode op = GET_OPCODE(i); + int a = GETARG_A(i); + switch (op) { + case OP_LOADNIL: { + int b = GETARG_B(i); + if (a <= reg && reg <= a + b) /* set registers from 'a' to 'a+b' */ + setreg = filterpc(pc, jmptarget); + break; + } + case OP_TFORCALL: { + if (reg >= a + 2) /* affect all regs above its base */ + setreg = filterpc(pc, jmptarget); + break; + } + case OP_CALL: + case OP_TAILCALL: { + if (reg >= a) /* affect all registers above base */ + setreg = filterpc(pc, jmptarget); + break; + } + case OP_JMP: { + int b = GETARG_sBx(i); + int dest = pc + 1 + b; + /* jump is forward and do not skip `lastpc'? */ + if (pc < dest && dest <= lastpc) { + if (dest > jmptarget) + jmptarget = dest; /* update 'jmptarget' */ + } + break; + } + case OP_TEST: { + if (reg == a) /* jumped code can change 'a' */ + setreg = filterpc(pc, jmptarget); + break; + } + default: + if (testAMode(op) && reg == a) /* any instruction that set A */ + setreg = filterpc(pc, jmptarget); + break; + } + } + return setreg; +} + + +static const char *getobjname (Proto *p, int lastpc, int reg, + const char **name) { + int pc; + *name = luaF_getlocalname(p, reg + 1, lastpc); + if (*name) /* is a local? */ + return "local"; + /* else try symbolic execution */ + pc = findsetreg(p, lastpc, reg); + if (pc != -1) { /* could find instruction? */ + Instruction i = p->code[pc]; + OpCode op = GET_OPCODE(i); + switch (op) { + case OP_MOVE: { + int b = GETARG_B(i); /* move from 'b' to 'a' */ + if (b < GETARG_A(i)) + return getobjname(p, pc, b, name); /* get name for 'b' */ + break; + } + case OP_GETTABUP: + case OP_GETTABLE: { + int k = GETARG_C(i); /* key index */ + int t = GETARG_B(i); /* table index */ + const char *vn = (op == OP_GETTABLE) /* name of indexed variable */ + ? luaF_getlocalname(p, t + 1, pc) + : upvalname(p, t); + kname(p, pc, k, name); + return (vn && strcmp(vn, LUA_ENV) == 0) ? "global" : "field"; + } + case OP_GETUPVAL: { + *name = upvalname(p, GETARG_B(i)); + return "upvalue"; + } + case OP_LOADK: + case OP_LOADKX: { + int b = (op == OP_LOADK) ? 
GETARG_Bx(i) + : GETARG_Ax(p->code[pc + 1]); + if (ttisstring(&p->k[b])) { + *name = svalue(&p->k[b]); + return "constant"; + } + break; + } + case OP_SELF: { + int k = GETARG_C(i); /* key index */ + kname(p, pc, k, name); + return "method"; + } + default: break; /* go through to return NULL */ + } + } + return NULL; /* could not find reasonable name */ +} + + +static const char *getfuncname (lua_State *L, CallInfo *ci, const char **name) { + TMS tm; + Proto *p = ci_func(ci)->p; /* calling function */ + int pc = currentpc(ci); /* calling instruction index */ + Instruction i = p->code[pc]; /* calling instruction */ + switch (GET_OPCODE(i)) { + case OP_CALL: + case OP_TAILCALL: /* get function name */ + return getobjname(p, pc, GETARG_A(i), name); + case OP_TFORCALL: { /* for iterator */ + *name = "for iterator"; + return "for iterator"; + } + /* all other instructions can call only through metamethods */ + case OP_SELF: + case OP_GETTABUP: + case OP_GETTABLE: tm = TM_INDEX; break; + case OP_SETTABUP: + case OP_SETTABLE: tm = TM_NEWINDEX; break; + case OP_EQ: tm = TM_EQ; break; + case OP_ADD: tm = TM_ADD; break; + case OP_SUB: tm = TM_SUB; break; + case OP_MUL: tm = TM_MUL; break; + case OP_DIV: tm = TM_DIV; break; + case OP_MOD: tm = TM_MOD; break; + case OP_POW: tm = TM_POW; break; + case OP_UNM: tm = TM_UNM; break; + case OP_LEN: tm = TM_LEN; break; + case OP_LT: tm = TM_LT; break; + case OP_LE: tm = TM_LE; break; + case OP_CONCAT: tm = TM_CONCAT; break; + default: + return NULL; /* else no useful name can be found */ + } + *name = getstr(G(L)->tmname[tm]); + return "metamethod"; +} + +/* }====================================================== */ + + + +/* +** only ANSI way to check whether a pointer points to an array +** (used only for error messages, so efficiency is not a big concern) +*/ +static int isinstack (CallInfo *ci, const TValue *o) { + StkId p; + for (p = ci->u.l.base; p < ci->top; p++) + if (o == p) return 1; + return 0; +} + + +static const char *getupvalname (CallInfo *ci, const TValue *o, + const char **name) { + LClosure *c = ci_func(ci); + int i; + for (i = 0; i < c->nupvalues; i++) { + if (c->upvals[i]->v == o) { + *name = upvalname(c->p, i); + return "upvalue"; + } + } + return NULL; +} + + +l_noret luaG_typeerror (lua_State *L, const TValue *o, const char *op) { + CallInfo *ci = L->ci; + const char *name = NULL; + const char *t = objtypename(o); + const char *kind = NULL; + if (isLua(ci)) { + kind = getupvalname(ci, o, &name); /* check whether 'o' is an upvalue */ + if (!kind && isinstack(ci, o)) /* no? 
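(editor's annotation, not in the upstream source: the name lookup for the offending value tries the closure's upvalues first and only then, for values living in the current frame's stack slice, falls back to getobjname's symbolic execution of the bytecode to derive a kind such as local, global, field, or method for the error message)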
try a register */ + kind = getobjname(ci_func(ci)->p, currentpc(ci), + cast_int(o - ci->u.l.base), &name); + } + if (kind) + luaG_runerror(L, "attempt to %s %s " LUA_QS " (a %s value)", + op, kind, name, t); + else + luaG_runerror(L, "attempt to %s a %s value", op, t); +} + + +l_noret luaG_concaterror (lua_State *L, StkId p1, StkId p2) { + if (ttisstring(p1) || ttisnumber(p1)) p1 = p2; + lua_assert(!ttisstring(p1) && !ttisnumber(p1)); + luaG_typeerror(L, p1, "concatenate"); +} + + +l_noret luaG_aritherror (lua_State *L, const TValue *p1, const TValue *p2) { + TValue temp; + if (luaV_tonumber(p1, &temp) == NULL) + p2 = p1; /* first operand is wrong */ + luaG_typeerror(L, p2, "perform arithmetic on"); +} + + +l_noret luaG_ordererror (lua_State *L, const TValue *p1, const TValue *p2) { + const char *t1 = objtypename(p1); + const char *t2 = objtypename(p2); + if (t1 == t2) + luaG_runerror(L, "attempt to compare two %s values", t1); + else + luaG_runerror(L, "attempt to compare %s with %s", t1, t2); +} + + +static void addinfo (lua_State *L, const char *msg) { + CallInfo *ci = L->ci; + if (isLua(ci)) { /* is Lua code? */ + char buff[LUA_IDSIZE]; /* add file:line information */ + int line = currentline(ci); + TString *src = ci_func(ci)->p->source; + if (src) + luaO_chunkid(buff, getstr(src), LUA_IDSIZE); + else { /* no source available; use "?" instead */ + buff[0] = '?'; buff[1] = '\0'; + } + luaO_pushfstring(L, "%s:%d: %s", buff, line, msg); + } +} + + +l_noret luaG_errormsg (lua_State *L) { + if (L->errfunc != 0) { /* is there an error handling function? */ + StkId errfunc = restorestack(L, L->errfunc); + if (!ttisfunction(errfunc)) luaD_throw(L, LUA_ERRERR); + setobjs2s(L, L->top, L->top - 1); /* move argument */ + setobjs2s(L, L->top - 1, errfunc); /* push function */ + L->top++; + luaD_call(L, L->top - 2, 1, 0); /* call it */ + } + luaD_throw(L, LUA_ERRRUN); +} + + +l_noret luaG_runerror (lua_State *L, const char *fmt, ...) { + va_list argp; + va_start(argp, fmt); + addinfo(L, luaO_pushvfstring(L, fmt, argp)); + va_end(argp); + luaG_errormsg(L); +} + diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldebug.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldebug.h new file mode 100644 index 000000000000..6445c763ea51 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldebug.h @@ -0,0 +1,34 @@ +/* +** $Id: ldebug.h,v 2.7.1.1 2013/04/12 18:48:47 roberto Exp $ +** Auxiliary functions from Debug Interface module +** See Copyright Notice in lua.h +*/ + +#ifndef ldebug_h +#define ldebug_h + + +#include "lstate.h" + + +#define pcRel(pc, p) (cast(int, (pc) - (p)->code) - 1) + +#define getfuncline(f,pc) (((f)->lineinfo) ? 
(f)->lineinfo[pc] : 0) + +#define resethookcount(L) (L->hookcount = L->basehookcount) + +/* Active Lua function (given call info) */ +#define ci_func(ci) (clLvalue((ci)->func)) + + +LUAI_FUNC l_noret luaG_typeerror (lua_State *L, const TValue *o, + const char *opname); +LUAI_FUNC l_noret luaG_concaterror (lua_State *L, StkId p1, StkId p2); +LUAI_FUNC l_noret luaG_aritherror (lua_State *L, const TValue *p1, + const TValue *p2); +LUAI_FUNC l_noret luaG_ordererror (lua_State *L, const TValue *p1, + const TValue *p2); +LUAI_FUNC l_noret luaG_runerror (lua_State *L, const char *fmt, ...); +LUAI_FUNC l_noret luaG_errormsg (lua_State *L); + +#endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldo.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldo.c new file mode 100644 index 000000000000..cb49cb55e6cf --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldo.c @@ -0,0 +1,691 @@ +/* +** $Id: ldo.c,v 2.108.1.3 2013/11/08 18:22:50 roberto Exp $ +** Stack and Call structure of Lua +** See Copyright Notice in lua.h +*/ + + +#include <sys/zfs_context.h> + +#define ldo_c +#define LUA_CORE + +#include "lua.h" + +#include "lapi.h" +#include "ldebug.h" +#include "ldo.h" +#include "lfunc.h" +#include "lgc.h" +#include "lmem.h" +#include "lobject.h" +#include "lopcodes.h" +#include "lparser.h" +#include "lstate.h" +#include "lstring.h" +#include "ltable.h" +#include "ltm.h" +#include "lundump.h" +#include "lvm.h" +#include "lzio.h" + + + + +/* +** {====================================================== +** Error-recovery functions +** ======================================================= +*/ + +/* +** LUAI_THROW/LUAI_TRY define how Lua does exception handling. By +** default, Lua handles errors with exceptions when compiling as +** C++ code, with _longjmp/_setjmp when asked to use them, and with +** longjmp/setjmp otherwise. +*/ +#if !defined(LUAI_THROW) + +#ifdef _KERNEL +#ifdef illumos +#define LUAI_THROW(L,c) longjmp(&(c)->b) +#define LUAI_TRY(L,c,a) if (setjmp(&(c)->b) == 0) { a } +#define luai_jmpbuf label_t +#else +#define LUAI_THROW(L,c) longjmp((c)->b, 1) +#define LUAI_TRY(L,c,a) if (setjmp((c)->b) == 0) { a } +#define luai_jmpbuf jmp_buf +#endif +#else +#if defined(__cplusplus) && !defined(LUA_USE_LONGJMP) +/* C++ exceptions */ +#define LUAI_THROW(L,c) throw(c) +#define LUAI_TRY(L,c,a) \ + try { a } catch(...) { if ((c)->status == 0) (c)->status = -1; } +#define luai_jmpbuf int /* dummy variable */ + +#elif defined(LUA_USE_ULONGJMP) +/* in Unix, try _longjmp/_setjmp (more efficient) */ +#define LUAI_THROW(L,c) _longjmp((c)->b, 1) +#define LUAI_TRY(L,c,a) if (_setjmp((c)->b) == 0) { a } +#define luai_jmpbuf jmp_buf + +#else +/* default handling with long jumps */ +#define LUAI_THROW(L,c) longjmp((c)->b, 1) +#define LUAI_TRY(L,c,a) if (setjmp((c)->b) == 0) { a } +#define luai_jmpbuf jmp_buf + +#endif + +#endif + +#endif + + +/* chain list of long jump buffers */ +struct lua_longjmp { + struct lua_longjmp *previous; + luai_jmpbuf b; + volatile int status; /* error code */ +}; + + +static void seterrorobj (lua_State *L, int errcode, StkId oldtop) { + switch (errcode) { + case LUA_ERRMEM: { /* memory error? */ + setsvalue2s(L, oldtop, G(L)->memerrmsg); /* reuse preregistered msg. 
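/*
 * Illustrative sketch (editor's addition, not part of this commit): the
 * LUAI_TRY/LUAI_THROW macros defined earlier in this file reduce, in the
 * default configuration, to the classic setjmp/longjmp protected-call
 * idiom. A minimal standalone version, with hypothetical names (Jmp,
 * run_protected), assuming only the C standard library:
 */
#include <setjmp.h>
#include <stdio.h>

typedef struct Jmp { jmp_buf b; volatile int status; } Jmp;

static void may_fail(Jmp *j, int fail) {
  if (fail)
    longjmp(j->b, 1);          /* "throw": unwind to the setjmp point */
}

static int run_protected(int fail) {
  Jmp j;
  j.status = 0;
  if (setjmp(j.b) == 0)        /* "try": setjmp returns 0 on the direct call */
    may_fail(&j, fail);
  else
    j.status = -1;             /* "catch": reached only via longjmp */
  return j.status;
}

int main(void) {
  printf("ok=%d err=%d\n", run_protected(0), run_protected(1));
  return 0;
}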
*/ + break; + } + case LUA_ERRERR: { + setsvalue2s(L, oldtop, luaS_newliteral(L, "error in error handling")); + break; + } + default: { + setobjs2s(L, oldtop, L->top - 1); /* error message on current top */ + break; + } + } + L->top = oldtop + 1; +} + + +l_noret luaD_throw (lua_State *L, int errcode) { + if (L->errorJmp) { /* thread has an error handler? */ + L->errorJmp->status = errcode; /* set status */ + LUAI_THROW(L, L->errorJmp); /* jump to it */ + } + else { /* thread has no error handler */ + L->status = cast_byte(errcode); /* mark it as dead */ + if (G(L)->mainthread->errorJmp) { /* main thread has a handler? */ + setobjs2s(L, G(L)->mainthread->top++, L->top - 1); /* copy error obj. */ + luaD_throw(G(L)->mainthread, errcode); /* re-throw in main thread */ + } + else { /* no handler at all; abort */ + if (G(L)->panic) { /* panic function? */ + lua_unlock(L); + G(L)->panic(L); /* call it (last chance to jump out) */ + } + panic("no error handler"); + } + } +} + + +int luaD_rawrunprotected (lua_State *L, Pfunc f, void *ud) { + unsigned short oldnCcalls = L->nCcalls; + struct lua_longjmp lj; + lj.status = LUA_OK; + lj.previous = L->errorJmp; /* chain new error handler */ + L->errorJmp = &lj; + LUAI_TRY(L, &lj, + (*f)(L, ud); + ); + L->errorJmp = lj.previous; /* restore old error handler */ + L->nCcalls = oldnCcalls; + return lj.status; +} + +/* }====================================================== */ + + +static void correctstack (lua_State *L, TValue *oldstack) { + CallInfo *ci; + GCObject *up; + L->top = (L->top - oldstack) + L->stack; + for (up = L->openupval; up != NULL; up = up->gch.next) + gco2uv(up)->v = (gco2uv(up)->v - oldstack) + L->stack; + for (ci = L->ci; ci != NULL; ci = ci->previous) { + ci->top = (ci->top - oldstack) + L->stack; + ci->func = (ci->func - oldstack) + L->stack; + if (isLua(ci)) + ci->u.l.base = (ci->u.l.base - oldstack) + L->stack; + } +} + + +/* some space for error handling */ +#define ERRORSTACKSIZE (LUAI_MAXSTACK + 200) + + +void luaD_reallocstack (lua_State *L, int newsize) { + TValue *oldstack = L->stack; + int lim = L->stacksize; + lua_assert(newsize <= LUAI_MAXSTACK || newsize == ERRORSTACKSIZE); + lua_assert(L->stack_last - L->stack == L->stacksize - EXTRA_STACK); + luaM_reallocvector(L, L->stack, L->stacksize, newsize, TValue); + for (; lim < newsize; lim++) + setnilvalue(L->stack + lim); /* erase new segment */ + L->stacksize = newsize; + L->stack_last = L->stack + newsize - EXTRA_STACK; + correctstack(L, oldstack); +} + + +void luaD_growstack (lua_State *L, int n) { + int size = L->stacksize; + if (size > LUAI_MAXSTACK) /* error after extra size? */ + luaD_throw(L, LUA_ERRERR); + else { + int needed = cast_int(L->top - L->stack) + n + EXTRA_STACK; + int newsize = 2 * size; + if (newsize > LUAI_MAXSTACK) newsize = LUAI_MAXSTACK; + if (newsize < needed) newsize = needed; + if (newsize > LUAI_MAXSTACK) { /* stack overflow? 
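/*
 * Illustrative sketch (editor's addition, not part of this commit):
 * correctstack() above rebases every saved pointer after the stack array
 * is reallocated and possibly moved. The same rebasing step applies to
 * any realloc'd buffer with interior pointers. Hypothetical Buf type;
 * note that, exactly like correctstack(), this computes an offset from
 * the stale old base, which is formally undefined in ISO C but is the
 * conventional idiom:
 */
#include <stdlib.h>

typedef struct Buf { int *data; int *cursor; size_t size; } Buf;

static int buf_grow(Buf *b, size_t newsize) {
  int *olddata = b->data;
  int *newdata = realloc(olddata, newsize * sizeof(int));
  if (newdata == NULL) return -1;               /* old buffer still valid */
  b->cursor = (b->cursor - olddata) + newdata;  /* rebase interior pointer */
  b->data = newdata;
  b->size = newsize;
  return 0;
}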
*/ + luaD_reallocstack(L, ERRORSTACKSIZE); + luaG_runerror(L, "stack overflow"); + } + else + luaD_reallocstack(L, newsize); + } +} + + +static int stackinuse (lua_State *L) { + CallInfo *ci; + StkId lim = L->top; + for (ci = L->ci; ci != NULL; ci = ci->previous) { + lua_assert(ci->top <= L->stack_last); + if (lim < ci->top) lim = ci->top; + } + return cast_int(lim - L->stack) + 1; /* part of stack in use */ +} + + +void luaD_shrinkstack (lua_State *L) { + int inuse = stackinuse(L); + int goodsize = inuse + (inuse / 8) + 2*EXTRA_STACK; + if (goodsize > LUAI_MAXSTACK) goodsize = LUAI_MAXSTACK; + if (inuse > LUAI_MAXSTACK || /* handling stack overflow? */ + goodsize >= L->stacksize) /* would grow instead of shrink? */ + condmovestack(L); /* don't change stack (change only for debugging) */ + else + luaD_reallocstack(L, goodsize); /* shrink it */ +} + + +void luaD_hook (lua_State *L, int event, int line) { + lua_Hook hook = L->hook; + if (hook && L->allowhook) { + CallInfo *ci = L->ci; + ptrdiff_t top = savestack(L, L->top); + ptrdiff_t ci_top = savestack(L, ci->top); + lua_Debug ar; + ar.event = event; + ar.currentline = line; + ar.i_ci = ci; + luaD_checkstack(L, LUA_MINSTACK); /* ensure minimum stack size */ + ci->top = L->top + LUA_MINSTACK; + lua_assert(ci->top <= L->stack_last); + L->allowhook = 0; /* cannot call hooks inside a hook */ + ci->callstatus |= CIST_HOOKED; + lua_unlock(L); + (*hook)(L, &ar); + lua_lock(L); + lua_assert(!L->allowhook); + L->allowhook = 1; + ci->top = restorestack(L, ci_top); + L->top = restorestack(L, top); + ci->callstatus &= ~CIST_HOOKED; + } +} + + +static void callhook (lua_State *L, CallInfo *ci) { + int hook = LUA_HOOKCALL; + ci->u.l.savedpc++; /* hooks assume 'pc' is already incremented */ + if (isLua(ci->previous) && + GET_OPCODE(*(ci->previous->u.l.savedpc - 1)) == OP_TAILCALL) { + ci->callstatus |= CIST_TAIL; + hook = LUA_HOOKTAILCALL; + } + luaD_hook(L, hook, -1); + ci->u.l.savedpc--; /* correct 'pc' */ +} + + +static StkId adjust_varargs (lua_State *L, Proto *p, int actual) { + int i; + int nfixargs = p->numparams; + StkId base, fixed; + lua_assert(actual >= nfixargs); + /* move fixed parameters to final position */ + luaD_checkstack(L, p->maxstacksize); /* check again for new 'base' */ + fixed = L->top - actual; /* first fixed argument */ + base = L->top; /* final position of first argument */ + for (i=0; i<nfixargs; i++) { + setobjs2s(L, L->top++, fixed + i); + setnilvalue(fixed + i); + } + return base; +} + + +static StkId tryfuncTM (lua_State *L, StkId func) { + const TValue *tm = luaT_gettmbyobj(L, func, TM_CALL); + StkId p; + ptrdiff_t funcr = savestack(L, func); + if (!ttisfunction(tm)) + luaG_typeerror(L, func, "call"); + /* Open a hole inside the stack at `func' */ + for (p = L->top; p > func; p--) setobjs2s(L, p, p-1); + incr_top(L); + func = restorestack(L, funcr); /* previous call may change stack */ + setobj2s(L, func, tm); /* tag method is the new function to be called */ + return func; +} + + + +#define next_ci(L) (L->ci = (L->ci->next ? 
L->ci->next : luaE_extendCI(L))) + + +/* +** returns true if function has been executed (C function) +*/ +int luaD_precall (lua_State *L, StkId func, int nresults) { + lua_CFunction f; + CallInfo *ci; + int n; /* number of arguments (Lua) or returns (C) */ + ptrdiff_t funcr = savestack(L, func); + switch (ttype(func)) { + case LUA_TLCF: /* light C function */ + f = fvalue(func); + goto Cfunc; + case LUA_TCCL: { /* C closure */ + f = clCvalue(func)->f; + Cfunc: + luaD_checkstack(L, LUA_MINSTACK); /* ensure minimum stack size */ + ci = next_ci(L); /* now 'enter' new function */ + ci->nresults = nresults; + ci->func = restorestack(L, funcr); + ci->top = L->top + LUA_MINSTACK; + lua_assert(ci->top <= L->stack_last); + ci->callstatus = 0; + luaC_checkGC(L); /* stack grow uses memory */ + if (L->hookmask & LUA_MASKCALL) + luaD_hook(L, LUA_HOOKCALL, -1); + lua_unlock(L); + n = (*f)(L); /* do the actual call */ + lua_lock(L); + api_checknelems(L, n); + luaD_poscall(L, L->top - n); + return 1; + } + case LUA_TLCL: { /* Lua function: prepare its call */ + StkId base; + Proto *p = clLvalue(func)->p; + n = cast_int(L->top - func) - 1; /* number of real arguments */ + luaD_checkstack(L, p->maxstacksize); + for (; n < p->numparams; n++) + setnilvalue(L->top++); /* complete missing arguments */ + if (!p->is_vararg) { + func = restorestack(L, funcr); + base = func + 1; + } + else { + base = adjust_varargs(L, p, n); + func = restorestack(L, funcr); /* previous call can change stack */ + } + ci = next_ci(L); /* now 'enter' new function */ + ci->nresults = nresults; + ci->func = func; + ci->u.l.base = base; + ci->top = base + p->maxstacksize; + lua_assert(ci->top <= L->stack_last); + ci->u.l.savedpc = p->code; /* starting point */ + ci->callstatus = CIST_LUA; + L->top = ci->top; + luaC_checkGC(L); /* stack grow uses memory */ + if (L->hookmask & LUA_MASKCALL) + callhook(L, ci); + return 0; + } + default: { /* not a function */ + func = tryfuncTM(L, func); /* retry with 'function' tag method */ + return luaD_precall(L, func, nresults); /* now it must be a function */ + } + } +} + + +int luaD_poscall (lua_State *L, StkId firstResult) { + StkId res; + int wanted, i; + CallInfo *ci = L->ci; + if (L->hookmask & (LUA_MASKRET | LUA_MASKLINE)) { + if (L->hookmask & LUA_MASKRET) { + ptrdiff_t fr = savestack(L, firstResult); /* hook may change stack */ + luaD_hook(L, LUA_HOOKRET, -1); + firstResult = restorestack(L, fr); + } + L->oldpc = ci->previous->u.l.savedpc; /* 'oldpc' for caller function */ + } + res = ci->func; /* res == final position of 1st result */ + wanted = ci->nresults; + L->ci = ci = ci->previous; /* back to caller */ + /* move results to correct place */ + for (i = wanted; i != 0 && firstResult < L->top; i--) + setobjs2s(L, res++, firstResult++); + while (i-- > 0) + setnilvalue(res++); + L->top = res; + return (wanted - LUA_MULTRET); /* 0 iff wanted == LUA_MULTRET */ +} + + +/* +** Call a function (C or Lua). The function to be called is at *func. +** The arguments are on the stack, right after the function. +** When returns, all the results are on the stack, starting at the original +** function position. 
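/*
 * Illustrative sketch (editor's addition, not part of this commit):
 * luaD_precall/luaD_poscall above are the internals behind the public
 * call API. From the host side the protocol is: push the function, push
 * the arguments, call, read the results. Standard Lua 5.2 API calls only:
 */
#include "lua.h"
#include "lauxlib.h"

static int call_add(lua_State *L) {
  if (luaL_dostring(L, "function add(a, b) return a + b end") != LUA_OK)
    return -1;
  lua_getglobal(L, "add");              /* function to call */
  lua_pushinteger(L, 2);                /* first argument */
  lua_pushinteger(L, 3);                /* second argument */
  if (lua_pcall(L, 2, 1, 0) != LUA_OK)  /* 2 args, 1 result, no msgh */
    return -1;
  return (int)lua_tointeger(L, -1);     /* 5 */
}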
+*/ +void luaD_call (lua_State *L, StkId func, int nResults, int allowyield) { + if (++L->nCcalls >= LUAI_MAXCCALLS) { + if (L->nCcalls == LUAI_MAXCCALLS) + luaG_runerror(L, "C stack overflow"); + else if (L->nCcalls >= (LUAI_MAXCCALLS + (LUAI_MAXCCALLS>>3))) + luaD_throw(L, LUA_ERRERR); /* error while handing stack error */ + } + if (!allowyield) L->nny++; + if (!luaD_precall(L, func, nResults)) /* is a Lua function? */ + luaV_execute(L); /* call it */ + if (!allowyield) L->nny--; + L->nCcalls--; +} + + +static void finishCcall (lua_State *L) { + CallInfo *ci = L->ci; + int n; + lua_assert(ci->u.c.k != NULL); /* must have a continuation */ + lua_assert(L->nny == 0); + if (ci->callstatus & CIST_YPCALL) { /* was inside a pcall? */ + ci->callstatus &= ~CIST_YPCALL; /* finish 'lua_pcall' */ + L->errfunc = ci->u.c.old_errfunc; + } + /* finish 'lua_callk'/'lua_pcall' */ + adjustresults(L, ci->nresults); + /* call continuation function */ + if (!(ci->callstatus & CIST_STAT)) /* no call status? */ + ci->u.c.status = LUA_YIELD; /* 'default' status */ + lua_assert(ci->u.c.status != LUA_OK); + ci->callstatus = (ci->callstatus & ~(CIST_YPCALL | CIST_STAT)) | CIST_YIELDED; + lua_unlock(L); + n = (*ci->u.c.k)(L); + lua_lock(L); + api_checknelems(L, n); + /* finish 'luaD_precall' */ + luaD_poscall(L, L->top - n); +} + + +static void unroll (lua_State *L, void *ud) { + UNUSED(ud); + for (;;) { + if (L->ci == &L->base_ci) /* stack is empty? */ + return; /* coroutine finished normally */ + if (!isLua(L->ci)) /* C function? */ + finishCcall(L); + else { /* Lua function */ + luaV_finishOp(L); /* finish interrupted instruction */ + luaV_execute(L); /* execute down to higher C 'boundary' */ + } + } +} + + +/* +** check whether thread has a suspended protected call +*/ +static CallInfo *findpcall (lua_State *L) { + CallInfo *ci; + for (ci = L->ci; ci != NULL; ci = ci->previous) { /* search for a pcall */ + if (ci->callstatus & CIST_YPCALL) + return ci; + } + return NULL; /* no pending pcall */ +} + + +static int recover (lua_State *L, int status) { + StkId oldtop; + CallInfo *ci = findpcall(L); + if (ci == NULL) return 0; /* no recovery point */ + /* "finish" luaD_pcall */ + oldtop = restorestack(L, ci->extra); + luaF_close(L, oldtop); + seterrorobj(L, status, oldtop); + L->ci = ci; + L->allowhook = ci->u.c.old_allowhook; + L->nny = 0; /* should be zero to be yieldable */ + luaD_shrinkstack(L); + L->errfunc = ci->u.c.old_errfunc; + ci->callstatus |= CIST_STAT; /* call has error status */ + ci->u.c.status = status; /* (here it is) */ + return 1; /* continue running the coroutine */ +} + + +/* +** signal an error in the call to 'resume', not in the execution of the +** coroutine itself. (Such errors should not be handled by any coroutine +** error handler and should not kill the coroutine.) +*/ +static l_noret resume_error (lua_State *L, const char *msg, StkId firstArg) { + L->top = firstArg; /* remove args from the stack */ + setsvalue2s(L, L->top, luaS_new(L, msg)); /* push error message */ + api_incr_top(L); + luaD_throw(L, -1); /* jump back to 'lua_resume' */ +} + + +/* +** do the work for 'lua_resume' in protected mode +*/ +static void resume_cb (lua_State *L, void *ud) { + int nCcalls = L->nCcalls; + StkId firstArg = cast(StkId, ud); + CallInfo *ci = L->ci; + if (nCcalls >= LUAI_MAXCCALLS) + resume_error(L, "C stack overflow", firstArg); + if (L->status == LUA_OK) { /* may be starting a coroutine */ + if (ci != &L->base_ci) /* not in base level? 
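/*
 * Illustrative sketch (editor's addition, not part of this commit):
 * resume_cb() runs under luaD_rawrunprotected so that an error inside a
 * coroutine unwinds back to lua_resume instead of the host. The host-side
 * view, using the standard 5.2 API (assumes luaL_openlibs was called on
 * the main state so that coroutine.yield exists):
 */
#include "lua.h"
#include "lauxlib.h"

static int resume_demo(lua_State *L) {
  lua_State *co = lua_newthread(L);
  luaL_loadstring(co, "coroutine.yield(1); return 2");
  if (lua_resume(co, L, 0) != LUA_YIELD)  /* first run: stops at the yield */
    return -1;
  if (lua_resume(co, L, 0) != LUA_OK)     /* second run: finishes */
    return -1;
  return (int)lua_tointeger(co, -1);      /* 2 */
}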
*/ + resume_error(L, "cannot resume non-suspended coroutine", firstArg); + /* coroutine is in base level; start running it */ + if (!luaD_precall(L, firstArg - 1, LUA_MULTRET)) /* Lua function? */ + luaV_execute(L); /* call it */ + } + else if (L->status != LUA_YIELD) + resume_error(L, "cannot resume dead coroutine", firstArg); + else { /* resuming from previous yield */ + L->status = LUA_OK; + ci->func = restorestack(L, ci->extra); + if (isLua(ci)) /* yielded inside a hook? */ + luaV_execute(L); /* just continue running Lua code */ + else { /* 'common' yield */ + if (ci->u.c.k != NULL) { /* does it have a continuation? */ + int n; + ci->u.c.status = LUA_YIELD; /* 'default' status */ + ci->callstatus |= CIST_YIELDED; + lua_unlock(L); + n = (*ci->u.c.k)(L); /* call continuation */ + lua_lock(L); + api_checknelems(L, n); + firstArg = L->top - n; /* yield results come from continuation */ + } + luaD_poscall(L, firstArg); /* finish 'luaD_precall' */ + } + unroll(L, NULL); + } + lua_assert(nCcalls == L->nCcalls); +} + + +LUA_API int lua_resume (lua_State *L, lua_State *from, int nargs) { + int status; + int oldnny = L->nny; /* save 'nny' */ + lua_lock(L); + luai_userstateresume(L, nargs); + L->nCcalls = (from) ? from->nCcalls + 1 : 1; + L->nny = 0; /* allow yields */ + api_checknelems(L, (L->status == LUA_OK) ? nargs + 1 : nargs); + status = luaD_rawrunprotected(L, resume_cb, L->top - nargs); + if (status == -1) /* error calling 'lua_resume'? */ + status = LUA_ERRRUN; + else { /* yield or regular error */ + while (status != LUA_OK && status != LUA_YIELD) { /* error? */ + if (recover(L, status)) /* recover point? */ + status = luaD_rawrunprotected(L, unroll, NULL); /* run continuation */ + else { /* unrecoverable error */ + L->status = cast_byte(status); /* mark thread as `dead' */ + seterrorobj(L, status, L->top); + L->ci->top = L->top; + break; + } + } + lua_assert(status == L->status); + } + L->nny = oldnny; /* restore 'nny' */ + L->nCcalls--; + lua_assert(L->nCcalls == ((from) ? from->nCcalls : 0)); + lua_unlock(L); + return status; +} + + +LUA_API int lua_yieldk (lua_State *L, int nresults, int ctx, lua_CFunction k) { + CallInfo *ci = L->ci; + luai_userstateyield(L, nresults); + lua_lock(L); + api_checknelems(L, nresults); + if (L->nny > 0) { + if (L != G(L)->mainthread) + luaG_runerror(L, "attempt to yield across a C-call boundary"); + else + luaG_runerror(L, "attempt to yield from outside a coroutine"); + } + L->status = LUA_YIELD; + ci->extra = savestack(L, ci->func); /* save current 'func' */ + if (isLua(ci)) { /* inside a hook? */ + api_check(L, k == NULL, "hooks cannot continue after yielding"); + } + else { + if ((ci->u.c.k = k) != NULL) /* is there a continuation? */ + ci->u.c.ctx = ctx; /* save context */ + ci->func = L->top - nresults - 1; /* protect stack below results */ + luaD_throw(L, LUA_YIELD); + } + lua_assert(ci->callstatus & CIST_HOOKED); /* must be inside a hook */ + lua_unlock(L); + return 0; /* return to 'luaD_hook' */ +} + + +int luaD_pcall (lua_State *L, Pfunc func, void *u, + ptrdiff_t old_top, ptrdiff_t ef) { + int status; + CallInfo *old_ci = L->ci; + lu_byte old_allowhooks = L->allowhook; + unsigned short old_nny = L->nny; + ptrdiff_t old_errfunc = L->errfunc; + L->errfunc = ef; + status = luaD_rawrunprotected(L, func, u); + if (status != LUA_OK) { /* an error occurred? 
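/*
 * Illustrative sketch (editor's addition, not part of this commit):
 * lua_yieldk above does not return on the first pass; it throws LUA_YIELD
 * after stashing a continuation, and the continuation runs in a fresh C
 * frame when the coroutine is resumed. A yielding C function under the
 * 5.2 continuation API (hypothetical names cont/yielding_cfunc):
 */
#include "lua.h"

static int cont(lua_State *L) {
  /* entered on resume; roughly, the values passed to resume are on top */
  return lua_gettop(L);
}

static int yielding_cfunc(lua_State *L) {
  lua_pushliteral(L, "from C");
  return lua_yieldk(L, 1, 0, cont);  /* yields one value; cont runs later */
}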
*/ + StkId oldtop = restorestack(L, old_top); + luaF_close(L, oldtop); /* close possible pending closures */ + seterrorobj(L, status, oldtop); + L->ci = old_ci; + L->allowhook = old_allowhooks; + L->nny = old_nny; + luaD_shrinkstack(L); + } + L->errfunc = old_errfunc; + return status; +} + + + +/* +** Execute a protected parser. +*/ +struct SParser { /* data to `f_parser' */ + ZIO *z; + Mbuffer buff; /* dynamic structure used by the scanner */ + Dyndata dyd; /* dynamic structures used by the parser */ + const char *mode; + const char *name; +}; + + +static void checkmode (lua_State *L, const char *mode, const char *x) { + if (mode && strchr(mode, x[0]) == NULL) { + luaO_pushfstring(L, + "attempt to load a %s chunk (mode is " LUA_QS ")", x, mode); + luaD_throw(L, LUA_ERRSYNTAX); + } +} + + +static void f_parser (lua_State *L, void *ud) { + int i; + Closure *cl; + struct SParser *p = cast(struct SParser *, ud); + int c = zgetc(p->z); /* read first character */ + if (c == LUA_SIGNATURE[0]) { + checkmode(L, p->mode, "binary"); + cl = luaU_undump(L, p->z, &p->buff, p->name); + } + else { + checkmode(L, p->mode, "text"); + cl = luaY_parser(L, p->z, &p->buff, &p->dyd, p->name, c); + } + lua_assert(cl->l.nupvalues == cl->l.p->sizeupvalues); + for (i = 0; i < cl->l.nupvalues; i++) { /* initialize upvalues */ + UpVal *up = luaF_newupval(L); + cl->l.upvals[i] = up; + luaC_objbarrier(L, cl, up); + } +} + + +int luaD_protectedparser (lua_State *L, ZIO *z, const char *name, + const char *mode) { + struct SParser p; + int status; + L->nny++; /* cannot yield during parsing */ + p.z = z; p.name = name; p.mode = mode; + p.dyd.actvar.arr = NULL; p.dyd.actvar.size = 0; + p.dyd.gt.arr = NULL; p.dyd.gt.size = 0; + p.dyd.label.arr = NULL; p.dyd.label.size = 0; + luaZ_initbuffer(L, &p.buff); + status = luaD_pcall(L, f_parser, &p, savestack(L, L->top), L->errfunc); + luaZ_freebuffer(L, &p.buff); + luaM_freearray(L, p.dyd.actvar.arr, p.dyd.actvar.size); + luaM_freearray(L, p.dyd.gt.arr, p.dyd.gt.size); + luaM_freearray(L, p.dyd.label.arr, p.dyd.label.size); + L->nny--; + return status; +} + + diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldo.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldo.h new file mode 100644 index 000000000000..d3d3082c9ba3 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldo.h @@ -0,0 +1,46 @@ +/* +** $Id: ldo.h,v 2.20.1.1 2013/04/12 18:48:47 roberto Exp $ +** Stack and Call structure of Lua +** See Copyright Notice in lua.h +*/ + +#ifndef ldo_h +#define ldo_h + + +#include "lobject.h" +#include "lstate.h" +#include "lzio.h" + + +#define luaD_checkstack(L,n) if (L->stack_last - L->top <= (n)) \ + luaD_growstack(L, n); else condmovestack(L); + + +#define incr_top(L) {L->top++; luaD_checkstack(L,0);} + +#define savestack(L,p) ((char *)(p) - (char *)L->stack) +#define restorestack(L,n) ((TValue *)((char *)L->stack + (n))) + + +/* type of protected functions, to be ran by `runprotected' */ +typedef void (*Pfunc) (lua_State *L, void *ud); + +LUAI_FUNC int luaD_protectedparser (lua_State *L, ZIO *z, const char *name, + const char *mode); +LUAI_FUNC void luaD_hook (lua_State *L, int event, int line); +LUAI_FUNC int luaD_precall (lua_State *L, StkId func, int nresults); +LUAI_FUNC void luaD_call (lua_State *L, StkId func, int nResults, + int allowyield); +LUAI_FUNC int luaD_pcall (lua_State *L, Pfunc func, void *u, + ptrdiff_t oldtop, ptrdiff_t ef); +LUAI_FUNC int luaD_poscall (lua_State *L, StkId firstResult); +LUAI_FUNC void luaD_reallocstack 
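/*
 * Illustrative sketch (editor's addition, not part of this commit):
 * checkmode() above is what rejects a precompiled chunk when the caller
 * asked for source only (and vice versa). Hosts reach it through the mode
 * argument of the 5.2 loading functions, e.g. luaL_loadbufferx:
 */
#include <string.h>
#include "lua.h"
#include "lauxlib.h"

static int load_text_only(lua_State *L, const char *src) {
  /* mode "t": text chunks only; a binary chunk fails with LUA_ERRSYNTAX
     and an "attempt to load a binary chunk" message, per checkmode() */
  return luaL_loadbufferx(L, src, strlen(src), "=demo", "t");
}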
(lua_State *L, int newsize); +LUAI_FUNC void luaD_growstack (lua_State *L, int n); +LUAI_FUNC void luaD_shrinkstack (lua_State *L); + +LUAI_FUNC l_noret luaD_throw (lua_State *L, int errcode); +LUAI_FUNC int luaD_rawrunprotected (lua_State *L, Pfunc f, void *ud); + +#endif + diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldump.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldump.c new file mode 100644 index 000000000000..64e564933268 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldump.c @@ -0,0 +1,173 @@ +/* +** $Id: ldump.c,v 2.17.1.1 2013/04/12 18:48:47 roberto Exp $ +** save precompiled Lua chunks +** See Copyright Notice in lua.h +*/ + +#include <sys/zfs_context.h> + +#define ldump_c +#define LUA_CORE + +#include "lua.h" + +#include "lobject.h" +#include "lstate.h" +#include "lundump.h" + +typedef struct { + lua_State* L; + lua_Writer writer; + void* data; + int strip; + int status; +} DumpState; + +#define DumpMem(b,n,size,D) DumpBlock(b,(n)*(size),D) +#define DumpVar(x,D) DumpMem(&x,1,sizeof(x),D) + +static void DumpBlock(const void* b, size_t size, DumpState* D) +{ + if (D->status==0) + { + lua_unlock(D->L); + D->status=(*D->writer)(D->L,b,size,D->data); + lua_lock(D->L); + } +} + +static void DumpChar(int y, DumpState* D) +{ + char x=(char)y; + DumpVar(x,D); +} + +static void DumpInt(int x, DumpState* D) +{ + DumpVar(x,D); +} + +static void DumpNumber(lua_Number x, DumpState* D) +{ + DumpVar(x,D); +} + +static void DumpVector(const void* b, int n, size_t size, DumpState* D) +{ + DumpInt(n,D); + DumpMem(b,n,size,D); +} + +static void DumpString(const TString* s, DumpState* D) +{ + if (s==NULL) + { + size_t size=0; + DumpVar(size,D); + } + else + { + size_t size=s->tsv.len+1; /* include trailing '\0' */ + DumpVar(size,D); + DumpBlock(getstr(s),size*sizeof(char),D); + } +} + +#define DumpCode(f,D) DumpVector(f->code,f->sizecode,sizeof(Instruction),D) + +static void DumpFunction(const Proto* f, DumpState* D); + +static void DumpConstants(const Proto* f, DumpState* D) +{ + int i,n=f->sizek; + DumpInt(n,D); + for (i=0; i<n; i++) + { + const TValue* o=&f->k[i]; + DumpChar(ttypenv(o),D); + switch (ttypenv(o)) + { + case LUA_TNIL: + break; + case LUA_TBOOLEAN: + DumpChar(bvalue(o),D); + break; + case LUA_TNUMBER: + DumpNumber(nvalue(o),D); + break; + case LUA_TSTRING: + DumpString(rawtsvalue(o),D); + break; + default: lua_assert(0); + } + } + n=f->sizep; + DumpInt(n,D); + for (i=0; i<n; i++) DumpFunction(f->p[i],D); +} + +static void DumpUpvalues(const Proto* f, DumpState* D) +{ + int i,n=f->sizeupvalues; + DumpInt(n,D); + for (i=0; i<n; i++) + { + DumpChar(f->upvalues[i].instack,D); + DumpChar(f->upvalues[i].idx,D); + } +} + +static void DumpDebug(const Proto* f, DumpState* D) +{ + int i,n; + DumpString((D->strip) ? NULL : f->source,D); + n= (D->strip) ? 0 : f->sizelineinfo; + DumpVector(f->lineinfo,n,sizeof(int),D); + n= (D->strip) ? 0 : f->sizelocvars; + DumpInt(n,D); + for (i=0; i<n; i++) + { + DumpString(f->locvars[i].varname,D); + DumpInt(f->locvars[i].startpc,D); + DumpInt(f->locvars[i].endpc,D); + } + n= (D->strip) ? 
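/*
 * Illustrative sketch (editor's addition, not part of this commit): the
 * DumpBlock/lua_Writer plumbing above streams a precompiled chunk to a
 * caller-supplied callback. A typical 5.2 host-side writer sends the
 * bytes to a FILE* (hypothetical name file_writer):
 */
#include <stdio.h>
#include "lua.h"

static int file_writer(lua_State *L, const void *p, size_t sz, void *ud) {
  (void)L;  /* unused */
  return (fwrite(p, 1, sz, (FILE *)ud) != sz);  /* nonzero aborts the dump */
}

/* usage, with a Lua function on top of the stack:
 *   FILE *f = fopen("out.luac", "wb");
 *   lua_dump(L, file_writer, f);    (5.2 signature, no strip argument)
 *   fclose(f);
 */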
0 : f->sizeupvalues; + DumpInt(n,D); + for (i=0; i<n; i++) DumpString(f->upvalues[i].name,D); +} + +static void DumpFunction(const Proto* f, DumpState* D) +{ + DumpInt(f->linedefined,D); + DumpInt(f->lastlinedefined,D); + DumpChar(f->numparams,D); + DumpChar(f->is_vararg,D); + DumpChar(f->maxstacksize,D); + DumpCode(f,D); + DumpConstants(f,D); + DumpUpvalues(f,D); + DumpDebug(f,D); +} + +static void DumpHeader(DumpState* D) +{ + lu_byte h[LUAC_HEADERSIZE]; + luaU_header(h); + DumpBlock(h,LUAC_HEADERSIZE,D); +} + +/* +** dump Lua function as precompiled chunk +*/ +int luaU_dump (lua_State* L, const Proto* f, lua_Writer w, void* data, int strip) +{ + DumpState D; + D.L=L; + D.writer=w; + D.data=data; + D.strip=strip; + D.status=0; + DumpHeader(&D); + DumpFunction(f,&D); + return D.status; +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lfunc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lfunc.c new file mode 100644 index 000000000000..684e44709a8f --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lfunc.c @@ -0,0 +1,161 @@ +/* +** $Id: lfunc.c,v 2.30.1.1 2013/04/12 18:48:47 roberto Exp $ +** Auxiliary functions to manipulate prototypes and closures +** See Copyright Notice in lua.h +*/ + + +#include <sys/zfs_context.h> + +#define lfunc_c +#define LUA_CORE + +#include "lua.h" + +#include "lfunc.h" +#include "lgc.h" +#include "lmem.h" +#include "lobject.h" +#include "lstate.h" + + + +Closure *luaF_newCclosure (lua_State *L, int n) { + Closure *c = &luaC_newobj(L, LUA_TCCL, sizeCclosure(n), NULL, 0)->cl; + c->c.nupvalues = cast_byte(n); + return c; +} + + +Closure *luaF_newLclosure (lua_State *L, int n) { + Closure *c = &luaC_newobj(L, LUA_TLCL, sizeLclosure(n), NULL, 0)->cl; + c->l.p = NULL; + c->l.nupvalues = cast_byte(n); + while (n--) c->l.upvals[n] = NULL; + return c; +} + + +UpVal *luaF_newupval (lua_State *L) { + UpVal *uv = &luaC_newobj(L, LUA_TUPVAL, sizeof(UpVal), NULL, 0)->uv; + uv->v = &uv->u.value; + setnilvalue(uv->v); + return uv; +} + + +UpVal *luaF_findupval (lua_State *L, StkId level) { + global_State *g = G(L); + GCObject **pp = &L->openupval; + UpVal *p; + UpVal *uv; + while (*pp != NULL && (p = gco2uv(*pp))->v >= level) { + GCObject *o = obj2gco(p); + lua_assert(p->v != &p->u.value); + lua_assert(!isold(o) || isold(obj2gco(L))); + if (p->v == level) { /* found a corresponding upvalue? */ + if (isdead(g, o)) /* is it dead? */ + changewhite(o); /* resurrect it */ + return p; + } + pp = &p->next; + } + /* not found: create a new one */ + uv = &luaC_newobj(L, LUA_TUPVAL, sizeof(UpVal), pp, 0)->uv; + uv->v = level; /* current value lives in the stack */ + uv->u.l.prev = &g->uvhead; /* double link it in `uvhead' list */ + uv->u.l.next = g->uvhead.u.l.next; + uv->u.l.next->u.l.prev = uv; + g->uvhead.u.l.next = uv; + lua_assert(uv->u.l.next->u.l.prev == uv && uv->u.l.prev->u.l.next == uv); + return uv; +} + + +static void unlinkupval (UpVal *uv) { + lua_assert(uv->u.l.next->u.l.prev == uv && uv->u.l.prev->u.l.next == uv); + uv->u.l.next->u.l.prev = uv->u.l.prev; /* remove from `uvhead' list */ + uv->u.l.prev->u.l.next = uv->u.l.next; +} + + +void luaF_freeupval (lua_State *L, UpVal *uv) { + if (uv->v != &uv->u.value) /* is it open? 
*/ + unlinkupval(uv); /* remove from open list */ + luaM_free(L, uv); /* free upvalue */ +} + + +void luaF_close (lua_State *L, StkId level) { + UpVal *uv; + global_State *g = G(L); + while (L->openupval != NULL && (uv = gco2uv(L->openupval))->v >= level) { + GCObject *o = obj2gco(uv); + lua_assert(!isblack(o) && uv->v != &uv->u.value); + L->openupval = uv->next; /* remove from `open' list */ + if (isdead(g, o)) + luaF_freeupval(L, uv); /* free upvalue */ + else { + unlinkupval(uv); /* remove upvalue from 'uvhead' list */ + setobj(L, &uv->u.value, uv->v); /* move value to upvalue slot */ + uv->v = &uv->u.value; /* now current value lives here */ + gch(o)->next = g->allgc; /* link upvalue into 'allgc' list */ + g->allgc = o; + luaC_checkupvalcolor(g, uv); + } + } +} + + +Proto *luaF_newproto (lua_State *L) { + Proto *f = &luaC_newobj(L, LUA_TPROTO, sizeof(Proto), NULL, 0)->p; + f->k = NULL; + f->sizek = 0; + f->p = NULL; + f->sizep = 0; + f->code = NULL; + f->cache = NULL; + f->sizecode = 0; + f->lineinfo = NULL; + f->sizelineinfo = 0; + f->upvalues = NULL; + f->sizeupvalues = 0; + f->numparams = 0; + f->is_vararg = 0; + f->maxstacksize = 0; + f->locvars = NULL; + f->sizelocvars = 0; + f->linedefined = 0; + f->lastlinedefined = 0; + f->source = NULL; + return f; +} + + +void luaF_freeproto (lua_State *L, Proto *f) { + luaM_freearray(L, f->code, f->sizecode); + luaM_freearray(L, f->p, f->sizep); + luaM_freearray(L, f->k, f->sizek); + luaM_freearray(L, f->lineinfo, f->sizelineinfo); + luaM_freearray(L, f->locvars, f->sizelocvars); + luaM_freearray(L, f->upvalues, f->sizeupvalues); + luaM_free(L, f); +} + + +/* +** Look for n-th local variable at line `line' in function `func'. +** Returns NULL if not found. +*/ +const char *luaF_getlocalname (const Proto *f, int local_number, int pc) { + int i; + for (i = 0; i<f->sizelocvars && f->locvars[i].startpc <= pc; i++) { + if (pc < f->locvars[i].endpc) { /* is variable active? 
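/*
 * Illustrative sketch (editor's addition, not part of this commit):
 * luaF_getlocalname here counts only entries whose (startpc, endpc) range
 * covers the given pc, relying on locvars being sorted by startpc. The
 * same scan reduced to a standalone form, with a hypothetical Range type:
 */
#include <stddef.h>

typedef struct Range { int start, end; const char *name; } Range;

/* entries must be sorted by 'start', as Lua's locvars are */
static const char *nth_live(const Range *r, int nranges, int pc, int n) {
  int i;
  for (i = 0; i < nranges && r[i].start <= pc; i++) {
    if (pc < r[i].end && --n == 0)  /* live at pc, and the n-th such */
      return r[i].name;
  }
  return NULL;  /* not found */
}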
*/ + local_number--; + if (local_number == 0) + return getstr(f->locvars[i].varname); + } + } + return NULL; /* not found */ +} + diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lfunc.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lfunc.h new file mode 100644 index 000000000000..ca0d3a3e0b03 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lfunc.h @@ -0,0 +1,33 @@ +/* +** $Id: lfunc.h,v 2.8.1.1 2013/04/12 18:48:47 roberto Exp $ +** Auxiliary functions to manipulate prototypes and closures +** See Copyright Notice in lua.h +*/ + +#ifndef lfunc_h +#define lfunc_h + + +#include "lobject.h" + + +#define sizeCclosure(n) (cast(int, sizeof(CClosure)) + \ + cast(int, sizeof(TValue)*((n)-1))) + +#define sizeLclosure(n) (cast(int, sizeof(LClosure)) + \ + cast(int, sizeof(TValue *)*((n)-1))) + + +LUAI_FUNC Proto *luaF_newproto (lua_State *L); +LUAI_FUNC Closure *luaF_newCclosure (lua_State *L, int nelems); +LUAI_FUNC Closure *luaF_newLclosure (lua_State *L, int nelems); +LUAI_FUNC UpVal *luaF_newupval (lua_State *L); +LUAI_FUNC UpVal *luaF_findupval (lua_State *L, StkId level); +LUAI_FUNC void luaF_close (lua_State *L, StkId level); +LUAI_FUNC void luaF_freeproto (lua_State *L, Proto *f); +LUAI_FUNC void luaF_freeupval (lua_State *L, UpVal *uv); +LUAI_FUNC const char *luaF_getlocalname (const Proto *func, int local_number, + int pc); + + +#endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lgc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lgc.c new file mode 100644 index 000000000000..4a7d25af2083 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lgc.c @@ -0,0 +1,1220 @@ +/* +** $Id: lgc.c,v 2.140.1.3 2014/09/01 16:55:08 roberto Exp $ +** Garbage Collector +** See Copyright Notice in lua.h +*/ + +#include <sys/zfs_context.h> + +#define lgc_c +#define LUA_CORE + +#include "lua.h" + +#include "ldebug.h" +#include "ldo.h" +#include "lfunc.h" +#include "lgc.h" +#include "lmem.h" +#include "lobject.h" +#include "lstate.h" +#include "lstring.h" +#include "ltable.h" +#include "ltm.h" + + + +/* +** cost of sweeping one element (the size of a small object divided +** by some adjust for the sweep speed) +*/ +#define GCSWEEPCOST ((sizeof(TString) + 4) / 4) + +/* maximum number of elements to sweep in each single step */ +#define GCSWEEPMAX (cast_int((GCSTEPSIZE / GCSWEEPCOST) / 4)) + +/* maximum number of finalizers to call in each GC step */ +#define GCFINALIZENUM 4 + + +/* +** macro to adjust 'stepmul': 'stepmul' is actually used like +** 'stepmul / STEPMULADJ' (value chosen by tests) +*/ +#define STEPMULADJ 200 + + +/* +** macro to adjust 'pause': 'pause' is actually used like +** 'pause / PAUSEADJ' (value chosen by tests) +*/ +#define PAUSEADJ 100 + + +/* +** 'makewhite' erases all color bits plus the old bit and then +** sets only the current white bit +*/ +#define maskcolors (~(bit2mask(BLACKBIT, OLDBIT) | WHITEBITS)) +#define makewhite(g,x) \ + (gch(x)->marked = cast_byte((gch(x)->marked & maskcolors) | luaC_white(g))) + +#define white2gray(x) resetbits(gch(x)->marked, WHITEBITS) +#define black2gray(x) resetbit(gch(x)->marked, BLACKBIT) + + +#define isfinalized(x) testbit(gch(x)->marked, FINALIZEDBIT) + +#define checkdeadkey(n) lua_assert(!ttisdeadkey(gkey(n)) || ttisnil(gval(n))) + + +#define checkconsistency(obj) \ + lua_longassert(!iscollectable(obj) || righttt(obj)) + + +#define markvalue(g,o) { checkconsistency(o); \ + if (valiswhite(o)) reallymarkobject(g,gcvalue(o)); } + +#define markobject(g,t) { if 
((t) && iswhite(obj2gco(t))) \ + reallymarkobject(g, obj2gco(t)); } + +static void reallymarkobject (global_State *g, GCObject *o); + + +/* +** {====================================================== +** Generic functions +** ======================================================= +*/ + + +/* +** one after last element in a hash array +*/ +#define gnodelast(h) gnode(h, cast(size_t, sizenode(h))) + + +/* +** link table 'h' into list pointed by 'p' +*/ +#define linktable(h,p) ((h)->gclist = *(p), *(p) = obj2gco(h)) + + +/* +** if key is not marked, mark its entry as dead (therefore removing it +** from the table) +*/ +static void removeentry (Node *n) { + lua_assert(ttisnil(gval(n))); + if (valiswhite(gkey(n))) + setdeadvalue(gkey(n)); /* unused and unmarked key; remove it */ +} + + +/* +** tells whether a key or value can be cleared from a weak +** table. Non-collectable objects are never removed from weak +** tables. Strings behave as `values', so are never removed too. for +** other objects: if really collected, cannot keep them; for objects +** being finalized, keep them in keys, but not in values +*/ +static int iscleared (global_State *g, const TValue *o) { + if (!iscollectable(o)) return 0; + else if (ttisstring(o)) { + markobject(g, rawtsvalue(o)); /* strings are `values', so are never weak */ + return 0; + } + else return iswhite(gcvalue(o)); +} + + +/* +** barrier that moves collector forward, that is, mark the white object +** being pointed by a black object. +*/ +void luaC_barrier_ (lua_State *L, GCObject *o, GCObject *v) { + global_State *g = G(L); + lua_assert(isblack(o) && iswhite(v) && !isdead(g, v) && !isdead(g, o)); + lua_assert(g->gcstate != GCSpause); + lua_assert(gch(o)->tt != LUA_TTABLE); + if (keepinvariantout(g)) /* must keep invariant? */ + reallymarkobject(g, v); /* restore invariant */ + else { /* sweep phase */ + lua_assert(issweepphase(g)); + makewhite(g, o); /* mark main obj. as white to avoid other barriers */ + } +} + + +/* +** barrier that moves collector backward, that is, mark the black object +** pointing to a white object as gray again. (Current implementation +** only works for tables; access to 'gclist' is not uniform across +** different types.) +*/ +void luaC_barrierback_ (lua_State *L, GCObject *o) { + global_State *g = G(L); + lua_assert(isblack(o) && !isdead(g, o) && gch(o)->tt == LUA_TTABLE); + black2gray(o); /* make object gray (again) */ + gco2t(o)->gclist = g->grayagain; + g->grayagain = o; +} + + +/* +** barrier for prototypes. When creating first closure (cache is +** NULL), use a forward barrier; this may be the only closure of the +** prototype (if it is a "regular" function, with a single instance) +** and the prototype may be big, so it is better to avoid traversing +** it again. Otherwise, use a backward barrier, to avoid marking all +** possible instances. +*/ +LUAI_FUNC void luaC_barrierproto_ (lua_State *L, Proto *p, Closure *c) { + global_State *g = G(L); + lua_assert(isblack(obj2gco(p))); + if (p->cache == NULL) { /* first time? 
*/ + luaC_objbarrier(L, p, c); + } + else { /* use a backward barrier */ + black2gray(obj2gco(p)); /* make prototype gray (again) */ + p->gclist = g->grayagain; + g->grayagain = obj2gco(p); + } +} + + +/* +** check color (and invariants) for an upvalue that was closed, +** i.e., moved into the 'allgc' list +*/ +void luaC_checkupvalcolor (global_State *g, UpVal *uv) { + GCObject *o = obj2gco(uv); + lua_assert(!isblack(o)); /* open upvalues are never black */ + if (isgray(o)) { + if (keepinvariant(g)) { + resetoldbit(o); /* see MOVE OLD rule */ + gray2black(o); /* it is being visited now */ + markvalue(g, uv->v); + } + else { + lua_assert(issweepphase(g)); + makewhite(g, o); + } + } +} + + +/* +** create a new collectable object (with given type and size) and link +** it to '*list'. 'offset' tells how many bytes to allocate before the +** object itself (used only by states). +*/ +GCObject *luaC_newobj (lua_State *L, int tt, size_t sz, GCObject **list, + int offset) { + global_State *g = G(L); + char *raw = cast(char *, luaM_newobject(L, novariant(tt), sz)); + GCObject *o = obj2gco(raw + offset); + if (list == NULL) + list = &g->allgc; /* standard list for collectable objects */ + gch(o)->marked = luaC_white(g); + gch(o)->tt = tt; + gch(o)->next = *list; + *list = o; + return o; +} + +/* }====================================================== */ + + + +/* +** {====================================================== +** Mark functions +** ======================================================= +*/ + + +/* +** mark an object. Userdata, strings, and closed upvalues are visited +** and turned black here. Other objects are marked gray and added +** to appropriate list to be visited (and turned black) later. (Open +** upvalues are already linked in 'headuv' list.) +*/ +static void reallymarkobject (global_State *g, GCObject *o) { + lu_mem size; + white2gray(o); + switch (gch(o)->tt) { + case LUA_TSHRSTR: + case LUA_TLNGSTR: { + size = sizestring(gco2ts(o)); + break; /* nothing else to mark; make it black */ + } + case LUA_TUSERDATA: { + Table *mt = gco2u(o)->metatable; + markobject(g, mt); + markobject(g, gco2u(o)->env); + size = sizeudata(gco2u(o)); + break; + } + case LUA_TUPVAL: { + UpVal *uv = gco2uv(o); + markvalue(g, uv->v); + if (uv->v != &uv->u.value) /* open? */ + return; /* open upvalues remain gray */ + size = sizeof(UpVal); + break; + } + case LUA_TLCL: { + gco2lcl(o)->gclist = g->gray; + g->gray = o; + return; + } + case LUA_TCCL: { + gco2ccl(o)->gclist = g->gray; + g->gray = o; + return; + } + case LUA_TTABLE: { + linktable(gco2t(o), &g->gray); + return; + } + case LUA_TTHREAD: { + gco2th(o)->gclist = g->gray; + g->gray = o; + return; + } + case LUA_TPROTO: { + gco2p(o)->gclist = g->gray; + g->gray = o; + return; + } + default: lua_assert(0); return; + } + gray2black(o); + g->GCmemtrav += size; +} + + +/* +** mark metamethods for basic types +*/ +static void markmt (global_State *g) { + int i; + for (i=0; i < LUA_NUMTAGS; i++) + markobject(g, g->mt[i]); +} + + +/* +** mark all objects in list of being-finalized +*/ +static void markbeingfnz (global_State *g) { + GCObject *o; + for (o = g->tobefnz; o != NULL; o = gch(o)->next) { + makewhite(g, o); + reallymarkobject(g, o); + } +} + + +/* +** mark all values stored in marked open upvalues. (See comment in +** 'lstate.h'.) 
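/*
 * Illustrative sketch (editor's addition, not part of this commit):
 * reallymarkobject above is the tri-color step: a white object becomes
 * gray and joins a worklist; objects with children only turn black once
 * those children have been visited (propagatemark, later in this file).
 * The skeleton of that scheme, with hypothetical types:
 */
#include <stddef.h>

enum Color { WHITE, GRAY, BLACK };

typedef struct Obj {
  enum Color color;
  struct Obj *gclist;   /* intrusive worklist link */
  struct Obj *child;    /* a single outgoing reference, for brevity */
} Obj;

static void mark(Obj **gray, Obj *o) {
  if (o != NULL && o->color == WHITE) {
    o->color = GRAY;    /* reachable, children not yet scanned */
    o->gclist = *gray;
    *gray = o;
  }
}

static void propagate(Obj **gray) {
  while (*gray != NULL) {
    Obj *o = *gray;
    *gray = o->gclist;  /* pop from worklist */
    o->color = BLACK;   /* children are scanned next */
    mark(gray, o->child);
  }
}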
+*/ +static void remarkupvals (global_State *g) { + UpVal *uv; + for (uv = g->uvhead.u.l.next; uv != &g->uvhead; uv = uv->u.l.next) { + if (isgray(obj2gco(uv))) + markvalue(g, uv->v); + } +} + + +/* +** mark root set and reset all gray lists, to start a new +** incremental (or full) collection +*/ +static void restartcollection (global_State *g) { + g->gray = g->grayagain = NULL; + g->weak = g->allweak = g->ephemeron = NULL; + markobject(g, g->mainthread); + markvalue(g, &g->l_registry); + markmt(g); + markbeingfnz(g); /* mark any finalizing object left from previous cycle */ +} + +/* }====================================================== */ + + +/* +** {====================================================== +** Traverse functions +** ======================================================= +*/ + +static void traverseweakvalue (global_State *g, Table *h) { + Node *n, *limit = gnodelast(h); + /* if there is array part, assume it may have white values (do not + traverse it just to check) */ + int hasclears = (h->sizearray > 0); + for (n = gnode(h, 0); n < limit; n++) { + checkdeadkey(n); + if (ttisnil(gval(n))) /* entry is empty? */ + removeentry(n); /* remove it */ + else { + lua_assert(!ttisnil(gkey(n))); + markvalue(g, gkey(n)); /* mark key */ + if (!hasclears && iscleared(g, gval(n))) /* is there a white value? */ + hasclears = 1; /* table will have to be cleared */ + } + } + if (hasclears) + linktable(h, &g->weak); /* has to be cleared later */ + else /* no white values */ + linktable(h, &g->grayagain); /* no need to clean */ +} + + +static int traverseephemeron (global_State *g, Table *h) { + int marked = 0; /* true if an object is marked in this traversal */ + int hasclears = 0; /* true if table has white keys */ + int prop = 0; /* true if table has entry "white-key -> white-value" */ + Node *n, *limit = gnodelast(h); + int i; + /* traverse array part (numeric keys are 'strong') */ + for (i = 0; i < h->sizearray; i++) { + if (valiswhite(&h->array[i])) { + marked = 1; + reallymarkobject(g, gcvalue(&h->array[i])); + } + } + /* traverse hash part */ + for (n = gnode(h, 0); n < limit; n++) { + checkdeadkey(n); + if (ttisnil(gval(n))) /* entry is empty? */ + removeentry(n); /* remove it */ + else if (iscleared(g, gkey(n))) { /* key is not marked (yet)? */ + hasclears = 1; /* table must be cleared */ + if (valiswhite(gval(n))) /* value not marked yet? */ + prop = 1; /* must propagate again */ + } + else if (valiswhite(gval(n))) { /* value not marked yet? */ + marked = 1; + reallymarkobject(g, gcvalue(gval(n))); /* mark it now */ + } + } + if (g->gcstate != GCSatomic || prop) + linktable(h, &g->ephemeron); /* have to propagate again */ + else if (hasclears) /* does table have white keys? */ + linktable(h, &g->allweak); /* may have to clean white keys */ + else /* no white keys */ + linktable(h, &g->grayagain); /* no need to clean */ + return marked; +} + + +static void traversestrongtable (global_State *g, Table *h) { + Node *n, *limit = gnodelast(h); + int i; + for (i = 0; i < h->sizearray; i++) /* traverse array part */ + markvalue(g, &h->array[i]); + for (n = gnode(h, 0); n < limit; n++) { /* traverse hash part */ + checkdeadkey(n); + if (ttisnil(gval(n))) /* entry is empty? 
*/ + removeentry(n); /* remove it */ + else { + lua_assert(!ttisnil(gkey(n))); + markvalue(g, gkey(n)); /* mark key */ + markvalue(g, gval(n)); /* mark value */ + } + } +} + + +static lu_mem traversetable (global_State *g, Table *h) { + const char *weakkey, *weakvalue; + const TValue *mode = gfasttm(g, h->metatable, TM_MODE); + markobject(g, h->metatable); + if (mode && ttisstring(mode) && /* is there a weak mode? */ + ((weakkey = strchr(svalue(mode), 'k')), + (weakvalue = strchr(svalue(mode), 'v')), + (weakkey || weakvalue))) { /* is really weak? */ + black2gray(obj2gco(h)); /* keep table gray */ + if (!weakkey) /* strong keys? */ + traverseweakvalue(g, h); + else if (!weakvalue) /* strong values? */ + traverseephemeron(g, h); + else /* all weak */ + linktable(h, &g->allweak); /* nothing to traverse now */ + } + else /* not weak */ + traversestrongtable(g, h); + return sizeof(Table) + sizeof(TValue) * h->sizearray + + sizeof(Node) * cast(size_t, sizenode(h)); +} + + +static int traverseproto (global_State *g, Proto *f) { + int i; + if (f->cache && iswhite(obj2gco(f->cache))) + f->cache = NULL; /* allow cache to be collected */ + markobject(g, f->source); + for (i = 0; i < f->sizek; i++) /* mark literals */ + markvalue(g, &f->k[i]); + for (i = 0; i < f->sizeupvalues; i++) /* mark upvalue names */ + markobject(g, f->upvalues[i].name); + for (i = 0; i < f->sizep; i++) /* mark nested protos */ + markobject(g, f->p[i]); + for (i = 0; i < f->sizelocvars; i++) /* mark local-variable names */ + markobject(g, f->locvars[i].varname); + return sizeof(Proto) + sizeof(Instruction) * f->sizecode + + sizeof(Proto *) * f->sizep + + sizeof(TValue) * f->sizek + + sizeof(int) * f->sizelineinfo + + sizeof(LocVar) * f->sizelocvars + + sizeof(Upvaldesc) * f->sizeupvalues; +} + + +static lu_mem traverseCclosure (global_State *g, CClosure *cl) { + int i; + for (i = 0; i < cl->nupvalues; i++) /* mark its upvalues */ + markvalue(g, &cl->upvalue[i]); + return sizeCclosure(cl->nupvalues); +} + +static lu_mem traverseLclosure (global_State *g, LClosure *cl) { + int i; + markobject(g, cl->p); /* mark its prototype */ + for (i = 0; i < cl->nupvalues; i++) /* mark its upvalues */ + markobject(g, cl->upvals[i]); + return sizeLclosure(cl->nupvalues); +} + + +static lu_mem traversestack (global_State *g, lua_State *th) { + int n = 0; + StkId o = th->stack; + if (o == NULL) + return 1; /* stack not completely built yet */ + for (; o < th->top; o++) /* mark live elements in the stack */ + markvalue(g, o); + if (g->gcstate == GCSatomic) { /* final traversal? */ + StkId lim = th->stack + th->stacksize; /* real end of stack */ + for (; o < lim; o++) /* clear not-marked stack slice */ + setnilvalue(o); + } + else { /* count call infos to compute size */ + CallInfo *ci; + for (ci = &th->base_ci; ci != th->ci; ci = ci->next) + n++; + } + return sizeof(lua_State) + sizeof(TValue) * th->stacksize + + sizeof(CallInfo) * n; +} + + +/* +** traverse one gray object, turning it to black (except for threads, +** which are always gray). 
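/*
 * Illustrative sketch (editor's addition, not part of this commit):
 * traversetable above parses the __mode metafield ('k' and/or 'v') to
 * pick the weak-key, weak-value, or all-weak path. From the host side
 * the mode is just a string in the metatable, set with standard API calls:
 */
#include "lua.h"

static void push_weak_valued_table(lua_State *L) {
  lua_newtable(L);            /* the table itself */
  lua_newtable(L);            /* its metatable */
  lua_pushliteral(L, "v");    /* weak values; "k" would make keys weak */
  lua_setfield(L, -2, "__mode");
  lua_setmetatable(L, -2);    /* values may now be collected */
}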
+*/ +static void propagatemark (global_State *g) { + lu_mem size; + GCObject *o = g->gray; + lua_assert(isgray(o)); + gray2black(o); + switch (gch(o)->tt) { + case LUA_TTABLE: { + Table *h = gco2t(o); + g->gray = h->gclist; /* remove from 'gray' list */ + size = traversetable(g, h); + break; + } + case LUA_TLCL: { + LClosure *cl = gco2lcl(o); + g->gray = cl->gclist; /* remove from 'gray' list */ + size = traverseLclosure(g, cl); + break; + } + case LUA_TCCL: { + CClosure *cl = gco2ccl(o); + g->gray = cl->gclist; /* remove from 'gray' list */ + size = traverseCclosure(g, cl); + break; + } + case LUA_TTHREAD: { + lua_State *th = gco2th(o); + g->gray = th->gclist; /* remove from 'gray' list */ + th->gclist = g->grayagain; + g->grayagain = o; /* insert into 'grayagain' list */ + black2gray(o); + size = traversestack(g, th); + break; + } + case LUA_TPROTO: { + Proto *p = gco2p(o); + g->gray = p->gclist; /* remove from 'gray' list */ + size = traverseproto(g, p); + break; + } + default: lua_assert(0); return; + } + g->GCmemtrav += size; +} + + +static void propagateall (global_State *g) { + while (g->gray) propagatemark(g); +} + + +static void propagatelist (global_State *g, GCObject *l) { + lua_assert(g->gray == NULL); /* no grays left */ + g->gray = l; + propagateall(g); /* traverse all elements from 'l' */ +} + +/* +** retraverse all gray lists. Because tables may be reinserted in other +** lists when traversed, traverse the original lists to avoid traversing +** twice the same table (which is not wrong, but inefficient) +*/ +static void retraversegrays (global_State *g) { + GCObject *weak = g->weak; /* save original lists */ + GCObject *grayagain = g->grayagain; + GCObject *ephemeron = g->ephemeron; + g->weak = g->grayagain = g->ephemeron = NULL; + propagateall(g); /* traverse main gray list */ + propagatelist(g, grayagain); + propagatelist(g, weak); + propagatelist(g, ephemeron); +} + + +static void convergeephemerons (global_State *g) { + int changed; + do { + GCObject *w; + GCObject *next = g->ephemeron; /* get ephemeron list */ + g->ephemeron = NULL; /* tables will return to this list when traversed */ + changed = 0; + while ((w = next) != NULL) { + next = gco2t(w)->gclist; + if (traverseephemeron(g, gco2t(w))) { /* traverse marked some value? */ + propagateall(g); /* propagate changes */ + changed = 1; /* will have to revisit all ephemeron tables */ + } + } + } while (changed); +} + +/* }====================================================== */ + + +/* +** {====================================================== +** Sweep Functions +** ======================================================= +*/ + + +/* +** clear entries with unmarked keys from all weaktables in list 'l' up +** to element 'f' +*/ +static void clearkeys (global_State *g, GCObject *l, GCObject *f) { + for (; l != f; l = gco2t(l)->gclist) { + Table *h = gco2t(l); + Node *n, *limit = gnodelast(h); + for (n = gnode(h, 0); n < limit; n++) { + if (!ttisnil(gval(n)) && (iscleared(g, gkey(n)))) { + setnilvalue(gval(n)); /* remove value ... */ + removeentry(n); /* and remove entry from table */ + } + } + } +} + + +/* +** clear entries with unmarked values from all weaktables in list 'l' up +** to element 'f' +*/ +static void clearvalues (global_State *g, GCObject *l, GCObject *f) { + for (; l != f; l = gco2t(l)->gclist) { + Table *h = gco2t(l); + Node *n, *limit = gnodelast(h); + int i; + for (i = 0; i < h->sizearray; i++) { + TValue *o = &h->array[i]; + if (iscleared(g, o)) /* value was collected? 
*/ + setnilvalue(o); /* remove value */ + } + for (n = gnode(h, 0); n < limit; n++) { + if (!ttisnil(gval(n)) && iscleared(g, gval(n))) { + setnilvalue(gval(n)); /* remove value ... */ + removeentry(n); /* and remove entry from table */ + } + } + } +} + + +static void freeobj (lua_State *L, GCObject *o) { + switch (gch(o)->tt) { + case LUA_TPROTO: luaF_freeproto(L, gco2p(o)); break; + case LUA_TLCL: { + luaM_freemem(L, o, sizeLclosure(gco2lcl(o)->nupvalues)); + break; + } + case LUA_TCCL: { + luaM_freemem(L, o, sizeCclosure(gco2ccl(o)->nupvalues)); + break; + } + case LUA_TUPVAL: luaF_freeupval(L, gco2uv(o)); break; + case LUA_TTABLE: luaH_free(L, gco2t(o)); break; + case LUA_TTHREAD: luaE_freethread(L, gco2th(o)); break; + case LUA_TUSERDATA: luaM_freemem(L, o, sizeudata(gco2u(o))); break; + case LUA_TSHRSTR: + G(L)->strt.nuse--; + /* FALLTHROUGH */ + case LUA_TLNGSTR: { + luaM_freemem(L, o, sizestring(gco2ts(o))); + break; + } + default: lua_assert(0); + } +} + + +#define sweepwholelist(L,p) sweeplist(L,p,MAX_LUMEM) +static GCObject **sweeplist (lua_State *L, GCObject **p, lu_mem count); + + +/* +** sweep the (open) upvalues of a thread and resize its stack and +** list of call-info structures. +*/ +static void sweepthread (lua_State *L, lua_State *L1) { + if (L1->stack == NULL) return; /* stack not completely built yet */ + sweepwholelist(L, &L1->openupval); /* sweep open upvalues */ + luaE_freeCI(L1); /* free extra CallInfo slots */ + /* should not change the stack during an emergency gc cycle */ + if (G(L)->gckind != KGC_EMERGENCY) + luaD_shrinkstack(L1); +} + + +/* +** sweep at most 'count' elements from a list of GCObjects erasing dead +** objects, where a dead (not alive) object is one marked with the "old" +** (non current) white and not fixed. +** In non-generational mode, change all non-dead objects back to white, +** preparing for next collection cycle. +** In generational mode, keep black objects black, and also mark them as +** old; stop when hitting an old object, as all objects after that +** one will be old too. +** When object is a thread, sweep its list of open upvalues too. +*/ +static GCObject **sweeplist (lua_State *L, GCObject **p, lu_mem count) { + global_State *g = G(L); + int ow = otherwhite(g); + int toclear, toset; /* bits to clear and to set in all live objects */ + int tostop; /* stop sweep when this is true */ + if (isgenerational(g)) { /* generational mode? */ + toclear = ~0; /* clear nothing */ + toset = bitmask(OLDBIT); /* set the old bit of all surviving objects */ + tostop = bitmask(OLDBIT); /* do not sweep old generation */ + } + else { /* normal mode */ + toclear = maskcolors; /* clear all color bits + old bit */ + toset = luaC_white(g); /* make object white */ + tostop = 0; /* do not stop */ + } + while (*p != NULL && count-- > 0) { + GCObject *curr = *p; + int marked = gch(curr)->marked; + if (isdeadm(ow, marked)) { /* is 'curr' dead? */ + *p = gch(curr)->next; /* remove 'curr' from list */ + freeobj(L, curr); /* erase 'curr' */ + } + else { + if (testbits(marked, tostop)) + return NULL; /* stop sweeping this list */ + if (gch(curr)->tt == LUA_TTHREAD) + sweepthread(L, gco2th(curr)); /* sweep thread's upvalues */ + /* update marks */ + gch(curr)->marked = cast_byte((marked & toclear) | toset); + p = &gch(curr)->next; /* go to next element */ + } + } + return (*p == NULL) ? 
NULL : p; +} + + +/* +** sweep a list until a live object (or end of list) +*/ +static GCObject **sweeptolive (lua_State *L, GCObject **p, int *n) { + GCObject ** old = p; + int i = 0; + do { + i++; + p = sweeplist(L, p, 1); + } while (p == old); + if (n) *n += i; + return p; +} + +/* }====================================================== */ + + +/* +** {====================================================== +** Finalization +** ======================================================= +*/ + +static void checkSizes (lua_State *L) { + global_State *g = G(L); + if (g->gckind != KGC_EMERGENCY) { /* do not change sizes in emergency */ + int hs = g->strt.size / 2; /* half the size of the string table */ + if (g->strt.nuse < cast(lu_int32, hs)) /* using less than that half? */ + luaS_resize(L, hs); /* halve its size */ + luaZ_freebuffer(L, &g->buff); /* free concatenation buffer */ + } +} + + +static GCObject *udata2finalize (global_State *g) { + GCObject *o = g->tobefnz; /* get first element */ + lua_assert(isfinalized(o)); + g->tobefnz = gch(o)->next; /* remove it from 'tobefnz' list */ + gch(o)->next = g->allgc; /* return it to 'allgc' list */ + g->allgc = o; + resetbit(gch(o)->marked, SEPARATED); /* mark that it is not in 'tobefnz' */ + lua_assert(!isold(o)); /* see MOVE OLD rule */ + if (!keepinvariantout(g)) /* not keeping invariant? */ + makewhite(g, o); /* "sweep" object */ + return o; +} + + +static void dothecall (lua_State *L, void *ud) { + UNUSED(ud); + luaD_call(L, L->top - 2, 0, 0); +} + + +static void GCTM (lua_State *L, int propagateerrors) { + global_State *g = G(L); + const TValue *tm; + TValue v; + setgcovalue(L, &v, udata2finalize(g)); + tm = luaT_gettmbyobj(L, &v, TM_GC); + if (tm != NULL && ttisfunction(tm)) { /* is there a finalizer? */ + int status; + lu_byte oldah = L->allowhook; + int running = g->gcrunning; + L->allowhook = 0; /* stop debug hooks during GC metamethod */ + g->gcrunning = 0; /* avoid GC steps */ + setobj2s(L, L->top, tm); /* push finalizer... */ + setobj2s(L, L->top + 1, &v); /* ... and its argument */ + L->top += 2; /* and (next line) call the finalizer */ + status = luaD_pcall(L, dothecall, NULL, savestack(L, L->top - 2), 0); + L->allowhook = oldah; /* restore hooks */ + g->gcrunning = running; /* restore state */ + if (status != LUA_OK && propagateerrors) { /* error while running __gc? */ + if (status == LUA_ERRRUN) { /* is there an error object? */ + const char *msg = (ttisstring(L->top - 1)) + ? svalue(L->top - 1) + : "no message"; + luaO_pushfstring(L, "error in __gc metamethod (%s)", msg); + status = LUA_ERRGCMM; /* error in __gc metamethod */ + } + luaD_throw(L, status); /* re-throw error */ + } + } +} + + +/* +** move all unreachable objects (or 'all' objects) that need +** finalization from list 'finobj' to list 'tobefnz' (to be finalized) +*/ +static void separatetobefnz (lua_State *L, int all) { + global_State *g = G(L); + GCObject **p = &g->finobj; + GCObject *curr; + GCObject **lastnext = &g->tobefnz; + /* find last 'next' field in 'tobefnz' list (to add elements in its end) */ + while (*lastnext != NULL) + lastnext = &gch(*lastnext)->next; + while ((curr = *p) != NULL) { /* traverse all finalizable objects */ + lua_assert(!isfinalized(curr)); + lua_assert(testbit(gch(curr)->marked, SEPARATED)); + if (!(iswhite(curr) || all)) /* not being collected? 
*/ + p = &gch(curr)->next; /* don't bother with it */ + else { + l_setbit(gch(curr)->marked, FINALIZEDBIT); /* won't be finalized again */ + *p = gch(curr)->next; /* remove 'curr' from 'finobj' list */ + gch(curr)->next = *lastnext; /* link at the end of 'tobefnz' list */ + *lastnext = curr; + lastnext = &gch(curr)->next; + } + } +} + + +/* +** if object 'o' has a finalizer, remove it from 'allgc' list (must +** search the list to find it) and link it in 'finobj' list. +*/ +void luaC_checkfinalizer (lua_State *L, GCObject *o, Table *mt) { + global_State *g = G(L); + if (testbit(gch(o)->marked, SEPARATED) || /* obj. is already separated... */ + isfinalized(o) || /* ... or is finalized... */ + gfasttm(g, mt, TM_GC) == NULL) /* or has no finalizer? */ + return; /* nothing to be done */ + else { /* move 'o' to 'finobj' list */ + GCObject **p; + GCheader *ho = gch(o); + if (g->sweepgc == &ho->next) { /* avoid removing current sweep object */ + lua_assert(issweepphase(g)); + g->sweepgc = sweeptolive(L, g->sweepgc, NULL); + } + /* search for pointer pointing to 'o' */ + for (p = &g->allgc; *p != o; p = &gch(*p)->next) { /* empty */ } + *p = ho->next; /* remove 'o' from root list */ + ho->next = g->finobj; /* link it in list 'finobj' */ + g->finobj = o; + l_setbit(ho->marked, SEPARATED); /* mark it as such */ + if (!keepinvariantout(g)) /* not keeping invariant? */ + makewhite(g, o); /* "sweep" object */ + else + resetoldbit(o); /* see MOVE OLD rule */ + } +} + +/* }====================================================== */ + + +/* +** {====================================================== +** GC control +** ======================================================= +*/ + + +/* +** set a reasonable "time" to wait before starting a new GC cycle; +** cycle will start when memory use hits threshold +*/ +static void setpause (global_State *g, l_mem estimate) { + l_mem debt, threshold; + estimate = estimate / PAUSEADJ; /* adjust 'estimate' */ + threshold = (g->gcpause < MAX_LMEM / estimate) /* overflow? */ + ? estimate * g->gcpause /* no overflow */ + : MAX_LMEM; /* overflow; truncate to maximum */ + debt = -cast(l_mem, threshold - gettotalbytes(g)); + luaE_setdebt(g, debt); +} + + +#define sweepphases \ + (bitmask(GCSsweepstring) | bitmask(GCSsweepudata) | bitmask(GCSsweep)) + + +/* +** enter first sweep phase (strings) and prepare pointers for other +** sweep phases. The calls to 'sweeptolive' make pointers point to an +** object inside the list (instead of to the header), so that the real +** sweep do not need to skip objects created between "now" and the start +** of the real sweep. +** Returns how many objects it swept. 
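setpause() above fixes the start of the next cycle by computing a negative debt. A worked example, assuming the stock Lua 5.2 tuning (PAUSEADJ is 100 in lgc.c and the default gcpause is 200, i.e. "wait until memory roughly doubles"); not part of the patch:

#include <stdio.h>

int main(void) {
  long estimate = 1000000;     /* bytes believed in use after the cycle */
  long totalbytes = 1000000;   /* current allocation level */
  long pauseadj = 100, gcpause = 200;
  long threshold = (estimate / pauseadj) * gcpause;   /* 2 * estimate */
  long debt = -(threshold - totalbytes);
  printf("threshold = %ld, debt = %ld\n", threshold, debt);
  /* prints threshold = 2000000, debt = -1000000: the collector sleeps
     until roughly another estimate's worth of bytes is allocated and
     GCdebt crosses zero */
  return 0;
}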
+*/ +static int entersweep (lua_State *L) { + global_State *g = G(L); + int n = 0; + g->gcstate = GCSsweepstring; + lua_assert(g->sweepgc == NULL && g->sweepfin == NULL); + /* prepare to sweep strings, finalizable objects, and regular objects */ + g->sweepstrgc = 0; + g->sweepfin = sweeptolive(L, &g->finobj, &n); + g->sweepgc = sweeptolive(L, &g->allgc, &n); + return n; +} + + +/* +** change GC mode +*/ +void luaC_changemode (lua_State *L, int mode) { + global_State *g = G(L); + if (mode == g->gckind) return; /* nothing to change */ + if (mode == KGC_GEN) { /* change to generational mode */ + /* make sure gray lists are consistent */ + luaC_runtilstate(L, bitmask(GCSpropagate)); + g->GCestimate = gettotalbytes(g); + g->gckind = KGC_GEN; + } + else { /* change to incremental mode */ + /* sweep all objects to turn them back to white + (as white has not changed, nothing extra will be collected) */ + g->gckind = KGC_NORMAL; + entersweep(L); + luaC_runtilstate(L, ~sweepphases); + } +} + + +/* +** call all pending finalizers +*/ +static void callallpendingfinalizers (lua_State *L, int propagateerrors) { + global_State *g = G(L); + while (g->tobefnz) { + resetoldbit(g->tobefnz); + GCTM(L, propagateerrors); + } +} + + +void luaC_freeallobjects (lua_State *L) { + global_State *g = G(L); + int i; + separatetobefnz(L, 1); /* separate all objects with finalizers */ + lua_assert(g->finobj == NULL); + callallpendingfinalizers(L, 0); + g->currentwhite = WHITEBITS; /* this "white" makes all objects look dead */ + g->gckind = KGC_NORMAL; + sweepwholelist(L, &g->finobj); /* finalizers can create objs. in 'finobj' */ + sweepwholelist(L, &g->allgc); + for (i = 0; i < g->strt.size; i++) /* free all string lists */ + sweepwholelist(L, &g->strt.hash[i]); + lua_assert(g->strt.nuse == 0); +} + + +static l_mem atomic (lua_State *L) { + global_State *g = G(L); + l_mem work = -cast(l_mem, g->GCmemtrav); /* start counting work */ + GCObject *origweak, *origall; + lua_assert(!iswhite(obj2gco(g->mainthread))); + markobject(g, L); /* mark running thread */ + /* registry and global metatables may be changed by API */ + markvalue(g, &g->l_registry); + markmt(g); /* mark basic metatables */ + /* remark occasional upvalues of (maybe) dead threads */ + remarkupvals(g); + propagateall(g); /* propagate changes */ + work += g->GCmemtrav; /* stop counting (do not (re)count grays) */ + /* traverse objects caught by write barrier and by 'remarkupvals' */ + retraversegrays(g); + work -= g->GCmemtrav; /* restart counting */ + convergeephemerons(g); + /* at this point, all strongly accessible objects are marked. */ + /* clear values from weak tables, before checking finalizers */ + clearvalues(g, g->weak, NULL); + clearvalues(g, g->allweak, NULL); + origweak = g->weak; origall = g->allweak; + work += g->GCmemtrav; /* stop counting (objects being finalized) */ + separatetobefnz(L, 0); /* separate objects to be finalized */ + markbeingfnz(g); /* mark objects that will be finalized */ + propagateall(g); /* remark, to propagate `preserveness' */ + work -= g->GCmemtrav; /* restart counting */ + convergeephemerons(g); + /* at this point, all resurrected objects are marked. 
*/ + /* remove dead objects from weak tables */ + clearkeys(g, g->ephemeron, NULL); /* clear keys from all ephemeron tables */ + clearkeys(g, g->allweak, NULL); /* clear keys from all allweak tables */ + /* clear values from resurrected weak tables */ + clearvalues(g, g->weak, origweak); + clearvalues(g, g->allweak, origall); + g->currentwhite = cast_byte(otherwhite(g)); /* flip current white */ + work += g->GCmemtrav; /* complete counting */ + return work; /* estimate of memory marked by 'atomic' */ +} + + +static lu_mem singlestep (lua_State *L) { + global_State *g = G(L); + switch (g->gcstate) { + case GCSpause: { + /* start to count memory traversed */ + g->GCmemtrav = g->strt.size * sizeof(GCObject*); + lua_assert(!isgenerational(g)); + restartcollection(g); + g->gcstate = GCSpropagate; + return g->GCmemtrav; + } + case GCSpropagate: { + if (g->gray) { + lu_mem oldtrav = g->GCmemtrav; + propagatemark(g); + return g->GCmemtrav - oldtrav; /* memory traversed in this step */ + } + else { /* no more `gray' objects */ + lu_mem work; + int sw; + g->gcstate = GCSatomic; /* finish mark phase */ + g->GCestimate = g->GCmemtrav; /* save what was counted */; + work = atomic(L); /* add what was traversed by 'atomic' */ + g->GCestimate += work; /* estimate of total memory traversed */ + sw = entersweep(L); + return work + sw * GCSWEEPCOST; + } + } + case GCSsweepstring: { + int i; + for (i = 0; i < GCSWEEPMAX && g->sweepstrgc + i < g->strt.size; i++) + sweepwholelist(L, &g->strt.hash[g->sweepstrgc + i]); + g->sweepstrgc += i; + if (g->sweepstrgc >= g->strt.size) /* no more strings to sweep? */ + g->gcstate = GCSsweepudata; + return i * GCSWEEPCOST; + } + case GCSsweepudata: { + if (g->sweepfin) { + g->sweepfin = sweeplist(L, g->sweepfin, GCSWEEPMAX); + return GCSWEEPMAX*GCSWEEPCOST; + } + else { + g->gcstate = GCSsweep; + return 0; + } + } + case GCSsweep: { + if (g->sweepgc) { + g->sweepgc = sweeplist(L, g->sweepgc, GCSWEEPMAX); + return GCSWEEPMAX*GCSWEEPCOST; + } + else { + /* sweep main thread */ + GCObject *mt = obj2gco(g->mainthread); + sweeplist(L, &mt, 1); + checkSizes(L); + g->gcstate = GCSpause; /* finish collection */ + return GCSWEEPCOST; + } + } + default: lua_assert(0); return 0; + } +} + + +/* +** advances the garbage collector until it reaches a state allowed +** by 'statemask' +*/ +void luaC_runtilstate (lua_State *L, int statesmask) { + global_State *g = G(L); + while (!testbit(statesmask, g->gcstate)) + singlestep(L); +} + + +static void generationalcollection (lua_State *L) { + global_State *g = G(L); + lua_assert(g->gcstate == GCSpropagate); + if (g->GCestimate == 0) { /* signal for another major collection? */ + luaC_fullgc(L, 0); /* perform a full regular collection */ + g->GCestimate = gettotalbytes(g); /* update control */ + } + else { + lu_mem estimate = g->GCestimate; + luaC_runtilstate(L, bitmask(GCSpause)); /* run complete (minor) cycle */ + g->gcstate = GCSpropagate; /* skip restart */ + if (gettotalbytes(g) > (estimate / 100) * g->gcmajorinc) + g->GCestimate = 0; /* signal for a major collection */ + else + g->GCestimate = estimate; /* keep estimate from last major coll. 
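luaC_runtilstate() above drives the collector's state machine with a bitmask of acceptable stopping states, which is why callers can say "run to GCSpause" or "run to anything but the sweep phases". A toy version with hypothetical states, not part of the patch:

#include <stdio.h>

enum { PAUSE, PROPAGATE, ATOMIC, SWEEP, NSTATES };

#define bitmask(b)   (1 << (b))
#define testbit(x,b) ((x) & bitmask(b))

static int state = PAUSE;

static void singlestep (void) {
  state = (state + 1) % NSTATES;    /* stand-in for real collector work */
  printf("stepped to %d\n", state);
}

static void runtilstate (int statesmask) {
  while (!testbit(statesmask, state))
    singlestep();
}

int main(void) {
  runtilstate(bitmask(ATOMIC));                  /* finish the mark phase */
  runtilstate(bitmask(PAUSE) | bitmask(SWEEP));  /* stop at either state */
  printf("final state %d\n", state);             /* 3 (SWEEP) */
  return 0;
}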
*/ + + } + setpause(g, gettotalbytes(g)); + lua_assert(g->gcstate == GCSpropagate); +} + + +static void incstep (lua_State *L) { + global_State *g = G(L); + l_mem debt = g->GCdebt; + int stepmul = g->gcstepmul; + if (stepmul < 40) stepmul = 40; /* avoid ridiculous low values (and 0) */ + /* convert debt from Kb to 'work units' (avoid zero debt and overflows) */ + debt = (debt / STEPMULADJ) + 1; + debt = (debt < MAX_LMEM / stepmul) ? debt * stepmul : MAX_LMEM; + do { /* always perform at least one single step */ + lu_mem work = singlestep(L); /* do some work */ + debt -= work; + } while (debt > -GCSTEPSIZE && g->gcstate != GCSpause); + if (g->gcstate == GCSpause) + setpause(g, g->GCestimate); /* pause until next cycle */ + else { + debt = (debt / stepmul) * STEPMULADJ; /* convert 'work units' to Kb */ + luaE_setdebt(g, debt); + } +} + + +/* +** performs a basic GC step +*/ +void luaC_forcestep (lua_State *L) { + global_State *g = G(L); + int i; + if (isgenerational(g)) generationalcollection(L); + else incstep(L); + /* run a few finalizers (or all of them at the end of a collect cycle) */ + for (i = 0; g->tobefnz && (i < GCFINALIZENUM || g->gcstate == GCSpause); i++) + GCTM(L, 1); /* call one finalizer */ +} + + +/* +** performs a basic GC step only if collector is running +*/ +void luaC_step (lua_State *L) { + global_State *g = G(L); + if (g->gcrunning) luaC_forcestep(L); + else luaE_setdebt(g, -GCSTEPSIZE); /* avoid being called too often */ +} + + + +/* +** performs a full GC cycle; if "isemergency", does not call +** finalizers (which could change stack positions) +*/ +void luaC_fullgc (lua_State *L, int isemergency) { + global_State *g = G(L); + int origkind = g->gckind; + lua_assert(origkind != KGC_EMERGENCY); + if (isemergency) /* do not run finalizers during emergency GC */ + g->gckind = KGC_EMERGENCY; + else { + g->gckind = KGC_NORMAL; + callallpendingfinalizers(L, 1); + } + if (keepinvariant(g)) { /* may there be some black objects? */ + /* must sweep all objects to turn them back to white + (as white has not changed, nothing will be collected) */ + entersweep(L); + } + /* finish any pending sweep phase to start a new cycle */ + luaC_runtilstate(L, bitmask(GCSpause)); + luaC_runtilstate(L, ~bitmask(GCSpause)); /* start new collection */ + luaC_runtilstate(L, bitmask(GCSpause)); /* run entire collection */ + if (origkind == KGC_GEN) { /* generational mode? */ + /* generational mode must be kept in propagate phase */ + luaC_runtilstate(L, bitmask(GCSpropagate)); + } + g->gckind = origkind; + setpause(g, gettotalbytes(g)); + if (!isemergency) /* do not run finalizers during emergency GC */ + callallpendingfinalizers(L, 1); +} + +/* }====================================================== */ + + diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lgc.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lgc.h new file mode 100644 index 000000000000..84bb1cdf99fa --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lgc.h @@ -0,0 +1,157 @@ +/* +** $Id: lgc.h,v 2.58.1.1 2013/04/12 18:48:47 roberto Exp $ +** Garbage Collector +** See Copyright Notice in lua.h +*/ + +#ifndef lgc_h +#define lgc_h + + +#include "lobject.h" +#include "lstate.h" + +/* +** Collectable objects may have one of three colors: white, which +** means the object is not marked; gray, which means the +** object is marked, but its references may be not marked; and +** black, which means that the object and all its references are marked. 
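incstep() above converts the byte debt into abstract "work units" before stepping, then converts any leftover back. A worked example assuming the stock constants (STEPMULADJ is 200 in lgc.c and the default gcstepmul is also 200); not part of the patch:

#include <stdio.h>

int main(void) {
  long debt = 4096;            /* bytes allocated beyond the threshold */
  long stepmul = 200;          /* g->gcstepmul */
  long stepmuladj = 200;
  long work = (debt / stepmuladj + 1) * stepmul;
  printf("%ld bytes -> %ld work units\n", debt, work);  /* 4096 -> 4200 */
  /* after stepping, leftover work is scaled back the other way:
     debt = (work / stepmul) * stepmuladj, i.e. bytes again */
  return 0;
}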
+** The main invariant of the garbage collector, while marking objects, +** is that a black object can never point to a white one. Moreover, +** any gray object must be in a "gray list" (gray, grayagain, weak, +** allweak, ephemeron) so that it can be visited again before finishing +** the collection cycle. These lists have no meaning when the invariant +** is not being enforced (e.g., sweep phase). +*/ + + + +/* how much to allocate before next GC step */ +#if !defined(GCSTEPSIZE) +/* ~100 small strings */ +#define GCSTEPSIZE (cast_int(100 * sizeof(TString))) +#endif + + +/* +** Possible states of the Garbage Collector +*/ +#define GCSpropagate 0 +#define GCSatomic 1 +#define GCSsweepstring 2 +#define GCSsweepudata 3 +#define GCSsweep 4 +#define GCSpause 5 + + +#define issweepphase(g) \ + (GCSsweepstring <= (g)->gcstate && (g)->gcstate <= GCSsweep) + +#define isgenerational(g) ((g)->gckind == KGC_GEN) + +/* +** macros to tell when main invariant (white objects cannot point to black +** ones) must be kept. During a non-generational collection, the sweep +** phase may break the invariant, as objects turned white may point to +** still-black objects. The invariant is restored when sweep ends and +** all objects are white again. During a generational collection, the +** invariant must be kept all times. +*/ + +#define keepinvariant(g) (isgenerational(g) || g->gcstate <= GCSatomic) + + +/* +** Outside the collector, the state in generational mode is kept in +** 'propagate', so 'keepinvariant' is always true. +*/ +#define keepinvariantout(g) \ + check_exp(g->gcstate == GCSpropagate || !isgenerational(g), \ + g->gcstate <= GCSatomic) + + +/* +** some useful bit tricks +*/ +#define resetbits(x,m) ((x) &= cast(lu_byte, ~(m))) +#define setbits(x,m) ((x) |= (m)) +#define testbits(x,m) ((x) & (m)) +#define bitmask(b) (1<<(b)) +#define bit2mask(b1,b2) (bitmask(b1) | bitmask(b2)) +#define l_setbit(x,b) setbits(x, bitmask(b)) +#define resetbit(x,b) resetbits(x, bitmask(b)) +#define testbit(x,b) testbits(x, bitmask(b)) + + +/* Layout for bit use in `marked' field: */ +#define WHITE0BIT 0 /* object is white (type 0) */ +#define WHITE1BIT 1 /* object is white (type 1) */ +#define BLACKBIT 2 /* object is black */ +#define FINALIZEDBIT 3 /* object has been separated for finalization */ +#define SEPARATED 4 /* object is in 'finobj' list or in 'tobefnz' */ +#define FIXEDBIT 5 /* object is fixed (should not be collected) */ +#define OLDBIT 6 /* object is old (only in generational mode) */ +/* bit 7 is currently used by tests (luaL_checkmemory) */ + +#define WHITEBITS bit2mask(WHITE0BIT, WHITE1BIT) + + +#define iswhite(x) testbits((x)->gch.marked, WHITEBITS) +#define isblack(x) testbit((x)->gch.marked, BLACKBIT) +#define isgray(x) /* neither white nor black */ \ + (!testbits((x)->gch.marked, WHITEBITS | bitmask(BLACKBIT))) + +#define isold(x) testbit((x)->gch.marked, OLDBIT) + +/* MOVE OLD rule: whenever an object is moved to the beginning of + a GC list, its old bit must be cleared */ +#define resetoldbit(o) resetbit((o)->gch.marked, OLDBIT) + +#define otherwhite(g) (g->currentwhite ^ WHITEBITS) +#define isdeadm(ow,m) (!(((m) ^ WHITEBITS) & (ow))) +#define isdead(g,v) isdeadm(otherwhite(g), (v)->gch.marked) + +#define changewhite(x) ((x)->gch.marked ^= WHITEBITS) +#define gray2black(x) l_setbit((x)->gch.marked, BLACKBIT) + +#define valiswhite(x) (iscollectable(x) && iswhite(gcvalue(x))) + +#define luaC_white(g) cast(lu_byte, (g)->currentwhite & WHITEBITS) + + +#define luaC_condGC(L,c) \ + {if (G(L)->GCdebt > 0) 
{c;}; condchangemem(L);} +#define luaC_checkGC(L) luaC_condGC(L, luaC_step(L);) + + +#define luaC_barrier(L,p,v) { if (valiswhite(v) && isblack(obj2gco(p))) \ + luaC_barrier_(L,obj2gco(p),gcvalue(v)); } + +#define luaC_barrierback(L,p,v) { if (valiswhite(v) && isblack(obj2gco(p))) \ + luaC_barrierback_(L,p); } + +#define luaC_objbarrier(L,p,o) \ + { if (iswhite(obj2gco(o)) && isblack(obj2gco(p))) \ + luaC_barrier_(L,obj2gco(p),obj2gco(o)); } + +#define luaC_objbarrierback(L,p,o) \ + { if (iswhite(obj2gco(o)) && isblack(obj2gco(p))) luaC_barrierback_(L,p); } + +#define luaC_barrierproto(L,p,c) \ + { if (isblack(obj2gco(p))) luaC_barrierproto_(L,p,c); } + +LUAI_FUNC void luaC_freeallobjects (lua_State *L); +LUAI_FUNC void luaC_step (lua_State *L); +LUAI_FUNC void luaC_forcestep (lua_State *L); +LUAI_FUNC void luaC_runtilstate (lua_State *L, int statesmask); +LUAI_FUNC void luaC_fullgc (lua_State *L, int isemergency); +LUAI_FUNC GCObject *luaC_newobj (lua_State *L, int tt, size_t sz, + GCObject **list, int offset); +LUAI_FUNC void luaC_barrier_ (lua_State *L, GCObject *o, GCObject *v); +LUAI_FUNC void luaC_barrierback_ (lua_State *L, GCObject *o); +LUAI_FUNC void luaC_barrierproto_ (lua_State *L, Proto *p, Closure *c); +LUAI_FUNC void luaC_checkfinalizer (lua_State *L, GCObject *o, Table *mt); +LUAI_FUNC void luaC_checkupvalcolor (global_State *g, UpVal *uv); +LUAI_FUNC void luaC_changemode (lua_State *L, int mode); + +#endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/llex.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/llex.c new file mode 100644 index 000000000000..dfac7aef8645 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/llex.c @@ -0,0 +1,529 @@ +/* +** $Id: llex.c,v 2.63.1.3 2015/02/09 17:56:34 roberto Exp $ +** Lexical Analyzer +** See Copyright Notice in lua.h +*/ + +#include <sys/zfs_context.h> + +#define llex_c +#define LUA_CORE + +#include "lua.h" + +#include "lctype.h" +#include "ldo.h" +#include "llex.h" +#include "lobject.h" +#include "lparser.h" +#include "lstate.h" +#include "lstring.h" +#include "ltable.h" +#include "lzio.h" + + + +#define next(ls) (ls->current = zgetc(ls->z)) + + + +#define currIsNewline(ls) (ls->current == '\n' || ls->current == '\r') + + +/* ORDER RESERVED */ +static const char *const luaX_tokens [] = { + "and", "break", "do", "else", "elseif", + "end", "false", "for", "function", "goto", "if", + "in", "local", "nil", "not", "or", "repeat", + "return", "then", "true", "until", "while", + "..", "...", "==", ">=", "<=", "~=", "::", "<eof>", + "<number>", "<name>", "<string>" +}; + + +#define save_and_next(ls) (save(ls, ls->current), next(ls)) + + +static l_noret lexerror (LexState *ls, const char *msg, int token); + + +static void save (LexState *ls, int c) { + Mbuffer *b = ls->buff; + if (luaZ_bufflen(b) + 1 > luaZ_sizebuffer(b)) { + size_t newsize; + if (luaZ_sizebuffer(b) >= MAX_SIZET/2) + lexerror(ls, "lexical element too long", 0); + newsize = luaZ_sizebuffer(b) * 2; + luaZ_resizebuffer(ls->L, b, newsize); + } + b->buffer[luaZ_bufflen(b)++] = cast(char, c); +} + + +void luaX_init (lua_State *L) { + int i; + for (i=0; i<NUM_RESERVED; i++) { + TString *ts = luaS_new(L, luaX_tokens[i]); + luaS_fix(ts); /* reserved words are never collected */ + ts->tsv.extra = cast_byte(i+1); /* reserved word */ + } +} + + +const char *luaX_token2str (LexState *ls, int token) { + if (token < FIRST_RESERVED) { /* single-byte symbols? */ + lua_assert(token == cast(unsigned char, token)); + return (lisprint(token)) ? 
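save() above grows the token buffer geometrically: doubling on overflow gives amortized constant-time appends, with a hard cap before the doubling itself could overflow. A plain malloc-based sketch of the same policy (Mbuffer replaced by a hypothetical struct), not part of the patch:

#include <stdio.h>
#include <stdlib.h>

typedef struct { char *buffer; size_t n, size; } Buf;

static int save (Buf *b, int c) {
  if (b->n + 1 > b->size) {
    char *nb;
    if (b->size >= ((size_t)-1) / 2)
      return -1;                     /* "lexical element too long" */
    b->size = b->size ? b->size * 2 : 32;
    nb = realloc(b->buffer, b->size);
    if (nb == NULL) return -1;
    b->buffer = nb;
  }
  b->buffer[b->n++] = (char)c;
  return 0;
}

int main(void) {
  Buf b = { NULL, 0, 0 };
  const char *s = "a fairly long token";
  while (*s) save(&b, *s++);
  save(&b, '\0');
  printf("%s (capacity %zu)\n", b.buffer, b.size);  /* capacity 32 */
  free(b.buffer);
  return 0;
}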
luaO_pushfstring(ls->L, LUA_QL("%c"), token) : + luaO_pushfstring(ls->L, "char(%d)", token); + } + else { + const char *s = luaX_tokens[token - FIRST_RESERVED]; + if (token < TK_EOS) /* fixed format (symbols and reserved words)? */ + return luaO_pushfstring(ls->L, LUA_QS, s); + else /* names, strings, and numerals */ + return s; + } +} + + +static const char *txtToken (LexState *ls, int token) { + switch (token) { + case TK_NAME: + case TK_STRING: + case TK_NUMBER: + save(ls, '\0'); + return luaO_pushfstring(ls->L, LUA_QS, luaZ_buffer(ls->buff)); + default: + return luaX_token2str(ls, token); + } +} + + +static l_noret lexerror (LexState *ls, const char *msg, int token) { + char buff[LUA_IDSIZE]; + luaO_chunkid(buff, getstr(ls->source), LUA_IDSIZE); + msg = luaO_pushfstring(ls->L, "%s:%d: %s", buff, ls->linenumber, msg); + if (token) + luaO_pushfstring(ls->L, "%s near %s", msg, txtToken(ls, token)); + luaD_throw(ls->L, LUA_ERRSYNTAX); +} + + +l_noret luaX_syntaxerror (LexState *ls, const char *msg) { + lexerror(ls, msg, ls->t.token); +} + + +/* +** creates a new string and anchors it in function's table so that +** it will not be collected until the end of the function's compilation +** (by that time it should be anchored in function's prototype) +*/ +TString *luaX_newstring (LexState *ls, const char *str, size_t l) { + lua_State *L = ls->L; + TValue *o; /* entry for `str' */ + TString *ts = luaS_newlstr(L, str, l); /* create new string */ + setsvalue2s(L, L->top++, ts); /* temporarily anchor it in stack */ + o = luaH_set(L, ls->fs->h, L->top - 1); + if (ttisnil(o)) { /* not in use yet? (see 'addK') */ + /* boolean value does not need GC barrier; + table has no metatable, so it does not need to invalidate cache */ + setbvalue(o, 1); /* t[string] = true */ + luaC_checkGC(L); + } + else { /* string already present */ + ts = rawtsvalue(keyfromval(o)); /* re-use value previously stored */ + } + L->top--; /* remove string from stack */ + return ts; +} + + +/* +** increment line number and skips newline sequence (any of +** \n, \r, \n\r, or \r\n) +*/ +static void inclinenumber (LexState *ls) { + int old = ls->current; + lua_assert(currIsNewline(ls)); + next(ls); /* skip `\n' or `\r' */ + if (currIsNewline(ls) && ls->current != old) + next(ls); /* skip `\n\r' or `\r\n' */ + if (++ls->linenumber >= MAX_INT) + lexerror(ls, "chunk has too many lines", 0); +} + + +void luaX_setinput (lua_State *L, LexState *ls, ZIO *z, TString *source, + int firstchar) { + ls->decpoint = '.'; + ls->L = L; + ls->current = firstchar; + ls->lookahead.token = TK_EOS; /* no look-ahead token */ + ls->z = z; + ls->fs = NULL; + ls->linenumber = 1; + ls->lastline = 1; + ls->source = source; + ls->envn = luaS_new(L, LUA_ENV); /* create env name */ + luaS_fix(ls->envn); /* never collect this name */ + luaZ_resizebuffer(ls->L, ls->buff, LUA_MINBUFFER); /* initialize buffer */ +} + + + +/* +** ======================================================= +** LEXICAL ANALYZER +** ======================================================= +*/ + + + +static int check_next (LexState *ls, const char *set) { + if (ls->current == '\0' || !strchr(set, ls->current)) + return 0; + save_and_next(ls); + return 1; +} + + +/* +** change all characters 'from' in buffer to 'to' +*/ +static void buffreplace (LexState *ls, char from, char to) { + size_t n = luaZ_bufflen(ls->buff); + char *p = luaZ_buffer(ls->buff); + while (n--) + if (p[n] == from) p[n] = to; +} + + +#if !defined(getlocaledecpoint) +#define getlocaledecpoint() 
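inclinenumber() above counts any of \n, \r, \n\r and \r\n as exactly one line break. The following sketch reads from a string instead of a ZIO and is not part of the patch:

#include <stdio.h>

static const char *src;
static int current;

static void nextc (void) {
  current = (*src != '\0') ? (unsigned char)*src++ : -1;
}

static int isnl (void) { return current == '\n' || current == '\r'; }

int main(void) {
  int linenumber = 1;
  src = "a\nb\r\nc\n\rd";
  nextc();
  while (current != -1) {
    if (isnl()) {
      int old = current;
      nextc();                      /* skip '\n' or '\r' */
      if (isnl() && current != old)
        nextc();                    /* skip the pair '\n\r' or '\r\n' */
      linenumber++;
    }
    else
      nextc();
  }
  printf("lines = %d\n", linenumber);   /* 4 */
  return 0;
}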
(localeconv()->decimal_point[0]) +#endif + + +#define buff2d(b,e) luaO_str2d(luaZ_buffer(b), luaZ_bufflen(b) - 1, e) + +/* +** in case of format error, try to change decimal point separator to +** the one defined in the current locale and check again +*/ +static void trydecpoint (LexState *ls, SemInfo *seminfo) { + char old = ls->decpoint; + ls->decpoint = getlocaledecpoint(); + buffreplace(ls, old, ls->decpoint); /* try new decimal separator */ + if (!buff2d(ls->buff, &seminfo->r)) { + /* format error with correct decimal point: no more options */ + buffreplace(ls, ls->decpoint, '.'); /* undo change (for error message) */ + lexerror(ls, "malformed number", TK_NUMBER); + } +} + + +/* LUA_NUMBER */ +/* +** this function is quite liberal in what it accepts, as 'luaO_str2d' +** will reject ill-formed numerals. +*/ +static void read_numeral (LexState *ls, SemInfo *seminfo) { + const char *expo = "Ee"; + int first = ls->current; + lua_assert(lisdigit(ls->current)); + save_and_next(ls); + if (first == '0' && check_next(ls, "Xx")) /* hexadecimal? */ + expo = "Pp"; + for (;;) { + if (check_next(ls, expo)) /* exponent part? */ + check_next(ls, "+-"); /* optional exponent sign */ + if (lisxdigit(ls->current) || ls->current == '.') + save_and_next(ls); + else break; + } + save(ls, '\0'); + buffreplace(ls, '.', ls->decpoint); /* follow locale for decimal point */ + if (!buff2d(ls->buff, &seminfo->r)) /* format error? */ + trydecpoint(ls, seminfo); /* try to update decimal point separator */ +} + + +/* +** skip a sequence '[=*[' or ']=*]' and return its number of '='s or +** -1 if sequence is malformed +*/ +static int skip_sep (LexState *ls) { + int count = 0; + int s = ls->current; + lua_assert(s == '[' || s == ']'); + save_and_next(ls); + while (ls->current == '=') { + save_and_next(ls); + count++; + } + return (ls->current == s) ? count : (-count) - 1; +} + + +static void read_long_string (LexState *ls, SemInfo *seminfo, int sep) { + save_and_next(ls); /* skip 2nd `[' */ + if (currIsNewline(ls)) /* string starts with a newline? */ + inclinenumber(ls); /* skip it */ + for (;;) { + switch (ls->current) { + case EOZ: + lexerror(ls, (seminfo) ? 
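skip_sep() above encodes its answer in the sign: a non-negative result is the long-bracket level (number of '=' signs), while a negative result means the sequence was not a long-bracket opener at all. A string-based sketch, not part of the patch:

#include <stdio.h>

static int skip_sep (const char *s) {
  int count = 0;
  char open = *s++;                 /* '[' or ']' */
  while (*s == '=') { s++; count++; }
  return (*s == open) ? count : (-count) - 1;
}

int main(void) {
  printf("%d\n", skip_sep("[==["));  /* 2: long bracket of level 2 */
  printf("%d\n", skip_sep("[["));    /* 0: plain long bracket */
  printf("%d\n", skip_sep("[x"));    /* -1: just a '[' token */
  printf("%d\n", skip_sep("[=="));   /* -3: malformed, no closing '[' */
  return 0;
}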
"unfinished long string" : + "unfinished long comment", TK_EOS); + break; /* to avoid warnings */ + case ']': { + if (skip_sep(ls) == sep) { + save_and_next(ls); /* skip 2nd `]' */ + goto endloop; + } + break; + } + case '\n': case '\r': { + save(ls, '\n'); + inclinenumber(ls); + if (!seminfo) luaZ_resetbuffer(ls->buff); /* avoid wasting space */ + break; + } + default: { + if (seminfo) save_and_next(ls); + else next(ls); + } + } + } endloop: + if (seminfo) + seminfo->ts = luaX_newstring(ls, luaZ_buffer(ls->buff) + (2 + sep), + luaZ_bufflen(ls->buff) - 2*(2 + sep)); +} + + +static void escerror (LexState *ls, int *c, int n, const char *msg) { + int i; + luaZ_resetbuffer(ls->buff); /* prepare error message */ + save(ls, '\\'); + for (i = 0; i < n && c[i] != EOZ; i++) + save(ls, c[i]); + lexerror(ls, msg, TK_STRING); +} + + +static int readhexaesc (LexState *ls) { + int c[3], i; /* keep input for error message */ + int r = 0; /* result accumulator */ + c[0] = 'x'; /* for error message */ + for (i = 1; i < 3; i++) { /* read two hexadecimal digits */ + c[i] = next(ls); + if (!lisxdigit(c[i])) + escerror(ls, c, i + 1, "hexadecimal digit expected"); + r = (r << 4) + luaO_hexavalue(c[i]); + } + return r; +} + + +static int readdecesc (LexState *ls) { + int c[3], i; + int r = 0; /* result accumulator */ + for (i = 0; i < 3 && lisdigit(ls->current); i++) { /* read up to 3 digits */ + c[i] = ls->current; + r = 10*r + c[i] - '0'; + next(ls); + } + if (r > UCHAR_MAX) + escerror(ls, c, i, "decimal escape too large"); + return r; +} + + +static void read_string (LexState *ls, int del, SemInfo *seminfo) { + save_and_next(ls); /* keep delimiter (for error messages) */ + while (ls->current != del) { + switch (ls->current) { + case EOZ: + lexerror(ls, "unfinished string", TK_EOS); + break; /* to avoid warnings */ + case '\n': + case '\r': + lexerror(ls, "unfinished string", TK_STRING); + break; /* to avoid warnings */ + case '\\': { /* escape sequences */ + int c; /* final character to be saved */ + next(ls); /* do not save the `\' */ + switch (ls->current) { + case 'a': c = '\a'; goto read_save; + case 'b': c = '\b'; goto read_save; + case 'f': c = '\f'; goto read_save; + case 'n': c = '\n'; goto read_save; + case 'r': c = '\r'; goto read_save; + case 't': c = '\t'; goto read_save; + case 'v': c = '\v'; goto read_save; + case 'x': c = readhexaesc(ls); goto read_save; + case '\n': case '\r': + inclinenumber(ls); c = '\n'; goto only_save; + case '\\': case '\"': case '\'': + c = ls->current; goto read_save; + case EOZ: goto no_save; /* will raise an error next loop */ + case 'z': { /* zap following span of spaces */ + next(ls); /* skip the 'z' */ + while (lisspace(ls->current)) { + if (currIsNewline(ls)) inclinenumber(ls); + else next(ls); + } + goto no_save; + } + default: { + if (!lisdigit(ls->current)) + escerror(ls, &ls->current, 1, "invalid escape sequence"); + /* digital escape \ddd */ + c = readdecesc(ls); + goto only_save; + } + } + read_save: next(ls); /* read next character */ + only_save: save(ls, c); /* save 'c' */ + no_save: break; + } + default: + save_and_next(ls); + } + } + save_and_next(ls); /* skip delimiter */ + seminfo->ts = luaX_newstring(ls, luaZ_buffer(ls->buff) + 1, + luaZ_bufflen(ls->buff) - 2); +} + + +static int llex (LexState *ls, SemInfo *seminfo) { + luaZ_resetbuffer(ls->buff); + for (;;) { + switch (ls->current) { + case '\n': case '\r': { /* line breaks */ + inclinenumber(ls); + break; + } + case ' ': case '\f': case '\t': case '\v': { /* spaces */ + next(ls); + break; + } + 
case '-': { /* '-' or '--' (comment) */ + next(ls); + if (ls->current != '-') return '-'; + /* else is a comment */ + next(ls); + if (ls->current == '[') { /* long comment? */ + int sep = skip_sep(ls); + luaZ_resetbuffer(ls->buff); /* `skip_sep' may dirty the buffer */ + if (sep >= 0) { + read_long_string(ls, NULL, sep); /* skip long comment */ + luaZ_resetbuffer(ls->buff); /* previous call may dirty the buff. */ + break; + } + } + /* else short comment */ + while (!currIsNewline(ls) && ls->current != EOZ) + next(ls); /* skip until end of line (or end of file) */ + break; + } + case '[': { /* long string or simply '[' */ + int sep = skip_sep(ls); + if (sep >= 0) { + read_long_string(ls, seminfo, sep); + return TK_STRING; + } + else if (sep == -1) return '['; + else lexerror(ls, "invalid long string delimiter", TK_STRING); + } + case '=': { + next(ls); + if (ls->current != '=') return '='; + else { next(ls); return TK_EQ; } + } + case '<': { + next(ls); + if (ls->current != '=') return '<'; + else { next(ls); return TK_LE; } + } + case '>': { + next(ls); + if (ls->current != '=') return '>'; + else { next(ls); return TK_GE; } + } + case '~': { + next(ls); + if (ls->current != '=') return '~'; + else { next(ls); return TK_NE; } + } + case ':': { + next(ls); + if (ls->current != ':') return ':'; + else { next(ls); return TK_DBCOLON; } + } + case '"': case '\'': { /* short literal strings */ + read_string(ls, ls->current, seminfo); + return TK_STRING; + } + case '.': { /* '.', '..', '...', or number */ + save_and_next(ls); + if (check_next(ls, ".")) { + if (check_next(ls, ".")) + return TK_DOTS; /* '...' */ + else return TK_CONCAT; /* '..' */ + } + else if (!lisdigit(ls->current)) return '.'; + /* else go through */ + } + /* FALLTHROUGH */ + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': { + read_numeral(ls, seminfo); + return TK_NUMBER; + } + case EOZ: { + return TK_EOS; + } + default: { + if (lislalpha(ls->current)) { /* identifier or reserved word? */ + TString *ts; + do { + save_and_next(ls); + } while (lislalnum(ls->current)); + ts = luaX_newstring(ls, luaZ_buffer(ls->buff), + luaZ_bufflen(ls->buff)); + seminfo->ts = ts; + if (isreserved(ts)) /* reserved word? */ + return ts->tsv.extra - 1 + FIRST_RESERVED; + else { + return TK_NAME; + } + } + else { /* single-char tokens (+ - / ...) */ + int c = ls->current; + next(ls); + return c; + } + } + } + } +} + + +void luaX_next (LexState *ls) { + ls->lastline = ls->linenumber; + if (ls->lookahead.token != TK_EOS) { /* is there a look-ahead token? 
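The operator cases in llex() above all follow the same maximal-munch shape: consume one character, then peek to decide between the one- and two-character token. A toy lexer showing just that shape, not part of the patch:

#include <stdio.h>

enum { TK_EQ = 256, TK_LE, TK_GE, TK_NE };

static const char *s;

static int lex (void) {
  int c = *s++;
  switch (c) {
    case '=': if (*s == '=') { s++; return TK_EQ; } return '=';
    case '<': if (*s == '=') { s++; return TK_LE; } return '<';
    case '>': if (*s == '=') { s++; return TK_GE; } return '>';
    case '~': if (*s == '=') { s++; return TK_NE; } return '~';
    default:  return c;
  }
}

int main(void) {
  int t;
  s = "<=>=~==<";
  while ((t = lex()) != '\0')
    printf("token %d\n", t);   /* TK_LE, TK_GE, TK_NE, '=', '<' */
  return 0;
}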
*/ + ls->t = ls->lookahead; /* use this one */ + ls->lookahead.token = TK_EOS; /* and discharge it */ + } + else + ls->t.token = llex(ls, &ls->t.seminfo); /* read next token */ +} + + +int luaX_lookahead (LexState *ls) { + lua_assert(ls->lookahead.token == TK_EOS); + ls->lookahead.token = llex(ls, &ls->lookahead.seminfo); + return ls->lookahead.token; +} + diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/llex.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/llex.h new file mode 100644 index 000000000000..a4acdd30218a --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/llex.h @@ -0,0 +1,78 @@ +/* +** $Id: llex.h,v 1.72.1.1 2013/04/12 18:48:47 roberto Exp $ +** Lexical Analyzer +** See Copyright Notice in lua.h +*/ + +#ifndef llex_h +#define llex_h + +#include "lobject.h" +#include "lzio.h" + + +#define FIRST_RESERVED 257 + + + +/* +* WARNING: if you change the order of this enumeration, +* grep "ORDER RESERVED" +*/ +enum RESERVED { + /* terminal symbols denoted by reserved words */ + TK_AND = FIRST_RESERVED, TK_BREAK, + TK_DO, TK_ELSE, TK_ELSEIF, TK_END, TK_FALSE, TK_FOR, TK_FUNCTION, + TK_GOTO, TK_IF, TK_IN, TK_LOCAL, TK_NIL, TK_NOT, TK_OR, TK_REPEAT, + TK_RETURN, TK_THEN, TK_TRUE, TK_UNTIL, TK_WHILE, + /* other terminal symbols */ + TK_CONCAT, TK_DOTS, TK_EQ, TK_GE, TK_LE, TK_NE, TK_DBCOLON, TK_EOS, + TK_NUMBER, TK_NAME, TK_STRING +}; + +/* number of reserved words */ +#define NUM_RESERVED (cast(int, TK_WHILE-FIRST_RESERVED+1)) + + +typedef union { + lua_Number r; + TString *ts; +} SemInfo; /* semantics information */ + + +typedef struct Token { + int token; + SemInfo seminfo; +} Token; + + +/* state of the lexer plus state of the parser when shared by all + functions */ +typedef struct LexState { + int current; /* current character (charint) */ + int linenumber; /* input line counter */ + int lastline; /* line of last token `consumed' */ + Token t; /* current token */ + Token lookahead; /* look ahead token */ + struct FuncState *fs; /* current function (parser) */ + struct lua_State *L; + ZIO *z; /* input stream */ + Mbuffer *buff; /* buffer for tokens */ + struct Dyndata *dyd; /* dynamic structures used by the parser */ + TString *source; /* current source name */ + TString *envn; /* environment variable name */ + char decpoint; /* locale decimal point */ +} LexState; + + +LUAI_FUNC void luaX_init (lua_State *L); +LUAI_FUNC void luaX_setinput (lua_State *L, LexState *ls, ZIO *z, + TString *source, int firstchar); +LUAI_FUNC TString *luaX_newstring (LexState *ls, const char *str, size_t l); +LUAI_FUNC void luaX_next (LexState *ls); +LUAI_FUNC int luaX_lookahead (LexState *ls); +LUAI_FUNC l_noret luaX_syntaxerror (LexState *ls, const char *s); +LUAI_FUNC const char *luaX_token2str (LexState *ls, int token); + + +#endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/llimits.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/llimits.h new file mode 100644 index 000000000000..4277c1fd03db --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/llimits.h @@ -0,0 +1,308 @@ +/* +** $Id: llimits.h,v 1.103.1.1 2013/04/12 18:48:47 roberto Exp $ +** Limits, basic types, and some other `installation-dependent' definitions +** See Copyright Notice in lua.h +*/ + +#ifndef llimits_h +#define llimits_h + + +#include <sys/zfs_context.h> + +#include "lua.h" + + +typedef unsigned LUA_INT32 lu_int32; + +typedef LUAI_UMEM lu_mem; + +typedef LUAI_MEM l_mem; + + + +/* chars used as small naturals (so that `char' is reserved for 
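luaX_next()/luaX_lookahead() above implement exactly one token of lookahead: peeking fills a single slot, and the next advance consumes the slot instead of scanning again (the real code additionally asserts the slot is empty before peeking). A sketch with character "tokens", not part of the patch:

#include <stdio.h>

#define TK_EOS (-1)

static const char *input = "abc";
static int lookahead = TK_EOS;      /* empty slot */
static int current;

static int scan (void) { return *input ? *input++ : TK_EOS; }

static int peek (void) {
  if (lookahead == TK_EOS)
    lookahead = scan();
  return lookahead;
}

static void advance (void) {
  if (lookahead != TK_EOS) {        /* is there a look-ahead token? */
    current = lookahead;            /* use it... */
    lookahead = TK_EOS;             /* ...and discharge it */
  }
  else
    current = scan();
}

int main(void) {
  printf("peek  %c\n", peek());              /* a */
  advance(); printf("token %c\n", current);  /* a (the peeked token) */
  advance(); printf("token %c\n", current);  /* b */
  return 0;
}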
characters) */ +typedef unsigned char lu_byte; + + +#define MAX_SIZET ((size_t)(~(size_t)0)-2) + +#define MAX_LUMEM ((lu_mem)(~(lu_mem)0)-2) + +#define MAX_LMEM ((l_mem) ((MAX_LUMEM >> 1) - 2)) + + +#define MAX_INT (INT_MAX-2) /* maximum value of an int (-2 for safety) */ + +/* +** conversion of pointer to integer +** this is for hashing only; there is no problem if the integer +** cannot hold the whole pointer value +*/ +#define IntPoint(p) ((unsigned int)(lu_mem)(p)) + + + +/* type to ensure maximum alignment */ +#if !defined(LUAI_USER_ALIGNMENT_T) +#define LUAI_USER_ALIGNMENT_T union { double u; void *s; long l; } +#endif + +typedef LUAI_USER_ALIGNMENT_T L_Umaxalign; + + +/* result of a `usual argument conversion' over lua_Number */ +typedef LUAI_UACNUMBER l_uacNumber; + + +/* internal assertions for in-house debugging */ +#if defined(lua_assert) +#define check_exp(c,e) (lua_assert(c), (e)) +/* to avoid problems with conditions too long */ +#define lua_longassert(c) { if (!(c)) lua_assert(0); } +#else +#define lua_assert(c) ((void)0) +#define check_exp(c,e) (e) +#define lua_longassert(c) ((void)0) +#endif + +/* +** assertion for checking API calls +*/ +#if !defined(luai_apicheck) + +#if defined(LUA_USE_APICHECK) +#include <assert.h> +#define luai_apicheck(L,e) assert(e) +#else +#define luai_apicheck(L,e) lua_assert(e) +#endif + +#endif + +#define api_check(l,e,msg) luai_apicheck(l,(e) && msg) + + +#if !defined(UNUSED) +#define UNUSED(x) ((void)(x)) /* to avoid warnings */ +#endif + + +#define cast(t, exp) ((t)(exp)) + +#define cast_byte(i) cast(lu_byte, (i)) +#define cast_num(i) cast(lua_Number, (i)) +#define cast_int(i) cast(int, (i)) +#define cast_uchar(i) cast(unsigned char, (i)) + + +/* +** non-return type +*/ +#if defined(__GNUC__) +#define l_noret void __attribute__((noreturn)) +#elif defined(_MSC_VER) +#define l_noret void __declspec(noreturn) +#else +#define l_noret void +#endif + + + +/* +** maximum depth for nested C calls and syntactical nested non-terminals +** in a program. (Value must fit in an unsigned short int.) +** +** Note: On amd64 platform, the limit has been measured to be 45. We set +** the maximum lower to give a margin for changing the amount of stack +** used by various functions involved in parsing and executing code. +*/ +#if !defined(LUAI_MAXCCALLS) +#define LUAI_MAXCCALLS 20 +#endif + +/* +** maximum number of upvalues in a closure (both C and Lua). (Value +** must fit in an unsigned char.) +*/ +#define MAXUPVAL UCHAR_MAX + + +/* +** type for virtual-machine instructions +** must be an unsigned with (at least) 4 bytes (see details in lopcodes.h) +*/ +typedef lu_int32 Instruction; + + + +/* maximum stack for a Lua function */ +#define MAXSTACK 250 + + + +/* minimum size for the string table (must be power of 2) */ +#if !defined(MINSTRTABSIZE) +#define MINSTRTABSIZE 32 +#endif + + +/* minimum size for string buffer */ +#if !defined(LUA_MINBUFFER) +#define LUA_MINBUFFER 32 +#endif + + +#if !defined(lua_lock) +#define lua_lock(L) ((void) 0) +#define lua_unlock(L) ((void) 0) +#endif + +#if !defined(luai_threadyield) +#define luai_threadyield(L) {lua_unlock(L); lua_lock(L);} +#endif + + +/* +** these macros allow user-specific actions on threads when you defined +** LUAI_EXTRASPACE and need to do something extra when a thread is +** created/deleted/resumed/yielded. 
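LUAI_MAXCCALLS above bounds nested C calls so a runaway recursion fails with a Lua error instead of overrunning the kernel stack, which is why this port lowers the limit to 20. A minimal illustration of such a depth guard, using the value 20 defined above; not part of the patch:

#include <stdio.h>

#define LUAI_MAXCCALLS 20

static int depth = 0;

static int descend (int n) {
  if (++depth > LUAI_MAXCCALLS) {
    printf("too deep at n=%d\n", n);    /* would raise a Lua error */
    depth--;
    return -1;
  }
  if (n > 0 && descend(n - 1) < 0) { depth--; return -1; }
  depth--;
  return 0;
}

int main(void) {
  printf("shallow: %d\n", descend(5));    /* 0: fits in the budget */
  printf("deep:    %d\n", descend(100));  /* -1: stopped at depth 20 */
  return 0;
}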
+*/ +#if !defined(luai_userstateopen) +#define luai_userstateopen(L) ((void)L) +#endif + +#if !defined(luai_userstateclose) +#define luai_userstateclose(L) ((void)L) +#endif + +#if !defined(luai_userstatethread) +#define luai_userstatethread(L,L1) ((void)L) +#endif + +#if !defined(luai_userstatefree) +#define luai_userstatefree(L,L1) ((void)L) +#endif + +#if !defined(luai_userstateresume) +#define luai_userstateresume(L,n) ((void)L) +#endif + +#if !defined(luai_userstateyield) +#define luai_userstateyield(L,n) ((void)L) +#endif + +/* +** lua_number2int is a macro to convert lua_Number to int. +** lua_number2integer is a macro to convert lua_Number to lua_Integer. +** lua_number2unsigned is a macro to convert a lua_Number to a lua_Unsigned. +** lua_unsigned2number is a macro to convert a lua_Unsigned to a lua_Number. +** luai_hashnum is a macro to hash a lua_Number value into an integer. +** The hash must be deterministic and give reasonable values for +** both small and large values (outside the range of integers). +*/ + +#if defined(MS_ASMTRICK) || defined(LUA_MSASMTRICK) /* { */ +/* trick with Microsoft assembler for X86 */ + +#define lua_number2int(i,n) __asm {__asm fld n __asm fistp i} +#define lua_number2integer(i,n) lua_number2int(i, n) +#define lua_number2unsigned(i,n) \ + {__int64 l; __asm {__asm fld n __asm fistp l} i = (unsigned int)l;} + + +#elif defined(LUA_IEEE754TRICK) /* }{ */ +/* the next trick should work on any machine using IEEE754 with + a 32-bit int type */ + +union luai_Cast { double l_d; LUA_INT32 l_p[2]; }; + +#if !defined(LUA_IEEEENDIAN) /* { */ +#define LUAI_EXTRAIEEE \ + static const union luai_Cast ieeeendian = {-(33.0 + 6755399441055744.0)}; +#define LUA_IEEEENDIANLOC (ieeeendian.l_p[1] == 33) +#else +#define LUA_IEEEENDIANLOC LUA_IEEEENDIAN +#define LUAI_EXTRAIEEE /* empty */ +#endif /* } */ + +#define lua_number2int32(i,n,t) \ + { LUAI_EXTRAIEEE \ + volatile union luai_Cast u; u.l_d = (n) + 6755399441055744.0; \ + (i) = (t)u.l_p[LUA_IEEEENDIANLOC]; } + +#define luai_hashnum(i,n) \ + { volatile union luai_Cast u; u.l_d = (n) + 1.0; /* avoid -0 */ \ + (i) = u.l_p[0]; (i) += u.l_p[1]; } /* add double bits for his hash */ + +#define lua_number2int(i,n) lua_number2int32(i, n, int) +#define lua_number2unsigned(i,n) lua_number2int32(i, n, lua_Unsigned) + +/* the trick can be expanded to lua_Integer when it is a 32-bit value */ +#if defined(LUA_IEEELL) +#define lua_number2integer(i,n) lua_number2int32(i, n, lua_Integer) +#endif + +#endif /* } */ + + +/* the following definitions always work, but may be slow */ + +#if !defined(lua_number2int) +#define lua_number2int(i,n) ((i)=(int)(n)) +#endif + +#if !defined(lua_number2integer) +#define lua_number2integer(i,n) ((i)=(lua_Integer)(n)) +#endif + +#if !defined(lua_number2unsigned) /* { */ +/* the following definition assures proper modulo behavior */ +#if defined(LUA_NUMBER_DOUBLE) || defined(LUA_NUMBER_FLOAT) +#include <math.h> +#define SUPUNSIGNED ((lua_Number)(~(lua_Unsigned)0) + 1) +#define lua_number2unsigned(i,n) \ + ((i)=(lua_Unsigned)((n) - floor((n)/SUPUNSIGNED)*SUPUNSIGNED)) +#else +#define lua_number2unsigned(i,n) ((i)=(lua_Unsigned)(n)) +#endif +#endif /* } */ + + +#if !defined(lua_unsigned2number) +/* on several machines, coercion from unsigned to double is slow, + so it may be worth to avoid */ +#define lua_unsigned2number(u) \ + (((u) <= (lua_Unsigned)INT_MAX) ? 
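The LUA_IEEE754TRICK above relies on the magic constant 6755399441055744.0, which is 2^52 + 2^51: adding it to a double in int range forces the value's integer part into the low 32 bits of the mantissa, so conversion costs one addition and one 32-bit load. A standalone demonstration assuming IEEE-754 doubles on a little-endian machine, not part of the patch:

#include <stdio.h>

typedef int int32;   /* stand-in for LUA_INT32 */
union cast { double d; int32 p[2]; };

static int32 number2int (double n) {
  volatile union cast u;
  u.d = n + 6755399441055744.0;   /* 2^52 + 2^51 */
  return u.p[0];                  /* index 1 on a big-endian machine */
}

int main(void) {
  printf("%d\n", number2int(42.7));      /* 43 (rounds to nearest) */
  printf("%d\n", number2int(-5.0));      /* -5 */
  printf("%d\n", number2int(123456.0));  /* 123456 */
  return 0;
}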
(lua_Number)(int)(u) : (lua_Number)(u)) +#endif + + + +#if defined(ltable_c) && !defined(luai_hashnum) + +extern int lcompat_hashnum(int64_t); + +#define luai_hashnum(i,n) (i = lcompat_hashnum(n)) + +#endif + + + +/* +** macro to control inclusion of some hard tests on stack reallocation +*/ +#if !defined(HARDSTACKTESTS) +#define condmovestack(L) ((void)0) +#else +/* realloc stack keeping its size */ +#define condmovestack(L) luaD_reallocstack((L), (L)->stacksize) +#endif + +#if !defined(HARDMEMTESTS) +#define condchangemem(L) condmovestack(L) +#else +#define condchangemem(L) \ + ((void)(!(G(L)->gcrunning) || (luaC_fullgc(L, 0), 1))) +#endif + +#endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lmem.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lmem.c new file mode 100644 index 000000000000..0d070fbde83c --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lmem.c @@ -0,0 +1,99 @@ +/* +** $Id: lmem.c,v 1.84.1.1 2013/04/12 18:48:47 roberto Exp $ +** Interface to Memory Manager +** See Copyright Notice in lua.h +*/ + + +#include <sys/zfs_context.h> + +#define lmem_c +#define LUA_CORE + +#include "lua.h" + +#include "ldebug.h" +#include "ldo.h" +#include "lgc.h" +#include "lmem.h" +#include "lobject.h" +#include "lstate.h" + + + +/* +** About the realloc function: +** void * frealloc (void *ud, void *ptr, size_t osize, size_t nsize); +** (`osize' is the old size, `nsize' is the new size) +** +** * frealloc(ud, NULL, x, s) creates a new block of size `s' (no +** matter 'x'). +** +** * frealloc(ud, p, x, 0) frees the block `p' +** (in this specific case, frealloc must return NULL); +** particularly, frealloc(ud, NULL, 0, 0) does nothing +** (which is equivalent to free(NULL) in ANSI C) +** +** frealloc returns NULL if it cannot create or reallocate the area +** (any reallocation to an equal or smaller size cannot fail!) +*/ + + + +#define MINSIZEARRAY 4 + + +void *luaM_growaux_ (lua_State *L, void *block, int *size, size_t size_elems, + int limit, const char *what) { + void *newblock; + int newsize; + if (*size >= limit/2) { /* cannot double it? */ + if (*size >= limit) /* cannot grow even a little? */ + luaG_runerror(L, "too many %s (limit is %d)", what, limit); + newsize = limit; /* still have at least one free place */ + } + else { + newsize = (*size)*2; + if (newsize < MINSIZEARRAY) + newsize = MINSIZEARRAY; /* minimum size */ + } + newblock = luaM_reallocv(L, block, *size, newsize, size_elems); + *size = newsize; /* update only when everything else is OK */ + return newblock; +} + + +l_noret luaM_toobig (lua_State *L) { + luaG_runerror(L, "memory allocation error: block too big"); +} + + + +/* +** generic allocation routine. +*/ +void *luaM_realloc_ (lua_State *L, void *block, size_t osize, size_t nsize) { + void *newblock; + global_State *g = G(L); + size_t realosize = (block) ? osize : 0; + lua_assert((realosize == 0) == (block == NULL)); +#if defined(HARDMEMTESTS) + if (nsize > realosize && g->gcrunning) + luaC_fullgc(L, 1); /* force a GC whenever possible */ +#endif + newblock = (*g->frealloc)(g->ud, block, osize, nsize); + if (newblock == NULL && nsize > 0) { + api_check(L, nsize > realosize, + "realloc cannot fail when shrinking a block"); + if (g->gcrunning) { + luaC_fullgc(L, 1); /* try to free some memory... 
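luaM_growaux_() above doubles an array until doubling would pass 'limit', takes whatever is left in one final growth, and only then raises "too many ...". A pure-arithmetic sketch of that policy, not part of the patch:

#include <stdio.h>

#define MINSIZEARRAY 4

static int grow (int size, int limit) {
  if (size >= limit / 2) {       /* cannot double it? */
    if (size >= limit)
      return -1;                 /* would raise "too many ..." */
    return limit;                /* last growth: take all that is left */
  }
  else {
    int newsize = size * 2;
    return (newsize < MINSIZEARRAY) ? MINSIZEARRAY : newsize;
  }
}

int main(void) {
  int size = 0;
  while (size != -1) {
    printf("size = %d\n", size); /* 0, 4, 8, 16, 32, 50, then stop */
    size = grow(size, 50);
  }
  return 0;
}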
*/ + newblock = (*g->frealloc)(g->ud, block, osize, nsize); /* try again */ + } + if (newblock == NULL) + luaD_throw(L, LUA_ERRMEM); + } + lua_assert((nsize == 0) == (newblock == NULL)); + g->GCdebt = (g->GCdebt + nsize) - realosize; + return newblock; +} + diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lmem.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lmem.h new file mode 100644 index 000000000000..c75a3d50984a --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lmem.h @@ -0,0 +1,57 @@ +/* +** $Id: lmem.h,v 1.40.1.1 2013/04/12 18:48:47 roberto Exp $ +** Interface to Memory Manager +** See Copyright Notice in lua.h +*/ + +#ifndef lmem_h +#define lmem_h + + +#include <sys/zfs_context.h> + +#include "llimits.h" +#include "lua.h" + + +/* +** This macro avoids the runtime division MAX_SIZET/(e), as 'e' is +** always constant. +** The macro is somewhat complex to avoid warnings: +** +1 avoids warnings of "comparison has constant result"; +** cast to 'void' avoids warnings of "value unused". +*/ +#define luaM_reallocv(L,b,on,n,e) \ + (cast(void, \ + (cast(size_t, (n)+1) > MAX_SIZET/(e)) ? (luaM_toobig(L), 0) : 0), \ + luaM_realloc_(L, (b), (on)*(e), (n)*(e))) + +#define luaM_freemem(L, b, s) luaM_realloc_(L, (b), (s), 0) +#define luaM_free(L, b) luaM_realloc_(L, (b), sizeof(*(b)), 0) +#define luaM_freearray(L, b, n) luaM_reallocv(L, (b), n, 0, sizeof((b)[0])) + +#define luaM_malloc(L,s) luaM_realloc_(L, NULL, 0, (s)) +#define luaM_new(L,t) cast(t *, luaM_malloc(L, sizeof(t))) +#define luaM_newvector(L,n,t) \ + cast(t *, luaM_reallocv(L, NULL, 0, n, sizeof(t))) + +#define luaM_newobject(L,tag,s) luaM_realloc_(L, NULL, tag, (s)) + +#define luaM_growvector(L,v,nelems,size,t,limit,e) \ + if ((nelems)+1 > (size)) \ + ((v)=cast(t *, luaM_growaux_(L,v,&(size),sizeof(t),limit,e))) + +#define luaM_reallocvector(L, v,oldn,n,t) \ + ((v)=cast(t *, luaM_reallocv(L, v, oldn, n, sizeof(t)))) + +LUAI_FUNC l_noret luaM_toobig (lua_State *L); + +/* not to be called directly */ +LUAI_FUNC void *luaM_realloc_ (lua_State *L, void *block, size_t oldsize, + size_t size); +LUAI_FUNC void *luaM_growaux_ (lua_State *L, void *block, int *size, + size_t size_elem, int limit, + const char *what); + +#endif + diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lobject.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lobject.c new file mode 100644 index 000000000000..339c84d21d79 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lobject.c @@ -0,0 +1,283 @@ +/* +** $Id: lobject.c,v 2.58.1.1 2013/04/12 18:48:47 roberto Exp $ +** Some generic functions over Lua objects +** See Copyright Notice in lua.h +*/ + +#include <sys/zfs_context.h> + +#define lobject_c +#define LUA_CORE + +#include "lua.h" + +#include "lctype.h" +#include "ldebug.h" +#include "ldo.h" +#include "lmem.h" +#include "lobject.h" +#include "lstate.h" +#include "lstring.h" +#include "lvm.h" + + + +LUAI_DDEF const TValue luaO_nilobject_ = {NILCONSTANT}; + + +/* +** converts an integer to a "floating point byte", represented as +** (eeeeexxx), where the real value is (1xxx) * 2^(eeeee - 1) if +** eeeee != 0 and (xxx) otherwise. 
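The luaM_reallocv() macro above guards the multiplication n*e before it happens: if (n+1) exceeds MAX_SIZET/e the product would wrap, so luaM_toobig() is raised instead. A sketch of the same check, not part of the patch:

#include <stdio.h>
#include <stddef.h>

#define MAX_SIZET ((size_t)(~(size_t)0) - 2)

static int fits (size_t n, size_t e) {
  return (size_t)(n + 1) <= MAX_SIZET / e;  /* guard before computing n*e */
}

int main(void) {
  printf("%d\n", fits(1000, sizeof(double)));           /* 1: safe */
  printf("%d\n", fits(MAX_SIZET / 4, sizeof(double)));  /* 0: would wrap */
  return 0;
}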
+*/ +int luaO_int2fb (unsigned int x) { + int e = 0; /* exponent */ + if (x < 8) return x; + while (x >= 0x10) { + x = (x+1) >> 1; + e++; + } + return ((e+1) << 3) | (cast_int(x) - 8); +} + + +/* converts back */ +int luaO_fb2int (int x) { + int e = (x >> 3) & 0x1f; + if (e == 0) return x; + else return ((x & 7) + 8) << (e - 1); +} + + +int luaO_ceillog2 (unsigned int x) { + static const lu_byte log_2[256] = { + 0,1,2,2,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5, + 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, + 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, + 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, + 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, + 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8 + }; + int l = 0; + x--; + while (x >= 256) { l += 8; x >>= 8; } + return l + log_2[x]; +} + + +lua_Number luaO_arith (int op, lua_Number v1, lua_Number v2) { + switch (op) { + case LUA_OPADD: return luai_numadd(NULL, v1, v2); + case LUA_OPSUB: return luai_numsub(NULL, v1, v2); + case LUA_OPMUL: return luai_nummul(NULL, v1, v2); + case LUA_OPDIV: return luai_numdiv(NULL, v1, v2); + case LUA_OPMOD: return luai_nummod(NULL, v1, v2); + case LUA_OPPOW: return luai_numpow(NULL, v1, v2); + case LUA_OPUNM: return luai_numunm(NULL, v1); + default: lua_assert(0); return 0; + } +} + + +int luaO_hexavalue (int c) { + if (lisdigit(c)) return c - '0'; + else return ltolower(c) - 'a' + 10; +} + + +#if !defined(lua_strx2number) + + + +static int isneg (const char **s) { + if (**s == '-') { (*s)++; return 1; } + else if (**s == '+') (*s)++; + return 0; +} + + +static lua_Number readhexa (const char **s, lua_Number r, int *count) { + for (; lisxdigit(cast_uchar(**s)); (*s)++) { /* read integer part */ + r = (r * cast_num(16.0)) + cast_num(luaO_hexavalue(cast_uchar(**s))); + (*count)++; + } + return r; +} + + +/* +** convert an hexadecimal numeric string to a number, following +** C99 specification for 'strtod' +*/ +static lua_Number lua_strx2number (const char *s, char **endptr) { + lua_Number r = 0.0; + int e = 0, i = 0; + int neg = 0; /* 1 if number is negative */ + *endptr = cast(char *, s); /* nothing is valid yet */ + while (lisspace(cast_uchar(*s))) s++; /* skip initial spaces */ + neg = isneg(&s); /* check signal */ + if (!(*s == '0' && (*(s + 1) == 'x' || *(s + 1) == 'X'))) /* check '0x' */ + return 0.0; /* invalid format (no '0x') */ + s += 2; /* skip '0x' */ + r = readhexa(&s, r, &i); /* read integer part */ + if (*s == '.') { + s++; /* skip dot */ + r = readhexa(&s, r, &e); /* read fractional part */ + } + if (i == 0 && e == 0) + return 0.0; /* invalid format (no digit) */ + e *= -4; /* each fractional digit divides value by 2^-4 */ + *endptr = cast(char *, s); /* valid up to here */ + if (*s == 'p' || *s == 'P') { /* exponent part? 
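A round trip through luaO_int2fb()/luaO_fb2int() above shows the "floating point byte" in action: the decoded value is always at least the input, overshooting by at most about 12%, which is fine for its use as a size hint. The two functions are copied here under different names; the driver is not part of the patch:

#include <stdio.h>

static int int2fb (unsigned int x) {
  int e = 0;
  if (x < 8) return x;
  while (x >= 0x10) { x = (x + 1) >> 1; e++; }
  return ((e + 1) << 3) | ((int)x - 8);
}

static int fb2int (int x) {
  int e = (x >> 3) & 0x1f;
  if (e == 0) return x;
  return ((x & 7) + 8) << (e - 1);
}

int main(void) {
  unsigned int tests[] = { 5, 17, 100, 1000 };
  int i;
  for (i = 0; i < 4; i++) {
    int fb = int2fb(tests[i]);
    printf("%u -> 0x%02x -> %d\n", tests[i], fb, fb2int(fb));
  }
  return 0;  /* 5 -> 5, 17 -> 18, 100 -> 104, 1000 -> 1024 */
}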
*/ + int exp1 = 0; + int neg1; + s++; /* skip 'p' */ + neg1 = isneg(&s); /* signal */ + if (!lisdigit(cast_uchar(*s))) + goto ret; /* must have at least one digit */ + while (lisdigit(cast_uchar(*s))) /* read exponent */ + exp1 = exp1 * 10 + *(s++) - '0'; + if (neg1) exp1 = -exp1; + e += exp1; + } + *endptr = cast(char *, s); /* valid up to here */ + ret: + if (neg) r = -r; + return (r * (1 << e)); +} + +#endif + + +int luaO_str2d (const char *s, size_t len, lua_Number *result) { + char *endptr; + if (strpbrk(s, "nN")) /* reject 'inf' and 'nan' */ + return 0; + else if (strpbrk(s, "xX")) /* hexa? */ + *result = lua_strx2number(s, &endptr); + else + *result = lua_str2number(s, &endptr); + if (endptr == s) return 0; /* nothing recognized */ + while (lisspace(cast_uchar(*endptr))) endptr++; + return (endptr == s + len); /* OK if no trailing characters */ +} + + + +static void pushstr (lua_State *L, const char *str, size_t l) { + setsvalue2s(L, L->top++, luaS_newlstr(L, str, l)); +} + + +/* this function handles only `%d', `%c', %f, %p, and `%s' formats */ +const char *luaO_pushvfstring (lua_State *L, const char *fmt, va_list argp) { + int n = 0; + for (;;) { + const char *e = strchr(fmt, '%'); + if (e == NULL) break; + luaD_checkstack(L, 2); /* fmt + item */ + pushstr(L, fmt, e - fmt); + switch (*(e+1)) { + case 's': { + const char *s = va_arg(argp, char *); + if (s == NULL) s = "(null)"; + pushstr(L, s, strlen(s)); + break; + } + case 'c': { + char buff; + buff = cast(char, va_arg(argp, int)); + pushstr(L, &buff, 1); + break; + } + case 'd': { + setnvalue(L->top++, cast_num(va_arg(argp, int))); + break; + } + case 'f': { + setnvalue(L->top++, cast_num(va_arg(argp, l_uacNumber))); + break; + } + case 'p': { + char buff[4*sizeof(void *) + 8]; /* should be enough space for a `%p' */ + int l = lcompat_sprintf(buff, "%p", va_arg(argp, void *)); + pushstr(L, buff, l); + break; + } + case '%': { + pushstr(L, "%", 1); + break; + } + default: { + luaG_runerror(L, + "invalid option " LUA_QL("%%%c") " to " LUA_QL("lua_pushfstring"), + *(e + 1)); + } + } + n += 2; + fmt = e+2; + } + luaD_checkstack(L, 1); + pushstr(L, fmt, strlen(fmt)); + if (n > 0) luaV_concat(L, n + 1); + return svalue(L->top - 1); +} + + +const char *luaO_pushfstring (lua_State *L, const char *fmt, ...) { + const char *msg; + va_list argp; + va_start(argp, fmt); + msg = luaO_pushvfstring(L, fmt, argp); + va_end(argp); + return msg; +} + + +/* number of chars of a literal string without the ending \0 */ +#define LL(x) (sizeof(x)/sizeof(char) - 1) + +#define RETS "..." +#define PRE "[string \"" +#define POS "\"]" + +#define addstr(a,b,l) ( memcpy(a,b,(l) * sizeof(char)), a += (l) ) + +void luaO_chunkid (char *out, const char *source, size_t bufflen) { + size_t l = strlen(source); + if (*source == '=') { /* 'literal' source */ + if (l <= bufflen) /* small enough? */ + memcpy(out, source + 1, l * sizeof(char)); + else { /* truncate it */ + addstr(out, source + 1, bufflen - 1); + *out = '\0'; + } + } + else if (*source == '@') { /* file name */ + if (l <= bufflen) /* small enough? */ + memcpy(out, source + 1, l * sizeof(char)); + else { /* add '...' 
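lua_strx2number() above implements the C99 hexadecimal-float grammar for platforms whose strtod lacks it. On a C99 libc the two agree, so strtod can be used to check a few examples; not part of the patch:

#include <stdio.h>
#include <stdlib.h>

int main(void) {
  /* 0x1.8p1 = (1 + 8/16) * 2^1 = 3.0; 0xA.8p-1 = 10.5 / 2 = 5.25 */
  printf("%g\n", strtod("0x1.8p1", NULL));   /* 3 */
  printf("%g\n", strtod("0xA.8p-1", NULL));  /* 5.25 */
  printf("%g\n", strtod("0x10", NULL));      /* 16: exponent is optional */
  return 0;
}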
before rest of name */ + addstr(out, RETS, LL(RETS)); + bufflen -= LL(RETS); + memcpy(out, source + 1 + l - bufflen, bufflen * sizeof(char)); + } + } + else { /* string; format as [string "source"] */ + const char *nl = strchr(source, '\n'); /* find first new line (if any) */ + addstr(out, PRE, LL(PRE)); /* add prefix */ + bufflen -= LL(PRE RETS POS) + 1; /* save space for prefix+suffix+'\0' */ + if (l < bufflen && nl == NULL) { /* small one-line source? */ + addstr(out, source, l); /* keep it */ + } + else { + if (nl != NULL) l = nl - source; /* stop at first newline */ + if (l > bufflen) l = bufflen; + addstr(out, source, l); + addstr(out, RETS, LL(RETS)); + } + memcpy(out, POS, (LL(POS) + 1) * sizeof(char)); + } +} + diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lobject.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lobject.h new file mode 100644 index 000000000000..9c9f23542867 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lobject.h @@ -0,0 +1,606 @@ +/* +** $Id: lobject.h,v 2.71.1.2 2014/05/07 14:14:58 roberto Exp $ +** Type definitions for Lua objects +** See Copyright Notice in lua.h +*/ + + +#ifndef lobject_h +#define lobject_h + + +#include <sys/zfs_context.h> + +#include "llimits.h" +#include "lua.h" + + +/* +** Extra tags for non-values +*/ +#define LUA_TPROTO LUA_NUMTAGS +#define LUA_TUPVAL (LUA_NUMTAGS+1) +#define LUA_TDEADKEY (LUA_NUMTAGS+2) + +/* +** number of all possible tags (including LUA_TNONE but excluding DEADKEY) +*/ +#define LUA_TOTALTAGS (LUA_TUPVAL+2) + + +/* +** tags for Tagged Values have the following use of bits: +** bits 0-3: actual tag (a LUA_T* value) +** bits 4-5: variant bits +** bit 6: whether value is collectable +*/ + +#define VARBITS (3 << 4) + + +/* +** LUA_TFUNCTION variants: +** 0 - Lua function +** 1 - light C function +** 2 - regular C function (closure) +*/ + +/* Variant tags for functions */ +#define LUA_TLCL (LUA_TFUNCTION | (0 << 4)) /* Lua closure */ +#define LUA_TLCF (LUA_TFUNCTION | (1 << 4)) /* light C function */ +#define LUA_TCCL (LUA_TFUNCTION | (2 << 4)) /* C closure */ + + +/* Variant tags for strings */ +#define LUA_TSHRSTR (LUA_TSTRING | (0 << 4)) /* short strings */ +#define LUA_TLNGSTR (LUA_TSTRING | (1 << 4)) /* long strings */ + + +/* Bit mark for collectable types */ +#define BIT_ISCOLLECTABLE (1 << 6) + +/* mark a tag as collectable */ +#define ctb(t) ((t) | BIT_ISCOLLECTABLE) + + +/* +** Union of all collectable objects +*/ +typedef union GCObject GCObject; + + +/* +** Common Header for all collectable objects (in macro form, to be +** included in other objects) +*/ +#define CommonHeader GCObject *next; lu_byte tt; lu_byte marked + + +/* +** Common header in struct form +*/ +typedef struct GCheader { + CommonHeader; +} GCheader; + + + +/* +** Union of all Lua values +*/ +typedef union Value Value; + + +#define numfield lua_Number n; /* numbers */ + + + +/* +** Tagged Values. This is the basic representation of values in Lua, +** an actual value plus a tag with its type. 
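The tag layout described above (bits 0-3 basic type, bits 4-5 variant, bit 6 collectable) can be poked at directly. A quick demonstration using the tag values from lua.h (LUA_TFUNCTION is 6), not part of the patch:

#include <stdio.h>

#define LUA_TFUNCTION 6                      /* basic tag from lua.h */
#define LUA_TLCL (LUA_TFUNCTION | (0 << 4))  /* Lua closure variant */
#define LUA_TCCL (LUA_TFUNCTION | (2 << 4))  /* C closure variant */
#define BIT_ISCOLLECTABLE (1 << 6)
#define ctb(t) ((t) | BIT_ISCOLLECTABLE)
#define novariant(x) ((x) & 0x0F)

int main(void) {
  int tt = ctb(LUA_TCCL);                    /* collectable C closure */
  printf("raw tag    = 0x%02x\n", tt);               /* 0x66 */
  printf("basic type = %d\n", novariant(tt));        /* 6: a function */
  printf("same family? %d\n", novariant(LUA_TLCL) == novariant(LUA_TCCL));
  return 0;
}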
+*/ + +#define TValuefields Value value_; int tt_ + +typedef struct lua_TValue TValue; + + +/* macro defining a nil value */ +#define NILCONSTANT {NULL}, LUA_TNIL + + +#define val_(o) ((o)->value_) +#define num_(o) (val_(o).n) + + +/* raw type tag of a TValue */ +#define rttype(o) ((o)->tt_) + +/* tag with no variants (bits 0-3) */ +#define novariant(x) ((x) & 0x0F) + +/* type tag of a TValue (bits 0-3 for tags + variant bits 4-5) */ +#define ttype(o) (rttype(o) & 0x3F) + +/* type tag of a TValue with no variants (bits 0-3) */ +#define ttypenv(o) (novariant(rttype(o))) + + +/* Macros to test type */ +#define checktag(o,t) (rttype(o) == (t)) +#define checktype(o,t) (ttypenv(o) == (t)) +#define ttisnumber(o) checktag((o), LUA_TNUMBER) +#define ttisnil(o) checktag((o), LUA_TNIL) +#define ttisboolean(o) checktag((o), LUA_TBOOLEAN) +#define ttislightuserdata(o) checktag((o), LUA_TLIGHTUSERDATA) +#define ttisstring(o) checktype((o), LUA_TSTRING) +#define ttisshrstring(o) checktag((o), ctb(LUA_TSHRSTR)) +#define ttislngstring(o) checktag((o), ctb(LUA_TLNGSTR)) +#define ttistable(o) checktag((o), ctb(LUA_TTABLE)) +#define ttisfunction(o) checktype(o, LUA_TFUNCTION) +#define ttisclosure(o) ((rttype(o) & 0x1F) == LUA_TFUNCTION) +#define ttisCclosure(o) checktag((o), ctb(LUA_TCCL)) +#define ttisLclosure(o) checktag((o), ctb(LUA_TLCL)) +#define ttislcf(o) checktag((o), LUA_TLCF) +#define ttisuserdata(o) checktag((o), ctb(LUA_TUSERDATA)) +#define ttisthread(o) checktag((o), ctb(LUA_TTHREAD)) +#define ttisdeadkey(o) checktag((o), LUA_TDEADKEY) + +#define ttisequal(o1,o2) (rttype(o1) == rttype(o2)) + +/* Macros to access values */ +#define nvalue(o) check_exp(ttisnumber(o), num_(o)) +#define gcvalue(o) check_exp(iscollectable(o), val_(o).gc) +#define pvalue(o) check_exp(ttislightuserdata(o), val_(o).p) +#define rawtsvalue(o) check_exp(ttisstring(o), &val_(o).gc->ts) +#define tsvalue(o) (&rawtsvalue(o)->tsv) +#define rawuvalue(o) check_exp(ttisuserdata(o), &val_(o).gc->u) +#define uvalue(o) (&rawuvalue(o)->uv) +#define clvalue(o) check_exp(ttisclosure(o), &val_(o).gc->cl) +#define clLvalue(o) check_exp(ttisLclosure(o), &val_(o).gc->cl.l) +#define clCvalue(o) check_exp(ttisCclosure(o), &val_(o).gc->cl.c) +#define fvalue(o) check_exp(ttislcf(o), val_(o).f) +#define hvalue(o) check_exp(ttistable(o), &val_(o).gc->h) +#define bvalue(o) check_exp(ttisboolean(o), val_(o).b) +#define thvalue(o) check_exp(ttisthread(o), &val_(o).gc->th) +/* a dead value may get the 'gc' field, but cannot access its contents */ +#define deadvalue(o) check_exp(ttisdeadkey(o), cast(void *, val_(o).gc)) + +#define l_isfalse(o) (ttisnil(o) || (ttisboolean(o) && bvalue(o) == 0)) + + +#define iscollectable(o) (rttype(o) & BIT_ISCOLLECTABLE) + + +/* Macros for internal tests */ +#define righttt(obj) (ttype(obj) == gcvalue(obj)->gch.tt) + +#define checkliveness(g,obj) \ + lua_longassert(!iscollectable(obj) || \ + (righttt(obj) && !isdead(g,gcvalue(obj)))) + + +/* Macros to set values */ +#define settt_(o,t) ((o)->tt_=(t)) + +#define setnvalue(obj,x) \ + { TValue *io=(obj); num_(io)=(x); settt_(io, LUA_TNUMBER); } + +#define setnilvalue(obj) settt_(obj, LUA_TNIL) + +#define setfvalue(obj,x) \ + { TValue *io=(obj); val_(io).f=(x); settt_(io, LUA_TLCF); } + +#define setpvalue(obj,x) \ + { TValue *io=(obj); val_(io).p=(x); settt_(io, LUA_TLIGHTUSERDATA); } + +#define setbvalue(obj,x) \ + { TValue *io=(obj); val_(io).b=(x); settt_(io, LUA_TBOOLEAN); } + +#define setgcovalue(L,obj,x) \ + { TValue *io=(obj); GCObject *i_g=(x); \ + 
val_(io).gc=i_g; settt_(io, ctb(gch(i_g)->tt)); } + +#define setsvalue(L,obj,x) \ + { TValue *io=(obj); \ + TString *x_ = (x); \ + val_(io).gc=cast(GCObject *, x_); settt_(io, ctb(x_->tsv.tt)); \ + checkliveness(G(L),io); } + +#define setuvalue(L,obj,x) \ + { TValue *io=(obj); \ + val_(io).gc=cast(GCObject *, (x)); settt_(io, ctb(LUA_TUSERDATA)); \ + checkliveness(G(L),io); } + +#define setthvalue(L,obj,x) \ + { TValue *io=(obj); \ + val_(io).gc=cast(GCObject *, (x)); settt_(io, ctb(LUA_TTHREAD)); \ + checkliveness(G(L),io); } + +#define setclLvalue(L,obj,x) \ + { TValue *io=(obj); \ + val_(io).gc=cast(GCObject *, (x)); settt_(io, ctb(LUA_TLCL)); \ + checkliveness(G(L),io); } + +#define setclCvalue(L,obj,x) \ + { TValue *io=(obj); \ + val_(io).gc=cast(GCObject *, (x)); settt_(io, ctb(LUA_TCCL)); \ + checkliveness(G(L),io); } + +#define sethvalue(L,obj,x) \ + { TValue *io=(obj); \ + val_(io).gc=cast(GCObject *, (x)); settt_(io, ctb(LUA_TTABLE)); \ + checkliveness(G(L),io); } + +#define setdeadvalue(obj) settt_(obj, LUA_TDEADKEY) + + + +#define setobj(L,obj1,obj2) \ + { const TValue *io2=(obj2); TValue *io1=(obj1); \ + io1->value_ = io2->value_; io1->tt_ = io2->tt_; \ + checkliveness(G(L),io1); } + + +/* +** different types of assignments, according to destination +*/ + +/* from stack to (same) stack */ +#define setobjs2s setobj +/* to stack (not from same stack) */ +#define setobj2s setobj +#define setsvalue2s setsvalue +#define sethvalue2s sethvalue +#define setptvalue2s setptvalue +/* from table to same table */ +#define setobjt2t setobj +/* to table */ +#define setobj2t setobj +/* to new object */ +#define setobj2n setobj +#define setsvalue2n setsvalue + + +/* check whether a number is valid (useful only for NaN trick) */ +#define luai_checknum(L,o,c) { /* empty */ } + + +/* +** {====================================================== +** NaN Trick +** ======================================================= +*/ +#if defined(LUA_NANTRICK) + +/* +** numbers are represented in the 'd_' field. All other values have the +** value (NNMARK | tag) in 'tt__'. 
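
Editor's note: the NaN trick the surrounding comment describes can be demonstrated standalone. The sketch below assumes IEEE-754 doubles, a little-endian layout (LUA_IEEEENDIAN == 0), and C99; the union is a simplification of the TValuefields defined in this section, not the header's exact struct.

#include <stdio.h>
#include <stdint.h>
#include <math.h>

#define NNMARK 0x7FF7A500
#define NNMASK 0x7FFFFF00

typedef union {
  double d;
  struct { uint32_t lo; uint32_t tt; } i;  /* little-endian: tt is the high word */
} Slot;

/* same mask test as the ttisnumber redefinition later in this section */
static int slot_isnumber(const Slot *s) {
  return (s->i.tt & NNMASK) != NNMARK;
}

int main(void) {
  Slot num, nil;
  num.d = 42.5;            /* a genuine number */
  nil.i.lo = 0;
  nil.i.tt = NNMARK | 0;   /* tag 0 (nil): exponent all ones, nonzero mantissa */
  printf("num: isnumber=%d isnan=%d\n", slot_isnumber(&num), isnan(num.d));
  printf("nil: isnumber=%d isnan=%d\n", slot_isnumber(&nil), isnan(nil.d));
  return 0;  /* the nil slot reads back as a NaN, so it cannot collide with any number */
}

Because 0x7FF7A500 has an all-ones exponent, a nonzero mantissa, and a clear quiet bit, every tagged non-number is a signaling NaN, which real arithmetic (and strtod) never produces.
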
A number with such pattern would be +** a "signaled NaN", which is never generated by regular operations by +** the CPU (nor by 'strtod') +*/ + +/* allows for external implementation for part of the trick */ +#if !defined(NNMARK) /* { */ + + +#if !defined(LUA_IEEEENDIAN) +#error option 'LUA_NANTRICK' needs 'LUA_IEEEENDIAN' +#endif + + +#define NNMARK 0x7FF7A500 +#define NNMASK 0x7FFFFF00 + +#undef TValuefields +#undef NILCONSTANT + +#if (LUA_IEEEENDIAN == 0) /* { */ + +/* little endian */ +#define TValuefields \ + union { struct { Value v__; int tt__; } i; double d__; } u +#define NILCONSTANT {{{NULL}, tag2tt(LUA_TNIL)}} +/* field-access macros */ +#define v_(o) ((o)->u.i.v__) +#define d_(o) ((o)->u.d__) +#define tt_(o) ((o)->u.i.tt__) + +#else /* }{ */ + +/* big endian */ +#define TValuefields \ + union { struct { int tt__; Value v__; } i; double d__; } u +#define NILCONSTANT {{tag2tt(LUA_TNIL), {NULL}}} +/* field-access macros */ +#define v_(o) ((o)->u.i.v__) +#define d_(o) ((o)->u.d__) +#define tt_(o) ((o)->u.i.tt__) + +#endif /* } */ + +#endif /* } */ + + +/* correspondence with standard representation */ +#undef val_ +#define val_(o) v_(o) +#undef num_ +#define num_(o) d_(o) + + +#undef numfield +#define numfield /* no such field; numbers are the entire struct */ + +/* basic check to distinguish numbers from non-numbers */ +#undef ttisnumber +#define ttisnumber(o) ((tt_(o) & NNMASK) != NNMARK) + +#define tag2tt(t) (NNMARK | (t)) + +#undef rttype +#define rttype(o) (ttisnumber(o) ? LUA_TNUMBER : tt_(o) & 0xff) + +#undef settt_ +#define settt_(o,t) (tt_(o) = tag2tt(t)) + +#undef setnvalue +#define setnvalue(obj,x) \ + { TValue *io_=(obj); num_(io_)=(x); lua_assert(ttisnumber(io_)); } + +#undef setobj +#define setobj(L,obj1,obj2) \ + { const TValue *o2_=(obj2); TValue *o1_=(obj1); \ + o1_->u = o2_->u; \ + checkliveness(G(L),o1_); } + + +/* +** these redefinitions are not mandatory, but these forms are more efficient +*/ + +#undef checktag +#undef checktype +#define checktag(o,t) (tt_(o) == tag2tt(t)) +#define checktype(o,t) (ctb(tt_(o) | VARBITS) == ctb(tag2tt(t) | VARBITS)) + +#undef ttisequal +#define ttisequal(o1,o2) \ + (ttisnumber(o1) ? 
ttisnumber(o2) : (tt_(o1) == tt_(o2))) + + +#undef luai_checknum +#define luai_checknum(L,o,c) { if (!ttisnumber(o)) c; } + +#endif +/* }====================================================== */ + + + +/* +** {====================================================== +** types and prototypes +** ======================================================= +*/ + + +union Value { + GCObject *gc; /* collectable objects */ + void *p; /* light userdata */ + int b; /* booleans */ + lua_CFunction f; /* light C functions */ + numfield /* numbers */ +}; + + +struct lua_TValue { + TValuefields; +}; + + +typedef TValue *StkId; /* index to stack elements */ + + + + +/* +** Header for string value; string bytes follow the end of this structure +*/ +typedef union TString { + L_Umaxalign dummy; /* ensures maximum alignment for strings */ + struct { + CommonHeader; + lu_byte extra; /* reserved words for short strings; "has hash" for longs */ + unsigned int hash; + size_t len; /* number of characters in string */ + } tsv; +} TString; + + +/* get the actual string (array of bytes) from a TString */ +#define getstr(ts) cast(const char *, (ts) + 1) + +/* get the actual string (array of bytes) from a Lua value */ +#define svalue(o) getstr(rawtsvalue(o)) + + +/* +** Header for userdata; memory area follows the end of this structure +*/ +typedef union Udata { + L_Umaxalign dummy; /* ensures maximum alignment for `local' udata */ + struct { + CommonHeader; + struct Table *metatable; + struct Table *env; + size_t len; /* number of bytes */ + } uv; +} Udata; + + + +/* +** Description of an upvalue for function prototypes +*/ +typedef struct Upvaldesc { + TString *name; /* upvalue name (for debug information) */ + lu_byte instack; /* whether it is in stack */ + lu_byte idx; /* index of upvalue (in stack or in outer function's list) */ +} Upvaldesc; + + +/* +** Description of a local variable for function prototypes +** (used for debug information) +*/ +typedef struct LocVar { + TString *varname; + int startpc; /* first point where variable is active */ + int endpc; /* first point where variable is dead */ +} LocVar; + + +/* +** Function Prototypes +*/ +typedef struct Proto { + CommonHeader; + TValue *k; /* constants used by the function */ + Instruction *code; + struct Proto **p; /* functions defined inside the function */ + int *lineinfo; /* map from opcodes to source lines (debug information) */ + LocVar *locvars; /* information about local variables (debug information) */ + Upvaldesc *upvalues; /* upvalue information */ + union Closure *cache; /* last created closure with this prototype */ + TString *source; /* used for debug information */ + int sizeupvalues; /* size of 'upvalues' */ + int sizek; /* size of `k' */ + int sizecode; + int sizelineinfo; + int sizep; /* size of `p' */ + int sizelocvars; + int linedefined; + int lastlinedefined; + GCObject *gclist; + lu_byte numparams; /* number of fixed parameters */ + lu_byte is_vararg; + lu_byte maxstacksize; /* maximum stack used by this function */ +} Proto; + + + +/* +** Lua Upvalues +*/ +typedef struct UpVal { + CommonHeader; + TValue *v; /* points to stack or to its own value */ + union { + TValue value; /* the value (when closed) */ + struct { /* double linked list (when open) */ + struct UpVal *prev; + struct UpVal *next; + } l; + } u; +} UpVal; + + +/* +** Closures +*/ + +#define ClosureHeader \ + CommonHeader; lu_byte nupvalues; GCObject *gclist + +typedef struct CClosure { + ClosureHeader; + lua_CFunction f; + TValue upvalue[1]; /* list of upvalues */ +} 
CClosure; + + +typedef struct LClosure { + ClosureHeader; + struct Proto *p; + UpVal *upvals[1]; /* list of upvalues */ +} LClosure; + + +typedef union Closure { + CClosure c; + LClosure l; +} Closure; + + +#define isLfunction(o) ttisLclosure(o) + +#define getproto(o) (clLvalue(o)->p) + + +/* +** Tables +*/ + +typedef union TKey { + struct { + TValuefields; + struct Node *next; /* for chaining */ + } nk; + TValue tvk; +} TKey; + + +typedef struct Node { + TValue i_val; + TKey i_key; +} Node; + + +typedef struct Table { + CommonHeader; + lu_byte flags; /* 1<<p means tagmethod(p) is not present */ + lu_byte lsizenode; /* log2 of size of `node' array */ + int sizearray; /* size of `array' array */ + TValue *array; /* array part */ + Node *node; + Node *lastfree; /* any free position is before this position */ + struct Table *metatable; + GCObject *gclist; +} Table; + + + +/* +** `module' operation for hashing (size is always a power of 2) +*/ +#define lmod(s,size) \ + (check_exp((size&(size-1))==0, (cast(int, (s) & ((size)-1))))) + + +#define twoto(x) (1<<(x)) +#define sizenode(t) (twoto((t)->lsizenode)) + + +/* +** (address of) a fixed nil value +*/ +#define luaO_nilobject (&luaO_nilobject_) + + +LUAI_DDEC const TValue luaO_nilobject_; + + +LUAI_FUNC int luaO_int2fb (unsigned int x); +LUAI_FUNC int luaO_fb2int (int x); +LUAI_FUNC int luaO_ceillog2 (unsigned int x); +LUAI_FUNC lua_Number luaO_arith (int op, lua_Number v1, lua_Number v2); +LUAI_FUNC int luaO_str2d (const char *s, size_t len, lua_Number *result); +LUAI_FUNC int luaO_hexavalue (int c); +LUAI_FUNC const char *luaO_pushvfstring (lua_State *L, const char *fmt, + va_list argp); +LUAI_FUNC const char *luaO_pushfstring (lua_State *L, const char *fmt, ...); +LUAI_FUNC void luaO_chunkid (char *out, const char *source, size_t len); + + +#endif + diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lopcodes.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lopcodes.c new file mode 100644 index 000000000000..4190dc762428 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lopcodes.c @@ -0,0 +1,107 @@ +/* +** $Id: lopcodes.c,v 1.49.1.1 2013/04/12 18:48:47 roberto Exp $ +** Opcodes for Lua virtual machine +** See Copyright Notice in lua.h +*/ + + +#define lopcodes_c +#define LUA_CORE + + +#include "lopcodes.h" + + +/* ORDER OP */ + +LUAI_DDEF const char *const luaP_opnames[NUM_OPCODES+1] = { + "MOVE", + "LOADK", + "LOADKX", + "LOADBOOL", + "LOADNIL", + "GETUPVAL", + "GETTABUP", + "GETTABLE", + "SETTABUP", + "SETUPVAL", + "SETTABLE", + "NEWTABLE", + "SELF", + "ADD", + "SUB", + "MUL", + "DIV", + "MOD", + "POW", + "UNM", + "NOT", + "LEN", + "CONCAT", + "JMP", + "EQ", + "LT", + "LE", + "TEST", + "TESTSET", + "CALL", + "TAILCALL", + "RETURN", + "FORLOOP", + "FORPREP", + "TFORCALL", + "TFORLOOP", + "SETLIST", + "CLOSURE", + "VARARG", + "EXTRAARG", + NULL +}; + + +#define opmode(t,a,b,c,m) (((t)<<7) | ((a)<<6) | ((b)<<4) | ((c)<<2) | (m)) + +LUAI_DDEF const lu_byte luaP_opmodes[NUM_OPCODES] = { +/* T A B C mode opcode */ + opmode(0, 1, OpArgR, OpArgN, iABC) /* OP_MOVE */ + ,opmode(0, 1, OpArgK, OpArgN, iABx) /* OP_LOADK */ + ,opmode(0, 1, OpArgN, OpArgN, iABx) /* OP_LOADKX */ + ,opmode(0, 1, OpArgU, OpArgU, iABC) /* OP_LOADBOOL */ + ,opmode(0, 1, OpArgU, OpArgN, iABC) /* OP_LOADNIL */ + ,opmode(0, 1, OpArgU, OpArgN, iABC) /* OP_GETUPVAL */ + ,opmode(0, 1, OpArgU, OpArgK, iABC) /* OP_GETTABUP */ + ,opmode(0, 1, OpArgR, OpArgK, iABC) /* OP_GETTABLE */ + ,opmode(0, 0, OpArgK, OpArgK, iABC) /* OP_SETTABUP */ + 
,opmode(0, 0, OpArgU, OpArgN, iABC) /* OP_SETUPVAL */ + ,opmode(0, 0, OpArgK, OpArgK, iABC) /* OP_SETTABLE */ + ,opmode(0, 1, OpArgU, OpArgU, iABC) /* OP_NEWTABLE */ + ,opmode(0, 1, OpArgR, OpArgK, iABC) /* OP_SELF */ + ,opmode(0, 1, OpArgK, OpArgK, iABC) /* OP_ADD */ + ,opmode(0, 1, OpArgK, OpArgK, iABC) /* OP_SUB */ + ,opmode(0, 1, OpArgK, OpArgK, iABC) /* OP_MUL */ + ,opmode(0, 1, OpArgK, OpArgK, iABC) /* OP_DIV */ + ,opmode(0, 1, OpArgK, OpArgK, iABC) /* OP_MOD */ + ,opmode(0, 1, OpArgK, OpArgK, iABC) /* OP_POW */ + ,opmode(0, 1, OpArgR, OpArgN, iABC) /* OP_UNM */ + ,opmode(0, 1, OpArgR, OpArgN, iABC) /* OP_NOT */ + ,opmode(0, 1, OpArgR, OpArgN, iABC) /* OP_LEN */ + ,opmode(0, 1, OpArgR, OpArgR, iABC) /* OP_CONCAT */ + ,opmode(0, 0, OpArgR, OpArgN, iAsBx) /* OP_JMP */ + ,opmode(1, 0, OpArgK, OpArgK, iABC) /* OP_EQ */ + ,opmode(1, 0, OpArgK, OpArgK, iABC) /* OP_LT */ + ,opmode(1, 0, OpArgK, OpArgK, iABC) /* OP_LE */ + ,opmode(1, 0, OpArgN, OpArgU, iABC) /* OP_TEST */ + ,opmode(1, 1, OpArgR, OpArgU, iABC) /* OP_TESTSET */ + ,opmode(0, 1, OpArgU, OpArgU, iABC) /* OP_CALL */ + ,opmode(0, 1, OpArgU, OpArgU, iABC) /* OP_TAILCALL */ + ,opmode(0, 0, OpArgU, OpArgN, iABC) /* OP_RETURN */ + ,opmode(0, 1, OpArgR, OpArgN, iAsBx) /* OP_FORLOOP */ + ,opmode(0, 1, OpArgR, OpArgN, iAsBx) /* OP_FORPREP */ + ,opmode(0, 0, OpArgN, OpArgU, iABC) /* OP_TFORCALL */ + ,opmode(0, 1, OpArgR, OpArgN, iAsBx) /* OP_TFORLOOP */ + ,opmode(0, 0, OpArgU, OpArgU, iABC) /* OP_SETLIST */ + ,opmode(0, 1, OpArgU, OpArgN, iABx) /* OP_CLOSURE */ + ,opmode(0, 1, OpArgU, OpArgN, iABC) /* OP_VARARG */ + ,opmode(0, 0, OpArgU, OpArgU, iAx) /* OP_EXTRAARG */ +}; + diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lopcodes.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lopcodes.h new file mode 100644 index 000000000000..8e2f80a13141 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lopcodes.h @@ -0,0 +1,288 @@ +/* +** $Id: lopcodes.h,v 1.142.1.2 2014/10/20 18:32:09 roberto Exp $ +** Opcodes for Lua virtual machine +** See Copyright Notice in lua.h +*/ + +#ifndef lopcodes_h +#define lopcodes_h + +#include "llimits.h" + + +/*=========================================================================== + We assume that instructions are unsigned numbers. + All instructions have an opcode in the first 6 bits. + Instructions can have the following fields: + `A' : 8 bits + `B' : 9 bits + `C' : 9 bits + 'Ax' : 26 bits ('A', 'B', and 'C' together) + `Bx' : 18 bits (`B' and `C' together) + `sBx' : signed Bx + + A signed argument is represented in excess K; that is, the number + value is the unsigned value minus K. K is exactly the maximum value + for that argument (so that -max is represented by 0, and +max is + represented by 2*max), which is half the maximum for the corresponding + unsigned argument. +===========================================================================*/ + + +enum OpMode {iABC, iABx, iAsBx, iAx}; /* basic instruction format */ + + +/* +** size and position of opcode arguments. +*/ +#define SIZE_C 9 +#define SIZE_B 9 +#define SIZE_Bx (SIZE_C + SIZE_B) +#define SIZE_A 8 +#define SIZE_Ax (SIZE_C + SIZE_B + SIZE_A) + +#define SIZE_OP 6 + +#define POS_OP 0 +#define POS_A (POS_OP + SIZE_OP) +#define POS_C (POS_A + SIZE_A) +#define POS_B (POS_C + SIZE_C) +#define POS_Bx POS_C +#define POS_Ax POS_A + + +/* +** limits for opcode arguments. 
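
Editor's note: the excess-K convention from the format comment at the top of this header is easy to misread, so here is a self-contained round-trip sketch. Field sizes and positions are restated from this header; Instruction being a 32-bit unsigned type is an assumption of the sketch, and 23 is OP_JMP's position in the opcode enum below.

#include <stdio.h>
#include <stdint.h>

typedef uint32_t Instruction;

#define SIZE_OP 6
#define SIZE_A  8
#define SIZE_C  9
#define SIZE_B  9
#define SIZE_Bx (SIZE_C + SIZE_B)          /* 18 bits */
#define POS_OP  0
#define POS_A   (POS_OP + SIZE_OP)
#define POS_C   (POS_A + SIZE_A)
#define POS_Bx  POS_C

#define MAXARG_Bx  ((1 << SIZE_Bx) - 1)    /* 262143 */
#define MAXARG_sBx (MAXARG_Bx >> 1)        /* 131071: the bias K */

static Instruction create_AsBx(int op, int a, int sbx) {
  return ((Instruction)op << POS_OP)
       | ((Instruction)a  << POS_A)
       | ((Instruction)(sbx + MAXARG_sBx) << POS_Bx);   /* bias on encode */
}

static int getarg_sBx(Instruction i) {
  int bx = (int)((i >> POS_Bx) & MAXARG_Bx);
  return bx - MAXARG_sBx;                                /* un-bias on decode */
}

int main(void) {
  int off;
  for (off = -3; off <= 3; off++) {        /* round-trip a few jump offsets */
    Instruction i = create_AsBx(23 /* OP_JMP */, 0, off);
    printf("sBx=%d decodes to %d\n", off, getarg_sBx(i));
  }
  return 0;
}

Storing sBx biased keeps the whole instruction word unsigned, at the price of the asymmetric range (-MAXARG_sBx .. MAXARG_sBx) noted in the comment.
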
+** we use (signed) int to manipulate most arguments, +** so they must fit in LUAI_BITSINT-1 bits (-1 for sign) +*/ +#if SIZE_Bx < LUAI_BITSINT-1 +#define MAXARG_Bx ((1<<SIZE_Bx)-1) +#define MAXARG_sBx (MAXARG_Bx>>1) /* `sBx' is signed */ +#else +#define MAXARG_Bx MAX_INT +#define MAXARG_sBx MAX_INT +#endif + +#if SIZE_Ax < LUAI_BITSINT-1 +#define MAXARG_Ax ((1<<SIZE_Ax)-1) +#else +#define MAXARG_Ax MAX_INT +#endif + + +#define MAXARG_A ((1<<SIZE_A)-1) +#define MAXARG_B ((1<<SIZE_B)-1) +#define MAXARG_C ((1<<SIZE_C)-1) + + +/* creates a mask with `n' 1 bits at position `p' */ +#define MASK1(n,p) ((~((~(Instruction)0)<<(n)))<<(p)) + +/* creates a mask with `n' 0 bits at position `p' */ +#define MASK0(n,p) (~MASK1(n,p)) + +/* +** the following macros help to manipulate instructions +*/ + +#define GET_OPCODE(i) (cast(OpCode, ((i)>>POS_OP) & MASK1(SIZE_OP,0))) +#define SET_OPCODE(i,o) ((i) = (((i)&MASK0(SIZE_OP,POS_OP)) | \ + ((cast(Instruction, o)<<POS_OP)&MASK1(SIZE_OP,POS_OP)))) + +#define getarg(i,pos,size) (cast(int, ((i)>>pos) & MASK1(size,0))) +#define setarg(i,v,pos,size) ((i) = (((i)&MASK0(size,pos)) | \ + ((cast(Instruction, v)<<pos)&MASK1(size,pos)))) + +#define GETARG_A(i) getarg(i, POS_A, SIZE_A) +#define SETARG_A(i,v) setarg(i, v, POS_A, SIZE_A) + +#define GETARG_B(i) getarg(i, POS_B, SIZE_B) +#define SETARG_B(i,v) setarg(i, v, POS_B, SIZE_B) + +#define GETARG_C(i) getarg(i, POS_C, SIZE_C) +#define SETARG_C(i,v) setarg(i, v, POS_C, SIZE_C) + +#define GETARG_Bx(i) getarg(i, POS_Bx, SIZE_Bx) +#define SETARG_Bx(i,v) setarg(i, v, POS_Bx, SIZE_Bx) + +#define GETARG_Ax(i) getarg(i, POS_Ax, SIZE_Ax) +#define SETARG_Ax(i,v) setarg(i, v, POS_Ax, SIZE_Ax) + +#define GETARG_sBx(i) (GETARG_Bx(i)-MAXARG_sBx) +#define SETARG_sBx(i,b) SETARG_Bx((i),cast(unsigned int, (b)+MAXARG_sBx)) + + +#define CREATE_ABC(o,a,b,c) ((cast(Instruction, o)<<POS_OP) \ + | (cast(Instruction, a)<<POS_A) \ + | (cast(Instruction, b)<<POS_B) \ + | (cast(Instruction, c)<<POS_C)) + +#define CREATE_ABx(o,a,bc) ((cast(Instruction, o)<<POS_OP) \ + | (cast(Instruction, a)<<POS_A) \ + | (cast(Instruction, bc)<<POS_Bx)) + +#define CREATE_Ax(o,a) ((cast(Instruction, o)<<POS_OP) \ + | (cast(Instruction, a)<<POS_Ax)) + + +/* +** Macros to operate RK indices +*/ + +/* this bit 1 means constant (0 means register) */ +#define BITRK (1 << (SIZE_B - 1)) + +/* test whether value is a constant */ +#define ISK(x) ((x) & BITRK) + +/* gets the index of the constant */ +#define INDEXK(r) ((int)(r) & ~BITRK) + +#define MAXINDEXRK (BITRK - 1) + +/* code a constant index as a RK value */ +#define RKASK(x) ((x) | BITRK) + + +/* +** invalid register that fits in 8 bits +*/ +#define NO_REG MAXARG_A + + +/* +** R(x) - register +** Kst(x) - constant (in constant table) +** RK(x) == if ISK(x) then Kst(INDEXK(x)) else R(x) +*/ + + +/* +** grep "ORDER OP" if you change these enums +*/ + +typedef enum { +/*---------------------------------------------------------------------- +name args description +------------------------------------------------------------------------*/ +OP_MOVE,/* A B R(A) := R(B) */ +OP_LOADK,/* A Bx R(A) := Kst(Bx) */ +OP_LOADKX,/* A R(A) := Kst(extra arg) */ +OP_LOADBOOL,/* A B C R(A) := (Bool)B; if (C) pc++ */ +OP_LOADNIL,/* A B R(A), R(A+1), ..., R(A+B) := nil */ +OP_GETUPVAL,/* A B R(A) := UpValue[B] */ + +OP_GETTABUP,/* A B C R(A) := UpValue[B][RK(C)] */ +OP_GETTABLE,/* A B C R(A) := R(B)[RK(C)] */ + +OP_SETTABUP,/* A B C UpValue[A][RK(B)] := RK(C) */ +OP_SETUPVAL,/* A B UpValue[B] := R(A) */ +OP_SETTABLE,/* A B C 
R(A)[RK(B)] := RK(C) */ + +OP_NEWTABLE,/* A B C R(A) := {} (size = B,C) */ + +OP_SELF,/* A B C R(A+1) := R(B); R(A) := R(B)[RK(C)] */ + +OP_ADD,/* A B C R(A) := RK(B) + RK(C) */ +OP_SUB,/* A B C R(A) := RK(B) - RK(C) */ +OP_MUL,/* A B C R(A) := RK(B) * RK(C) */ +OP_DIV,/* A B C R(A) := RK(B) / RK(C) */ +OP_MOD,/* A B C R(A) := RK(B) % RK(C) */ +OP_POW,/* A B C R(A) := RK(B) ^ RK(C) */ +OP_UNM,/* A B R(A) := -R(B) */ +OP_NOT,/* A B R(A) := not R(B) */ +OP_LEN,/* A B R(A) := length of R(B) */ + +OP_CONCAT,/* A B C R(A) := R(B).. ... ..R(C) */ + +OP_JMP,/* A sBx pc+=sBx; if (A) close all upvalues >= R(A - 1) */ +OP_EQ,/* A B C if ((RK(B) == RK(C)) ~= A) then pc++ */ +OP_LT,/* A B C if ((RK(B) < RK(C)) ~= A) then pc++ */ +OP_LE,/* A B C if ((RK(B) <= RK(C)) ~= A) then pc++ */ + +OP_TEST,/* A C if not (R(A) <=> C) then pc++ */ +OP_TESTSET,/* A B C if (R(B) <=> C) then R(A) := R(B) else pc++ */ + +OP_CALL,/* A B C R(A), ... ,R(A+C-2) := R(A)(R(A+1), ... ,R(A+B-1)) */ +OP_TAILCALL,/* A B C return R(A)(R(A+1), ... ,R(A+B-1)) */ +OP_RETURN,/* A B return R(A), ... ,R(A+B-2) (see note) */ + +OP_FORLOOP,/* A sBx R(A)+=R(A+2); + if R(A) <?= R(A+1) then { pc+=sBx; R(A+3)=R(A) }*/ +OP_FORPREP,/* A sBx R(A)-=R(A+2); pc+=sBx */ + +OP_TFORCALL,/* A C R(A+3), ... ,R(A+2+C) := R(A)(R(A+1), R(A+2)); */ +OP_TFORLOOP,/* A sBx if R(A+1) ~= nil then { R(A)=R(A+1); pc += sBx }*/ + +OP_SETLIST,/* A B C R(A)[(C-1)*FPF+i] := R(A+i), 1 <= i <= B */ + +OP_CLOSURE,/* A Bx R(A) := closure(KPROTO[Bx]) */ + +OP_VARARG,/* A B R(A), R(A+1), ..., R(A+B-2) = vararg */ + +OP_EXTRAARG/* Ax extra (larger) argument for previous opcode */ +} OpCode; + + +#define NUM_OPCODES (cast(int, OP_EXTRAARG) + 1) + + + +/*=========================================================================== + Notes: + (*) In OP_CALL, if (B == 0) then B = top. If (C == 0), then `top' is + set to last_result+1, so next open instruction (OP_CALL, OP_RETURN, + OP_SETLIST) may use `top'. + + (*) In OP_VARARG, if (B == 0) then use actual number of varargs and + set top (like in OP_CALL with C == 0). + + (*) In OP_RETURN, if (B == 0) then return up to `top'. + + (*) In OP_SETLIST, if (B == 0) then B = `top'; if (C == 0) then next + 'instruction' is EXTRAARG(real C). + + (*) In OP_LOADKX, the next 'instruction' is always EXTRAARG. + + (*) For comparisons, A specifies what condition the test should accept + (true or false). + + (*) All `skips' (pc++) assume that next instruction is a jump. + +===========================================================================*/ + + +/* +** masks for instruction properties. 
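
Editor's note: the property byte whose bit layout this comment goes on to describe is built by the opmode() macro in lopcodes.c, earlier in this diff. A standalone decode, restating those macros, shows the OP_EQ table entry unpack as expected:

#include <stdio.h>

typedef unsigned char lu_byte;

enum OpMode {iABC, iABx, iAsBx, iAx};
enum OpArgMask {OpArgN, OpArgU, OpArgR, OpArgK};

/* same packing as lopcodes.c: test<<7 | setsA<<6 | B<<4 | C<<2 | mode */
#define opmode(t,a,b,c,m) (((t)<<7) | ((a)<<6) | ((b)<<4) | ((c)<<2) | (m))

#define getOpMode(b) ((enum OpMode)((b) & 3))
#define getCMode(b)  ((enum OpArgMask)(((b) >> 2) & 3))
#define getBMode(b)  ((enum OpArgMask)(((b) >> 4) & 3))
#define testAMode(b) ((b) & (1 << 6))
#define testTMode(b) ((b) & (1 << 7))

int main(void) {
  /* same entry as OP_EQ in luaP_opmodes: a test that does not set R(A) */
  lu_byte eq = opmode(1, 0, OpArgK, OpArgK, iABC);
  printf("mode=%d Bmode=%d Cmode=%d setsA=%d test=%d\n",
         getOpMode(eq), getBMode(eq), getCMode(eq),
         testAMode(eq) != 0, testTMode(eq) != 0);
  /* expected: mode=0 (iABC) Bmode=3 (OpArgK) Cmode=3 (OpArgK) setsA=0 test=1 */
  return 0;
}
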
The format is: +** bits 0-1: op mode +** bits 2-3: C arg mode +** bits 4-5: B arg mode +** bit 6: instruction set register A +** bit 7: operator is a test (next instruction must be a jump) +*/ + +enum OpArgMask { + OpArgN, /* argument is not used */ + OpArgU, /* argument is used */ + OpArgR, /* argument is a register or a jump offset */ + OpArgK /* argument is a constant or register/constant */ +}; + +LUAI_DDEC const lu_byte luaP_opmodes[NUM_OPCODES]; + +#define getOpMode(m) (cast(enum OpMode, luaP_opmodes[m] & 3)) +#define getBMode(m) (cast(enum OpArgMask, (luaP_opmodes[m] >> 4) & 3)) +#define getCMode(m) (cast(enum OpArgMask, (luaP_opmodes[m] >> 2) & 3)) +#define testAMode(m) (luaP_opmodes[m] & (1 << 6)) +#define testTMode(m) (luaP_opmodes[m] & (1 << 7)) + + +LUAI_DDEC const char *const luaP_opnames[NUM_OPCODES+1]; /* opcode names */ + + +/* number of list items to accumulate before a SETLIST instruction */ +#define LFIELDS_PER_FLUSH 50 + + +#endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lparser.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lparser.c new file mode 100644 index 000000000000..73f1af64f834 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lparser.c @@ -0,0 +1,1637 @@ +/* +** $Id: lparser.c,v 2.130.1.1 2013/04/12 18:48:47 roberto Exp $ +** Lua Parser +** See Copyright Notice in lua.h +*/ + +#include <sys/zfs_context.h> + +#define lparser_c +#define LUA_CORE + +#include "lua.h" + +#include "lcode.h" +#include "ldebug.h" +#include "ldo.h" +#include "lfunc.h" +#include "llex.h" +#include "lmem.h" +#include "lobject.h" +#include "lopcodes.h" +#include "lparser.h" +#include "lstate.h" +#include "lstring.h" +#include "ltable.h" + + + +/* maximum number of local variables per function (must be smaller + than 250, due to the bytecode format) */ +#define MAXVARS 200 + + +#define hasmultret(k) ((k) == VCALL || (k) == VVARARG) + + + +/* +** nodes for block list (list of active blocks) +*/ +typedef struct BlockCnt { + struct BlockCnt *previous; /* chain */ + short firstlabel; /* index of first label in this block */ + short firstgoto; /* index of first pending goto in this block */ + lu_byte nactvar; /* # active locals outside the block */ + lu_byte upval; /* true if some variable in the block is an upvalue */ + lu_byte isloop; /* true if `block' is a loop */ +} BlockCnt; + + + +/* +** prototypes for recursive non-terminal functions +*/ +static void statement (LexState *ls); +static void expr (LexState *ls, expdesc *v); + + +static void anchor_token (LexState *ls) { + /* last token from outer function must be EOS */ + lua_assert(ls->fs != NULL || ls->t.token == TK_EOS); + if (ls->t.token == TK_NAME || ls->t.token == TK_STRING) { + TString *ts = ls->t.seminfo.ts; + luaX_newstring(ls, getstr(ts), ts->tsv.len); + } +} + + +/* semantic error */ +static l_noret semerror (LexState *ls, const char *msg) { + ls->t.token = 0; /* remove 'near to' from final message */ + luaX_syntaxerror(ls, msg); +} + + +static l_noret error_expected (LexState *ls, int token) { + luaX_syntaxerror(ls, + luaO_pushfstring(ls->L, "%s expected", luaX_token2str(ls, token))); +} + + +static l_noret errorlimit (FuncState *fs, int limit, const char *what) { + lua_State *L = fs->ls->L; + const char *msg; + int line = fs->f->linedefined; + const char *where = (line == 0) + ? 
"main function" + : luaO_pushfstring(L, "function at line %d", line); + msg = luaO_pushfstring(L, "too many %s (limit is %d) in %s", + what, limit, where); + luaX_syntaxerror(fs->ls, msg); +} + + +static void checklimit (FuncState *fs, int v, int l, const char *what) { + if (v > l) errorlimit(fs, l, what); +} + + +static int testnext (LexState *ls, int c) { + if (ls->t.token == c) { + luaX_next(ls); + return 1; + } + else return 0; +} + + +static void check (LexState *ls, int c) { + if (ls->t.token != c) + error_expected(ls, c); +} + + +static void checknext (LexState *ls, int c) { + check(ls, c); + luaX_next(ls); +} + + +#define check_condition(ls,c,msg) { if (!(c)) luaX_syntaxerror(ls, msg); } + + + +static void check_match (LexState *ls, int what, int who, int where) { + if (!testnext(ls, what)) { + if (where == ls->linenumber) + error_expected(ls, what); + else { + luaX_syntaxerror(ls, luaO_pushfstring(ls->L, + "%s expected (to close %s at line %d)", + luaX_token2str(ls, what), luaX_token2str(ls, who), where)); + } + } +} + + +static TString *str_checkname (LexState *ls) { + TString *ts; + check(ls, TK_NAME); + ts = ls->t.seminfo.ts; + luaX_next(ls); + return ts; +} + + +static void init_exp (expdesc *e, expkind k, int i) { + e->f = e->t = NO_JUMP; + e->k = k; + e->u.info = i; +} + + +static void codestring (LexState *ls, expdesc *e, TString *s) { + init_exp(e, VK, luaK_stringK(ls->fs, s)); +} + + +static void checkname (LexState *ls, expdesc *e) { + codestring(ls, e, str_checkname(ls)); +} + + +static int registerlocalvar (LexState *ls, TString *varname) { + FuncState *fs = ls->fs; + Proto *f = fs->f; + int oldsize = f->sizelocvars; + luaM_growvector(ls->L, f->locvars, fs->nlocvars, f->sizelocvars, + LocVar, SHRT_MAX, "local variables"); + while (oldsize < f->sizelocvars) f->locvars[oldsize++].varname = NULL; + f->locvars[fs->nlocvars].varname = varname; + luaC_objbarrier(ls->L, f, varname); + return fs->nlocvars++; +} + + +static void new_localvar (LexState *ls, TString *name) { + FuncState *fs = ls->fs; + Dyndata *dyd = ls->dyd; + int reg = registerlocalvar(ls, name); + checklimit(fs, dyd->actvar.n + 1 - fs->firstlocal, + MAXVARS, "local variables"); + luaM_growvector(ls->L, dyd->actvar.arr, dyd->actvar.n + 1, + dyd->actvar.size, Vardesc, MAX_INT, "local variables"); + dyd->actvar.arr[dyd->actvar.n++].idx = cast(short, reg); +} + + +static void new_localvarliteral_ (LexState *ls, const char *name, size_t sz) { + new_localvar(ls, luaX_newstring(ls, name, sz)); +} + +#define new_localvarliteral(ls,v) \ + new_localvarliteral_(ls, "" v, (sizeof(v)/sizeof(char))-1) + + +static LocVar *getlocvar (FuncState *fs, int i) { + int idx = fs->ls->dyd->actvar.arr[fs->firstlocal + i].idx; + lua_assert(idx < fs->nlocvars); + return &fs->f->locvars[idx]; +} + + +static void adjustlocalvars (LexState *ls, int nvars) { + FuncState *fs = ls->fs; + fs->nactvar = cast_byte(fs->nactvar + nvars); + for (; nvars; nvars--) { + getlocvar(fs, fs->nactvar - nvars)->startpc = fs->pc; + } +} + + +static void removevars (FuncState *fs, int tolevel) { + fs->ls->dyd->actvar.n -= (fs->nactvar - tolevel); + while (fs->nactvar > tolevel) + getlocvar(fs, --fs->nactvar)->endpc = fs->pc; +} + + +static int searchupvalue (FuncState *fs, TString *name) { + int i; + Upvaldesc *up = fs->f->upvalues; + for (i = 0; i < fs->nups; i++) { + if (luaS_eqstr(up[i].name, name)) return i; + } + return -1; /* not found */ +} + + +static int newupvalue (FuncState *fs, TString *name, expdesc *v) { + Proto *f = fs->f; + int oldsize = 
f->sizeupvalues; + checklimit(fs, fs->nups + 1, MAXUPVAL, "upvalues"); + luaM_growvector(fs->ls->L, f->upvalues, fs->nups, f->sizeupvalues, + Upvaldesc, MAXUPVAL, "upvalues"); + while (oldsize < f->sizeupvalues) f->upvalues[oldsize++].name = NULL; + f->upvalues[fs->nups].instack = (v->k == VLOCAL); + f->upvalues[fs->nups].idx = cast_byte(v->u.info); + f->upvalues[fs->nups].name = name; + luaC_objbarrier(fs->ls->L, f, name); + return fs->nups++; +} + + +static int searchvar (FuncState *fs, TString *n) { + int i; + for (i = cast_int(fs->nactvar) - 1; i >= 0; i--) { + if (luaS_eqstr(n, getlocvar(fs, i)->varname)) + return i; + } + return -1; /* not found */ +} + + +/* + Mark block where variable at given level was defined + (to emit close instructions later). +*/ +static void markupval (FuncState *fs, int level) { + BlockCnt *bl = fs->bl; + while (bl->nactvar > level) bl = bl->previous; + bl->upval = 1; +} + + +/* + Find variable with given name 'n'. If it is an upvalue, add this + upvalue into all intermediate functions. +*/ +static int singlevaraux (FuncState *fs, TString *n, expdesc *var, int base) { + if (fs == NULL) /* no more levels? */ + return VVOID; /* default is global */ + else { + int v = searchvar(fs, n); /* look up locals at current level */ + if (v >= 0) { /* found? */ + init_exp(var, VLOCAL, v); /* variable is local */ + if (!base) + markupval(fs, v); /* local will be used as an upval */ + return VLOCAL; + } + else { /* not found as local at current level; try upvalues */ + int idx = searchupvalue(fs, n); /* try existing upvalues */ + if (idx < 0) { /* not found? */ + if (singlevaraux(fs->prev, n, var, 0) == VVOID) /* try upper levels */ + return VVOID; /* not found; is a global */ + /* else was LOCAL or UPVAL */ + idx = newupvalue(fs, n, var); /* will be a new upvalue */ + } + init_exp(var, VUPVAL, idx); + return VUPVAL; + } + } +} + + +static void singlevar (LexState *ls, expdesc *var) { + TString *varname = str_checkname(ls); + FuncState *fs = ls->fs; + if (singlevaraux(fs, varname, var, 1) == VVOID) { /* global name? */ + expdesc key; + singlevaraux(fs, ls->envn, var, 1); /* get environment variable */ + lua_assert(var->k == VLOCAL || var->k == VUPVAL); + codestring(ls, &key, varname); /* key is variable name */ + luaK_indexed(fs, var, &key); /* env[varname] */ + } +} + + +static void adjust_assign (LexState *ls, int nvars, int nexps, expdesc *e) { + FuncState *fs = ls->fs; + int extra = nvars - nexps; + if (hasmultret(e->k)) { + extra++; /* includes call itself */ + if (extra < 0) extra = 0; + luaK_setreturns(fs, e, extra); /* last exp. 
provides the difference */ + if (extra > 1) luaK_reserveregs(fs, extra-1); + } + else { + if (e->k != VVOID) luaK_exp2nextreg(fs, e); /* close last expression */ + if (extra > 0) { + int reg = fs->freereg; + luaK_reserveregs(fs, extra); + luaK_nil(fs, reg, extra); + } + } +} + + +static void enterlevel (LexState *ls) { + lua_State *L = ls->L; + ++L->nCcalls; + checklimit(ls->fs, L->nCcalls, LUAI_MAXCCALLS, "C levels"); +} + + +#define leavelevel(ls) ((ls)->L->nCcalls--) + + +static void closegoto (LexState *ls, int g, Labeldesc *label) { + int i; + FuncState *fs = ls->fs; + Labellist *gl = &ls->dyd->gt; + Labeldesc *gt = &gl->arr[g]; + lua_assert(luaS_eqstr(gt->name, label->name)); + if (gt->nactvar < label->nactvar) { + TString *vname = getlocvar(fs, gt->nactvar)->varname; + const char *msg = luaO_pushfstring(ls->L, + "<goto %s> at line %d jumps into the scope of local " LUA_QS, + getstr(gt->name), gt->line, getstr(vname)); + semerror(ls, msg); + } + luaK_patchlist(fs, gt->pc, label->pc); + /* remove goto from pending list */ + for (i = g; i < gl->n - 1; i++) + gl->arr[i] = gl->arr[i + 1]; + gl->n--; +} + + +/* +** try to close a goto with existing labels; this solves backward jumps +*/ +static int findlabel (LexState *ls, int g) { + int i; + BlockCnt *bl = ls->fs->bl; + Dyndata *dyd = ls->dyd; + Labeldesc *gt = &dyd->gt.arr[g]; + /* check labels in current block for a match */ + for (i = bl->firstlabel; i < dyd->label.n; i++) { + Labeldesc *lb = &dyd->label.arr[i]; + if (luaS_eqstr(lb->name, gt->name)) { /* correct label? */ + if (gt->nactvar > lb->nactvar && + (bl->upval || dyd->label.n > bl->firstlabel)) + luaK_patchclose(ls->fs, gt->pc, lb->nactvar); + closegoto(ls, g, lb); /* close it */ + return 1; + } + } + return 0; /* label not found; cannot close goto */ +} + + +static int newlabelentry (LexState *ls, Labellist *l, TString *name, + int line, int pc) { + int n = l->n; + luaM_growvector(ls->L, l->arr, n, l->size, + Labeldesc, SHRT_MAX, "labels/gotos"); + l->arr[n].name = name; + l->arr[n].line = line; + l->arr[n].nactvar = ls->fs->nactvar; + l->arr[n].pc = pc; + l->n++; + return n; +} + + +/* +** check whether new label 'lb' matches any pending gotos in current +** block; solves forward jumps +*/ +static void findgotos (LexState *ls, Labeldesc *lb) { + Labellist *gl = &ls->dyd->gt; + int i = ls->fs->bl->firstgoto; + while (i < gl->n) { + if (luaS_eqstr(gl->arr[i].name, lb->name)) + closegoto(ls, i, lb); + else + i++; + } +} + + +/* +** "export" pending gotos to outer level, to check them against +** outer labels; if the block being exited has upvalues, and +** the goto exits the scope of any variable (which can be the +** upvalue), close those variables being exited. 
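
Editor's note: findlabel and findgotos above implement a two-list matching scheme that is easier to see in a toy. In the sketch below, the Entry struct, the fixed-size arrays, and the swap-removal (the real parser shifts entries, preserving order) are all inventions of this example, not the parser's data structures.

#include <stdio.h>
#include <string.h>

typedef struct { const char *name; int pc; } Entry;

static Entry labels[8]; static int nlabels;
static Entry gotos[8];  static int ngotos;   /* pending (unresolved) gotos */

static void newlabel(const char *name, int pc) {
  int i = 0;
  labels[nlabels].name = name; labels[nlabels].pc = pc; nlabels++;
  while (i < ngotos) {                       /* resolve pending forward jumps */
    if (strcmp(gotos[i].name, name) == 0) {
      printf("patch goto at pc=%d -> label %s at pc=%d\n",
             gotos[i].pc, name, pc);
      gotos[i] = gotos[--ngotos];            /* swap-remove from pending list */
    }
    else i++;
  }
}

static void newgoto(const char *name, int pc) {
  int i;
  for (i = 0; i < nlabels; i++) {            /* backward jump: label is known */
    if (strcmp(labels[i].name, name) == 0) {
      printf("patch goto at pc=%d -> label %s at pc=%d\n",
             pc, name, labels[i].pc);
      return;
    }
  }
  gotos[ngotos].name = name; gotos[ngotos].pc = pc; ngotos++;  /* pend it */
}

int main(void) {
  newgoto("done", 3);    /* forward: stays pending */
  newlabel("top", 5);
  newgoto("top", 9);     /* backward: resolved immediately */
  newlabel("done", 12);  /* resolves the pending goto at pc=3 */
  return 0;
}

The parser layers scope on top of this (nactvar checks, upvalue closing), but the forward/backward resolution is the same shape.
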
+*/ +static void movegotosout (FuncState *fs, BlockCnt *bl) { + int i = bl->firstgoto; + Labellist *gl = &fs->ls->dyd->gt; + /* correct pending gotos to current block and try to close it + with visible labels */ + while (i < gl->n) { + Labeldesc *gt = &gl->arr[i]; + if (gt->nactvar > bl->nactvar) { + if (bl->upval) + luaK_patchclose(fs, gt->pc, bl->nactvar); + gt->nactvar = bl->nactvar; + } + if (!findlabel(fs->ls, i)) + i++; /* move to next one */ + } +} + + +static void enterblock (FuncState *fs, BlockCnt *bl, lu_byte isloop) { + bl->isloop = isloop; + bl->nactvar = fs->nactvar; + bl->firstlabel = fs->ls->dyd->label.n; + bl->firstgoto = fs->ls->dyd->gt.n; + bl->upval = 0; + bl->previous = fs->bl; + fs->bl = bl; + lua_assert(fs->freereg == fs->nactvar); +} + + +/* +** create a label named "break" to resolve break statements +*/ +static void breaklabel (LexState *ls) { + TString *n = luaS_new(ls->L, "break"); + int l = newlabelentry(ls, &ls->dyd->label, n, 0, ls->fs->pc); + findgotos(ls, &ls->dyd->label.arr[l]); +} + +/* +** generates an error for an undefined 'goto'; choose appropriate +** message when label name is a reserved word (which can only be 'break') +*/ +static l_noret undefgoto (LexState *ls, Labeldesc *gt) { + const char *msg = isreserved(gt->name) + ? "<%s> at line %d not inside a loop" + : "no visible label " LUA_QS " for <goto> at line %d"; + msg = luaO_pushfstring(ls->L, msg, getstr(gt->name), gt->line); + semerror(ls, msg); +} + + +static void leaveblock (FuncState *fs) { + BlockCnt *bl = fs->bl; + LexState *ls = fs->ls; + if (bl->previous && bl->upval) { + /* create a 'jump to here' to close upvalues */ + int j = luaK_jump(fs); + luaK_patchclose(fs, j, bl->nactvar); + luaK_patchtohere(fs, j); + } + if (bl->isloop) + breaklabel(ls); /* close pending breaks */ + fs->bl = bl->previous; + removevars(fs, bl->nactvar); + lua_assert(bl->nactvar == fs->nactvar); + fs->freereg = fs->nactvar; /* free registers */ + ls->dyd->label.n = bl->firstlabel; /* remove local labels */ + if (bl->previous) /* inner block? */ + movegotosout(fs, bl); /* update pending gotos to outer block */ + else if (bl->firstgoto < ls->dyd->gt.n) /* pending gotos in outer block? */ + undefgoto(ls, &ls->dyd->gt.arr[bl->firstgoto]); /* error */ +} + + +/* +** adds a new prototype into list of prototypes +*/ +static Proto *addprototype (LexState *ls) { + Proto *clp; + lua_State *L = ls->L; + FuncState *fs = ls->fs; + Proto *f = fs->f; /* prototype of current function */ + if (fs->np >= f->sizep) { + int oldsize = f->sizep; + luaM_growvector(L, f->p, fs->np, f->sizep, Proto *, MAXARG_Bx, "functions"); + while (oldsize < f->sizep) f->p[oldsize++] = NULL; + } + f->p[fs->np++] = clp = luaF_newproto(L); + luaC_objbarrier(L, f, clp); + return clp; +} + + +/* +** codes instruction to create new closure in parent function. +** The OP_CLOSURE instruction must use the last available register, +** so that, if it invokes the GC, the GC knows which registers +** are in use at that time. 
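
Editor's note: enterblock/leaveblock above follow a simple save-and-restore discipline for the count of active locals, which is also what frees registers on block exit (freereg tracks nactvar). A toy restatement, where the Block struct is this sketch's invention rather than the parser's BlockCnt:

#include <stdio.h>

typedef struct Block { struct Block *previous; int nactvar; } Block;

static Block *top;      /* innermost open block */
static int nactvar;     /* number of active local variables */

static void enterblock(Block *b) {
  b->nactvar = nactvar;      /* remember locals visible outside this block */
  b->previous = top;
  top = b;
}

static void leaveblock(void) {
  nactvar = top->nactvar;    /* locals declared inside go out of scope */
  top = top->previous;
}

int main(void) {
  Block outer, inner;
  enterblock(&outer);
  nactvar += 2;                                    /* local a, b */
  enterblock(&inner);
  nactvar += 1;                                    /* local c */
  printf("inside inner: %d locals\n", nactvar);    /* 3 */
  leaveblock();
  printf("back in outer: %d locals\n", nactvar);   /* 2 */
  leaveblock();
  printf("at top level: %d locals\n", nactvar);    /* 0 */
  return 0;
}
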
+*/ +static void codeclosure (LexState *ls, expdesc *v) { + FuncState *fs = ls->fs->prev; + init_exp(v, VRELOCABLE, luaK_codeABx(fs, OP_CLOSURE, 0, fs->np - 1)); + luaK_exp2nextreg(fs, v); /* fix it at the last register */ +} + + +static void open_func (LexState *ls, FuncState *fs, BlockCnt *bl) { + lua_State *L = ls->L; + Proto *f; + fs->prev = ls->fs; /* linked list of funcstates */ + fs->ls = ls; + ls->fs = fs; + fs->pc = 0; + fs->lasttarget = 0; + fs->jpc = NO_JUMP; + fs->freereg = 0; + fs->nk = 0; + fs->np = 0; + fs->nups = 0; + fs->nlocvars = 0; + fs->nactvar = 0; + fs->firstlocal = ls->dyd->actvar.n; + fs->bl = NULL; + f = fs->f; + f->source = ls->source; + f->maxstacksize = 2; /* registers 0/1 are always valid */ + fs->h = luaH_new(L); + /* anchor table of constants (to avoid being collected) */ + sethvalue2s(L, L->top, fs->h); + incr_top(L); + enterblock(fs, bl, 0); +} + + +static void close_func (LexState *ls) { + lua_State *L = ls->L; + FuncState *fs = ls->fs; + Proto *f = fs->f; + luaK_ret(fs, 0, 0); /* final return */ + leaveblock(fs); + luaM_reallocvector(L, f->code, f->sizecode, fs->pc, Instruction); + f->sizecode = fs->pc; + luaM_reallocvector(L, f->lineinfo, f->sizelineinfo, fs->pc, int); + f->sizelineinfo = fs->pc; + luaM_reallocvector(L, f->k, f->sizek, fs->nk, TValue); + f->sizek = fs->nk; + luaM_reallocvector(L, f->p, f->sizep, fs->np, Proto *); + f->sizep = fs->np; + luaM_reallocvector(L, f->locvars, f->sizelocvars, fs->nlocvars, LocVar); + f->sizelocvars = fs->nlocvars; + luaM_reallocvector(L, f->upvalues, f->sizeupvalues, fs->nups, Upvaldesc); + f->sizeupvalues = fs->nups; + lua_assert(fs->bl == NULL); + ls->fs = fs->prev; + /* last token read was anchored in defunct function; must re-anchor it */ + anchor_token(ls); + L->top--; /* pop table of constants */ + luaC_checkGC(L); +} + + + +/*============================================================*/ +/* GRAMMAR RULES */ +/*============================================================*/ + + +/* +** check whether current token is in the follow set of a block. +** 'until' closes syntactical blocks, but do not close scope, +** so it handled in separate. +*/ +static int block_follow (LexState *ls, int withuntil) { + switch (ls->t.token) { + case TK_ELSE: case TK_ELSEIF: + case TK_END: case TK_EOS: + return 1; + case TK_UNTIL: return withuntil; + default: return 0; + } +} + + +static void statlist (LexState *ls) { + /* statlist -> { stat [`;'] } */ + while (!block_follow(ls, 1)) { + if (ls->t.token == TK_RETURN) { + statement(ls); + return; /* 'return' must be last statement */ + } + statement(ls); + } +} + + +static void fieldsel (LexState *ls, expdesc *v) { + /* fieldsel -> ['.' 
| ':'] NAME */ + FuncState *fs = ls->fs; + expdesc key; + luaK_exp2anyregup(fs, v); + luaX_next(ls); /* skip the dot or colon */ + checkname(ls, &key); + luaK_indexed(fs, v, &key); +} + + +static void yindex (LexState *ls, expdesc *v) { + /* index -> '[' expr ']' */ + luaX_next(ls); /* skip the '[' */ + expr(ls, v); + luaK_exp2val(ls->fs, v); + checknext(ls, ']'); +} + + +/* +** {====================================================================== +** Rules for Constructors +** ======================================================================= +*/ + + +struct ConsControl { + expdesc v; /* last list item read */ + expdesc *t; /* table descriptor */ + int nh; /* total number of `record' elements */ + int na; /* total number of array elements */ + int tostore; /* number of array elements pending to be stored */ +}; + + +static void recfield (LexState *ls, struct ConsControl *cc) { + /* recfield -> (NAME | `['exp1`]') = exp1 */ + FuncState *fs = ls->fs; + int reg = ls->fs->freereg; + expdesc key, val; + int rkkey; + if (ls->t.token == TK_NAME) { + checklimit(fs, cc->nh, MAX_INT, "items in a constructor"); + checkname(ls, &key); + } + else /* ls->t.token == '[' */ + yindex(ls, &key); + cc->nh++; + checknext(ls, '='); + rkkey = luaK_exp2RK(fs, &key); + expr(ls, &val); + luaK_codeABC(fs, OP_SETTABLE, cc->t->u.info, rkkey, luaK_exp2RK(fs, &val)); + fs->freereg = reg; /* free registers */ +} + + +static void closelistfield (FuncState *fs, struct ConsControl *cc) { + if (cc->v.k == VVOID) return; /* there is no list item */ + luaK_exp2nextreg(fs, &cc->v); + cc->v.k = VVOID; + if (cc->tostore == LFIELDS_PER_FLUSH) { + luaK_setlist(fs, cc->t->u.info, cc->na, cc->tostore); /* flush */ + cc->tostore = 0; /* no more items pending */ + } +} + + +static void lastlistfield (FuncState *fs, struct ConsControl *cc) { + if (cc->tostore == 0) return; + if (hasmultret(cc->v.k)) { + luaK_setmultret(fs, &cc->v); + luaK_setlist(fs, cc->t->u.info, cc->na, LUA_MULTRET); + cc->na--; /* do not count last expression (unknown number of elements) */ + } + else { + if (cc->v.k != VVOID) + luaK_exp2nextreg(fs, &cc->v); + luaK_setlist(fs, cc->t->u.info, cc->na, cc->tostore); + } +} + + +static void listfield (LexState *ls, struct ConsControl *cc) { + /* listfield -> exp */ + expr(ls, &cc->v); + checklimit(ls->fs, cc->na, MAX_INT, "items in a constructor"); + cc->na++; + cc->tostore++; +} + + +static void field (LexState *ls, struct ConsControl *cc) { + /* field -> listfield | recfield */ + switch(ls->t.token) { + case TK_NAME: { /* may be 'listfield' or 'recfield' */ + if (luaX_lookahead(ls) != '=') /* expression? 
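
Editor's note: constructor(), just after this point, sizes the new table with luaO_int2fb, which is declared in lobject.h but defined outside this hunk. The sketch below restates that "floating point byte" encoding as recalled from Lua 5.2 -- the byte eeeeexxx denotes (1xxx) * 2^(eeeee-1) when eeeee is nonzero -- so treat it as an unofficial illustration rather than the diff text.

#include <stdio.h>

/* restated from memory of Lua 5.2 lobject.c; an assumption of this sketch */
static int int2fb(unsigned int x) {
  int e = 0;                       /* exponent */
  if (x < 8) return (int)x;        /* small values are stored exactly */
  while (x >= 0x10) {              /* normalize mantissa into [8,16) */
    x = (x + 1) >> 1;              /* round up while halving */
    e++;
  }
  return ((e + 1) << 3) | ((int)x - 8);
}

static int fb2int(int x) {
  int e = (x >> 3) & 0x1f;
  return (e == 0) ? x : ((x & 7) + 8) << (e - 1);
}

int main(void) {
  unsigned int sizes[] = {0, 7, 9, 100, 1000};
  int i;
  for (i = 0; i < 5; i++) {
    int fb = int2fb(sizes[i]);
    /* fb2int(fb) >= sizes[i]: the encoding only ever rounds up */
    printf("%u -> fb=0x%02x -> %d\n", sizes[i], fb, fb2int(fb));
  }
  return 0;
}

The point of the byte encoding is that a table constructor's array and hash counts, however large, still fit in the B and C instruction fields.
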
*/ + listfield(ls, cc); + else + recfield(ls, cc); + break; + } + case '[': { + recfield(ls, cc); + break; + } + default: { + listfield(ls, cc); + break; + } + } +} + + +static void constructor (LexState *ls, expdesc *t) { + /* constructor -> '{' [ field { sep field } [sep] ] '}' + sep -> ',' | ';' */ + FuncState *fs = ls->fs; + int line = ls->linenumber; + int pc = luaK_codeABC(fs, OP_NEWTABLE, 0, 0, 0); + struct ConsControl cc; + cc.na = cc.nh = cc.tostore = 0; + cc.t = t; + init_exp(t, VRELOCABLE, pc); + init_exp(&cc.v, VVOID, 0); /* no value (yet) */ + luaK_exp2nextreg(ls->fs, t); /* fix it at stack top */ + checknext(ls, '{'); + do { + lua_assert(cc.v.k == VVOID || cc.tostore > 0); + if (ls->t.token == '}') break; + closelistfield(fs, &cc); + field(ls, &cc); + } while (testnext(ls, ',') || testnext(ls, ';')); + check_match(ls, '}', '{', line); + lastlistfield(fs, &cc); + SETARG_B(fs->f->code[pc], luaO_int2fb(cc.na)); /* set initial array size */ + SETARG_C(fs->f->code[pc], luaO_int2fb(cc.nh)); /* set initial table size */ +} + +/* }====================================================================== */ + + + +static void parlist (LexState *ls) { + /* parlist -> [ param { `,' param } ] */ + FuncState *fs = ls->fs; + Proto *f = fs->f; + int nparams = 0; + f->is_vararg = 0; + if (ls->t.token != ')') { /* is `parlist' not empty? */ + do { + switch (ls->t.token) { + case TK_NAME: { /* param -> NAME */ + new_localvar(ls, str_checkname(ls)); + nparams++; + break; + } + case TK_DOTS: { /* param -> `...' */ + luaX_next(ls); + f->is_vararg = 1; + break; + } + default: luaX_syntaxerror(ls, "<name> or " LUA_QL("...") " expected"); + } + } while (!f->is_vararg && testnext(ls, ',')); + } + adjustlocalvars(ls, nparams); + f->numparams = cast_byte(fs->nactvar); + luaK_reserveregs(fs, fs->nactvar); /* reserve register for parameters */ +} + + +static void body (LexState *ls, expdesc *e, int ismethod, int line) { + /* body -> `(' parlist `)' block END */ + FuncState new_fs; + BlockCnt bl; + new_fs.f = addprototype(ls); + new_fs.f->linedefined = line; + open_func(ls, &new_fs, &bl); + checknext(ls, '('); + if (ismethod) { + new_localvarliteral(ls, "self"); /* create 'self' parameter */ + adjustlocalvars(ls, 1); + } + parlist(ls); + checknext(ls, ')'); + statlist(ls); + new_fs.f->lastlinedefined = ls->linenumber; + check_match(ls, TK_END, TK_FUNCTION, line); + codeclosure(ls, e); + close_func(ls); +} + + +static int explist (LexState *ls, expdesc *v) { + /* explist -> expr { `,' expr } */ + int n = 1; /* at least one expression */ + expr(ls, v); + while (testnext(ls, ',')) { + luaK_exp2nextreg(ls->fs, v); + expr(ls, v); + n++; + } + return n; +} + + +static void funcargs (LexState *ls, expdesc *f, int line) { + FuncState *fs = ls->fs; + expdesc args; + int base, nparams; + switch (ls->t.token) { + case '(': { /* funcargs -> `(' [ explist ] `)' */ + luaX_next(ls); + if (ls->t.token == ')') /* arg list is empty? 
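
Editor's note on the funcargs code at this point: OP_CALL's operands follow the convention spelled out in the lopcodes.h Notes block -- B is the argument count plus one (0 meaning "all values up to the top"), C is the result count plus one. funcargs always emits C == 2 (one result); the parser adjusts C elsewhere for statement-level calls. A small decoder restating that rule, entirely this editor's sketch:

#include <stdio.h>

static void describe_call(int b, int c) {
  if (b == 0) printf("args: all values up to stack top");
  else        printf("args: %d", b - 1);
  if (c == 0) printf(", results: all (multret)\n");
  else        printf(", results: %d\n", c - 1);
}

int main(void) {
  describe_call(3, 2);  /* f(x, y) used as an expression: 2 args, 1 result */
  describe_call(1, 1);  /* f() as a statement: 0 args, 0 results */
  describe_call(0, 2);  /* f(g()): argument count only known at run time */
  return 0;
}

The "plus one" offset is what leaves 0 free to mean "multiple/unknown", matching the hasmultret handling in funcargs.
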
*/ + args.k = VVOID; + else { + explist(ls, &args); + luaK_setmultret(fs, &args); + } + check_match(ls, ')', '(', line); + break; + } + case '{': { /* funcargs -> constructor */ + constructor(ls, &args); + break; + } + case TK_STRING: { /* funcargs -> STRING */ + codestring(ls, &args, ls->t.seminfo.ts); + luaX_next(ls); /* must use `seminfo' before `next' */ + break; + } + default: { + luaX_syntaxerror(ls, "function arguments expected"); + } + } + lua_assert(f->k == VNONRELOC); + base = f->u.info; /* base register for call */ + if (hasmultret(args.k)) + nparams = LUA_MULTRET; /* open call */ + else { + if (args.k != VVOID) + luaK_exp2nextreg(fs, &args); /* close last argument */ + nparams = fs->freereg - (base+1); + } + init_exp(f, VCALL, luaK_codeABC(fs, OP_CALL, base, nparams+1, 2)); + luaK_fixline(fs, line); + fs->freereg = base+1; /* call remove function and arguments and leaves + (unless changed) one result */ +} + + + + +/* +** {====================================================================== +** Expression parsing +** ======================================================================= +*/ + + +static void primaryexp (LexState *ls, expdesc *v) { + /* primaryexp -> NAME | '(' expr ')' */ + switch (ls->t.token) { + case '(': { + int line = ls->linenumber; + luaX_next(ls); + expr(ls, v); + check_match(ls, ')', '(', line); + luaK_dischargevars(ls->fs, v); + return; + } + case TK_NAME: { + singlevar(ls, v); + return; + } + default: { + luaX_syntaxerror(ls, "unexpected symbol"); + } + } +} + + +static void suffixedexp (LexState *ls, expdesc *v) { + /* suffixedexp -> + primaryexp { '.' NAME | '[' exp ']' | ':' NAME funcargs | funcargs } */ + FuncState *fs = ls->fs; + int line = ls->linenumber; + primaryexp(ls, v); + for (;;) { + switch (ls->t.token) { + case '.': { /* fieldsel */ + fieldsel(ls, v); + break; + } + case '[': { /* `[' exp1 `]' */ + expdesc key; + luaK_exp2anyregup(fs, v); + yindex(ls, &key); + luaK_indexed(fs, v, &key); + break; + } + case ':': { /* `:' NAME funcargs */ + expdesc key; + luaX_next(ls); + checkname(ls, &key); + luaK_self(fs, v, &key); + funcargs(ls, v, line); + break; + } + case '(': case TK_STRING: case '{': { /* funcargs */ + luaK_exp2nextreg(fs, v); + funcargs(ls, v, line); + break; + } + default: return; + } + } +} + + +static void simpleexp (LexState *ls, expdesc *v) { + /* simpleexp -> NUMBER | STRING | NIL | TRUE | FALSE | ... 
| + constructor | FUNCTION body | suffixedexp */ + switch (ls->t.token) { + case TK_NUMBER: { + init_exp(v, VKNUM, 0); + v->u.nval = ls->t.seminfo.r; + break; + } + case TK_STRING: { + codestring(ls, v, ls->t.seminfo.ts); + break; + } + case TK_NIL: { + init_exp(v, VNIL, 0); + break; + } + case TK_TRUE: { + init_exp(v, VTRUE, 0); + break; + } + case TK_FALSE: { + init_exp(v, VFALSE, 0); + break; + } + case TK_DOTS: { /* vararg */ + FuncState *fs = ls->fs; + check_condition(ls, fs->f->is_vararg, + "cannot use " LUA_QL("...") " outside a vararg function"); + init_exp(v, VVARARG, luaK_codeABC(fs, OP_VARARG, 0, 1, 0)); + break; + } + case '{': { /* constructor */ + constructor(ls, v); + return; + } + case TK_FUNCTION: { + luaX_next(ls); + body(ls, v, 0, ls->linenumber); + return; + } + default: { + suffixedexp(ls, v); + return; + } + } + luaX_next(ls); +} + + +static UnOpr getunopr (int op) { + switch (op) { + case TK_NOT: return OPR_NOT; + case '-': return OPR_MINUS; + case '#': return OPR_LEN; + default: return OPR_NOUNOPR; + } +} + + +static BinOpr getbinopr (int op) { + switch (op) { + case '+': return OPR_ADD; + case '-': return OPR_SUB; + case '*': return OPR_MUL; + case '/': return OPR_DIV; + case '%': return OPR_MOD; + case '^': return OPR_POW; + case TK_CONCAT: return OPR_CONCAT; + case TK_NE: return OPR_NE; + case TK_EQ: return OPR_EQ; + case '<': return OPR_LT; + case TK_LE: return OPR_LE; + case '>': return OPR_GT; + case TK_GE: return OPR_GE; + case TK_AND: return OPR_AND; + case TK_OR: return OPR_OR; + default: return OPR_NOBINOPR; + } +} + + +static const struct { + lu_byte left; /* left priority for each binary operator */ + lu_byte right; /* right priority */ +} priority[] = { /* ORDER OPR */ + {6, 6}, {6, 6}, {7, 7}, {7, 7}, {7, 7}, /* `+' `-' `*' `/' `%' */ + {10, 9}, {5, 4}, /* ^, .. 
(right associative) */ + {3, 3}, {3, 3}, {3, 3}, /* ==, <, <= */ + {3, 3}, {3, 3}, {3, 3}, /* ~=, >, >= */ + {2, 2}, {1, 1} /* and, or */ +}; + +#define UNARY_PRIORITY 8 /* priority for unary operators */ + + +/* +** subexpr -> (simpleexp | unop subexpr) { binop subexpr } +** where `binop' is any binary operator with a priority higher than `limit' +*/ +static BinOpr subexpr (LexState *ls, expdesc *v, int limit) { + BinOpr op; + UnOpr uop; + enterlevel(ls); + uop = getunopr(ls->t.token); + if (uop != OPR_NOUNOPR) { + int line = ls->linenumber; + luaX_next(ls); + subexpr(ls, v, UNARY_PRIORITY); + luaK_prefix(ls->fs, uop, v, line); + } + else simpleexp(ls, v); + /* expand while operators have priorities higher than `limit' */ + op = getbinopr(ls->t.token); + while (op != OPR_NOBINOPR && priority[op].left > limit) { + expdesc v2; + BinOpr nextop; + int line = ls->linenumber; + luaX_next(ls); + luaK_infix(ls->fs, op, v); + /* read sub-expression with higher priority */ + nextop = subexpr(ls, &v2, priority[op].right); + luaK_posfix(ls->fs, op, v, &v2, line); + op = nextop; + } + leavelevel(ls); + return op; /* return first untreated operator */ +} + + +static void expr (LexState *ls, expdesc *v) { + subexpr(ls, v, 0); +} + +/* }==================================================================== */ + + + +/* +** {====================================================================== +** Rules for Statements +** ======================================================================= +*/ + + +static void block (LexState *ls) { + /* block -> statlist */ + FuncState *fs = ls->fs; + BlockCnt bl; + enterblock(fs, &bl, 0); + statlist(ls); + leaveblock(fs); +} + + +/* +** structure to chain all variables in the left-hand side of an +** assignment +*/ +struct LHS_assign { + struct LHS_assign *prev; + expdesc v; /* variable (global, local, upvalue, or indexed) */ +}; + + +/* +** check whether, in an assignment to an upvalue/local variable, the +** upvalue/local variable is begin used in a previous assignment to a +** table. If so, save original upvalue/local value in a safe place and +** use this safe copy in the previous assignment. +*/ +static void check_conflict (LexState *ls, struct LHS_assign *lh, expdesc *v) { + FuncState *fs = ls->fs; + int extra = fs->freereg; /* eventual position to save local variable */ + int conflict = 0; + for (; lh; lh = lh->prev) { /* check all previous assignments */ + if (lh->v.k == VINDEXED) { /* assigning to a table? */ + /* table is the upvalue/local being assigned now? */ + if (lh->v.u.ind.vt == v->k && lh->v.u.ind.t == v->u.info) { + conflict = 1; + lh->v.u.ind.vt = VLOCAL; + lh->v.u.ind.t = extra; /* previous assignment will use safe copy */ + } + /* index is the local being assigned? (index cannot be upvalue) */ + if (v->k == VLOCAL && lh->v.u.ind.idx == v->u.info) { + conflict = 1; + lh->v.u.ind.idx = extra; /* previous assignment will use safe copy */ + } + } + } + if (conflict) { + /* copy upvalue/local value to a temporary (in position 'extra') */ + OpCode op = (v->k == VLOCAL) ? 
OP_MOVE : OP_GETUPVAL; + luaK_codeABC(fs, op, extra, v->u.info, 0); + luaK_reserveregs(fs, 1); + } +} + + +static void assignment (LexState *ls, struct LHS_assign *lh, int nvars) { + expdesc e; + check_condition(ls, vkisvar(lh->v.k), "syntax error"); + if (testnext(ls, ',')) { /* assignment -> ',' suffixedexp assignment */ + struct LHS_assign nv; + nv.prev = lh; + suffixedexp(ls, &nv.v); + if (nv.v.k != VINDEXED) + check_conflict(ls, lh, &nv.v); + checklimit(ls->fs, nvars + ls->L->nCcalls, LUAI_MAXCCALLS, + "C levels"); + assignment(ls, &nv, nvars+1); + } + else { /* assignment -> `=' explist */ + int nexps; + checknext(ls, '='); + nexps = explist(ls, &e); + if (nexps != nvars) { + adjust_assign(ls, nvars, nexps, &e); + if (nexps > nvars) + ls->fs->freereg -= nexps - nvars; /* remove extra values */ + } + else { + luaK_setoneret(ls->fs, &e); /* close last expression */ + luaK_storevar(ls->fs, &lh->v, &e); + return; /* avoid default */ + } + } + init_exp(&e, VNONRELOC, ls->fs->freereg-1); /* default assignment */ + luaK_storevar(ls->fs, &lh->v, &e); +} + + +static int cond (LexState *ls) { + /* cond -> exp */ + expdesc v; + expr(ls, &v); /* read condition */ + if (v.k == VNIL) v.k = VFALSE; /* `falses' are all equal here */ + luaK_goiftrue(ls->fs, &v); + return v.f; +} + + +static void gotostat (LexState *ls, int pc) { + int line = ls->linenumber; + TString *label; + int g; + if (testnext(ls, TK_GOTO)) + label = str_checkname(ls); + else { + luaX_next(ls); /* skip break */ + label = luaS_new(ls->L, "break"); + } + g = newlabelentry(ls, &ls->dyd->gt, label, line, pc); + findlabel(ls, g); /* close it if label already defined */ +} + + +/* check for repeated labels on the same block */ +static void checkrepeated (FuncState *fs, Labellist *ll, TString *label) { + int i; + for (i = fs->bl->firstlabel; i < ll->n; i++) { + if (luaS_eqstr(label, ll->arr[i].name)) { + const char *msg = luaO_pushfstring(fs->ls->L, + "label " LUA_QS " already defined on line %d", + getstr(label), ll->arr[i].line); + semerror(fs->ls, msg); + } + } +} + + +/* skip no-op statements */ +static void skipnoopstat (LexState *ls) { + while (ls->t.token == ';' || ls->t.token == TK_DBCOLON) + statement(ls); +} + + +static void labelstat (LexState *ls, TString *label, int line) { + /* label -> '::' NAME '::' */ + FuncState *fs = ls->fs; + Labellist *ll = &ls->dyd->label; + int l; /* index of new label being created */ + checkrepeated(fs, ll, label); /* check for repeated labels */ + checknext(ls, TK_DBCOLON); /* skip double colon */ + /* create new entry for this label */ + l = newlabelentry(ls, ll, label, line, fs->pc); + skipnoopstat(ls); /* skip other no-op statements */ + if (block_follow(ls, 0)) { /* label is last no-op statement in the block? 
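
Editor's note: subexpr and the priority table earlier in this file are textbook precedence climbing, with right-associativity expressed as a right priority lower than the left one (see the '^' and '..' rows). A minimal arithmetic-only evaluator in the same shape, entirely this editor's sketch:

#include <stdio.h>

static const char *p;  /* cursor into the expression string */

static double subexpr(int limit);

static double simpleexp(void) {
  if (*p == '(') {                 /* '(' expr ')' */
    double v;
    p++;
    v = subexpr(0);
    p++;                           /* skip ')' */
    return v;
  }
  return (double)(*p++ - '0');     /* single decimal digit */
}

static int leftpri(char op) {
  return (op=='+' || op=='-') ? 6 : (op=='*' || op=='/') ? 7 : (op=='^') ? 10 : 0;
}
static int rightpri(char op) {
  return (op=='^') ? 9 : leftpri(op);   /* '^': right < left => right-assoc */
}

static double apply(char op, double a, double b) {
  switch (op) {
    case '+': return a + b;
    case '-': return a - b;
    case '*': return a * b;
    case '/': return a / b;
    default: {                     /* '^' for small integer exponents */
      double r = 1; int i;
      for (i = 0; i < (int)b; i++) r *= a;
      return r;
    }
  }
}

static double subexpr(int limit) {
  double v = simpleexp();
  while (leftpri(*p) > limit) {    /* operator binds tighter than caller's? */
    char op = *p++;
    double v2 = subexpr(rightpri(op));  /* read right operand first */
    v = apply(op, v, v2);
  }
  return v;
}

int main(void) {
  p = "1+2*3";   printf("%g\n", subexpr(0));   /* 7: '*' binds tighter */
  p = "2^3^2";   printf("%g\n", subexpr(0));   /* 512: '^' right-associative */
  p = "(1+2)*3"; printf("%g\n", subexpr(0));   /* 9 */
  return 0;
}

The real subexpr additionally threads code generation (luaK_infix/luaK_posfix) through the same recursion, but the control flow is identical.
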
*/ + /* assume that locals are already out of scope */ + ll->arr[l].nactvar = fs->bl->nactvar; + } + findgotos(ls, &ll->arr[l]); +} + + +static void whilestat (LexState *ls, int line) { + /* whilestat -> WHILE cond DO block END */ + FuncState *fs = ls->fs; + int whileinit; + int condexit; + BlockCnt bl; + luaX_next(ls); /* skip WHILE */ + whileinit = luaK_getlabel(fs); + condexit = cond(ls); + enterblock(fs, &bl, 1); + checknext(ls, TK_DO); + block(ls); + luaK_jumpto(fs, whileinit); + check_match(ls, TK_END, TK_WHILE, line); + leaveblock(fs); + luaK_patchtohere(fs, condexit); /* false conditions finish the loop */ +} + + +static void repeatstat (LexState *ls, int line) { + /* repeatstat -> REPEAT block UNTIL cond */ + int condexit; + FuncState *fs = ls->fs; + int repeat_init = luaK_getlabel(fs); + BlockCnt bl1, bl2; + enterblock(fs, &bl1, 1); /* loop block */ + enterblock(fs, &bl2, 0); /* scope block */ + luaX_next(ls); /* skip REPEAT */ + statlist(ls); + check_match(ls, TK_UNTIL, TK_REPEAT, line); + condexit = cond(ls); /* read condition (inside scope block) */ + if (bl2.upval) /* upvalues? */ + luaK_patchclose(fs, condexit, bl2.nactvar); + leaveblock(fs); /* finish scope */ + luaK_patchlist(fs, condexit, repeat_init); /* close the loop */ + leaveblock(fs); /* finish loop */ +} + + +static int exp1 (LexState *ls) { + expdesc e; + int reg; + expr(ls, &e); + luaK_exp2nextreg(ls->fs, &e); + lua_assert(e.k == VNONRELOC); + reg = e.u.info; + return reg; +} + + +static void forbody (LexState *ls, int base, int line, int nvars, int isnum) { + /* forbody -> DO block */ + BlockCnt bl; + FuncState *fs = ls->fs; + int prep, endfor; + adjustlocalvars(ls, 3); /* control variables */ + checknext(ls, TK_DO); + prep = isnum ? luaK_codeAsBx(fs, OP_FORPREP, base, NO_JUMP) : luaK_jump(fs); + enterblock(fs, &bl, 0); /* scope for declared variables */ + adjustlocalvars(ls, nvars); + luaK_reserveregs(fs, nvars); + block(ls); + leaveblock(fs); /* end of scope for declared variables */ + luaK_patchtohere(fs, prep); + if (isnum) /* numeric for? 
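repeatstat() above enters two blocks, an outer loop block and an inner scope block, precisely so that cond() is compiled while the body's locals are still active: unlike every other statement, 'until' can test a variable declared inside the loop. A hedged demo (standard interpreter assumed):

#include "lua.h"
#include "lauxlib.h"
#include "lualib.h"

int main(void) {
  lua_State *L = luaL_newstate();
  luaL_openlibs(L);
  luaL_dostring(L,
      "local i = 0\n"
      "repeat\n"
      "  local done = (i == 3)   -- local declared in the body...\n"
      "  i = i + 1\n"
      "until done                -- ...is still visible to the condition\n"
      "print(i)   --> 4\n");
  lua_close(L);
  return 0;
}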
*/ + endfor = luaK_codeAsBx(fs, OP_FORLOOP, base, NO_JUMP); + else { /* generic for */ + luaK_codeABC(fs, OP_TFORCALL, base, 0, nvars); + luaK_fixline(fs, line); + endfor = luaK_codeAsBx(fs, OP_TFORLOOP, base + 2, NO_JUMP); + } + luaK_patchlist(fs, endfor, prep + 1); + luaK_fixline(fs, line); +} + + +static void fornum (LexState *ls, TString *varname, int line) { + /* fornum -> NAME = exp1,exp1[,exp1] forbody */ + FuncState *fs = ls->fs; + int base = fs->freereg; + new_localvarliteral(ls, "(for index)"); + new_localvarliteral(ls, "(for limit)"); + new_localvarliteral(ls, "(for step)"); + new_localvar(ls, varname); + checknext(ls, '='); + exp1(ls); /* initial value */ + checknext(ls, ','); + exp1(ls); /* limit */ + if (testnext(ls, ',')) + exp1(ls); /* optional step */ + else { /* default step = 1 */ + luaK_codek(fs, fs->freereg, luaK_numberK(fs, 1)); + luaK_reserveregs(fs, 1); + } + forbody(ls, base, line, 1, 1); +} + + +static void forlist (LexState *ls, TString *indexname) { + /* forlist -> NAME {,NAME} IN explist forbody */ + FuncState *fs = ls->fs; + expdesc e; + int nvars = 4; /* gen, state, control, plus at least one declared var */ + int line; + int base = fs->freereg; + /* create control variables */ + new_localvarliteral(ls, "(for generator)"); + new_localvarliteral(ls, "(for state)"); + new_localvarliteral(ls, "(for control)"); + /* create declared variables */ + new_localvar(ls, indexname); + while (testnext(ls, ',')) { + new_localvar(ls, str_checkname(ls)); + nvars++; + } + checknext(ls, TK_IN); + line = ls->linenumber; + adjust_assign(ls, 3, explist(ls, &e), &e); + luaK_checkstack(fs, 3); /* extra space to call generator */ + forbody(ls, base, line, nvars - 3, 0); +} + + +static void forstat (LexState *ls, int line) { + /* forstat -> FOR (fornum | forlist) END */ + FuncState *fs = ls->fs; + TString *varname; + BlockCnt bl; + enterblock(fs, &bl, 1); /* scope for loop and control variables */ + luaX_next(ls); /* skip `for' */ + varname = str_checkname(ls); /* first variable name */ + switch (ls->t.token) { + case '=': fornum(ls, varname, line); break; + case ',': case TK_IN: forlist(ls, varname); break; + default: luaX_syntaxerror(ls, LUA_QL("=") " or " LUA_QL("in") " expected"); + } + check_match(ls, TK_END, TK_FOR, line); + leaveblock(fs); /* loop scope (`break' jumps to this point) */ +} + + +static void test_then_block (LexState *ls, int *escapelist) { + /* test_then_block -> [IF | ELSEIF] cond THEN block */ + BlockCnt bl; + FuncState *fs = ls->fs; + expdesc v; + int jf; /* instruction to skip 'then' code (if condition is false) */ + luaX_next(ls); /* skip IF or ELSEIF */ + expr(ls, &v); /* read condition */ + checknext(ls, TK_THEN); + if (ls->t.token == TK_GOTO || ls->t.token == TK_BREAK) { + luaK_goiffalse(ls->fs, &v); /* will jump to label if condition is true */ + enterblock(fs, &bl, 0); /* must enter block before 'goto' */ + gotostat(ls, v.t); /* handle goto/break */ + skipnoopstat(ls); /* skip other no-op statements */ + if (block_follow(ls, 0)) { /* 'goto' is the entire block? */ + leaveblock(fs); + return; /* and that is it */ + } + else /* must skip over 'then' part if condition is false */ + jf = luaK_jump(fs); + } + else { /* regular case (not goto/break) */ + luaK_goiftrue(ls->fs, &v); /* skip over block if condition is false */ + enterblock(fs, &bl, 0); + jf = v.f; + } + statlist(ls); /* `then' part */ + leaveblock(fs); + if (ls->t.token == TK_ELSE || + ls->t.token == TK_ELSEIF) /* followed by 'else'/'elseif'? 
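fornum() above allocates three hidden control variables, '(for index)', '(for limit)' and '(for step)', below the user-visible one, and forbody() drives them with an OP_FORPREP/OP_FORLOOP pair. In rough C terms the two opcodes amount to the following (a deliberate simplification: the real VM also checks that all three values are numbers and keeps the declared variable in its own register):

#include <stdio.h>

int main(void) {
  double init = 1, limit = 10, step = 2;   /* for i = 1, 10, 2 */
  double idx = init - step;                /* OP_FORPREP: back up one step */
  for (;;) {
    idx += step;                           /* OP_FORLOOP: advance index */
    if (step > 0 ? idx > limit : idx < limit)
      break;                               /* fall out of the loop */
    double i = idx;                        /* copy to the declared variable */
    printf("%g\n", i);                     /* loop body: 1 3 5 7 9 */
  }
  return 0;
}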
*/ + luaK_concat(fs, escapelist, luaK_jump(fs)); /* must jump over it */ + luaK_patchtohere(fs, jf); +} + + +static void ifstat (LexState *ls, int line) { + /* ifstat -> IF cond THEN block {ELSEIF cond THEN block} [ELSE block] END */ + FuncState *fs = ls->fs; + int escapelist = NO_JUMP; /* exit list for finished parts */ + test_then_block(ls, &escapelist); /* IF cond THEN block */ + while (ls->t.token == TK_ELSEIF) + test_then_block(ls, &escapelist); /* ELSEIF cond THEN block */ + if (testnext(ls, TK_ELSE)) + block(ls); /* `else' part */ + check_match(ls, TK_END, TK_IF, line); + luaK_patchtohere(fs, escapelist); /* patch escape list to 'if' end */ +} + + +static void localfunc (LexState *ls) { + expdesc b; + FuncState *fs = ls->fs; + new_localvar(ls, str_checkname(ls)); /* new local variable */ + adjustlocalvars(ls, 1); /* enter its scope */ + body(ls, &b, 0, ls->linenumber); /* function created in next register */ + /* debug information will only see the variable after this point! */ + getlocvar(fs, b.u.info)->startpc = fs->pc; +} + + +static void localstat (LexState *ls) { + /* stat -> LOCAL NAME {`,' NAME} [`=' explist] */ + int nvars = 0; + int nexps; + expdesc e; + do { + new_localvar(ls, str_checkname(ls)); + nvars++; + } while (testnext(ls, ',')); + if (testnext(ls, '=')) + nexps = explist(ls, &e); + else { + e.k = VVOID; + nexps = 0; + } + adjust_assign(ls, nvars, nexps, &e); + adjustlocalvars(ls, nvars); +} + + +static int funcname (LexState *ls, expdesc *v) { + /* funcname -> NAME {fieldsel} [`:' NAME] */ + int ismethod = 0; + singlevar(ls, v); + while (ls->t.token == '.') + fieldsel(ls, v); + if (ls->t.token == ':') { + ismethod = 1; + fieldsel(ls, v); + } + return ismethod; +} + + +static void funcstat (LexState *ls, int line) { + /* funcstat -> FUNCTION funcname body */ + int ismethod; + expdesc v, b; + luaX_next(ls); /* skip FUNCTION */ + ismethod = funcname(ls, &v); + body(ls, &b, ismethod, line); + luaK_storevar(ls->fs, &v, &b); + luaK_fixline(ls->fs, line); /* definition `happens' in the first line */ +} + + +static void exprstat (LexState *ls) { + /* stat -> func | assignment */ + FuncState *fs = ls->fs; + struct LHS_assign v; + suffixedexp(ls, &v.v); + if (ls->t.token == '=' || ls->t.token == ',') { /* stat -> assignment ? */ + v.prev = NULL; + assignment(ls, &v, 1); + } + else { /* stat -> func */ + check_condition(ls, v.v.k == VCALL, "syntax error"); + SETARG_C(getcode(fs, &v.v), 1); /* call statement uses no results */ + } +} + + +static void retstat (LexState *ls) { + /* stat -> RETURN [explist] [';'] */ + FuncState *fs = ls->fs; + expdesc e; + int first, nret; /* registers with returned values */ + if (block_follow(ls, 1) || ls->t.token == ';') + first = nret = 0; /* return no values */ + else { + nret = explist(ls, &e); /* optional return values */ + if (hasmultret(e.k)) { + luaK_setmultret(fs, &e); + if (e.k == VCALL && nret == 1) { /* tail call? */ + SET_OPCODE(getcode(fs,&e), OP_TAILCALL); + lua_assert(GETARG_A(getcode(fs,&e)) == fs->nactvar); + } + first = fs->nactvar; + nret = LUA_MULTRET; /* return all values */ + } + else { + if (nret == 1) /* only one single value? 
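localfunc() above calls adjustlocalvars() before compiling the body, which is why 'local function f' behaves like 'local f; f = function ... end': the name is already in scope inside the body, so recursion binds to the local itself rather than to a global. Hedged demo via the public API:

#include "lua.h"
#include "lauxlib.h"
#include "lualib.h"

int main(void) {
  lua_State *L = luaL_newstate();
  luaL_openlibs(L);
  luaL_dostring(L,
      "local function fact(n)\n"
      "  if n < 2 then return 1 end\n"
      "  return n * fact(n - 1)\n"     /* resolves to the local itself */
      "end\n"
      "print(fact(5))   --> 120\n");
  lua_close(L);
  return 0;
}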
*/ + first = luaK_exp2anyreg(fs, &e); + else { + luaK_exp2nextreg(fs, &e); /* values must go to the `stack' */ + first = fs->nactvar; /* return all `active' values */ + lua_assert(nret == fs->freereg - first); + } + } + } + luaK_ret(fs, first, nret); + testnext(ls, ';'); /* skip optional semicolon */ +} + + +static void statement (LexState *ls) { + int line = ls->linenumber; /* may be needed for error messages */ + enterlevel(ls); + switch (ls->t.token) { + case ';': { /* stat -> ';' (empty statement) */ + luaX_next(ls); /* skip ';' */ + break; + } + case TK_IF: { /* stat -> ifstat */ + ifstat(ls, line); + break; + } + case TK_WHILE: { /* stat -> whilestat */ + whilestat(ls, line); + break; + } + case TK_DO: { /* stat -> DO block END */ + luaX_next(ls); /* skip DO */ + block(ls); + check_match(ls, TK_END, TK_DO, line); + break; + } + case TK_FOR: { /* stat -> forstat */ + forstat(ls, line); + break; + } + case TK_REPEAT: { /* stat -> repeatstat */ + repeatstat(ls, line); + break; + } + case TK_FUNCTION: { /* stat -> funcstat */ + funcstat(ls, line); + break; + } + case TK_LOCAL: { /* stat -> localstat */ + luaX_next(ls); /* skip LOCAL */ + if (testnext(ls, TK_FUNCTION)) /* local function? */ + localfunc(ls); + else + localstat(ls); + break; + } + case TK_DBCOLON: { /* stat -> label */ + luaX_next(ls); /* skip double colon */ + labelstat(ls, str_checkname(ls), line); + break; + } + case TK_RETURN: { /* stat -> retstat */ + luaX_next(ls); /* skip RETURN */ + retstat(ls); + break; + } + case TK_BREAK: /* stat -> breakstat */ + case TK_GOTO: { /* stat -> 'goto' NAME */ + gotostat(ls, luaK_jump(ls->fs)); + break; + } + default: { /* stat -> func | assignment */ + exprstat(ls); + break; + } + } + lua_assert(ls->fs->f->maxstacksize >= ls->fs->freereg && + ls->fs->freereg >= ls->fs->nactvar); + ls->fs->freereg = ls->fs->nactvar; /* free registers */ + leavelevel(ls); +} + +/* }====================================================================== */ + + +/* +** compiles the main function, which is a regular vararg function with an +** upvalue named LUA_ENV +*/ +static void mainfunc (LexState *ls, FuncState *fs) { + BlockCnt bl; + expdesc v; + open_func(ls, fs, &bl); + fs->f->is_vararg = 1; /* main function is always vararg */ + init_exp(&v, VLOCAL, 0); /* create and... 
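In retstat() above, a return whose expression list is exactly one function call is rewritten into OP_TAILCALL, so the callee reuses the caller's frame. That is what makes unbounded tail recursion run in constant stack; a hedged demo:

#include "lua.h"
#include "lauxlib.h"
#include "lualib.h"

int main(void) {
  lua_State *L = luaL_newstate();
  luaL_openlibs(L);
  luaL_dostring(L,
      "local function loop(n)\n"
      "  if n == 0 then return 'done' end\n"
      "  return loop(n - 1)   -- single call: compiled as OP_TAILCALL\n"
      "end\n"
      "print(loop(1000000))   --> done, with no stack growth\n");
  lua_close(L);
  return 0;
}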
*/ + newupvalue(fs, ls->envn, &v); /* ...set environment upvalue */ + luaX_next(ls); /* read first token */ + statlist(ls); /* parse main body */ + check(ls, TK_EOS); + close_func(ls); +} + + +Closure *luaY_parser (lua_State *L, ZIO *z, Mbuffer *buff, + Dyndata *dyd, const char *name, int firstchar) { + LexState lexstate; + FuncState funcstate; + Closure *cl = luaF_newLclosure(L, 1); /* create main closure */ + /* anchor closure (to avoid being collected) */ + setclLvalue(L, L->top, cl); + incr_top(L); + funcstate.f = cl->l.p = luaF_newproto(L); + funcstate.f->source = luaS_new(L, name); /* create and anchor TString */ + lexstate.buff = buff; + lexstate.dyd = dyd; + dyd->actvar.n = dyd->gt.n = dyd->label.n = 0; + luaX_setinput(L, &lexstate, z, funcstate.f->source, firstchar); + mainfunc(&lexstate, &funcstate); + lua_assert(!funcstate.prev && funcstate.nups == 1 && !lexstate.fs); + /* all scopes should be correctly finished */ + lua_assert(dyd->actvar.n == 0 && dyd->gt.n == 0 && dyd->label.n == 0); + return cl; /* it's on the stack too */ +} + diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lparser.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lparser.h new file mode 100644 index 000000000000..0346e3c41a80 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lparser.h @@ -0,0 +1,119 @@ +/* +** $Id: lparser.h,v 1.70.1.1 2013/04/12 18:48:47 roberto Exp $ +** Lua Parser +** See Copyright Notice in lua.h +*/ + +#ifndef lparser_h +#define lparser_h + +#include "llimits.h" +#include "lobject.h" +#include "lzio.h" + + +/* +** Expression descriptor +*/ + +typedef enum { + VVOID, /* no value */ + VNIL, + VTRUE, + VFALSE, + VK, /* info = index of constant in `k' */ + VKNUM, /* nval = numerical value */ + VNONRELOC, /* info = result register */ + VLOCAL, /* info = local register */ + VUPVAL, /* info = index of upvalue in 'upvalues' */ + VINDEXED, /* t = table register/upvalue; idx = index R/K */ + VJMP, /* info = instruction pc */ + VRELOCABLE, /* info = instruction pc */ + VCALL, /* info = instruction pc */ + VVARARG /* info = instruction pc */ +} expkind; + + +#define vkisvar(k) (VLOCAL <= (k) && (k) <= VINDEXED) +#define vkisinreg(k) ((k) == VNONRELOC || (k) == VLOCAL) + +typedef struct expdesc { + expkind k; + union { + struct { /* for indexed variables (VINDEXED) */ + short idx; /* index (R/K) */ + lu_byte t; /* table (register or upvalue) */ + lu_byte vt; /* whether 't' is register (VLOCAL) or upvalue (VUPVAL) */ + } ind; + int info; /* for generic use */ + lua_Number nval; /* for VKNUM */ + } u; + int t; /* patch list of `exit when true' */ + int f; /* patch list of `exit when false' */ +} expdesc; + + +/* description of active local variable */ +typedef struct Vardesc { + short idx; /* variable index in stack */ +} Vardesc; + + +/* description of pending goto statements and label statements */ +typedef struct Labeldesc { + TString *name; /* label identifier */ + int pc; /* position in code */ + int line; /* line where it appeared */ + lu_byte nactvar; /* local level where it appears in current block */ +} Labeldesc; + + +/* list of labels or gotos */ +typedef struct Labellist { + Labeldesc *arr; /* array */ + int n; /* number of entries in use */ + int size; /* array size */ +} Labellist; + + +/* dynamic structures used by the parser */ +typedef struct Dyndata { + struct { /* list of active local variables */ + Vardesc *arr; + int n; + int size; + } actvar; + Labellist gt; /* list of pending gotos */ + Labellist label; /* list of active labels */ +} 
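luaY_parser above is what the load functions reach for textual chunks: it compiles the source into a closure (a vararg main function with _ENV as its single upvalue) and leaves it anchored on the stack. From the embedding side, compilation and execution are separate steps (hedged sketch, standard lauxlib assumed):

#include <stdio.h>
#include "lua.h"
#include "lauxlib.h"

int main(void) {
  lua_State *L = luaL_newstate();
  /* luaL_loadstring compiles but does not run: on success the closure
     produced by luaY_parser sits on top of the stack */
  if (luaL_loadstring(L, "return 6 * 7") == LUA_OK) {
    lua_call(L, 0, 1);
    printf("%d\n", (int)lua_tointeger(L, -1));   /* 42 */
  }
  else
    fprintf(stderr, "parse error: %s\n", lua_tostring(L, -1));
  lua_close(L);
  return 0;
}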
Dyndata; + + +/* control of blocks */ +struct BlockCnt; /* defined in lparser.c */ + + +/* state needed to generate code for a given function */ +typedef struct FuncState { + Proto *f; /* current function header */ + Table *h; /* table to find (and reuse) elements in `k' */ + struct FuncState *prev; /* enclosing function */ + struct LexState *ls; /* lexical state */ + struct BlockCnt *bl; /* chain of current blocks */ + int pc; /* next position to code (equivalent to `ncode') */ + int lasttarget; /* 'label' of last 'jump label' */ + int jpc; /* list of pending jumps to `pc' */ + int nk; /* number of elements in `k' */ + int np; /* number of elements in `p' */ + int firstlocal; /* index of first local var (in Dyndata array) */ + short nlocvars; /* number of elements in 'f->locvars' */ + lu_byte nactvar; /* number of active local variables */ + lu_byte nups; /* number of upvalues */ + lu_byte freereg; /* first free register */ +} FuncState; + + +LUAI_FUNC Closure *luaY_parser (lua_State *L, ZIO *z, Mbuffer *buff, + Dyndata *dyd, const char *name, int firstchar); + + +#endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstate.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstate.c new file mode 100644 index 000000000000..b98ce5c2b52b --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstate.c @@ -0,0 +1,321 @@ +/* +** $Id: lstate.c,v 2.99.1.2 2013/11/08 17:45:31 roberto Exp $ +** Global State +** See Copyright Notice in lua.h +*/ + + +#include <sys/zfs_context.h> + +#define lstate_c +#define LUA_CORE + +#include "lua.h" + +#include "lapi.h" +#include "ldebug.h" +#include "ldo.h" +#include "lfunc.h" +#include "lgc.h" +#include "llex.h" +#include "lmem.h" +#include "lstate.h" +#include "lstring.h" +#include "ltable.h" +#include "ltm.h" + + +#if !defined(LUAI_GCPAUSE) +#define LUAI_GCPAUSE 200 /* 200% */ +#endif + +#if !defined(LUAI_GCMAJOR) +#define LUAI_GCMAJOR 200 /* 200% */ +#endif + +#if !defined(LUAI_GCMUL) +#define LUAI_GCMUL 200 /* GC runs 'twice the speed' of memory allocation */ +#endif + + +#define MEMERRMSG "not enough memory" + + +/* +** a macro to help the creation of a unique random seed when a state is +** created; the seed is used to randomize hashes. +*/ +#if !defined(luai_makeseed) +#define luai_makeseed() cast(unsigned int, gethrtime()) +#endif + + + +/* +** thread state + extra space +*/ +typedef struct LX { +#if defined(LUAI_EXTRASPACE) + char buff[LUAI_EXTRASPACE]; +#endif + lua_State l; +} LX; + + +/* +** Main thread combines a thread state and the global state +*/ +typedef struct LG { + LX l; + global_State g; +} LG; + + + +#define fromstate(L) (cast(LX *, cast(lu_byte *, (L)) - offsetof(LX, l))) + + +/* +** Compute an initial seed as random as possible. In ANSI, rely on +** Address Space Layout Randomization (if present) to increase +** randomness.. 
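The LX/LG wrappers and fromstate() above are the container-of idiom: given only a lua_State pointer, subtracting offsetof(LX, l) recovers the enclosing allocation, which is how LUAI_EXTRASPACE bytes can be kept in front of every thread. A reduced stand-alone illustration with invented names:

#include <stdio.h>
#include <stddef.h>

struct inner { int x; };
struct outer { char extra[16]; struct inner in; };

#define outer_of(p) \
  ((struct outer *)((char *)(p) - offsetof(struct outer, in)))

int main(void) {
  struct outer o = { "spare", { 42 } };
  struct inner *ip = &o.in;            /* only the inner pointer is known */
  printf("%d\n", outer_of(ip)->in.x);  /* 42, via the recovered outer */
  return 0;
}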
+*/ +#define addbuff(b,p,e) \ + { size_t t = cast(size_t, e); \ + memcpy(buff + p, &t, sizeof(t)); p += sizeof(t); } + +static unsigned int makeseed (lua_State *L) { + char buff[4 * sizeof(size_t)]; + unsigned int h = luai_makeseed(); + int p = 0; + addbuff(buff, p, L); /* heap variable */ + addbuff(buff, p, &h); /* local variable */ + addbuff(buff, p, luaO_nilobject); /* global variable */ + addbuff(buff, p, &lua_newstate); /* public function */ + lua_assert(p == sizeof(buff)); + return luaS_hash(buff, p, h); +} + + +/* +** set GCdebt to a new value keeping the value (totalbytes + GCdebt) +** invariant +*/ +void luaE_setdebt (global_State *g, l_mem debt) { + g->totalbytes -= (debt - g->GCdebt); + g->GCdebt = debt; +} + + +CallInfo *luaE_extendCI (lua_State *L) { + CallInfo *ci = luaM_new(L, CallInfo); + lua_assert(L->ci->next == NULL); + L->ci->next = ci; + ci->previous = L->ci; + ci->next = NULL; + return ci; +} + + +void luaE_freeCI (lua_State *L) { + CallInfo *ci = L->ci; + CallInfo *next = ci->next; + ci->next = NULL; + while ((ci = next) != NULL) { + next = ci->next; + luaM_free(L, ci); + } +} + + +static void stack_init (lua_State *L1, lua_State *L) { + int i; CallInfo *ci; + /* initialize stack array */ + L1->stack = luaM_newvector(L, BASIC_STACK_SIZE, TValue); + L1->stacksize = BASIC_STACK_SIZE; + for (i = 0; i < BASIC_STACK_SIZE; i++) + setnilvalue(L1->stack + i); /* erase new stack */ + L1->top = L1->stack; + L1->stack_last = L1->stack + L1->stacksize - EXTRA_STACK; + /* initialize first ci */ + ci = &L1->base_ci; + ci->next = ci->previous = NULL; + ci->callstatus = 0; + ci->func = L1->top; + setnilvalue(L1->top++); /* 'function' entry for this 'ci' */ + ci->top = L1->top + LUA_MINSTACK; + L1->ci = ci; +} + + +static void freestack (lua_State *L) { + if (L->stack == NULL) + return; /* stack not completely built yet */ + L->ci = &L->base_ci; /* free the entire 'ci' list */ + luaE_freeCI(L); + luaM_freearray(L, L->stack, L->stacksize); /* free stack array */ +} + + +/* +** Create registry table and its predefined values +*/ +static void init_registry (lua_State *L, global_State *g) { + TValue mt; + /* create registry */ + Table *registry = luaH_new(L); + sethvalue(L, &g->l_registry, registry); + luaH_resize(L, registry, LUA_RIDX_LAST, 0); + /* registry[LUA_RIDX_MAINTHREAD] = L */ + setthvalue(L, &mt, L); + luaH_setint(L, registry, LUA_RIDX_MAINTHREAD, &mt); + /* registry[LUA_RIDX_GLOBALS] = table of globals */ + sethvalue(L, &mt, luaH_new(L)); + luaH_setint(L, registry, LUA_RIDX_GLOBALS, &mt); +} + + +/* +** open parts of the state that may cause memory-allocation errors +*/ +static void f_luaopen (lua_State *L, void *ud) { + global_State *g = G(L); + UNUSED(ud); + stack_init(L, L); /* init stack */ + init_registry(L, g); + luaS_resize(L, MINSTRTABSIZE); /* initial size of string table */ + luaT_init(L); + luaX_init(L); + /* pre-create memory-error message */ + g->memerrmsg = luaS_newliteral(L, MEMERRMSG); + luaS_fix(g->memerrmsg); /* it should never be collected */ + g->gcrunning = 1; /* allow gc */ + g->version = lua_version(NULL); + luai_userstateopen(L); +} + + +/* +** preinitialize a state with consistent values without allocating +** any memory (to avoid errors) +*/ +static void preinit_state (lua_State *L, global_State *g) { + G(L) = g; + L->stack = NULL; + L->ci = NULL; + L->stacksize = 0; + L->errorJmp = NULL; + L->nCcalls = 0; + L->hook = NULL; + L->hookmask = 0; + L->basehookcount = 0; + L->allowhook = 1; + resethookcount(L); + L->openupval = NULL; + L->nny = 1; + 
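luaE_setdebt() above never changes how much memory Lua believes is allocated; it only moves the split between 'totalbytes' and 'GCdebt' so that their sum (gettotalbytes() in lstate.h below) stays fixed. A toy check of the arithmetic:

#include <assert.h>
#include <stdio.h>

int main(void) {
  long totalbytes = 1000, GCdebt = -200;  /* 800 bytes really allocated */
  long real = totalbytes + GCdebt;        /* cf. gettotalbytes() */
  long newdebt = 50;
  totalbytes -= (newdebt - GCdebt);       /* body of luaE_setdebt() */
  GCdebt = newdebt;
  assert(totalbytes + GCdebt == real);    /* invariant holds: still 800 */
  printf("%ld\n", totalbytes + GCdebt);
  return 0;
}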
L->status = LUA_OK; + L->errfunc = 0; +} + + +static void close_state (lua_State *L) { + global_State *g = G(L); + luaF_close(L, L->stack); /* close all upvalues for this thread */ + luaC_freeallobjects(L); /* collect all objects */ + if (g->version) /* closing a fully built state? */ + luai_userstateclose(L); + luaM_freearray(L, G(L)->strt.hash, G(L)->strt.size); + luaZ_freebuffer(L, &g->buff); + freestack(L); + lua_assert(gettotalbytes(g) == sizeof(LG)); + (*g->frealloc)(g->ud, fromstate(L), sizeof(LG), 0); /* free main block */ +} + + +LUA_API lua_State *lua_newthread (lua_State *L) { + lua_State *L1; + lua_lock(L); + luaC_checkGC(L); + L1 = &luaC_newobj(L, LUA_TTHREAD, sizeof(LX), NULL, offsetof(LX, l))->th; + setthvalue(L, L->top, L1); + api_incr_top(L); + preinit_state(L1, G(L)); + L1->hookmask = L->hookmask; + L1->basehookcount = L->basehookcount; + L1->hook = L->hook; + resethookcount(L1); + luai_userstatethread(L, L1); + stack_init(L1, L); /* init stack */ + lua_unlock(L); + return L1; +} + + +void luaE_freethread (lua_State *L, lua_State *L1) { + LX *l = fromstate(L1); + luaF_close(L1, L1->stack); /* close all upvalues for this thread */ + lua_assert(L1->openupval == NULL); + luai_userstatefree(L, L1); + freestack(L1); + luaM_free(L, l); +} + + +LUA_API lua_State *lua_newstate (lua_Alloc f, void *ud) { + int i; + lua_State *L; + global_State *g; + LG *l = cast(LG *, (*f)(ud, NULL, LUA_TTHREAD, sizeof(LG))); + if (l == NULL) return NULL; + L = &l->l.l; + g = &l->g; + L->next = NULL; + L->tt = LUA_TTHREAD; + g->currentwhite = bit2mask(WHITE0BIT, FIXEDBIT); + L->marked = luaC_white(g); + g->gckind = KGC_NORMAL; + preinit_state(L, g); + g->frealloc = f; + g->ud = ud; + g->mainthread = L; + g->seed = makeseed(L); + g->uvhead.u.l.prev = &g->uvhead; + g->uvhead.u.l.next = &g->uvhead; + g->gcrunning = 0; /* no GC while building state */ + g->GCestimate = 0; + g->strt.size = 0; + g->strt.nuse = 0; + g->strt.hash = NULL; + setnilvalue(&g->l_registry); + luaZ_initbuffer(L, &g->buff); + g->panic = NULL; + g->version = NULL; + g->gcstate = GCSpause; + g->allgc = NULL; + g->finobj = NULL; + g->tobefnz = NULL; + g->sweepgc = g->sweepfin = NULL; + g->gray = g->grayagain = NULL; + g->weak = g->ephemeron = g->allweak = NULL; + g->totalbytes = sizeof(LG); + g->GCdebt = 0; + g->gcpause = LUAI_GCPAUSE; + g->gcmajorinc = LUAI_GCMAJOR; + g->gcstepmul = LUAI_GCMUL; + for (i=0; i < LUA_NUMTAGS; i++) g->mt[i] = NULL; + if (luaD_rawrunprotected(L, f_luaopen, NULL) != LUA_OK) { + /* memory allocation error: free partial state */ + close_state(L); + L = NULL; + } + return L; +} + + +LUA_API void lua_close (lua_State *L) { + L = G(L)->mainthread; /* only the main thread can be closed */ + lua_lock(L); + close_state(L); +} + + diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstate.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstate.h new file mode 100644 index 000000000000..daffd9aacfbb --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstate.h @@ -0,0 +1,228 @@ +/* +** $Id: lstate.h,v 2.82.1.1 2013/04/12 18:48:47 roberto Exp $ +** Global State +** See Copyright Notice in lua.h +*/ + +#ifndef lstate_h +#define lstate_h + +#include "lua.h" + +#include "lobject.h" +#include "ltm.h" +#include "lzio.h" + + +/* + +** Some notes about garbage-collected objects: All objects in Lua must +** be kept somehow accessible until being freed. +** +** Lua keeps most objects linked in list g->allgc. The link uses field +** 'next' of the CommonHeader. 
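lua_newstate() above delegates all memory management to the caller-supplied lua_Alloc; in this tree the ZFS glue provides one, while luaL_newstate() in a userland build installs a realloc/free wrapper equivalent to the sketch below (shown as an assumption about a standard build, not code from this commit):

#include <stdlib.h>
#include "lua.h"

static void *l_alloc(void *ud, void *ptr, size_t osize, size_t nsize) {
  (void)ud; (void)osize;       /* unused in this simple policy */
  if (nsize == 0) {            /* nsize == 0 is a free request */
    free(ptr);
    return NULL;
  }
  return realloc(ptr, nsize);  /* grow, shrink, or fresh allocation */
}

int main(void) {
  lua_State *L = lua_newstate(l_alloc, NULL);
  if (L) lua_close(L);
  return 0;
}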
+** +** Strings are kept in several lists headed by the array g->strt.hash. +** +** Open upvalues are not subject to independent garbage collection. They +** are collected together with their respective threads. Lua keeps a +** double-linked list with all open upvalues (g->uvhead) so that it can +** mark objects referred by them. (They are always gray, so they must +** be remarked in the atomic step. Usually their contents would be marked +** when traversing the respective threads, but the thread may already be +** dead, while the upvalue is still accessible through closures.) +** +** Objects with finalizers are kept in the list g->finobj. +** +** The list g->tobefnz links all objects being finalized. + +*/ + + +struct lua_longjmp; /* defined in ldo.c */ + + + +/* extra stack space to handle TM calls and some other extras */ +#define EXTRA_STACK 5 + + +#define BASIC_STACK_SIZE (2*LUA_MINSTACK) + + +/* kinds of Garbage Collection */ +#define KGC_NORMAL 0 +#define KGC_EMERGENCY 1 /* gc was forced by an allocation failure */ +#define KGC_GEN 2 /* generational collection */ + + +typedef struct stringtable { + GCObject **hash; + lu_int32 nuse; /* number of elements */ + int size; +} stringtable; + + +/* +** information about a call +*/ +typedef struct CallInfo { + StkId func; /* function index in the stack */ + StkId top; /* top for this function */ + struct CallInfo *previous, *next; /* dynamic call link */ + short nresults; /* expected number of results from this function */ + lu_byte callstatus; + ptrdiff_t extra; + union { + struct { /* only for Lua functions */ + StkId base; /* base for this function */ + const Instruction *savedpc; + } l; + struct { /* only for C functions */ + int ctx; /* context info. in case of yields */ + lua_CFunction k; /* continuation in case of yields */ + ptrdiff_t old_errfunc; + lu_byte old_allowhook; + lu_byte status; + } c; + } u; +} CallInfo; + + +/* +** Bits in CallInfo status +*/ +#define CIST_LUA (1<<0) /* call is running a Lua function */ +#define CIST_HOOKED (1<<1) /* call is running a debug hook */ +#define CIST_REENTRY (1<<2) /* call is running on same invocation of + luaV_execute of previous call */ +#define CIST_YIELDED (1<<3) /* call reentered after suspension */ +#define CIST_YPCALL (1<<4) /* call is a yieldable protected call */ +#define CIST_STAT (1<<5) /* call has an error status (pcall) */ +#define CIST_TAIL (1<<6) /* call was tail called */ +#define CIST_HOOKYIELD (1<<7) /* last hook called yielded */ + + +#define isLua(ci) ((ci)->callstatus & CIST_LUA) + + +/* +** `global state', shared by all threads of this state +*/ +typedef struct global_State { + lua_Alloc frealloc; /* function to reallocate memory */ + void *ud; /* auxiliary data to `frealloc' */ + lu_mem totalbytes; /* number of bytes currently allocated - GCdebt */ + l_mem GCdebt; /* bytes allocated not yet compensated by the collector */ + lu_mem GCmemtrav; /* memory traversed by the GC */ + lu_mem GCestimate; /* an estimate of the non-garbage memory in use */ + stringtable strt; /* hash table for strings */ + TValue l_registry; + unsigned int seed; /* randomized seed for hashes */ + lu_byte currentwhite; + lu_byte gcstate; /* state of garbage collector */ + lu_byte gckind; /* kind of GC running */ + lu_byte gcrunning; /* true if GC is running */ + int sweepstrgc; /* position of sweep in `strt' */ + GCObject *allgc; /* list of all collectable objects */ + GCObject *finobj; /* list of collectable objects with finalizers */ + GCObject **sweepgc; /* current position of sweep in list 
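The CallInfo records described above form a doubly-linked chain that grows one node per call depth reached (luaE_extendCI in lstate.c) and is only trimmed explicitly (luaE_freeCI), so bouncing in and out of deep calls does not reallocate frames. The same pattern in miniature, with invented names and no out-of-memory handling:

#include <stdio.h>
#include <stdlib.h>

typedef struct Frame {
  struct Frame *previous, *next;
  int depth;
} Frame;

static Frame *extend(Frame *cur) {      /* cf. luaE_extendCI */
  Frame *f = malloc(sizeof *f);
  cur->next = f;
  f->previous = cur;
  f->next = NULL;
  f->depth = cur->depth + 1;
  return f;
}

static void trim(Frame *cur) {          /* cf. luaE_freeCI */
  Frame *next = cur->next;
  cur->next = NULL;
  while (next != NULL) {                /* free everything past 'cur' */
    Frame *nn = next->next;
    free(next);
    next = nn;
  }
}

int main(void) {
  Frame base = {NULL, NULL, 0};
  Frame *ci = &base;
  ci = extend(ci);                      /* push two frames */
  ci = extend(ci);
  printf("depth %d\n", ci->depth);      /* 2 */
  trim(&base);                          /* drop everything above the base */
  return 0;
}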
'allgc' */ + GCObject **sweepfin; /* current position of sweep in list 'finobj' */ + GCObject *gray; /* list of gray objects */ + GCObject *grayagain; /* list of objects to be traversed atomically */ + GCObject *weak; /* list of tables with weak values */ + GCObject *ephemeron; /* list of ephemeron tables (weak keys) */ + GCObject *allweak; /* list of all-weak tables */ + GCObject *tobefnz; /* list of userdata to be GC */ + UpVal uvhead; /* head of double-linked list of all open upvalues */ + Mbuffer buff; /* temporary buffer for string concatenation */ + int gcpause; /* size of pause between successive GCs */ + int gcmajorinc; /* pause between major collections (only in gen. mode) */ + int gcstepmul; /* GC `granularity' */ + lua_CFunction panic; /* to be called in unprotected errors */ + struct lua_State *mainthread; + const lua_Number *version; /* pointer to version number */ + TString *memerrmsg; /* memory-error message */ + TString *tmname[TM_N]; /* array with tag-method names */ + struct Table *mt[LUA_NUMTAGS]; /* metatables for basic types */ +} global_State; + + +/* +** `per thread' state +*/ +struct lua_State { + CommonHeader; + lu_byte status; + StkId top; /* first free slot in the stack */ + global_State *l_G; + CallInfo *ci; /* call info for current function */ + const Instruction *oldpc; /* last pc traced */ + StkId stack_last; /* last free slot in the stack */ + StkId stack; /* stack base */ + int stacksize; + unsigned short nny; /* number of non-yieldable calls in stack */ + unsigned short nCcalls; /* number of nested C calls */ + lu_byte hookmask; + lu_byte allowhook; + int basehookcount; + int hookcount; + lua_Hook hook; + GCObject *openupval; /* list of open upvalues in this stack */ + GCObject *gclist; + struct lua_longjmp *errorJmp; /* current error recover point */ + ptrdiff_t errfunc; /* current error handling function (stack index) */ + CallInfo base_ci; /* CallInfo for first level (C calling Lua) */ +}; + + +#define G(L) (L->l_G) + + +/* +** Union of all collectable objects +*/ +union GCObject { + GCheader gch; /* common header */ + union TString ts; + union Udata u; + union Closure cl; + struct Table h; + struct Proto p; + struct UpVal uv; + struct lua_State th; /* thread */ +}; + + +#define gch(o) (&(o)->gch) + +/* macros to convert a GCObject into a specific value */ +#define rawgco2ts(o) \ + check_exp(novariant((o)->gch.tt) == LUA_TSTRING, &((o)->ts)) +#define gco2ts(o) (&rawgco2ts(o)->tsv) +#define rawgco2u(o) check_exp((o)->gch.tt == LUA_TUSERDATA, &((o)->u)) +#define gco2u(o) (&rawgco2u(o)->uv) +#define gco2lcl(o) check_exp((o)->gch.tt == LUA_TLCL, &((o)->cl.l)) +#define gco2ccl(o) check_exp((o)->gch.tt == LUA_TCCL, &((o)->cl.c)) +#define gco2cl(o) \ + check_exp(novariant((o)->gch.tt) == LUA_TFUNCTION, &((o)->cl)) +#define gco2t(o) check_exp((o)->gch.tt == LUA_TTABLE, &((o)->h)) +#define gco2p(o) check_exp((o)->gch.tt == LUA_TPROTO, &((o)->p)) +#define gco2uv(o) check_exp((o)->gch.tt == LUA_TUPVAL, &((o)->uv)) +#define gco2th(o) check_exp((o)->gch.tt == LUA_TTHREAD, &((o)->th)) + +/* macro to convert any Lua object into a GCObject */ +#define obj2gco(v) (cast(GCObject *, (v))) + + +/* actual number of total bytes allocated */ +#define gettotalbytes(g) ((g)->totalbytes + (g)->GCdebt) + +LUAI_FUNC void luaE_setdebt (global_State *g, l_mem debt); +LUAI_FUNC void luaE_freethread (lua_State *L, lua_State *L1); +LUAI_FUNC CallInfo *luaE_extendCI (lua_State *L); +LUAI_FUNC void luaE_freeCI (lua_State *L); + + +#endif + diff --git 
a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstring.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstring.c new file mode 100644 index 000000000000..e20ab04b12de --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstring.c @@ -0,0 +1,185 @@ +/* +** $Id: lstring.c,v 2.26.1.1 2013/04/12 18:48:47 roberto Exp $ +** String table (keeps all strings handled by Lua) +** See Copyright Notice in lua.h +*/ + + +#include <sys/zfs_context.h> + +#define lstring_c +#define LUA_CORE + +#include "lua.h" + +#include "lmem.h" +#include "lobject.h" +#include "lstate.h" +#include "lstring.h" + + +/* +** Lua will use at most ~(2^LUAI_HASHLIMIT) bytes from a string to +** compute its hash +*/ +#if !defined(LUAI_HASHLIMIT) +#define LUAI_HASHLIMIT 5 +#endif + + +/* +** equality for long strings +*/ +int luaS_eqlngstr (TString *a, TString *b) { + size_t len = a->tsv.len; + lua_assert(a->tsv.tt == LUA_TLNGSTR && b->tsv.tt == LUA_TLNGSTR); + return (a == b) || /* same instance or... */ + ((len == b->tsv.len) && /* equal length and ... */ + (memcmp(getstr(a), getstr(b), len) == 0)); /* equal contents */ +} + + +/* +** equality for strings +*/ +int luaS_eqstr (TString *a, TString *b) { + return (a->tsv.tt == b->tsv.tt) && + (a->tsv.tt == LUA_TSHRSTR ? eqshrstr(a, b) : luaS_eqlngstr(a, b)); +} + + +unsigned int luaS_hash (const char *str, size_t l, unsigned int seed) { + unsigned int h = seed ^ cast(unsigned int, l); + size_t l1; + size_t step = (l >> LUAI_HASHLIMIT) + 1; + for (l1 = l; l1 >= step; l1 -= step) + h = h ^ ((h<<5) + (h>>2) + cast_byte(str[l1 - 1])); + return h; +} + + +/* +** resizes the string table +*/ +void luaS_resize (lua_State *L, int newsize) { + int i; + stringtable *tb = &G(L)->strt; + /* cannot resize while GC is traversing strings */ + luaC_runtilstate(L, ~bitmask(GCSsweepstring)); + if (newsize > tb->size) { + luaM_reallocvector(L, tb->hash, tb->size, newsize, GCObject *); + for (i = tb->size; i < newsize; i++) tb->hash[i] = NULL; + } + /* rehash */ + for (i=0; i<tb->size; i++) { + GCObject *p = tb->hash[i]; + tb->hash[i] = NULL; + while (p) { /* for each node in the list */ + GCObject *next = gch(p)->next; /* save next */ + unsigned int h = lmod(gco2ts(p)->hash, newsize); /* new position */ + gch(p)->next = tb->hash[h]; /* chain it */ + tb->hash[h] = p; + resetoldbit(p); /* see MOVE OLD rule */ + p = next; + } + } + if (newsize < tb->size) { + /* shrinking slice must be empty */ + lua_assert(tb->hash[newsize] == NULL && tb->hash[tb->size - 1] == NULL); + luaM_reallocvector(L, tb->hash, tb->size, newsize, GCObject *); + } + tb->size = newsize; +} + + +/* +** creates a new string object +*/ +static TString *createstrobj (lua_State *L, const char *str, size_t l, + int tag, unsigned int h, GCObject **list) { + TString *ts; + size_t totalsize; /* total size of TString object */ + totalsize = sizeof(TString) + ((l + 1) * sizeof(char)); + ts = &luaC_newobj(L, tag, totalsize, list, 0)->ts; + ts->tsv.len = l; + ts->tsv.hash = h; + ts->tsv.extra = 0; + memcpy(ts+1, str, l*sizeof(char)); + ((char *)(ts+1))[l] = '\0'; /* ending 0 */ + return ts; +} + + +/* +** creates a new short string, inserting it into string table +*/ +static TString *newshrstr (lua_State *L, const char *str, size_t l, + unsigned int h) { + GCObject **list; /* (pointer to) list where it will be inserted */ + stringtable *tb = &G(L)->strt; + TString *s; + if (tb->nuse >= cast(lu_int32, tb->size) && tb->size <= MAX_INT/2) + luaS_resize(L, tb->size*2); /* too crowded */ + list = &tb->hash[lmod(h, 
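Note that luaS_hash() above does not necessarily read every byte: the step means strings longer than 2^LUAI_HASHLIMIT (32) bytes are sampled from the tail backwards, trading collision resistance for bounded hashing cost on huge strings. A stand-alone copy to experiment with (seed value invented):

#include <stdio.h>
#include <string.h>

#define HASHLIMIT 5                     /* same role as LUAI_HASHLIMIT */

static unsigned int str_hash(const char *str, size_t l, unsigned int seed) {
  unsigned int h = seed ^ (unsigned int)l;
  size_t step = (l >> HASHLIMIT) + 1;   /* >32 bytes: sample, don't scan */
  size_t l1;
  for (l1 = l; l1 >= step; l1 -= step)
    h ^= (h << 5) + (h >> 2) + (unsigned char)str[l1 - 1];
  return h;
}

int main(void) {
  const char *s = "zfs.sync.snapshot";
  printf("%08x\n", str_hash(s, strlen(s), 0xdeadbeefu));
  /* a 1 MiB string would be sampled every (1<<20 >> 5) + 1 = 32769 bytes */
  return 0;
}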
tb->size)]; + s = createstrobj(L, str, l, LUA_TSHRSTR, h, list); + tb->nuse++; + return s; +} + + +/* +** checks whether short string exists and reuses it or creates a new one +*/ +static TString *internshrstr (lua_State *L, const char *str, size_t l) { + GCObject *o; + global_State *g = G(L); + unsigned int h = luaS_hash(str, l, g->seed); + for (o = g->strt.hash[lmod(h, g->strt.size)]; + o != NULL; + o = gch(o)->next) { + TString *ts = rawgco2ts(o); + if (h == ts->tsv.hash && + l == ts->tsv.len && + (memcmp(str, getstr(ts), l * sizeof(char)) == 0)) { + if (isdead(G(L), o)) /* string is dead (but was not collected yet)? */ + changewhite(o); /* resurrect it */ + return ts; + } + } + return newshrstr(L, str, l, h); /* not found; create a new string */ +} + + +/* +** new string (with explicit length) +*/ +TString *luaS_newlstr (lua_State *L, const char *str, size_t l) { + if (l <= LUAI_MAXSHORTLEN) /* short string? */ + return internshrstr(L, str, l); + else { + if (l + 1 > (MAX_SIZET - sizeof(TString))/sizeof(char)) + luaM_toobig(L); + return createstrobj(L, str, l, LUA_TLNGSTR, G(L)->seed, NULL); + } +} + + +/* +** new zero-terminated string +*/ +TString *luaS_new (lua_State *L, const char *str) { + return luaS_newlstr(L, str, strlen(str)); +} + + +Udata *luaS_newudata (lua_State *L, size_t s, Table *e) { + Udata *u; + if (s > MAX_SIZET - sizeof(Udata)) + luaM_toobig(L); + u = &luaC_newobj(L, LUA_TUSERDATA, sizeof(Udata) + s, NULL, 0)->u; + u->uv.len = s; + u->uv.metatable = NULL; + u->uv.env = e; + return u; +} + diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstring.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstring.h new file mode 100644 index 000000000000..260e7f169bd0 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstring.h @@ -0,0 +1,46 @@ +/* +** $Id: lstring.h,v 1.49.1.1 2013/04/12 18:48:47 roberto Exp $ +** String table (keep all strings handled by Lua) +** See Copyright Notice in lua.h +*/ + +#ifndef lstring_h +#define lstring_h + +#include "lgc.h" +#include "lobject.h" +#include "lstate.h" + + +#define sizestring(s) (sizeof(union TString)+((s)->len+1)*sizeof(char)) + +#define sizeudata(u) (sizeof(union Udata)+(u)->len) + +#define luaS_newliteral(L, s) (luaS_newlstr(L, "" s, \ + (sizeof(s)/sizeof(char))-1)) + +#define luaS_fix(s) l_setbit((s)->tsv.marked, FIXEDBIT) + + +/* +** test whether a string is a reserved word +*/ +#define isreserved(s) ((s)->tsv.tt == LUA_TSHRSTR && (s)->tsv.extra > 0) + + +/* +** equality for short strings, which are always internalized +*/ +#define eqshrstr(a,b) check_exp((a)->tsv.tt == LUA_TSHRSTR, (a) == (b)) + + +LUAI_FUNC unsigned int luaS_hash (const char *str, size_t l, unsigned int seed); +LUAI_FUNC int luaS_eqlngstr (TString *a, TString *b); +LUAI_FUNC int luaS_eqstr (TString *a, TString *b); +LUAI_FUNC void luaS_resize (lua_State *L, int newsize); +LUAI_FUNC Udata *luaS_newudata (lua_State *L, size_t s, Table *e); +LUAI_FUNC TString *luaS_newlstr (lua_State *L, const char *str, size_t l); +LUAI_FUNC TString *luaS_new (lua_State *L, const char *str); + + +#endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstrlib.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstrlib.c new file mode 100644 index 000000000000..589752d3690e --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstrlib.c @@ -0,0 +1,1050 @@ +/* +** $Id: lstrlib.c,v 1.178.1.1 2013/04/12 18:48:47 roberto Exp $ +** Standard library for string operations and pattern-matching +** See 
Copyright Notice in lua.h +*/ + + +#include <sys/ctype.h> +#include <sys/zfs_context.h> + +#define lstrlib_c +#define LUA_LIB + +#include "lua.h" + +#include "lauxlib.h" +#include "lualib.h" + + +/* +** maximum number of captures that a pattern can do during +** pattern-matching. This limit is arbitrary. +*/ +#if !defined(LUA_MAXCAPTURES) +#define LUA_MAXCAPTURES 32 +#endif + + +/* macro to `unsign' a character */ +#define uchar(c) ((unsigned char)(c)) + +/* + * PATCHED: add missing character macros. + */ +#ifdef illumos +#define tolower(C) (((C) >= 'A' && (C) <= 'Z') ? (C) - 'A' + 'a' : (C)) +#define toupper(C) (((C) >= 'a' && (C) <= 'z') ? (C) - 'a' + 'A': (C)) +#define iscntrl(C) ((((C) >= 0) && ((C) <= 0x1f)) || ((C) == 0x7f)) +#else +#define isalnum(C) (isalpha(C) || isdigit(C)) +#define iscntrl(C) (uchar(C) <= 0x1f || uchar(C) == 0x7f) +#endif +#define isgraph(C) ((C) >= 0x21 && (C) <= 0x7E) +#define ispunct(C) (((C) >= 0x21 && (C) <= 0x2F) || \ + ((C) >= 0x3A && (C) <= 0x40) || \ + ((C) >= 0x5B && (C) <= 0x60) || \ + ((C) >= 0x7B && (C) <= 0x7E)) + +/* + * The provided version of sprintf returns a char *, but str_format expects + * it to return the number of characters printed. This version has the expected + * behavior. + */ +static size_t str_sprintf(char *buf, const char *fmt, ...) { + va_list args; + size_t len; + + va_start(args, fmt); + len = vsnprintf(buf, INT_MAX, fmt, args); + va_end(args); + + return len; +} + + +static int str_len (lua_State *L) { + size_t l; + luaL_checklstring(L, 1, &l); + lua_pushinteger(L, (lua_Integer)l); + return 1; +} + + +/* translate a relative string position: negative means back from end */ +static size_t posrelat (ptrdiff_t pos, size_t len) { + if (pos >= 0) return (size_t)pos; + else if (0u - (size_t)pos > len) return 0; + else return len - ((size_t)-pos) + 1; +} + + +static int str_sub (lua_State *L) { + size_t l; + const char *s = luaL_checklstring(L, 1, &l); + size_t start = posrelat(luaL_checkinteger(L, 2), l); + size_t end = posrelat(luaL_optinteger(L, 3, -1), l); + if (start < 1) start = 1; + if (end > l) end = l; + if (start <= end) + lua_pushlstring(L, s + start - 1, end - start + 1); + else lua_pushliteral(L, ""); + return 1; +} + + +static int str_reverse (lua_State *L) { + size_t l, i; + luaL_Buffer b; + const char *s = luaL_checklstring(L, 1, &l); + char *p = luaL_buffinitsize(L, &b, l); + for (i = 0; i < l; i++) + p[i] = s[l - i - 1]; + luaL_pushresultsize(&b, l); + return 1; +} + + +static int str_lower (lua_State *L) { + size_t l; + size_t i; + luaL_Buffer b; + const char *s = luaL_checklstring(L, 1, &l); + char *p = luaL_buffinitsize(L, &b, l); + for (i=0; i<l; i++) + p[i] = tolower(uchar(s[i])); + luaL_pushresultsize(&b, l); + return 1; +} + + +static int str_upper (lua_State *L) { + size_t l; + size_t i; + luaL_Buffer b; + const char *s = luaL_checklstring(L, 1, &l); + char *p = luaL_buffinitsize(L, &b, l); + for (i=0; i<l; i++) + p[i] = toupper(uchar(s[i])); + luaL_pushresultsize(&b, l); + return 1; +} + + +/* reasonable limit to avoid arithmetic overflow */ +#define MAXSIZE ((~(size_t)0) >> 1) + +static int str_rep (lua_State *L) { + size_t l, lsep; + const char *s = luaL_checklstring(L, 1, &l); + int n = luaL_checkint(L, 2); + const char *sep = luaL_optlstring(L, 3, "", &lsep); + if (n <= 0) lua_pushliteral(L, ""); + else if (l + lsep < l || l + lsep >= MAXSIZE / n) /* may overflow? 
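posrelat() above supplies the negative-index convention shared by str_sub(), str_byte() and friends: a negative position counts back from the end of the string, and each caller clamps the result afterwards. Through the public API (hedged demo, stock interpreter assumed):

#include "lua.h"
#include "lauxlib.h"
#include "lualib.h"

int main(void) {
  lua_State *L = luaL_newstate();
  luaL_openlibs(L);
  luaL_dostring(L,
      "print(string.sub('snapshot', -4))      --> shot\n"
      "print(string.sub('snapshot', 2, -2))   --> napsho\n"
      "print(string.byte('ABC', 1, -1))       --> 65 66 67\n"
      "print(string.rep('ab', 3, '-'))        --> ab-ab-ab\n");
  lua_close(L);
  return 0;
}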
*/ + return luaL_error(L, "resulting string too large"); + else { + size_t totallen = n * l + (n - 1) * lsep; + luaL_Buffer b; + char *p = luaL_buffinitsize(L, &b, totallen); + while (n-- > 1) { /* first n-1 copies (followed by separator) */ + memcpy(p, s, l * sizeof(char)); p += l; + if (lsep > 0) { /* avoid empty 'memcpy' (may be expensive) */ + memcpy(p, sep, lsep * sizeof(char)); p += lsep; + } + } + memcpy(p, s, l * sizeof(char)); /* last copy (not followed by separator) */ + luaL_pushresultsize(&b, totallen); + } + return 1; +} + + +static int str_byte (lua_State *L) { + size_t l; + const char *s = luaL_checklstring(L, 1, &l); + size_t posi = posrelat(luaL_optinteger(L, 2, 1), l); + size_t pose = posrelat(luaL_optinteger(L, 3, posi), l); + int n, i; + if (posi < 1) posi = 1; + if (pose > l) pose = l; + if (posi > pose) return 0; /* empty interval; return no values */ + n = (int)(pose - posi + 1); + if (posi + n <= pose) /* (size_t -> int) overflow? */ + return luaL_error(L, "string slice too long"); + luaL_checkstack(L, n, "string slice too long"); + for (i=0; i<n; i++) + lua_pushinteger(L, uchar(s[posi+i-1])); + return n; +} + + +static int str_char (lua_State *L) { + int n = lua_gettop(L); /* number of arguments */ + int i; + luaL_Buffer b; + char *p = luaL_buffinitsize(L, &b, n); + for (i=1; i<=n; i++) { + int c = luaL_checkint(L, i); + luaL_argcheck(L, uchar(c) == c, i, "value out of range"); + p[i - 1] = uchar(c); + } + luaL_pushresultsize(&b, n); + return 1; +} + + +static int writer (lua_State *L, const void* b, size_t size, void* B) { + (void)L; + luaL_addlstring((luaL_Buffer*) B, (const char *)b, size); + return 0; +} + + +static int str_dump (lua_State *L) { + luaL_Buffer b; + luaL_checktype(L, 1, LUA_TFUNCTION); + lua_settop(L, 1); + luaL_buffinit(L,&b); + if (lua_dump(L, writer, &b) != 0) + return luaL_error(L, "unable to dump given function"); + luaL_pushresult(&b); + return 1; +} + + + +/* +** {====================================================== +** PATTERN MATCHING +** ======================================================= +*/ + + +#define CAP_UNFINISHED (-1) +#define CAP_POSITION (-2) + + +typedef struct MatchState { + int matchdepth; /* control for recursive depth (to avoid C stack overflow) */ + const char *src_init; /* init of source string */ + const char *src_end; /* end ('\0') of source string */ + const char *p_end; /* end ('\0') of pattern */ + lua_State *L; + int level; /* total number of captures (finished or unfinished) */ + struct { + const char *init; + ptrdiff_t len; + } capture[LUA_MAXCAPTURES]; +} MatchState; + + +/* recursive function */ +static const char *match (MatchState *ms, const char *s, const char *p); + + +/* maximum recursion depth for 'match' */ +#if !defined(MAXCCALLS) +#define MAXCCALLS 200 +#endif + + +#define L_ESC '%' +#define SPECIALS "^$*+?.([%-" + + +static int check_capture (MatchState *ms, int l) { + l -= '1'; + if (l < 0 || l >= ms->level || ms->capture[l].len == CAP_UNFINISHED) + return luaL_error(ms->L, "invalid capture index %%%d", l + 1); + return l; +} + + +static int capture_to_close (MatchState *ms) { + int level = ms->level; + for (level--; level>=0; level--) + if (ms->capture[level].len == CAP_UNFINISHED) return level; + return luaL_error(ms->L, "invalid pattern capture"); +} + + +static const char *classend (MatchState *ms, const char *p) { + switch (*p++) { + case L_ESC: { + if (p == ms->p_end) + luaL_error(ms->L, "malformed pattern (ends with " LUA_QL("%%") ")"); + return p+1; + } + case '[': { + if (*p == '^') 
p++; + do { /* look for a `]' */ + if (p == ms->p_end) + luaL_error(ms->L, "malformed pattern (missing " LUA_QL("]") ")"); + if (*(p++) == L_ESC && p < ms->p_end) + p++; /* skip escapes (e.g. `%]') */ + } while (*p != ']'); + return p+1; + } + default: { + return p; + } + } +} + + +static int match_class (int c, int cl) { + int res; + switch (tolower(cl)) { + case 'a' : res = isalpha(c); break; + case 'c' : res = iscntrl(c); break; + case 'd' : res = isdigit(c); break; + case 'g' : res = isgraph(c); break; + case 'l' : res = islower(c); break; + case 'p' : res = ispunct(c); break; + case 's' : res = isspace(c); break; + case 'u' : res = isupper(c); break; + case 'w' : res = isalnum(c); break; + case 'x' : res = isxdigit(c); break; + case 'z' : res = (c == 0); break; /* deprecated option */ + default: return (cl == c); + } + return (islower(cl) ? res : !res); +} + + +static int matchbracketclass (int c, const char *p, const char *ec) { + int sig = 1; + if (*(p+1) == '^') { + sig = 0; + p++; /* skip the `^' */ + } + while (++p < ec) { + if (*p == L_ESC) { + p++; + if (match_class(c, uchar(*p))) + return sig; + } + else if ((*(p+1) == '-') && (p+2 < ec)) { + p+=2; + if (uchar(*(p-2)) <= c && c <= uchar(*p)) + return sig; + } + else if (uchar(*p) == c) return sig; + } + return !sig; +} + + +static int singlematch (MatchState *ms, const char *s, const char *p, + const char *ep) { + if (s >= ms->src_end) + return 0; + else { + int c = uchar(*s); + switch (*p) { + case '.': return 1; /* matches any char */ + case L_ESC: return match_class(c, uchar(*(p+1))); + case '[': return matchbracketclass(c, p, ep-1); + default: return (uchar(*p) == c); + } + } +} + + +static const char *matchbalance (MatchState *ms, const char *s, + const char *p) { + if (p >= ms->p_end - 1) + luaL_error(ms->L, "malformed pattern " + "(missing arguments to " LUA_QL("%%b") ")"); + if (*s != *p) return NULL; + else { + int b = *p; + int e = *(p+1); + int cont = 1; + while (++s < ms->src_end) { + if (*s == e) { + if (--cont == 0) return s+1; + } + else if (*s == b) cont++; + } + } + return NULL; /* string ends out of balance */ +} + + +static const char *max_expand (MatchState *ms, const char *s, + const char *p, const char *ep) { + ptrdiff_t i = 0; /* counts maximum expand for item */ + while (singlematch(ms, s + i, p, ep)) + i++; + /* keeps trying to match with the maximum repetitions */ + while (i>=0) { + const char *res = match(ms, (s+i), ep+1); + if (res) return res; + i--; /* else didn't match; reduce 1 repetition to try again */ + } + return NULL; +} + + +static const char *min_expand (MatchState *ms, const char *s, + const char *p, const char *ep) { + for (;;) { + const char *res = match(ms, s, ep+1); + if (res != NULL) + return res; + else if (singlematch(ms, s, p, ep)) + s++; /* try with one more repetition */ + else return NULL; + } +} + + +static const char *start_capture (MatchState *ms, const char *s, + const char *p, int what) { + const char *res; + int level = ms->level; + if (level >= LUA_MAXCAPTURES) luaL_error(ms->L, "too many captures"); + ms->capture[level].init = s; + ms->capture[level].len = what; + ms->level = level+1; + if ((res=match(ms, s, p)) == NULL) /* match failed? */ + ms->level--; /* undo capture */ + return res; +} + + +static const char *end_capture (MatchState *ms, const char *s, + const char *p) { + int l = capture_to_close(ms); + const char *res; + ms->capture[l].len = s - ms->capture[l].init; /* close capture */ + if ((res = match(ms, s, p)) == NULL) /* match failed? 
*/ + ms->capture[l].len = CAP_UNFINISHED; /* undo capture */ + return res; +} + + +static const char *match_capture (MatchState *ms, const char *s, int l) { + size_t len; + l = check_capture(ms, l); + len = ms->capture[l].len; + if ((size_t)(ms->src_end-s) >= len && + memcmp(ms->capture[l].init, s, len) == 0) + return s+len; + else return NULL; +} + + +static const char *match (MatchState *ms, const char *s, const char *p) { + if (ms->matchdepth-- == 0) + luaL_error(ms->L, "pattern too complex"); + init: /* using goto's to optimize tail recursion */ + if (p != ms->p_end) { /* end of pattern? */ + switch (*p) { + case '(': { /* start capture */ + if (*(p + 1) == ')') /* position capture? */ + s = start_capture(ms, s, p + 2, CAP_POSITION); + else + s = start_capture(ms, s, p + 1, CAP_UNFINISHED); + break; + } + case ')': { /* end capture */ + s = end_capture(ms, s, p + 1); + break; + } + case '$': { + if ((p + 1) != ms->p_end) /* is the `$' the last char in pattern? */ + goto dflt; /* no; go to default */ + s = (s == ms->src_end) ? s : NULL; /* check end of string */ + break; + } + case L_ESC: { /* escaped sequences not in the format class[*+?-]? */ + switch (*(p + 1)) { + case 'b': { /* balanced string? */ + s = matchbalance(ms, s, p + 2); + if (s != NULL) { + p += 4; goto init; /* return match(ms, s, p + 4); */ + } /* else fail (s == NULL) */ + break; + } + case 'f': { /* frontier? */ + const char *ep; char previous; + p += 2; + if (*p != '[') + luaL_error(ms->L, "missing " LUA_QL("[") " after " + LUA_QL("%%f") " in pattern"); + ep = classend(ms, p); /* points to what is next */ + previous = (s == ms->src_init) ? '\0' : *(s - 1); + if (!matchbracketclass(uchar(previous), p, ep - 1) && + matchbracketclass(uchar(*s), p, ep - 1)) { + p = ep; goto init; /* return match(ms, s, ep); */ + } + s = NULL; /* match failed */ + break; + } + case '0': case '1': case '2': case '3': + case '4': case '5': case '6': case '7': + case '8': case '9': { /* capture results (%0-%9)? */ + s = match_capture(ms, s, uchar(*(p + 1))); + if (s != NULL) { + p += 2; goto init; /* return match(ms, s, p + 2) */ + } + break; + } + default: goto dflt; + } + break; + } + default: dflt: { /* pattern class plus optional suffix */ + const char *ep = classend(ms, p); /* points to optional suffix */ + /* does not match at least once? */ + if (!singlematch(ms, s, p, ep)) { + if (*ep == '*' || *ep == '?' || *ep == '-') { /* accept empty? 
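The single recursive match() above implements every pattern construct: classes and sets, captures, balanced '%b', the '%f' frontier, and the greedy/lazy suffix operators, with the goto-based loop standing in for tail calls. What the less common branches do, seen from Lua (hedged demo):

#include "lua.h"
#include "lauxlib.h"
#include "lualib.h"

int main(void) {
  lua_State *L = luaL_newstate();
  luaL_openlibs(L);
  luaL_dostring(L,
      "print(string.match('pool/fs@snap', '([%w/]+)@(%w+)'))\n"
      "                                  --> pool/fs  snap\n"
      "print(string.match('(a(b)c) d', '%b()'))   --> (a(b)c)\n"
      "print(string.gsub('THE quick FOX', '%f[%a]%u+', '?'))\n"
      "                                  --> ? quick ?  2\n");
  lua_close(L);
  return 0;
}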
*/ + p = ep + 1; goto init; /* return match(ms, s, ep + 1); */ + } + else /* '+' or no suffix */ + s = NULL; /* fail */ + } + else { /* matched once */ + switch (*ep) { /* handle optional suffix */ + case '?': { /* optional */ + const char *res; + if ((res = match(ms, s + 1, ep + 1)) != NULL) + s = res; + else { + p = ep + 1; goto init; /* else return match(ms, s, ep + 1); */ + } + break; + } + case '+': /* 1 or more repetitions */ + s++; /* 1 match already done */ + /* FALLTHROUGH */ + case '*': /* 0 or more repetitions */ + s = max_expand(ms, s, p, ep); + break; + case '-': /* 0 or more repetitions (minimum) */ + s = min_expand(ms, s, p, ep); + break; + default: /* no suffix */ + s++; p = ep; goto init; /* return match(ms, s + 1, ep); */ + } + } + break; + } + } + } + ms->matchdepth++; + return s; +} + + + +static const char *lmemfind (const char *s1, size_t l1, + const char *s2, size_t l2) { + if (l2 == 0) return s1; /* empty strings are everywhere */ + else if (l2 > l1) return NULL; /* avoids a negative `l1' */ + else { + const char *init; /* to search for a `*s2' inside `s1' */ + l2--; /* 1st char will be checked by `memchr' */ + l1 = l1-l2; /* `s2' cannot be found after that */ + while (l1 > 0 && (init = (const char *)memchr(s1, *s2, l1)) != NULL) { + init++; /* 1st char is already checked */ + if (memcmp(init, s2+1, l2) == 0) + return init-1; + else { /* correct `l1' and `s1' to try again */ + l1 -= init-s1; + s1 = init; + } + } + return NULL; /* not found */ + } +} + + +static void push_onecapture (MatchState *ms, int i, const char *s, + const char *e) { + if (i >= ms->level) { + if (i == 0) /* ms->level == 0, too */ + lua_pushlstring(ms->L, s, e - s); /* add whole match */ + else + luaL_error(ms->L, "invalid capture index"); + } + else { + ptrdiff_t l = ms->capture[i].len; + if (l == CAP_UNFINISHED) luaL_error(ms->L, "unfinished capture"); + if (l == CAP_POSITION) + lua_pushinteger(ms->L, ms->capture[i].init - ms->src_init + 1); + else + lua_pushlstring(ms->L, ms->capture[i].init, l); + } +} + + +static int push_captures (MatchState *ms, const char *s, const char *e) { + int i; + int nlevels = (ms->level == 0 && s) ? 1 : ms->level; + luaL_checkstack(ms->L, nlevels, "too many captures"); + for (i = 0; i < nlevels; i++) + push_onecapture(ms, i, s, e); + return nlevels; /* number of strings pushed */ +} + + +/* check whether pattern has no special characters */ +static int nospecials (const char *p, size_t l) { + size_t upto = 0; + do { + if (strpbrk(p + upto, SPECIALS)) + return 0; /* pattern has a special character */ + upto += strlen(p + upto) + 1; /* may have more after \0 */ + } while (upto <= l); + return 1; /* no special chars found */ +} + + +static int str_find_aux (lua_State *L, int find) { + size_t ls, lp; + const char *s = luaL_checklstring(L, 1, &ls); + const char *p = luaL_checklstring(L, 2, &lp); + size_t init = posrelat(luaL_optinteger(L, 3, 1), ls); + if (init < 1) init = 1; + else if (init > ls + 1) { /* start after string's end? */ + lua_pushnil(L); /* cannot find anything */ + return 1; + } + /* explicit request or no special characters? 
*/ + if (find && (lua_toboolean(L, 4) || nospecials(p, lp))) { + /* do a plain search */ + const char *s2 = lmemfind(s + init - 1, ls - init + 1, p, lp); + if (s2) { + lua_pushinteger(L, s2 - s + 1); + lua_pushinteger(L, s2 - s + lp); + return 2; + } + } + else { + MatchState ms; + const char *s1 = s + init - 1; + int anchor = (*p == '^'); + if (anchor) { + p++; lp--; /* skip anchor character */ + } + ms.L = L; + ms.matchdepth = MAXCCALLS; + ms.src_init = s; + ms.src_end = s + ls; + ms.p_end = p + lp; + do { + const char *res; + ms.level = 0; + lua_assert(ms.matchdepth == MAXCCALLS); + if ((res=match(&ms, s1, p)) != NULL) { + if (find) { + lua_pushinteger(L, s1 - s + 1); /* start */ + lua_pushinteger(L, res - s); /* end */ + return push_captures(&ms, NULL, 0) + 2; + } + else + return push_captures(&ms, s1, res); + } + } while (s1++ < ms.src_end && !anchor); + } + lua_pushnil(L); /* not found */ + return 1; +} + + +static int str_find (lua_State *L) { + return str_find_aux(L, 1); +} + + +static int str_match (lua_State *L) { + return str_find_aux(L, 0); +} + + +static int gmatch_aux (lua_State *L) { + MatchState ms; + size_t ls, lp; + const char *s = lua_tolstring(L, lua_upvalueindex(1), &ls); + const char *p = lua_tolstring(L, lua_upvalueindex(2), &lp); + const char *src; + ms.L = L; + ms.matchdepth = MAXCCALLS; + ms.src_init = s; + ms.src_end = s+ls; + ms.p_end = p + lp; + for (src = s + (size_t)lua_tointeger(L, lua_upvalueindex(3)); + src <= ms.src_end; + src++) { + const char *e; + ms.level = 0; + lua_assert(ms.matchdepth == MAXCCALLS); + if ((e = match(&ms, src, p)) != NULL) { + lua_Integer newstart = e-s; + if (e == src) newstart++; /* empty match? go at least one position */ + lua_pushinteger(L, newstart); + lua_replace(L, lua_upvalueindex(3)); + return push_captures(&ms, src, e); + } + } + return 0; /* not found */ +} + + +static int str_gmatch (lua_State *L) { + luaL_checkstring(L, 1); + luaL_checkstring(L, 2); + lua_settop(L, 2); + lua_pushinteger(L, 0); + lua_pushcclosure(L, gmatch_aux, 3); + return 1; +} + + +static void add_s (MatchState *ms, luaL_Buffer *b, const char *s, + const char *e) { + size_t l, i; + const char *news = lua_tolstring(ms->L, 3, &l); + for (i = 0; i < l; i++) { + if (news[i] != L_ESC) + luaL_addchar(b, news[i]); + else { + i++; /* skip ESC */ + if (!isdigit(uchar(news[i]))) { + if (news[i] != L_ESC) + luaL_error(ms->L, "invalid use of " LUA_QL("%c") + " in replacement string", L_ESC); + luaL_addchar(b, news[i]); + } + else if (news[i] == '0') + luaL_addlstring(b, s, e - s); + else { + push_onecapture(ms, news[i] - '1', s, e); + luaL_addvalue(b); /* add capture to accumulated result */ + } + } + } +} + + +static void add_value (MatchState *ms, luaL_Buffer *b, const char *s, + const char *e, int tr) { + lua_State *L = ms->L; + switch (tr) { + case LUA_TFUNCTION: { + int n; + lua_pushvalue(L, 3); + n = push_captures(ms, s, e); + lua_call(L, n, 1); + break; + } + case LUA_TTABLE: { + push_onecapture(ms, 0, s, e); + lua_gettable(L, 3); + break; + } + default: { /* LUA_TNUMBER or LUA_TSTRING */ + add_s(ms, b, s, e); + return; + } + } + if (!lua_toboolean(L, -1)) { /* nil or false? 
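Two details of the functions above are easy to miss: str_find_aux() skips the matcher entirely (via lmemfind()) when the caller passes plain=true or the pattern contains no magic characters, and add_value() lets a gsub replacement be a string, table, or function. Both from the script side (hedged demo):

#include "lua.h"
#include "lauxlib.h"
#include "lualib.h"

int main(void) {
  lua_State *L = luaL_newstate();
  luaL_openlibs(L);
  luaL_dostring(L,
      "print(string.find('a.b.c', '.'))           --> 1  1  (pattern)\n"
      "print(string.find('a.b.c', '.', 1, true))  --> 2  2  (plain)\n"
      "print(string.gsub('$user@$host', '%$(%w+)',\n"
      "                  {user = 'root', host = 'backup'}))\n"
      "                                   --> root@backup  2\n"
      "print(string.gsub('1 2 3', '%d',\n"
      "                  function(d) return d * 2 end))   --> 2 4 6  3\n");
  lua_close(L);
  return 0;
}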
*/ + lua_pop(L, 1); + lua_pushlstring(L, s, e - s); /* keep original text */ + } + else if (!lua_isstring(L, -1)) + luaL_error(L, "invalid replacement value (a %s)", luaL_typename(L, -1)); + luaL_addvalue(b); /* add result to accumulator */ +} + + +static int str_gsub (lua_State *L) { + size_t srcl, lp; + const char *src = luaL_checklstring(L, 1, &srcl); + const char *p = luaL_checklstring(L, 2, &lp); + int tr = lua_type(L, 3); + size_t max_s = luaL_optinteger(L, 4, srcl+1); + int anchor = (*p == '^'); + size_t n = 0; + MatchState ms; + luaL_Buffer b; + luaL_argcheck(L, tr == LUA_TNUMBER || tr == LUA_TSTRING || + tr == LUA_TFUNCTION || tr == LUA_TTABLE, 3, + "string/function/table expected"); + luaL_buffinit(L, &b); + if (anchor) { + p++; lp--; /* skip anchor character */ + } + ms.L = L; + ms.matchdepth = MAXCCALLS; + ms.src_init = src; + ms.src_end = src+srcl; + ms.p_end = p + lp; + while (n < max_s) { + const char *e; + ms.level = 0; + lua_assert(ms.matchdepth == MAXCCALLS); + e = match(&ms, src, p); + if (e) { + n++; + add_value(&ms, &b, src, e, tr); + } + if (e && e>src) /* non empty match? */ + src = e; /* skip it */ + else if (src < ms.src_end) + luaL_addchar(&b, *src++); + else break; + if (anchor) break; + } + luaL_addlstring(&b, src, ms.src_end-src); + luaL_pushresult(&b); + lua_pushinteger(L, n); /* number of substitutions */ + return 2; +} + +/* }====================================================== */ + + + +/* +** {====================================================== +** STRING FORMAT +** ======================================================= +*/ + +/* +** LUA_INTFRMLEN is the length modifier for integer conversions in +** 'string.format'; LUA_INTFRM_T is the integer type corresponding to +** the previous length +*/ +#if !defined(LUA_INTFRMLEN) /* { */ +#if defined(LUA_USE_LONGLONG) + +#define LUA_INTFRMLEN "ll" +#define LUA_INTFRM_T long long + +#else + +#define LUA_INTFRMLEN "l" +#define LUA_INTFRM_T long + +#endif +#endif /* } */ + + +/* +** LUA_FLTFRMLEN is the length modifier for float conversions in +** 'string.format'; LUA_FLTFRM_T is the float type corresponding to +** the previous length +*/ +#if !defined(LUA_FLTFRMLEN) + +#define LUA_FLTFRMLEN "" +#define LUA_FLTFRM_T double + +#endif + + +/* maximum size of each formatted item (> len(format('%99.99f', -1e308))) */ +#define MAX_ITEM 512 +/* valid flags in a format specification */ +#define FLAGS "-+ #0" +/* +** maximum size of each format specification (such as '%-099.99d') +** (+10 accounts for %99.99x plus margin of error) +*/ +#define MAX_FORMAT (sizeof(FLAGS) + sizeof(LUA_INTFRMLEN) + 10) + + +static void addquoted (lua_State *L, luaL_Buffer *b, int arg) { + size_t l; + const char *s = luaL_checklstring(L, arg, &l); + luaL_addchar(b, '"'); + while (l--) { + if (*s == '"' || *s == '\\' || *s == '\n') { + luaL_addchar(b, '\\'); + luaL_addchar(b, *s); + } + else if (*s == '\0' || iscntrl(uchar(*s))) { + char buff[10]; + if (!isdigit(uchar(*(s+1)))) + sprintf(buff, "\\%d", (int)uchar(*s)); + else + sprintf(buff, "\\%03d", (int)uchar(*s)); + luaL_addstring(b, buff); + } + else + luaL_addchar(b, *s); + s++; + } + luaL_addchar(b, '"'); +} + +static const char *scanformat (lua_State *L, const char *strfrmt, char *form) { + const char *p = strfrmt; + while (*p != '\0' && strchr(FLAGS, *p) != NULL) p++; /* skip flags */ + if ((size_t)(p - strfrmt) >= sizeof(FLAGS)/sizeof(char)) + luaL_error(L, "invalid format (repeated flags)"); + if (isdigit(uchar(*p))) p++; /* skip width */ + if (isdigit(uchar(*p))) p++; /* (2 digits 
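addquoted() above implements string.format's '%q': quotes, backslashes and newlines get a backslash, and control bytes become decimal escapes, zero-padded only when the following byte is itself a digit (so the read-back cannot absorb it). A standalone sketch of the same rules for NUL-terminated input; the original works on counted strings, so embedded '\0' bytes are escaped as well:

#include <ctype.h>
#include <stdio.h>

static void quote(const char *s, FILE *out) {
  fputc('"', out);
  for (; *s; s++) {
    if (*s == '"' || *s == '\\' || *s == '\n') {
      fputc('\\', out);
      fputc(*s, out);
    }
    else if (iscntrl((unsigned char)*s))
      fprintf(out, isdigit((unsigned char)s[1]) ? "\\%03d" : "\\%d",
              (unsigned char)*s);
    else
      fputc(*s, out);
  }
  fputc('"', out);
}

int main(void) {
  quote("say \"hi\"\t1 then\n", stdout);  /* the \9 is padded to \009
                                             because '1' follows */
  return 0;
}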
at most) */ + if (*p == '.') { + p++; + if (isdigit(uchar(*p))) p++; /* skip precision */ + if (isdigit(uchar(*p))) p++; /* (2 digits at most) */ + } + if (isdigit(uchar(*p))) + luaL_error(L, "invalid format (width or precision too long)"); + *(form++) = '%'; + memcpy(form, strfrmt, (p - strfrmt + 1) * sizeof(char)); + form += p - strfrmt + 1; + *form = '\0'; + return p; +} + + +/* +** add length modifier into formats +*/ +static void addlenmod (char *form, const char *lenmod) { + size_t l = strlen(form); + size_t lm = strlen(lenmod); + char spec = form[l - 1]; + strcpy(form + l - 1, lenmod); + form[l + lm - 1] = spec; + form[l + lm] = '\0'; +} + + +static int str_format (lua_State *L) { + int top = lua_gettop(L); + int arg = 1; + size_t sfl; + const char *strfrmt = luaL_checklstring(L, arg, &sfl); + const char *strfrmt_end = strfrmt+sfl; + luaL_Buffer b; + luaL_buffinit(L, &b); + while (strfrmt < strfrmt_end) { + if (*strfrmt != L_ESC) + luaL_addchar(&b, *strfrmt++); + else if (*++strfrmt == L_ESC) + luaL_addchar(&b, *strfrmt++); /* %% */ + else { /* format item */ + char form[MAX_FORMAT]; /* to store the format (`%...') */ + char *buff = luaL_prepbuffsize(&b, MAX_ITEM); /* to put formatted item */ + int nb = 0; /* number of bytes in added item */ + if (++arg > top) + luaL_argerror(L, arg, "no value"); + strfrmt = scanformat(L, strfrmt, form); + switch (*strfrmt++) { + case 'c': { + nb = str_sprintf(buff, form, luaL_checkint(L, arg)); + break; + } + case 'd': case 'i': { + lua_Number n = luaL_checknumber(L, arg); + LUA_INTFRM_T ni = (LUA_INTFRM_T)n; + lua_Number diff = n - (lua_Number)ni; + luaL_argcheck(L, -1 < diff && diff < 1, arg, + "not a number in proper range"); + addlenmod(form, LUA_INTFRMLEN); + nb = str_sprintf(buff, form, ni); + break; + } + case 'o': case 'u': case 'x': case 'X': { + lua_Number n = luaL_checknumber(L, arg); + unsigned LUA_INTFRM_T ni = (unsigned LUA_INTFRM_T)n; + lua_Number diff = n - (lua_Number)ni; + luaL_argcheck(L, -1 < diff && diff < 1, arg, + "not a non-negative number in proper range"); + addlenmod(form, LUA_INTFRMLEN); + nb = str_sprintf(buff, form, ni); + break; + } +#if defined(LUA_USE_FLOAT_FORMATS) + case 'e': case 'E': case 'f': +#if defined(LUA_USE_AFORMAT) + case 'a': case 'A': +#endif + case 'g': case 'G': { + addlenmod(form, LUA_FLTFRMLEN); + nb = str_sprintf(buff, form, (LUA_FLTFRM_T)luaL_checknumber(L, arg)); + break; + } +#endif + case 'q': { + addquoted(L, &b, arg); + break; + } + case 's': { + size_t l; + const char *s = luaL_tolstring(L, arg, &l); + if (!strchr(form, '.') && l >= 100) { + /* no precision and string is too long to be formatted; + keep original string */ + luaL_addvalue(&b); + break; + } + else { + nb = str_sprintf(buff, form, s); + lua_pop(L, 1); /* remove result from 'luaL_tolstring' */ + break; + } + } + default: { /* also treat cases `pnLlh' */ + return luaL_error(L, "invalid option " LUA_QL("%%%c") " to " + LUA_QL("format"), *(strfrmt - 1)); + } + } + luaL_addsize(&b, nb); + } + } + luaL_pushresult(&b); + return 1; +} + +/* }====================================================== */ + + +static const luaL_Reg strlib[] = { + {"byte", str_byte}, + {"char", str_char}, + {"dump", str_dump}, + {"find", str_find}, + {"format", str_format}, + {"gmatch", str_gmatch}, + {"gsub", str_gsub}, + {"len", str_len}, + {"lower", str_lower}, + {"match", str_match}, + {"rep", str_rep}, + {"reverse", str_reverse}, + {"sub", str_sub}, + {"upper", str_upper}, + {NULL, NULL} +}; + + +static void createmetatable (lua_State *L) { + 
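  /* one shared metatable serves every string: setting it on a single
     dummy string installs it for the whole string type, and pointing
     its __index at the string library is what makes s:len(), s:sub()
     and friends resolve */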
lua_createtable(L, 0, 1); /* table to be metatable for strings */ + lua_pushliteral(L, ""); /* dummy string */ + lua_pushvalue(L, -2); /* copy table */ + lua_setmetatable(L, -2); /* set table as metatable for strings */ + lua_pop(L, 1); /* pop dummy string */ + lua_pushvalue(L, -2); /* get string library */ + lua_setfield(L, -2, "__index"); /* metatable.__index = string */ + lua_pop(L, 1); /* pop metatable */ +} + + +/* +** Open string library +*/ +LUAMOD_API int luaopen_string (lua_State *L) { + luaL_newlib(L, strlib); + createmetatable(L); + return 1; +} + diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltable.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltable.c new file mode 100644 index 000000000000..4f8ab1b16733 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltable.c @@ -0,0 +1,589 @@ +/* +** $Id: ltable.c,v 2.72.1.1 2013/04/12 18:48:47 roberto Exp $ +** Lua tables (hash) +** See Copyright Notice in lua.h +*/ + + +/* +** Implementation of tables (aka arrays, objects, or hash tables). +** Tables keep its elements in two parts: an array part and a hash part. +** Non-negative integer keys are all candidates to be kept in the array +** part. The actual size of the array is the largest `n' such that at +** least half the slots between 0 and n are in use. +** Hash uses a mix of chained scatter table with Brent's variation. +** A main invariant of these tables is that, if an element is not +** in its main position (i.e. the `original' position that its hash gives +** to it), then the colliding element is in its own main position. +** Hence even when the load factor reaches 100%, performance remains good. +*/ + +#include <sys/zfs_context.h> + +#define ltable_c +#define LUA_CORE + +#include "lua.h" + +#include "ldebug.h" +#include "ldo.h" +#include "lgc.h" +#include "lmem.h" +#include "lobject.h" +#include "lstate.h" +#include "lstring.h" +#include "ltable.h" +#include "lvm.h" + + +/* +** max size of array part is 2^MAXBITS +*/ +#if LUAI_BITSINT >= 32 +#define MAXBITS 30 +#else +#define MAXBITS (LUAI_BITSINT-2) +#endif + +#define MAXASIZE (1 << MAXBITS) + + +#define hashpow2(t,n) (gnode(t, lmod((n), sizenode(t)))) + +#define hashstr(t,str) hashpow2(t, (str)->tsv.hash) +#define hashboolean(t,p) hashpow2(t, p) + + +/* +** for some types, it is better to avoid modulus by power of 2, as +** they tend to have many 2 factors. +*/ +#define hashmod(t,n) (gnode(t, ((n) % ((sizenode(t)-1)|1)))) + + +#define hashpointer(t,p) hashmod(t, IntPoint(p)) + + +#define dummynode (&dummynode_) + +#define isdummy(n) ((n) == dummynode) + +static const Node dummynode_ = { + {NILCONSTANT}, /* value */ + {{NILCONSTANT, NULL}} /* key */ +}; + + +/* +** hash for lua_Numbers +*/ +static Node *hashnum (const Table *t, lua_Number n) { + int i; + luai_hashnum(i, n); + if (i < 0) { + if (cast(unsigned int, i) == 0u - i) /* use unsigned to avoid overflows */ + i = 0; /* handle INT_MIN */ + i = -i; /* must be a positive value */ + } + return hashmod(t, i); +} + + + +/* +** returns the `main' position of an element in a table (that is, the index +** of its hash value) +*/ +static Node *mainposition (const Table *t, const TValue *key) { + switch (ttype(key)) { + case LUA_TNUMBER: + return hashnum(t, nvalue(key)); + case LUA_TLNGSTR: { + TString *s = rawtsvalue(key); + if (s->tsv.extra == 0) { /* no hash? 
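The hashmod() macro above reduces keys modulo ((sizenode(t)-1)|1), the largest odd number below the power-of-two node count, because pointers and other values with many factors of 2 would all collide under a power-of-two modulus. A small self-contained illustration with a hypothetical 16-slot node array:

#include <stdio.h>

int main(void) {
  unsigned size = 16;                      /* power-of-two hash size */
  for (unsigned k = 0; k < 5; k++) {
    unsigned ptr = 0x1000 + k * 16;        /* 16-byte aligned addresses */
    printf("pow2 slot %2u   odd slot %2u\n",
           ptr % size,                     /* 0,0,0,0,0: all collide */
           ptr % ((size - 1) | 1));        /* 1,2,3,4,5: spread out */
  }
  return 0;
}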
*/ + s->tsv.hash = luaS_hash(getstr(s), s->tsv.len, s->tsv.hash); + s->tsv.extra = 1; /* now it has its hash */ + } + return hashstr(t, rawtsvalue(key)); + } + case LUA_TSHRSTR: + return hashstr(t, rawtsvalue(key)); + case LUA_TBOOLEAN: + return hashboolean(t, bvalue(key)); + case LUA_TLIGHTUSERDATA: + return hashpointer(t, pvalue(key)); + case LUA_TLCF: + return hashpointer(t, fvalue(key)); + default: + return hashpointer(t, gcvalue(key)); + } +} + + +/* +** returns the index for `key' if `key' is an appropriate key to live in +** the array part of the table, -1 otherwise. +*/ +static int arrayindex (const TValue *key) { + if (ttisnumber(key)) { + lua_Number n = nvalue(key); + int k; + lua_number2int(k, n); + if (luai_numeq(cast_num(k), n)) + return k; + } + return -1; /* `key' did not match some condition */ +} + + +/* +** returns the index of a `key' for table traversals. First goes all +** elements in the array part, then elements in the hash part. The +** beginning of a traversal is signaled by -1. +*/ +static int findindex (lua_State *L, Table *t, StkId key) { + int i; + if (ttisnil(key)) return -1; /* first iteration */ + i = arrayindex(key); + if (0 < i && i <= t->sizearray) /* is `key' inside array part? */ + return i-1; /* yes; that's the index (corrected to C) */ + else { + Node *n = mainposition(t, key); + for (;;) { /* check whether `key' is somewhere in the chain */ + /* key may be dead already, but it is ok to use it in `next' */ + if (luaV_rawequalobj(gkey(n), key) || + (ttisdeadkey(gkey(n)) && iscollectable(key) && + deadvalue(gkey(n)) == gcvalue(key))) { + i = cast_int(n - gnode(t, 0)); /* key index in hash table */ + /* hash elements are numbered after array ones */ + return i + t->sizearray; + } + else n = gnext(n); + if (n == NULL) + luaG_runerror(L, "invalid key to " LUA_QL("next")); /* key not found */ + } + } +} + + +int luaH_next (lua_State *L, Table *t, StkId key) { + int i = findindex(L, t, key); /* find original element */ + for (i++; i < t->sizearray; i++) { /* try first array part */ + if (!ttisnil(&t->array[i])) { /* a non-nil value? */ + setnvalue(key, cast_num(i+1)); + setobj2s(L, key+1, &t->array[i]); + return 1; + } + } + for (i -= t->sizearray; i < sizenode(t); i++) { /* then hash part */ + if (!ttisnil(gval(gnode(t, i)))) { /* a non-nil value? */ + setobj2s(L, key, gkey(gnode(t, i))); + setobj2s(L, key+1, gval(gnode(t, i))); + return 1; + } + } + return 0; /* no more elements */ +} + + +/* +** {============================================================= +** Rehash +** ============================================================== +*/ + + +static int computesizes (int nums[], int *narray) { + int i; + int twotoi; /* 2^i */ + int a = 0; /* number of elements smaller than 2^i */ + int na = 0; /* number of elements to go to array part */ + int n = 0; /* optimal size for array part */ + for (i = 0, twotoi = 1; twotoi/2 < *narray; i++, twotoi *= 2) { + if (nums[i] > 0) { + a += nums[i]; + if (a > twotoi/2) { /* more than half elements present? */ + n = twotoi; /* optimal size (till now) */ + na = a; /* all elements smaller than n will go to array part */ + } + } + if (a == *narray) break; /* all elements already counted */ + } + *narray = n; + lua_assert(*narray/2 <= na && na <= *narray); + return na; +} + + +static int countint (const TValue *key, int *nums) { + int k = arrayindex(key); + if (0 < k && k <= MAXASIZE) { /* is `key' an appropriate array index? 
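computesizes() above chooses the array part as the largest power of two n such that more than half of the slots 1..n hold keys. A trimmed standalone trace of that rule (early-exit conditions dropped) for the made-up key set {1, 2, 3, 5, 100}:

#include <stdio.h>

int main(void) {
  /* nums[i] = number of keys k with 2^(i-1) < k <= 2^i, as in rehash() */
  int nums[8] = { 1, 1, 1, 1, 0, 0, 0, 1 };  /* keys 1,2,3,5 and 100 */
  int a = 0, na = 0, n = 0;
  for (int i = 0, twotoi = 1; i < 8; i++, twotoi *= 2) {
    a += nums[i];                   /* keys no larger than 2^i so far */
    if (nums[i] > 0 && a > twotoi / 2) {
      n = twotoi;                   /* still more than half full */
      na = a;
    }
  }
  printf("array part size %d, holding %d keys\n", n, na);  /* 4 and 3:
      1,2,3 go to the array part; 5 and 100 go to the hash part */
  return 0;
}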
*/ + nums[luaO_ceillog2(k)]++; /* count as such */ + return 1; + } + else + return 0; +} + + +static int numusearray (const Table *t, int *nums) { + int lg; + int ttlg; /* 2^lg */ + int ause = 0; /* summation of `nums' */ + int i = 1; /* count to traverse all array keys */ + for (lg=0, ttlg=1; lg<=MAXBITS; lg++, ttlg*=2) { /* for each slice */ + int lc = 0; /* counter */ + int lim = ttlg; + if (lim > t->sizearray) { + lim = t->sizearray; /* adjust upper limit */ + if (i > lim) + break; /* no more elements to count */ + } + /* count elements in range (2^(lg-1), 2^lg] */ + for (; i <= lim; i++) { + if (!ttisnil(&t->array[i-1])) + lc++; + } + nums[lg] += lc; + ause += lc; + } + return ause; +} + + +static int numusehash (const Table *t, int *nums, int *pnasize) { + int totaluse = 0; /* total number of elements */ + int ause = 0; /* summation of `nums' */ + int i = sizenode(t); + while (i--) { + Node *n = &t->node[i]; + if (!ttisnil(gval(n))) { + ause += countint(gkey(n), nums); + totaluse++; + } + } + *pnasize += ause; + return totaluse; +} + + +static void setarrayvector (lua_State *L, Table *t, int size) { + int i; + luaM_reallocvector(L, t->array, t->sizearray, size, TValue); + for (i=t->sizearray; i<size; i++) + setnilvalue(&t->array[i]); + t->sizearray = size; +} + + +static void setnodevector (lua_State *L, Table *t, int size) { + int lsize; + if (size == 0) { /* no elements to hash part? */ + t->node = cast(Node *, dummynode); /* use common `dummynode' */ + lsize = 0; + } + else { + int i; + lsize = luaO_ceillog2(size); + if (lsize > MAXBITS) + luaG_runerror(L, "table overflow"); + size = twoto(lsize); + t->node = luaM_newvector(L, size, Node); + for (i=0; i<size; i++) { + Node *n = gnode(t, i); + gnext(n) = NULL; + setnilvalue(gkey(n)); + setnilvalue(gval(n)); + } + } + t->lsizenode = cast_byte(lsize); + t->lastfree = gnode(t, size); /* all positions are free */ +} + + +void luaH_resize (lua_State *L, Table *t, int nasize, int nhsize) { + int i; + int oldasize = t->sizearray; + int oldhsize = t->lsizenode; + Node *nold = t->node; /* save old hash ... */ + if (nasize > oldasize) /* array part must grow? */ + setarrayvector(L, t, nasize); + /* create new hash part with appropriate size */ + setnodevector(L, t, nhsize); + if (nasize < oldasize) { /* array part must shrink? */ + t->sizearray = nasize; + /* re-insert elements from vanishing slice */ + for (i=nasize; i<oldasize; i++) { + if (!ttisnil(&t->array[i])) + luaH_setint(L, t, i + 1, &t->array[i]); + } + /* shrink array */ + luaM_reallocvector(L, t->array, oldasize, nasize, TValue); + } + /* re-insert elements from hash part */ + for (i = twoto(oldhsize) - 1; i >= 0; i--) { + Node *old = nold+i; + if (!ttisnil(gval(old))) { + /* doesn't need barrier/invalidate cache, as entry was + already present in the table */ + setobjt2t(L, luaH_set(L, t, gkey(old)), gval(old)); + } + } + if (!isdummy(nold)) + luaM_freearray(L, nold, cast(size_t, twoto(oldhsize))); /* free old array */ +} + + +void luaH_resizearray (lua_State *L, Table *t, int nasize) { + int nsize = isdummy(t->node) ? 
0 : sizenode(t); + luaH_resize(L, t, nasize, nsize); +} + + +static void rehash (lua_State *L, Table *t, const TValue *ek) { + int nasize, na; + int nums[MAXBITS+1]; /* nums[i] = number of keys with 2^(i-1) < k <= 2^i */ + int i; + int totaluse; + for (i=0; i<=MAXBITS; i++) nums[i] = 0; /* reset counts */ + nasize = numusearray(t, nums); /* count keys in array part */ + totaluse = nasize; /* all those keys are integer keys */ + totaluse += numusehash(t, nums, &nasize); /* count keys in hash part */ + /* count extra key */ + nasize += countint(ek, nums); + totaluse++; + /* compute new size for array part */ + na = computesizes(nums, &nasize); + /* resize the table to new computed sizes */ + luaH_resize(L, t, nasize, totaluse - na); +} + + + +/* +** }============================================================= +*/ + + +Table *luaH_new (lua_State *L) { + Table *t = &luaC_newobj(L, LUA_TTABLE, sizeof(Table), NULL, 0)->h; + t->metatable = NULL; + t->flags = cast_byte(~0); + t->array = NULL; + t->sizearray = 0; + setnodevector(L, t, 0); + return t; +} + + +void luaH_free (lua_State *L, Table *t) { + if (!isdummy(t->node)) + luaM_freearray(L, t->node, cast(size_t, sizenode(t))); + luaM_freearray(L, t->array, t->sizearray); + luaM_free(L, t); +} + + +static Node *getfreepos (Table *t) { + while (t->lastfree > t->node) { + t->lastfree--; + if (ttisnil(gkey(t->lastfree))) + return t->lastfree; + } + return NULL; /* could not find a free place */ +} + + + +/* +** inserts a new key into a hash table; first, check whether key's main +** position is free. If not, check whether colliding node is in its main +** position or not: if it is not, move colliding node to an empty place and +** put new key in its main position; otherwise (colliding node is in its main +** position), new key goes to an empty position. +*/ +TValue *luaH_newkey (lua_State *L, Table *t, const TValue *key) { + Node *mp; + if (ttisnil(key)) luaG_runerror(L, "table index is nil"); + else if (ttisnumber(key) && luai_numisnan(L, nvalue(key))) + luaG_runerror(L, "table index is NaN"); + mp = mainposition(t, key); + if (!ttisnil(gval(mp)) || isdummy(mp)) { /* main position is taken? */ + Node *othern; + Node *n = getfreepos(t); /* get a free place */ + if (n == NULL) { /* cannot find a free place? */ + rehash(L, t, key); /* grow table */ + /* whatever called 'newkey' take care of TM cache and GC barrier */ + return luaH_set(L, t, key); /* insert key into grown table */ + } + lua_assert(!isdummy(n)); + othern = mainposition(t, gkey(mp)); + if (othern != mp) { /* is colliding node out of its main position? */ + /* yes; move colliding node into free position */ + while (gnext(othern) != mp) othern = gnext(othern); /* find previous */ + gnext(othern) = n; /* redo the chain with `n' in place of `mp' */ + *n = *mp; /* copy colliding node into free pos. 
(mp->next also goes) */ + gnext(mp) = NULL; /* now `mp' is free */ + setnilvalue(gval(mp)); + } + else { /* colliding node is in its own main position */ + /* new node will go into free position */ + gnext(n) = gnext(mp); /* chain new position */ + gnext(mp) = n; + mp = n; + } + } + setobj2t(L, gkey(mp), key); + luaC_barrierback(L, obj2gco(t), key); + lua_assert(ttisnil(gval(mp))); + return gval(mp); +} + + +/* +** search function for integers +*/ +const TValue *luaH_getint (Table *t, int key) { + /* (1 <= key && key <= t->sizearray) */ + if (cast(unsigned int, key-1) < cast(unsigned int, t->sizearray)) + return &t->array[key-1]; + else { + lua_Number nk = cast_num(key); + Node *n = hashnum(t, nk); + do { /* check whether `key' is somewhere in the chain */ + if (ttisnumber(gkey(n)) && luai_numeq(nvalue(gkey(n)), nk)) + return gval(n); /* that's it */ + else n = gnext(n); + } while (n); + return luaO_nilobject; + } +} + + +/* +** search function for short strings +*/ +const TValue *luaH_getstr (Table *t, TString *key) { + Node *n = hashstr(t, key); + lua_assert(key->tsv.tt == LUA_TSHRSTR); + do { /* check whether `key' is somewhere in the chain */ + if (ttisshrstring(gkey(n)) && eqshrstr(rawtsvalue(gkey(n)), key)) + return gval(n); /* that's it */ + else n = gnext(n); + } while (n); + return luaO_nilobject; +} + + +/* +** main search function +*/ +const TValue *luaH_get (Table *t, const TValue *key) { + switch (ttype(key)) { + case LUA_TSHRSTR: return luaH_getstr(t, rawtsvalue(key)); + case LUA_TNIL: return luaO_nilobject; + case LUA_TNUMBER: { + int k; + lua_Number n = nvalue(key); + lua_number2int(k, n); + if (luai_numeq(cast_num(k), n)) /* index is int? */ + return luaH_getint(t, k); /* use specialized version */ + /* else go through */ + } + /* FALLTHROUGH */ + default: { + Node *n = mainposition(t, key); + do { /* check whether `key' is somewhere in the chain */ + if (luaV_rawequalobj(gkey(n), key)) + return gval(n); /* that's it */ + else n = gnext(n); + } while (n); + return luaO_nilobject; + } + } +} + + +/* +** beware: when using this function you probably need to check a GC +** barrier and invalidate the TM cache. +*/ +TValue *luaH_set (lua_State *L, Table *t, const TValue *key) { + const TValue *p = luaH_get(t, key); + if (p != luaO_nilobject) + return cast(TValue *, p); + else return luaH_newkey(L, t, key); +} + + +void luaH_setint (lua_State *L, Table *t, int key, TValue *value) { + const TValue *p = luaH_getint(t, key); + TValue *cell; + if (p != luaO_nilobject) + cell = cast(TValue *, p); + else { + TValue k; + setnvalue(&k, cast_num(key)); + cell = luaH_newkey(L, t, &k); + } + setobj2t(L, cell, value); +} + + +static int unbound_search (Table *t, unsigned int j) { + unsigned int i = j; /* i is zero or a present index */ + j++; + /* find `i' and `j' such that i is present and j is not */ + while (!ttisnil(luaH_getint(t, j))) { + i = j; + j *= 2; + if (j > cast(unsigned int, MAX_INT)) { /* overflow? */ + /* table was built with bad purposes: resort to linear search */ + i = 1; + while (!ttisnil(luaH_getint(t, i))) i++; + return i - 1; + } + } + /* now do a binary search between them */ + while (j - i > 1) { + unsigned int m = (i+j)/2; + if (ttisnil(luaH_getint(t, m))) j = m; + else i = m; + } + return i; +} + + +/* +** Try to find a boundary in table `t'. A `boundary' is an integer index +** such that t[i] is non-nil and t[i+1] is nil (and 0 if t[1] is nil). 
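** (Worked example for the code below, not from the original comment:
** with sizearray == 4 holding {1,2,3,nil}, t[4] is nil, so the
** bisection returns 3; when the array part has no nil border but a
** hash part exists, unbound_search() doubles j until t[j] is nil and
** then bisects, so '#t' is a logarithmic probe for *some* boundary,
** not a count of elements.)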
+*/ +int luaH_getn (Table *t) { + unsigned int j = t->sizearray; + if (j > 0 && ttisnil(&t->array[j - 1])) { + /* there is a boundary in the array part: (binary) search for it */ + unsigned int i = 0; + while (j - i > 1) { + unsigned int m = (i+j)/2; + if (ttisnil(&t->array[m - 1])) j = m; + else i = m; + } + return i; + } + /* else must find a boundary in hash part */ + else if (isdummy(t->node)) /* hash part is empty? */ + return j; /* that is easy... */ + else return unbound_search(t, j); +} + + + +#if defined(LUA_DEBUG) + +Node *luaH_mainposition (const Table *t, const TValue *key) { + return mainposition(t, key); +} + +int luaH_isdummy (Node *n) { return isdummy(n); } + +#endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltable.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltable.h new file mode 100644 index 000000000000..d69449b2b863 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltable.h @@ -0,0 +1,45 @@ +/* +** $Id: ltable.h,v 2.16.1.2 2013/08/30 15:49:41 roberto Exp $ +** Lua tables (hash) +** See Copyright Notice in lua.h +*/ + +#ifndef ltable_h +#define ltable_h + +#include "lobject.h" + + +#define gnode(t,i) (&(t)->node[i]) +#define gkey(n) (&(n)->i_key.tvk) +#define gval(n) (&(n)->i_val) +#define gnext(n) ((n)->i_key.nk.next) + +#define invalidateTMcache(t) ((t)->flags = 0) + +/* returns the key, given the value of a table entry */ +#define keyfromval(v) \ + (gkey(cast(Node *, cast(char *, (v)) - offsetof(Node, i_val)))) + + +LUAI_FUNC const TValue *luaH_getint (Table *t, int key); +LUAI_FUNC void luaH_setint (lua_State *L, Table *t, int key, TValue *value); +LUAI_FUNC const TValue *luaH_getstr (Table *t, TString *key); +LUAI_FUNC const TValue *luaH_get (Table *t, const TValue *key); +LUAI_FUNC TValue *luaH_newkey (lua_State *L, Table *t, const TValue *key); +LUAI_FUNC TValue *luaH_set (lua_State *L, Table *t, const TValue *key); +LUAI_FUNC Table *luaH_new (lua_State *L); +LUAI_FUNC void luaH_resize (lua_State *L, Table *t, int nasize, int nhsize); +LUAI_FUNC void luaH_resizearray (lua_State *L, Table *t, int nasize); +LUAI_FUNC void luaH_free (lua_State *L, Table *t); +LUAI_FUNC int luaH_next (lua_State *L, Table *t, StkId key); +LUAI_FUNC int luaH_getn (Table *t); + + +#if defined(LUA_DEBUG) +LUAI_FUNC Node *luaH_mainposition (const Table *t, const TValue *key); +LUAI_FUNC int luaH_isdummy (Node *n); +#endif + + +#endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltablib.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltablib.c new file mode 100644 index 000000000000..ac9a662448fa --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltablib.c @@ -0,0 +1,284 @@ +/* +** $Id: ltablib.c,v 1.65.1.2 2014/05/07 16:32:55 roberto Exp $ +** Library for Table Manipulation +** See Copyright Notice in lua.h +*/ + + +#include <sys/zfs_context.h> + +#define ltablib_c +#define LUA_LIB + +#include "lua.h" + +#include "lauxlib.h" +#include "lualib.h" + + +#define aux_getn(L,n) (luaL_checktype(L, n, LUA_TTABLE), luaL_len(L, n)) + + + +#if defined(LUA_COMPAT_MAXN) +static int maxn (lua_State *L) { + lua_Number max = 0; + luaL_checktype(L, 1, LUA_TTABLE); + lua_pushnil(L); /* first key */ + while (lua_next(L, 1)) { + lua_pop(L, 1); /* remove value */ + if (lua_type(L, -1) == LUA_TNUMBER) { + lua_Number v = lua_tonumber(L, -1); + if (v > max) max = v; + } + } + lua_pushnumber(L, max); + return 1; +} +#endif + + +static int tinsert (lua_State *L) { + int e = aux_getn(L, 1) + 1; /* first empty element 
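keyfromval() in ltable.h above is the classic container-of idiom: from a pointer to a Node's i_val member, subtract offsetof(Node, i_val) to recover the Node, whose key sits alongside. The same trick in a self-contained form:

#include <stddef.h>
#include <stdio.h>

struct pair { int key; int val; };

/* recover the enclosing struct from a pointer to its 'val' member,
   just as keyfromval() recovers a Node from a TValue pointer */
#define pair_from_val(v) \
  ((struct pair *)((char *)(v) - offsetof(struct pair, val)))

int main(void) {
  struct pair p = { 7, 42 };
  int *v = &p.val;
  printf("value %d belongs to key %d\n", *v, pair_from_val(v)->key);
  return 0;
}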
*/ + int pos; /* where to insert new element */ + switch (lua_gettop(L)) { + case 2: { /* called with only 2 arguments */ + pos = e; /* insert new element at the end */ + break; + } + case 3: { + int i; + pos = luaL_checkint(L, 2); /* 2nd argument is the position */ + luaL_argcheck(L, 1 <= pos && pos <= e, 2, "position out of bounds"); + for (i = e; i > pos; i--) { /* move up elements */ + lua_rawgeti(L, 1, i-1); + lua_rawseti(L, 1, i); /* t[i] = t[i-1] */ + } + break; + } + default: { + return luaL_error(L, "wrong number of arguments to " LUA_QL("insert")); + } + } + lua_rawseti(L, 1, pos); /* t[pos] = v */ + return 0; +} + + +static int tremove (lua_State *L) { + int size = aux_getn(L, 1); + int pos = luaL_optint(L, 2, size); + if (pos != size) /* validate 'pos' if given */ + luaL_argcheck(L, 1 <= pos && pos <= size + 1, 1, "position out of bounds"); + lua_rawgeti(L, 1, pos); /* result = t[pos] */ + for ( ; pos < size; pos++) { + lua_rawgeti(L, 1, pos+1); + lua_rawseti(L, 1, pos); /* t[pos] = t[pos+1] */ + } + lua_pushnil(L); + lua_rawseti(L, 1, pos); /* t[pos] = nil */ + return 1; +} + + +static void addfield (lua_State *L, luaL_Buffer *b, int i) { + lua_rawgeti(L, 1, i); + if (!lua_isstring(L, -1)) + luaL_error(L, "invalid value (%s) at index %d in table for " + LUA_QL("concat"), luaL_typename(L, -1), i); + luaL_addvalue(b); +} + + +static int tconcat (lua_State *L) { + luaL_Buffer b; + size_t lsep; + int i, last; + const char *sep = luaL_optlstring(L, 2, "", &lsep); + luaL_checktype(L, 1, LUA_TTABLE); + i = luaL_optint(L, 3, 1); + last = luaL_opt(L, luaL_checkint, 4, luaL_len(L, 1)); + luaL_buffinit(L, &b); + for (; i < last; i++) { + addfield(L, &b, i); + luaL_addlstring(&b, sep, lsep); + } + if (i == last) /* add last value (if interval was not empty) */ + addfield(L, &b, i); + luaL_pushresult(&b); + return 1; +} + + +/* +** {====================================================== +** Pack/unpack +** ======================================================= +*/ + +static int pack (lua_State *L) { + int n = lua_gettop(L); /* number of elements to pack */ + lua_createtable(L, n, 1); /* create result table */ + lua_pushinteger(L, n); + lua_setfield(L, -2, "n"); /* t.n = number of elements */ + if (n > 0) { /* at least one element? */ + int i; + lua_pushvalue(L, 1); + lua_rawseti(L, -2, 1); /* insert first element */ + lua_replace(L, 1); /* move table into index 1 */ + for (i = n; i >= 2; i--) /* assign other elements */ + lua_rawseti(L, 1, i); + } + return 1; /* return table */ +} + + +static int unpack (lua_State *L) { + int i, e; + unsigned int n; + luaL_checktype(L, 1, LUA_TTABLE); + i = luaL_optint(L, 2, 1); + e = luaL_opt(L, luaL_checkint, 3, luaL_len(L, 1)); + if (i > e) return 0; /* empty range */ + n = (unsigned int)e - (unsigned int)i; /* number of elements minus 1 */ + if (n > (INT_MAX - 10) || !lua_checkstack(L, ++n)) + return luaL_error(L, "too many results to unpack"); + lua_rawgeti(L, 1, i); /* push arg[i] (avoiding overflow problems) */ + while (i++ < e) /* push arg[i + 1...e] */ + lua_rawgeti(L, 1, i); + return n; +} + +/* }====================================================== */ + + + +/* +** {====================================================== +** Quicksort +** (based on `Algorithms in MODULA-3', Robert Sedgewick; +** Addison-Wesley, 1993.) 
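** (Overview of auxsort() below: it sorts a[l], a[(l+u)/2] and a[u],
** parks the median at a[u-1] as the pivot, partitions l+1..u-2, then
** recurses only into the smaller half and loops on the larger, so
** C-stack depth stays O(log n) regardless of input order.)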
+** ======================================================= +*/ + + +static void set2 (lua_State *L, int i, int j) { + lua_rawseti(L, 1, i); + lua_rawseti(L, 1, j); +} + +static int sort_comp (lua_State *L, int a, int b) { + if (!lua_isnil(L, 2)) { /* function? */ + int res; + lua_pushvalue(L, 2); + lua_pushvalue(L, a-1); /* -1 to compensate function */ + lua_pushvalue(L, b-2); /* -2 to compensate function and `a' */ + lua_call(L, 2, 1); + res = lua_toboolean(L, -1); + lua_pop(L, 1); + return res; + } + else /* a < b? */ + return lua_compare(L, a, b, LUA_OPLT); +} + +static void auxsort (lua_State *L, int l, int u) { + while (l < u) { /* for tail recursion */ + int i, j; + /* sort elements a[l], a[(l+u)/2] and a[u] */ + lua_rawgeti(L, 1, l); + lua_rawgeti(L, 1, u); + if (sort_comp(L, -1, -2)) /* a[u] < a[l]? */ + set2(L, l, u); /* swap a[l] - a[u] */ + else + lua_pop(L, 2); + if (u-l == 1) break; /* only 2 elements */ + i = (l+u)/2; + lua_rawgeti(L, 1, i); + lua_rawgeti(L, 1, l); + if (sort_comp(L, -2, -1)) /* a[i]<a[l]? */ + set2(L, i, l); + else { + lua_pop(L, 1); /* remove a[l] */ + lua_rawgeti(L, 1, u); + if (sort_comp(L, -1, -2)) /* a[u]<a[i]? */ + set2(L, i, u); + else + lua_pop(L, 2); + } + if (u-l == 2) break; /* only 3 elements */ + lua_rawgeti(L, 1, i); /* Pivot */ + lua_pushvalue(L, -1); + lua_rawgeti(L, 1, u-1); + set2(L, i, u-1); + /* a[l] <= P == a[u-1] <= a[u], only need to sort from l+1 to u-2 */ + i = l; j = u-1; + for (;;) { /* invariant: a[l..i] <= P <= a[j..u] */ + /* repeat ++i until a[i] >= P */ + while (lua_rawgeti(L, 1, ++i), sort_comp(L, -1, -2)) { + if (i>=u) luaL_error(L, "invalid order function for sorting"); + lua_pop(L, 1); /* remove a[i] */ + } + /* repeat --j until a[j] <= P */ + while (lua_rawgeti(L, 1, --j), sort_comp(L, -3, -1)) { + if (j<=l) luaL_error(L, "invalid order function for sorting"); + lua_pop(L, 1); /* remove a[j] */ + } + if (j<i) { + lua_pop(L, 3); /* pop pivot, a[i], a[j] */ + break; + } + set2(L, i, j); + } + lua_rawgeti(L, 1, u-1); + lua_rawgeti(L, 1, i); + set2(L, u-1, i); /* swap pivot (a[u-1]) with a[i] */ + /* a[l..i-1] <= a[i] == P <= a[i+1..u] */ + /* adjust so that smaller half is in [j..i] and larger one in [l..u] */ + if (i-l < u-i) { + j=l; i=i-1; l=i+2; + } + else { + j=i+1; i=u; u=j-2; + } + auxsort(L, j, i); /* call recursively the smaller one */ + } /* repeat the routine for the larger one */ +} + +static int sort (lua_State *L) { + int n = aux_getn(L, 1); + luaL_checkstack(L, 40, ""); /* assume array is smaller than 2^40 */ + if (!lua_isnoneornil(L, 2)) /* is there a 2nd argument? 
*/ + luaL_checktype(L, 2, LUA_TFUNCTION); + lua_settop(L, 2); /* make sure there is two arguments */ + auxsort(L, 1, n); + return 0; +} + +/* }====================================================== */ + + +static const luaL_Reg tab_funcs[] = { + {"concat", tconcat}, +#if defined(LUA_COMPAT_MAXN) + {"maxn", maxn}, +#endif + {"insert", tinsert}, + {"pack", pack}, + {"unpack", unpack}, + {"remove", tremove}, + {"sort", sort}, + {NULL, NULL} +}; + + +LUAMOD_API int luaopen_table (lua_State *L) { + luaL_newlib(L, tab_funcs); +#if defined(LUA_COMPAT_UNPACK) + /* _G.unpack = table.unpack */ + lua_getfield(L, -1, "unpack"); + lua_setglobal(L, "unpack"); +#endif + return 1; +} + diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltm.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltm.c new file mode 100644 index 000000000000..671ff82505b1 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltm.c @@ -0,0 +1,77 @@ +/* +** $Id: ltm.c,v 2.14.1.1 2013/04/12 18:48:47 roberto Exp $ +** Tag methods +** See Copyright Notice in lua.h +*/ + + +#include <sys/zfs_context.h> + +#define ltm_c +#define LUA_CORE + +#include "lua.h" + +#include "lobject.h" +#include "lstate.h" +#include "lstring.h" +#include "ltable.h" +#include "ltm.h" + + +static const char udatatypename[] = "userdata"; + +LUAI_DDEF const char *const luaT_typenames_[LUA_TOTALTAGS] = { + "no value", + "nil", "boolean", udatatypename, "number", + "string", "table", "function", udatatypename, "thread", + "proto", "upval" /* these last two cases are used for tests only */ +}; + + +void luaT_init (lua_State *L) { + static const char *const luaT_eventname[] = { /* ORDER TM */ + "__index", "__newindex", + "__gc", "__mode", "__len", "__eq", + "__add", "__sub", "__mul", "__div", "__mod", + "__pow", "__unm", "__lt", "__le", + "__concat", "__call" + }; + int i; + for (i=0; i<TM_N; i++) { + G(L)->tmname[i] = luaS_new(L, luaT_eventname[i]); + luaS_fix(G(L)->tmname[i]); /* never collect these names */ + } +} + + +/* +** function to be used with macro "fasttm": optimized for absence of +** tag methods +*/ +const TValue *luaT_gettm (Table *events, TMS event, TString *ename) { + const TValue *tm = luaH_getstr(events, ename); + lua_assert(event <= TM_EQ); + if (ttisnil(tm)) { /* no tag method? */ + events->flags |= cast_byte(1u<<event); /* cache this fact */ + return NULL; + } + else return tm; +} + + +const TValue *luaT_gettmbyobj (lua_State *L, const TValue *o, TMS event) { + Table *mt; + switch (ttypenv(o)) { + case LUA_TTABLE: + mt = hvalue(o)->metatable; + break; + case LUA_TUSERDATA: + mt = uvalue(o)->metatable; + break; + default: + mt = G(L)->mt[ttypenv(o)]; + } + return (mt ? 
luaH_getstr(mt, G(L)->tmname[event]) : luaO_nilobject); +} + diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltm.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltm.h new file mode 100644 index 000000000000..7f89c841f9c0 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltm.h @@ -0,0 +1,57 @@ +/* +** $Id: ltm.h,v 2.11.1.1 2013/04/12 18:48:47 roberto Exp $ +** Tag methods +** See Copyright Notice in lua.h +*/ + +#ifndef ltm_h +#define ltm_h + + +#include "lobject.h" + + +/* +* WARNING: if you change the order of this enumeration, +* grep "ORDER TM" +*/ +typedef enum { + TM_INDEX, + TM_NEWINDEX, + TM_GC, + TM_MODE, + TM_LEN, + TM_EQ, /* last tag method with `fast' access */ + TM_ADD, + TM_SUB, + TM_MUL, + TM_DIV, + TM_MOD, + TM_POW, + TM_UNM, + TM_LT, + TM_LE, + TM_CONCAT, + TM_CALL, + TM_N /* number of elements in the enum */ +} TMS; + + + +#define gfasttm(g,et,e) ((et) == NULL ? NULL : \ + ((et)->flags & (1u<<(e))) ? NULL : luaT_gettm(et, e, (g)->tmname[e])) + +#define fasttm(l,et,e) gfasttm(G(l), et, e) + +#define ttypename(x) luaT_typenames_[(x) + 1] +#define objtypename(x) ttypename(ttypenv(x)) + +LUAI_DDEC const char *const luaT_typenames_[LUA_TOTALTAGS]; + + +LUAI_FUNC const TValue *luaT_gettm (Table *events, TMS event, TString *ename); +LUAI_FUNC const TValue *luaT_gettmbyobj (lua_State *L, const TValue *o, + TMS event); +LUAI_FUNC void luaT_init (lua_State *L); + +#endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lua.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lua.h new file mode 100644 index 000000000000..4610dad45ed8 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lua.h @@ -0,0 +1,443 @@ +/* +** $Id: lua.h,v 1.285.1.4 2015/02/21 14:04:50 roberto Exp $ +** Lua - A Scripting Language +** Lua.org, PUC-Rio, Brazil (http://www.lua.org) +** See Copyright Notice at the end of this file +*/ + + +#ifndef lua_h +#define lua_h + +#include <sys/zfs_context.h> + +#include "luaconf.h" + + +#define LUA_VERSION_MAJOR "5" +#define LUA_VERSION_MINOR "2" +#define LUA_VERSION_NUM 502 +#define LUA_VERSION_RELEASE "4" + +#define LUA_VERSION "Lua " LUA_VERSION_MAJOR "." LUA_VERSION_MINOR +#define LUA_RELEASE LUA_VERSION "." LUA_VERSION_RELEASE +#define LUA_COPYRIGHT LUA_RELEASE " Copyright (C) 1994-2015 Lua.org, PUC-Rio" +#define LUA_AUTHORS "R. Ierusalimschy, L. H. de Figueiredo, W. 
Celes" + + +/* mark for precompiled code ('<esc>Lua') */ +#define LUA_SIGNATURE "\033Lua" + +/* option for multiple returns in 'lua_pcall' and 'lua_call' */ +#define LUA_MULTRET (-1) + + +/* +** pseudo-indices +*/ +#define LUA_REGISTRYINDEX LUAI_FIRSTPSEUDOIDX +#define lua_upvalueindex(i) (LUA_REGISTRYINDEX - (i)) + + +/* thread status */ +#define LUA_OK 0 +#define LUA_YIELD 1 +#define LUA_ERRRUN 2 +#define LUA_ERRSYNTAX 3 +#define LUA_ERRMEM 4 +#define LUA_ERRGCMM 5 +#define LUA_ERRERR 6 + + +typedef struct lua_State lua_State; + +typedef int (*lua_CFunction) (lua_State *L); + + +/* +** functions that read/write blocks when loading/dumping Lua chunks +*/ +typedef const char * (*lua_Reader) (lua_State *L, void *ud, size_t *sz); + +typedef int (*lua_Writer) (lua_State *L, const void* p, size_t sz, void* ud); + + +/* +** prototype for memory-allocation functions +*/ +typedef void * (*lua_Alloc) (void *ud, void *ptr, size_t osize, size_t nsize); + + +/* +** basic types +*/ +#define LUA_TNONE (-1) + +#define LUA_TNIL 0 +#define LUA_TBOOLEAN 1 +#define LUA_TLIGHTUSERDATA 2 +#define LUA_TNUMBER 3 +#define LUA_TSTRING 4 +#define LUA_TTABLE 5 +#define LUA_TFUNCTION 6 +#define LUA_TUSERDATA 7 +#define LUA_TTHREAD 8 + +#define LUA_NUMTAGS 9 + + + +/* minimum Lua stack available to a C function */ +#define LUA_MINSTACK 20 + + +/* predefined values in the registry */ +#define LUA_RIDX_MAINTHREAD 1 +#define LUA_RIDX_GLOBALS 2 +#define LUA_RIDX_LAST LUA_RIDX_GLOBALS + + +/* type of numbers in Lua */ +typedef LUA_NUMBER lua_Number; + + +/* type for integer functions */ +typedef LUA_INTEGER lua_Integer; + +/* unsigned integer type */ +typedef LUA_UNSIGNED lua_Unsigned; + + + + +/* +** generic extra include file +*/ +#if defined(LUA_USER_H) +#include LUA_USER_H +#endif + + +/* +** RCS ident string +*/ +extern const char lua_ident[]; + + +/* +** state manipulation +*/ +LUA_API lua_State *(lua_newstate) (lua_Alloc f, void *ud); +LUA_API void (lua_close) (lua_State *L); +LUA_API lua_State *(lua_newthread) (lua_State *L); + +LUA_API lua_CFunction (lua_atpanic) (lua_State *L, lua_CFunction panicf); + + +LUA_API const lua_Number *(lua_version) (lua_State *L); + + +/* +** basic stack manipulation +*/ +LUA_API int (lua_absindex) (lua_State *L, int idx); +LUA_API int (lua_gettop) (lua_State *L); +LUA_API void (lua_settop) (lua_State *L, int idx); +LUA_API void (lua_pushvalue) (lua_State *L, int idx); +LUA_API void (lua_remove) (lua_State *L, int idx); +LUA_API void (lua_insert) (lua_State *L, int idx); +LUA_API void (lua_replace) (lua_State *L, int idx); +LUA_API void (lua_copy) (lua_State *L, int fromidx, int toidx); +LUA_API int (lua_checkstack) (lua_State *L, int sz); + +LUA_API void (lua_xmove) (lua_State *from, lua_State *to, int n); + + +/* +** access functions (stack -> C) +*/ + +LUA_API int (lua_isnumber) (lua_State *L, int idx); +LUA_API int (lua_isstring) (lua_State *L, int idx); +LUA_API int (lua_iscfunction) (lua_State *L, int idx); +LUA_API int (lua_isuserdata) (lua_State *L, int idx); +LUA_API int (lua_type) (lua_State *L, int idx); +LUA_API const char *(lua_typename) (lua_State *L, int tp); + +LUA_API lua_Number (lua_tonumberx) (lua_State *L, int idx, int *isnum); +LUA_API lua_Integer (lua_tointegerx) (lua_State *L, int idx, int *isnum); +LUA_API lua_Unsigned (lua_tounsignedx) (lua_State *L, int idx, int *isnum); +LUA_API int (lua_toboolean) (lua_State *L, int idx); +LUA_API const char *(lua_tolstring) (lua_State *L, int idx, size_t *len); +LUA_API size_t (lua_rawlen) (lua_State *L, int idx); 
+LUA_API lua_CFunction (lua_tocfunction) (lua_State *L, int idx); +LUA_API void *(lua_touserdata) (lua_State *L, int idx); +LUA_API lua_State *(lua_tothread) (lua_State *L, int idx); +LUA_API const void *(lua_topointer) (lua_State *L, int idx); + + +/* +** Comparison and arithmetic functions +*/ + +#define LUA_OPADD 0 /* ORDER TM */ +#define LUA_OPSUB 1 +#define LUA_OPMUL 2 +#define LUA_OPDIV 3 +#define LUA_OPMOD 4 +#define LUA_OPPOW 5 +#define LUA_OPUNM 6 + +LUA_API void (lua_arith) (lua_State *L, int op); + +#define LUA_OPEQ 0 +#define LUA_OPLT 1 +#define LUA_OPLE 2 + +LUA_API int (lua_rawequal) (lua_State *L, int idx1, int idx2); +LUA_API int (lua_compare) (lua_State *L, int idx1, int idx2, int op); + + +/* +** push functions (C -> stack) +*/ +LUA_API void (lua_pushnil) (lua_State *L); +LUA_API void (lua_pushnumber) (lua_State *L, lua_Number n); +LUA_API void (lua_pushinteger) (lua_State *L, lua_Integer n); +LUA_API void (lua_pushunsigned) (lua_State *L, lua_Unsigned n); +LUA_API const char *(lua_pushlstring) (lua_State *L, const char *s, size_t l); +LUA_API const char *(lua_pushstring) (lua_State *L, const char *s); +LUA_API const char *(lua_pushvfstring) (lua_State *L, const char *fmt, + va_list argp); +LUA_API const char *(lua_pushfstring) (lua_State *L, const char *fmt, ...); +LUA_API void (lua_pushcclosure) (lua_State *L, lua_CFunction fn, int n); +LUA_API void (lua_pushboolean) (lua_State *L, int b); +LUA_API void (lua_pushlightuserdata) (lua_State *L, void *p); +LUA_API int (lua_pushthread) (lua_State *L); + + +/* +** get functions (Lua -> stack) +*/ +LUA_API void (lua_getglobal) (lua_State *L, const char *var); +LUA_API void (lua_gettable) (lua_State *L, int idx); +LUA_API void (lua_getfield) (lua_State *L, int idx, const char *k); +LUA_API void (lua_rawget) (lua_State *L, int idx); +LUA_API void (lua_rawgeti) (lua_State *L, int idx, int n); +LUA_API void (lua_rawgetp) (lua_State *L, int idx, const void *p); +LUA_API void (lua_createtable) (lua_State *L, int narr, int nrec); +LUA_API void *(lua_newuserdata) (lua_State *L, size_t sz); +LUA_API int (lua_getmetatable) (lua_State *L, int objindex); +LUA_API void (lua_getuservalue) (lua_State *L, int idx); + + +/* +** set functions (stack -> Lua) +*/ +LUA_API void (lua_setglobal) (lua_State *L, const char *var); +LUA_API void (lua_settable) (lua_State *L, int idx); +LUA_API void (lua_setfield) (lua_State *L, int idx, const char *k); +LUA_API void (lua_rawset) (lua_State *L, int idx); +LUA_API void (lua_rawseti) (lua_State *L, int idx, int n); +LUA_API void (lua_rawsetp) (lua_State *L, int idx, const void *p); +LUA_API int (lua_setmetatable) (lua_State *L, int objindex); +LUA_API void (lua_setuservalue) (lua_State *L, int idx); + + +/* +** 'load' and 'call' functions (load and run Lua code) +*/ +LUA_API void (lua_callk) (lua_State *L, int nargs, int nresults, int ctx, + lua_CFunction k); +#define lua_call(L,n,r) lua_callk(L, (n), (r), 0, NULL) + +LUA_API int (lua_getctx) (lua_State *L, int *ctx); + +LUA_API int (lua_pcallk) (lua_State *L, int nargs, int nresults, int errfunc, + int ctx, lua_CFunction k); +#define lua_pcall(L,n,r,f) lua_pcallk(L, (n), (r), (f), 0, NULL) + +LUA_API int (lua_load) (lua_State *L, lua_Reader reader, void *dt, + const char *chunkname, + const char *mode); + +LUA_API int (lua_dump) (lua_State *L, lua_Writer writer, void *data); + + +/* +** coroutine functions +*/ +LUA_API int (lua_yieldk) (lua_State *L, int nresults, int ctx, + lua_CFunction k); +#define lua_yield(L,n) lua_yieldk(L, (n), 0, NULL) +LUA_API 
int (lua_resume) (lua_State *L, lua_State *from, int narg); +LUA_API int (lua_status) (lua_State *L); + +/* +** garbage-collection function and options +*/ + +#define LUA_GCSTOP 0 +#define LUA_GCRESTART 1 +#define LUA_GCCOLLECT 2 +#define LUA_GCCOUNT 3 +#define LUA_GCCOUNTB 4 +#define LUA_GCSTEP 5 +#define LUA_GCSETPAUSE 6 +#define LUA_GCSETSTEPMUL 7 +#define LUA_GCSETMAJORINC 8 +#define LUA_GCISRUNNING 9 +#define LUA_GCGEN 10 +#define LUA_GCINC 11 + +LUA_API int (lua_gc) (lua_State *L, int what, int data); + + +/* +** miscellaneous functions +*/ + +LUA_API int (lua_error) (lua_State *L); + +LUA_API int (lua_next) (lua_State *L, int idx); + +LUA_API void (lua_concat) (lua_State *L, int n); +LUA_API void (lua_len) (lua_State *L, int idx); + +LUA_API lua_Alloc (lua_getallocf) (lua_State *L, void **ud); +LUA_API void (lua_setallocf) (lua_State *L, lua_Alloc f, void *ud); + + + +/* +** =============================================================== +** some useful macros +** =============================================================== +*/ + +#define lua_tonumber(L,i) lua_tonumberx(L,i,NULL) +#define lua_tointeger(L,i) lua_tointegerx(L,i,NULL) +#define lua_tounsigned(L,i) lua_tounsignedx(L,i,NULL) + +#define lua_pop(L,n) lua_settop(L, -(n)-1) + +#define lua_newtable(L) lua_createtable(L, 0, 0) + +#define lua_register(L,n,f) (lua_pushcfunction(L, (f)), lua_setglobal(L, (n))) + +#define lua_pushcfunction(L,f) lua_pushcclosure(L, (f), 0) + +#define lua_isfunction(L,n) (lua_type(L, (n)) == LUA_TFUNCTION) +#define lua_istable(L,n) (lua_type(L, (n)) == LUA_TTABLE) +#define lua_islightuserdata(L,n) (lua_type(L, (n)) == LUA_TLIGHTUSERDATA) +#define lua_isnil(L,n) (lua_type(L, (n)) == LUA_TNIL) +#define lua_isboolean(L,n) (lua_type(L, (n)) == LUA_TBOOLEAN) +#define lua_isthread(L,n) (lua_type(L, (n)) == LUA_TTHREAD) +#define lua_isnone(L,n) (lua_type(L, (n)) == LUA_TNONE) +#define lua_isnoneornil(L, n) (lua_type(L, (n)) <= 0) + +#define lua_pushliteral(L, s) \ + lua_pushlstring(L, "" s, (sizeof(s)/sizeof(char))-1) + +#define lua_pushglobaltable(L) \ + lua_rawgeti(L, LUA_REGISTRYINDEX, LUA_RIDX_GLOBALS) + +#define lua_tostring(L,i) lua_tolstring(L, (i), NULL) + + + +/* +** {====================================================================== +** Debug API +** ======================================================================= +*/ + + +/* +** Event codes +*/ +#define LUA_HOOKCALL 0 +#define LUA_HOOKRET 1 +#define LUA_HOOKLINE 2 +#define LUA_HOOKCOUNT 3 +#define LUA_HOOKTAILCALL 4 + + +/* +** Event masks +*/ +#define LUA_MASKCALL (1 << LUA_HOOKCALL) +#define LUA_MASKRET (1 << LUA_HOOKRET) +#define LUA_MASKLINE (1 << LUA_HOOKLINE) +#define LUA_MASKCOUNT (1 << LUA_HOOKCOUNT) + +typedef struct lua_Debug lua_Debug; /* activation record */ + + +/* Functions to be called by the debugger in specific events */ +typedef void (*lua_Hook) (lua_State *L, lua_Debug *ar); + + +LUA_API int (lua_getstack) (lua_State *L, int level, lua_Debug *ar); +LUA_API int (lua_getinfo) (lua_State *L, const char *what, lua_Debug *ar); +LUA_API const char *(lua_getlocal) (lua_State *L, const lua_Debug *ar, int n); +LUA_API const char *(lua_setlocal) (lua_State *L, const lua_Debug *ar, int n); +LUA_API const char *(lua_getupvalue) (lua_State *L, int funcindex, int n); +LUA_API const char *(lua_setupvalue) (lua_State *L, int funcindex, int n); + +LUA_API void *(lua_upvalueid) (lua_State *L, int fidx, int n); +LUA_API void (lua_upvaluejoin) (lua_State *L, int fidx1, int n1, + int fidx2, int n2); + +LUA_API int (lua_sethook) 
(lua_State *L, lua_Hook func, int mask, int count); +LUA_API lua_Hook (lua_gethook) (lua_State *L); +LUA_API int (lua_gethookmask) (lua_State *L); +LUA_API int (lua_gethookcount) (lua_State *L); + + +struct lua_Debug { + int event; + const char *name; /* (n) */ + const char *namewhat; /* (n) 'global', 'local', 'field', 'method' */ + const char *what; /* (S) 'Lua', 'C', 'main', 'tail' */ + const char *source; /* (S) */ + int currentline; /* (l) */ + int linedefined; /* (S) */ + int lastlinedefined; /* (S) */ + unsigned char nups; /* (u) number of upvalues */ + unsigned char nparams;/* (u) number of parameters */ + char isvararg; /* (u) */ + char istailcall; /* (t) */ + char short_src[LUA_IDSIZE]; /* (S) */ + /* private part */ + struct CallInfo *i_ci; /* active function */ +}; + +/* }====================================================================== */ + + +/****************************************************************************** +* Copyright (C) 1994-2015 Lua.org, PUC-Rio. +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files (the +* "Software"), to deal in the Software without restriction, including +* without limitation the rights to use, copy, modify, merge, publish, +* distribute, sublicense, and/or sell copies of the Software, and to +* permit persons to whom the Software is furnished to do so, subject to +* the following conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +******************************************************************************/ + + +#endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/luaconf.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/luaconf.h new file mode 100644 index 000000000000..e856eee264ff --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/luaconf.h @@ -0,0 +1,555 @@ +/* +** $Id: luaconf.h,v 1.176.1.2 2013/11/21 17:26:16 roberto Exp $ +** Configuration file for Lua +** See Copyright Notice in lua.h +*/ + + +#ifndef lconfig_h +#define lconfig_h + +#include <sys/zfs_context.h> +#ifdef illumos +#include <sys/int_fmtio.h> +#else +#include <machine/_inttypes.h> +#endif + +extern ssize_t lcompat_sprintf(char *, const char *, ...); +extern int64_t lcompat_strtoll(const char *, char **); +extern int64_t lcompat_pow(int64_t, int64_t); + +/* +** ================================================================== +** Search for "@@" to find all configurable definitions. +** =================================================================== +*/ + + +/* +@@ LUA_ANSI controls the use of non-ansi features. +** CHANGE it (define it) if you want Lua to avoid the use of any +** non-ansi feature or library. 
+*/ +#if !defined(LUA_ANSI) && defined(__STRICT_ANSI__) +#define LUA_ANSI +#endif + + +#if !defined(LUA_ANSI) && defined(_WIN32) && !defined(_WIN32_WCE) +#define LUA_WIN /* enable goodies for regular Windows platforms */ +#endif + +#if defined(LUA_WIN) +#define LUA_DL_DLL +#define LUA_USE_AFORMAT /* assume 'printf' handles 'aA' specifiers */ +#endif + + + +#if defined(LUA_USE_LINUX) +#define LUA_USE_POSIX +#define LUA_USE_DLOPEN /* needs an extra library: -ldl */ +#define LUA_USE_READLINE /* needs some extra libraries */ +#define LUA_USE_STRTODHEX /* assume 'strtod' handles hex formats */ +#define LUA_USE_AFORMAT /* assume 'printf' handles 'aA' specifiers */ +#define LUA_USE_LONGLONG /* assume support for long long */ +#endif + +#if defined(LUA_USE_MACOSX) +#define LUA_USE_POSIX +#define LUA_USE_DLOPEN /* does not need -ldl */ +#define LUA_USE_READLINE /* needs an extra library: -lreadline */ +#define LUA_USE_STRTODHEX /* assume 'strtod' handles hex formats */ +#define LUA_USE_AFORMAT /* assume 'printf' handles 'aA' specifiers */ +#define LUA_USE_LONGLONG /* assume support for long long */ +#endif + + + +/* +@@ LUA_USE_POSIX includes all functionality listed as X/Open System +@* Interfaces Extension (XSI). +** CHANGE it (define it) if your system is XSI compatible. +*/ +#if defined(LUA_USE_POSIX) +#define LUA_USE_MKSTEMP +#define LUA_USE_ISATTY +#define LUA_USE_POPEN +#define LUA_USE_ULONGJMP +#define LUA_USE_GMTIME_R +#endif + + + +/* +@@ LUA_PATH_DEFAULT is the default path that Lua uses to look for +@* Lua libraries. +@@ LUA_CPATH_DEFAULT is the default path that Lua uses to look for +@* C libraries. +** CHANGE them if your machine has a non-conventional directory +** hierarchy or if you want to install your libraries in +** non-conventional directories. +*/ +#if defined(_WIN32) /* { */ +/* +** In Windows, any exclamation mark ('!') in the path is replaced by the +** path of the directory of the executable file of the current process. +*/ +#define LUA_LDIR "!\\lua\\" +#define LUA_CDIR "!\\" +#define LUA_PATH_DEFAULT \ + LUA_LDIR"?.lua;" LUA_LDIR"?\\init.lua;" \ + LUA_CDIR"?.lua;" LUA_CDIR"?\\init.lua;" ".\\?.lua" +#define LUA_CPATH_DEFAULT \ + LUA_CDIR"?.dll;" LUA_CDIR"loadall.dll;" ".\\?.dll" + +#else /* }{ */ + +#define LUA_VDIR LUA_VERSION_MAJOR "." LUA_VERSION_MINOR "/" +#define LUA_ROOT "/usr/local/" +#define LUA_LDIR LUA_ROOT "share/lua/" LUA_VDIR +#define LUA_CDIR LUA_ROOT "lib/lua/" LUA_VDIR +#define LUA_PATH_DEFAULT \ + LUA_LDIR"?.lua;" LUA_LDIR"?/init.lua;" \ + LUA_CDIR"?.lua;" LUA_CDIR"?/init.lua;" "./?.lua" +#define LUA_CPATH_DEFAULT \ + LUA_CDIR"?.so;" LUA_CDIR"loadall.so;" "./?.so" +#endif /* } */ + + +/* +@@ LUA_DIRSEP is the directory separator (for submodules). +** CHANGE it if your machine does not use "/" as the directory separator +** and is not Windows. (On Windows Lua automatically uses "\".) +*/ +#if defined(_WIN32) +#define LUA_DIRSEP "\\" +#else +#define LUA_DIRSEP "/" +#endif + + +/* +@@ LUA_ENV is the name of the variable that holds the current +@@ environment, used to access global names. +** CHANGE it if you do not like this name. +*/ +#define LUA_ENV "_ENV" + + +/* +@@ LUA_API is a mark for all core API functions. +@@ LUALIB_API is a mark for all auxiliary library functions. +@@ LUAMOD_API is a mark for all standard library opening functions. +** CHANGE them if you need to define those functions in some special way. 
+** For instance, if you want to create one Windows DLL with the core and +** the libraries, you may want to use the following definition (define +** LUA_BUILD_AS_DLL to get it). +*/ +#if defined(LUA_BUILD_AS_DLL) /* { */ + +#if defined(LUA_CORE) || defined(LUA_LIB) /* { */ +#define LUA_API __declspec(dllexport) +#else /* }{ */ +#define LUA_API __declspec(dllimport) +#endif /* } */ + +#else /* }{ */ + +#define LUA_API extern + +#endif /* } */ + + +/* more often than not the libs go together with the core */ +#define LUALIB_API LUA_API +#define LUAMOD_API LUALIB_API + + +/* +@@ LUAI_FUNC is a mark for all extern functions that are not to be +@* exported to outside modules. +@@ LUAI_DDEF and LUAI_DDEC are marks for all extern (const) variables +@* that are not to be exported to outside modules (LUAI_DDEF for +@* definitions and LUAI_DDEC for declarations). +** CHANGE them if you need to mark them in some special way. Elf/gcc +** (versions 3.2 and later) mark them as "hidden" to optimize access +** when Lua is compiled as a shared library. Not all elf targets support +** this attribute. Unfortunately, gcc does not offer a way to check +** whether the target offers that support, and those without support +** give a warning about it. To avoid these warnings, change to the +** default definition. +*/ +#if defined(__GNUC__) && ((__GNUC__*100 + __GNUC_MINOR__) >= 302) && \ + defined(__ELF__) /* { */ +#define LUAI_FUNC __attribute__((visibility("hidden"))) extern +#define LUAI_DDEC LUAI_FUNC +#define LUAI_DDEF /* empty */ + +#else /* }{ */ +#define LUAI_FUNC extern +#define LUAI_DDEC extern +#define LUAI_DDEF /* empty */ +#endif /* } */ + + + +/* +@@ LUA_QL describes how error messages quote program elements. +** CHANGE it if you want a different appearance. +*/ +#define LUA_QL(x) "'" x "'" +#define LUA_QS LUA_QL("%s") + + +/* +@@ LUA_IDSIZE gives the maximum size for the description of the source +@* of a function in debug information. +** CHANGE it if you want a different size. +*/ +#define LUA_IDSIZE 60 + + +/* +@@ luai_writestringerror defines how to print error messages. +** (A format string with one argument is enough for Lua...) +*/ +#ifdef _KERNEL +#define luai_writestringerror(s,p) \ + (zfs_dbgmsg((s), (p))) +#else +#define luai_writestringerror(s,p) \ + (fprintf(stderr, (s), (p)), fflush(stderr)) +#endif + + +/* +@@ LUAI_MAXSHORTLEN is the maximum length for short strings, that is, +** strings that are internalized. (Cannot be smaller than reserved words +** or tags for metamethods, as these strings must be internalized; +** #("function") = 8, #("__newindex") = 10.) +*/ +#define LUAI_MAXSHORTLEN 40 + + + +/* +** {================================================================== +** Compatibility with previous versions +** =================================================================== +*/ + +/* +@@ LUA_COMPAT_ALL controls all compatibility options. +** You can define it to get all options, or change specific options +** to fit your specific needs. +*/ +#if defined(LUA_COMPAT_ALL) /* { */ + +/* +@@ LUA_COMPAT_UNPACK controls the presence of global 'unpack'. +** You can replace it with 'table.unpack'. +*/ +#define LUA_COMPAT_UNPACK + +/* +@@ LUA_COMPAT_LOADERS controls the presence of table 'package.loaders'. +** You can replace it with 'package.searchers'. +*/ +#define LUA_COMPAT_LOADERS + +/* +@@ macro 'lua_cpcall' emulates deprecated function lua_cpcall. +** You can call your C function directly (with light C functions). 
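** (Concretely, the pre-5.2 call lua_cpcall(L, f, &ud) becomes:
**
**   lua_pushcfunction(L, f);
**   lua_pushlightuserdata(L, &ud);
**   lua_pcall(L, 1, 0, 0);
**
** which is exactly what the compatibility macro below expands to.)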
+*/ +#define lua_cpcall(L,f,u) \ + (lua_pushcfunction(L, (f)), \ + lua_pushlightuserdata(L,(u)), \ + lua_pcall(L,1,0,0)) + + +/* +@@ LUA_COMPAT_LOG10 defines the function 'log10' in the math library. +** You can rewrite 'log10(x)' as 'log(x, 10)'. +*/ +#define LUA_COMPAT_LOG10 + +/* +@@ LUA_COMPAT_LOADSTRING defines the function 'loadstring' in the base +** library. You can rewrite 'loadstring(s)' as 'load(s)'. +*/ +#define LUA_COMPAT_LOADSTRING + +/* +@@ LUA_COMPAT_MAXN defines the function 'maxn' in the table library. +*/ +#define LUA_COMPAT_MAXN + +/* +@@ The following macros supply trivial compatibility for some +** changes in the API. The macros themselves document how to +** change your code to avoid using them. +*/ +#define lua_strlen(L,i) lua_rawlen(L, (i)) + +#define lua_objlen(L,i) lua_rawlen(L, (i)) + +#define lua_equal(L,idx1,idx2) lua_compare(L,(idx1),(idx2),LUA_OPEQ) +#define lua_lessthan(L,idx1,idx2) lua_compare(L,(idx1),(idx2),LUA_OPLT) + +/* +@@ LUA_COMPAT_MODULE controls compatibility with previous +** module functions 'module' (Lua) and 'luaL_register' (C). +*/ +#define LUA_COMPAT_MODULE + +#endif /* } */ + +/* }================================================================== */ + + + +/* +@@ LUAI_BITSINT defines the number of bits in an int. +** CHANGE here if Lua cannot automatically detect the number of bits of +** your machine. Probably you do not need to change this. +*/ +/* avoid overflows in comparison */ +#if INT_MAX-20 < 32760 /* { */ +#define LUAI_BITSINT 16 +#elif INT_MAX > 2147483640L /* }{ */ +/* int has at least 32 bits */ +#define LUAI_BITSINT 32 +#else /* }{ */ +#error "you must define LUAI_BITSINT with number of bits in an integer" +#endif /* } */ + + +/* +@@ LUA_INT32 is a signed integer with exactly 32 bits. +@@ LUAI_UMEM is an unsigned integer big enough to count the total +@* memory used by Lua. +@@ LUAI_MEM is a signed integer big enough to count the total memory +@* used by Lua. +** CHANGE here if for some weird reason the default definitions are not +** good enough for your machine. Probably you do not need to change +** this. +*/ +#if LUAI_BITSINT >= 32 /* { */ +#define LUA_INT32 int +#define LUAI_UMEM size_t +#define LUAI_MEM ptrdiff_t +#else /* }{ */ +/* 16-bit ints */ +#define LUA_INT32 long +#define LUAI_UMEM unsigned long +#define LUAI_MEM long +#endif /* } */ + + +/* +@@ LUAI_MAXSTACK limits the size of the Lua stack. +** CHANGE it if you need a different limit. This limit is arbitrary; +** its only purpose is to stop Lua from consuming unlimited stack +** space (and to reserve some numbers for pseudo-indices). +*/ +#if LUAI_BITSINT >= 32 +#define LUAI_MAXSTACK 1000000 +#else +#define LUAI_MAXSTACK 15000 +#endif + +/* reserve some space for error handling */ +#define LUAI_FIRSTPSEUDOIDX (-LUAI_MAXSTACK - 1000) + + + + +/* +@@ LUAL_BUFFERSIZE is the buffer size used by the lauxlib buffer system. +** CHANGE it if it uses too much C-stack space. +*/ +#define LUAL_BUFFERSIZE 1024 + + + + +/* +** {================================================================== +@@ LUA_NUMBER is the type of numbers in Lua. +** CHANGE the following definitions only if you want to build Lua +** with a number type different from double. You may also need to +** change lua_number2int & lua_number2integer. +** =================================================================== +*/ + +#define LUA_NUMBER int64_t + +/* +@@ LUAI_UACNUMBER is the result of a 'usual argument conversion' +@* over a number. 
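+@* (In this build LUA_NUMBER is int64_t, which the usual argument +@* conversions leave unchanged, so LUAI_UACNUMBER is int64_t as well.)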
+*/ +#define LUAI_UACNUMBER int64_t + + +/* +@@ LUA_NUMBER_SCAN is the format for reading numbers. +@@ LUA_NUMBER_FMT is the format for writing numbers. +@@ lua_number2str converts a number to a string. +@@ LUAI_MAXNUMBER2STR is maximum size of previous conversion. +*/ +#define LUA_NUMBER_FMT "%" PRId64 +#define lua_number2str(s,n) lcompat_sprintf((s), LUA_NUMBER_FMT, (n)) +#define LUAI_MAXNUMBER2STR 32 /* 16 digits, sign, point, and \0 */ + + +/* +@@ l_mathop allows the addition of an 'l' or 'f' to all math operations +*/ +#define l_mathop(x) (x ## l) + + +/* +@@ lua_str2number converts a decimal numeric string to a number. +@@ lua_strx2number converts an hexadecimal numeric string to a number. +** In C99, 'strtod' does both conversions. C89, however, has no function +** to convert floating hexadecimal strings to numbers. For these +** systems, you can leave 'lua_strx2number' undefined and Lua will +** provide its own implementation. +*/ +#define lua_str2number(s,p) lcompat_strtoll((s), (p)) + +#if defined(LUA_USE_STRTODHEX) +#define lua_strx2number(s,p) lcompat_strtoll((s), (p)) +#endif + + +/* +@@ The luai_num* macros define the primitive operations over numbers. +*/ + +/* the following operations need the math library */ +#if defined(lobject_c) || defined(lvm_c) +#define luai_nummod(L,a,b) ((a) % (b)) +#define luai_numpow(L,a,b) (lcompat_pow((a),(b))) +#endif + +/* these are quite standard operations */ +#if defined(LUA_CORE) +#define luai_numadd(L,a,b) ((a)+(b)) +#define luai_numsub(L,a,b) ((a)-(b)) +#define luai_nummul(L,a,b) ((a)*(b)) +#define luai_numdiv(L,a,b) ((a)/(b)) +#define luai_numunm(L,a) (-(a)) +#define luai_numeq(a,b) ((a)==(b)) +#define luai_numlt(L,a,b) ((a)<(b)) +#define luai_numle(L,a,b) ((a)<=(b)) +#define luai_numisnan(L,a) (!luai_numeq((a), (a))) +#endif + + + +/* +@@ LUA_INTEGER is the integral type used by lua_pushinteger/lua_tointeger. +** CHANGE that if ptrdiff_t is not adequate on your machine. (On most +** machines, ptrdiff_t gives a good choice between int or long.) +*/ +#define LUA_INTEGER ptrdiff_t + +/* +@@ LUA_UNSIGNED is the integral type used by lua_pushunsigned/lua_tounsigned. +** It must have at least 32 bits. +*/ +#define LUA_UNSIGNED uint64_t + + + +/* +** Some tricks with doubles +*/ + +#if defined(LUA_NUMBER_DOUBLE) && !defined(LUA_ANSI) /* { */ +/* +** The next definitions activate some tricks to speed up the +** conversion from doubles to integer types, mainly to LUA_UNSIGNED. +** +@@ LUA_MSASMTRICK uses Microsoft assembler to avoid clashes with a +** DirectX idiosyncrasy. +** +@@ LUA_IEEE754TRICK uses a trick that should work on any machine +** using IEEE754 with a 32-bit integer type. +** +@@ LUA_IEEELL extends the trick to LUA_INTEGER; should only be +** defined when LUA_INTEGER is a 32-bit integer. +** +@@ LUA_IEEEENDIAN is the endianness of doubles in your machine +** (0 for little endian, 1 for big endian); if not defined, Lua will +** check it dynamically for LUA_IEEE754TRICK (but not for LUA_NANTRICK). +** +@@ LUA_NANTRICK controls the use of a trick to pack all types into +** a single double value, using NaN values to represent non-number +** values. The trick only works on 32-bit machines (ints and pointers +** are 32-bit values) with numbers represented as IEEE 754-2008 doubles +** with conventional endianess (12345678 or 87654321), in CPUs that do +** not produce signaling NaN values (all NaNs are quiet). +*/ + +/* Microsoft compiler on a Pentium (32 bit) ? 
*/ +#if defined(LUA_WIN) && defined(_MSC_VER) && defined(_M_IX86) /* { */ + +#define LUA_MSASMTRICK +#define LUA_IEEEENDIAN 0 +#define LUA_NANTRICK + + +/* pentium 32 bits? */ +#elif defined(__i386__) || defined(__i386) || defined(__X86__) /* }{ */ + +#define LUA_IEEE754TRICK +#define LUA_IEEELL +#define LUA_IEEEENDIAN 0 +#define LUA_NANTRICK + +/* pentium 64 bits? */ +#elif defined(__x86_64) /* }{ */ + +#define LUA_IEEE754TRICK +#define LUA_IEEEENDIAN 0 + +#elif defined(__POWERPC__) || defined(__ppc__) /* }{ */ + +#define LUA_IEEE754TRICK +#define LUA_IEEEENDIAN 1 + +#else /* }{ */ + +/* assume IEEE754 and a 32-bit integer type */ +#define LUA_IEEE754TRICK + +#endif /* } */ + +#endif /* } */ + +/* }================================================================== */ + + + + +/* =================================================================== */ + +/* +** Local configuration. You can use this space to add your redefinitions +** without modifying the main part of the file. +*/ + +#define getlocaledecpoint() ('.') + +#define abs(x) (((x) < 0) ? -(x) : (x)) + +#if !defined(UCHAR_MAX) +#define UCHAR_MAX (0xff) +#endif + +#endif + diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lualib.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lualib.h new file mode 100644 index 000000000000..da82005c9de2 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lualib.h @@ -0,0 +1,55 @@ +/* +** $Id: lualib.h,v 1.43.1.1 2013/04/12 18:48:47 roberto Exp $ +** Lua standard libraries +** See Copyright Notice in lua.h +*/ + + +#ifndef lualib_h +#define lualib_h + +#include "lua.h" + + + +LUAMOD_API int (luaopen_base) (lua_State *L); + +#define LUA_COLIBNAME "coroutine" +LUAMOD_API int (luaopen_coroutine) (lua_State *L); + +#define LUA_TABLIBNAME "table" +LUAMOD_API int (luaopen_table) (lua_State *L); + +#define LUA_IOLIBNAME "io" +LUAMOD_API int (luaopen_io) (lua_State *L); + +#define LUA_OSLIBNAME "os" +LUAMOD_API int (luaopen_os) (lua_State *L); + +#define LUA_STRLIBNAME "string" +LUAMOD_API int (luaopen_string) (lua_State *L); + +#define LUA_BITLIBNAME "bit32" +LUAMOD_API int (luaopen_bit32) (lua_State *L); + +#define LUA_MATHLIBNAME "math" +LUAMOD_API int (luaopen_math) (lua_State *L); + +#define LUA_DBLIBNAME "debug" +LUAMOD_API int (luaopen_debug) (lua_State *L); + +#define LUA_LOADLIBNAME "package" +LUAMOD_API int (luaopen_package) (lua_State *L); + + +/* open all previous libraries */ +LUALIB_API void (luaL_openlibs) (lua_State *L); + + + +#if !defined(lua_assert) +#define lua_assert(x) ((void)0) +#endif + + +#endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lundump.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lundump.c new file mode 100644 index 000000000000..4d53749a0273 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lundump.c @@ -0,0 +1,258 @@ +/* +** $Id: lundump.c,v 2.22.1.1 2013/04/12 18:48:47 roberto Exp $ +** load precompiled Lua chunks +** See Copyright Notice in lua.h +*/ + +#include <sys/zfs_context.h> + +#define lundump_c +#define LUA_CORE + +#include "lua.h" + +#include "ldebug.h" +#include "ldo.h" +#include "lfunc.h" +#include "lmem.h" +#include "lobject.h" +#include "lstring.h" +#include "lundump.h" +#include "lzio.h" + +typedef struct { + lua_State* L; + ZIO* Z; + Mbuffer* b; + const char* name; +} LoadState; + +static l_noret error(LoadState* S, const char* why) +{ + luaO_pushfstring(S->L,"%s: %s precompiled chunk",S->name,why); + luaD_throw(S->L,LUA_ERRSYNTAX); +} + +#define 
LoadMem(S,b,n,size) LoadBlock(S,b,(n)*(size)) +#define LoadByte(S) (lu_byte)LoadChar(S) +#define LoadVar(S,x) LoadMem(S,&x,1,sizeof(x)) +#define LoadVector(S,b,n,size) LoadMem(S,b,n,size) + +#if !defined(luai_verifycode) +#define luai_verifycode(L,b,f) /* empty */ +#endif + +static void LoadBlock(LoadState* S, void* b, size_t size) +{ + if (luaZ_read(S->Z,b,size)!=0) error(S,"truncated"); +} + +static int LoadChar(LoadState* S) +{ + char x; + LoadVar(S,x); + return x; +} + +static int LoadInt(LoadState* S) +{ + int x; + LoadVar(S,x); + if (x<0) error(S,"corrupted"); + return x; +} + +static lua_Number LoadNumber(LoadState* S) +{ + lua_Number x; + LoadVar(S,x); + return x; +} + +static TString* LoadString(LoadState* S) +{ + size_t size; + LoadVar(S,size); + if (size==0) + return NULL; + else + { + char* s=luaZ_openspace(S->L,S->b,size); + LoadBlock(S,s,size*sizeof(char)); + return luaS_newlstr(S->L,s,size-1); /* remove trailing '\0' */ + } +} + +static void LoadCode(LoadState* S, Proto* f) +{ + int n=LoadInt(S); + f->code=luaM_newvector(S->L,n,Instruction); + f->sizecode=n; + LoadVector(S,f->code,n,sizeof(Instruction)); +} + +static void LoadFunction(LoadState* S, Proto* f); + +static void LoadConstants(LoadState* S, Proto* f) +{ + int i,n; + n=LoadInt(S); + f->k=luaM_newvector(S->L,n,TValue); + f->sizek=n; + for (i=0; i<n; i++) setnilvalue(&f->k[i]); + for (i=0; i<n; i++) + { + TValue* o=&f->k[i]; + int t=LoadChar(S); + switch (t) + { + case LUA_TNIL: + setnilvalue(o); + break; + case LUA_TBOOLEAN: + setbvalue(o,LoadChar(S)); + break; + case LUA_TNUMBER: + setnvalue(o,LoadNumber(S)); + break; + case LUA_TSTRING: + setsvalue2n(S->L,o,LoadString(S)); + break; + default: lua_assert(0); + } + } + n=LoadInt(S); + f->p=luaM_newvector(S->L,n,Proto*); + f->sizep=n; + for (i=0; i<n; i++) f->p[i]=NULL; + for (i=0; i<n; i++) + { + f->p[i]=luaF_newproto(S->L); + LoadFunction(S,f->p[i]); + } +} + +static void LoadUpvalues(LoadState* S, Proto* f) +{ + int i,n; + n=LoadInt(S); + f->upvalues=luaM_newvector(S->L,n,Upvaldesc); + f->sizeupvalues=n; + for (i=0; i<n; i++) f->upvalues[i].name=NULL; + for (i=0; i<n; i++) + { + f->upvalues[i].instack=LoadByte(S); + f->upvalues[i].idx=LoadByte(S); + } +} + +static void LoadDebug(LoadState* S, Proto* f) +{ + int i,n; + f->source=LoadString(S); + n=LoadInt(S); + f->lineinfo=luaM_newvector(S->L,n,int); + f->sizelineinfo=n; + LoadVector(S,f->lineinfo,n,sizeof(int)); + n=LoadInt(S); + f->locvars=luaM_newvector(S->L,n,LocVar); + f->sizelocvars=n; + for (i=0; i<n; i++) f->locvars[i].varname=NULL; + for (i=0; i<n; i++) + { + f->locvars[i].varname=LoadString(S); + f->locvars[i].startpc=LoadInt(S); + f->locvars[i].endpc=LoadInt(S); + } + n=LoadInt(S); + for (i=0; i<n; i++) f->upvalues[i].name=LoadString(S); +} + +static void LoadFunction(LoadState* S, Proto* f) +{ + f->linedefined=LoadInt(S); + f->lastlinedefined=LoadInt(S); + f->numparams=LoadByte(S); + f->is_vararg=LoadByte(S); + f->maxstacksize=LoadByte(S); + LoadCode(S,f); + LoadConstants(S,f); + LoadUpvalues(S,f); + LoadDebug(S,f); +} + +/* the code below must be consistent with the code in luaU_header */ +#define N0 LUAC_HEADERSIZE +#define N1 (sizeof(LUA_SIGNATURE)-sizeof(char)) +#define N2 N1+2 +#define N3 N2+6 + +static void LoadHeader(LoadState* S) +{ + lu_byte h[LUAC_HEADERSIZE]; + lu_byte s[LUAC_HEADERSIZE]; + luaU_header(h); + memcpy(s,h,sizeof(char)); /* first char already read */ + LoadBlock(S,s+sizeof(char),LUAC_HEADERSIZE-sizeof(char)); + if (memcmp(h,s,N0)==0) return; + if (memcmp(h,s,N1)!=0) error(S,"not 
a"); + if (memcmp(h,s,N2)!=0) error(S,"version mismatch in"); + if (memcmp(h,s,N3)!=0) error(S,"incompatible"); else error(S,"corrupted"); +} + +/* +** load precompiled chunk +*/ +Closure* luaU_undump (lua_State* L, ZIO* Z, Mbuffer* buff, const char* name) +{ + LoadState S; + Closure* cl; + if (*name=='@' || *name=='=') + S.name=name+1; + else if (*name==LUA_SIGNATURE[0]) + S.name="binary string"; + else + S.name=name; + S.L=L; + S.Z=Z; + S.b=buff; + LoadHeader(&S); + cl=luaF_newLclosure(L,1); + setclLvalue(L,L->top,cl); incr_top(L); + cl->l.p=luaF_newproto(L); + LoadFunction(&S,cl->l.p); + if (cl->l.p->sizeupvalues != 1) + { + Proto* p=cl->l.p; + cl=luaF_newLclosure(L,cl->l.p->sizeupvalues); + cl->l.p=p; + setclLvalue(L,L->top-1,cl); + } + luai_verifycode(L,buff,cl->l.p); + return cl; +} + +#define MYINT(s) (s[0]-'0') +#define VERSION MYINT(LUA_VERSION_MAJOR)*16+MYINT(LUA_VERSION_MINOR) +#define FORMAT 0 /* this is the official format */ + +/* +* make header for precompiled chunks +* if you change the code below be sure to update LoadHeader and FORMAT above +* and LUAC_HEADERSIZE in lundump.h +*/ +void luaU_header (lu_byte* h) +{ + int x=1; + memcpy(h,LUA_SIGNATURE,sizeof(LUA_SIGNATURE)-sizeof(char)); + h+=sizeof(LUA_SIGNATURE)-sizeof(char); + *h++=cast_byte(VERSION); + *h++=cast_byte(FORMAT); + *h++=cast_byte(*(char*)&x); /* endianness */ + *h++=cast_byte(sizeof(int)); + *h++=cast_byte(sizeof(size_t)); + *h++=cast_byte(sizeof(Instruction)); + *h++=cast_byte(sizeof(lua_Number)); + *h++=cast_byte(((lua_Number)0.5)==0); /* is lua_Number integral? */ + memcpy(h,LUAC_TAIL,sizeof(LUAC_TAIL)-sizeof(char)); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lundump.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lundump.h new file mode 100644 index 000000000000..5255db259dfe --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lundump.h @@ -0,0 +1,28 @@ +/* +** $Id: lundump.h,v 1.39.1.1 2013/04/12 18:48:47 roberto Exp $ +** load precompiled Lua chunks +** See Copyright Notice in lua.h +*/ + +#ifndef lundump_h +#define lundump_h + +#include "lobject.h" +#include "lzio.h" + +/* load one chunk; from lundump.c */ +LUAI_FUNC Closure* luaU_undump (lua_State* L, ZIO* Z, Mbuffer* buff, const char* name); + +/* make header; from lundump.c */ +LUAI_FUNC void luaU_header (lu_byte* h); + +/* dump one chunk; from ldump.c */ +LUAI_FUNC int luaU_dump (lua_State* L, const Proto* f, lua_Writer w, void* data, int strip); + +/* data to catch conversion errors */ +#define LUAC_TAIL "\x19\x93\r\n\x1a\n" + +/* size in bytes of header of binary files */ +#define LUAC_HEADERSIZE (sizeof(LUA_SIGNATURE)-sizeof(char)+2+6+sizeof(LUAC_TAIL)-sizeof(char)) + +#endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lvm.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lvm.c new file mode 100644 index 000000000000..a06e36e5ceae --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lvm.c @@ -0,0 +1,930 @@ +/* +** $Id: lvm.c,v 2.155.1.1 2013/04/12 18:48:47 roberto Exp $ +** Lua virtual machine +** See Copyright Notice in lua.h +*/ + + +#include <sys/zfs_context.h> + +#define strcoll(l,r) (strcmp((l),(r))) + +#define lvm_c +#define LUA_CORE + +#include "lua.h" + +#include "ldebug.h" +#include "ldo.h" +#include "lfunc.h" +#include "lgc.h" +#include "lobject.h" +#include "lopcodes.h" +#include "lstate.h" +#include "lstring.h" +#include "ltable.h" +#include "ltm.h" +#include "lvm.h" + + + +/* limit for table tag-method chains (to avoid loops) */ +#define 
MAXTAGLOOP 100 + + +const TValue *luaV_tonumber (const TValue *obj, TValue *n) { + lua_Number num; + if (ttisnumber(obj)) return obj; + if (ttisstring(obj) && luaO_str2d(svalue(obj), tsvalue(obj)->len, &num)) { + setnvalue(n, num); + return n; + } + else + return NULL; +} + + +int luaV_tostring (lua_State *L, StkId obj) { + if (!ttisnumber(obj)) + return 0; + else { + char s[LUAI_MAXNUMBER2STR]; + lua_Number n = nvalue(obj); + int l = lua_number2str(s, n); + setsvalue2s(L, obj, luaS_newlstr(L, s, l)); + return 1; + } +} + + +static void traceexec (lua_State *L) { + CallInfo *ci = L->ci; + lu_byte mask = L->hookmask; + int counthook = ((mask & LUA_MASKCOUNT) && L->hookcount == 0); + if (counthook) + resethookcount(L); /* reset count */ + if (ci->callstatus & CIST_HOOKYIELD) { /* called hook last time? */ + ci->callstatus &= ~CIST_HOOKYIELD; /* erase mark */ + return; /* do not call hook again (VM yielded, so it did not move) */ + } + if (counthook) + luaD_hook(L, LUA_HOOKCOUNT, -1); /* call count hook */ + if (mask & LUA_MASKLINE) { + Proto *p = ci_func(ci)->p; + int npc = pcRel(ci->u.l.savedpc, p); + int newline = getfuncline(p, npc); + if (npc == 0 || /* call linehook when enter a new function, */ + ci->u.l.savedpc <= L->oldpc || /* when jump back (loop), or when */ + newline != getfuncline(p, pcRel(L->oldpc, p))) /* enter a new line */ + luaD_hook(L, LUA_HOOKLINE, newline); /* call line hook */ + } + L->oldpc = ci->u.l.savedpc; + if (L->status == LUA_YIELD) { /* did hook yield? */ + if (counthook) + L->hookcount = 1; /* undo decrement to zero */ + ci->u.l.savedpc--; /* undo increment (resume will increment it again) */ + ci->callstatus |= CIST_HOOKYIELD; /* mark that it yielded */ + ci->func = L->top - 1; /* protect stack below results */ + luaD_throw(L, LUA_YIELD); + } +} + + +static void callTM (lua_State *L, const TValue *f, const TValue *p1, + const TValue *p2, TValue *p3, int hasres) { + ptrdiff_t result = savestack(L, p3); + setobj2s(L, L->top++, f); /* push function */ + setobj2s(L, L->top++, p1); /* 1st argument */ + setobj2s(L, L->top++, p2); /* 2nd argument */ + if (!hasres) /* no result? 'p3' is third argument */ + setobj2s(L, L->top++, p3); /* 3rd argument */ + /* metamethod may yield only when called from Lua code */ + luaD_call(L, L->top - (4 - hasres), hasres, isLua(L->ci)); + if (hasres) { /* if has result, move it to its place */ + p3 = restorestack(L, result); + setobjs2s(L, p3, --L->top); + } +} + + +void luaV_gettable (lua_State *L, const TValue *t, TValue *key, StkId val) { + int loop; + for (loop = 0; loop < MAXTAGLOOP; loop++) { + const TValue *tm; + if (ttistable(t)) { /* `t' is a table? */ + Table *h = hvalue(t); + const TValue *res = luaH_get(h, key); /* do a primitive get */ + if (!ttisnil(res) || /* result is not nil? */ + (tm = fasttm(L, h->metatable, TM_INDEX)) == NULL) { /* or no TM? */ + setobj2s(L, val, res); + return; + } + /* else will try the tag method */ + } + else if (ttisnil(tm = luaT_gettmbyobj(L, t, TM_INDEX))) + luaG_typeerror(L, t, "index"); + if (ttisfunction(tm)) { + callTM(L, tm, t, key, val, 1); + return; + } + t = tm; /* else repeat with 'tm' */ + } + luaG_runerror(L, "loop in gettable"); +} + + +void luaV_settable (lua_State *L, const TValue *t, TValue *key, StkId val) { + int loop; + for (loop = 0; loop < MAXTAGLOOP; loop++) { + const TValue *tm; + if (ttistable(t)) { /* `t' is a table? 
*/ + Table *h = hvalue(t); + TValue *oldval = cast(TValue *, luaH_get(h, key)); + /* if previous value is not nil, there must be a previous entry + in the table; moreover, a metamethod has no relevance */ + if (!ttisnil(oldval) || + /* previous value is nil; must check the metamethod */ + ((tm = fasttm(L, h->metatable, TM_NEWINDEX)) == NULL && + /* no metamethod; is there a previous entry in the table? */ + (oldval != luaO_nilobject || + /* no previous entry; must create one. (The next test is + always true; we only need the assignment.) */ + (oldval = luaH_newkey(L, h, key), 1)))) { + /* no metamethod and (now) there is an entry with given key */ + setobj2t(L, oldval, val); /* assign new value to that entry */ + invalidateTMcache(h); + luaC_barrierback(L, obj2gco(h), val); + return; + } + /* else will try the metamethod */ + } + else /* not a table; check metamethod */ + if (ttisnil(tm = luaT_gettmbyobj(L, t, TM_NEWINDEX))) + luaG_typeerror(L, t, "index"); + /* there is a metamethod */ + if (ttisfunction(tm)) { + callTM(L, tm, t, key, val, 0); + return; + } + t = tm; /* else repeat with 'tm' */ + } + luaG_runerror(L, "loop in settable"); +} + + +static int call_binTM (lua_State *L, const TValue *p1, const TValue *p2, + StkId res, TMS event) { + const TValue *tm = luaT_gettmbyobj(L, p1, event); /* try first operand */ + if (ttisnil(tm)) + tm = luaT_gettmbyobj(L, p2, event); /* try second operand */ + if (ttisnil(tm)) return 0; + callTM(L, tm, p1, p2, res, 1); + return 1; +} + + +static const TValue *get_equalTM (lua_State *L, Table *mt1, Table *mt2, + TMS event) { + const TValue *tm1 = fasttm(L, mt1, event); + const TValue *tm2; + if (tm1 == NULL) return NULL; /* no metamethod */ + if (mt1 == mt2) return tm1; /* same metatables => same metamethods */ + tm2 = fasttm(L, mt2, event); + if (tm2 == NULL) return NULL; /* no metamethod */ + if (luaV_rawequalobj(tm1, tm2)) /* same metamethods? */ + return tm1; + return NULL; +} + + +static int call_orderTM (lua_State *L, const TValue *p1, const TValue *p2, + TMS event) { + if (!call_binTM(L, p1, p2, L->top, event)) + return -1; /* no metamethod */ + else + return !l_isfalse(L->top); +} + + +static int l_strcmp (const TString *ls, const TString *rs) { + const char *l = getstr(ls); + size_t ll = ls->tsv.len; + const char *r = getstr(rs); + size_t lr = rs->tsv.len; + for (;;) { + int temp = strcoll(l, r); + if (temp != 0) return temp; + else { /* strings are equal up to a `\0' */ + size_t len = strlen(l); /* index of first `\0' in both strings */ + if (len == lr) /* r is finished? */ + return (len == ll) ? 0 : 1; + else if (len == ll) /* l is finished? 
*/ + return -1; /* l is smaller than r (because r is not finished) */ + /* both strings longer than `len'; go on comparing (after the `\0') */ + len++; + l += len; ll -= len; r += len; lr -= len; + } + } +} + + +int luaV_lessthan (lua_State *L, const TValue *l, const TValue *r) { + int res; + if (ttisnumber(l) && ttisnumber(r)) + return luai_numlt(L, nvalue(l), nvalue(r)); + else if (ttisstring(l) && ttisstring(r)) + return l_strcmp(rawtsvalue(l), rawtsvalue(r)) < 0; + else if ((res = call_orderTM(L, l, r, TM_LT)) < 0) + luaG_ordererror(L, l, r); + return res; +} + + +int luaV_lessequal (lua_State *L, const TValue *l, const TValue *r) { + int res; + if (ttisnumber(l) && ttisnumber(r)) + return luai_numle(L, nvalue(l), nvalue(r)); + else if (ttisstring(l) && ttisstring(r)) + return l_strcmp(rawtsvalue(l), rawtsvalue(r)) <= 0; + else if ((res = call_orderTM(L, l, r, TM_LE)) >= 0) /* first try `le' */ + return res; + else if ((res = call_orderTM(L, r, l, TM_LT)) < 0) /* else try `lt' */ + luaG_ordererror(L, l, r); + return !res; +} + + +/* +** equality of Lua values. L == NULL means raw equality (no metamethods) +*/ +int luaV_equalobj_ (lua_State *L, const TValue *t1, const TValue *t2) { + const TValue *tm; + lua_assert(ttisequal(t1, t2)); + switch (ttype(t1)) { + case LUA_TNIL: return 1; + case LUA_TNUMBER: return luai_numeq(nvalue(t1), nvalue(t2)); + case LUA_TBOOLEAN: return bvalue(t1) == bvalue(t2); /* true must be 1 !! */ + case LUA_TLIGHTUSERDATA: return pvalue(t1) == pvalue(t2); + case LUA_TLCF: return fvalue(t1) == fvalue(t2); + case LUA_TSHRSTR: return eqshrstr(rawtsvalue(t1), rawtsvalue(t2)); + case LUA_TLNGSTR: return luaS_eqlngstr(rawtsvalue(t1), rawtsvalue(t2)); + case LUA_TUSERDATA: { + if (uvalue(t1) == uvalue(t2)) return 1; + else if (L == NULL) return 0; + tm = get_equalTM(L, uvalue(t1)->metatable, uvalue(t2)->metatable, TM_EQ); + break; /* will try TM */ + } + case LUA_TTABLE: { + if (hvalue(t1) == hvalue(t2)) return 1; + else if (L == NULL) return 0; + tm = get_equalTM(L, hvalue(t1)->metatable, hvalue(t2)->metatable, TM_EQ); + break; /* will try TM */ + } + default: + lua_assert(iscollectable(t1)); + return gcvalue(t1) == gcvalue(t2); + } + if (tm == NULL) return 0; /* no TM? */ + callTM(L, tm, t1, t2, L->top, 1); /* call TM */ + return !l_isfalse(L->top); +} + + +void luaV_concat (lua_State *L, int total) { + lua_assert(total >= 2); + do { + StkId top = L->top; + int n = 2; /* number of elements handled in this pass (at least 2) */ + if (!(ttisstring(top-2) || ttisnumber(top-2)) || !tostring(L, top-1)) { + if (!call_binTM(L, top-2, top-1, top-2, TM_CONCAT)) + luaG_concaterror(L, top-2, top-1); + } + else if (tsvalue(top-1)->len == 0) /* second operand is empty? */ + (void)tostring(L, top - 2); /* result is first operand */ + else if (ttisstring(top-2) && tsvalue(top-2)->len == 0) { + setobjs2s(L, top - 2, top - 1); /* result is second op. 
*/ + } + else { + /* at least two non-empty string values; get as many as possible */ + size_t tl = tsvalue(top-1)->len; + char *buffer; + int i; + /* collect total length */ + for (i = 1; i < total && tostring(L, top-i-1); i++) { + size_t l = tsvalue(top-i-1)->len; + if (l >= (MAX_SIZET/sizeof(char)) - tl) + luaG_runerror(L, "string length overflow"); + tl += l; + } + buffer = luaZ_openspace(L, &G(L)->buff, tl); + tl = 0; + n = i; + do { /* concat all strings */ + size_t l = tsvalue(top-i)->len; + memcpy(buffer+tl, svalue(top-i), l * sizeof(char)); + tl += l; + } while (--i > 0); + setsvalue2s(L, top-n, luaS_newlstr(L, buffer, tl)); + } + total -= n-1; /* got 'n' strings to create 1 new */ + L->top -= n-1; /* popped 'n' strings and pushed one */ + } while (total > 1); /* repeat until only 1 result left */ +} + + +void luaV_objlen (lua_State *L, StkId ra, const TValue *rb) { + const TValue *tm; + switch (ttypenv(rb)) { + case LUA_TTABLE: { + Table *h = hvalue(rb); + tm = fasttm(L, h->metatable, TM_LEN); + if (tm) break; /* metamethod? break switch to call it */ + setnvalue(ra, cast_num(luaH_getn(h))); /* else primitive len */ + return; + } + case LUA_TSTRING: { + setnvalue(ra, cast_num(tsvalue(rb)->len)); + return; + } + default: { /* try metamethod */ + tm = luaT_gettmbyobj(L, rb, TM_LEN); + if (ttisnil(tm)) /* no metamethod? */ + luaG_typeerror(L, rb, "get length of"); + break; + } + } + callTM(L, tm, rb, rb, ra, 1); +} + +/* + * luaV_div and luaV_mod patched in from Lua 5.3.2 in order to properly handle + * div/mod by zero (instead of crashing, which is the default behavior in + * Lua 5.2) + */ + +/* +** Integer division; return 'm // n', that is, floor(m/n). +** C division truncates its result (rounds towards zero). +** 'floor(q) == trunc(q)' when 'q >= 0' or when 'q' is integer, +** otherwise 'floor(q) == trunc(q) - 1'. +*/ +static lua_Number luaV_div (lua_State *L, lua_Number m, lua_Number n) { + if ((lua_Unsigned)(n) + 1u <= 1u) { /* special cases: -1 or 0 */ + if (n == 0) + luaG_runerror(L, "attempt to divide by zero"); + return (0 - m); /* n==-1; avoid overflow with 0x80000...//-1 */ + } + else { + lua_Number q = m / n; /* perform C division */ + if ((m ^ n) < 0 && m % n != 0) /* 'm/n' would be negative non-integer? */ + q -= 1; /* correct result for different rounding */ + return q; + } +} + + +/* +** Integer modulus; return 'm % n'. (Assume that C '%' with +** negative operands follows C99 behavior. See previous comment +** about luaV_div.) +*/ +static lua_Number luaV_mod (lua_State *L, lua_Number m, lua_Number n) { + if ((lua_Unsigned)(n) + 1u <= 1u) { /* special cases: -1 or 0 */ + if (n == 0) + luaG_runerror(L, "attempt to perform 'n%%0'"); + return 0; /* m % -1 == 0; avoid overflow with 0x80000...%-1 */ + } + else { + lua_Number r = m % n; + if (r != 0 && (m ^ n) < 0) /* 'm/n' would be non-integer negative? 
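(e.g. 5 % -2 is 1 in C but -1 under floor semantics, so add n) 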
*/ + r += n; /* correct result for different rounding */ + return r; + } +} + +/* + * End patch from 5.3.2 + */ + +void luaV_arith (lua_State *L, StkId ra, const TValue *rb, + const TValue *rc, TMS op) { + TValue tempb, tempc; + const TValue *b, *c; + if ((b = luaV_tonumber(rb, &tempb)) != NULL && + (c = luaV_tonumber(rc, &tempc)) != NULL) { + /* + * Patched: if dividing or modding, use patched functions from 5.3 + */ + lua_Number res; + int lop = op - TM_ADD + LUA_OPADD; + if (lop == LUA_OPDIV) { + res = luaV_div(L, nvalue(b), nvalue(c)); + } else if (lop == LUA_OPMOD) { + res = luaV_mod(L, nvalue(b), nvalue(c)); + } else { + res = luaO_arith(op - TM_ADD + LUA_OPADD, nvalue(b), nvalue(c)); + } + setnvalue(ra, res); + } + else if (!call_binTM(L, rb, rc, ra, op)) + luaG_aritherror(L, rb, rc); +} + + +/* +** check whether cached closure in prototype 'p' may be reused, that is, +** whether there is a cached closure with the same upvalues needed by +** new closure to be created. +*/ +static Closure *getcached (Proto *p, UpVal **encup, StkId base) { + Closure *c = p->cache; + if (c != NULL) { /* is there a cached closure? */ + int nup = p->sizeupvalues; + Upvaldesc *uv = p->upvalues; + int i; + for (i = 0; i < nup; i++) { /* check whether it has right upvalues */ + TValue *v = uv[i].instack ? base + uv[i].idx : encup[uv[i].idx]->v; + if (c->l.upvals[i]->v != v) + return NULL; /* wrong upvalue; cannot reuse closure */ + } + } + return c; /* return cached closure (or NULL if no cached closure) */ +} + + +/* +** create a new Lua closure, push it in the stack, and initialize +** its upvalues. Note that the call to 'luaC_barrierproto' must come +** before the assignment to 'p->cache', as the function needs the +** original value of that field. +*/ +static void pushclosure (lua_State *L, Proto *p, UpVal **encup, StkId base, + StkId ra) { + int nup = p->sizeupvalues; + Upvaldesc *uv = p->upvalues; + int i; + Closure *ncl = luaF_newLclosure(L, nup); + ncl->l.p = p; + setclLvalue(L, ra, ncl); /* anchor new closure in stack */ + for (i = 0; i < nup; i++) { /* fill in its upvalues */ + if (uv[i].instack) /* upvalue refers to local variable? */ + ncl->l.upvals[i] = luaF_findupval(L, base + uv[i].idx); + else /* get upvalue from enclosing function */ + ncl->l.upvals[i] = encup[uv[i].idx]; + } + luaC_barrierproto(L, p, ncl); + p->cache = ncl; /* save it on cache for reuse */ +} + + +/* +** finish execution of an opcode interrupted by an yield +*/ +void luaV_finishOp (lua_State *L) { + CallInfo *ci = L->ci; + StkId base = ci->u.l.base; + Instruction inst = *(ci->u.l.savedpc - 1); /* interrupted instruction */ + OpCode op = GET_OPCODE(inst); + switch (op) { /* finish its execution */ + case OP_ADD: case OP_SUB: case OP_MUL: case OP_DIV: + case OP_MOD: case OP_POW: case OP_UNM: case OP_LEN: + case OP_GETTABUP: case OP_GETTABLE: case OP_SELF: { + setobjs2s(L, base + GETARG_A(inst), --L->top); + break; + } + case OP_LE: case OP_LT: case OP_EQ: { + int res = !l_isfalse(L->top - 1); + L->top--; + /* metamethod should not be called when operand is K */ + lua_assert(!ISK(GETARG_B(inst))); + if (op == OP_LE && /* "<=" using "<" instead? */ + ttisnil(luaT_gettmbyobj(L, base + GETARG_B(inst), TM_LE))) + res = !res; /* invert result */ + lua_assert(GET_OPCODE(*ci->u.l.savedpc) == OP_JMP); + if (res != GETARG_A(inst)) /* condition failed? 
*/ + ci->u.l.savedpc++; /* skip jump instruction */ + break; + } + case OP_CONCAT: { + StkId top = L->top - 1; /* top when 'call_binTM' was called */ + int b = GETARG_B(inst); /* first element to concatenate */ + int total = cast_int(top - 1 - (base + b)); /* yet to concatenate */ + setobj2s(L, top - 2, top); /* put TM result in proper position */ + if (total > 1) { /* are there elements to concat? */ + L->top = top - 1; /* top is one after last element (at top-2) */ + luaV_concat(L, total); /* concat them (may yield again) */ + } + /* move final result to final position */ + setobj2s(L, ci->u.l.base + GETARG_A(inst), L->top - 1); + L->top = ci->top; /* restore top */ + break; + } + case OP_TFORCALL: { + lua_assert(GET_OPCODE(*ci->u.l.savedpc) == OP_TFORLOOP); + L->top = ci->top; /* correct top */ + break; + } + case OP_CALL: { + if (GETARG_C(inst) - 1 >= 0) /* nresults >= 0? */ + L->top = ci->top; /* adjust results */ + break; + } + case OP_TAILCALL: case OP_SETTABUP: case OP_SETTABLE: + break; + default: lua_assert(0); + } +} + + + +/* +** some macros for common tasks in `luaV_execute' +*/ + +#if !defined luai_runtimecheck +#define luai_runtimecheck(L, c) /* void */ +#endif + + +#define RA(i) (base+GETARG_A(i)) +/* to be used after possible stack reallocation */ +#define RB(i) check_exp(getBMode(GET_OPCODE(i)) == OpArgR, base+GETARG_B(i)) +#define RC(i) check_exp(getCMode(GET_OPCODE(i)) == OpArgR, base+GETARG_C(i)) +#define RKB(i) check_exp(getBMode(GET_OPCODE(i)) == OpArgK, \ + ISK(GETARG_B(i)) ? k+INDEXK(GETARG_B(i)) : base+GETARG_B(i)) +#define RKC(i) check_exp(getCMode(GET_OPCODE(i)) == OpArgK, \ + ISK(GETARG_C(i)) ? k+INDEXK(GETARG_C(i)) : base+GETARG_C(i)) +#define KBx(i) \ + (k + (GETARG_Bx(i) != 0 ? GETARG_Bx(i) - 1 : GETARG_Ax(*ci->u.l.savedpc++))) + + +/* execute a jump instruction */ +#define dojump(ci,i,e) \ + { int a = GETARG_A(i); \ + if (a > 0) luaF_close(L, ci->u.l.base + a - 1); \ + ci->u.l.savedpc += GETARG_sBx(i) + e; } + +/* for test instructions, execute the jump instruction that follows it */ +#define donextjump(ci) { i = *ci->u.l.savedpc; dojump(ci, i, 1); } + + +#define Protect(x) { {x;}; base = ci->u.l.base; } + +#define checkGC(L,c) \ + Protect( luaC_condGC(L,{L->top = (c); /* limit of live values */ \ + luaC_step(L); \ + L->top = ci->top;}) /* restore top */ \ + luai_threadyield(L); ) + + +#define arith_op(op,tm) { \ + TValue *rb = RKB(i); \ + TValue *rc = RKC(i); \ + if (ttisnumber(rb) && ttisnumber(rc)) { \ + lua_Number nb = nvalue(rb), nc = nvalue(rc); \ + setnvalue(ra, op(L, nb, nc)); \ + } \ + else { Protect(luaV_arith(L, ra, rb, rc, tm)); } } + + +#define vmdispatch(o) switch(o) +#define vmcase(l,b) case l: {b} break; +#define vmcasenb(l,b) case l: {b} /* nb = no break */ + +void luaV_execute (lua_State *L) { + CallInfo *ci = L->ci; + LClosure *cl; + TValue *k; + StkId base; + newframe: /* reentry point when frame changes (call/return) */ + lua_assert(ci == L->ci); + cl = clLvalue(ci->func); + k = cl->p->k; + base = ci->u.l.base; + /* main loop of interpreter */ + for (;;) { + Instruction i = *(ci->u.l.savedpc++); + StkId ra; + if ((L->hookmask & (LUA_MASKLINE | LUA_MASKCOUNT)) && + (--L->hookcount == 0 || L->hookmask & LUA_MASKLINE)) { + Protect(traceexec(L)); + } + /* WARNING: several calls may realloc the stack and invalidate `ra' */ + ra = RA(i); + lua_assert(base == ci->u.l.base); + lua_assert(base <= L->top && L->top < L->stack + L->stacksize); + vmdispatch (GET_OPCODE(i)) { + vmcase(OP_MOVE, + setobjs2s(L, ra, RB(i)); + ) + vmcase(OP_LOADK, + 
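/* load the constant at index Bx into register A */ +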
TValue *rb = k + GETARG_Bx(i); + setobj2s(L, ra, rb); + ) + vmcase(OP_LOADKX, + TValue *rb; + lua_assert(GET_OPCODE(*ci->u.l.savedpc) == OP_EXTRAARG); + rb = k + GETARG_Ax(*ci->u.l.savedpc++); + setobj2s(L, ra, rb); + ) + vmcase(OP_LOADBOOL, + setbvalue(ra, GETARG_B(i)); + if (GETARG_C(i)) ci->u.l.savedpc++; /* skip next instruction (if C) */ + ) + vmcase(OP_LOADNIL, + int b = GETARG_B(i); + do { + setnilvalue(ra++); + } while (b--); + ) + vmcase(OP_GETUPVAL, + int b = GETARG_B(i); + setobj2s(L, ra, cl->upvals[b]->v); + ) + vmcase(OP_GETTABUP, + int b = GETARG_B(i); + Protect(luaV_gettable(L, cl->upvals[b]->v, RKC(i), ra)); + ) + vmcase(OP_GETTABLE, + Protect(luaV_gettable(L, RB(i), RKC(i), ra)); + ) + vmcase(OP_SETTABUP, + int a = GETARG_A(i); + Protect(luaV_settable(L, cl->upvals[a]->v, RKB(i), RKC(i))); + ) + vmcase(OP_SETUPVAL, + UpVal *uv = cl->upvals[GETARG_B(i)]; + setobj(L, uv->v, ra); + luaC_barrier(L, uv, ra); + ) + vmcase(OP_SETTABLE, + Protect(luaV_settable(L, ra, RKB(i), RKC(i))); + ) + vmcase(OP_NEWTABLE, + int b = GETARG_B(i); + int c = GETARG_C(i); + Table *t = luaH_new(L); + sethvalue(L, ra, t); + if (b != 0 || c != 0) + luaH_resize(L, t, luaO_fb2int(b), luaO_fb2int(c)); + checkGC(L, ra + 1); + ) + vmcase(OP_SELF, + StkId rb = RB(i); + setobjs2s(L, ra+1, rb); + Protect(luaV_gettable(L, rb, RKC(i), ra)); + ) + vmcase(OP_ADD, + arith_op(luai_numadd, TM_ADD); + ) + vmcase(OP_SUB, + arith_op(luai_numsub, TM_SUB); + ) + vmcase(OP_MUL, + arith_op(luai_nummul, TM_MUL); + ) + /* + * Patched: use luaV_* instead of luai_* to handle div/mod by 0 + */ + vmcase(OP_DIV, + arith_op(luaV_div, TM_DIV); + ) + vmcase(OP_MOD, + arith_op(luaV_mod, TM_MOD); + ) + vmcase(OP_POW, + arith_op(luai_numpow, TM_POW); + ) + vmcase(OP_UNM, + TValue *rb = RB(i); + if (ttisnumber(rb)) { + lua_Number nb = nvalue(rb); + setnvalue(ra, luai_numunm(L, nb)); + } + else { + Protect(luaV_arith(L, ra, rb, rb, TM_UNM)); + } + ) + vmcase(OP_NOT, + TValue *rb = RB(i); + int res = l_isfalse(rb); /* next assignment may change this value */ + setbvalue(ra, res); + ) + vmcase(OP_LEN, + Protect(luaV_objlen(L, ra, RB(i))); + ) + vmcase(OP_CONCAT, + int b = GETARG_B(i); + int c = GETARG_C(i); + StkId rb; + L->top = base + c + 1; /* mark the end of concat operands */ + Protect(luaV_concat(L, c - b + 1)); + ra = RA(i); /* 'luav_concat' may invoke TMs and move the stack */ + rb = b + base; + setobjs2s(L, ra, rb); + checkGC(L, (ra >= rb ? ra + 1 : rb)); + L->top = ci->top; /* restore top */ + ) + vmcase(OP_JMP, + dojump(ci, i, 0); + ) + vmcase(OP_EQ, + TValue *rb = RKB(i); + TValue *rc = RKC(i); + Protect( + if (cast_int(equalobj(L, rb, rc)) != GETARG_A(i)) + ci->u.l.savedpc++; + else + donextjump(ci); + ) + ) + vmcase(OP_LT, + Protect( + if (luaV_lessthan(L, RKB(i), RKC(i)) != GETARG_A(i)) + ci->u.l.savedpc++; + else + donextjump(ci); + ) + ) + vmcase(OP_LE, + Protect( + if (luaV_lessequal(L, RKB(i), RKC(i)) != GETARG_A(i)) + ci->u.l.savedpc++; + else + donextjump(ci); + ) + ) + vmcase(OP_TEST, + if (GETARG_C(i) ? l_isfalse(ra) : !l_isfalse(ra)) + ci->u.l.savedpc++; + else + donextjump(ci); + ) + vmcase(OP_TESTSET, + TValue *rb = RB(i); + if (GETARG_C(i) ? l_isfalse(rb) : !l_isfalse(rb)) + ci->u.l.savedpc++; + else { + setobjs2s(L, ra, rb); + donextjump(ci); + } + ) + vmcase(OP_CALL, + int b = GETARG_B(i); + int nresults = GETARG_C(i) - 1; + if (b != 0) L->top = ra+b; /* else previous instruction set top */ + if (luaD_precall(L, ra, nresults)) { /* C function? 
*/ + if (nresults >= 0) L->top = ci->top; /* adjust results */ + base = ci->u.l.base; + } + else { /* Lua function */ + ci = L->ci; + ci->callstatus |= CIST_REENTRY; + goto newframe; /* restart luaV_execute over new Lua function */ + } + ) + vmcase(OP_TAILCALL, + int b = GETARG_B(i); + if (b != 0) L->top = ra+b; /* else previous instruction set top */ + lua_assert(GETARG_C(i) - 1 == LUA_MULTRET); + if (luaD_precall(L, ra, LUA_MULTRET)) /* C function? */ + base = ci->u.l.base; + else { + /* tail call: put called frame (n) in place of caller one (o) */ + CallInfo *nci = L->ci; /* called frame */ + CallInfo *oci = nci->previous; /* caller frame */ + StkId nfunc = nci->func; /* called function */ + StkId ofunc = oci->func; /* caller function */ + /* last stack slot filled by 'precall' */ + StkId lim = nci->u.l.base + getproto(nfunc)->numparams; + int aux; + /* close all upvalues from previous call */ + if (cl->p->sizep > 0) luaF_close(L, oci->u.l.base); + /* move new frame into old one */ + for (aux = 0; nfunc + aux < lim; aux++) + setobjs2s(L, ofunc + aux, nfunc + aux); + oci->u.l.base = ofunc + (nci->u.l.base - nfunc); /* correct base */ + oci->top = L->top = ofunc + (L->top - nfunc); /* correct top */ + oci->u.l.savedpc = nci->u.l.savedpc; + oci->callstatus |= CIST_TAIL; /* function was tail called */ + ci = L->ci = oci; /* remove new frame */ + lua_assert(L->top == oci->u.l.base + getproto(ofunc)->maxstacksize); + goto newframe; /* restart luaV_execute over new Lua function */ + } + ) + vmcasenb(OP_RETURN, + int b = GETARG_B(i); + if (b != 0) L->top = ra+b-1; + if (cl->p->sizep > 0) luaF_close(L, base); + b = luaD_poscall(L, ra); + if (!(ci->callstatus & CIST_REENTRY)) /* 'ci' still the called one */ + return; /* external invocation: return */ + else { /* invocation via reentry: continue execution */ + ci = L->ci; + if (b) L->top = ci->top; + lua_assert(isLua(ci)); + lua_assert(GET_OPCODE(*((ci)->u.l.savedpc - 1)) == OP_CALL); + goto newframe; /* restart luaV_execute over new Lua function */ + } + ) + vmcase(OP_FORLOOP, + lua_Number step = nvalue(ra+2); + lua_Number idx = luai_numadd(L, nvalue(ra), step); /* increment index */ + lua_Number limit = nvalue(ra+1); + if (luai_numlt(L, 0, step) ? luai_numle(L, idx, limit) + : luai_numle(L, limit, idx)) { + ci->u.l.savedpc += GETARG_sBx(i); /* jump back */ + setnvalue(ra, idx); /* update internal index... */ + setnvalue(ra+3, idx); /* ...and external index */ + } + ) + vmcase(OP_FORPREP, + const TValue *init = ra; + const TValue *plimit = ra+1; + const TValue *pstep = ra+2; + if (!tonumber(init, ra)) + luaG_runerror(L, LUA_QL("for") " initial value must be a number"); + else if (!tonumber(plimit, ra+1)) + luaG_runerror(L, LUA_QL("for") " limit must be a number"); + else if (!tonumber(pstep, ra+2)) + luaG_runerror(L, LUA_QL("for") " step must be a number"); + setnvalue(ra, luai_numsub(L, nvalue(ra), nvalue(pstep))); + ci->u.l.savedpc += GETARG_sBx(i); + ) + vmcasenb(OP_TFORCALL, + StkId cb = ra + 3; /* call base */ + setobjs2s(L, cb+2, ra+2); + setobjs2s(L, cb+1, ra+1); + setobjs2s(L, cb, ra); + L->top = cb + 3; /* func. + 2 args (state and index) */ + Protect(luaD_call(L, cb, GETARG_C(i), 1)); + L->top = ci->top; + i = *(ci->u.l.savedpc++); /* go to next instruction */ + ra = RA(i); + lua_assert(GET_OPCODE(i) == OP_TFORLOOP); + goto l_tforloop; + ) + vmcase(OP_TFORLOOP, + l_tforloop: + if (!ttisnil(ra + 1)) { /* continue loop? 
*/ + setobjs2s(L, ra, ra + 1); /* save control variable */ + ci->u.l.savedpc += GETARG_sBx(i); /* jump back */ + } + ) + vmcase(OP_SETLIST, + int n = GETARG_B(i); + int c = GETARG_C(i); + int last; + Table *h; + if (n == 0) n = cast_int(L->top - ra) - 1; + if (c == 0) { + lua_assert(GET_OPCODE(*ci->u.l.savedpc) == OP_EXTRAARG); + c = GETARG_Ax(*ci->u.l.savedpc++); + } + luai_runtimecheck(L, ttistable(ra)); + h = hvalue(ra); + last = ((c-1)*LFIELDS_PER_FLUSH) + n; + if (last > h->sizearray) /* needs more space? */ + luaH_resizearray(L, h, last); /* pre-allocate it at once */ + for (; n > 0; n--) { + TValue *val = ra+n; + luaH_setint(L, h, last--, val); + luaC_barrierback(L, obj2gco(h), val); + } + L->top = ci->top; /* correct top (in case of previous open call) */ + ) + vmcase(OP_CLOSURE, + Proto *p = cl->p->p[GETARG_Bx(i)]; + Closure *ncl = getcached(p, cl->upvals, base); /* cached closure */ + if (ncl == NULL) /* no match? */ + pushclosure(L, p, cl->upvals, base, ra); /* create a new one */ + else + setclLvalue(L, ra, ncl); /* push cashed closure */ + checkGC(L, ra + 1); + ) + vmcase(OP_VARARG, + int b = GETARG_B(i) - 1; + int j; + int n = cast_int(base - ci->func) - cl->p->numparams - 1; + if (b < 0) { /* B == 0? */ + b = n; /* get all var. arguments */ + Protect(luaD_checkstack(L, n)); + ra = RA(i); /* previous call may change the stack */ + L->top = ra + n; + } + for (j = 0; j < b; j++) { + if (j < n) { + setobjs2s(L, ra + j, base - n + j); + } + else { + setnilvalue(ra + j); + } + } + ) + vmcase(OP_EXTRAARG, + lua_assert(0); + ) + } + } +} + diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lvm.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lvm.h new file mode 100644 index 000000000000..5380270da63d --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lvm.h @@ -0,0 +1,44 @@ +/* +** $Id: lvm.h,v 2.18.1.1 2013/04/12 18:48:47 roberto Exp $ +** Lua virtual machine +** See Copyright Notice in lua.h +*/ + +#ifndef lvm_h +#define lvm_h + + +#include "ldo.h" +#include "lobject.h" +#include "ltm.h" + + +#define tostring(L,o) (ttisstring(o) || (luaV_tostring(L, o))) + +#define tonumber(o,n) (ttisnumber(o) || (((o) = luaV_tonumber(o,n)) != NULL)) + +#define equalobj(L,o1,o2) (ttisequal(o1, o2) && luaV_equalobj_(L, o1, o2)) + +#define luaV_rawequalobj(o1,o2) equalobj(NULL,o1,o2) + + +/* not to called directly */ +LUAI_FUNC int luaV_equalobj_ (lua_State *L, const TValue *t1, const TValue *t2); + + +LUAI_FUNC int luaV_lessthan (lua_State *L, const TValue *l, const TValue *r); +LUAI_FUNC int luaV_lessequal (lua_State *L, const TValue *l, const TValue *r); +LUAI_FUNC const TValue *luaV_tonumber (const TValue *obj, TValue *n); +LUAI_FUNC int luaV_tostring (lua_State *L, StkId obj); +LUAI_FUNC void luaV_gettable (lua_State *L, const TValue *t, TValue *key, + StkId val); +LUAI_FUNC void luaV_settable (lua_State *L, const TValue *t, TValue *key, + StkId val); +LUAI_FUNC void luaV_finishOp (lua_State *L); +LUAI_FUNC void luaV_execute (lua_State *L); +LUAI_FUNC void luaV_concat (lua_State *L, int total); +LUAI_FUNC void luaV_arith (lua_State *L, StkId ra, const TValue *rb, + const TValue *rc, TMS op); +LUAI_FUNC void luaV_objlen (lua_State *L, StkId ra, const TValue *rb); + +#endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lzio.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lzio.c new file mode 100644 index 000000000000..53e6a3daeb5a --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lzio.c @@ -0,0 +1,76 @@ +/* +** 
$Id: lzio.c,v 1.35.1.1 2013/04/12 18:48:47 roberto Exp $ +** Buffered streams +** See Copyright Notice in lua.h +*/ + + +#include <sys/zfs_context.h> + +#define lzio_c +#define LUA_CORE + +#include "lua.h" + +#include "llimits.h" +#include "lmem.h" +#include "lstate.h" +#include "lzio.h" + + +int luaZ_fill (ZIO *z) { + size_t size; + lua_State *L = z->L; + const char *buff; + lua_unlock(L); + buff = z->reader(L, z->data, &size); + lua_lock(L); + if (buff == NULL || size == 0) + return EOZ; + z->n = size - 1; /* discount char being returned */ + z->p = buff; + return cast_uchar(*(z->p++)); +} + + +void luaZ_init (lua_State *L, ZIO *z, lua_Reader reader, void *data) { + z->L = L; + z->reader = reader; + z->data = data; + z->n = 0; + z->p = NULL; +} + + +/* --------------------------------------------------------------- read --- */ +size_t luaZ_read (ZIO *z, void *b, size_t n) { + while (n) { + size_t m; + if (z->n == 0) { /* no bytes in buffer? */ + if (luaZ_fill(z) == EOZ) /* try to read more */ + return n; /* no more input; return number of missing bytes */ + else { + z->n++; /* luaZ_fill consumed first byte; put it back */ + z->p--; + } + } + m = (n <= z->n) ? n : z->n; /* min. between n and z->n */ + memcpy(b, z->p, m); + z->n -= m; + z->p += m; + b = (char *)b + m; + n -= m; + } + return 0; +} + +/* ------------------------------------------------------------------------ */ +char *luaZ_openspace (lua_State *L, Mbuffer *buff, size_t n) { + if (n > buff->buffsize) { + if (n < LUA_MINBUFFER) n = LUA_MINBUFFER; + luaZ_resizebuffer(L, buff, n); + } + return buff->buffer; +} + + diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lzio.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lzio.h new file mode 100644 index 000000000000..441f7479cb14 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lzio.h @@ -0,0 +1,65 @@ +/* +** $Id: lzio.h,v 1.26.1.1 2013/04/12 18:48:47 roberto Exp $ +** Buffered streams +** See Copyright Notice in lua.h +*/ + + +#ifndef lzio_h +#define lzio_h + +#include "lua.h" + +#include "lmem.h" + + +#define EOZ (-1) /* end of stream */ + +typedef struct Zio ZIO; + +#define zgetc(z) (((z)->n--)>0 ? 
cast_uchar(*(z)->p++) : luaZ_fill(z)) + + +typedef struct Mbuffer { + char *buffer; + size_t n; + size_t buffsize; +} Mbuffer; + +#define luaZ_initbuffer(L, buff) ((buff)->buffer = NULL, (buff)->buffsize = 0) + +#define luaZ_buffer(buff) ((buff)->buffer) +#define luaZ_sizebuffer(buff) ((buff)->buffsize) +#define luaZ_bufflen(buff) ((buff)->n) + +#define luaZ_resetbuffer(buff) ((buff)->n = 0) + + +#define luaZ_resizebuffer(L, buff, size) \ + (luaM_reallocvector(L, (buff)->buffer, (buff)->buffsize, size, char), \ + (buff)->buffsize = size) + +#define luaZ_freebuffer(L, buff) luaZ_resizebuffer(L, buff, 0) + + +LUAI_FUNC char *luaZ_openspace (lua_State *L, Mbuffer *buff, size_t n); +LUAI_FUNC void luaZ_init (lua_State *L, ZIO *z, lua_Reader reader, + void *data); +LUAI_FUNC size_t luaZ_read (ZIO* z, void* b, size_t n); /* read next n bytes */ + + + +/* --------- Private Part ------------------ */ + +struct Zio { + size_t n; /* bytes still unread */ + const char *p; /* current position in buffer */ + lua_Reader reader; /* reader function */ + void* data; /* additional data */ + lua_State *L; /* Lua state (for reader) */ +}; + + +LUAI_FUNC int luaZ_fill (ZIO *z); + +#endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lz4.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lz4.c new file mode 100644 index 000000000000..ce595d24d89c --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lz4.c @@ -0,0 +1,1028 @@ +/* + * LZ4 - Fast LZ compression algorithm + * Header File + * Copyright (C) 2011-2013, Yann Collet. + * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * You can contact the author at : + * - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html + * - LZ4 source repository : http://code.google.com/p/lz4/ + */ +/* + * Copyright (c) 2016 by Delphix. All rights reserved. 
+ */ + +#include <sys/zfs_context.h> + +static int real_LZ4_compress(const char *source, char *dest, int isize, + int osize); +static int LZ4_compressBound(int isize); +static int LZ4_uncompress_unknownOutputSize(const char *source, char *dest, + int isize, int maxOutputSize); +static int LZ4_compressCtx(void *ctx, const char *source, char *dest, + int isize, int osize); +static int LZ4_compress64kCtx(void *ctx, const char *source, char *dest, + int isize, int osize); + +static kmem_cache_t *lz4_ctx_cache; + +/*ARGSUSED*/ +size_t +lz4_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) +{ + uint32_t bufsiz; + char *dest = d_start; + + ASSERT(d_len >= sizeof (bufsiz)); + + bufsiz = real_LZ4_compress(s_start, &dest[sizeof (bufsiz)], s_len, + d_len - sizeof (bufsiz)); + + /* Signal an error if the compression routine returned zero. */ + if (bufsiz == 0) + return (s_len); + + /* + * Encode the compresed buffer size at the start. We'll need this in + * decompression to counter the effects of padding which might be + * added to the compressed buffer and which, if unhandled, would + * confuse the hell out of our decompression function. + */ + *(uint32_t *)dest = BE_32(bufsiz); + + return (bufsiz + sizeof (bufsiz)); +} + +/*ARGSUSED*/ +int +lz4_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) +{ + const char *src = s_start; + uint32_t bufsiz = BE_IN32(src); + + /* invalid compressed buffer size encoded at start */ + if (bufsiz + sizeof (bufsiz) > s_len) + return (1); + + /* + * Returns 0 on success (decompression function returned non-negative) + * and non-zero on failure (decompression function returned negative). + */ + return (LZ4_uncompress_unknownOutputSize(&src[sizeof (bufsiz)], + d_start, bufsiz, d_len) < 0); +} + +/* + * LZ4 API Description: + * + * Simple Functions: + * real_LZ4_compress() : + * isize : is the input size. Max supported value is ~1.9GB + * return : the number of bytes written in buffer dest + * or 0 if the compression fails (if LZ4_COMPRESSMIN is set). + * note : destination buffer must be already allocated. + * destination buffer must be sized to handle worst cases + * situations (input data not compressible) worst case size + * evaluation is provided by function LZ4_compressBound(). + * + * Advanced Functions + * + * LZ4_compressBound() : + * Provides the maximum size that LZ4 may output in a "worst case" + * scenario (input data not compressible) primarily useful for memory + * allocation of output buffer. + * + * isize : is the input size. Max supported value is ~1.9GB + * return : maximum output size in a "worst case" scenario + * note : this function is limited by "int" range (2^31-1) + * + * LZ4_uncompress_unknownOutputSize() : + * isize : is the input size, therefore the compressed size + * maxOutputSize : is the size of the destination buffer (which must be + * already allocated) + * return : the number of bytes decoded in the destination buffer + * (necessarily <= maxOutputSize). If the source stream is + * malformed, the function will stop decoding and return a + * negative result, indicating the byte position of the faulty + * instruction. This function never writes beyond dest + + * maxOutputSize, and is therefore protected against malicious + * data packets. + * note : Destination buffer must be already allocated. + * + * LZ4_compressCtx() : + * This function explicitly handles the CTX memory structure. 
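+ * (The ctx argument provides scratch memory for the match-finding hash + * table; see struct refTables further below.)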
+ * + * ILLUMOS CHANGES: the CTX memory structure must be explicitly allocated + * by the caller (either on the stack or using kmem_zalloc). Passing NULL + * isn't valid. + * + * LZ4_compress64kCtx() : + * Same as LZ4_compressCtx(), but specific to small inputs (<64KB). + * isize *Must* be <64KB, otherwise the output will be corrupted. + * + * ILLUMOS CHANGES: the CTX memory structure must be explicitly allocated + * by the caller (either on the stack or using kmem_zalloc). Passing NULL + * isn't valid. + */ + +/* + * Tuning parameters + */ + +/* + * COMPRESSIONLEVEL: Increasing this value improves compression ratio + * Lowering this value reduces memory usage. Reduced memory usage + * typically improves speed, due to cache effect (ex: L1 32KB for Intel, + * L1 64KB for AMD). Memory usage formula : N->2^(N+2) Bytes + * (examples : 12 -> 16KB ; 17 -> 512KB) + */ +#define COMPRESSIONLEVEL 12 + +/* + * NOTCOMPRESSIBLE_CONFIRMATION: Decreasing this value will make the + * algorithm skip faster data segments considered "incompressible". + * This may decrease compression ratio dramatically, but will be + * faster on incompressible data. Increasing this value will make + * the algorithm search more before declaring a segment "incompressible". + * This could improve compression a bit, but will be slower on + * incompressible data. The default value (6) is recommended. + */ +#define NOTCOMPRESSIBLE_CONFIRMATION 6 + +/* + * BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE: This will provide a boost to + * performance for big endian cpu, but the resulting compressed stream + * will be incompatible with little-endian CPU. You can set this option + * to 1 in situations where data will stay within closed environment. + * This option is useless on Little_Endian CPU (such as x86). + */ +/* #define BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE 1 */ + +/* + * CPU Feature Detection + */ + +/* 32 or 64 bits ? */ +#if (defined(__x86_64__) || defined(__x86_64) || defined(__amd64__) || \ + defined(__amd64) || defined(__ppc64__) || defined(_WIN64) || \ + defined(__LP64__) || defined(_LP64)) +#define LZ4_ARCH64 1 +#else +#define LZ4_ARCH64 0 +#endif + +/* + * Limits the amount of stack space that the algorithm may consume to hold + * the compression lookup table. The value `9' here means we'll never use + * more than 2k of stack (see above for a description of COMPRESSIONLEVEL). + * If more memory is needed, it is allocated from the heap. + */ +/* FreeBSD: Use heap for all platforms for now */ +#define STACKLIMIT 0 + +/* + * Little Endian or Big Endian? + * Note: overwrite the below #define if you know your architecture endianess. + */ +#if BYTE_ORDER == BIG_ENDIAN +#define LZ4_BIG_ENDIAN 1 +#else +/* + * Little Endian assumed. PDP Endian and other very rare endian format + * are unsupported. + */ +#endif + +/* + * Unaligned memory access is automatically enabled for "common" CPU, + * such as x86. For others CPU, the compiler will be more cautious, and + * insert extra code to ensure aligned access is respected. If you know + * your target CPU supports unaligned memory access, you may want to + * force this option manually to improve performance + */ +#if defined(__ARM_FEATURE_UNALIGNED) +#define LZ4_FORCE_UNALIGNED_ACCESS 1 +#endif + +/* + * FreeBSD: can't use GCC's __builtin_ctz when using sparc64 because + * gcc currently rely on libcompiler_rt. + * + * TODO: revisit this when situation changes. 
+ */ +#if defined(__sparc64__) +#define LZ4_FORCE_SW_BITCOUNT +#endif + +/* + * Compiler Options + */ +#if __STDC_VERSION__ >= 199901L /* C99 */ +/* "restrict" is a known keyword */ +#else +/* Disable restrict */ +#define restrict +#endif + +#define lz4_bswap16(x) ((unsigned short int) ((((x) >> 8) & 0xffu) | \ + (((x) & 0xffu) << 8))) + +#define expect(expr, value) (__builtin_expect((expr), (value))) + +#if defined(likely) +#undef likely +#endif +#if defined(unlikely) +#undef unlikely +#endif + +#define likely(expr) expect((expr) != 0, 1) +#define unlikely(expr) expect((expr) != 0, 0) + +/* Basic types */ +#define BYTE uint8_t +#define U16 uint16_t +#define U32 uint32_t +#define S32 int32_t +#define U64 uint64_t + +#ifndef LZ4_FORCE_UNALIGNED_ACCESS +#pragma pack(1) +#endif + +typedef struct _U16_S { + U16 v; +} U16_S; +typedef struct _U32_S { + U32 v; +} U32_S; +typedef struct _U64_S { + U64 v; +} U64_S; + +#ifndef LZ4_FORCE_UNALIGNED_ACCESS +#pragma pack() +#endif + +#define A64(x) (((U64_S *)(x))->v) +#define A32(x) (((U32_S *)(x))->v) +#define A16(x) (((U16_S *)(x))->v) + +/* + * Constants + */ +#define MINMATCH 4 + +#define HASH_LOG COMPRESSIONLEVEL +#define HASHTABLESIZE (1 << HASH_LOG) +#define HASH_MASK (HASHTABLESIZE - 1) + +#define SKIPSTRENGTH (NOTCOMPRESSIBLE_CONFIRMATION > 2 ? \ + NOTCOMPRESSIBLE_CONFIRMATION : 2) + +/* + * Defines if memory is allocated into the stack (local variable), + * or into the heap (kmem_alloc()). + */ +#define HEAPMODE (HASH_LOG > STACKLIMIT) +#define COPYLENGTH 8 +#define LASTLITERALS 5 +#define MFLIMIT (COPYLENGTH + MINMATCH) +#define MINLENGTH (MFLIMIT + 1) + +#define MAXD_LOG 16 +#define MAX_DISTANCE ((1 << MAXD_LOG) - 1) + +#define ML_BITS 4 +#define ML_MASK ((1U<<ML_BITS)-1) +#define RUN_BITS (8-ML_BITS) +#define RUN_MASK ((1U<<RUN_BITS)-1) + + +/* + * Architecture-specific macros + */ +#if LZ4_ARCH64 +#define STEPSIZE 8 +#define UARCH U64 +#define AARCH A64 +#define LZ4_COPYSTEP(s, d) A64(d) = A64(s); d += 8; s += 8; +#define LZ4_COPYPACKET(s, d) LZ4_COPYSTEP(s, d) +#define LZ4_SECURECOPY(s, d, e) if (d < e) LZ4_WILDCOPY(s, d, e) +#define HTYPE U32 +#define INITBASE(base) const BYTE* const base = ip +#else /* !LZ4_ARCH64 */ +#define STEPSIZE 4 +#define UARCH U32 +#define AARCH A32 +#define LZ4_COPYSTEP(s, d) A32(d) = A32(s); d += 4; s += 4; +#define LZ4_COPYPACKET(s, d) LZ4_COPYSTEP(s, d); LZ4_COPYSTEP(s, d); +#define LZ4_SECURECOPY LZ4_WILDCOPY +#define HTYPE const BYTE * +#define INITBASE(base) const int base = 0 +#endif /* !LZ4_ARCH64 */ + +#if (defined(LZ4_BIG_ENDIAN) && !defined(BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE)) +#define LZ4_READ_LITTLEENDIAN_16(d, s, p) \ + { U16 v = A16(p); v = lz4_bswap16(v); d = (s) - v; } +#define LZ4_WRITE_LITTLEENDIAN_16(p, i) \ + { U16 v = (U16)(i); v = lz4_bswap16(v); A16(p) = v; p += 2; } +#else +#define LZ4_READ_LITTLEENDIAN_16(d, s, p) { d = (s) - A16(p); } +#define LZ4_WRITE_LITTLEENDIAN_16(p, v) { A16(p) = v; p += 2; } +#endif + + +/* Local structures */ +struct refTables { + HTYPE hashTable[HASHTABLESIZE]; +}; + + +/* Macros */ +#define LZ4_HASH_FUNCTION(i) (((i) * 2654435761U) >> ((MINMATCH * 8) - \ + HASH_LOG)) +#define LZ4_HASH_VALUE(p) LZ4_HASH_FUNCTION(A32(p)) +#define LZ4_WILDCOPY(s, d, e) do { LZ4_COPYPACKET(s, d) } while (d < e); +#define LZ4_BLINDCOPY(s, d, l) { BYTE* e = (d) + l; LZ4_WILDCOPY(s, d, e); \ + d = e; } + + +/* Private functions */ +#if LZ4_ARCH64 + +static inline int +LZ4_NbCommonBytes(register U64 val) +{ +#if defined(LZ4_BIG_ENDIAN) +#if !defined(LZ4_FORCE_SW_BITCOUNT) + return 
(__builtin_clzll(val) >> 3); +#else + int r; + if (!(val >> 32)) { + r = 4; + } else { + r = 0; + val >>= 32; + } + if (!(val >> 16)) { + r += 2; + val >>= 8; + } else { + val >>= 24; + } + r += (!val); + return (r); +#endif +#else +#if !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_ctzll(val) >> 3); +#else + static const int DeBruijnBytePos[64] = + { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, + 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, + 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, + 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 + }; + return DeBruijnBytePos[((U64) ((val & -val) * 0x0218A392CDABBD3F)) >> + 58]; +#endif +#endif +} + +#else + +static inline int +LZ4_NbCommonBytes(register U32 val) +{ +#if defined(LZ4_BIG_ENDIAN) +#if !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_clz(val) >> 3); +#else + int r; + if (!(val >> 16)) { + r = 2; + val >>= 8; + } else { + r = 0; + val >>= 24; + } + r += (!val); + return (r); +#endif +#else +#if !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_ctz(val) >> 3); +#else + static const int DeBruijnBytePos[32] = { + 0, 0, 3, 0, 3, 1, 3, 0, + 3, 2, 2, 1, 3, 2, 0, 1, + 3, 3, 1, 2, 2, 2, 2, 0, + 3, 1, 2, 0, 1, 0, 1, 1 + }; + return DeBruijnBytePos[((U32) ((val & -(S32) val) * 0x077CB531U)) >> + 27]; +#endif +#endif +} + +#endif + +/* Public functions */ + +static int +LZ4_compressBound(int isize) +{ + return (isize + (isize / 255) + 16); +} + +/* Compression functions */ + +/*ARGSUSED*/ +static int +LZ4_compressCtx(void *ctx, const char *source, char *dest, int isize, + int osize) +{ +#if HEAPMODE + struct refTables *srt = (struct refTables *)ctx; + HTYPE *HashTable = (HTYPE *) (srt->hashTable); +#else + HTYPE HashTable[HASHTABLESIZE] = { 0 }; +#endif + + const BYTE *ip = (BYTE *) source; + INITBASE(base); + const BYTE *anchor = ip; + const BYTE *const iend = ip + isize; + const BYTE *const oend = (BYTE *) dest + osize; + const BYTE *const mflimit = iend - MFLIMIT; +#define matchlimit (iend - LASTLITERALS) + + BYTE *op = (BYTE *) dest; + + int len, length; + const int skipStrength = SKIPSTRENGTH; + U32 forwardH; + + + /* Init */ + if (isize < MINLENGTH) + goto _last_literals; + + /* First Byte */ + HashTable[LZ4_HASH_VALUE(ip)] = ip - base; + ip++; + forwardH = LZ4_HASH_VALUE(ip); + + /* Main Loop */ + for (;;) { + int findMatchAttempts = (1U << skipStrength) + 3; + const BYTE *forwardIp = ip; + const BYTE *ref; + BYTE *token; + + /* Find a match */ + do { + U32 h = forwardH; + int step = findMatchAttempts++ >> skipStrength; + ip = forwardIp; + forwardIp = ip + step; + + if unlikely(forwardIp > mflimit) { + goto _last_literals; + } + + forwardH = LZ4_HASH_VALUE(forwardIp); + ref = base + HashTable[h]; + HashTable[h] = ip - base; + + } while ((ref < ip - MAX_DISTANCE) || (A32(ref) != A32(ip))); + + /* Catch up */ + while ((ip > anchor) && (ref > (BYTE *) source) && + unlikely(ip[-1] == ref[-1])) { + ip--; + ref--; + } + + /* Encode Literal length */ + length = ip - anchor; + token = op++; + + /* Check output limit */ + if unlikely(op + length + (2 + 1 + LASTLITERALS) + + (length >> 8) > oend) + return (0); + + if (length >= (int)RUN_MASK) { + *token = (RUN_MASK << ML_BITS); + len = length - RUN_MASK; + for (; len > 254; len -= 255) + *op++ = 255; + *op++ = (BYTE)len; + } else + *token = (length << ML_BITS); + + /* Copy Literals */ + LZ4_BLINDCOPY(anchor, op, length); + + _next_match: + /* Encode Offset */ + LZ4_WRITE_LITTLEENDIAN_16(op, ip - ref); + + /* Start Counting */ + ip += MINMATCH; + ref += MINMATCH; /* MinMatch verified 
*/ + anchor = ip; + while likely(ip < matchlimit - (STEPSIZE - 1)) { + UARCH diff = AARCH(ref) ^ AARCH(ip); + if (!diff) { + ip += STEPSIZE; + ref += STEPSIZE; + continue; + } + ip += LZ4_NbCommonBytes(diff); + goto _endCount; + } +#if LZ4_ARCH64 + if ((ip < (matchlimit - 3)) && (A32(ref) == A32(ip))) { + ip += 4; + ref += 4; + } +#endif + if ((ip < (matchlimit - 1)) && (A16(ref) == A16(ip))) { + ip += 2; + ref += 2; + } + if ((ip < matchlimit) && (*ref == *ip)) + ip++; + _endCount: + + /* Encode MatchLength */ + len = (ip - anchor); + /* Check output limit */ + if unlikely(op + (1 + LASTLITERALS) + (len >> 8) > oend) + return (0); + if (len >= (int)ML_MASK) { + *token += ML_MASK; + len -= ML_MASK; + for (; len > 509; len -= 510) { + *op++ = 255; + *op++ = 255; + } + if (len > 254) { + len -= 255; + *op++ = 255; + } + *op++ = (BYTE)len; + } else + *token += len; + + /* Test end of chunk */ + if (ip > mflimit) { + anchor = ip; + break; + } + /* Fill table */ + HashTable[LZ4_HASH_VALUE(ip - 2)] = ip - 2 - base; + + /* Test next position */ + ref = base + HashTable[LZ4_HASH_VALUE(ip)]; + HashTable[LZ4_HASH_VALUE(ip)] = ip - base; + if ((ref > ip - (MAX_DISTANCE + 1)) && (A32(ref) == A32(ip))) { + token = op++; + *token = 0; + goto _next_match; + } + /* Prepare next loop */ + anchor = ip++; + forwardH = LZ4_HASH_VALUE(ip); + } + + _last_literals: + /* Encode Last Literals */ + { + int lastRun = iend - anchor; + if (op + lastRun + 1 + ((lastRun + 255 - RUN_MASK) / 255) > + oend) + return (0); + if (lastRun >= (int)RUN_MASK) { + *op++ = (RUN_MASK << ML_BITS); + lastRun -= RUN_MASK; + for (; lastRun > 254; lastRun -= 255) { + *op++ = 255; + } + *op++ = (BYTE)lastRun; + } else + *op++ = (lastRun << ML_BITS); + (void) memcpy(op, anchor, iend - anchor); + op += iend - anchor; + } + + /* End */ + return (int)(((char *)op) - dest); +} + + + +/* Note : this function is valid only if isize < LZ4_64KLIMIT */ +#define LZ4_64KLIMIT ((1 << 16) + (MFLIMIT - 1)) +#define HASHLOG64K (HASH_LOG + 1) +#define HASH64KTABLESIZE (1U << HASHLOG64K) +#define LZ4_HASH64K_FUNCTION(i) (((i) * 2654435761U) >> ((MINMATCH*8) - \ + HASHLOG64K)) +#define LZ4_HASH64K_VALUE(p) LZ4_HASH64K_FUNCTION(A32(p)) + +/*ARGSUSED*/ +static int +LZ4_compress64kCtx(void *ctx, const char *source, char *dest, int isize, + int osize) +{ +#if HEAPMODE + struct refTables *srt = (struct refTables *)ctx; + U16 *HashTable = (U16 *) (srt->hashTable); +#else + U16 HashTable[HASH64KTABLESIZE] = { 0 }; +#endif + + const BYTE *ip = (BYTE *) source; + const BYTE *anchor = ip; + const BYTE *const base = ip; + const BYTE *const iend = ip + isize; + const BYTE *const oend = (BYTE *) dest + osize; + const BYTE *const mflimit = iend - MFLIMIT; +#define matchlimit (iend - LASTLITERALS) + + BYTE *op = (BYTE *) dest; + + int len, length; + const int skipStrength = SKIPSTRENGTH; + U32 forwardH; + + /* Init */ + if (isize < MINLENGTH) + goto _last_literals; + + /* First Byte */ + ip++; + forwardH = LZ4_HASH64K_VALUE(ip); + + /* Main Loop */ + for (;;) { + int findMatchAttempts = (1U << skipStrength) + 3; + const BYTE *forwardIp = ip; + const BYTE *ref; + BYTE *token; + + /* Find a match */ + do { + U32 h = forwardH; + int step = findMatchAttempts++ >> skipStrength; + ip = forwardIp; + forwardIp = ip + step; + + if (forwardIp > mflimit) { + goto _last_literals; + } + + forwardH = LZ4_HASH64K_VALUE(forwardIp); + ref = base + HashTable[h]; + HashTable[h] = ip - base; + + } while (A32(ref) != A32(ip)); + + /* Catch up */ + while ((ip > anchor) && (ref > (BYTE *) 
source) && + (ip[-1] == ref[-1])) { + ip--; + ref--; + } + + /* Encode Literal length */ + length = ip - anchor; + token = op++; + + /* Check output limit */ + if unlikely(op + length + (2 + 1 + LASTLITERALS) + + (length >> 8) > oend) + return (0); + + if (length >= (int)RUN_MASK) { + *token = (RUN_MASK << ML_BITS); + len = length - RUN_MASK; + for (; len > 254; len -= 255) + *op++ = 255; + *op++ = (BYTE)len; + } else + *token = (length << ML_BITS); + + /* Copy Literals */ + LZ4_BLINDCOPY(anchor, op, length); + + _next_match: + /* Encode Offset */ + LZ4_WRITE_LITTLEENDIAN_16(op, ip - ref); + + /* Start Counting */ + ip += MINMATCH; + ref += MINMATCH; /* MinMatch verified */ + anchor = ip; + while (ip < matchlimit - (STEPSIZE - 1)) { + UARCH diff = AARCH(ref) ^ AARCH(ip); + if (!diff) { + ip += STEPSIZE; + ref += STEPSIZE; + continue; + } + ip += LZ4_NbCommonBytes(diff); + goto _endCount; + } +#if LZ4_ARCH64 + if ((ip < (matchlimit - 3)) && (A32(ref) == A32(ip))) { + ip += 4; + ref += 4; + } +#endif + if ((ip < (matchlimit - 1)) && (A16(ref) == A16(ip))) { + ip += 2; + ref += 2; + } + if ((ip < matchlimit) && (*ref == *ip)) + ip++; + _endCount: + + /* Encode MatchLength */ + len = (ip - anchor); + /* Check output limit */ + if unlikely(op + (1 + LASTLITERALS) + (len >> 8) > oend) + return (0); + if (len >= (int)ML_MASK) { + *token += ML_MASK; + len -= ML_MASK; + for (; len > 509; len -= 510) { + *op++ = 255; + *op++ = 255; + } + if (len > 254) { + len -= 255; + *op++ = 255; + } + *op++ = (BYTE)len; + } else + *token += len; + + /* Test end of chunk */ + if (ip > mflimit) { + anchor = ip; + break; + } + /* Fill table */ + HashTable[LZ4_HASH64K_VALUE(ip - 2)] = ip - 2 - base; + + /* Test next position */ + ref = base + HashTable[LZ4_HASH64K_VALUE(ip)]; + HashTable[LZ4_HASH64K_VALUE(ip)] = ip - base; + if (A32(ref) == A32(ip)) { + token = op++; + *token = 0; + goto _next_match; + } + /* Prepare next loop */ + anchor = ip++; + forwardH = LZ4_HASH64K_VALUE(ip); + } + + _last_literals: + /* Encode Last Literals */ + { + int lastRun = iend - anchor; + if (op + lastRun + 1 + ((lastRun + 255 - RUN_MASK) / 255) > + oend) + return (0); + if (lastRun >= (int)RUN_MASK) { + *op++ = (RUN_MASK << ML_BITS); + lastRun -= RUN_MASK; + for (; lastRun > 254; lastRun -= 255) + *op++ = 255; + *op++ = (BYTE)lastRun; + } else + *op++ = (lastRun << ML_BITS); + (void) memcpy(op, anchor, iend - anchor); + op += iend - anchor; + } + + /* End */ + return (int)(((char *)op) - dest); +} + +static int +real_LZ4_compress(const char *source, char *dest, int isize, int osize) +{ +#if HEAPMODE + void *ctx = kmem_cache_alloc(lz4_ctx_cache, KM_NOSLEEP); + int result; + + /* + * out of kernel memory, gently fall through - this will disable + * compression in zio_compress_data + */ + if (ctx == NULL) + return (0); + + bzero(ctx, sizeof(struct refTables)); + if (isize < LZ4_64KLIMIT) + result = LZ4_compress64kCtx(ctx, source, dest, isize, osize); + else + result = LZ4_compressCtx(ctx, source, dest, isize, osize); + + kmem_cache_free(lz4_ctx_cache, ctx); + return (result); +#else + if (isize < (int)LZ4_64KLIMIT) + return (LZ4_compress64kCtx(NULL, source, dest, isize, osize)); + return (LZ4_compressCtx(NULL, source, dest, isize, osize)); +#endif +} + +/* Decompression functions */ + +/* + * Note: The decoding function LZ4_uncompress_unknownOutputSize() is safe + * against "buffer overflow" attack type. They will never write nor + * read outside of the provided output buffers. 
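+ * A hypothetical caller (names illustrative) would use the return value
+ * as follows:
+ *
+ *	ret = LZ4_uncompress_unknownOutputSize(src, dst, isize, dsize);
+ *	if (ret < 0)
+ *		(corrupt stream; -ret is the input offset of the fault)
+ *	else
+ *		('ret' decompressed bytes were written to 'dst')
+ *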
+ * LZ4_uncompress_unknownOutputSize() also insures that it will never + * read outside of the input buffer. A corrupted input will produce + * an error result, a negative int, indicating the position of the + * error within input stream. + */ + +static int +LZ4_uncompress_unknownOutputSize(const char *source, char *dest, int isize, + int maxOutputSize) +{ + /* Local Variables */ + const BYTE *restrict ip = (const BYTE *) source; + const BYTE *const iend = ip + isize; + const BYTE *ref; + + BYTE *op = (BYTE *) dest; + BYTE *const oend = op + maxOutputSize; + BYTE *cpy; + + size_t dec32table[] = {0, 3, 2, 3, 0, 0, 0, 0}; +#if LZ4_ARCH64 + size_t dec64table[] = {0, 0, 0, (size_t)-1, 0, 1, 2, 3}; +#endif + + /* Main Loop */ + while (ip < iend) { + unsigned token; + size_t length; + + /* get runlength */ + token = *ip++; + if ((length = (token >> ML_BITS)) == RUN_MASK) { + int s = 255; + while ((ip < iend) && (s == 255)) { + s = *ip++; + length += s; + } + } + /* copy literals */ + cpy = op + length; + /* CORNER-CASE: cpy might overflow. */ + if (cpy < op) + goto _output_error; /* cpy was overflowed, bail! */ + if ((cpy > oend - COPYLENGTH) || + (ip + length > iend - COPYLENGTH)) { + if (cpy > oend) + /* Error: writes beyond output buffer */ + goto _output_error; + if (ip + length != iend) + /* + * Error: LZ4 format requires to consume all + * input at this stage + */ + goto _output_error; + (void) memcpy(op, ip, length); + op += length; + /* Necessarily EOF, due to parsing restrictions */ + break; + } + LZ4_WILDCOPY(ip, op, cpy); + ip -= (op - cpy); + op = cpy; + + /* get offset */ + LZ4_READ_LITTLEENDIAN_16(ref, cpy, ip); + ip += 2; + if (ref < (BYTE * const) dest) + /* + * Error: offset creates reference outside of + * destination buffer + */ + goto _output_error; + + /* get matchlength */ + if ((length = (token & ML_MASK)) == ML_MASK) { + while (ip < iend) { + int s = *ip++; + length += s; + if (s == 255) + continue; + break; + } + } + /* copy repeated sequence */ + if unlikely(op - ref < STEPSIZE) { +#if LZ4_ARCH64 + size_t dec64 = dec64table[op-ref]; +#else + const int dec64 = 0; +#endif + op[0] = ref[0]; + op[1] = ref[1]; + op[2] = ref[2]; + op[3] = ref[3]; + op += 4; + ref += 4; + ref -= dec32table[op-ref]; + A32(op) = A32(ref); + op += STEPSIZE - 4; + ref -= dec64; + } else { + LZ4_COPYSTEP(ref, op); + } + cpy = op + length - (STEPSIZE - 4); + if (cpy > oend - COPYLENGTH) { + if (cpy > oend) + /* + * Error: request to write outside of + * destination buffer + */ + goto _output_error; + LZ4_SECURECOPY(ref, op, (oend - COPYLENGTH)); + while (op < cpy) + *op++ = *ref++; + op = cpy; + if (op == oend) + /* + * Check EOF (should never happen, since + * last 5 bytes are supposed to be literals) + */ + goto _output_error; + continue; + } + LZ4_SECURECOPY(ref, op, cpy); + op = cpy; /* correction */ + } + + /* end of decoding */ + return (int)(((char *)op) - dest); + + /* write overflow error detected */ + _output_error: + return (int)(-(((char *)ip) - source)); +} + +extern void +lz4_init(void) +{ + +#if HEAPMODE + lz4_ctx_cache = kmem_cache_create("lz4_ctx", sizeof(struct refTables), + 0, NULL, NULL, NULL, NULL, NULL, 0); +#endif +} + +extern void +lz4_fini(void) +{ + +#if HEAPMODE + kmem_cache_destroy(lz4_ctx_cache); +#endif +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lzjb.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lzjb.c new file mode 100644 index 000000000000..699373ad4d43 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lzjb.c @@ -0,0 +1,129 @@ 
+/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +/* + * We keep our own copy of this algorithm for 3 main reasons: + * 1. If we didn't, anyone modifying common/os/compress.c would + * directly break our on disk format + * 2. Our version of lzjb does not have a number of checks that the + * common/os version needs and uses + * 3. We initialize the lempel to ensure deterministic results, + * so that identical blocks can always be deduplicated. + * In particular, we are adding the "feature" that compress() can + * take a destination buffer size and returns the compressed length, or the + * source length if compression would overflow the destination buffer. + */ + +#include <sys/zfs_context.h> +#include <sys/types.h> +#include <sys/param.h> + +#define MATCH_BITS 6 +#define MATCH_MIN 3 +#define MATCH_MAX ((1 << MATCH_BITS) + (MATCH_MIN - 1)) +#define OFFSET_MASK ((1 << (16 - MATCH_BITS)) - 1) +#define LEMPEL_SIZE 1024 + +/*ARGSUSED*/ +size_t +lzjb_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) +{ + uchar_t *src = s_start; + uchar_t *dst = d_start; + uchar_t *cpy; + uchar_t *copymap = NULL; + int copymask = 1 << (NBBY - 1); + int mlen, offset, hash; + uint16_t *hp; + uint16_t lempel[LEMPEL_SIZE] = { 0 }; + + while (src < (uchar_t *)s_start + s_len) { + if ((copymask <<= 1) == (1 << NBBY)) { + if (dst >= (uchar_t *)d_start + d_len - 1 - 2 * NBBY) + return (s_len); + copymask = 1; + copymap = dst; + *dst++ = 0; + } + if (src > (uchar_t *)s_start + s_len - MATCH_MAX) { + *dst++ = *src++; + continue; + } + hash = (src[0] << 16) + (src[1] << 8) + src[2]; + hash += hash >> 9; + hash += hash >> 5; + hp = &lempel[hash & (LEMPEL_SIZE - 1)]; + offset = (intptr_t)(src - *hp) & OFFSET_MASK; + *hp = (uint16_t)(uintptr_t)src; + cpy = src - offset; + if (cpy >= (uchar_t *)s_start && cpy != src && + src[0] == cpy[0] && src[1] == cpy[1] && src[2] == cpy[2]) { + *copymap |= copymask; + for (mlen = MATCH_MIN; mlen < MATCH_MAX; mlen++) + if (src[mlen] != cpy[mlen]) + break; + *dst++ = ((mlen - MATCH_MIN) << (NBBY - MATCH_BITS)) | + (offset >> NBBY); + *dst++ = (uchar_t)offset; + src += mlen; + } else { + *dst++ = *src++; + } + } + return (dst - (uchar_t *)d_start); +} + +/*ARGSUSED*/ +int +lzjb_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) +{ + uchar_t *src = s_start; + uchar_t *dst = d_start; + uchar_t *d_end = (uchar_t *)d_start + d_len; + uchar_t *cpy; + uchar_t copymap = 0; + int copymask = 1 << (NBBY - 1); + + while (dst < d_end) { + if ((copymask <<= 1) == (1 << NBBY)) { + copymask = 1; + copymap = *src++; + } + if (copymap & copymask) { + int mlen = (src[0] >> (NBBY - MATCH_BITS)) + 
MATCH_MIN; + int offset = ((src[0] << NBBY) | src[1]) & OFFSET_MASK; + src += 2; + if ((cpy = dst - offset) < (uchar_t *)d_start) + return (-1); + if (mlen > (d_end - dst)) + mlen = d_end - dst; + while (--mlen >= 0) + *dst++ = *cpy++; + } else { + *dst++ = *src++; + } + } + return (0); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c new file mode 100644 index 000000000000..fcb1f3487b31 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c @@ -0,0 +1,4216 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + */ + +#include <sys/zfs_context.h> +#include <sys/dmu.h> +#include <sys/dmu_tx.h> +#include <sys/space_map.h> +#include <sys/metaslab_impl.h> +#include <sys/vdev_impl.h> +#include <sys/zio.h> +#include <sys/spa_impl.h> +#include <sys/zfeature.h> +#include <sys/vdev_indirect_mapping.h> +#include <sys/zap.h> + +SYSCTL_DECL(_vfs_zfs); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, metaslab, CTLFLAG_RW, 0, "ZFS metaslab"); + +#define GANG_ALLOCATION(flags) \ + ((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER)) + +uint64_t metaslab_aliquot = 512ULL << 10; +uint64_t metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */ +SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, force_ganging, CTLFLAG_RWTUN, + &metaslab_force_ganging, 0, + "Force gang block allocation for blocks larger than or equal to this value"); + +/* + * Since we can touch multiple metaslabs (and their respective space maps) + * with each transaction group, we benefit from having a smaller space map + * block size since it allows us to issue more I/O operations scattered + * around the disk. + */ +int zfs_metaslab_sm_blksz = (1 << 12); +SYSCTL_INT(_vfs_zfs, OID_AUTO, metaslab_sm_blksz, CTLFLAG_RDTUN, + &zfs_metaslab_sm_blksz, 0, + "Block size for metaslab DTL space map. Power of 2 and greater than 4096."); + +/* + * The in-core space map representation is more compact than its on-disk form. + * The zfs_condense_pct determines how much more compact the in-core + * space map representation must be before we compact it on-disk. + * Values should be greater than or equal to 100. 
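+ * For example, at the default value of 200 a space map is condensed once
+ * its on-disk representation is at least twice the size of the minimal
+ * in-core representation.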
+ */
+int zfs_condense_pct = 200;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, condense_pct, CTLFLAG_RWTUN,
+    &zfs_condense_pct, 0,
+    "Condense on-disk spacemap when it is more than this percentage"
+    " of the in-memory counterpart");
+
+/*
+ * Condensing a metaslab is not guaranteed to actually reduce the amount of
+ * space used on disk. In particular, a space map uses data in increments of
+ * MAX(1 << ashift, space_map_blksize), so a metaslab might use the
+ * same number of blocks after condensing. Since the goal of condensing is to
+ * reduce the number of IOPs required to read the space map, we only want to
+ * condense when we can be sure we will reduce the number of blocks used by the
+ * space map. Unfortunately, we cannot precisely compute whether or not this is
+ * the case in metaslab_should_condense since we are holding ms_lock. Instead,
+ * we apply the following heuristic: do not condense a spacemap unless the
+ * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold
+ * blocks.
+ */
+int zfs_metaslab_condense_block_threshold = 4;
+
+/*
+ * The zfs_mg_noalloc_threshold defines which metaslab groups should
+ * be eligible for allocation. The value is defined as a percentage of
+ * free space. Metaslab groups that have more free space than
+ * zfs_mg_noalloc_threshold are always eligible for allocations. Once
+ * a metaslab group's free space is less than or equal to the
+ * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
+ * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
+ * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
+ * groups are allowed to accept allocations. Gang blocks are always
+ * eligible to allocate on any metaslab group. The default value of 0 means
+ * no metaslab group will be excluded based on this criterion.
+ */
+int zfs_mg_noalloc_threshold = 0;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_noalloc_threshold, CTLFLAG_RWTUN,
+    &zfs_mg_noalloc_threshold, 0,
+    "Percentage of metaslab group size that should be free"
+    " to make it eligible for allocation");
+
+/*
+ * Metaslab groups are considered eligible for allocations if their
+ * fragmentation metric (measured as a percentage) is less than or equal to
+ * zfs_mg_fragmentation_threshold. If a metaslab group exceeds this threshold
+ * then it will be skipped unless all metaslab groups within the metaslab
+ * class have also crossed this threshold.
+ */
+int zfs_mg_fragmentation_threshold = 85;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_fragmentation_threshold, CTLFLAG_RWTUN,
+    &zfs_mg_fragmentation_threshold, 0,
+    "Maximum fragmentation percentage at which a metaslab group remains"
+    " eligible for allocations, unless all groups within the metaslab class"
+    " have also crossed this threshold");
+
+/*
+ * Allow metaslabs to keep their active state as long as their fragmentation
+ * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An
+ * active metaslab that exceeds this threshold will no longer keep its active
+ * status, allowing better metaslabs to be selected.
+ */
+int zfs_metaslab_fragmentation_threshold = 70;
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, fragmentation_threshold, CTLFLAG_RWTUN,
+    &zfs_metaslab_fragmentation_threshold, 0,
+    "Maximum fragmentation percentage at which a metaslab keeps its"
+    " active state");
+
+/*
+ * When set, all metaslabs are loaded when the pool is first opened.
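+ * (CTLFLAG_RWTUN below means the value can also be set as a boot-time
+ * tunable, e.g. "vfs.zfs.metaslab.debug_load=1" in loader.conf, so that
+ * it takes effect for the initial import.)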
+ */
+int metaslab_debug_load = 0;
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_load, CTLFLAG_RWTUN,
+    &metaslab_debug_load, 0,
+    "Load all metaslabs when pool is first opened");
+
+/*
+ * When set, metaslabs are prevented from being unloaded.
+ */
+int metaslab_debug_unload = 0;
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_unload, CTLFLAG_RWTUN,
+    &metaslab_debug_unload, 0,
+    "Prevent metaslabs from being unloaded");
+
+/*
+ * Minimum size which forces the dynamic allocator to change
+ * its allocation strategy. Once the space map cannot satisfy
+ * an allocation of this size then it switches to using a more
+ * aggressive strategy (i.e. search by size rather than offset).
+ */
+uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE;
+SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, df_alloc_threshold, CTLFLAG_RWTUN,
+    &metaslab_df_alloc_threshold, 0,
+    "Minimum size which forces the dynamic allocator to change its allocation strategy");
+
+/*
+ * The minimum free space, in percent, which must be available
+ * in a space map to continue allocations in a first-fit fashion.
+ * Once the space map's free space drops below this level we dynamically
+ * switch to using best-fit allocations.
+ */
+int metaslab_df_free_pct = 4;
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, df_free_pct, CTLFLAG_RWTUN,
+    &metaslab_df_free_pct, 0,
+    "The minimum free space, in percent, which must be available in a "
+    "space map to continue allocations in a first-fit fashion");
+
+/*
+ * A metaslab is considered "free" if it contains a contiguous
+ * segment which is greater than metaslab_min_alloc_size.
+ */
+uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;
+SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, min_alloc_size, CTLFLAG_RWTUN,
+    &metaslab_min_alloc_size, 0,
+    "A metaslab is considered \"free\" if it contains a contiguous "
+    "segment which is greater than vfs.zfs.metaslab.min_alloc_size");
+
+/*
+ * Percentage of all cpus that can be used by the metaslab taskq.
+ */
+int metaslab_load_pct = 50;
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, load_pct, CTLFLAG_RWTUN,
+    &metaslab_load_pct, 0,
+    "Percentage of cpus that can be used by the metaslab taskq");
+
+/*
+ * Determines how many txgs a metaslab may remain loaded without having any
+ * allocations from it. As long as a metaslab continues to be used we will
+ * keep it loaded.
+ */
+int metaslab_unload_delay = TXG_SIZE * 2;
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, unload_delay, CTLFLAG_RWTUN,
+    &metaslab_unload_delay, 0,
+    "Number of TXGs that an unused metaslab can be kept in memory");
+
+/*
+ * Max number of metaslabs per group to preload.
+ */
+int metaslab_preload_limit = SPA_DVAS_PER_BP;
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_limit, CTLFLAG_RWTUN,
+    &metaslab_preload_limit, 0,
+    "Max number of metaslabs per group to preload");
+
+/*
+ * Enable/disable preloading of metaslabs.
+ */
+boolean_t metaslab_preload_enabled = B_TRUE;
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_enabled, CTLFLAG_RWTUN,
+    &metaslab_preload_enabled, 0,
+    "Enable metaslab preloading");
+
+/*
+ * Enable/disable fragmentation weighting on metaslabs.
+ */
+boolean_t metaslab_fragmentation_factor_enabled = B_TRUE;
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, fragmentation_factor_enabled, CTLFLAG_RWTUN,
+    &metaslab_fragmentation_factor_enabled, 0,
+    "Enable fragmentation weighting on metaslabs");
+
+/*
+ * Enable/disable lba weighting (i.e. outer tracks are given preference).
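+ * On rotating media the outer tracks sustain a noticeably higher transfer
+ * rate than the inner ones, so when this is enabled lower LBAs (typically
+ * mapped to outer tracks) are given proportionally more weight.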
+ */ +boolean_t metaslab_lba_weighting_enabled = B_TRUE; +SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, lba_weighting_enabled, CTLFLAG_RWTUN, + &metaslab_lba_weighting_enabled, 0, + "Enable LBA weighting (i.e. outer tracks are given preference)"); + +/* + * Enable/disable metaslab group biasing. + */ +boolean_t metaslab_bias_enabled = B_TRUE; +SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, bias_enabled, CTLFLAG_RWTUN, + &metaslab_bias_enabled, 0, + "Enable metaslab group biasing"); + +/* + * Enable/disable remapping of indirect DVAs to their concrete vdevs. + */ +boolean_t zfs_remap_blkptr_enable = B_TRUE; + +/* + * Enable/disable segment-based metaslab selection. + */ +boolean_t zfs_metaslab_segment_weight_enabled = B_TRUE; + +/* + * When using segment-based metaslab selection, we will continue + * allocating from the active metaslab until we have exhausted + * zfs_metaslab_switch_threshold of its buckets. + */ +int zfs_metaslab_switch_threshold = 2; + +/* + * Internal switch to enable/disable the metaslab allocation tracing + * facility. + */ +boolean_t metaslab_trace_enabled = B_TRUE; + +/* + * Maximum entries that the metaslab allocation tracing facility will keep + * in a given list when running in non-debug mode. We limit the number + * of entries in non-debug mode to prevent us from using up too much memory. + * The limit should be sufficiently large that we don't expect any allocation + * to every exceed this value. In debug mode, the system will panic if this + * limit is ever reached allowing for further investigation. + */ +uint64_t metaslab_trace_max_entries = 5000; + +static uint64_t metaslab_weight(metaslab_t *); +static void metaslab_set_fragmentation(metaslab_t *); +static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t); +static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t); +static void metaslab_passivate(metaslab_t *msp, uint64_t weight); +static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp); + +kmem_cache_t *metaslab_alloc_trace_cache; + +/* + * ========================================================================== + * Metaslab classes + * ========================================================================== + */ +metaslab_class_t * +metaslab_class_create(spa_t *spa, metaslab_ops_t *ops) +{ + metaslab_class_t *mc; + + mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP); + + mc->mc_spa = spa; + mc->mc_rotor = NULL; + mc->mc_ops = ops; + mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL); + mc->mc_alloc_slots = kmem_zalloc(spa->spa_alloc_count * + sizeof (refcount_t), KM_SLEEP); + mc->mc_alloc_max_slots = kmem_zalloc(spa->spa_alloc_count * + sizeof (uint64_t), KM_SLEEP); + for (int i = 0; i < spa->spa_alloc_count; i++) + refcount_create_tracked(&mc->mc_alloc_slots[i]); + + return (mc); +} + +void +metaslab_class_destroy(metaslab_class_t *mc) +{ + ASSERT(mc->mc_rotor == NULL); + ASSERT(mc->mc_alloc == 0); + ASSERT(mc->mc_deferred == 0); + ASSERT(mc->mc_space == 0); + ASSERT(mc->mc_dspace == 0); + + for (int i = 0; i < mc->mc_spa->spa_alloc_count; i++) + refcount_destroy(&mc->mc_alloc_slots[i]); + kmem_free(mc->mc_alloc_slots, mc->mc_spa->spa_alloc_count * + sizeof (refcount_t)); + kmem_free(mc->mc_alloc_max_slots, mc->mc_spa->spa_alloc_count * + sizeof (uint64_t)); + mutex_destroy(&mc->mc_lock); + kmem_free(mc, sizeof (metaslab_class_t)); +} + +int +metaslab_class_validate(metaslab_class_t *mc) +{ + metaslab_group_t *mg; + vdev_t *vd; + + /* + * Must hold one of the spa_config locks. 
+ */ + ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) || + spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER)); + + if ((mg = mc->mc_rotor) == NULL) + return (0); + + do { + vd = mg->mg_vd; + ASSERT(vd->vdev_mg != NULL); + ASSERT3P(vd->vdev_top, ==, vd); + ASSERT3P(mg->mg_class, ==, mc); + ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops); + } while ((mg = mg->mg_next) != mc->mc_rotor); + + return (0); +} + +void +metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta, + int64_t defer_delta, int64_t space_delta, int64_t dspace_delta) +{ + atomic_add_64(&mc->mc_alloc, alloc_delta); + atomic_add_64(&mc->mc_deferred, defer_delta); + atomic_add_64(&mc->mc_space, space_delta); + atomic_add_64(&mc->mc_dspace, dspace_delta); +} + +void +metaslab_class_minblocksize_update(metaslab_class_t *mc) +{ + metaslab_group_t *mg; + vdev_t *vd; + uint64_t minashift = UINT64_MAX; + + if ((mg = mc->mc_rotor) == NULL) { + mc->mc_minblocksize = SPA_MINBLOCKSIZE; + return; + } + + do { + vd = mg->mg_vd; + if (vd->vdev_ashift < minashift) + minashift = vd->vdev_ashift; + } while ((mg = mg->mg_next) != mc->mc_rotor); + + mc->mc_minblocksize = 1ULL << minashift; +} + +uint64_t +metaslab_class_get_alloc(metaslab_class_t *mc) +{ + return (mc->mc_alloc); +} + +uint64_t +metaslab_class_get_deferred(metaslab_class_t *mc) +{ + return (mc->mc_deferred); +} + +uint64_t +metaslab_class_get_space(metaslab_class_t *mc) +{ + return (mc->mc_space); +} + +uint64_t +metaslab_class_get_dspace(metaslab_class_t *mc) +{ + return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space); +} + +uint64_t +metaslab_class_get_minblocksize(metaslab_class_t *mc) +{ + return (mc->mc_minblocksize); +} + +void +metaslab_class_histogram_verify(metaslab_class_t *mc) +{ + vdev_t *rvd = mc->mc_spa->spa_root_vdev; + uint64_t *mc_hist; + int i; + + if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0) + return; + + mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE, + KM_SLEEP); + + for (int c = 0; c < rvd->vdev_children; c++) { + vdev_t *tvd = rvd->vdev_child[c]; + metaslab_group_t *mg = tvd->vdev_mg; + + /* + * Skip any holes, uninitialized top-levels, or + * vdevs that are not in this metalab class. + */ + if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 || + mg->mg_class != mc) { + continue; + } + + for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) + mc_hist[i] += mg->mg_histogram[i]; + } + + for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) + VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]); + + kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE); +} + +/* + * Calculate the metaslab class's fragmentation metric. The metric + * is weighted based on the space contribution of each metaslab group. + * The return value will be a number between 0 and 100 (inclusive), or + * ZFS_FRAG_INVALID if the metric has not been set. See comment above the + * zfs_frag_table for more information about the metric. + */ +uint64_t +metaslab_class_fragmentation(metaslab_class_t *mc) +{ + vdev_t *rvd = mc->mc_spa->spa_root_vdev; + uint64_t fragmentation = 0; + + spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER); + + for (int c = 0; c < rvd->vdev_children; c++) { + vdev_t *tvd = rvd->vdev_child[c]; + metaslab_group_t *mg = tvd->vdev_mg; + + /* + * Skip any holes, uninitialized top-levels, + * or vdevs that are not in this metalab class. 
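+ * The remaining groups contribute to a space-weighted average; e.g. a
+ * 100GB group at 10% fragmentation plus a 300GB group at 50% yields
+ * (100*10 + 300*50) / 400 = 40% for the class as a whole.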
+ */ + if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 || + mg->mg_class != mc) { + continue; + } + + /* + * If a metaslab group does not contain a fragmentation + * metric then just bail out. + */ + if (mg->mg_fragmentation == ZFS_FRAG_INVALID) { + spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); + return (ZFS_FRAG_INVALID); + } + + /* + * Determine how much this metaslab_group is contributing + * to the overall pool fragmentation metric. + */ + fragmentation += mg->mg_fragmentation * + metaslab_group_get_space(mg); + } + fragmentation /= metaslab_class_get_space(mc); + + ASSERT3U(fragmentation, <=, 100); + spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); + return (fragmentation); +} + +/* + * Calculate the amount of expandable space that is available in + * this metaslab class. If a device is expanded then its expandable + * space will be the amount of allocatable space that is currently not + * part of this metaslab class. + */ +uint64_t +metaslab_class_expandable_space(metaslab_class_t *mc) +{ + vdev_t *rvd = mc->mc_spa->spa_root_vdev; + uint64_t space = 0; + + spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER); + for (int c = 0; c < rvd->vdev_children; c++) { + uint64_t tspace; + vdev_t *tvd = rvd->vdev_child[c]; + metaslab_group_t *mg = tvd->vdev_mg; + + if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 || + mg->mg_class != mc) { + continue; + } + + /* + * Calculate if we have enough space to add additional + * metaslabs. We report the expandable space in terms + * of the metaslab size since that's the unit of expansion. + * Adjust by efi system partition size. + */ + tspace = tvd->vdev_max_asize - tvd->vdev_asize; + if (tspace > mc->mc_spa->spa_bootsize) { + tspace -= mc->mc_spa->spa_bootsize; + } + space += P2ALIGN(tspace, 1ULL << tvd->vdev_ms_shift); + } + spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); + return (space); +} + +static int +metaslab_compare(const void *x1, const void *x2) +{ + const metaslab_t *m1 = (const metaslab_t *)x1; + const metaslab_t *m2 = (const metaslab_t *)x2; + + int sort1 = 0; + int sort2 = 0; + if (m1->ms_allocator != -1 && m1->ms_primary) + sort1 = 1; + else if (m1->ms_allocator != -1 && !m1->ms_primary) + sort1 = 2; + if (m2->ms_allocator != -1 && m2->ms_primary) + sort2 = 1; + else if (m2->ms_allocator != -1 && !m2->ms_primary) + sort2 = 2; + + /* + * Sort inactive metaslabs first, then primaries, then secondaries. When + * selecting a metaslab to allocate from, an allocator first tries its + * primary, then secondary active metaslab. If it doesn't have active + * metaslabs, or can't allocate from them, it searches for an inactive + * metaslab to activate. If it can't find a suitable one, it will steal + * a primary or secondary metaslab from another allocator. + */ + if (sort1 < sort2) + return (-1); + if (sort1 > sort2) + return (1); + + int cmp = AVL_CMP(m2->ms_weight, m1->ms_weight); + if (likely(cmp)) + return (cmp); + + IMPLY(AVL_CMP(m1->ms_start, m2->ms_start) == 0, m1 == m2); + + return (AVL_CMP(m1->ms_start, m2->ms_start)); +} + +/* + * Verify that the space accounting on disk matches the in-core range_trees. 
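+ *
+ * In terms of the structures involved, the identity verified below is:
+ *
+ *	ms_size - space_map_allocated(ms_sm) - space_map_alloc_delta(ms_sm)
+ *	    == range_tree_space(ms_allocatable) + pending allocations
+ *	       + ms_deferspace + range_tree_space(ms_freed)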
+ */ +void +metaslab_verify_space(metaslab_t *msp, uint64_t txg) +{ + spa_t *spa = msp->ms_group->mg_vd->vdev_spa; + uint64_t allocated = 0; + uint64_t sm_free_space, msp_free_space; + + ASSERT(MUTEX_HELD(&msp->ms_lock)); + + if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0) + return; + + /* + * We can only verify the metaslab space when we're called + * from syncing context with a loaded metaslab that has an allocated + * space map. Calling this in non-syncing context does not + * provide a consistent view of the metaslab since we're performing + * allocations in the future. + */ + if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL || + !msp->ms_loaded) + return; + + sm_free_space = msp->ms_size - space_map_allocated(msp->ms_sm) - + space_map_alloc_delta(msp->ms_sm); + + /* + * Account for future allocations since we would have already + * deducted that space from the ms_freetree. + */ + for (int t = 0; t < TXG_CONCURRENT_STATES; t++) { + allocated += + range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]); + } + + msp_free_space = range_tree_space(msp->ms_allocatable) + allocated + + msp->ms_deferspace + range_tree_space(msp->ms_freed); + + VERIFY3U(sm_free_space, ==, msp_free_space); +} + +/* + * ========================================================================== + * Metaslab groups + * ========================================================================== + */ +/* + * Update the allocatable flag and the metaslab group's capacity. + * The allocatable flag is set to true if the capacity is below + * the zfs_mg_noalloc_threshold or has a fragmentation value that is + * greater than zfs_mg_fragmentation_threshold. If a metaslab group + * transitions from allocatable to non-allocatable or vice versa then the + * metaslab group's class is updated to reflect the transition. + */ +static void +metaslab_group_alloc_update(metaslab_group_t *mg) +{ + vdev_t *vd = mg->mg_vd; + metaslab_class_t *mc = mg->mg_class; + vdev_stat_t *vs = &vd->vdev_stat; + boolean_t was_allocatable; + boolean_t was_initialized; + + ASSERT(vd == vd->vdev_top); + ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_READER), ==, + SCL_ALLOC); + + mutex_enter(&mg->mg_lock); + was_allocatable = mg->mg_allocatable; + was_initialized = mg->mg_initialized; + + mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) / + (vs->vs_space + 1); + + mutex_enter(&mc->mc_lock); + + /* + * If the metaslab group was just added then it won't + * have any space until we finish syncing out this txg. + * At that point we will consider it initialized and available + * for allocations. We also don't consider non-activated + * metaslab groups (e.g. vdevs that are in the middle of being removed) + * to be initialized, because they can't be used for allocation. + */ + mg->mg_initialized = metaslab_group_initialized(mg); + if (!was_initialized && mg->mg_initialized) { + mc->mc_groups++; + } else if (was_initialized && !mg->mg_initialized) { + ASSERT3U(mc->mc_groups, >, 0); + mc->mc_groups--; + } + if (mg->mg_initialized) + mg->mg_no_free_space = B_FALSE; + + /* + * A metaslab group is considered allocatable if it has plenty + * of free space or is not heavily fragmented. We only take + * fragmentation into account if the metaslab group has a valid + * fragmentation metric (i.e. a value between 0 and 100). 
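+ * With the defaults (zfs_mg_noalloc_threshold == 0,
+ * zfs_mg_fragmentation_threshold == 85) this reduces to: a group is
+ * allocatable while it is activated, has any free capacity at all, and
+ * either lacks a fragmentation metric or is at most 85% fragmented.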
+ */ + mg->mg_allocatable = (mg->mg_activation_count > 0 && + mg->mg_free_capacity > zfs_mg_noalloc_threshold && + (mg->mg_fragmentation == ZFS_FRAG_INVALID || + mg->mg_fragmentation <= zfs_mg_fragmentation_threshold)); + + /* + * The mc_alloc_groups maintains a count of the number of + * groups in this metaslab class that are still above the + * zfs_mg_noalloc_threshold. This is used by the allocating + * threads to determine if they should avoid allocations to + * a given group. The allocator will avoid allocations to a group + * if that group has reached or is below the zfs_mg_noalloc_threshold + * and there are still other groups that are above the threshold. + * When a group transitions from allocatable to non-allocatable or + * vice versa we update the metaslab class to reflect that change. + * When the mc_alloc_groups value drops to 0 that means that all + * groups have reached the zfs_mg_noalloc_threshold making all groups + * eligible for allocations. This effectively means that all devices + * are balanced again. + */ + if (was_allocatable && !mg->mg_allocatable) + mc->mc_alloc_groups--; + else if (!was_allocatable && mg->mg_allocatable) + mc->mc_alloc_groups++; + mutex_exit(&mc->mc_lock); + + mutex_exit(&mg->mg_lock); +} + +metaslab_group_t * +metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators) +{ + metaslab_group_t *mg; + + mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP); + mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&mg->mg_ms_initialize_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&mg->mg_ms_initialize_cv, NULL, CV_DEFAULT, NULL); + mg->mg_primaries = kmem_zalloc(allocators * sizeof (metaslab_t *), + KM_SLEEP); + mg->mg_secondaries = kmem_zalloc(allocators * sizeof (metaslab_t *), + KM_SLEEP); + avl_create(&mg->mg_metaslab_tree, metaslab_compare, + sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node)); + mg->mg_vd = vd; + mg->mg_class = mc; + mg->mg_activation_count = 0; + mg->mg_initialized = B_FALSE; + mg->mg_no_free_space = B_TRUE; + mg->mg_allocators = allocators; + + mg->mg_alloc_queue_depth = kmem_zalloc(allocators * sizeof (refcount_t), + KM_SLEEP); + mg->mg_cur_max_alloc_queue_depth = kmem_zalloc(allocators * + sizeof (uint64_t), KM_SLEEP); + for (int i = 0; i < allocators; i++) { + refcount_create_tracked(&mg->mg_alloc_queue_depth[i]); + mg->mg_cur_max_alloc_queue_depth[i] = 0; + } + + mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct, + minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT); + + return (mg); +} + +void +metaslab_group_destroy(metaslab_group_t *mg) +{ + ASSERT(mg->mg_prev == NULL); + ASSERT(mg->mg_next == NULL); + /* + * We may have gone below zero with the activation count + * either because we never activated in the first place or + * because we're done, and possibly removing the vdev. 
+ */ + ASSERT(mg->mg_activation_count <= 0); + + taskq_destroy(mg->mg_taskq); + avl_destroy(&mg->mg_metaslab_tree); + kmem_free(mg->mg_primaries, mg->mg_allocators * sizeof (metaslab_t *)); + kmem_free(mg->mg_secondaries, mg->mg_allocators * + sizeof (metaslab_t *)); + mutex_destroy(&mg->mg_lock); + mutex_destroy(&mg->mg_ms_initialize_lock); + cv_destroy(&mg->mg_ms_initialize_cv); + + for (int i = 0; i < mg->mg_allocators; i++) { + refcount_destroy(&mg->mg_alloc_queue_depth[i]); + mg->mg_cur_max_alloc_queue_depth[i] = 0; + } + kmem_free(mg->mg_alloc_queue_depth, mg->mg_allocators * + sizeof (refcount_t)); + kmem_free(mg->mg_cur_max_alloc_queue_depth, mg->mg_allocators * + sizeof (uint64_t)); + + kmem_free(mg, sizeof (metaslab_group_t)); +} + +void +metaslab_group_activate(metaslab_group_t *mg) +{ + metaslab_class_t *mc = mg->mg_class; + metaslab_group_t *mgprev, *mgnext; + + ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER), !=, 0); + + ASSERT(mc->mc_rotor != mg); + ASSERT(mg->mg_prev == NULL); + ASSERT(mg->mg_next == NULL); + ASSERT(mg->mg_activation_count <= 0); + + if (++mg->mg_activation_count <= 0) + return; + + mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children); + metaslab_group_alloc_update(mg); + + if ((mgprev = mc->mc_rotor) == NULL) { + mg->mg_prev = mg; + mg->mg_next = mg; + } else { + mgnext = mgprev->mg_next; + mg->mg_prev = mgprev; + mg->mg_next = mgnext; + mgprev->mg_next = mg; + mgnext->mg_prev = mg; + } + mc->mc_rotor = mg; + metaslab_class_minblocksize_update(mc); +} + +/* + * Passivate a metaslab group and remove it from the allocation rotor. + * Callers must hold both the SCL_ALLOC and SCL_ZIO lock prior to passivating + * a metaslab group. This function will momentarily drop spa_config_locks + * that are lower than the SCL_ALLOC lock (see comment below). + */ +void +metaslab_group_passivate(metaslab_group_t *mg) +{ + metaslab_class_t *mc = mg->mg_class; + spa_t *spa = mc->mc_spa; + metaslab_group_t *mgprev, *mgnext; + int locks = spa_config_held(spa, SCL_ALL, RW_WRITER); + + ASSERT3U(spa_config_held(spa, SCL_ALLOC | SCL_ZIO, RW_WRITER), ==, + (SCL_ALLOC | SCL_ZIO)); + + if (--mg->mg_activation_count != 0) { + ASSERT(mc->mc_rotor != mg); + ASSERT(mg->mg_prev == NULL); + ASSERT(mg->mg_next == NULL); + ASSERT(mg->mg_activation_count < 0); + return; + } + + /* + * The spa_config_lock is an array of rwlocks, ordered as + * follows (from highest to lowest): + * SCL_CONFIG > SCL_STATE > SCL_L2ARC > SCL_ALLOC > + * SCL_ZIO > SCL_FREE > SCL_VDEV + * (For more information about the spa_config_lock see spa_misc.c) + * The higher the lock, the broader its coverage. When we passivate + * a metaslab group, we must hold both the SCL_ALLOC and the SCL_ZIO + * config locks. However, the metaslab group's taskq might be trying + * to preload metaslabs so we must drop the SCL_ZIO lock and any + * lower locks to allow the I/O to complete. At a minimum, + * we continue to hold the SCL_ALLOC lock, which prevents any future + * allocations from taking place and any changes to the vdev tree. 
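+ *
+ * Since the SCL_* locks are single bits assigned from highest coverage
+ * (SCL_CONFIG) to lowest (SCL_VDEV), the expression
+ * "locks & ~(SCL_ZIO - 1)" below selects exactly SCL_ZIO and the locks
+ * ranked below it, which is the set dropped across taskq_wait() and
+ * reacquired afterwards.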
+ */ + spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa); + taskq_wait(mg->mg_taskq); + spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER); + metaslab_group_alloc_update(mg); + for (int i = 0; i < mg->mg_allocators; i++) { + metaslab_t *msp = mg->mg_primaries[i]; + if (msp != NULL) { + mutex_enter(&msp->ms_lock); + metaslab_passivate(msp, + metaslab_weight_from_range_tree(msp)); + mutex_exit(&msp->ms_lock); + } + msp = mg->mg_secondaries[i]; + if (msp != NULL) { + mutex_enter(&msp->ms_lock); + metaslab_passivate(msp, + metaslab_weight_from_range_tree(msp)); + mutex_exit(&msp->ms_lock); + } + } + + mgprev = mg->mg_prev; + mgnext = mg->mg_next; + + if (mg == mgnext) { + mc->mc_rotor = NULL; + } else { + mc->mc_rotor = mgnext; + mgprev->mg_next = mgnext; + mgnext->mg_prev = mgprev; + } + + mg->mg_prev = NULL; + mg->mg_next = NULL; + metaslab_class_minblocksize_update(mc); +} + +boolean_t +metaslab_group_initialized(metaslab_group_t *mg) +{ + vdev_t *vd = mg->mg_vd; + vdev_stat_t *vs = &vd->vdev_stat; + + return (vs->vs_space != 0 && mg->mg_activation_count > 0); +} + +uint64_t +metaslab_group_get_space(metaslab_group_t *mg) +{ + return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count); +} + +void +metaslab_group_histogram_verify(metaslab_group_t *mg) +{ + uint64_t *mg_hist; + vdev_t *vd = mg->mg_vd; + uint64_t ashift = vd->vdev_ashift; + int i; + + if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0) + return; + + mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE, + KM_SLEEP); + + ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=, + SPACE_MAP_HISTOGRAM_SIZE + ashift); + + for (int m = 0; m < vd->vdev_ms_count; m++) { + metaslab_t *msp = vd->vdev_ms[m]; + + if (msp->ms_sm == NULL) + continue; + + for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) + mg_hist[i + ashift] += + msp->ms_sm->sm_phys->smp_histogram[i]; + } + + for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++) + VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]); + + kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE); +} + +static void +metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp) +{ + metaslab_class_t *mc = mg->mg_class; + uint64_t ashift = mg->mg_vd->vdev_ashift; + + ASSERT(MUTEX_HELD(&msp->ms_lock)); + if (msp->ms_sm == NULL) + return; + + mutex_enter(&mg->mg_lock); + for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { + mg->mg_histogram[i + ashift] += + msp->ms_sm->sm_phys->smp_histogram[i]; + mc->mc_histogram[i + ashift] += + msp->ms_sm->sm_phys->smp_histogram[i]; + } + mutex_exit(&mg->mg_lock); +} + +void +metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp) +{ + metaslab_class_t *mc = mg->mg_class; + uint64_t ashift = mg->mg_vd->vdev_ashift; + + ASSERT(MUTEX_HELD(&msp->ms_lock)); + if (msp->ms_sm == NULL) + return; + + mutex_enter(&mg->mg_lock); + for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { + ASSERT3U(mg->mg_histogram[i + ashift], >=, + msp->ms_sm->sm_phys->smp_histogram[i]); + ASSERT3U(mc->mc_histogram[i + ashift], >=, + msp->ms_sm->sm_phys->smp_histogram[i]); + + mg->mg_histogram[i + ashift] -= + msp->ms_sm->sm_phys->smp_histogram[i]; + mc->mc_histogram[i + ashift] -= + msp->ms_sm->sm_phys->smp_histogram[i]; + } + mutex_exit(&mg->mg_lock); +} + +static void +metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp) +{ + ASSERT(msp->ms_group == NULL); + mutex_enter(&mg->mg_lock); + msp->ms_group = mg; + msp->ms_weight = 0; + avl_add(&mg->mg_metaslab_tree, msp); + mutex_exit(&mg->mg_lock); + + mutex_enter(&msp->ms_lock); + 
metaslab_group_histogram_add(mg, msp); + mutex_exit(&msp->ms_lock); +} + +static void +metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp) +{ + mutex_enter(&msp->ms_lock); + metaslab_group_histogram_remove(mg, msp); + mutex_exit(&msp->ms_lock); + + mutex_enter(&mg->mg_lock); + ASSERT(msp->ms_group == mg); + avl_remove(&mg->mg_metaslab_tree, msp); + msp->ms_group = NULL; + mutex_exit(&mg->mg_lock); +} + +static void +metaslab_group_sort_impl(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) +{ + ASSERT(MUTEX_HELD(&mg->mg_lock)); + ASSERT(msp->ms_group == mg); + avl_remove(&mg->mg_metaslab_tree, msp); + msp->ms_weight = weight; + avl_add(&mg->mg_metaslab_tree, msp); + +} + +static void +metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) +{ + /* + * Although in principle the weight can be any value, in + * practice we do not use values in the range [1, 511]. + */ + ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0); + ASSERT(MUTEX_HELD(&msp->ms_lock)); + + mutex_enter(&mg->mg_lock); + metaslab_group_sort_impl(mg, msp, weight); + mutex_exit(&mg->mg_lock); +} + +/* + * Calculate the fragmentation for a given metaslab group. We can use + * a simple average here since all metaslabs within the group must have + * the same size. The return value will be a value between 0 and 100 + * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslab in this + * group have a fragmentation metric. + */ +uint64_t +metaslab_group_fragmentation(metaslab_group_t *mg) +{ + vdev_t *vd = mg->mg_vd; + uint64_t fragmentation = 0; + uint64_t valid_ms = 0; + + for (int m = 0; m < vd->vdev_ms_count; m++) { + metaslab_t *msp = vd->vdev_ms[m]; + + if (msp->ms_fragmentation == ZFS_FRAG_INVALID) + continue; + + valid_ms++; + fragmentation += msp->ms_fragmentation; + } + + if (valid_ms <= vd->vdev_ms_count / 2) + return (ZFS_FRAG_INVALID); + + fragmentation /= valid_ms; + ASSERT3U(fragmentation, <=, 100); + return (fragmentation); +} + +/* + * Determine if a given metaslab group should skip allocations. A metaslab + * group should avoid allocations if its free capacity is less than the + * zfs_mg_noalloc_threshold or its fragmentation metric is greater than + * zfs_mg_fragmentation_threshold and there is at least one metaslab group + * that can still handle allocations. If the allocation throttle is enabled + * then we skip allocations to devices that have reached their maximum + * allocation queue depth unless the selected metaslab group is the only + * eligible group remaining. + */ +static boolean_t +metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor, + uint64_t psize, int allocator, int d) +{ + spa_t *spa = mg->mg_vd->vdev_spa; + metaslab_class_t *mc = mg->mg_class; + + /* + * We can only consider skipping this metaslab group if it's + * in the normal metaslab class and there are other metaslab + * groups to select from. Otherwise, we always consider it eligible + * for allocations. + */ + if (mc != spa_normal_class(spa) || mc->mc_groups <= 1) + return (B_TRUE); + + /* + * If the metaslab group's mg_allocatable flag is set (see comments + * in metaslab_group_alloc_update() for more information) and + * the allocation throttle is disabled then allow allocations to this + * device. However, if the allocation throttle is enabled then + * check if we have reached our allocation limit (mg_alloc_queue_depth) + * to determine if we should allow allocations to this metaslab group. 
+ * If all metaslab groups are no longer considered allocatable
+ * (mc_alloc_groups == 0) or we're trying to allocate the smallest
+ * gang block size then we allow allocations on this metaslab group
+ * regardless of the mg_allocatable or throttle settings.
+ */
+ if (mg->mg_allocatable) {
+ metaslab_group_t *mgp;
+ int64_t qdepth;
+ uint64_t qmax = mg->mg_cur_max_alloc_queue_depth[allocator];
+
+ if (!mc->mc_alloc_throttle_enabled)
+ return (B_TRUE);
+
+ /*
+ * If this metaslab group does not have any free space, then
+ * there is no point in looking further.
+ */
+ if (mg->mg_no_free_space)
+ return (B_FALSE);
+
+ /*
+ * Relax allocation throttling for ditto blocks. Due to
+ * random imbalances in allocation, copies tend to pile onto
+ * whichever vdev happens to look slightly better at the moment.
+ */
+ qmax = qmax * (4 + d) / 4;
+
+ qdepth = refcount_count(&mg->mg_alloc_queue_depth[allocator]);
+
+ /*
+ * If this metaslab group is below its qmax or it's
+ * the only allocatable metaslab group, then attempt
+ * to allocate from it.
+ */
+ if (qdepth < qmax || mc->mc_alloc_groups == 1)
+ return (B_TRUE);
+ ASSERT3U(mc->mc_alloc_groups, >, 1);
+
+ /*
+ * Since this metaslab group is at or over its qmax, we
+ * need to determine if there are metaslab groups after this
+ * one that might be able to handle this allocation. This is
+ * racy since we can't hold the locks for all metaslab
+ * groups at the same time when we make this check.
+ */
+ for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) {
+ qmax = mgp->mg_cur_max_alloc_queue_depth[allocator];
+ qmax = qmax * (4 + d) / 4;
+ qdepth = refcount_count(
+ &mgp->mg_alloc_queue_depth[allocator]);
+
+ /*
+ * If there is another metaslab group that
+ * might be able to handle the allocation, then
+ * we return false so that we skip this group.
+ */
+ if (qdepth < qmax && !mgp->mg_no_free_space)
+ return (B_FALSE);
+ }
+
+ /*
+ * We didn't find another group to handle the allocation
+ * so we can't skip this metaslab group even though
+ * we are at or over our qmax.
+ */
+ return (B_TRUE);
+
+ } else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) {
+ return (B_TRUE);
+ }
+ return (B_FALSE);
+}
+
+/*
+ * ==========================================================================
+ * Range tree callbacks
+ * ==========================================================================
+ */
+
+/*
+ * Comparison function for the private size-ordered tree. Tree is sorted
+ * by size, larger sizes at the end of the tree.
+ */
+static int
+metaslab_rangesize_compare(const void *x1, const void *x2)
+{
+ const range_seg_t *r1 = x1;
+ const range_seg_t *r2 = x2;
+ uint64_t rs_size1 = r1->rs_end - r1->rs_start;
+ uint64_t rs_size2 = r2->rs_end - r2->rs_start;
+
+ int cmp = AVL_CMP(rs_size1, rs_size2);
+ if (likely(cmp))
+ return (cmp);
+
+ return (AVL_CMP(r1->rs_start, r2->rs_start));
+}
+
+/*
+ * ==========================================================================
+ * Common allocator routines
+ * ==========================================================================
+ */
+
+/*
+ * Return the maximum contiguous segment within the metaslab.
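+ * This is an O(1) lookup: ms_allocatable_by_size is sorted by segment
+ * size, so the largest free segment is simply the last node in that
+ * AVL tree.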
+ */
+uint64_t
+metaslab_block_maxsize(metaslab_t *msp)
+{
+ avl_tree_t *t = &msp->ms_allocatable_by_size;
+ range_seg_t *rs;
+
+ if (t == NULL || (rs = avl_last(t)) == NULL)
+ return (0ULL);
+
+ return (rs->rs_end - rs->rs_start);
+}
+
+static range_seg_t *
+metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size)
+{
+ range_seg_t *rs, rsearch;
+ avl_index_t where;
+
+ rsearch.rs_start = start;
+ rsearch.rs_end = start + size;
+
+ rs = avl_find(t, &rsearch, &where);
+ if (rs == NULL) {
+ rs = avl_nearest(t, where, AVL_AFTER);
+ }
+
+ return (rs);
+}
+
+/*
+ * This is a helper function that can be used by the allocator to find
+ * a suitable block to allocate. This will search the specified AVL
+ * tree looking for a block that matches the specified criteria.
+ */
+static uint64_t
+metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
+ uint64_t align)
+{
+ range_seg_t *rs = metaslab_block_find(t, *cursor, size);
+
+ while (rs != NULL) {
+ uint64_t offset = P2ROUNDUP(rs->rs_start, align);
+
+ if (offset + size <= rs->rs_end) {
+ *cursor = offset + size;
+ return (offset);
+ }
+ rs = AVL_NEXT(t, rs);
+ }
+
+ /*
+ * If we know we've searched the whole map (*cursor == 0), give up.
+ * Otherwise, reset the cursor to the beginning and try again.
+ */
+ if (*cursor == 0)
+ return (-1ULL);
+
+ *cursor = 0;
+ return (metaslab_block_picker(t, cursor, size, align));
+}
+
+/*
+ * ==========================================================================
+ * The first-fit block allocator
+ * ==========================================================================
+ */
+static uint64_t
+metaslab_ff_alloc(metaslab_t *msp, uint64_t size)
+{
+ /*
+ * Find the largest power of 2 block size that evenly divides the
+ * requested size. This is used to try to allocate blocks with similar
+ * alignment from the same area of the metaslab (i.e. same cursor
+ * bucket), but it does not guarantee that allocations of other sizes
+ * will not occur in the same region.
+ */
+ uint64_t align = size & -size;
+ uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
+ avl_tree_t *t = &msp->ms_allocatable->rt_root;
+
+ return (metaslab_block_picker(t, cursor, size, align));
+}
+
+static metaslab_ops_t metaslab_ff_ops = {
+ metaslab_ff_alloc
+};
+
+/*
+ * ==========================================================================
+ * Dynamic block allocator -
+ * Uses the first fit allocation scheme until space gets low and then
+ * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
+ * and metaslab_df_free_pct to determine when to switch the allocation scheme.
+ * ==========================================================================
+ */
+static uint64_t
+metaslab_df_alloc(metaslab_t *msp, uint64_t size)
+{
+ /*
+ * Find the largest power of 2 block size that evenly divides the
+ * requested size. This is used to try to allocate blocks with similar
+ * alignment from the same area of the metaslab (i.e. same cursor
+ * bucket), but it does not guarantee that allocations of other sizes
+ * will not occur in the same region.
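+ *
+ * For example, for a 24K request, size & -size isolates the lowest
+ * set bit, giving align = 8K; the 8K cursor bucket is then used and
+ * the chosen offset is rounded up to an 8K boundary.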
+ */ + uint64_t align = size & -size; + uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; + range_tree_t *rt = msp->ms_allocatable; + avl_tree_t *t = &rt->rt_root; + uint64_t max_size = metaslab_block_maxsize(msp); + int free_pct = range_tree_space(rt) * 100 / msp->ms_size; + + ASSERT(MUTEX_HELD(&msp->ms_lock)); + ASSERT3U(avl_numnodes(t), ==, + avl_numnodes(&msp->ms_allocatable_by_size)); + + if (max_size < size) + return (-1ULL); + + /* + * If we're running low on space switch to using the size + * sorted AVL tree (best-fit). + */ + if (max_size < metaslab_df_alloc_threshold || + free_pct < metaslab_df_free_pct) { + t = &msp->ms_allocatable_by_size; + *cursor = 0; + } + + return (metaslab_block_picker(t, cursor, size, 1ULL)); +} + +static metaslab_ops_t metaslab_df_ops = { + metaslab_df_alloc +}; + +/* + * ========================================================================== + * Cursor fit block allocator - + * Select the largest region in the metaslab, set the cursor to the beginning + * of the range and the cursor_end to the end of the range. As allocations + * are made advance the cursor. Continue allocating from the cursor until + * the range is exhausted and then find a new range. + * ========================================================================== + */ +static uint64_t +metaslab_cf_alloc(metaslab_t *msp, uint64_t size) +{ + range_tree_t *rt = msp->ms_allocatable; + avl_tree_t *t = &msp->ms_allocatable_by_size; + uint64_t *cursor = &msp->ms_lbas[0]; + uint64_t *cursor_end = &msp->ms_lbas[1]; + uint64_t offset = 0; + + ASSERT(MUTEX_HELD(&msp->ms_lock)); + ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root)); + + ASSERT3U(*cursor_end, >=, *cursor); + + if ((*cursor + size) > *cursor_end) { + range_seg_t *rs; + + rs = avl_last(&msp->ms_allocatable_by_size); + if (rs == NULL || (rs->rs_end - rs->rs_start) < size) + return (-1ULL); + + *cursor = rs->rs_start; + *cursor_end = rs->rs_end; + } + + offset = *cursor; + *cursor += size; + + return (offset); +} + +static metaslab_ops_t metaslab_cf_ops = { + metaslab_cf_alloc +}; + +/* + * ========================================================================== + * New dynamic fit allocator - + * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift + * contiguous blocks. If no region is found then just use the largest segment + * that remains. + * ========================================================================== + */ + +/* + * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift) + * to request from the allocator. 
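+ * With the default shift of 4 below, this asks for a run of
+ * 2^4 = 16 contiguous blocks.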
+ */ +uint64_t metaslab_ndf_clump_shift = 4; + +static uint64_t +metaslab_ndf_alloc(metaslab_t *msp, uint64_t size) +{ + avl_tree_t *t = &msp->ms_allocatable->rt_root; + avl_index_t where; + range_seg_t *rs, rsearch; + uint64_t hbit = highbit64(size); + uint64_t *cursor = &msp->ms_lbas[hbit - 1]; + uint64_t max_size = metaslab_block_maxsize(msp); + + ASSERT(MUTEX_HELD(&msp->ms_lock)); + ASSERT3U(avl_numnodes(t), ==, + avl_numnodes(&msp->ms_allocatable_by_size)); + + if (max_size < size) + return (-1ULL); + + rsearch.rs_start = *cursor; + rsearch.rs_end = *cursor + size; + + rs = avl_find(t, &rsearch, &where); + if (rs == NULL || (rs->rs_end - rs->rs_start) < size) { + t = &msp->ms_allocatable_by_size; + + rsearch.rs_start = 0; + rsearch.rs_end = MIN(max_size, + 1ULL << (hbit + metaslab_ndf_clump_shift)); + rs = avl_find(t, &rsearch, &where); + if (rs == NULL) + rs = avl_nearest(t, where, AVL_AFTER); + ASSERT(rs != NULL); + } + + if ((rs->rs_end - rs->rs_start) >= size) { + *cursor = rs->rs_start + size; + return (rs->rs_start); + } + return (-1ULL); +} + +static metaslab_ops_t metaslab_ndf_ops = { + metaslab_ndf_alloc +}; + +metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops; + +/* + * ========================================================================== + * Metaslabs + * ========================================================================== + */ + +/* + * Wait for any in-progress metaslab loads to complete. + */ +void +metaslab_load_wait(metaslab_t *msp) +{ + ASSERT(MUTEX_HELD(&msp->ms_lock)); + + while (msp->ms_loading) { + ASSERT(!msp->ms_loaded); + cv_wait(&msp->ms_load_cv, &msp->ms_lock); + } +} + +int +metaslab_load(metaslab_t *msp) +{ + int error = 0; + boolean_t success = B_FALSE; + + ASSERT(MUTEX_HELD(&msp->ms_lock)); + ASSERT(!msp->ms_loaded); + ASSERT(!msp->ms_loading); + + msp->ms_loading = B_TRUE; + /* + * Nobody else can manipulate a loading metaslab, so it's now safe + * to drop the lock. This way we don't have to hold the lock while + * reading the spacemap from disk. + */ + mutex_exit(&msp->ms_lock); + + /* + * If the space map has not been allocated yet, then treat + * all the space in the metaslab as free and add it to ms_allocatable. + */ + if (msp->ms_sm != NULL) { + error = space_map_load(msp->ms_sm, msp->ms_allocatable, + SM_FREE); + } else { + range_tree_add(msp->ms_allocatable, + msp->ms_start, msp->ms_size); + } + + success = (error == 0); + + mutex_enter(&msp->ms_lock); + msp->ms_loading = B_FALSE; + + if (success) { + ASSERT3P(msp->ms_group, !=, NULL); + msp->ms_loaded = B_TRUE; + + /* + * If the metaslab already has a spacemap, then we need to + * remove all segments from the defer tree; otherwise, the + * metaslab is completely empty and we can skip this. 
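+ * (Deferred frees are not yet eligible for reallocation, and the
+ * on-disk space map already counts them as free, so they must be
+ * taken back out of ms_allocatable after the load.)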
+ */
+ if (msp->ms_sm != NULL) {
+ for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+ range_tree_walk(msp->ms_defer[t],
+ range_tree_remove, msp->ms_allocatable);
+ }
+ }
+ msp->ms_max_size = metaslab_block_maxsize(msp);
+ }
+ cv_broadcast(&msp->ms_load_cv);
+ return (error);
+}
+
+void
+metaslab_unload(metaslab_t *msp)
+{
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+ range_tree_vacate(msp->ms_allocatable, NULL, NULL);
+ msp->ms_loaded = B_FALSE;
+ msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
+ msp->ms_max_size = 0;
+}
+
+int
+metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
+ metaslab_t **msp)
+{
+ vdev_t *vd = mg->mg_vd;
+ objset_t *mos = vd->vdev_spa->spa_meta_objset;
+ metaslab_t *ms;
+ int error;
+
+ ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
+ mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL);
+
+ ms->ms_id = id;
+ ms->ms_start = id << vd->vdev_ms_shift;
+ ms->ms_size = 1ULL << vd->vdev_ms_shift;
+ ms->ms_allocator = -1;
+ ms->ms_new = B_TRUE;
+
+ /*
+ * We only open space map objects that already exist. All others
+ * will be opened when we finally allocate an object for them.
+ */
+ if (object != 0) {
+ error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start,
+ ms->ms_size, vd->vdev_ashift);
+
+ if (error != 0) {
+ kmem_free(ms, sizeof (metaslab_t));
+ return (error);
+ }
+
+ ASSERT(ms->ms_sm != NULL);
+ }
+
+ /*
+ * We create the main range tree here, but we don't create the
+ * other range trees until metaslab_sync_done(). This serves
+ * two purposes: it allows metaslab_sync_done() to detect the
+ * addition of new space; and for debugging, it ensures that we
+ * would take a data fault on any attempt to use this metaslab
+ * before it's ready.
+ */
+ ms->ms_allocatable = range_tree_create_impl(&rt_avl_ops, &ms->ms_allocatable_by_size,
+ metaslab_rangesize_compare, 0);
+ metaslab_group_add(mg, ms);
+
+ metaslab_set_fragmentation(ms);
+
+ /*
+ * If we're opening an existing pool (txg == 0) or creating
+ * a new one (txg == TXG_INITIAL), all space is available now.
+ * If we're adding space to an existing pool, the new space
+ * does not become available until after this txg has synced.
+ * The metaslab's weight will also be initialized when we sync
+ * out this txg. This ensures that we don't attempt to allocate
+ * from it before we have initialized it completely.
+ */
+ if (txg <= TXG_INITIAL)
+ metaslab_sync_done(ms, 0);
+
+ /*
+ * If metaslab_debug_load is set and we're initializing a metaslab
+ * that has an allocated space map object then load its space
+ * map so that we can verify frees.
+ */
+ if (metaslab_debug_load && ms->ms_sm != NULL) {
+ mutex_enter(&ms->ms_lock);
+ VERIFY0(metaslab_load(ms));
+ mutex_exit(&ms->ms_lock);
+ }
+
+ if (txg != 0) {
+ vdev_dirty(vd, 0, NULL, txg);
+ vdev_dirty(vd, VDD_METASLAB, ms, txg);
+ }
+
+ *msp = ms;
+
+ return (0);
+}
+
+void
+metaslab_fini(metaslab_t *msp)
+{
+ metaslab_group_t *mg = msp->ms_group;
+
+ metaslab_group_remove(mg, msp);
+
+ mutex_enter(&msp->ms_lock);
+ VERIFY(msp->ms_group == NULL);
+ vdev_space_update(mg->mg_vd, -space_map_allocated(msp->ms_sm),
+ 0, -msp->ms_size);
+ space_map_close(msp->ms_sm);
+
+ metaslab_unload(msp);
+ range_tree_destroy(msp->ms_allocatable);
+ range_tree_destroy(msp->ms_freeing);
+ range_tree_destroy(msp->ms_freed);
+
+ for (int t = 0; t < TXG_SIZE; t++) {
+ range_tree_destroy(msp->ms_allocating[t]);
+ }
+
+ for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+ range_tree_destroy(msp->ms_defer[t]);
+ }
+ ASSERT0(msp->ms_deferspace);
+
+ range_tree_destroy(msp->ms_checkpointing);
+
+ mutex_exit(&msp->ms_lock);
+ cv_destroy(&msp->ms_load_cv);
+ mutex_destroy(&msp->ms_lock);
+ mutex_destroy(&msp->ms_sync_lock);
+ ASSERT3U(msp->ms_allocator, ==, -1);
+
+ kmem_free(msp, sizeof (metaslab_t));
+}
+
+#define FRAGMENTATION_TABLE_SIZE 17
+
+/*
+ * This table defines a segment size based fragmentation metric that will
+ * allow each metaslab to derive its own fragmentation value. This is done
+ * by calculating the space in each bucket of the spacemap histogram and
+ * multiplying that by the fragmentation metric in this table. Doing
+ * this for all buckets and dividing it by the total amount of free
+ * space in this metaslab (i.e. the total free space in all buckets) gives
+ * us the fragmentation metric. This means that a high fragmentation metric
+ * equates to most of the free space being comprised of small segments.
+ * Conversely, if the metric is low, then most of the free space is in
+ * large segments. A 10% change in fragmentation equates to approximately
+ * double the number of segments.
+ *
+ * This table defines 0% fragmented space using 16MB segments. Testing has
+ * shown that segments that are greater than or equal to 16MB do not suffer
+ * from drastic performance problems. Using this value, we derive the rest
+ * of the table. Since the fragmentation value is never stored on disk, it
+ * is possible to change these calculations in the future.
+ */
+int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = {
+ 100, /* 512B */
+ 100, /* 1K */
+ 98, /* 2K */
+ 95, /* 4K */
+ 90, /* 8K */
+ 80, /* 16K */
+ 70, /* 32K */
+ 60, /* 64K */
+ 50, /* 128K */
+ 40, /* 256K */
+ 30, /* 512K */
+ 20, /* 1M */
+ 15, /* 2M */
+ 10, /* 4M */
+ 5, /* 8M */
+ 0 /* 16M */
+};
+
+/*
+ * Calculate the metaslab's fragmentation metric and store it in
+ * ms_fragmentation. A value of ZFS_FRAG_INVALID means that the metaslab
+ * has not been upgraded and does not support this metric. Otherwise,
+ * the value is in the range [0, 100].
+ */
+static void
+metaslab_set_fragmentation(metaslab_t *msp)
+{
+ spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
+ uint64_t fragmentation = 0;
+ uint64_t total = 0;
+ boolean_t feature_enabled = spa_feature_is_enabled(spa,
+ SPA_FEATURE_SPACEMAP_HISTOGRAM);
+
+ if (!feature_enabled) {
+ msp->ms_fragmentation = ZFS_FRAG_INVALID;
+ return;
+ }
+
+ /*
+ * A null space map means that the entire metaslab is free
+ * and thus is not fragmented.
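+ *
+ * (For comparison, a table-weighted example: if half of the free
+ * space were in 4K segments (factor 95) and half in 1M segments
+ * (factor 20), the resulting metric would be (95 + 20) / 2 ~= 57.)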
+ */ + if (msp->ms_sm == NULL) { + msp->ms_fragmentation = 0; + return; + } + + /* + * If this metaslab's space map has not been upgraded, flag it + * so that we upgrade next time we encounter it. + */ + if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) { + uint64_t txg = spa_syncing_txg(spa); + vdev_t *vd = msp->ms_group->mg_vd; + + /* + * If we've reached the final dirty txg, then we must + * be shutting down the pool. We don't want to dirty + * any data past this point so skip setting the condense + * flag. We can retry this action the next time the pool + * is imported. + */ + if (spa_writeable(spa) && txg < spa_final_dirty_txg(spa)) { + msp->ms_condense_wanted = B_TRUE; + vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); + zfs_dbgmsg("txg %llu, requesting force condense: " + "ms_id %llu, vdev_id %llu", txg, msp->ms_id, + vd->vdev_id); + } + msp->ms_fragmentation = ZFS_FRAG_INVALID; + return; + } + + for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { + uint64_t space = 0; + uint8_t shift = msp->ms_sm->sm_shift; + + int idx = MIN(shift - SPA_MINBLOCKSHIFT + i, + FRAGMENTATION_TABLE_SIZE - 1); + + if (msp->ms_sm->sm_phys->smp_histogram[i] == 0) + continue; + + space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift); + total += space; + + ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE); + fragmentation += space * zfs_frag_table[idx]; + } + + if (total > 0) + fragmentation /= total; + ASSERT3U(fragmentation, <=, 100); + + msp->ms_fragmentation = fragmentation; +} + +/* + * Compute a weight -- a selection preference value -- for the given metaslab. + * This is based on the amount of free space, the level of fragmentation, + * the LBA range, and whether the metaslab is loaded. + */ +static uint64_t +metaslab_space_weight(metaslab_t *msp) +{ + metaslab_group_t *mg = msp->ms_group; + vdev_t *vd = mg->mg_vd; + uint64_t weight, space; + + ASSERT(MUTEX_HELD(&msp->ms_lock)); + ASSERT(!vd->vdev_removing); + + /* + * The baseline weight is the metaslab's free space. + */ + space = msp->ms_size - space_map_allocated(msp->ms_sm); + + if (metaslab_fragmentation_factor_enabled && + msp->ms_fragmentation != ZFS_FRAG_INVALID) { + /* + * Use the fragmentation information to inversely scale + * down the baseline weight. We need to ensure that we + * don't exclude this metaslab completely when it's 100% + * fragmented. To avoid this we reduce the fragmented value + * by 1. + */ + space = (space * (100 - (msp->ms_fragmentation - 1))) / 100; + + /* + * If space < SPA_MINBLOCKSIZE, then we will not allocate from + * this metaslab again. The fragmentation metric may have + * decreased the space to something smaller than + * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE + * so that we can consume any remaining space. + */ + if (space > 0 && space < SPA_MINBLOCKSIZE) + space = SPA_MINBLOCKSIZE; + } + weight = space; + + /* + * Modern disks have uniform bit density and constant angular velocity. + * Therefore, the outer recording zones are faster (higher bandwidth) + * than the inner zones by the ratio of outer to inner track diameter, + * which is typically around 2:1. We account for this by assigning + * higher weight to lower metaslabs (multiplier ranging from 2x to 1x). + * In effect, this means that we'll select the metaslab with the most + * free bandwidth rather than simply the one with the most free space. 
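+ *
+ * Concretely, the scaling below runs linearly from 2x the baseline
+ * weight for metaslab id 0 (outermost) down to roughly 1x for the
+ * last metaslab on the vdev (innermost).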
+ */
+ if (!vd->vdev_nonrot && metaslab_lba_weighting_enabled) {
+ weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
+ ASSERT(weight >= space && weight <= 2 * space);
+ }
+
+ /*
+ * If this metaslab is one we're actively using, adjust its
+ * weight to make it preferable to any inactive metaslab so
+ * we'll polish it off. If the fragmentation on this metaslab
+ * has exceeded our threshold, then don't mark it active.
+ */
+ if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID &&
+ msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) {
+ weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
+ }
+
+ WEIGHT_SET_SPACEBASED(weight);
+ return (weight);
+}
+
+/*
+ * Return the weight of the specified metaslab, according to the segment-based
+ * weighting algorithm. The metaslab must be loaded. This function can
+ * be called within a sync pass since it relies only on the metaslab's
+ * range tree which is always accurate when the metaslab is loaded.
+ */
+static uint64_t
+metaslab_weight_from_range_tree(metaslab_t *msp)
+{
+ uint64_t weight = 0;
+ uint32_t segments = 0;
+
+ ASSERT(msp->ms_loaded);
+
+ for (int i = RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT;
+ i--) {
+ uint8_t shift = msp->ms_group->mg_vd->vdev_ashift;
+ int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
+
+ segments <<= 1;
+ segments += msp->ms_allocatable->rt_histogram[i];
+
+ /*
+ * The range tree provides more precision than the space map
+ * and must be downgraded so that all values fit within the
+ * space map's histogram. This allows us to compare loaded
+ * vs. unloaded metaslabs to determine which metaslab is
+ * considered "best".
+ */
+ if (i > max_idx)
+ continue;
+
+ if (segments != 0) {
+ WEIGHT_SET_COUNT(weight, segments);
+ WEIGHT_SET_INDEX(weight, i);
+ WEIGHT_SET_ACTIVE(weight, 0);
+ break;
+ }
+ }
+ return (weight);
+}
+
+/*
+ * Calculate the weight based on the on-disk histogram. This should only
+ * be called after a sync pass has completely finished since the on-disk
+ * information is updated in metaslab_sync().
+ */
+static uint64_t
+metaslab_weight_from_spacemap(metaslab_t *msp)
+{
+ uint64_t weight = 0;
+
+ for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) {
+ if (msp->ms_sm->sm_phys->smp_histogram[i] != 0) {
+ WEIGHT_SET_COUNT(weight,
+ msp->ms_sm->sm_phys->smp_histogram[i]);
+ WEIGHT_SET_INDEX(weight, i +
+ msp->ms_sm->sm_shift);
+ WEIGHT_SET_ACTIVE(weight, 0);
+ break;
+ }
+ }
+ return (weight);
+}
+
+/*
+ * Compute a segment-based weight for the specified metaslab. The weight
+ * is determined by the highest bucket in the histogram. The information
+ * for the highest bucket is encoded into the weight value.
+ */
+static uint64_t
+metaslab_segment_weight(metaslab_t *msp)
+{
+ metaslab_group_t *mg = msp->ms_group;
+ uint64_t weight = 0;
+ uint8_t shift = mg->mg_vd->vdev_ashift;
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ /*
+ * The metaslab is completely free.
+ */
+ if (space_map_allocated(msp->ms_sm) == 0) {
+ int idx = highbit64(msp->ms_size) - 1;
+ int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
+
+ if (idx < max_idx) {
+ WEIGHT_SET_COUNT(weight, 1ULL);
+ WEIGHT_SET_INDEX(weight, idx);
+ } else {
+ WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx));
+ WEIGHT_SET_INDEX(weight, max_idx);
+ }
+ WEIGHT_SET_ACTIVE(weight, 0);
+ ASSERT(!WEIGHT_IS_SPACEBASED(weight));
+
+ return (weight);
+ }
+
+ ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
+
+ /*
+ * If the metaslab is fully allocated then just make the weight 0.
+ */ + if (space_map_allocated(msp->ms_sm) == msp->ms_size) + return (0); + /* + * If the metaslab is already loaded, then use the range tree to + * determine the weight. Otherwise, we rely on the space map information + * to generate the weight. + */ + if (msp->ms_loaded) { + weight = metaslab_weight_from_range_tree(msp); + } else { + weight = metaslab_weight_from_spacemap(msp); + } + + /* + * If the metaslab was active the last time we calculated its weight + * then keep it active. We want to consume the entire region that + * is associated with this weight. + */ + if (msp->ms_activation_weight != 0 && weight != 0) + WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight)); + return (weight); +} + +/* + * Determine if we should attempt to allocate from this metaslab. If the + * metaslab has a maximum size then we can quickly determine if the desired + * allocation size can be satisfied. Otherwise, if we're using segment-based + * weighting then we can determine the maximum allocation that this metaslab + * can accommodate based on the index encoded in the weight. If we're using + * space-based weights then rely on the entire weight (excluding the weight + * type bit). + */ +boolean_t +metaslab_should_allocate(metaslab_t *msp, uint64_t asize) +{ + boolean_t should_allocate; + + if (msp->ms_max_size != 0) + return (msp->ms_max_size >= asize); + + if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) { + /* + * The metaslab segment weight indicates segments in the + * range [2^i, 2^(i+1)), where i is the index in the weight. + * Since the asize might be in the middle of the range, we + * should attempt the allocation if asize < 2^(i+1). + */ + should_allocate = (asize < + 1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1)); + } else { + should_allocate = (asize <= + (msp->ms_weight & ~METASLAB_WEIGHT_TYPE)); + } + return (should_allocate); +} + +static uint64_t +metaslab_weight(metaslab_t *msp) +{ + vdev_t *vd = msp->ms_group->mg_vd; + spa_t *spa = vd->vdev_spa; + uint64_t weight; + + ASSERT(MUTEX_HELD(&msp->ms_lock)); + + /* + * If this vdev is in the process of being removed, there is nothing + * for us to do here. + */ + if (vd->vdev_removing) + return (0); + + metaslab_set_fragmentation(msp); + + /* + * Update the maximum size if the metaslab is loaded. This will + * ensure that we get an accurate maximum size if newly freed space + * has been added back into the free tree. + */ + if (msp->ms_loaded) + msp->ms_max_size = metaslab_block_maxsize(msp); + + /* + * Segment-based weighting requires space map histogram support. + */ + if (zfs_metaslab_segment_weight_enabled && + spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) && + (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size == + sizeof (space_map_phys_t))) { + weight = metaslab_segment_weight(msp); + } else { + weight = metaslab_space_weight(msp); + } + return (weight); +} + +static int +metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp, + int allocator, uint64_t activation_weight) +{ + /* + * If we're activating for the claim code, we don't want to actually + * set the metaslab up for a specific allocator. + */ + if (activation_weight == METASLAB_WEIGHT_CLAIM) + return (0); + metaslab_t **arr = (activation_weight == METASLAB_WEIGHT_PRIMARY ? 
+ mg->mg_primaries : mg->mg_secondaries);
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+ mutex_enter(&mg->mg_lock);
+ if (arr[allocator] != NULL) {
+ mutex_exit(&mg->mg_lock);
+ return (EEXIST);
+ }
+
+ arr[allocator] = msp;
+ ASSERT3S(msp->ms_allocator, ==, -1);
+ msp->ms_allocator = allocator;
+ msp->ms_primary = (activation_weight == METASLAB_WEIGHT_PRIMARY);
+ mutex_exit(&mg->mg_lock);
+
+ return (0);
+}
+
+static int
+metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight)
+{
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
+ int error = 0;
+ metaslab_load_wait(msp);
+ if (!msp->ms_loaded) {
+ if ((error = metaslab_load(msp)) != 0) {
+ metaslab_group_sort(msp->ms_group, msp, 0);
+ return (error);
+ }
+ }
+ if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) {
+ /*
+ * The metaslab was activated for another allocator
+ * while we were waiting; we should reselect.
+ */
+ return (EBUSY);
+ }
+ if ((error = metaslab_activate_allocator(msp->ms_group, msp,
+ allocator, activation_weight)) != 0) {
+ return (error);
+ }
+
+ msp->ms_activation_weight = msp->ms_weight;
+ metaslab_group_sort(msp->ms_group, msp,
+ msp->ms_weight | activation_weight);
+ }
+ ASSERT(msp->ms_loaded);
+ ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
+
+ return (0);
+}
+
+static void
+metaslab_passivate_allocator(metaslab_group_t *mg, metaslab_t *msp,
+ uint64_t weight)
+{
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+ if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) {
+ metaslab_group_sort(mg, msp, weight);
+ return;
+ }
+
+ mutex_enter(&mg->mg_lock);
+ ASSERT3P(msp->ms_group, ==, mg);
+ if (msp->ms_primary) {
+ ASSERT3U(0, <=, msp->ms_allocator);
+ ASSERT3U(msp->ms_allocator, <, mg->mg_allocators);
+ ASSERT3P(mg->mg_primaries[msp->ms_allocator], ==, msp);
+ ASSERT(msp->ms_weight & METASLAB_WEIGHT_PRIMARY);
+ mg->mg_primaries[msp->ms_allocator] = NULL;
+ } else {
+ ASSERT(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
+ ASSERT3P(mg->mg_secondaries[msp->ms_allocator], ==, msp);
+ mg->mg_secondaries[msp->ms_allocator] = NULL;
+ }
+ msp->ms_allocator = -1;
+ metaslab_group_sort_impl(mg, msp, weight);
+ mutex_exit(&mg->mg_lock);
+}
+
+static void
+metaslab_passivate(metaslab_t *msp, uint64_t weight)
+{
+ uint64_t size = weight & ~METASLAB_WEIGHT_TYPE;
+
+ /*
+ * If size < SPA_MINBLOCKSIZE, then we will not allocate from
+ * this metaslab again. In that case, it had better be empty,
+ * or we would be leaving space on the table.
+ */
+ ASSERT(size >= SPA_MINBLOCKSIZE ||
+ range_tree_is_empty(msp->ms_allocatable));
+ ASSERT0(weight & METASLAB_ACTIVE_MASK);
+
+ msp->ms_activation_weight = 0;
+ metaslab_passivate_allocator(msp->ms_group, msp, weight);
+ ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
+}
+
+/*
+ * Segment-based metaslabs are activated once and remain active until
+ * we either fail an allocation attempt (similar to space-based metaslabs)
+ * or have exhausted the free space in zfs_metaslab_switch_threshold
+ * buckets since the metaslab was activated. This function checks to see
+ * if we've exhausted the zfs_metaslab_switch_threshold buckets in the
+ * metaslab and proactively passivates it if so. This allows us to select
+ * a metaslab with a larger contiguous region, if any remains within this
+ * metaslab group. If we're in sync pass > 1, then we continue using this
+ * metaslab so that we don't dirty more blocks and cause more sync passes.
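+ *
+ * For example, with a switch threshold of 2, a metaslab activated
+ * while its largest free segments were in the 64K bucket (index 16)
+ * is passivated once its largest bucket drops to index 14 (16K) or
+ * below.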
+ */
+void
+metaslab_segment_may_passivate(metaslab_t *msp)
+{
+ spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
+
+ if (WEIGHT_IS_SPACEBASED(msp->ms_weight) || spa_sync_pass(spa) > 1)
+ return;
+
+ /*
+ * Since we are in the middle of a sync pass, the most accurate
+ * information that is accessible to us is the in-core range tree
+ * histogram; calculate the new weight based on that information.
+ */
+ uint64_t weight = metaslab_weight_from_range_tree(msp);
+ int activation_idx = WEIGHT_GET_INDEX(msp->ms_activation_weight);
+ int current_idx = WEIGHT_GET_INDEX(weight);
+
+ if (current_idx <= activation_idx - zfs_metaslab_switch_threshold)
+ metaslab_passivate(msp, weight);
+}
+
+static void
+metaslab_preload(void *arg)
+{
+ metaslab_t *msp = arg;
+ spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
+
+ ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock));
+
+ mutex_enter(&msp->ms_lock);
+ metaslab_load_wait(msp);
+ if (!msp->ms_loaded)
+ (void) metaslab_load(msp);
+ msp->ms_selected_txg = spa_syncing_txg(spa);
+ mutex_exit(&msp->ms_lock);
+}
+
+static void
+metaslab_group_preload(metaslab_group_t *mg)
+{
+ spa_t *spa = mg->mg_vd->vdev_spa;
+ metaslab_t *msp;
+ avl_tree_t *t = &mg->mg_metaslab_tree;
+ int m = 0;
+
+ if (spa_shutting_down(spa) || !metaslab_preload_enabled) {
+ taskq_wait(mg->mg_taskq);
+ return;
+ }
+
+ mutex_enter(&mg->mg_lock);
+
+ /*
+ * Load the next potential metaslabs
+ */
+ for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) {
+ ASSERT3P(msp->ms_group, ==, mg);
+
+ /*
+ * We preload only the maximum number of metaslabs specified
+ * by metaslab_preload_limit. If a metaslab is being forced
+ * to condense then we preload it too. This will ensure
+ * that force condensing happens in the next txg.
+ */
+ if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) {
+ continue;
+ }
+
+ VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload,
+ msp, TQ_SLEEP) != 0);
+ }
+ mutex_exit(&mg->mg_lock);
+}
+
+/*
+ * Determine if the space map's on-disk footprint is past our tolerance
+ * for inefficiency. We would like to use the following criteria to make
+ * our decision:
+ *
+ * 1. The size of the space map object should not dramatically increase as a
+ * result of writing out the free space range tree.
+ *
+ * 2. The minimal on-disk space map representation is zfs_condense_pct/100
+ * times the size of the free space range tree representation
+ * (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1MB).
+ *
+ * 3. The on-disk size of the space map should actually decrease.
+ *
+ * Unfortunately, we cannot compute the on-disk size of the space map in this
+ * context because we cannot accurately compute the effects of compression, etc.
+ * Instead, we apply the heuristic described in the block comment for
+ * zfs_metaslab_condense_block_threshold - we only condense if the space used
+ * is greater than a threshold number of blocks.
+ */
+static boolean_t
+metaslab_should_condense(metaslab_t *msp)
+{
+ space_map_t *sm = msp->ms_sm;
+ vdev_t *vd = msp->ms_group->mg_vd;
+ uint64_t vdev_blocksize = 1 << vd->vdev_ashift;
+ uint64_t current_txg = spa_syncing_txg(vd->vdev_spa);
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+ ASSERT(msp->ms_loaded);
+
+ /*
+ * Allocations and frees in early passes are generally more space
+ * efficient (in terms of blocks described in space map entries)
+ * than the ones in later passes (e.g. we don't compress after
+ * sync pass 5) and condensing a metaslab multiple times in a txg
+ * could degrade performance.
+ *
+ * Thus we prefer condensing each metaslab at most once every txg at
+ * the earliest sync pass possible. If a metaslab is eligible for
+ * condensing again after being considered for condensing within the
+ * same txg, it will hopefully be dirty in the next txg where it will
+ * be condensed at an earlier pass.
+ */
+ if (msp->ms_condense_checked_txg == current_txg)
+ return (B_FALSE);
+ msp->ms_condense_checked_txg = current_txg;
+
+ /*
+ * We always condense metaslabs that are empty and metaslabs for
+ * which a condense request has been made.
+ */
+ if (avl_is_empty(&msp->ms_allocatable_by_size) ||
+ msp->ms_condense_wanted)
+ return (B_TRUE);
+
+ uint64_t object_size = space_map_length(msp->ms_sm);
+ uint64_t optimal_size = space_map_estimate_optimal_size(sm,
+ msp->ms_allocatable, SM_NO_VDEVID);
+
+ dmu_object_info_t doi;
+ dmu_object_info_from_db(sm->sm_dbuf, &doi);
+ uint64_t record_size = MAX(doi.doi_data_block_size, vdev_blocksize);
+
+ return (object_size >= (optimal_size * zfs_condense_pct / 100) &&
+ object_size > zfs_metaslab_condense_block_threshold * record_size);
+}
+
+/*
+ * Condense the on-disk space map representation to its minimized form.
+ * The minimized form consists of a small number of allocations followed by
+ * the entries of the free range tree.
+ */
+static void
+metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
+{
+ range_tree_t *condense_tree;
+ space_map_t *sm = msp->ms_sm;
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+ ASSERT(msp->ms_loaded);
+
+ zfs_dbgmsg("condensing: txg %llu, msp[%llu] %p, vdev id %llu, "
+ "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg,
+ msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id,
+ msp->ms_group->mg_vd->vdev_spa->spa_name,
+ space_map_length(msp->ms_sm),
+ avl_numnodes(&msp->ms_allocatable->rt_root),
+ msp->ms_condense_wanted ? "TRUE" : "FALSE");
+
+ msp->ms_condense_wanted = B_FALSE;
+
+ /*
+ * Create a range tree that is 100% allocated. We remove segments
+ * that have been freed in this txg, any deferred frees that exist,
+ * and any allocations in the future. Removing segments should be
+ * a relatively inexpensive operation since we expect these trees to
+ * have a small number of nodes.
+ */
+ condense_tree = range_tree_create(NULL, NULL);
+ range_tree_add(condense_tree, msp->ms_start, msp->ms_size);
+
+ range_tree_walk(msp->ms_freeing, range_tree_remove, condense_tree);
+ range_tree_walk(msp->ms_freed, range_tree_remove, condense_tree);
+
+ for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+ range_tree_walk(msp->ms_defer[t],
+ range_tree_remove, condense_tree);
+ }
+
+ for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
+ range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK],
+ range_tree_remove, condense_tree);
+ }
+
+ /*
+ * We're about to drop the metaslab's lock, thus allowing
+ * other consumers to change its contents. Set the
+ * metaslab's ms_condensing flag to ensure that
+ * allocations on this metaslab do not occur while we're
+ * in the middle of committing it to disk. This is only critical
+ * for ms_allocatable as all other range trees use per txg
+ * views of their content.
+ */
+ msp->ms_condensing = B_TRUE;
+
+ mutex_exit(&msp->ms_lock);
+ space_map_truncate(sm, zfs_metaslab_sm_blksz, tx);
+
+ /*
+ * While we would ideally like to create a space map representation
+ * that consists only of allocation records, doing so can be
+ * prohibitively expensive because the in-core free tree can be
+ * large, and therefore computationally expensive to subtract
+ * from the condense_tree.
Instead we sync out two trees, a cheap + * allocation only tree followed by the in-core free tree. While not + * optimal, this is typically close to optimal, and much cheaper to + * compute. + */ + space_map_write(sm, condense_tree, SM_ALLOC, SM_NO_VDEVID, tx); + range_tree_vacate(condense_tree, NULL, NULL); + range_tree_destroy(condense_tree); + + space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx); + mutex_enter(&msp->ms_lock); + msp->ms_condensing = B_FALSE; +} + +/* + * Write a metaslab to disk in the context of the specified transaction group. + */ +void +metaslab_sync(metaslab_t *msp, uint64_t txg) +{ + metaslab_group_t *mg = msp->ms_group; + vdev_t *vd = mg->mg_vd; + spa_t *spa = vd->vdev_spa; + objset_t *mos = spa_meta_objset(spa); + range_tree_t *alloctree = msp->ms_allocating[txg & TXG_MASK]; + dmu_tx_t *tx; + uint64_t object = space_map_object(msp->ms_sm); + + ASSERT(!vd->vdev_ishole); + + /* + * This metaslab has just been added so there's no work to do now. + */ + if (msp->ms_freeing == NULL) { + ASSERT3P(alloctree, ==, NULL); + return; + } + + ASSERT3P(alloctree, !=, NULL); + ASSERT3P(msp->ms_freeing, !=, NULL); + ASSERT3P(msp->ms_freed, !=, NULL); + ASSERT3P(msp->ms_checkpointing, !=, NULL); + + /* + * Normally, we don't want to process a metaslab if there are no + * allocations or frees to perform. However, if the metaslab is being + * forced to condense and it's loaded, we need to let it through. + */ + if (range_tree_is_empty(alloctree) && + range_tree_is_empty(msp->ms_freeing) && + range_tree_is_empty(msp->ms_checkpointing) && + !(msp->ms_loaded && msp->ms_condense_wanted)) + return; + + + VERIFY(txg <= spa_final_dirty_txg(spa)); + + /* + * The only state that can actually be changing concurrently with + * metaslab_sync() is the metaslab's ms_allocatable. No other + * thread can be modifying this txg's alloc, freeing, + * freed, or space_map_phys_t. We drop ms_lock whenever we + * could call into the DMU, because the DMU can call down to us + * (e.g. via zio_free()) at any time. + * + * The spa_vdev_remove_thread() can be reading metaslab state + * concurrently, and it is locked out by the ms_sync_lock. Note + * that the ms_lock is insufficient for this, because it is dropped + * by space_map_write(). + */ + tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); + + if (msp->ms_sm == NULL) { + uint64_t new_object; + + new_object = space_map_alloc(mos, zfs_metaslab_sm_blksz, tx); + VERIFY3U(new_object, !=, 0); + + VERIFY0(space_map_open(&msp->ms_sm, mos, new_object, + msp->ms_start, msp->ms_size, vd->vdev_ashift)); + ASSERT(msp->ms_sm != NULL); + } + + if (!range_tree_is_empty(msp->ms_checkpointing) && + vd->vdev_checkpoint_sm == NULL) { + ASSERT(spa_has_checkpoint(spa)); + + uint64_t new_object = space_map_alloc(mos, + vdev_standard_sm_blksz, tx); + VERIFY3U(new_object, !=, 0); + + VERIFY0(space_map_open(&vd->vdev_checkpoint_sm, + mos, new_object, 0, vd->vdev_asize, vd->vdev_ashift)); + ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); + + /* + * We save the space map object as an entry in vdev_top_zap + * so it can be retrieved when the pool is reopened after an + * export or through zdb. + */ + VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, + vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, + sizeof (new_object), 1, &new_object, tx)); + } + + mutex_enter(&msp->ms_sync_lock); + mutex_enter(&msp->ms_lock); + + /* + * Note: metaslab_condense() clears the space map's histogram. + * Therefore we must verify and remove this histogram before + * condensing. 
+ */ + metaslab_group_histogram_verify(mg); + metaslab_class_histogram_verify(mg->mg_class); + metaslab_group_histogram_remove(mg, msp); + + if (msp->ms_loaded && metaslab_should_condense(msp)) { + metaslab_condense(msp, txg, tx); + } else { + mutex_exit(&msp->ms_lock); + space_map_write(msp->ms_sm, alloctree, SM_ALLOC, + SM_NO_VDEVID, tx); + space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE, + SM_NO_VDEVID, tx); + mutex_enter(&msp->ms_lock); + } + + if (!range_tree_is_empty(msp->ms_checkpointing)) { + ASSERT(spa_has_checkpoint(spa)); + ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); + + /* + * Since we are doing writes to disk and the ms_checkpointing + * tree won't be changing during that time, we drop the + * ms_lock while writing to the checkpoint space map. + */ + mutex_exit(&msp->ms_lock); + space_map_write(vd->vdev_checkpoint_sm, + msp->ms_checkpointing, SM_FREE, SM_NO_VDEVID, tx); + mutex_enter(&msp->ms_lock); + space_map_update(vd->vdev_checkpoint_sm); + + spa->spa_checkpoint_info.sci_dspace += + range_tree_space(msp->ms_checkpointing); + vd->vdev_stat.vs_checkpoint_space += + range_tree_space(msp->ms_checkpointing); + ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==, + -vd->vdev_checkpoint_sm->sm_alloc); + + range_tree_vacate(msp->ms_checkpointing, NULL, NULL); + } + + if (msp->ms_loaded) { + /* + * When the space map is loaded, we have an accurate + * histogram in the range tree. This gives us an opportunity + * to bring the space map's histogram up-to-date so we clear + * it first before updating it. + */ + space_map_histogram_clear(msp->ms_sm); + space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx); + + /* + * Since we've cleared the histogram we need to add back + * any free space that has already been processed, plus + * any deferred space. This allows the on-disk histogram + * to accurately reflect all free space even if some space + * is not yet available for allocation (i.e. deferred). + */ + space_map_histogram_add(msp->ms_sm, msp->ms_freed, tx); + + /* + * Add back any deferred free space that has not been + * added back into the in-core free tree yet. This will + * ensure that we don't end up with a space map histogram + * that is completely empty unless the metaslab is fully + * allocated. + */ + for (int t = 0; t < TXG_DEFER_SIZE; t++) { + space_map_histogram_add(msp->ms_sm, + msp->ms_defer[t], tx); + } + } + + /* + * Always add the free space from this sync pass to the space + * map histogram. We want to make sure that the on-disk histogram + * accounts for all free space. If the space map is not loaded, + * then we will lose some accuracy but will correct it the next + * time we load the space map. + */ + space_map_histogram_add(msp->ms_sm, msp->ms_freeing, tx); + + metaslab_group_histogram_add(mg, msp); + metaslab_group_histogram_verify(mg); + metaslab_class_histogram_verify(mg->mg_class); + + /* + * For sync pass 1, we avoid traversing this txg's free range tree + * and instead will just swap the pointers for freeing and + * freed. We can safely do this since the freed_tree is + * guaranteed to be empty on the initial pass. 
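+ * (ms_freed is empty at this point because the previous txg's
+ * metaslab_sync_done() either swapped it into the defer tree or
+ * vacated it.)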
+ */ + if (spa_sync_pass(spa) == 1) { + range_tree_swap(&msp->ms_freeing, &msp->ms_freed); + } else { + range_tree_vacate(msp->ms_freeing, + range_tree_add, msp->ms_freed); + } + range_tree_vacate(alloctree, NULL, NULL); + + ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK])); + ASSERT0(range_tree_space(msp->ms_allocating[TXG_CLEAN(txg) + & TXG_MASK])); + ASSERT0(range_tree_space(msp->ms_freeing)); + ASSERT0(range_tree_space(msp->ms_checkpointing)); + + mutex_exit(&msp->ms_lock); + + if (object != space_map_object(msp->ms_sm)) { + object = space_map_object(msp->ms_sm); + dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * + msp->ms_id, sizeof (uint64_t), &object, tx); + } + mutex_exit(&msp->ms_sync_lock); + dmu_tx_commit(tx); +} + +/* + * Called after a transaction group has completely synced to mark + * all of the metaslab's free space as usable. + */ +void +metaslab_sync_done(metaslab_t *msp, uint64_t txg) +{ + metaslab_group_t *mg = msp->ms_group; + vdev_t *vd = mg->mg_vd; + spa_t *spa = vd->vdev_spa; + range_tree_t **defer_tree; + int64_t alloc_delta, defer_delta; + boolean_t defer_allowed = B_TRUE; + + ASSERT(!vd->vdev_ishole); + + mutex_enter(&msp->ms_lock); + + /* + * If this metaslab is just becoming available, initialize its + * range trees and add its capacity to the vdev. + */ + if (msp->ms_freed == NULL) { + for (int t = 0; t < TXG_SIZE; t++) { + ASSERT(msp->ms_allocating[t] == NULL); + + msp->ms_allocating[t] = range_tree_create(NULL, NULL); + } + + ASSERT3P(msp->ms_freeing, ==, NULL); + msp->ms_freeing = range_tree_create(NULL, NULL); + + ASSERT3P(msp->ms_freed, ==, NULL); + msp->ms_freed = range_tree_create(NULL, NULL); + + for (int t = 0; t < TXG_DEFER_SIZE; t++) { + ASSERT(msp->ms_defer[t] == NULL); + + msp->ms_defer[t] = range_tree_create(NULL, NULL); + } + + ASSERT3P(msp->ms_checkpointing, ==, NULL); + msp->ms_checkpointing = range_tree_create(NULL, NULL); + + vdev_space_update(vd, 0, 0, msp->ms_size); + } + ASSERT0(range_tree_space(msp->ms_freeing)); + ASSERT0(range_tree_space(msp->ms_checkpointing)); + + defer_tree = &msp->ms_defer[txg % TXG_DEFER_SIZE]; + + uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) - + metaslab_class_get_alloc(spa_normal_class(spa)); + if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing) { + defer_allowed = B_FALSE; + } + + defer_delta = 0; + alloc_delta = space_map_alloc_delta(msp->ms_sm); + if (defer_allowed) { + defer_delta = range_tree_space(msp->ms_freed) - + range_tree_space(*defer_tree); + } else { + defer_delta -= range_tree_space(*defer_tree); + } + + vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0); + + /* + * If there's a metaslab_load() in progress, wait for it to complete + * so that we have a consistent view of the in-core space map. + */ + metaslab_load_wait(msp); + + /* + * Move the frees from the defer_tree back to the free + * range tree (if it's loaded). Swap the freed_tree and + * the defer_tree -- this is safe to do because we've + * just emptied out the defer_tree. + */ + range_tree_vacate(*defer_tree, + msp->ms_loaded ? range_tree_add : NULL, msp->ms_allocatable); + if (defer_allowed) { + range_tree_swap(&msp->ms_freed, defer_tree); + } else { + range_tree_vacate(msp->ms_freed, + msp->ms_loaded ? 
range_tree_add : NULL,
+ msp->ms_allocatable);
+ }
+ space_map_update(msp->ms_sm);
+
+ msp->ms_deferspace += defer_delta;
+ ASSERT3S(msp->ms_deferspace, >=, 0);
+ ASSERT3S(msp->ms_deferspace, <=, msp->ms_size);
+ if (msp->ms_deferspace != 0) {
+ /*
+ * Keep syncing this metaslab until all deferred frees
+ * are back in circulation.
+ */
+ vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
+ }
+
+ if (msp->ms_new) {
+ msp->ms_new = B_FALSE;
+ mutex_enter(&mg->mg_lock);
+ mg->mg_ms_ready++;
+ mutex_exit(&mg->mg_lock);
+ }
+ /*
+ * Calculate the new weights before unloading any metaslabs.
+ * This will give us the most accurate weighting.
+ */
+ metaslab_group_sort(mg, msp, metaslab_weight(msp) |
+ (msp->ms_weight & METASLAB_ACTIVE_MASK));
+
+ /*
+ * If the metaslab is loaded and we've not tried to load or allocate
+ * from it in 'metaslab_unload_delay' txgs, then unload it.
+ */
+ if (msp->ms_loaded &&
+ msp->ms_initializing == 0 &&
+ msp->ms_selected_txg + metaslab_unload_delay < txg) {
+ for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
+ VERIFY0(range_tree_space(
+ msp->ms_allocating[(txg + t) & TXG_MASK]));
+ }
+ if (msp->ms_allocator != -1) {
+ metaslab_passivate(msp, msp->ms_weight &
+ ~METASLAB_ACTIVE_MASK);
+ }
+
+ if (!metaslab_debug_unload)
+ metaslab_unload(msp);
+ }
+
+ ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
+ ASSERT0(range_tree_space(msp->ms_freeing));
+ ASSERT0(range_tree_space(msp->ms_freed));
+ ASSERT0(range_tree_space(msp->ms_checkpointing));
+
+ mutex_exit(&msp->ms_lock);
+}
+
+void
+metaslab_sync_reassess(metaslab_group_t *mg)
+{
+ spa_t *spa = mg->mg_class->mc_spa;
+
+ spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
+ metaslab_group_alloc_update(mg);
+ mg->mg_fragmentation = metaslab_group_fragmentation(mg);
+
+ /*
+ * Preload the next potential metaslabs but only on active
+ * metaslab groups. We can get into a state where the metaslab
+ * group is no longer active since we dirty metaslabs as we remove
+ * a device, thus potentially making the metaslab group eligible
+ * for preloading.
+ */
+ if (mg->mg_activation_count > 0) {
+ metaslab_group_preload(mg);
+ }
+ spa_config_exit(spa, SCL_ALLOC, FTAG);
+}
+
+static uint64_t
+metaslab_distance(metaslab_t *msp, dva_t *dva)
+{
+ uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
+ uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
+ uint64_t start = msp->ms_id;
+
+ if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
+ return (1ULL << 63);
+
+ if (offset < start)
+ return ((start - offset) << ms_shift);
+ if (offset > start)
+ return ((offset - start) << ms_shift);
+ return (0);
+}
+
+/*
+ * ==========================================================================
+ * Metaslab allocation tracing facility
+ * ==========================================================================
+ */
+kstat_t *metaslab_trace_ksp;
+kstat_named_t metaslab_trace_over_limit;
+
+void
+metaslab_alloc_trace_init(void)
+{
+ ASSERT(metaslab_alloc_trace_cache == NULL);
+ metaslab_alloc_trace_cache = kmem_cache_create(
+ "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t),
+ 0, NULL, NULL, NULL, NULL, NULL, 0);
+ metaslab_trace_ksp = kstat_create("zfs", 0, "metaslab_trace_stats",
+ "misc", KSTAT_TYPE_NAMED, 1, KSTAT_FLAG_VIRTUAL);
+ if (metaslab_trace_ksp != NULL) {
+ metaslab_trace_ksp->ks_data = &metaslab_trace_over_limit;
+ kstat_named_init(&metaslab_trace_over_limit,
+ "metaslab_trace_over_limit", KSTAT_DATA_UINT64);
+ kstat_install(metaslab_trace_ksp);
+ }
+}
+
+void
+metaslab_alloc_trace_fini(void)
+{
+ if (metaslab_trace_ksp != NULL) {
+ kstat_delete(metaslab_trace_ksp);
+ metaslab_trace_ksp = NULL;
+ }
+ kmem_cache_destroy(metaslab_alloc_trace_cache);
+ metaslab_alloc_trace_cache = NULL;
+}
+
+/*
+ * Add an allocation trace element to the allocation tracing list.
+ */
+static void
+metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg,
+ metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset,
+ int allocator)
+{
+ if (!metaslab_trace_enabled)
+ return;
+
+ /*
+ * When the tracing list reaches its maximum we remove
+ * the second element in the list before adding a new one.
+ * By removing the second element we preserve the original
+ * entry as a clue to what allocation steps have already been
+ * performed.
+ */
+ if (zal->zal_size == metaslab_trace_max_entries) {
+ metaslab_alloc_trace_t *mat_next;
+#ifdef DEBUG
+ panic("too many entries in allocation list");
+#endif
+ atomic_inc_64(&metaslab_trace_over_limit.value.ui64);
+ zal->zal_size--;
+ mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list));
+ list_remove(&zal->zal_list, mat_next);
+ kmem_cache_free(metaslab_alloc_trace_cache, mat_next);
+ }
+
+ metaslab_alloc_trace_t *mat =
+ kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP);
+ list_link_init(&mat->mat_list_node);
+ mat->mat_mg = mg;
+ mat->mat_msp = msp;
+ mat->mat_size = psize;
+ mat->mat_dva_id = dva_id;
+ mat->mat_offset = offset;
+ mat->mat_weight = 0;
+ mat->mat_allocator = allocator;
+
+ if (msp != NULL)
+ mat->mat_weight = msp->ms_weight;
+
+ /*
+ * The list is part of the zio so locking is not required. Only
+ * a single thread will perform allocations for a given zio.
+ */ + list_insert_tail(&zal->zal_list, mat); + zal->zal_size++; + + ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries); +} + +void +metaslab_trace_init(zio_alloc_list_t *zal) +{ + list_create(&zal->zal_list, sizeof (metaslab_alloc_trace_t), + offsetof(metaslab_alloc_trace_t, mat_list_node)); + zal->zal_size = 0; +} + +void +metaslab_trace_fini(zio_alloc_list_t *zal) +{ + metaslab_alloc_trace_t *mat; + + while ((mat = list_remove_head(&zal->zal_list)) != NULL) + kmem_cache_free(metaslab_alloc_trace_cache, mat); + list_destroy(&zal->zal_list); + zal->zal_size = 0; +} + +/* + * ========================================================================== + * Metaslab block operations + * ========================================================================== + */ + +static void +metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags, + int allocator) +{ + if (!(flags & METASLAB_ASYNC_ALLOC) || + (flags & METASLAB_DONT_THROTTLE)) + return; + + metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; + if (!mg->mg_class->mc_alloc_throttle_enabled) + return; + + (void) refcount_add(&mg->mg_alloc_queue_depth[allocator], tag); +} + +static void +metaslab_group_increment_qdepth(metaslab_group_t *mg, int allocator) +{ + uint64_t max = mg->mg_max_alloc_queue_depth; + uint64_t cur = mg->mg_cur_max_alloc_queue_depth[allocator]; + while (cur < max) { + if (atomic_cas_64(&mg->mg_cur_max_alloc_queue_depth[allocator], + cur, cur + 1) == cur) { + atomic_inc_64( + &mg->mg_class->mc_alloc_max_slots[allocator]); + return; + } + cur = mg->mg_cur_max_alloc_queue_depth[allocator]; + } +} + +void +metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags, + int allocator, boolean_t io_complete) +{ + if (!(flags & METASLAB_ASYNC_ALLOC) || + (flags & METASLAB_DONT_THROTTLE)) + return; + + metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; + if (!mg->mg_class->mc_alloc_throttle_enabled) + return; + + (void) refcount_remove(&mg->mg_alloc_queue_depth[allocator], tag); + if (io_complete) + metaslab_group_increment_qdepth(mg, allocator); +} + +void +metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag, + int allocator) +{ +#ifdef ZFS_DEBUG + const dva_t *dva = bp->blk_dva; + int ndvas = BP_GET_NDVAS(bp); + + for (int d = 0; d < ndvas; d++) { + uint64_t vdev = DVA_GET_VDEV(&dva[d]); + metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; + VERIFY(refcount_not_held(&mg->mg_alloc_queue_depth[allocator], + tag)); + } +#endif +} + +static uint64_t +metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) +{ + uint64_t start; + range_tree_t *rt = msp->ms_allocatable; + metaslab_class_t *mc = msp->ms_group->mg_class; + + VERIFY(!msp->ms_condensing); + VERIFY0(msp->ms_initializing); + + start = mc->mc_ops->msop_alloc(msp, size); + if (start != -1ULL) { + metaslab_group_t *mg = msp->ms_group; + vdev_t *vd = mg->mg_vd; + + VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift)); + VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); + VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size); + range_tree_remove(rt, start, size); + + if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK])) + vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); + + range_tree_add(msp->ms_allocating[txg & TXG_MASK], start, size); + + /* Track the last successful allocation */ + msp->ms_alloc_txg = txg; + metaslab_verify_space(msp, txg); + } + + /* + * Now that we've attempted the allocation we need to update the + * metaslab's maximum block size since it may have 
changed.
+	 */
+	msp->ms_max_size = metaslab_block_maxsize(msp);
+	return (start);
+}
+
+/*
+ * Find the metaslab with the highest weight that is less than what we've
+ * already tried. In the common case, this means that we will examine each
+ * metaslab at most once. Note that concurrent callers could reorder metaslabs
+ * by activation/passivation once we have dropped the mg_lock. If a metaslab is
+ * activated by another thread, and we fail to allocate from the metaslab we
+ * have selected, we may not try the newly-activated metaslab, and instead
+ * activate another metaslab. This is not optimal, but generally does not cause
+ * any problems (a possible exception being if every metaslab is completely full
+ * except for the newly-activated metaslab which we fail to examine).
+ */
+static metaslab_t *
+find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight,
+    dva_t *dva, int d, uint64_t min_distance, uint64_t asize, int allocator,
+    zio_alloc_list_t *zal, metaslab_t *search, boolean_t *was_active)
+{
+	avl_index_t idx;
+	avl_tree_t *t = &mg->mg_metaslab_tree;
+	metaslab_t *msp = avl_find(t, search, &idx);
+	if (msp == NULL)
+		msp = avl_nearest(t, idx, AVL_AFTER);
+
+	for (; msp != NULL; msp = AVL_NEXT(t, msp)) {
+		int i;
+		if (!metaslab_should_allocate(msp, asize)) {
+			metaslab_trace_add(zal, mg, msp, asize, d,
+			    TRACE_TOO_SMALL, allocator);
+			continue;
+		}
+
+		/*
+		 * If the selected metaslab is condensing or being
+		 * initialized, skip it.
+		 */
+		if (msp->ms_condensing || msp->ms_initializing > 0)
+			continue;
+
+		*was_active = msp->ms_allocator != -1;
+		/*
+		 * If we're activating as primary, this is our first allocation
+		 * from this disk, so we don't need to check how close we are.
+		 * If the metaslab under consideration was already active,
+		 * we're getting desperate enough to steal another allocator's
+		 * metaslab, so we still don't care about distances.
+		 */
+		if (activation_weight == METASLAB_WEIGHT_PRIMARY || *was_active)
+			break;
+
+		uint64_t target_distance = min_distance +
+		    (space_map_allocated(msp->ms_sm) != 0 ? 0 :
+		    min_distance >> 1);
+
+		for (i = 0; i < d; i++) {
+			if (metaslab_distance(msp, &dva[i]) < target_distance)
+				break;
+		}
+		if (i == d)
+			break;
+	}
+
+	if (msp != NULL) {
+		search->ms_weight = msp->ms_weight;
+		search->ms_start = msp->ms_start + 1;
+		search->ms_allocator = msp->ms_allocator;
+		search->ms_primary = msp->ms_primary;
+	}
+	return (msp);
+}
+
+/* ARGSUSED */
+static uint64_t
+metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
+    uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d,
+    int allocator)
+{
+	metaslab_t *msp = NULL;
+	uint64_t offset = -1ULL;
+	uint64_t activation_weight;
+
+	activation_weight = METASLAB_WEIGHT_PRIMARY;
+	for (int i = 0; i < d; i++) {
+		if (activation_weight == METASLAB_WEIGHT_PRIMARY &&
+		    DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
+			activation_weight = METASLAB_WEIGHT_SECONDARY;
+		} else if (activation_weight == METASLAB_WEIGHT_SECONDARY &&
+		    DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
+			activation_weight = METASLAB_WEIGHT_CLAIM;
+			break;
+		}
+	}
+
+	/*
+	 * If we don't have enough metaslabs active to fill the entire array, we
+	 * just use the 0th slot.
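+	 *
+	 * For example, with mg_allocators == 4 the group needs at least
+	 * twelve ready metaslabs (mg_ms_ready >= 4 * 3) before the
+	 * per-allocator slots are used; below that threshold every
+	 * caller shares slot 0.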
+	 */
+	if (mg->mg_ms_ready < mg->mg_allocators * 3)
+		allocator = 0;
+
+	ASSERT3U(mg->mg_vd->vdev_ms_count, >=, 2);
+
+	metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP);
+	search->ms_weight = UINT64_MAX;
+	search->ms_start = 0;
+	/*
+	 * At the end of the metaslab tree are the already-active metaslabs,
+	 * first the primaries, then the secondaries. When we resume searching
+	 * through the tree, we need to consider ms_allocator and ms_primary so
+	 * we start in the location right after where we left off, and don't
+	 * accidentally loop forever considering the same metaslabs.
+	 */
+	search->ms_allocator = -1;
+	search->ms_primary = B_TRUE;
+	for (;;) {
+		boolean_t was_active = B_FALSE;
+
+		mutex_enter(&mg->mg_lock);
+
+		if (activation_weight == METASLAB_WEIGHT_PRIMARY &&
+		    mg->mg_primaries[allocator] != NULL) {
+			msp = mg->mg_primaries[allocator];
+			was_active = B_TRUE;
+		} else if (activation_weight == METASLAB_WEIGHT_SECONDARY &&
+		    mg->mg_secondaries[allocator] != NULL) {
+			msp = mg->mg_secondaries[allocator];
+			was_active = B_TRUE;
+		} else {
+			msp = find_valid_metaslab(mg, activation_weight, dva, d,
+			    min_distance, asize, allocator, zal, search,
+			    &was_active);
+		}
+
+		mutex_exit(&mg->mg_lock);
+		if (msp == NULL) {
+			kmem_free(search, sizeof (*search));
+			return (-1ULL);
+		}
+
+		mutex_enter(&msp->ms_lock);
+		/*
+		 * Ensure that the metaslab we have selected is still
+		 * capable of handling our request. It's possible that
+		 * another thread may have changed the weight while we
+		 * were blocked on the metaslab lock. We check the
+		 * active status first to see if we need to reselect
+		 * a new metaslab.
+		 */
+		if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) {
+			mutex_exit(&msp->ms_lock);
+			continue;
+		}
+
+		/*
+		 * If the metaslab is freshly activated for an allocator that
+		 * isn't the one we're allocating from, or if it's a primary and
+		 * we're seeking a secondary (or vice versa), we go back and
+		 * select a new metaslab.
+		 */
+		if (!was_active && (msp->ms_weight & METASLAB_ACTIVE_MASK) &&
+		    (msp->ms_allocator != -1) &&
+		    (msp->ms_allocator != allocator || ((activation_weight ==
+		    METASLAB_WEIGHT_PRIMARY) != msp->ms_primary))) {
+			mutex_exit(&msp->ms_lock);
+			continue;
+		}
+
+		if (msp->ms_weight & METASLAB_WEIGHT_CLAIM &&
+		    activation_weight != METASLAB_WEIGHT_CLAIM) {
+			metaslab_passivate(msp, msp->ms_weight &
+			    ~METASLAB_WEIGHT_CLAIM);
+			mutex_exit(&msp->ms_lock);
+			continue;
+		}
+
+		if (metaslab_activate(msp, allocator, activation_weight) != 0) {
+			mutex_exit(&msp->ms_lock);
+			continue;
+		}
+
+		msp->ms_selected_txg = txg;
+
+		/*
+		 * Now that we have the lock, recheck to see if we should
+		 * continue to use this metaslab for this allocation. The
+		 * metaslab is now loaded so metaslab_should_allocate() can
+		 * accurately determine if the allocation attempt should
+		 * proceed.
+		 */
+		if (!metaslab_should_allocate(msp, asize)) {
+			/* Passivate this metaslab and select a new one. */
+			metaslab_trace_add(zal, mg, msp, asize, d,
+			    TRACE_TOO_SMALL, allocator);
+			goto next;
+		}
+
+		/*
+		 * If this metaslab is currently condensing then pick again as
+		 * we can't manipulate this metaslab until it's committed
+		 * to disk. If this metaslab is being initialized, we shouldn't
+		 * allocate from it since the allocated region might be
+		 * overwritten after allocation.
+ */ + if (msp->ms_condensing) { + metaslab_trace_add(zal, mg, msp, asize, d, + TRACE_CONDENSING, allocator); + metaslab_passivate(msp, msp->ms_weight & + ~METASLAB_ACTIVE_MASK); + mutex_exit(&msp->ms_lock); + continue; + } else if (msp->ms_initializing > 0) { + metaslab_trace_add(zal, mg, msp, asize, d, + TRACE_INITIALIZING, allocator); + metaslab_passivate(msp, msp->ms_weight & + ~METASLAB_ACTIVE_MASK); + mutex_exit(&msp->ms_lock); + continue; + } + + offset = metaslab_block_alloc(msp, asize, txg); + metaslab_trace_add(zal, mg, msp, asize, d, offset, allocator); + + if (offset != -1ULL) { + /* Proactively passivate the metaslab, if needed */ + metaslab_segment_may_passivate(msp); + break; + } +next: + ASSERT(msp->ms_loaded); + + /* + * We were unable to allocate from this metaslab so determine + * a new weight for this metaslab. Now that we have loaded + * the metaslab we can provide a better hint to the metaslab + * selector. + * + * For space-based metaslabs, we use the maximum block size. + * This information is only available when the metaslab + * is loaded and is more accurate than the generic free + * space weight that was calculated by metaslab_weight(). + * This information allows us to quickly compare the maximum + * available allocation in the metaslab to the allocation + * size being requested. + * + * For segment-based metaslabs, determine the new weight + * based on the highest bucket in the range tree. We + * explicitly use the loaded segment weight (i.e. the range + * tree histogram) since it contains the space that is + * currently available for allocation and is accurate + * even within a sync pass. + */ + if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) { + uint64_t weight = metaslab_block_maxsize(msp); + WEIGHT_SET_SPACEBASED(weight); + metaslab_passivate(msp, weight); + } else { + metaslab_passivate(msp, + metaslab_weight_from_range_tree(msp)); + } + + /* + * We have just failed an allocation attempt, check + * that metaslab_should_allocate() agrees. Otherwise, + * we may end up in an infinite loop retrying the same + * metaslab. + */ + ASSERT(!metaslab_should_allocate(msp, asize)); + mutex_exit(&msp->ms_lock); + } + mutex_exit(&msp->ms_lock); + kmem_free(search, sizeof (*search)); + return (offset); +} + +static uint64_t +metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal, + uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d, + int allocator) +{ + uint64_t offset; + ASSERT(mg->mg_initialized); + + offset = metaslab_group_alloc_normal(mg, zal, asize, txg, + min_distance, dva, d, allocator); + + mutex_enter(&mg->mg_lock); + if (offset == -1ULL) { + mg->mg_failed_allocations++; + metaslab_trace_add(zal, mg, NULL, asize, d, + TRACE_GROUP_FAILURE, allocator); + if (asize == SPA_GANGBLOCKSIZE) { + /* + * This metaslab group was unable to allocate + * the minimum gang block size so it must be out of + * space. We must notify the allocation throttle + * to start skipping allocation attempts to this + * metaslab group until more space becomes available. + * Note: this failure cannot be caused by the + * allocation throttle since the allocation throttle + * is only responsible for skipping devices and + * not failing block allocations. + */ + mg->mg_no_free_space = B_TRUE; + } + } + mg->mg_allocations++; + mutex_exit(&mg->mg_lock); + return (offset); +} + +/* + * If we have to write a ditto block (i.e. 
more than one DVA for a given BP) + * on the same vdev as an existing DVA of this BP, then try to allocate it + * at least (vdev_asize / (2 ^ ditto_same_vdev_distance_shift)) away from the + * existing DVAs. + */ +int ditto_same_vdev_distance_shift = 3; + +/* + * Allocate a block for the specified i/o. + */ +int +metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, + dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags, + zio_alloc_list_t *zal, int allocator) +{ + metaslab_group_t *mg, *rotor; + vdev_t *vd; + boolean_t try_hard = B_FALSE; + + ASSERT(!DVA_IS_VALID(&dva[d])); + + /* + * For testing, make some blocks above a certain size be gang blocks. + */ + if (psize >= metaslab_force_ganging && (ddi_get_lbolt() & 3) == 0) { + metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG, + allocator); + return (SET_ERROR(ENOSPC)); + } + + /* + * Start at the rotor and loop through all mgs until we find something. + * Note that there's no locking on mc_rotor or mc_aliquot because + * nothing actually breaks if we miss a few updates -- we just won't + * allocate quite as evenly. It all balances out over time. + * + * If we are doing ditto or log blocks, try to spread them across + * consecutive vdevs. If we're forced to reuse a vdev before we've + * allocated all of our ditto blocks, then try and spread them out on + * that vdev as much as possible. If it turns out to not be possible, + * gradually lower our standards until anything becomes acceptable. + * Also, allocating on consecutive vdevs (as opposed to random vdevs) + * gives us hope of containing our fault domains to something we're + * able to reason about. Otherwise, any two top-level vdev failures + * will guarantee the loss of data. With consecutive allocation, + * only two adjacent top-level vdev failures will result in data loss. + * + * If we are doing gang blocks (hintdva is non-NULL), try to keep + * ourselves on the same vdev as our gang block header. That + * way, we can hope for locality in vdev_cache, plus it makes our + * fault domains something tractable. + */ + if (hintdva) { + vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d])); + + /* + * It's possible the vdev we're using as the hint no + * longer exists or its mg has been closed (e.g. by + * device removal). Consult the rotor when + * all else fails. + */ + if (vd != NULL && vd->vdev_mg != NULL) { + mg = vd->vdev_mg; + + if (flags & METASLAB_HINTBP_AVOID && + mg->mg_next != NULL) + mg = mg->mg_next; + } else { + mg = mc->mc_rotor; + } + } else if (d != 0) { + vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1])); + mg = vd->vdev_mg->mg_next; + } else { + mg = mc->mc_rotor; + } + + /* + * If the hint put us into the wrong metaslab class, or into a + * metaslab group that has been passivated, just follow the rotor. + */ + if (mg->mg_class != mc || mg->mg_activation_count <= 0) + mg = mc->mc_rotor; + + rotor = mg; +top: + do { + boolean_t allocatable; + + ASSERT(mg->mg_activation_count == 1); + vd = mg->mg_vd; + + /* + * Don't allocate from faulted devices. + */ + if (try_hard) { + spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER); + allocatable = vdev_allocatable(vd); + spa_config_exit(spa, SCL_ZIO, FTAG); + } else { + allocatable = vdev_allocatable(vd); + } + + /* + * Determine if the selected metaslab group is eligible + * for allocations. If we're ganging then don't allow + * this metaslab group to skip allocations since that would + * inadvertently return ENOSPC and suspend the pool + * even though space is still available. 
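+		 * (GANG_ALLOCATION() tests for the METASLAB_GANG_HEADER
+		 * and METASLAB_GANG_CHILD flags.)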
+ */ + if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) { + allocatable = metaslab_group_allocatable(mg, rotor, + psize, allocator, d); + } + + if (!allocatable) { + metaslab_trace_add(zal, mg, NULL, psize, d, + TRACE_NOT_ALLOCATABLE, allocator); + goto next; + } + + ASSERT(mg->mg_initialized); + + /* + * Avoid writing single-copy data to a failing, + * non-redundant vdev, unless we've already tried all + * other vdevs. + */ + if ((vd->vdev_stat.vs_write_errors > 0 || + vd->vdev_state < VDEV_STATE_HEALTHY) && + d == 0 && !try_hard && vd->vdev_children == 0) { + metaslab_trace_add(zal, mg, NULL, psize, d, + TRACE_VDEV_ERROR, allocator); + goto next; + } + + ASSERT(mg->mg_class == mc); + + /* + * If we don't need to try hard, then require that the + * block be 1/8th of the device away from any other DVAs + * in this BP. If we are trying hard, allow any offset + * to be used (distance=0). + */ + uint64_t distance = 0; + if (!try_hard) { + distance = vd->vdev_asize >> + ditto_same_vdev_distance_shift; + if (distance <= (1ULL << vd->vdev_ms_shift)) + distance = 0; + } + + uint64_t asize = vdev_psize_to_asize(vd, psize); + ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); + + uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg, + distance, dva, d, allocator); + + if (offset != -1ULL) { + /* + * If we've just selected this metaslab group, + * figure out whether the corresponding vdev is + * over- or under-used relative to the pool, + * and set an allocation bias to even it out. + */ + if (mc->mc_aliquot == 0 && metaslab_bias_enabled) { + vdev_stat_t *vs = &vd->vdev_stat; + int64_t vu, cu; + + vu = (vs->vs_alloc * 100) / (vs->vs_space + 1); + cu = (mc->mc_alloc * 100) / (mc->mc_space + 1); + + /* + * Calculate how much more or less we should + * try to allocate from this device during + * this iteration around the rotor. + * For example, if a device is 80% full + * and the pool is 20% full then we should + * reduce allocations by 60% on this device. + * + * mg_bias = (20 - 80) * 512K / 100 = -307K + * + * This reduces allocations by 307K for this + * iteration. + */ + mg->mg_bias = ((cu - vu) * + (int64_t)mg->mg_aliquot) / 100; + } else if (!metaslab_bias_enabled) { + mg->mg_bias = 0; + } + + if (atomic_add_64_nv(&mc->mc_aliquot, asize) >= + mg->mg_aliquot + mg->mg_bias) { + mc->mc_rotor = mg->mg_next; + mc->mc_aliquot = 0; + } + + DVA_SET_VDEV(&dva[d], vd->vdev_id); + DVA_SET_OFFSET(&dva[d], offset); + DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER)); + DVA_SET_ASIZE(&dva[d], asize); + + return (0); + } +next: + mc->mc_rotor = mg->mg_next; + mc->mc_aliquot = 0; + } while ((mg = mg->mg_next) != rotor); + + /* + * If we haven't tried hard, do so now. 
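+	 *
+	 * The try_hard pass re-walks the rotor with the distance
+	 * requirement dropped to zero, without skipping metaslab groups
+	 * that previously declined allocations, and without avoiding
+	 * devices that have reported write errors.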
+ */ + if (!try_hard) { + try_hard = B_TRUE; + goto top; + } + + bzero(&dva[d], sizeof (dva_t)); + + metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC, allocator); + return (SET_ERROR(ENOSPC)); +} + +void +metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize, + boolean_t checkpoint) +{ + metaslab_t *msp; + spa_t *spa = vd->vdev_spa; + + ASSERT(vdev_is_concrete(vd)); + ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); + ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count); + + msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; + + VERIFY(!msp->ms_condensing); + VERIFY3U(offset, >=, msp->ms_start); + VERIFY3U(offset + asize, <=, msp->ms_start + msp->ms_size); + VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); + VERIFY0(P2PHASE(asize, 1ULL << vd->vdev_ashift)); + + metaslab_check_free_impl(vd, offset, asize); + + mutex_enter(&msp->ms_lock); + if (range_tree_is_empty(msp->ms_freeing) && + range_tree_is_empty(msp->ms_checkpointing)) { + vdev_dirty(vd, VDD_METASLAB, msp, spa_syncing_txg(spa)); + } + + if (checkpoint) { + ASSERT(spa_has_checkpoint(spa)); + range_tree_add(msp->ms_checkpointing, offset, asize); + } else { + range_tree_add(msp->ms_freeing, offset, asize); + } + mutex_exit(&msp->ms_lock); +} + +/* ARGSUSED */ +void +metaslab_free_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, + uint64_t size, void *arg) +{ + boolean_t *checkpoint = arg; + + ASSERT3P(checkpoint, !=, NULL); + + if (vd->vdev_ops->vdev_op_remap != NULL) + vdev_indirect_mark_obsolete(vd, offset, size); + else + metaslab_free_impl(vd, offset, size, *checkpoint); +} + +static void +metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size, + boolean_t checkpoint) +{ + spa_t *spa = vd->vdev_spa; + + ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); + + if (spa_syncing_txg(spa) > spa_freeze_txg(spa)) + return; + + if (spa->spa_vdev_removal != NULL && + spa->spa_vdev_removal->svr_vdev_id == vd->vdev_id && + vdev_is_concrete(vd)) { + /* + * Note: we check if the vdev is concrete because when + * we complete the removal, we first change the vdev to be + * an indirect vdev (in open context), and then (in syncing + * context) clear spa_vdev_removal. + */ + free_from_removing_vdev(vd, offset, size); + } else if (vd->vdev_ops->vdev_op_remap != NULL) { + vdev_indirect_mark_obsolete(vd, offset, size); + vd->vdev_ops->vdev_op_remap(vd, offset, size, + metaslab_free_impl_cb, &checkpoint); + } else { + metaslab_free_concrete(vd, offset, size, checkpoint); + } +} + +typedef struct remap_blkptr_cb_arg { + blkptr_t *rbca_bp; + spa_remap_cb_t rbca_cb; + vdev_t *rbca_remap_vd; + uint64_t rbca_remap_offset; + void *rbca_cb_arg; +} remap_blkptr_cb_arg_t; + +void +remap_blkptr_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, + uint64_t size, void *arg) +{ + remap_blkptr_cb_arg_t *rbca = arg; + blkptr_t *bp = rbca->rbca_bp; + + /* We can not remap split blocks. */ + if (size != DVA_GET_ASIZE(&bp->blk_dva[0])) + return; + ASSERT0(inner_offset); + + if (rbca->rbca_cb != NULL) { + /* + * At this point we know that we are not handling split + * blocks and we invoke the callback on the previous + * vdev which must be indirect. + */ + ASSERT3P(rbca->rbca_remap_vd->vdev_ops, ==, &vdev_indirect_ops); + + rbca->rbca_cb(rbca->rbca_remap_vd->vdev_id, + rbca->rbca_remap_offset, size, rbca->rbca_cb_arg); + + /* set up remap_blkptr_cb_arg for the next call */ + rbca->rbca_remap_vd = vd; + rbca->rbca_remap_offset = offset; + } + + /* + * The phys birth time is that of dva[0]. 
This ensures that we know + * when each dva was written, so that resilver can determine which + * blocks need to be scrubbed (i.e. those written during the time + * the vdev was offline). It also ensures that the key used in + * the ARC hash table is unique (i.e. dva[0] + phys_birth). If + * we didn't change the phys_birth, a lookup in the ARC for a + * remapped BP could find the data that was previously stored at + * this vdev + offset. + */ + vdev_t *oldvd = vdev_lookup_top(vd->vdev_spa, + DVA_GET_VDEV(&bp->blk_dva[0])); + vdev_indirect_births_t *vib = oldvd->vdev_indirect_births; + bp->blk_phys_birth = vdev_indirect_births_physbirth(vib, + DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_ASIZE(&bp->blk_dva[0])); + + DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id); + DVA_SET_OFFSET(&bp->blk_dva[0], offset); +} + +/* + * If the block pointer contains any indirect DVAs, modify them to refer to + * concrete DVAs. Note that this will sometimes not be possible, leaving + * the indirect DVA in place. This happens if the indirect DVA spans multiple + * segments in the mapping (i.e. it is a "split block"). + * + * If the BP was remapped, calls the callback on the original dva (note the + * callback can be called multiple times if the original indirect DVA refers + * to another indirect DVA, etc). + * + * Returns TRUE if the BP was remapped. + */ +boolean_t +spa_remap_blkptr(spa_t *spa, blkptr_t *bp, spa_remap_cb_t callback, void *arg) +{ + remap_blkptr_cb_arg_t rbca; + + if (!zfs_remap_blkptr_enable) + return (B_FALSE); + + if (!spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) + return (B_FALSE); + + /* + * Dedup BP's can not be remapped, because ddt_phys_select() depends + * on DVA[0] being the same in the BP as in the DDT (dedup table). + */ + if (BP_GET_DEDUP(bp)) + return (B_FALSE); + + /* + * Gang blocks can not be remapped, because + * zio_checksum_gang_verifier() depends on the DVA[0] that's in + * the BP used to read the gang block header (GBH) being the same + * as the DVA[0] that we allocated for the GBH. + */ + if (BP_IS_GANG(bp)) + return (B_FALSE); + + /* + * Embedded BP's have no DVA to remap. + */ + if (BP_GET_NDVAS(bp) < 1) + return (B_FALSE); + + /* + * Note: we only remap dva[0]. If we remapped other dvas, we + * would no longer know what their phys birth txg is. + */ + dva_t *dva = &bp->blk_dva[0]; + + uint64_t offset = DVA_GET_OFFSET(dva); + uint64_t size = DVA_GET_ASIZE(dva); + vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva)); + + if (vd->vdev_ops->vdev_op_remap == NULL) + return (B_FALSE); + + rbca.rbca_bp = bp; + rbca.rbca_cb = callback; + rbca.rbca_remap_vd = vd; + rbca.rbca_remap_offset = offset; + rbca.rbca_cb_arg = arg; + + /* + * remap_blkptr_cb() will be called in order for each level of + * indirection, until a concrete vdev is reached or a split block is + * encountered. old_vd and old_offset are updated within the callback + * as we go from the one indirect vdev to the next one (either concrete + * or indirect again) in that order. + */ + vd->vdev_ops->vdev_op_remap(vd, offset, size, remap_blkptr_cb, &rbca); + + /* Check if the DVA wasn't remapped because it is a split block */ + if (DVA_GET_VDEV(&rbca.rbca_bp->blk_dva[0]) == vd->vdev_id) + return (B_FALSE); + + return (B_TRUE); +} + +/* + * Undo the allocation of a DVA which happened in the given transaction group. 
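+ *
+ * metaslab_alloc() below uses this to back out DVAs that were already
+ * placed in a BP when the allocation of a later DVA fails.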
+ */ +void +metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg) +{ + metaslab_t *msp; + vdev_t *vd; + uint64_t vdev = DVA_GET_VDEV(dva); + uint64_t offset = DVA_GET_OFFSET(dva); + uint64_t size = DVA_GET_ASIZE(dva); + + ASSERT(DVA_IS_VALID(dva)); + ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); + + if (txg > spa_freeze_txg(spa)) + return; + + if ((vd = vdev_lookup_top(spa, vdev)) == NULL || + (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) { + cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu", + (u_longlong_t)vdev, (u_longlong_t)offset); + ASSERT(0); + return; + } + + ASSERT(!vd->vdev_removing); + ASSERT(vdev_is_concrete(vd)); + ASSERT0(vd->vdev_indirect_config.vic_mapping_object); + ASSERT3P(vd->vdev_indirect_mapping, ==, NULL); + + if (DVA_GET_GANG(dva)) + size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); + + msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; + + mutex_enter(&msp->ms_lock); + range_tree_remove(msp->ms_allocating[txg & TXG_MASK], + offset, size); + + VERIFY(!msp->ms_condensing); + VERIFY3U(offset, >=, msp->ms_start); + VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size); + VERIFY3U(range_tree_space(msp->ms_allocatable) + size, <=, + msp->ms_size); + VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); + VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); + range_tree_add(msp->ms_allocatable, offset, size); + mutex_exit(&msp->ms_lock); +} + +/* + * Free the block represented by the given DVA. + */ +void +metaslab_free_dva(spa_t *spa, const dva_t *dva, boolean_t checkpoint) +{ + uint64_t vdev = DVA_GET_VDEV(dva); + uint64_t offset = DVA_GET_OFFSET(dva); + uint64_t size = DVA_GET_ASIZE(dva); + vdev_t *vd = vdev_lookup_top(spa, vdev); + + ASSERT(DVA_IS_VALID(dva)); + ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); + + if (DVA_GET_GANG(dva)) { + size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); + } + + metaslab_free_impl(vd, offset, size, checkpoint); +} + +/* + * Reserve some allocation slots. The reservation system must be called + * before we call into the allocator. If there aren't any available slots + * then the I/O will be throttled until an I/O completes and its slots are + * freed up. The function returns true if it was successful in placing + * the reservation. + */ +boolean_t +metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator, + zio_t *zio, int flags) +{ + uint64_t available_slots = 0; + boolean_t slot_reserved = B_FALSE; + uint64_t max = mc->mc_alloc_max_slots[allocator]; + + ASSERT(mc->mc_alloc_throttle_enabled); + mutex_enter(&mc->mc_lock); + + uint64_t reserved_slots = + refcount_count(&mc->mc_alloc_slots[allocator]); + if (reserved_slots < max) + available_slots = max - reserved_slots; + + if (slots <= available_slots || GANG_ALLOCATION(flags)) { + /* + * We reserve the slots individually so that we can unreserve + * them individually when an I/O completes. 
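+		 *
+		 * A sketch of the intended pairing (illustrative, not a
+		 * fixed calling convention):
+		 *
+		 *	if (metaslab_class_throttle_reserve(mc, slots,
+		 *	    allocator, zio, flags)) {
+		 *		... perform the allocation ...
+		 *		metaslab_class_throttle_unreserve(mc, slots,
+		 *		    allocator, zio);
+		 *	}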
+ */ + for (int d = 0; d < slots; d++) { + reserved_slots = + refcount_add(&mc->mc_alloc_slots[allocator], + zio); + } + zio->io_flags |= ZIO_FLAG_IO_ALLOCATING; + slot_reserved = B_TRUE; + } + + mutex_exit(&mc->mc_lock); + return (slot_reserved); +} + +void +metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, + int allocator, zio_t *zio) +{ + ASSERT(mc->mc_alloc_throttle_enabled); + mutex_enter(&mc->mc_lock); + for (int d = 0; d < slots; d++) { + (void) refcount_remove(&mc->mc_alloc_slots[allocator], + zio); + } + mutex_exit(&mc->mc_lock); +} + +static int +metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size, + uint64_t txg) +{ + metaslab_t *msp; + spa_t *spa = vd->vdev_spa; + int error = 0; + + if (offset >> vd->vdev_ms_shift >= vd->vdev_ms_count) + return (ENXIO); + + ASSERT3P(vd->vdev_ms, !=, NULL); + msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; + + mutex_enter(&msp->ms_lock); + + if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded) + error = metaslab_activate(msp, 0, METASLAB_WEIGHT_CLAIM); + /* + * No need to fail in that case; someone else has activated the + * metaslab, but that doesn't preclude us from using it. + */ + if (error == EBUSY) + error = 0; + + if (error == 0 && + !range_tree_contains(msp->ms_allocatable, offset, size)) + error = SET_ERROR(ENOENT); + + if (error || txg == 0) { /* txg == 0 indicates dry run */ + mutex_exit(&msp->ms_lock); + return (error); + } + + VERIFY(!msp->ms_condensing); + VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); + VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); + VERIFY3U(range_tree_space(msp->ms_allocatable) - size, <=, + msp->ms_size); + range_tree_remove(msp->ms_allocatable, offset, size); + + if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */ + if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK])) + vdev_dirty(vd, VDD_METASLAB, msp, txg); + range_tree_add(msp->ms_allocating[txg & TXG_MASK], + offset, size); + } + + mutex_exit(&msp->ms_lock); + + return (0); +} + +typedef struct metaslab_claim_cb_arg_t { + uint64_t mcca_txg; + int mcca_error; +} metaslab_claim_cb_arg_t; + +/* ARGSUSED */ +static void +metaslab_claim_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, + uint64_t size, void *arg) +{ + metaslab_claim_cb_arg_t *mcca_arg = arg; + + if (mcca_arg->mcca_error == 0) { + mcca_arg->mcca_error = metaslab_claim_concrete(vd, offset, + size, mcca_arg->mcca_txg); + } +} + +int +metaslab_claim_impl(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg) +{ + if (vd->vdev_ops->vdev_op_remap != NULL) { + metaslab_claim_cb_arg_t arg; + + /* + * Only zdb(1M) can claim on indirect vdevs. This is used + * to detect leaks of mapped space (that are not accounted + * for in the obsolete counts, spacemap, or bpobj). + */ + ASSERT(!spa_writeable(vd->vdev_spa)); + arg.mcca_error = 0; + arg.mcca_txg = txg; + + vd->vdev_ops->vdev_op_remap(vd, offset, size, + metaslab_claim_impl_cb, &arg); + + if (arg.mcca_error == 0) { + arg.mcca_error = metaslab_claim_concrete(vd, + offset, size, txg); + } + return (arg.mcca_error); + } else { + return (metaslab_claim_concrete(vd, offset, size, txg)); + } +} + +/* + * Intent log support: upon opening the pool after a crash, notify the SPA + * of blocks that the intent log has allocated for immediate write, but + * which are still considered free by the SPA because the last transaction + * group didn't commit yet. 
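+ *
+ * The claim runs in two passes (see metaslab_claim() below): a dry run
+ * with txg == 0 first verifies that every DVA is still claimable, so
+ * that nothing needs to be unwound after a partial failure.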
+ */
+static int
+metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
+{
+	uint64_t vdev = DVA_GET_VDEV(dva);
+	uint64_t offset = DVA_GET_OFFSET(dva);
+	uint64_t size = DVA_GET_ASIZE(dva);
+	vdev_t *vd;
+
+	if ((vd = vdev_lookup_top(spa, vdev)) == NULL) {
+		return (SET_ERROR(ENXIO));
+	}
+
+	ASSERT(DVA_IS_VALID(dva));
+
+	if (DVA_GET_GANG(dva))
+		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
+
+	return (metaslab_claim_impl(vd, offset, size, txg));
+}
+
+int
+metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
+    int ndvas, uint64_t txg, blkptr_t *hintbp, int flags,
+    zio_alloc_list_t *zal, zio_t *zio, int allocator)
+{
+	dva_t *dva = bp->blk_dva;
+	dva_t *hintdva = hintbp->blk_dva;
+	int error = 0;
+
+	ASSERT(bp->blk_birth == 0);
+	ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
+
+	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
+
+	if (mc->mc_rotor == NULL) {	/* no vdevs in this class */
+		spa_config_exit(spa, SCL_ALLOC, FTAG);
+		return (SET_ERROR(ENOSPC));
+	}
+
+	ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
+	ASSERT(BP_GET_NDVAS(bp) == 0);
+	ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
+	ASSERT3P(zal, !=, NULL);
+
+	for (int d = 0; d < ndvas; d++) {
+		error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
+		    txg, flags, zal, allocator);
+		if (error != 0) {
+			for (d--; d >= 0; d--) {
+				metaslab_unalloc_dva(spa, &dva[d], txg);
+				metaslab_group_alloc_decrement(spa,
+				    DVA_GET_VDEV(&dva[d]), zio, flags,
+				    allocator, B_FALSE);
+				bzero(&dva[d], sizeof (dva_t));
+			}
+			spa_config_exit(spa, SCL_ALLOC, FTAG);
+			return (error);
+		} else {
+			/*
+			 * Update the metaslab group's queue depth
+			 * based on the newly allocated dva.
+			 */
+			metaslab_group_alloc_increment(spa,
+			    DVA_GET_VDEV(&dva[d]), zio, flags, allocator);
+		}
+
+	}
+	ASSERT(error == 0);
+	ASSERT(BP_GET_NDVAS(bp) == ndvas);
+
+	spa_config_exit(spa, SCL_ALLOC, FTAG);
+
+	BP_SET_BIRTH(bp, txg, txg);
+
+	return (0);
+}
+
+void
+metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
+{
+	const dva_t *dva = bp->blk_dva;
+	int ndvas = BP_GET_NDVAS(bp);
+
+	ASSERT(!BP_IS_HOLE(bp));
+	ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
+
+	/*
+	 * If we have a checkpoint for the pool we need to make sure that
+	 * the blocks that we free that are part of the checkpoint won't be
+	 * reused until the checkpoint is discarded or we revert to it.
+	 *
+	 * The checkpoint flag is passed down the metaslab_free code path
+	 * and is set whenever we want to add a block to the checkpoint's
+	 * accounting. That is, we "checkpoint" blocks that existed at the
+	 * time the checkpoint was created and are therefore referenced by
+	 * the checkpointed uberblock.
+	 *
+	 * Note that we don't checkpoint any blocks if the current
+	 * syncing txg <= spa_checkpoint_txg. We want these frees to sync
+	 * normally as they will be referenced by the checkpointed uberblock.
+	 */
+	boolean_t checkpoint = B_FALSE;
+	if (bp->blk_birth <= spa->spa_checkpoint_txg &&
+	    spa_syncing_txg(spa) > spa->spa_checkpoint_txg) {
+		/*
+		 * At this point, if the block is part of the checkpoint
+		 * there is no way it was created in the current txg.
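+		 * (Its blk_birth is at most spa_checkpoint_txg, which the
+		 * enclosing check requires to be strictly less than the
+		 * currently syncing txg.)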
+ */ + ASSERT(!now); + ASSERT3U(spa_syncing_txg(spa), ==, txg); + checkpoint = B_TRUE; + } + + spa_config_enter(spa, SCL_FREE, FTAG, RW_READER); + + for (int d = 0; d < ndvas; d++) { + if (now) { + metaslab_unalloc_dva(spa, &dva[d], txg); + } else { + ASSERT3U(txg, ==, spa_syncing_txg(spa)); + metaslab_free_dva(spa, &dva[d], checkpoint); + } + } + + spa_config_exit(spa, SCL_FREE, FTAG); +} + +int +metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) +{ + const dva_t *dva = bp->blk_dva; + int ndvas = BP_GET_NDVAS(bp); + int error = 0; + + ASSERT(!BP_IS_HOLE(bp)); + + if (txg != 0) { + /* + * First do a dry run to make sure all DVAs are claimable, + * so we don't have to unwind from partial failures below. + */ + if ((error = metaslab_claim(spa, bp, 0)) != 0) + return (error); + } + + spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); + + for (int d = 0; d < ndvas; d++) + if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0) + break; + + spa_config_exit(spa, SCL_ALLOC, FTAG); + + ASSERT(error == 0 || txg == 0); + + return (error); +} + +/* ARGSUSED */ +static void +metaslab_check_free_impl_cb(uint64_t inner, vdev_t *vd, uint64_t offset, + uint64_t size, void *arg) +{ + if (vd->vdev_ops == &vdev_indirect_ops) + return; + + metaslab_check_free_impl(vd, offset, size); +} + +static void +metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size) +{ + metaslab_t *msp; + spa_t *spa = vd->vdev_spa; + + if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0) + return; + + if (vd->vdev_ops->vdev_op_remap != NULL) { + vd->vdev_ops->vdev_op_remap(vd, offset, size, + metaslab_check_free_impl_cb, NULL); + return; + } + + ASSERT(vdev_is_concrete(vd)); + ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count); + ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); + + msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; + + mutex_enter(&msp->ms_lock); + if (msp->ms_loaded) + range_tree_verify(msp->ms_allocatable, offset, size); + + range_tree_verify(msp->ms_freeing, offset, size); + range_tree_verify(msp->ms_checkpointing, offset, size); + range_tree_verify(msp->ms_freed, offset, size); + for (int j = 0; j < TXG_DEFER_SIZE; j++) + range_tree_verify(msp->ms_defer[j], offset, size); + mutex_exit(&msp->ms_lock); +} + +void +metaslab_check_free(spa_t *spa, const blkptr_t *bp) +{ + if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0) + return; + + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + for (int i = 0; i < BP_GET_NDVAS(bp); i++) { + uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]); + vdev_t *vd = vdev_lookup_top(spa, vdev); + uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]); + uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]); + + if (DVA_GET_GANG(&bp->blk_dva[i])) + size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); + + ASSERT3P(vd, !=, NULL); + + metaslab_check_free_impl(vd, offset, size); + } + spa_config_exit(spa, SCL_VDEV, FTAG); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/multilist.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/multilist.c new file mode 100644 index 000000000000..b61d1358c8a8 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/multilist.c @@ -0,0 +1,401 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. 
A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/multilist.h>
+
+/* needed for spa_get_random() */
+#include <sys/spa.h>
+
+/*
+ * This overrides the number of sublists in each multilist_t, which defaults
+ * to the number of CPUs in the system (see multilist_create()).
+ */
+int zfs_multilist_num_sublists = 0;
+
+/*
+ * Given an object on the list, return a pointer to the multilist_node_t
+ * structure it contains.
+ */
+static multilist_node_t *
+multilist_d2l(multilist_t *ml, void *obj)
+{
+	return ((multilist_node_t *)((char *)obj + ml->ml_offset));
+}
+
+/*
+ * Initialize a new multilist using the parameters specified.
+ *
+ * - 'size' denotes the size of the structure containing the
+ *    multilist_node_t.
+ * - 'offset' denotes the byte offset of the multilist_node_t within
+ *    the structure that contains it.
+ * - 'num' specifies the number of internal sublists to create.
+ * - 'index_func' is used to determine which sublist to insert into
+ *    when the multilist_insert() function is called, as well as which
+ *    sublist to remove from when multilist_remove() is called. The
+ *    requirements this function must meet are the following:
+ *
+ *    - It must always return the same value when called on the same
+ *      object (to ensure the object is removed from the list it was
+ *      inserted into).
+ *
+ *    - It must return a value in the range [0, number of sublists).
+ *      The multilist_get_num_sublists() function may be used to
+ *      determine the number of sublists in the multilist.
+ *
+ *    Also, in order to reduce internal contention between the sublists
+ *    during insertion and removal, this function should choose evenly
+ *    between all available sublists when inserting. This isn't a hard
+ *    requirement, but a general rule of thumb in order to garner the
+ *    best multi-threaded performance out of the data structure.
+ */
+static multilist_t *
+multilist_create_impl(size_t size, size_t offset,
+    unsigned int num, multilist_sublist_index_func_t *index_func)
+{
+	ASSERT3U(size, >, 0);
+	ASSERT3U(size, >=, offset + sizeof (multilist_node_t));
+	ASSERT3U(num, >, 0);
+	ASSERT3P(index_func, !=, NULL);
+
+	multilist_t *ml = kmem_alloc(sizeof (*ml), KM_SLEEP);
+	ml->ml_offset = offset;
+	ml->ml_num_sublists = num;
+	ml->ml_index_func = index_func;
+
+	ml->ml_sublists = kmem_zalloc(sizeof (multilist_sublist_t) *
+	    ml->ml_num_sublists, KM_SLEEP);
+
+	ASSERT3P(ml->ml_sublists, !=, NULL);
+
+	for (int i = 0; i < ml->ml_num_sublists; i++) {
+		multilist_sublist_t *mls = &ml->ml_sublists[i];
+		mutex_init(&mls->mls_lock, NULL, MUTEX_DEFAULT, NULL);
+		list_create(&mls->mls_list, size, offset);
+	}
+	return (ml);
+}
+
+/*
+ * Allocate a new multilist, using the default number of sublists
+ * (the number of CPUs, or at least 4, or the tunable
+ * zfs_multilist_num_sublists).
+ */
+multilist_t *
+multilist_create(size_t size, size_t offset,
+    multilist_sublist_index_func_t *index_func)
+{
+	int num_sublists;
+
+	if (zfs_multilist_num_sublists > 0) {
+		num_sublists = zfs_multilist_num_sublists;
+	} else {
+		num_sublists = MAX(max_ncpus, 4);
+	}
+
+	return (multilist_create_impl(size, offset, num_sublists, index_func));
+}
+
+/*
+ * Destroy the given multilist object, and free up any memory it holds.
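+ *
+ * The multilist must already be empty (every inserted object removed)
+ * before this is called; this is asserted below for each sublist.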
+ */ +void +multilist_destroy(multilist_t *ml) +{ + ASSERT(multilist_is_empty(ml)); + + for (int i = 0; i < ml->ml_num_sublists; i++) { + multilist_sublist_t *mls = &ml->ml_sublists[i]; + + ASSERT(list_is_empty(&mls->mls_list)); + + list_destroy(&mls->mls_list); + mutex_destroy(&mls->mls_lock); + } + + ASSERT3P(ml->ml_sublists, !=, NULL); + kmem_free(ml->ml_sublists, + sizeof (multilist_sublist_t) * ml->ml_num_sublists); + + ml->ml_num_sublists = 0; + ml->ml_offset = 0; + kmem_free(ml, sizeof (multilist_t)); +} + +/* + * Insert the given object into the multilist. + * + * This function will insert the object specified into the sublist + * determined using the function given at multilist creation time. + * + * The sublist locks are automatically acquired if not already held, to + * ensure consistency when inserting and removing from multiple threads. + */ +void +multilist_insert(multilist_t *ml, void *obj) +{ + unsigned int sublist_idx = ml->ml_index_func(ml, obj); + multilist_sublist_t *mls; + boolean_t need_lock; + + DTRACE_PROBE3(multilist__insert, multilist_t *, ml, + unsigned int, sublist_idx, void *, obj); + + ASSERT3U(sublist_idx, <, ml->ml_num_sublists); + + mls = &ml->ml_sublists[sublist_idx]; + + /* + * Note: Callers may already hold the sublist lock by calling + * multilist_sublist_lock(). Here we rely on MUTEX_HELD() + * returning TRUE if and only if the current thread holds the + * lock. While it's a little ugly to make the lock recursive in + * this way, it works and allows the calling code to be much + * simpler -- otherwise it would have to pass around a flag + * indicating that it already has the lock. + */ + need_lock = !MUTEX_HELD(&mls->mls_lock); + + if (need_lock) + mutex_enter(&mls->mls_lock); + + ASSERT(!multilist_link_active(multilist_d2l(ml, obj))); + + multilist_sublist_insert_head(mls, obj); + + if (need_lock) + mutex_exit(&mls->mls_lock); +} + +/* + * Remove the given object from the multilist. + * + * This function will remove the object specified from the sublist + * determined using the function given at multilist creation time. + * + * The necessary sublist locks are automatically acquired, to ensure + * consistency when inserting and removing from multiple threads. + */ +void +multilist_remove(multilist_t *ml, void *obj) +{ + unsigned int sublist_idx = ml->ml_index_func(ml, obj); + multilist_sublist_t *mls; + boolean_t need_lock; + + DTRACE_PROBE3(multilist__remove, multilist_t *, ml, + unsigned int, sublist_idx, void *, obj); + + ASSERT3U(sublist_idx, <, ml->ml_num_sublists); + + mls = &ml->ml_sublists[sublist_idx]; + /* See comment in multilist_insert(). */ + need_lock = !MUTEX_HELD(&mls->mls_lock); + + if (need_lock) + mutex_enter(&mls->mls_lock); + + ASSERT(multilist_link_active(multilist_d2l(ml, obj))); + + multilist_sublist_remove(mls, obj); + + if (need_lock) + mutex_exit(&mls->mls_lock); +} + +/* + * Check to see if this multilist object is empty. + * + * This will return TRUE if it finds all of the sublists of this + * multilist to be empty, and FALSE otherwise. Each sublist lock will be + * automatically acquired as necessary. + * + * If concurrent insertions and removals are occurring, the semantics + * of this function become a little fuzzy. Instead of locking all + * sublists for the entire call time of the function, each sublist is + * only locked as it is individually checked for emptiness. Thus, it's + * possible for this function to return TRUE with non-empty sublists at + * the time the function returns. 
This would be due to another thread
+ * inserting into a given sublist, after that specific sublist was checked
+ * and deemed empty, but before all sublists have been checked.
+ */
+int
+multilist_is_empty(multilist_t *ml)
+{
+	for (int i = 0; i < ml->ml_num_sublists; i++) {
+		multilist_sublist_t *mls = &ml->ml_sublists[i];
+		/* See comment in multilist_insert(). */
+		boolean_t need_lock = !MUTEX_HELD(&mls->mls_lock);
+
+		if (need_lock)
+			mutex_enter(&mls->mls_lock);
+
+		if (!list_is_empty(&mls->mls_list)) {
+			if (need_lock)
+				mutex_exit(&mls->mls_lock);
+
+			return (FALSE);
+		}
+
+		if (need_lock)
+			mutex_exit(&mls->mls_lock);
+	}
+
+	return (TRUE);
+}
+
+/* Return the number of sublists composing this multilist */
+unsigned int
+multilist_get_num_sublists(multilist_t *ml)
+{
+	return (ml->ml_num_sublists);
+}
+
+/* Return a randomly selected, valid sublist index for this multilist */
+unsigned int
+multilist_get_random_index(multilist_t *ml)
+{
+	return (spa_get_random(ml->ml_num_sublists));
+}
+
+/* Lock and return the sublist specified at the given index */
+multilist_sublist_t *
+multilist_sublist_lock(multilist_t *ml, unsigned int sublist_idx)
+{
+	multilist_sublist_t *mls;
+
+	ASSERT3U(sublist_idx, <, ml->ml_num_sublists);
+	mls = &ml->ml_sublists[sublist_idx];
+	mutex_enter(&mls->mls_lock);
+
+	return (mls);
+}
+
+/* Lock and return the sublist that would be used to store the specified obj */
+multilist_sublist_t *
+multilist_sublist_lock_obj(multilist_t *ml, void *obj)
+{
+	return (multilist_sublist_lock(ml, ml->ml_index_func(ml, obj)));
+}
+
+void
+multilist_sublist_unlock(multilist_sublist_t *mls)
+{
+	mutex_exit(&mls->mls_lock);
+}
+
+/*
+ * We're allowing any object to be inserted into this specific sublist,
+ * but this can lead to trouble if multilist_remove() is called to
+ * remove this object. Specifically, if calling ml_index_func on this
+ * object returns an index for a sublist different from the one passed
+ * as a parameter here, any call to multilist_remove() with this newly
+ * inserted object is undefined! (the call to multilist_remove() will
+ * remove the object from a list that it isn't contained in)
+ */
+void
+multilist_sublist_insert_head(multilist_sublist_t *mls, void *obj)
+{
+	ASSERT(MUTEX_HELD(&mls->mls_lock));
+	list_insert_head(&mls->mls_list, obj);
+}
+
+/* please see comment above multilist_sublist_insert_head */
+void
+multilist_sublist_insert_tail(multilist_sublist_t *mls, void *obj)
+{
+	ASSERT(MUTEX_HELD(&mls->mls_lock));
+	list_insert_tail(&mls->mls_list, obj);
+}
+
+/*
+ * Move the object one element forward in the list.
+ *
+ * This function will move the given object forward in the list (towards
+ * the head) by one object. So, in essence, it will swap its position in
+ * the list with its "prev" pointer. If the given object is already at the
+ * head of the list, it cannot be moved forward any more than it already
+ * is, so no action is taken.
+ *
+ * NOTE: This function **must not** remove any object from the list other
+ * than the object given as the parameter. This is relied upon in
+ * arc_evict_state_impl().
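+ *
+ * For example, for a sublist ordered (head) A -> B -> C, moving C
+ * forward yields (head) A -> C -> B, and moving A forward is a no-op.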
+ */ +void +multilist_sublist_move_forward(multilist_sublist_t *mls, void *obj) +{ + void *prev = list_prev(&mls->mls_list, obj); + + ASSERT(MUTEX_HELD(&mls->mls_lock)); + ASSERT(!list_is_empty(&mls->mls_list)); + + /* 'obj' must be at the head of the list, nothing to do */ + if (prev == NULL) + return; + + list_remove(&mls->mls_list, obj); + list_insert_before(&mls->mls_list, prev, obj); +} + +void +multilist_sublist_remove(multilist_sublist_t *mls, void *obj) +{ + ASSERT(MUTEX_HELD(&mls->mls_lock)); + list_remove(&mls->mls_list, obj); +} + +void * +multilist_sublist_head(multilist_sublist_t *mls) +{ + ASSERT(MUTEX_HELD(&mls->mls_lock)); + return (list_head(&mls->mls_list)); +} + +void * +multilist_sublist_tail(multilist_sublist_t *mls) +{ + ASSERT(MUTEX_HELD(&mls->mls_lock)); + return (list_tail(&mls->mls_list)); +} + +void * +multilist_sublist_next(multilist_sublist_t *mls, void *obj) +{ + ASSERT(MUTEX_HELD(&mls->mls_lock)); + return (list_next(&mls->mls_list, obj)); +} + +void * +multilist_sublist_prev(multilist_sublist_t *mls, void *obj) +{ + ASSERT(MUTEX_HELD(&mls->mls_lock)); + return (list_prev(&mls->mls_list, obj)); +} + +void +multilist_link_init(multilist_node_t *link) +{ + list_link_init(link); +} + +int +multilist_link_active(multilist_node_t *link) +{ + return (list_link_active(link)); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c new file mode 100644 index 000000000000..6359b72503ac --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c @@ -0,0 +1,672 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright (c) 2013, 2017 by Delphix. All rights reserved. + */ + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/dmu.h> +#include <sys/dnode.h> +#include <sys/zio.h> +#include <sys/range_tree.h> + +/* + * Range trees are tree-based data structures that can be used to + * track free space or generally any space allocation information. + * A range tree keeps track of individual segments and automatically + * provides facilities such as adjacent extent merging and extent + * splitting in response to range add/remove requests. + * + * A range tree starts out completely empty, with no segments in it. 
+ * Adding an allocation via range_tree_add to the range tree can either: + * 1) create a new extent + * 2) extend an adjacent extent + * 3) merge two adjacent extents + * Conversely, removing an allocation via range_tree_remove can: + * 1) completely remove an extent + * 2) shorten an extent (if the allocation was near one of its ends) + * 3) split an extent into two extents, in effect punching a hole + * + * A range tree is also capable of 'bridging' gaps when adding + * allocations. This is useful for cases when close proximity of + * allocations is an important detail that needs to be represented + * in the range tree. See range_tree_set_gap(). The default behavior + * is not to bridge gaps (i.e. the maximum allowed gap size is 0). + * + * In order to traverse a range tree, use either the range_tree_walk() + * or range_tree_vacate() functions. + * + * To obtain more accurate information on individual segment + * operations that the range tree performs "under the hood", you can + * specify a set of callbacks by passing a range_tree_ops_t structure + * to the range_tree_create function. Any callbacks that are non-NULL + * are then called at the appropriate times. + * + * The range tree code also supports a special variant of range trees + * that can bridge small gaps between segments. This kind of tree is used + * by the dsl scanning code to group I/Os into mostly sequential chunks to + * optimize disk performance. The code here attempts to do this with as + * little memory and computational overhead as possible. One limitation of + * this implementation is that segments of range trees with gaps can only + * support removing complete segments. + */ + +kmem_cache_t *range_seg_cache; + +/* Generic ops for managing an AVL tree alongside a range tree */ +struct range_tree_ops rt_avl_ops = { + .rtop_create = rt_avl_create, + .rtop_destroy = rt_avl_destroy, + .rtop_add = rt_avl_add, + .rtop_remove = rt_avl_remove, + .rtop_vacate = rt_avl_vacate, +}; + +void +range_tree_init(void) +{ + ASSERT(range_seg_cache == NULL); + range_seg_cache = kmem_cache_create("range_seg_cache", + sizeof (range_seg_t), 0, NULL, NULL, NULL, NULL, NULL, 0); +} + +void +range_tree_fini(void) +{ + kmem_cache_destroy(range_seg_cache); + range_seg_cache = NULL; +} + +void +range_tree_stat_verify(range_tree_t *rt) +{ + range_seg_t *rs; + uint64_t hist[RANGE_TREE_HISTOGRAM_SIZE] = { 0 }; + int i; + + for (rs = avl_first(&rt->rt_root); rs != NULL; + rs = AVL_NEXT(&rt->rt_root, rs)) { + uint64_t size = rs->rs_end - rs->rs_start; + int idx = highbit64(size) - 1; + + hist[idx]++; + ASSERT3U(hist[idx], !=, 0); + } + + for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) { + if (hist[i] != rt->rt_histogram[i]) { + zfs_dbgmsg("i=%d, hist=%p, hist=%llu, rt_hist=%llu", + i, hist, hist[i], rt->rt_histogram[i]); + } + VERIFY3U(hist[i], ==, rt->rt_histogram[i]); + } +} + +static void +range_tree_stat_incr(range_tree_t *rt, range_seg_t *rs) +{ + uint64_t size = rs->rs_end - rs->rs_start; + int idx = highbit64(size) - 1; + + ASSERT(size != 0); + ASSERT3U(idx, <, + sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram)); + + rt->rt_histogram[idx]++; + ASSERT3U(rt->rt_histogram[idx], !=, 0); +} + +static void +range_tree_stat_decr(range_tree_t *rt, range_seg_t *rs) +{ + uint64_t size = rs->rs_end - rs->rs_start; + int idx = highbit64(size) - 1; + + ASSERT(size != 0); + ASSERT3U(idx, <, + sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram)); + + ASSERT3U(rt->rt_histogram[idx], !=, 0); + rt->rt_histogram[idx]--; +} + +/* + * NOTE: caller is 
responsible for all locking.
+ */
+static int
+range_tree_seg_compare(const void *x1, const void *x2)
+{
+	const range_seg_t *r1 = (const range_seg_t *)x1;
+	const range_seg_t *r2 = (const range_seg_t *)x2;
+
+	ASSERT3U(r1->rs_start, <=, r1->rs_end);
+	ASSERT3U(r2->rs_start, <=, r2->rs_end);
+
+	return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start));
+}
+
+range_tree_t *
+range_tree_create_impl(range_tree_ops_t *ops, void *arg,
+    int (*avl_compare) (const void *, const void *), uint64_t gap)
+{
+	range_tree_t *rt = kmem_zalloc(sizeof (range_tree_t), KM_SLEEP);
+
+	avl_create(&rt->rt_root, range_tree_seg_compare,
+	    sizeof (range_seg_t), offsetof(range_seg_t, rs_node));
+
+	rt->rt_ops = ops;
+	rt->rt_arg = arg;
+	rt->rt_gap = gap;
+	rt->rt_avl_compare = avl_compare;
+
+	if (rt->rt_ops != NULL && rt->rt_ops->rtop_create != NULL)
+		rt->rt_ops->rtop_create(rt, rt->rt_arg);
+
+	return (rt);
+}
+
+range_tree_t *
+range_tree_create(range_tree_ops_t *ops, void *arg)
+{
+	return (range_tree_create_impl(ops, arg, NULL, 0));
+}
+
+void
+range_tree_destroy(range_tree_t *rt)
+{
+	VERIFY0(rt->rt_space);
+
+	if (rt->rt_ops != NULL && rt->rt_ops->rtop_destroy != NULL)
+		rt->rt_ops->rtop_destroy(rt, rt->rt_arg);
+
+	avl_destroy(&rt->rt_root);
+	kmem_free(rt, sizeof (*rt));
+}
+
+void
+range_tree_adjust_fill(range_tree_t *rt, range_seg_t *rs, int64_t delta)
+{
+	ASSERT3U(rs->rs_fill + delta, !=, 0);
+	ASSERT3U(rs->rs_fill + delta, <=, rs->rs_end - rs->rs_start);
+
+	if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
+		rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
+	rs->rs_fill += delta;
+	if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
+		rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
+}
+
+static void
+range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill)
+{
+	range_tree_t *rt = arg;
+	avl_index_t where;
+	range_seg_t rsearch, *rs_before, *rs_after, *rs;
+	uint64_t end = start + size, gap = rt->rt_gap;
+	uint64_t bridge_size = 0;
+	boolean_t merge_before, merge_after;
+
+	ASSERT3U(size, !=, 0);
+	ASSERT3U(fill, <=, size);
+
+	rsearch.rs_start = start;
+	rsearch.rs_end = end;
+	rs = avl_find(&rt->rt_root, &rsearch, &where);
+
+	if (gap == 0 && rs != NULL &&
+	    rs->rs_start <= start && rs->rs_end >= end) {
+		zfs_panic_recover("zfs: allocating allocated segment "
+		    "(offset=%llu size=%llu) of (offset=%llu size=%llu)\n",
+		    (longlong_t)start, (longlong_t)size,
+		    (longlong_t)rs->rs_start,
+		    (longlong_t)rs->rs_end - rs->rs_start);
+		return;
+	}
+
+	/*
+	 * If this is a gap-supporting range tree, it is possible that we
+	 * are inserting into an existing segment. In this case simply
+	 * bump the fill count and call the remove / add callbacks. If the
+	 * new range will extend an existing segment, we remove the
+	 * existing one, apply the new extent to it and re-insert it using
+	 * the normal code paths.
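+	 *
+	 * For example (hypothetical numbers): in a gap tree holding the
+	 * segment [0, 8192) with rs_fill == 4096, adding offset 4096 and
+	 * size 2048 lands entirely inside the segment, so the fill count
+	 * is simply bumped to 6144.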
+ */ + if (rs != NULL) { + ASSERT3U(gap, !=, 0); + if (rs->rs_start <= start && rs->rs_end >= end) { + range_tree_adjust_fill(rt, rs, fill); + return; + } + + avl_remove(&rt->rt_root, rs); + if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) + rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg); + + range_tree_stat_decr(rt, rs); + rt->rt_space -= rs->rs_end - rs->rs_start; + + fill += rs->rs_fill; + start = MIN(start, rs->rs_start); + end = MAX(end, rs->rs_end); + size = end - start; + + range_tree_add_impl(rt, start, size, fill); + + kmem_cache_free(range_seg_cache, rs); + return; + } + + ASSERT3P(rs, ==, NULL); + + /* + * Determine whether or not we will have to merge with our neighbors. + * If gap != 0, we might need to merge with our neighbors even if we + * aren't directly touching. + */ + rs_before = avl_nearest(&rt->rt_root, where, AVL_BEFORE); + rs_after = avl_nearest(&rt->rt_root, where, AVL_AFTER); + + merge_before = (rs_before != NULL && rs_before->rs_end >= start - gap); + merge_after = (rs_after != NULL && rs_after->rs_start <= end + gap); + + if (merge_before && gap != 0) + bridge_size += start - rs_before->rs_end; + if (merge_after && gap != 0) + bridge_size += rs_after->rs_start - end; + + if (merge_before && merge_after) { + avl_remove(&rt->rt_root, rs_before); + if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) { + rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg); + rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg); + } + + range_tree_stat_decr(rt, rs_before); + range_tree_stat_decr(rt, rs_after); + + rs_after->rs_fill += rs_before->rs_fill + fill; + rs_after->rs_start = rs_before->rs_start; + kmem_cache_free(range_seg_cache, rs_before); + rs = rs_after; + } else if (merge_before) { + if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) + rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg); + + range_tree_stat_decr(rt, rs_before); + + rs_before->rs_fill += fill; + rs_before->rs_end = end; + rs = rs_before; + } else if (merge_after) { + if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) + rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg); + + range_tree_stat_decr(rt, rs_after); + + rs_after->rs_fill += fill; + rs_after->rs_start = start; + rs = rs_after; + } else { + rs = kmem_cache_alloc(range_seg_cache, KM_SLEEP); + + rs->rs_fill = fill; + rs->rs_start = start; + rs->rs_end = end; + avl_insert(&rt->rt_root, rs, where); + } + + if (gap != 0) + ASSERT3U(rs->rs_fill, <=, rs->rs_end - rs->rs_start); + else + ASSERT3U(rs->rs_fill, ==, rs->rs_end - rs->rs_start); + + if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) + rt->rt_ops->rtop_add(rt, rs, rt->rt_arg); + + range_tree_stat_incr(rt, rs); + rt->rt_space += size + bridge_size; +} + +void +range_tree_add(void *arg, uint64_t start, uint64_t size) +{ + range_tree_add_impl(arg, start, size, size); +} + +static void +range_tree_remove_impl(range_tree_t *rt, uint64_t start, uint64_t size, + boolean_t do_fill) +{ + avl_index_t where; + range_seg_t rsearch, *rs, *newseg; + uint64_t end = start + size; + boolean_t left_over, right_over; + + VERIFY3U(size, !=, 0); + VERIFY3U(size, <=, rt->rt_space); + + rsearch.rs_start = start; + rsearch.rs_end = end; + rs = avl_find(&rt->rt_root, &rsearch, &where); + + /* Make sure we completely overlap with someone */ + if (rs == NULL) { + zfs_panic_recover("zfs: freeing free segment " + "(offset=%llu size=%llu)", + (longlong_t)start, (longlong_t)size); + return; + } + + /* + * Range trees with gap support must only remove complete segments + * from the tree. 
This allows us to maintain accurate fill accounting + * and to ensure that bridged sections are not leaked. If we need to + * remove less than the full segment, we can only adjust the fill count. + */ + if (rt->rt_gap != 0) { + if (do_fill) { + if (rs->rs_fill == size) { + start = rs->rs_start; + end = rs->rs_end; + size = end - start; + } else { + range_tree_adjust_fill(rt, rs, -size); + return; + } + } else if (rs->rs_start != start || rs->rs_end != end) { + zfs_panic_recover("zfs: freeing partial segment of " + "gap tree (offset=%llu size=%llu) of " + "(offset=%llu size=%llu)", + (longlong_t)start, (longlong_t)size, + (longlong_t)rs->rs_start, + (longlong_t)rs->rs_end - rs->rs_start); + return; + } + } + + VERIFY3U(rs->rs_start, <=, start); + VERIFY3U(rs->rs_end, >=, end); + + left_over = (rs->rs_start != start); + right_over = (rs->rs_end != end); + + range_tree_stat_decr(rt, rs); + + if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) + rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg); + + if (left_over && right_over) { + newseg = kmem_cache_alloc(range_seg_cache, KM_SLEEP); + newseg->rs_start = end; + newseg->rs_end = rs->rs_end; + newseg->rs_fill = newseg->rs_end - newseg->rs_start; + range_tree_stat_incr(rt, newseg); + + rs->rs_end = start; + + avl_insert_here(&rt->rt_root, newseg, rs, AVL_AFTER); + if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) + rt->rt_ops->rtop_add(rt, newseg, rt->rt_arg); + } else if (left_over) { + rs->rs_end = start; + } else if (right_over) { + rs->rs_start = end; + } else { + avl_remove(&rt->rt_root, rs); + kmem_cache_free(range_seg_cache, rs); + rs = NULL; + } + + if (rs != NULL) { + /* + * The fill of the leftover segment will always be equal to + * the size, since we do not support removing partial segments + * of range trees with gaps. 
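+ * + * As a hypothetical walk-through of the split case: on a gap-free + * tree holding the single segment [0, 100), calling + * + * range_tree_remove(rt, 30, 20); + * + * takes the left_over && right_over branch above and leaves the two + * segments [0, 30) and [50, 100), each with rs_fill equal to its own + * length.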
+ */ + rs->rs_fill = rs->rs_end - rs->rs_start; + range_tree_stat_incr(rt, rs); + + if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) + rt->rt_ops->rtop_add(rt, rs, rt->rt_arg); + } + + rt->rt_space -= size; +} + +void +range_tree_remove(void *arg, uint64_t start, uint64_t size) +{ + range_tree_remove_impl(arg, start, size, B_FALSE); +} + +void +range_tree_remove_fill(range_tree_t *rt, uint64_t start, uint64_t size) +{ + range_tree_remove_impl(rt, start, size, B_TRUE); +} + +void +range_tree_resize_segment(range_tree_t *rt, range_seg_t *rs, + uint64_t newstart, uint64_t newsize) +{ + int64_t delta = newsize - (rs->rs_end - rs->rs_start); + + range_tree_stat_decr(rt, rs); + if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) + rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg); + + rs->rs_start = newstart; + rs->rs_end = newstart + newsize; + + range_tree_stat_incr(rt, rs); + if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) + rt->rt_ops->rtop_add(rt, rs, rt->rt_arg); + + rt->rt_space += delta; +} + +static range_seg_t * +range_tree_find_impl(range_tree_t *rt, uint64_t start, uint64_t size) +{ + range_seg_t rsearch; + uint64_t end = start + size; + + VERIFY(size != 0); + + rsearch.rs_start = start; + rsearch.rs_end = end; + return (avl_find(&rt->rt_root, &rsearch, NULL)); +} + +range_seg_t * +range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size) +{ + range_seg_t *rs = range_tree_find_impl(rt, start, size); + if (rs != NULL && rs->rs_start <= start && rs->rs_end >= start + size) + return (rs); + return (NULL); +} + +void +range_tree_verify(range_tree_t *rt, uint64_t off, uint64_t size) +{ + range_seg_t *rs; + + rs = range_tree_find(rt, off, size); + if (rs != NULL) + panic("freeing free block; rs=%p", (void *)rs); +} + +boolean_t +range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size) +{ + return (range_tree_find(rt, start, size) != NULL); +} + +/* + * Ensure that this range is not in the tree, regardless of whether + * it is currently in the tree. 
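+ * + * For example (hypothetical offsets): with segments [0, 10) and + * [20, 30) in the tree, range_tree_clear(rt, 5, 20) trims the two + * overlaps in successive passes of the loop below, leaving [0, 5) + * and [25, 30).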
+ */ +void +range_tree_clear(range_tree_t *rt, uint64_t start, uint64_t size) +{ + range_seg_t *rs; + + if (size == 0) + return; + + while ((rs = range_tree_find_impl(rt, start, size)) != NULL) { + uint64_t free_start = MAX(rs->rs_start, start); + uint64_t free_end = MIN(rs->rs_end, start + size); + range_tree_remove(rt, free_start, free_end - free_start); + } +} + +void +range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst) +{ + range_tree_t *rt; + + ASSERT0(range_tree_space(*rtdst)); + ASSERT0(avl_numnodes(&(*rtdst)->rt_root)); + + rt = *rtsrc; + *rtsrc = *rtdst; + *rtdst = rt; +} + +void +range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg) +{ + range_seg_t *rs; + void *cookie = NULL; + + + if (rt->rt_ops != NULL && rt->rt_ops->rtop_vacate != NULL) + rt->rt_ops->rtop_vacate(rt, rt->rt_arg); + + while ((rs = avl_destroy_nodes(&rt->rt_root, &cookie)) != NULL) { + if (func != NULL) + func(arg, rs->rs_start, rs->rs_end - rs->rs_start); + kmem_cache_free(range_seg_cache, rs); + } + + bzero(rt->rt_histogram, sizeof (rt->rt_histogram)); + rt->rt_space = 0; +} + +void +range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg) +{ + range_seg_t *rs; + + for (rs = avl_first(&rt->rt_root); rs; rs = AVL_NEXT(&rt->rt_root, rs)) + func(arg, rs->rs_start, rs->rs_end - rs->rs_start); +} + +range_seg_t * +range_tree_first(range_tree_t *rt) +{ + return (avl_first(&rt->rt_root)); +} + +uint64_t +range_tree_space(range_tree_t *rt) +{ + return (rt->rt_space); +} + +/* Generic range tree functions for maintaining segments in an AVL tree. */ +void +rt_avl_create(range_tree_t *rt, void *arg) +{ + avl_tree_t *tree = arg; + + avl_create(tree, rt->rt_avl_compare, sizeof (range_seg_t), + offsetof(range_seg_t, rs_pp_node)); +} + +void +rt_avl_destroy(range_tree_t *rt, void *arg) +{ + avl_tree_t *tree = arg; + + ASSERT0(avl_numnodes(tree)); + avl_destroy(tree); +} + +void +rt_avl_add(range_tree_t *rt, range_seg_t *rs, void *arg) +{ + avl_tree_t *tree = arg; + avl_add(tree, rs); +} + +void +rt_avl_remove(range_tree_t *rt, range_seg_t *rs, void *arg) +{ + avl_tree_t *tree = arg; + avl_remove(tree, rs); +} + +void +rt_avl_vacate(range_tree_t *rt, void *arg) +{ + /* + * Normally one would walk the tree freeing nodes along the way. + * Since the nodes are shared with the range trees we can avoid + * walking all nodes and just reinitialize the avl tree. The nodes + * will be freed by the range tree, so we don't want to free them here. + */ + rt_avl_create(rt, arg); +} + +boolean_t +range_tree_is_empty(range_tree_t *rt) +{ + ASSERT(rt != NULL); + return (range_tree_space(rt) == 0); +} + +uint64_t +range_tree_min(range_tree_t *rt) +{ + range_seg_t *rs = avl_first(&rt->rt_root); + return (rs != NULL ? rs->rs_start : 0); +} + +uint64_t +range_tree_max(range_tree_t *rt) +{ + range_seg_t *rs = avl_last(&rt->rt_root); + return (rs != NULL ? rs->rs_end : 0); +} + +uint64_t +range_tree_span(range_tree_t *rt) +{ + return (range_tree_max(rt) - range_tree_min(rt)); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c new file mode 100644 index 000000000000..11eeffcf5d5d --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c @@ -0,0 +1,321 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + */ + +#include <sys/zfs_context.h> +#include <sys/refcount.h> + +#ifdef ZFS_DEBUG + +#ifdef _KERNEL +int reference_tracking_enable = FALSE; /* runs out of memory too easily */ +SYSCTL_DECL(_vfs_zfs); +SYSCTL_INT(_vfs_zfs, OID_AUTO, reference_tracking_enable, CTLFLAG_RDTUN, + &reference_tracking_enable, 0, + "Track reference holders to refcount_t objects, used mostly by ZFS"); +#else +int reference_tracking_enable = TRUE; +#endif +int reference_history = 3; /* tunable */ + +static kmem_cache_t *reference_cache; +static kmem_cache_t *reference_history_cache; + +void +refcount_sysinit(void) +{ + reference_cache = kmem_cache_create("reference_cache", + sizeof (reference_t), 0, NULL, NULL, NULL, NULL, NULL, 0); + + reference_history_cache = kmem_cache_create("reference_history_cache", + sizeof (uint64_t), 0, NULL, NULL, NULL, NULL, NULL, 0); +} + +void +refcount_fini(void) +{ + kmem_cache_destroy(reference_cache); + kmem_cache_destroy(reference_history_cache); +} + +void +refcount_create(refcount_t *rc) +{ + mutex_init(&rc->rc_mtx, NULL, MUTEX_DEFAULT, NULL); + list_create(&rc->rc_list, sizeof (reference_t), + offsetof(reference_t, ref_link)); + list_create(&rc->rc_removed, sizeof (reference_t), + offsetof(reference_t, ref_link)); + rc->rc_count = 0; + rc->rc_removed_count = 0; + rc->rc_tracked = reference_tracking_enable; +} + +void +refcount_create_tracked(refcount_t *rc) +{ + refcount_create(rc); + rc->rc_tracked = B_TRUE; +} + +void +refcount_create_untracked(refcount_t *rc) +{ + refcount_create(rc); + rc->rc_tracked = B_FALSE; +} + +void +refcount_destroy_many(refcount_t *rc, uint64_t number) +{ + reference_t *ref; + + ASSERT(rc->rc_count == number); + while (ref = list_head(&rc->rc_list)) { + list_remove(&rc->rc_list, ref); + kmem_cache_free(reference_cache, ref); + } + list_destroy(&rc->rc_list); + + while (ref = list_head(&rc->rc_removed)) { + list_remove(&rc->rc_removed, ref); + kmem_cache_free(reference_history_cache, ref->ref_removed); + kmem_cache_free(reference_cache, ref); + } + list_destroy(&rc->rc_removed); + mutex_destroy(&rc->rc_mtx); +} + +void +refcount_destroy(refcount_t *rc) +{ + refcount_destroy_many(rc, 0); +} + +int +refcount_is_zero(refcount_t *rc) +{ + return (rc->rc_count == 0); +} + +int64_t +refcount_count(refcount_t *rc) +{ + return (rc->rc_count); +} + +int64_t +refcount_add_many(refcount_t *rc, uint64_t number, void *holder) +{ + reference_t *ref = NULL; + int64_t count; + + if (rc->rc_tracked) { + ref = kmem_cache_alloc(reference_cache, KM_SLEEP); + ref->ref_holder = holder; + ref->ref_number = number; + } + mutex_enter(&rc->rc_mtx); + ASSERT(rc->rc_count >= 0); + if (rc->rc_tracked) + list_insert_head(&rc->rc_list, ref); + rc->rc_count += number; + count = rc->rc_count; + mutex_exit(&rc->rc_mtx); + + return 
(count); +} + +int64_t +refcount_add(refcount_t *rc, void *holder) +{ + return (refcount_add_many(rc, 1, holder)); +} + +int64_t +refcount_remove_many(refcount_t *rc, uint64_t number, void *holder) +{ + reference_t *ref; + int64_t count; + + mutex_enter(&rc->rc_mtx); + ASSERT(rc->rc_count >= number); + + if (!rc->rc_tracked) { + rc->rc_count -= number; + count = rc->rc_count; + mutex_exit(&rc->rc_mtx); + return (count); + } + + for (ref = list_head(&rc->rc_list); ref; + ref = list_next(&rc->rc_list, ref)) { + if (ref->ref_holder == holder && ref->ref_number == number) { + list_remove(&rc->rc_list, ref); + if (reference_history > 0) { + ref->ref_removed = + kmem_cache_alloc(reference_history_cache, + KM_SLEEP); + list_insert_head(&rc->rc_removed, ref); + rc->rc_removed_count++; + if (rc->rc_removed_count > reference_history) { + ref = list_tail(&rc->rc_removed); + list_remove(&rc->rc_removed, ref); + kmem_cache_free(reference_history_cache, + ref->ref_removed); + kmem_cache_free(reference_cache, ref); + rc->rc_removed_count--; + } + } else { + kmem_cache_free(reference_cache, ref); + } + rc->rc_count -= number; + count = rc->rc_count; + mutex_exit(&rc->rc_mtx); + return (count); + } + } + panic("No such hold %p on refcount %llx", holder, + (u_longlong_t)(uintptr_t)rc); + return (-1); +} + +int64_t +refcount_remove(refcount_t *rc, void *holder) +{ + return (refcount_remove_many(rc, 1, holder)); +} + +void +refcount_transfer(refcount_t *dst, refcount_t *src) +{ + int64_t count, removed_count; + list_t list, removed; + + list_create(&list, sizeof (reference_t), + offsetof(reference_t, ref_link)); + list_create(&removed, sizeof (reference_t), + offsetof(reference_t, ref_link)); + + mutex_enter(&src->rc_mtx); + count = src->rc_count; + removed_count = src->rc_removed_count; + src->rc_count = 0; + src->rc_removed_count = 0; + list_move_tail(&list, &src->rc_list); + list_move_tail(&removed, &src->rc_removed); + mutex_exit(&src->rc_mtx); + + mutex_enter(&dst->rc_mtx); + dst->rc_count += count; + dst->rc_removed_count += removed_count; + list_move_tail(&dst->rc_list, &list); + list_move_tail(&dst->rc_removed, &removed); + mutex_exit(&dst->rc_mtx); + + list_destroy(&list); + list_destroy(&removed); +} + +void +refcount_transfer_ownership(refcount_t *rc, void *current_holder, + void *new_holder) +{ + reference_t *ref; + boolean_t found = B_FALSE; + + mutex_enter(&rc->rc_mtx); + if (!rc->rc_tracked) { + mutex_exit(&rc->rc_mtx); + return; + } + + for (ref = list_head(&rc->rc_list); ref; + ref = list_next(&rc->rc_list, ref)) { + if (ref->ref_holder == current_holder) { + ref->ref_holder = new_holder; + found = B_TRUE; + break; + } + } + ASSERT(found); + mutex_exit(&rc->rc_mtx); +} + +/* + * If tracking is enabled, return true if a reference exists that matches + * the "holder" tag. If tracking is disabled, then return true if a reference + * might be held. + */ +boolean_t +refcount_held(refcount_t *rc, void *holder) +{ + reference_t *ref; + + mutex_enter(&rc->rc_mtx); + + if (!rc->rc_tracked) { + mutex_exit(&rc->rc_mtx); + return (rc->rc_count > 0); + } + + for (ref = list_head(&rc->rc_list); ref; + ref = list_next(&rc->rc_list, ref)) { + if (ref->ref_holder == holder) { + mutex_exit(&rc->rc_mtx); + return (B_TRUE); + } + } + mutex_exit(&rc->rc_mtx); + return (B_FALSE); +} + +/* + * If tracking is enabled, return true if a reference does not exist that + * matches the "holder" tag. If tracking is disabled, always return true + * since the reference might not be held. 
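+ * + * A minimal usage sketch of the tracked interface above (FTAG is the + * conventional ZFS holder tag; the sequence itself is illustrative, + * not taken from a caller in this change): + * + * refcount_t rc; + * + * refcount_create_tracked(&rc); + * (void) refcount_add(&rc, FTAG); + * ASSERT(refcount_held(&rc, FTAG)); + * (void) refcount_remove(&rc, FTAG); + * refcount_destroy(&rc);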
+ */ +boolean_t +refcount_not_held(refcount_t *rc, void *holder) +{ + reference_t *ref; + + mutex_enter(&rc->rc_mtx); + + if (!rc->rc_tracked) { + mutex_exit(&rc->rc_mtx); + return (B_TRUE); + } + + for (ref = list_head(&rc->rc_list); ref; + ref = list_next(&rc->rc_list, ref)) { + if (ref->ref_holder == holder) { + mutex_exit(&rc->rc_mtx); + return (B_FALSE); + } + } + mutex_exit(&rc->rc_mtx); + return (B_TRUE); +} +#endif /* ZFS_DEBUG */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/rrwlock.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/rrwlock.c new file mode 100644 index 000000000000..51394c01c431 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/rrwlock.c @@ -0,0 +1,395 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + +#include <sys/refcount.h> +#include <sys/rrwlock.h> + +/* + * This file contains the implementation of a re-entrant read + * reader/writer lock (aka "rrwlock"). + * + * This is a normal reader/writer lock with the additional feature + * of allowing threads who have already obtained a read lock to + * re-enter another read lock (re-entrant read) - even if there are + * waiting writers. + * + * Callers who have not obtained a read lock give waiting writers priority. + * + * The rrwlock_t lock does not allow re-entrant writers, nor does it + * allow a re-entrant mix of reads and writes (that is, it does not + * allow a caller who has already obtained a read lock to be able to + * then grab a write lock without first dropping all read locks, and + * vice versa). + * + * The rrwlock_t uses tsd (thread specific data) to keep a list of + * nodes (rrw_node_t), where each node keeps track of which specific + * lock (rrw_node_t::rn_rrl) the thread has grabbed. Since re-entering + * should be rare, a thread that grabs multiple reads on the same rrwlock_t + * will store multiple rrw_node_ts of the same 'rrn_rrl'. Nodes on the + * tsd list can represent a different rrwlock_t. This allows a thread + * to enter multiple and unique rrwlock_ts for read locks at the same time. + * + * Since using tsd exposes some overhead, the rrwlock_t only needs to + * keep tsd data when writers are waiting. If no writers are waiting, then + * a reader just bumps the anonymous read count (rr_anon_rcount) - no tsd + * is needed. Once a writer attempts to grab the lock, readers then + * keep tsd data and bump the linked readers count (rr_linked_rcount). + * + * If there are waiting writers and there are anonymous readers, then a + * reader doesn't know if it is a re-entrant lock. 
But since it may be one, + * we allow the read to proceed (otherwise it could deadlock). Since once + * waiting writers are active, readers no longer bump the anonymous count, + * the anonymous readers will eventually flush themselves out. At this point, + * readers will be able to tell if they are a re-entrant lock (have a + * rrw_node_t entry for the lock) or not. If they are a re-entrant lock, then + * we must let them proceed. If they are not, then the reader blocks for the + * waiting writers. Hence, we do not starve writers. + */ + +/* global key for TSD */ +uint_t rrw_tsd_key; + +typedef struct rrw_node { + struct rrw_node *rn_next; + rrwlock_t *rn_rrl; + void *rn_tag; +} rrw_node_t; + +static rrw_node_t * +rrn_find(rrwlock_t *rrl) +{ + rrw_node_t *rn; + + if (refcount_count(&rrl->rr_linked_rcount) == 0) + return (NULL); + + for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) { + if (rn->rn_rrl == rrl) + return (rn); + } + return (NULL); +} + +/* + * Add a node to the head of the singly linked list. + */ +static void +rrn_add(rrwlock_t *rrl, void *tag) +{ + rrw_node_t *rn; + + rn = kmem_alloc(sizeof (*rn), KM_SLEEP); + rn->rn_rrl = rrl; + rn->rn_next = tsd_get(rrw_tsd_key); + rn->rn_tag = tag; + VERIFY(tsd_set(rrw_tsd_key, rn) == 0); +} + +/* + * If a node is found for 'rrl', then remove the node from this + * thread's list and return TRUE; otherwise return FALSE. + */ +static boolean_t +rrn_find_and_remove(rrwlock_t *rrl, void *tag) +{ + rrw_node_t *rn; + rrw_node_t *prev = NULL; + + if (refcount_count(&rrl->rr_linked_rcount) == 0) + return (B_FALSE); + + for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) { + if (rn->rn_rrl == rrl && rn->rn_tag == tag) { + if (prev) + prev->rn_next = rn->rn_next; + else + VERIFY(tsd_set(rrw_tsd_key, rn->rn_next) == 0); + kmem_free(rn, sizeof (*rn)); + return (B_TRUE); + } + prev = rn; + } + return (B_FALSE); +} + +void +rrw_init(rrwlock_t *rrl, boolean_t track_all) +{ + mutex_init(&rrl->rr_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&rrl->rr_cv, NULL, CV_DEFAULT, NULL); + rrl->rr_writer = NULL; + refcount_create(&rrl->rr_anon_rcount); + refcount_create(&rrl->rr_linked_rcount); + rrl->rr_writer_wanted = B_FALSE; + rrl->rr_track_all = track_all; +} + +void +rrw_destroy(rrwlock_t *rrl) +{ + mutex_destroy(&rrl->rr_lock); + cv_destroy(&rrl->rr_cv); + ASSERT(rrl->rr_writer == NULL); + refcount_destroy(&rrl->rr_anon_rcount); + refcount_destroy(&rrl->rr_linked_rcount); +} + +static void +rrw_enter_read_impl(rrwlock_t *rrl, boolean_t prio, void *tag) +{ + mutex_enter(&rrl->rr_lock); +#if !defined(DEBUG) && defined(_KERNEL) + if (rrl->rr_writer == NULL && !rrl->rr_writer_wanted && + !rrl->rr_track_all) { + rrl->rr_anon_rcount.rc_count++; + mutex_exit(&rrl->rr_lock); + return; + } + DTRACE_PROBE(zfs__rrwfastpath__rdmiss); +#endif + ASSERT(rrl->rr_writer != curthread); + ASSERT(refcount_count(&rrl->rr_anon_rcount) >= 0); + + while (rrl->rr_writer != NULL || (rrl->rr_writer_wanted && + refcount_is_zero(&rrl->rr_anon_rcount) && !prio && + rrn_find(rrl) == NULL)) + cv_wait(&rrl->rr_cv, &rrl->rr_lock); + + if (rrl->rr_writer_wanted || rrl->rr_track_all) { + /* may or may not be a re-entrant enter */ + rrn_add(rrl, tag); + (void) refcount_add(&rrl->rr_linked_rcount, tag); + } else { + (void) refcount_add(&rrl->rr_anon_rcount, tag); + } + ASSERT(rrl->rr_writer == NULL); + mutex_exit(&rrl->rr_lock); +} + +void +rrw_enter_read(rrwlock_t *rrl, void *tag) +{ + rrw_enter_read_impl(rrl, B_FALSE, tag); +} + +/* + * Take a read lock even if there are
pending write lock requests. If we want + * to take a lock reentrantly, but from different threads (that have a + * relationship to each other), the normal detection mechanism to overrule + * the pending writer does not work, so we have to give an explicit hint here. + */ +void +rrw_enter_read_prio(rrwlock_t *rrl, void *tag) +{ + rrw_enter_read_impl(rrl, B_TRUE, tag); +} + + +void +rrw_enter_write(rrwlock_t *rrl) +{ + mutex_enter(&rrl->rr_lock); + ASSERT(rrl->rr_writer != curthread); + + while (refcount_count(&rrl->rr_anon_rcount) > 0 || + refcount_count(&rrl->rr_linked_rcount) > 0 || + rrl->rr_writer != NULL) { + rrl->rr_writer_wanted = B_TRUE; + cv_wait(&rrl->rr_cv, &rrl->rr_lock); + } + rrl->rr_writer_wanted = B_FALSE; + rrl->rr_writer = curthread; + mutex_exit(&rrl->rr_lock); +} + +void +rrw_enter(rrwlock_t *rrl, krw_t rw, void *tag) +{ + if (rw == RW_READER) + rrw_enter_read(rrl, tag); + else + rrw_enter_write(rrl); +} + +void +rrw_exit(rrwlock_t *rrl, void *tag) +{ + mutex_enter(&rrl->rr_lock); +#if !defined(DEBUG) && defined(_KERNEL) + if (!rrl->rr_writer && rrl->rr_linked_rcount.rc_count == 0) { + rrl->rr_anon_rcount.rc_count--; + if (rrl->rr_anon_rcount.rc_count == 0) + cv_broadcast(&rrl->rr_cv); + mutex_exit(&rrl->rr_lock); + return; + } + DTRACE_PROBE(zfs__rrwfastpath__exitmiss); +#endif + ASSERT(!refcount_is_zero(&rrl->rr_anon_rcount) || + !refcount_is_zero(&rrl->rr_linked_rcount) || + rrl->rr_writer != NULL); + + if (rrl->rr_writer == NULL) { + int64_t count; + if (rrn_find_and_remove(rrl, tag)) { + count = refcount_remove(&rrl->rr_linked_rcount, tag); + } else { + ASSERT(!rrl->rr_track_all); + count = refcount_remove(&rrl->rr_anon_rcount, tag); + } + if (count == 0) + cv_broadcast(&rrl->rr_cv); + } else { + ASSERT(rrl->rr_writer == curthread); + ASSERT(refcount_is_zero(&rrl->rr_anon_rcount) && + refcount_is_zero(&rrl->rr_linked_rcount)); + rrl->rr_writer = NULL; + cv_broadcast(&rrl->rr_cv); + } + mutex_exit(&rrl->rr_lock); +} + +/* + * If the lock was created with track_all, rrw_held(RW_READER) will return + * B_TRUE iff the current thread has the lock for reader. Otherwise it may + * return B_TRUE if any thread has the lock for reader. + */ +boolean_t +rrw_held(rrwlock_t *rrl, krw_t rw) +{ + boolean_t held; + + mutex_enter(&rrl->rr_lock); + if (rw == RW_WRITER) { + held = (rrl->rr_writer == curthread); + } else { + held = (!refcount_is_zero(&rrl->rr_anon_rcount) || + rrn_find(rrl) != NULL); + } + mutex_exit(&rrl->rr_lock); + + return (held); +} + +void +rrw_tsd_destroy(void *arg) +{ + rrw_node_t *rn = arg; + if (rn != NULL) { + panic("thread %p terminating with rrw lock %p held", + (void *)curthread, (void *)rn->rn_rrl); + } +} + +/* + * A reader-mostly lock implementation, tuned above the reader-writer locks + * for highly parallel read acquisitions, while pessimizing writes. + * + * The idea is to split a single busy lock into an array of locks, so that + * each reader can lock only one of them for read, depending on the result + * of a simple hash function. That proportionally reduces lock congestion. + * A writer, on the other hand, has to sequentially acquire write on all + * the locks. That makes write acquisition proportionally slower, but in + * places where it is used (filesystem unmount) performance is not critical. + * + * All the functions below are direct wrappers around the functions above.
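+ * + * A hedged usage sketch (the lock variable is hypothetical; FTAG is + * the conventional ZFS holder tag): + * + * rrmlock_t lock; + * + * rrm_init(&lock, B_FALSE); + * rrm_enter_read(&lock, FTAG); readers take 1 of RRM_NUM_LOCKS + * rrm_exit(&lock, FTAG); + * rrm_enter_write(&lock); a writer takes all of them + * rrm_exit(&lock, FTAG); + * rrm_destroy(&lock);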
+ */ +void +rrm_init(rrmlock_t *rrl, boolean_t track_all) +{ + int i; + + for (i = 0; i < RRM_NUM_LOCKS; i++) + rrw_init(&rrl->locks[i], track_all); +} + +void +rrm_destroy(rrmlock_t *rrl) +{ + int i; + + for (i = 0; i < RRM_NUM_LOCKS; i++) + rrw_destroy(&rrl->locks[i]); +} + +void +rrm_enter(rrmlock_t *rrl, krw_t rw, void *tag) +{ + if (rw == RW_READER) + rrm_enter_read(rrl, tag); + else + rrm_enter_write(rrl); +} + +/* + * This maps the current thread to a specific lock. Note that the lock + * must be released by the same thread that acquired it. We do this + * mapping by taking the thread pointer mod a prime number. We examine + * only the low 32 bits of the thread pointer, because 32-bit division + * is faster than 64-bit division, and the high 32 bits have little + * entropy anyway. + */ +#define RRM_TD_LOCK() (((uint32_t)(uintptr_t)(curthread)) % RRM_NUM_LOCKS) + +void +rrm_enter_read(rrmlock_t *rrl, void *tag) +{ + rrw_enter_read(&rrl->locks[RRM_TD_LOCK()], tag); +} + +void +rrm_enter_write(rrmlock_t *rrl) +{ + int i; + + for (i = 0; i < RRM_NUM_LOCKS; i++) + rrw_enter_write(&rrl->locks[i]); +} + +void +rrm_exit(rrmlock_t *rrl, void *tag) +{ + int i; + + if (rrl->locks[0].rr_writer == curthread) { + for (i = 0; i < RRM_NUM_LOCKS; i++) + rrw_exit(&rrl->locks[i], tag); + } else { + rrw_exit(&rrl->locks[RRM_TD_LOCK()], tag); + } +} + +boolean_t +rrm_held(rrmlock_t *rrl, krw_t rw) +{ + if (rw == RW_WRITER) { + return (rrw_held(&rrl->locks[0], rw)); + } else { + return (rrw_held(&rrl->locks[RRM_TD_LOCK()], rw)); + } +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c new file mode 100644 index 000000000000..ee7852a0df0e --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c @@ -0,0 +1,2009 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Portions Copyright 2011 iXsystems, Inc + * Copyright (c) 2013, 2017 by Delphix. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + */ + +#include <sys/zfs_context.h> +#include <sys/types.h> +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysmacros.h> +#include <sys/dmu.h> +#include <sys/dmu_impl.h> +#include <sys/dmu_objset.h> +#include <sys/dmu_tx.h> +#include <sys/dbuf.h> +#include <sys/dnode.h> +#include <sys/zap.h> +#include <sys/sa.h> +#include <sys/sunddi.h> +#include <sys/sa_impl.h> +#include <sys/dnode.h> +#include <sys/errno.h> +#include <sys/zfs_context.h> + +/* + * ZFS System attributes: + * + * A generic mechanism to allow for arbitrary attributes + * to be stored in a dnode. 
The data will be stored in the bonus buffer of + * the dnode and if necessary a special "spill" block will be used to handle + * overflow situations. The spill block will be sized to fit the data + * from 512 - 128K. When a spill block is used the BP (blkptr_t) for the + * spill block is stored at the end of the current bonus buffer. Any + * attributes that would be in the way of the blkptr_t will be relocated + * into the spill block. + * + * Attribute registration: + * + * Stored persistently on a per dataset basis is + * a mapping between attribute "string" names and their actual attribute + * numeric values, length, and byteswap function. The names are only used + * during registration. All attributes are known by their unique attribute + * id value. If an attribute can have a variable size then the value + * 0 will be used to indicate this. + * + * Attribute Layout: + * + * Attribute layouts are a way to compactly store multiple attributes, but + * without taking the overhead associated with managing each attribute + * individually. Since you will typically have the same set of attributes + * stored in the same order a single table will be used to represent that + * layout. The ZPL for example will usually have only about 10 different + * layouts (regular files, device files, symlinks, + * regular files + scanstamp, files/dir with extended attributes, and then + * you have the possibility of all of those minus ACL, because it would + * be kicked out into the spill block). + * + * Layouts are simply an array of the attributes and their + * ordering, i.e. [0, 1, 4, 5, 2]. + * + * Each distinct layout is given a unique layout number and that is what is + * stored in the header at the beginning of the SA data buffer. + * + * A layout only covers a single dbuf (bonus or spill). If a set of + * attributes is split up between the bonus buffer and a spill buffer then + * two different layouts will be used. This allows us to byteswap the + * spill without looking at the bonus buffer and keeps the on disk format of + * the bonus and spill buffer the same. + * + * Adding a single attribute will cause the entire set of attributes to + * be rewritten and could result in a new layout number being constructed + * as part of the rewrite if no such layout exists for the new set of + * attributes. The new attribute will be appended to the end of the already + * existing attributes. + * + * Both the attribute registration and attribute layout information are + * stored in normal ZAP attributes. There should be a small number of + * known layouts and the set of attributes is assumed to typically be quite + * small. + * + * The registered attributes and layout "table" information is maintained + * in core and a special "sa_os_t" is attached to the objset_t. + * + * A special interface is provided to allow for quickly applying + * a large set of attributes at once. sa_replace_all_by_template() is + * used to set an array of attributes. This is used by the ZPL when + * creating a brand new file. The template that is passed into the function + * specifies the attribute, size for variable length attributes, location of + * data and special "data locator" function if the data isn't in a contiguous + * location. + * + * Byteswap implications: + * + * Since the SA attributes are not entirely self describing we can't do + * the normal byteswap processing. The special ZAP layout attribute and + * attribute registration attributes define the byteswap function and the + * size of the attributes, unless they are variable sized.
+ * The normal ZFS byteswapping infrastructure assumes you don't need + * to read any objects in order to do the necessary byteswapping. Whereas + * SA attributes can only be properly byteswapped if the dataset is opened + * and the layout/attribute ZAP attributes are available. Because of this + * the SA attributes will be byteswapped when they are first accessed by + * the SA code that will read the SA data. + */ + +typedef void (sa_iterfunc_t)(void *hdr, void *addr, sa_attr_type_t, + uint16_t length, int length_idx, boolean_t, void *userp); + +static int sa_build_index(sa_handle_t *hdl, sa_buf_type_t buftype); +static void sa_idx_tab_hold(objset_t *os, sa_idx_tab_t *idx_tab); +static sa_idx_tab_t *sa_find_idx_tab(objset_t *os, dmu_object_type_t bonustype, + sa_hdr_phys_t *hdr); +static void sa_idx_tab_rele(objset_t *os, void *arg); +static void sa_copy_data(sa_data_locator_t *func, void *start, void *target, + int buflen); +static int sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr, + sa_data_op_t action, sa_data_locator_t *locator, void *datastart, + uint16_t buflen, dmu_tx_t *tx); + +arc_byteswap_func_t *sa_bswap_table[] = { + byteswap_uint64_array, + byteswap_uint32_array, + byteswap_uint16_array, + byteswap_uint8_array, + zfs_acl_byteswap, +}; + +#define SA_COPY_DATA(f, s, t, l) \ + { \ + if (f == NULL) { \ + if (l == 8) { \ + *(uint64_t *)t = *(uint64_t *)s; \ + } else if (l == 16) { \ + *(uint64_t *)t = *(uint64_t *)s; \ + *(uint64_t *)((uintptr_t)t + 8) = \ + *(uint64_t *)((uintptr_t)s + 8); \ + } else { \ + bcopy(s, t, l); \ + } \ + } else \ + sa_copy_data(f, s, t, l); \ + } + +/* + * This table is fixed and cannot be changed. Its purpose is to + * allow the SA code to work with both old/new ZPL file systems. + * It contains the list of legacy attributes. These attributes aren't + * stored in the "attribute" registry zap objects, since older ZPL file systems + * won't have the registry. Only objsets of type ZFS_TYPE_FILESYSTEM will + * use this static table. + */ +sa_attr_reg_t sa_legacy_attrs[] = { + {"ZPL_ATIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 0}, + {"ZPL_MTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 1}, + {"ZPL_CTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 2}, + {"ZPL_CRTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 3}, + {"ZPL_GEN", sizeof (uint64_t), SA_UINT64_ARRAY, 4}, + {"ZPL_MODE", sizeof (uint64_t), SA_UINT64_ARRAY, 5}, + {"ZPL_SIZE", sizeof (uint64_t), SA_UINT64_ARRAY, 6}, + {"ZPL_PARENT", sizeof (uint64_t), SA_UINT64_ARRAY, 7}, + {"ZPL_LINKS", sizeof (uint64_t), SA_UINT64_ARRAY, 8}, + {"ZPL_XATTR", sizeof (uint64_t), SA_UINT64_ARRAY, 9}, + {"ZPL_RDEV", sizeof (uint64_t), SA_UINT64_ARRAY, 10}, + {"ZPL_FLAGS", sizeof (uint64_t), SA_UINT64_ARRAY, 11}, + {"ZPL_UID", sizeof (uint64_t), SA_UINT64_ARRAY, 12}, + {"ZPL_GID", sizeof (uint64_t), SA_UINT64_ARRAY, 13}, + {"ZPL_PAD", sizeof (uint64_t) * 4, SA_UINT64_ARRAY, 14}, + {"ZPL_ZNODE_ACL", 88, SA_UINT8_ARRAY, 15}, +}; + +/* + * This is only used for objects of type DMU_OT_ZNODE + */ +sa_attr_type_t sa_legacy_zpl_layout[] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +}; + +/* + * Special dummy layout used for buffers with no attributes. 
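+ * + * For comparison with the tables above: a hypothetical layout + * {0, 1, 4, 5, 2} would decode, via sa_legacy_attrs[], to ZPL_ATIME, + * ZPL_MTIME, ZPL_GEN, ZPL_MODE and ZPL_CTIME, in that order; legacy + * layout 0 is simply the identity ordering over all 16 legacy + * attributes.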
+ */ +sa_attr_type_t sa_dummy_zpl_layout[] = { 0 }; + +static int sa_legacy_attr_count = 16; +static kmem_cache_t *sa_cache = NULL; + +/*ARGSUSED*/ +static int +sa_cache_constructor(void *buf, void *unused, int kmflag) +{ + sa_handle_t *hdl = buf; + + mutex_init(&hdl->sa_lock, NULL, MUTEX_DEFAULT, NULL); + return (0); +} + +/*ARGSUSED*/ +static void +sa_cache_destructor(void *buf, void *unused) +{ + sa_handle_t *hdl = buf; + mutex_destroy(&hdl->sa_lock); +} + +void +sa_cache_init(void) +{ + sa_cache = kmem_cache_create("sa_cache", + sizeof (sa_handle_t), 0, sa_cache_constructor, + sa_cache_destructor, NULL, NULL, NULL, 0); +} + +void +sa_cache_fini(void) +{ + if (sa_cache) + kmem_cache_destroy(sa_cache); +} + +static int +layout_num_compare(const void *arg1, const void *arg2) +{ + const sa_lot_t *node1 = (const sa_lot_t *)arg1; + const sa_lot_t *node2 = (const sa_lot_t *)arg2; + + return (AVL_CMP(node1->lot_num, node2->lot_num)); +} + +static int +layout_hash_compare(const void *arg1, const void *arg2) +{ + const sa_lot_t *node1 = (const sa_lot_t *)arg1; + const sa_lot_t *node2 = (const sa_lot_t *)arg2; + + int cmp = AVL_CMP(node1->lot_hash, node2->lot_hash); + if (likely(cmp)) + return (cmp); + + return (AVL_CMP(node1->lot_instance, node2->lot_instance)); +} + +boolean_t +sa_layout_equal(sa_lot_t *tbf, sa_attr_type_t *attrs, int count) +{ + int i; + + if (count != tbf->lot_attr_count) + return (1); + + for (i = 0; i != count; i++) { + if (attrs[i] != tbf->lot_attrs[i]) + return (1); + } + return (0); +} + +#define SA_ATTR_HASH(attr) (zfs_crc64_table[(-1ULL ^ attr) & 0xFF]) + +static uint64_t +sa_layout_info_hash(sa_attr_type_t *attrs, int attr_count) +{ + int i; + uint64_t crc = -1ULL; + + for (i = 0; i != attr_count; i++) + crc ^= SA_ATTR_HASH(attrs[i]); + + return (crc); +} + +static int +sa_get_spill(sa_handle_t *hdl) +{ + int rc; + if (hdl->sa_spill == NULL) { + if ((rc = dmu_spill_hold_existing(hdl->sa_bonus, NULL, + &hdl->sa_spill)) == 0) + VERIFY(0 == sa_build_index(hdl, SA_SPILL)); + } else { + rc = 0; + } + + return (rc); +} + +/* + * Main attribute lookup/update function + * returns 0 for success or non zero for failures + * + * Operates on bulk array, first failure will abort further processing + */ +int +sa_attr_op(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count, + sa_data_op_t data_op, dmu_tx_t *tx) +{ + sa_os_t *sa = hdl->sa_os->os_sa; + int i; + int error = 0; + sa_buf_type_t buftypes; + + buftypes = 0; + + ASSERT(count > 0); + for (i = 0; i != count; i++) { + ASSERT(bulk[i].sa_attr <= hdl->sa_os->os_sa->sa_num_attrs); + + bulk[i].sa_addr = NULL; + /* First check the bonus buffer */ + + if (hdl->sa_bonus_tab && TOC_ATTR_PRESENT( + hdl->sa_bonus_tab->sa_idx_tab[bulk[i].sa_attr])) { + SA_ATTR_INFO(sa, hdl->sa_bonus_tab, + SA_GET_HDR(hdl, SA_BONUS), + bulk[i].sa_attr, bulk[i], SA_BONUS, hdl); + if (tx && !(buftypes & SA_BONUS)) { + dmu_buf_will_dirty(hdl->sa_bonus, tx); + buftypes |= SA_BONUS; + } + } + if (bulk[i].sa_addr == NULL && + ((error = sa_get_spill(hdl)) == 0)) { + if (TOC_ATTR_PRESENT( + hdl->sa_spill_tab->sa_idx_tab[bulk[i].sa_attr])) { + SA_ATTR_INFO(sa, hdl->sa_spill_tab, + SA_GET_HDR(hdl, SA_SPILL), + bulk[i].sa_attr, bulk[i], SA_SPILL, hdl); + if (tx && !(buftypes & SA_SPILL) && + bulk[i].sa_size == bulk[i].sa_length) { + dmu_buf_will_dirty(hdl->sa_spill, tx); + buftypes |= SA_SPILL; + } + } + } + if (error && error != ENOENT) { + return ((error == ECKSUM) ? 
EIO : error); + } + + switch (data_op) { + case SA_LOOKUP: + if (bulk[i].sa_addr == NULL) + return (SET_ERROR(ENOENT)); + if (bulk[i].sa_data) { + SA_COPY_DATA(bulk[i].sa_data_func, + bulk[i].sa_addr, bulk[i].sa_data, + bulk[i].sa_size); + } + continue; + + case SA_UPDATE: + /* existing rewrite of attr */ + if (bulk[i].sa_addr && + bulk[i].sa_size == bulk[i].sa_length) { + SA_COPY_DATA(bulk[i].sa_data_func, + bulk[i].sa_data, bulk[i].sa_addr, + bulk[i].sa_length); + continue; + } else if (bulk[i].sa_addr) { /* attr size change */ + error = sa_modify_attrs(hdl, bulk[i].sa_attr, + SA_REPLACE, bulk[i].sa_data_func, + bulk[i].sa_data, bulk[i].sa_length, tx); + } else { /* adding new attribute */ + error = sa_modify_attrs(hdl, bulk[i].sa_attr, + SA_ADD, bulk[i].sa_data_func, + bulk[i].sa_data, bulk[i].sa_length, tx); + } + if (error) + return (error); + break; + } + } + return (error); +} + +static sa_lot_t * +sa_add_layout_entry(objset_t *os, sa_attr_type_t *attrs, int attr_count, + uint64_t lot_num, uint64_t hash, boolean_t zapadd, dmu_tx_t *tx) +{ + sa_os_t *sa = os->os_sa; + sa_lot_t *tb, *findtb; + int i; + avl_index_t loc; + + ASSERT(MUTEX_HELD(&sa->sa_lock)); + tb = kmem_zalloc(sizeof (sa_lot_t), KM_SLEEP); + tb->lot_attr_count = attr_count; + tb->lot_attrs = kmem_alloc(sizeof (sa_attr_type_t) * attr_count, + KM_SLEEP); + bcopy(attrs, tb->lot_attrs, sizeof (sa_attr_type_t) * attr_count); + tb->lot_num = lot_num; + tb->lot_hash = hash; + tb->lot_instance = 0; + + if (zapadd) { + char attr_name[8]; + + if (sa->sa_layout_attr_obj == 0) { + sa->sa_layout_attr_obj = zap_create_link(os, + DMU_OT_SA_ATTR_LAYOUTS, + sa->sa_master_obj, SA_LAYOUTS, tx); + } + + (void) snprintf(attr_name, sizeof (attr_name), + "%d", (int)lot_num); + VERIFY(0 == zap_update(os, os->os_sa->sa_layout_attr_obj, + attr_name, 2, attr_count, attrs, tx)); + } + + list_create(&tb->lot_idx_tab, sizeof (sa_idx_tab_t), + offsetof(sa_idx_tab_t, sa_next)); + + for (i = 0; i != attr_count; i++) { + if (sa->sa_attr_table[tb->lot_attrs[i]].sa_length == 0) + tb->lot_var_sizes++; + } + + avl_add(&sa->sa_layout_num_tree, tb); + + /* verify we don't have a hash collision */ + if ((findtb = avl_find(&sa->sa_layout_hash_tree, tb, &loc)) != NULL) { + for (; findtb && findtb->lot_hash == hash; + findtb = AVL_NEXT(&sa->sa_layout_hash_tree, findtb)) { + if (findtb->lot_instance != tb->lot_instance) + break; + tb->lot_instance++; + } + } + avl_add(&sa->sa_layout_hash_tree, tb); + return (tb); +} + +static void +sa_find_layout(objset_t *os, uint64_t hash, sa_attr_type_t *attrs, + int count, dmu_tx_t *tx, sa_lot_t **lot) +{ + sa_lot_t *tb, tbsearch; + avl_index_t loc; + sa_os_t *sa = os->os_sa; + boolean_t found = B_FALSE; + + mutex_enter(&sa->sa_lock); + tbsearch.lot_hash = hash; + tbsearch.lot_instance = 0; + tb = avl_find(&sa->sa_layout_hash_tree, &tbsearch, &loc); + if (tb) { + for (; tb && tb->lot_hash == hash; + tb = AVL_NEXT(&sa->sa_layout_hash_tree, tb)) { + if (sa_layout_equal(tb, attrs, count) == 0) { + found = B_TRUE; + break; + } + } + } + if (!found) { + tb = sa_add_layout_entry(os, attrs, count, + avl_numnodes(&sa->sa_layout_num_tree), hash, B_TRUE, tx); + } + mutex_exit(&sa->sa_lock); + *lot = tb; +} + +static int +sa_resize_spill(sa_handle_t *hdl, uint32_t size, dmu_tx_t *tx) +{ + int error; + uint32_t blocksize; + + if (size == 0) { + blocksize = SPA_MINBLOCKSIZE; + } else if (size > SPA_OLD_MAXBLOCKSIZE) { + ASSERT(0); + return (SET_ERROR(EFBIG)); + } else { + blocksize = P2ROUNDUP_TYPED(size, SPA_MINBLOCKSIZE, uint32_t); + } + 
+ error = dbuf_spill_set_blksz(hdl->sa_spill, blocksize, tx); + ASSERT(error == 0); + return (error); +} + +static void +sa_copy_data(sa_data_locator_t *func, void *datastart, void *target, int buflen) +{ + if (func == NULL) { + bcopy(datastart, target, buflen); + } else { + boolean_t start; + int bytes; + void *dataptr; + void *saptr = target; + uint32_t length; + + start = B_TRUE; + bytes = 0; + while (bytes < buflen) { + func(&dataptr, &length, buflen, start, datastart); + bcopy(dataptr, saptr, length); + saptr = (void *)((caddr_t)saptr + length); + bytes += length; + start = B_FALSE; + } + } +} + +/* + * Determine several different sizes + * first the sa header size + * the number of bytes to be stored + * if spill would occur the index in the attribute array is returned + * + * the boolean will_spill will be set when spilling is necessary. It + * is only set when the buftype is SA_BONUS + */ +static int +sa_find_sizes(sa_os_t *sa, sa_bulk_attr_t *attr_desc, int attr_count, + dmu_buf_t *db, sa_buf_type_t buftype, int full_space, int *index, + int *total, boolean_t *will_spill) +{ + int var_size = 0; + int i; + int hdrsize; + int extra_hdrsize; + + if (buftype == SA_BONUS && sa->sa_force_spill) { + *total = 0; + *index = 0; + *will_spill = B_TRUE; + return (0); + } + + *index = -1; + *total = 0; + *will_spill = B_FALSE; + + extra_hdrsize = 0; + hdrsize = (SA_BONUSTYPE_FROM_DB(db) == DMU_OT_ZNODE) ? 0 : + sizeof (sa_hdr_phys_t); + + ASSERT(IS_P2ALIGNED(full_space, 8)); + + for (i = 0; i != attr_count; i++) { + boolean_t is_var_sz; + + *total = P2ROUNDUP(*total, 8); + *total += attr_desc[i].sa_length; + if (*will_spill) + continue; + + is_var_sz = (SA_REGISTERED_LEN(sa, attr_desc[i].sa_attr) == 0); + if (is_var_sz) { + var_size++; + } + + if (is_var_sz && var_size > 1) { + /* + * Don't worry that the spill block might overflow. + * It will be resized if needed in sa_build_layouts(). + */ + if (buftype == SA_SPILL || + P2ROUNDUP(hdrsize + sizeof (uint16_t), 8) + + *total < full_space) { + /* + * Account for header space used by array of + * optional sizes of variable-length attributes. + * Record the extra header size in case this + * increase needs to be reversed due to + * spill-over. + */ + hdrsize += sizeof (uint16_t); + if (*index != -1) + extra_hdrsize += sizeof (uint16_t); + } else { + ASSERT(buftype == SA_BONUS); + if (*index == -1) + *index = i; + *will_spill = B_TRUE; + continue; + } + } + + /* + * find index of where spill *could* occur. + * Then continue to count of remainder attribute + * space. The sum is used later for sizing bonus + * and spill buffer. + */ + if (buftype == SA_BONUS && *index == -1 && + (*total + P2ROUNDUP(hdrsize, 8)) > + (full_space - sizeof (blkptr_t))) { + *index = i; + } + + if ((*total + P2ROUNDUP(hdrsize, 8)) > full_space && + buftype == SA_BONUS) + *will_spill = B_TRUE; + } + + if (*will_spill) + hdrsize -= extra_hdrsize; + + hdrsize = P2ROUNDUP(hdrsize, 8); + return (hdrsize); +} + +#define BUF_SPACE_NEEDED(total, header) (total + header) + +/* + * Find layout that corresponds to ordering of attributes + * If not found a new layout number is created and added to + * persistent layout tables. 
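+ * + * A worked sizing example for the sa_find_sizes() logic feeding this + * function (the attribute counts are hypothetical): with three + * variable-sized attributes in the bonus buffer, hdrsize starts at + * sizeof (sa_hdr_phys_t) == 8 and grows by sizeof (uint16_t) for each + * variable-sized attribute after the first, giving 8 + 2 + 2 = 12, + * which P2ROUNDUP()s to a 16 byte header; that matches the "4 + 1" + * sa_lengths[] capacity noted in sa_byteswap() below.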
+ */ +static int +sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count, + dmu_tx_t *tx) +{ + sa_os_t *sa = hdl->sa_os->os_sa; + uint64_t hash; + sa_buf_type_t buftype; + sa_hdr_phys_t *sahdr; + void *data_start; + int buf_space; + sa_attr_type_t *attrs, *attrs_start; + int i, lot_count; + int dnodesize; + int hdrsize; + int spillhdrsize = 0; + int used; + dmu_object_type_t bonustype; + sa_lot_t *lot; + int len_idx; + int spill_used; + int bonuslen; + boolean_t spilling; + + dmu_buf_will_dirty(hdl->sa_bonus, tx); + bonustype = SA_BONUSTYPE_FROM_DB(hdl->sa_bonus); + dmu_object_dnsize_from_db(hdl->sa_bonus, &dnodesize); + bonuslen = DN_BONUS_SIZE(dnodesize); + + /* first determine bonus header size and sum of all attributes */ + hdrsize = sa_find_sizes(sa, attr_desc, attr_count, hdl->sa_bonus, + SA_BONUS, bonuslen, &i, &used, &spilling); + + if (used > SPA_OLD_MAXBLOCKSIZE) + return (SET_ERROR(EFBIG)); + + VERIFY(0 == dmu_set_bonus(hdl->sa_bonus, spilling ? + MIN(bonuslen - sizeof (blkptr_t), used + hdrsize) : + used + hdrsize, tx)); + + ASSERT((bonustype == DMU_OT_ZNODE && spilling == 0) || + bonustype == DMU_OT_SA); + + /* setup and size spill buffer when needed */ + if (spilling) { + boolean_t dummy; + + if (hdl->sa_spill == NULL) { + VERIFY(dmu_spill_hold_by_bonus(hdl->sa_bonus, NULL, + &hdl->sa_spill) == 0); + } + dmu_buf_will_dirty(hdl->sa_spill, tx); + + spillhdrsize = sa_find_sizes(sa, &attr_desc[i], + attr_count - i, hdl->sa_spill, SA_SPILL, + hdl->sa_spill->db_size, &i, &spill_used, &dummy); + + if (spill_used > SPA_OLD_MAXBLOCKSIZE) + return (SET_ERROR(EFBIG)); + + buf_space = hdl->sa_spill->db_size - spillhdrsize; + if (BUF_SPACE_NEEDED(spill_used, spillhdrsize) > + hdl->sa_spill->db_size) + VERIFY(0 == sa_resize_spill(hdl, + BUF_SPACE_NEEDED(spill_used, spillhdrsize), tx)); + } + + /* setup starting pointers to lay down data */ + data_start = (void *)((uintptr_t)hdl->sa_bonus->db_data + hdrsize); + sahdr = (sa_hdr_phys_t *)hdl->sa_bonus->db_data; + buftype = SA_BONUS; + + if (spilling) + buf_space = (sa->sa_force_spill) ? 
+ 0 : SA_BLKPTR_SPACE - hdrsize; + else + buf_space = hdl->sa_bonus->db_size - hdrsize; + + attrs_start = attrs = kmem_alloc(sizeof (sa_attr_type_t) * attr_count, + KM_SLEEP); + lot_count = 0; + + for (i = 0, len_idx = 0, hash = -1ULL; i != attr_count; i++) { + uint16_t length; + + ASSERT(IS_P2ALIGNED(data_start, 8)); + ASSERT(IS_P2ALIGNED(buf_space, 8)); + attrs[i] = attr_desc[i].sa_attr; + length = SA_REGISTERED_LEN(sa, attrs[i]); + if (length == 0) + length = attr_desc[i].sa_length; + else + VERIFY(length == attr_desc[i].sa_length); + + if (buf_space < length) { /* switch to spill buffer */ + VERIFY(spilling); + VERIFY(bonustype == DMU_OT_SA); + if (buftype == SA_BONUS && !sa->sa_force_spill) { + sa_find_layout(hdl->sa_os, hash, attrs_start, + lot_count, tx, &lot); + SA_SET_HDR(sahdr, lot->lot_num, hdrsize); + } + + buftype = SA_SPILL; + hash = -1ULL; + len_idx = 0; + + sahdr = (sa_hdr_phys_t *)hdl->sa_spill->db_data; + sahdr->sa_magic = SA_MAGIC; + data_start = (void *)((uintptr_t)sahdr + + spillhdrsize); + attrs_start = &attrs[i]; + buf_space = hdl->sa_spill->db_size - spillhdrsize; + lot_count = 0; + } + hash ^= SA_ATTR_HASH(attrs[i]); + attr_desc[i].sa_addr = data_start; + attr_desc[i].sa_size = length; + SA_COPY_DATA(attr_desc[i].sa_data_func, attr_desc[i].sa_data, + data_start, length); + if (sa->sa_attr_table[attrs[i]].sa_length == 0) { + sahdr->sa_lengths[len_idx++] = length; + } + VERIFY((uintptr_t)data_start % 8 == 0); + data_start = (void *)P2ROUNDUP(((uintptr_t)data_start + + length), 8); + buf_space -= P2ROUNDUP(length, 8); + lot_count++; + } + + sa_find_layout(hdl->sa_os, hash, attrs_start, lot_count, tx, &lot); + + /* + * Verify that old znodes always have layout number 0. + * Must be DMU_OT_SA for arbitrary layouts + */ + VERIFY((bonustype == DMU_OT_ZNODE && lot->lot_num == 0) || + (bonustype == DMU_OT_SA && lot->lot_num > 1)); + + if (bonustype == DMU_OT_SA) { + SA_SET_HDR(sahdr, lot->lot_num, + buftype == SA_BONUS ? hdrsize : spillhdrsize); + } + + kmem_free(attrs, sizeof (sa_attr_type_t) * attr_count); + if (hdl->sa_bonus_tab) { + sa_idx_tab_rele(hdl->sa_os, hdl->sa_bonus_tab); + hdl->sa_bonus_tab = NULL; + } + if (!sa->sa_force_spill) + VERIFY(0 == sa_build_index(hdl, SA_BONUS)); + if (hdl->sa_spill) { + sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab); + if (!spilling) { + /* + * remove spill block that is no longer needed. 
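+ * For instance, an update that shrinks a previously spilled + * attribute set back into the bonus buffer reaches this point with + * spilling == B_FALSE (a descriptive example, not an additional + * case), so the spill dbuf is released and the spill block removed + * below.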
+ */ + dmu_buf_rele(hdl->sa_spill, NULL); + hdl->sa_spill = NULL; + hdl->sa_spill_tab = NULL; + VERIFY(0 == dmu_rm_spill(hdl->sa_os, + sa_handle_object(hdl), tx)); + } else { + VERIFY(0 == sa_build_index(hdl, SA_SPILL)); + } + } + + return (0); +} + +static void +sa_free_attr_table(sa_os_t *sa) +{ + int i; + + if (sa->sa_attr_table == NULL) + return; + + for (i = 0; i != sa->sa_num_attrs; i++) { + if (sa->sa_attr_table[i].sa_name) + kmem_free(sa->sa_attr_table[i].sa_name, + strlen(sa->sa_attr_table[i].sa_name) + 1); + } + + kmem_free(sa->sa_attr_table, + sizeof (sa_attr_table_t) * sa->sa_num_attrs); + + sa->sa_attr_table = NULL; +} + +static int +sa_attr_table_setup(objset_t *os, sa_attr_reg_t *reg_attrs, int count) +{ + sa_os_t *sa = os->os_sa; + uint64_t sa_attr_count = 0; + uint64_t sa_reg_count = 0; + int error = 0; + uint64_t attr_value; + sa_attr_table_t *tb; + zap_cursor_t zc; + zap_attribute_t za; + int registered_count = 0; + int i; + dmu_objset_type_t ostype = dmu_objset_type(os); + + sa->sa_user_table = + kmem_zalloc(count * sizeof (sa_attr_type_t), KM_SLEEP); + sa->sa_user_table_sz = count * sizeof (sa_attr_type_t); + + if (sa->sa_reg_attr_obj != 0) { + error = zap_count(os, sa->sa_reg_attr_obj, + &sa_attr_count); + + /* + * Make sure we retrieved a count and that it isn't zero + */ + if (error || (error == 0 && sa_attr_count == 0)) { + if (error == 0) + error = SET_ERROR(EINVAL); + goto bail; + } + sa_reg_count = sa_attr_count; + } + + if (ostype == DMU_OST_ZFS && sa_attr_count == 0) + sa_attr_count += sa_legacy_attr_count; + + /* Allocate attribute numbers for attributes that aren't registered */ + for (i = 0; i != count; i++) { + boolean_t found = B_FALSE; + int j; + + if (ostype == DMU_OST_ZFS) { + for (j = 0; j != sa_legacy_attr_count; j++) { + if (strcmp(reg_attrs[i].sa_name, + sa_legacy_attrs[j].sa_name) == 0) { + sa->sa_user_table[i] = + sa_legacy_attrs[j].sa_attr; + found = B_TRUE; + } + } + } + if (found) + continue; + + if (sa->sa_reg_attr_obj) + error = zap_lookup(os, sa->sa_reg_attr_obj, + reg_attrs[i].sa_name, 8, 1, &attr_value); + else + error = SET_ERROR(ENOENT); + switch (error) { + case ENOENT: + sa->sa_user_table[i] = (sa_attr_type_t)sa_attr_count; + sa_attr_count++; + break; + case 0: + sa->sa_user_table[i] = ATTR_NUM(attr_value); + break; + default: + goto bail; + } + } + + sa->sa_num_attrs = sa_attr_count; + tb = sa->sa_attr_table = + kmem_zalloc(sizeof (sa_attr_table_t) * sa_attr_count, KM_SLEEP); + + /* + * Attribute table is constructed from requested attribute list, + * previously foreign registered attributes, and also the legacy + * ZPL set of attributes. 
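+ * + * A hypothetical caller-side registration array, following the + * conventions of sa_legacy_attrs[] above (the array itself is not + * part of this file; a length of 0 marks a variable-sized attribute): + * + * sa_attr_reg_t zpl_attrs[] = { + * { "ZPL_ATIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 0 }, + * { "ZPL_DACL_ACES", 0, SA_ACL, 1 }, + * }; + * sa_attr_type_t *attr_table; + * + * error = sa_setup(os, sa_obj, zpl_attrs, 2, &attr_table);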
+ */ + + if (sa->sa_reg_attr_obj) { + for (zap_cursor_init(&zc, os, sa->sa_reg_attr_obj); + (error = zap_cursor_retrieve(&zc, &za)) == 0; + zap_cursor_advance(&zc)) { + uint64_t value; + value = za.za_first_integer; + + registered_count++; + tb[ATTR_NUM(value)].sa_attr = ATTR_NUM(value); + tb[ATTR_NUM(value)].sa_length = ATTR_LENGTH(value); + tb[ATTR_NUM(value)].sa_byteswap = ATTR_BSWAP(value); + tb[ATTR_NUM(value)].sa_registered = B_TRUE; + + if (tb[ATTR_NUM(value)].sa_name) { + continue; + } + tb[ATTR_NUM(value)].sa_name = + kmem_zalloc(strlen(za.za_name) +1, KM_SLEEP); + (void) strlcpy(tb[ATTR_NUM(value)].sa_name, za.za_name, + strlen(za.za_name) +1); + } + zap_cursor_fini(&zc); + /* + * Make sure we processed the correct number of registered + * attributes + */ + if (registered_count != sa_reg_count) { + ASSERT(error != 0); + goto bail; + } + + } + + if (ostype == DMU_OST_ZFS) { + for (i = 0; i != sa_legacy_attr_count; i++) { + if (tb[i].sa_name) + continue; + tb[i].sa_attr = sa_legacy_attrs[i].sa_attr; + tb[i].sa_length = sa_legacy_attrs[i].sa_length; + tb[i].sa_byteswap = sa_legacy_attrs[i].sa_byteswap; + tb[i].sa_registered = B_FALSE; + tb[i].sa_name = + kmem_zalloc(strlen(sa_legacy_attrs[i].sa_name) +1, + KM_SLEEP); + (void) strlcpy(tb[i].sa_name, + sa_legacy_attrs[i].sa_name, + strlen(sa_legacy_attrs[i].sa_name) + 1); + } + } + + for (i = 0; i != count; i++) { + sa_attr_type_t attr_id; + + attr_id = sa->sa_user_table[i]; + if (tb[attr_id].sa_name) + continue; + + tb[attr_id].sa_length = reg_attrs[i].sa_length; + tb[attr_id].sa_byteswap = reg_attrs[i].sa_byteswap; + tb[attr_id].sa_attr = attr_id; + tb[attr_id].sa_name = + kmem_zalloc(strlen(reg_attrs[i].sa_name) + 1, KM_SLEEP); + (void) strlcpy(tb[attr_id].sa_name, reg_attrs[i].sa_name, + strlen(reg_attrs[i].sa_name) + 1); + } + + sa->sa_need_attr_registration = + (sa_attr_count != registered_count); + + return (0); +bail: + kmem_free(sa->sa_user_table, count * sizeof (sa_attr_type_t)); + sa->sa_user_table = NULL; + sa_free_attr_table(sa); + return ((error != 0) ? 
error : EINVAL); +} + +int +sa_setup(objset_t *os, uint64_t sa_obj, sa_attr_reg_t *reg_attrs, int count, + sa_attr_type_t **user_table) +{ + zap_cursor_t zc; + zap_attribute_t za; + sa_os_t *sa; + dmu_objset_type_t ostype = dmu_objset_type(os); + sa_attr_type_t *tb; + int error; + + mutex_enter(&os->os_user_ptr_lock); + if (os->os_sa) { + mutex_enter(&os->os_sa->sa_lock); + mutex_exit(&os->os_user_ptr_lock); + tb = os->os_sa->sa_user_table; + mutex_exit(&os->os_sa->sa_lock); + *user_table = tb; + return (0); + } + + sa = kmem_zalloc(sizeof (sa_os_t), KM_SLEEP); + mutex_init(&sa->sa_lock, NULL, MUTEX_DEFAULT, NULL); + sa->sa_master_obj = sa_obj; + + os->os_sa = sa; + mutex_enter(&sa->sa_lock); + mutex_exit(&os->os_user_ptr_lock); + avl_create(&sa->sa_layout_num_tree, layout_num_compare, + sizeof (sa_lot_t), offsetof(sa_lot_t, lot_num_node)); + avl_create(&sa->sa_layout_hash_tree, layout_hash_compare, + sizeof (sa_lot_t), offsetof(sa_lot_t, lot_hash_node)); + + if (sa_obj) { + error = zap_lookup(os, sa_obj, SA_LAYOUTS, + 8, 1, &sa->sa_layout_attr_obj); + if (error != 0 && error != ENOENT) + goto fail; + error = zap_lookup(os, sa_obj, SA_REGISTRY, + 8, 1, &sa->sa_reg_attr_obj); + if (error != 0 && error != ENOENT) + goto fail; + } + + if ((error = sa_attr_table_setup(os, reg_attrs, count)) != 0) + goto fail; + + if (sa->sa_layout_attr_obj != 0) { + uint64_t layout_count; + + error = zap_count(os, sa->sa_layout_attr_obj, + &layout_count); + + /* + * Layout number count should be > 0 + */ + if (error || (error == 0 && layout_count == 0)) { + if (error == 0) + error = SET_ERROR(EINVAL); + goto fail; + } + + for (zap_cursor_init(&zc, os, sa->sa_layout_attr_obj); + (error = zap_cursor_retrieve(&zc, &za)) == 0; + zap_cursor_advance(&zc)) { + sa_attr_type_t *lot_attrs; + uint64_t lot_num; + + lot_attrs = kmem_zalloc(sizeof (sa_attr_type_t) * + za.za_num_integers, KM_SLEEP); + + if ((error = (zap_lookup(os, sa->sa_layout_attr_obj, + za.za_name, 2, za.za_num_integers, + lot_attrs))) != 0) { + kmem_free(lot_attrs, sizeof (sa_attr_type_t) * + za.za_num_integers); + break; + } + VERIFY(ddi_strtoull(za.za_name, NULL, 10, + (unsigned long long *)&lot_num) == 0); + + (void) sa_add_layout_entry(os, lot_attrs, + za.za_num_integers, lot_num, + sa_layout_info_hash(lot_attrs, + za.za_num_integers), B_FALSE, NULL); + kmem_free(lot_attrs, sizeof (sa_attr_type_t) * + za.za_num_integers); + } + zap_cursor_fini(&zc); + + /* + * Make sure layout count matches number of entries added + * to AVL tree + */ + if (avl_numnodes(&sa->sa_layout_num_tree) != layout_count) { + ASSERT(error != 0); + goto fail; + } + } + + /* Add special layout number for old ZNODES */ + if (ostype == DMU_OST_ZFS) { + (void) sa_add_layout_entry(os, sa_legacy_zpl_layout, + sa_legacy_attr_count, 0, + sa_layout_info_hash(sa_legacy_zpl_layout, + sa_legacy_attr_count), B_FALSE, NULL); + + (void) sa_add_layout_entry(os, sa_dummy_zpl_layout, 0, 1, + 0, B_FALSE, NULL); + } + *user_table = os->os_sa->sa_user_table; + mutex_exit(&sa->sa_lock); + return (0); +fail: + os->os_sa = NULL; + sa_free_attr_table(sa); + if (sa->sa_user_table) + kmem_free(sa->sa_user_table, sa->sa_user_table_sz); + mutex_exit(&sa->sa_lock); + avl_destroy(&sa->sa_layout_hash_tree); + avl_destroy(&sa->sa_layout_num_tree); + mutex_destroy(&sa->sa_lock); + kmem_free(sa, sizeof (sa_os_t)); + return ((error == ECKSUM) ? 
EIO : error); +} + +void +sa_tear_down(objset_t *os) +{ + sa_os_t *sa = os->os_sa; + sa_lot_t *layout; + void *cookie; + + kmem_free(sa->sa_user_table, sa->sa_user_table_sz); + + /* Free up attr table */ + + sa_free_attr_table(sa); + + cookie = NULL; + while (layout = avl_destroy_nodes(&sa->sa_layout_hash_tree, &cookie)) { + sa_idx_tab_t *tab; + while (tab = list_head(&layout->lot_idx_tab)) { + ASSERT(refcount_count(&tab->sa_refcount)); + sa_idx_tab_rele(os, tab); + } + } + + cookie = NULL; + while (layout = avl_destroy_nodes(&sa->sa_layout_num_tree, &cookie)) { + kmem_free(layout->lot_attrs, + sizeof (sa_attr_type_t) * layout->lot_attr_count); + kmem_free(layout, sizeof (sa_lot_t)); + } + + avl_destroy(&sa->sa_layout_hash_tree); + avl_destroy(&sa->sa_layout_num_tree); + mutex_destroy(&sa->sa_lock); + + kmem_free(sa, sizeof (sa_os_t)); + os->os_sa = NULL; +} + +void +sa_build_idx_tab(void *hdr, void *attr_addr, sa_attr_type_t attr, + uint16_t length, int length_idx, boolean_t var_length, void *userp) +{ + sa_idx_tab_t *idx_tab = userp; + + if (var_length) { + ASSERT(idx_tab->sa_variable_lengths); + idx_tab->sa_variable_lengths[length_idx] = length; + } + TOC_ATTR_ENCODE(idx_tab->sa_idx_tab[attr], length_idx, + (uint32_t)((uintptr_t)attr_addr - (uintptr_t)hdr)); +} + +static void +sa_attr_iter(objset_t *os, sa_hdr_phys_t *hdr, dmu_object_type_t type, + sa_iterfunc_t func, sa_lot_t *tab, void *userp) +{ + void *data_start; + sa_lot_t *tb = tab; + sa_lot_t search; + avl_index_t loc; + sa_os_t *sa = os->os_sa; + int i; + uint16_t *length_start = NULL; + uint8_t length_idx = 0; + + if (tab == NULL) { + search.lot_num = SA_LAYOUT_NUM(hdr, type); + tb = avl_find(&sa->sa_layout_num_tree, &search, &loc); + ASSERT(tb); + } + + if (IS_SA_BONUSTYPE(type)) { + data_start = (void *)P2ROUNDUP(((uintptr_t)hdr + + offsetof(sa_hdr_phys_t, sa_lengths) + + (sizeof (uint16_t) * tb->lot_var_sizes)), 8); + length_start = hdr->sa_lengths; + } else { + data_start = hdr; + } + + for (i = 0; i != tb->lot_attr_count; i++) { + int attr_length, reg_length; + uint8_t idx_len; + + reg_length = sa->sa_attr_table[tb->lot_attrs[i]].sa_length; + if (reg_length) { + attr_length = reg_length; + idx_len = 0; + } else { + attr_length = length_start[length_idx]; + idx_len = length_idx++; + } + + func(hdr, data_start, tb->lot_attrs[i], attr_length, + idx_len, reg_length == 0 ? 
B_TRUE : B_FALSE, userp);
+
+ data_start = (void *)P2ROUNDUP(((uintptr_t)data_start +
+ attr_length), 8);
+ }
+}
+
+/*ARGSUSED*/
+void
+sa_byteswap_cb(void *hdr, void *attr_addr, sa_attr_type_t attr,
+ uint16_t length, int length_idx, boolean_t variable_length, void *userp)
+{
+ sa_handle_t *hdl = userp;
+ sa_os_t *sa = hdl->sa_os->os_sa;
+
+ sa_bswap_table[sa->sa_attr_table[attr].sa_byteswap](attr_addr, length);
+}
+
+void
+sa_byteswap(sa_handle_t *hdl, sa_buf_type_t buftype)
+{
+ sa_hdr_phys_t *sa_hdr_phys = SA_GET_HDR(hdl, buftype);
+ dmu_buf_impl_t *db;
+ sa_os_t *sa = hdl->sa_os->os_sa;
+ int num_lengths = 1;
+ int i;
+
+ ASSERT(MUTEX_HELD(&sa->sa_lock));
+ if (sa_hdr_phys->sa_magic == SA_MAGIC)
+ return;
+
+ db = SA_GET_DB(hdl, buftype);
+
+ if (buftype == SA_SPILL) {
+ arc_release(db->db_buf, NULL);
+ arc_buf_thaw(db->db_buf);
+ }
+
+ sa_hdr_phys->sa_magic = BSWAP_32(sa_hdr_phys->sa_magic);
+ sa_hdr_phys->sa_layout_info = BSWAP_16(sa_hdr_phys->sa_layout_info);
+
+ /*
+ * Determine the number of variable lengths in the header.
+ * The standard 8-byte header has one for free, and a
+ * 16-byte header would have 4 + 1.
+ */
+ if (SA_HDR_SIZE(sa_hdr_phys) > 8)
+ num_lengths += (SA_HDR_SIZE(sa_hdr_phys) - 8) >> 1;
+ for (i = 0; i != num_lengths; i++)
+ sa_hdr_phys->sa_lengths[i] =
+ BSWAP_16(sa_hdr_phys->sa_lengths[i]);
+
+ sa_attr_iter(hdl->sa_os, sa_hdr_phys, DMU_OT_SA,
+ sa_byteswap_cb, NULL, hdl);
+
+ if (buftype == SA_SPILL)
+ arc_buf_freeze(((dmu_buf_impl_t *)hdl->sa_spill)->db_buf);
+}
+
+static int
+sa_build_index(sa_handle_t *hdl, sa_buf_type_t buftype)
+{
+ sa_hdr_phys_t *sa_hdr_phys;
+ dmu_buf_impl_t *db = SA_GET_DB(hdl, buftype);
+ dmu_object_type_t bonustype = SA_BONUSTYPE_FROM_DB(db);
+ sa_os_t *sa = hdl->sa_os->os_sa;
+ sa_idx_tab_t *idx_tab;
+
+ sa_hdr_phys = SA_GET_HDR(hdl, buftype);
+
+ mutex_enter(&sa->sa_lock);
+
+ /* Do we need to byteswap?
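+ * The on-disk sa_magic answers that: a header written on a host of
+ * the other endianness reads back as BSWAP_32(SA_MAGIC), in which
+ * case sa_byteswap() is run before the index is built.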
*/ + + /* only check if not old znode */ + if (IS_SA_BONUSTYPE(bonustype) && sa_hdr_phys->sa_magic != SA_MAGIC && + sa_hdr_phys->sa_magic != 0) { + VERIFY(BSWAP_32(sa_hdr_phys->sa_magic) == SA_MAGIC); + sa_byteswap(hdl, buftype); + } + + idx_tab = sa_find_idx_tab(hdl->sa_os, bonustype, sa_hdr_phys); + + if (buftype == SA_BONUS) + hdl->sa_bonus_tab = idx_tab; + else + hdl->sa_spill_tab = idx_tab; + + mutex_exit(&sa->sa_lock); + return (0); +} + +/*ARGSUSED*/ +static void +sa_evict_sync(void *dbu) +{ + panic("evicting sa dbuf\n"); +} + +static void +sa_idx_tab_rele(objset_t *os, void *arg) +{ + sa_os_t *sa = os->os_sa; + sa_idx_tab_t *idx_tab = arg; + + if (idx_tab == NULL) + return; + + mutex_enter(&sa->sa_lock); + if (refcount_remove(&idx_tab->sa_refcount, NULL) == 0) { + list_remove(&idx_tab->sa_layout->lot_idx_tab, idx_tab); + if (idx_tab->sa_variable_lengths) + kmem_free(idx_tab->sa_variable_lengths, + sizeof (uint16_t) * + idx_tab->sa_layout->lot_var_sizes); + refcount_destroy(&idx_tab->sa_refcount); + kmem_free(idx_tab->sa_idx_tab, + sizeof (uint32_t) * sa->sa_num_attrs); + kmem_free(idx_tab, sizeof (sa_idx_tab_t)); + } + mutex_exit(&sa->sa_lock); +} + +static void +sa_idx_tab_hold(objset_t *os, sa_idx_tab_t *idx_tab) +{ + sa_os_t *sa = os->os_sa; + + ASSERT(MUTEX_HELD(&sa->sa_lock)); + (void) refcount_add(&idx_tab->sa_refcount, NULL); +} + +void +sa_handle_destroy(sa_handle_t *hdl) +{ + dmu_buf_t *db = hdl->sa_bonus; + + mutex_enter(&hdl->sa_lock); + (void) dmu_buf_remove_user(db, &hdl->sa_dbu); + + if (hdl->sa_bonus_tab) + sa_idx_tab_rele(hdl->sa_os, hdl->sa_bonus_tab); + + if (hdl->sa_spill_tab) + sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab); + + dmu_buf_rele(hdl->sa_bonus, NULL); + + if (hdl->sa_spill) + dmu_buf_rele((dmu_buf_t *)hdl->sa_spill, NULL); + mutex_exit(&hdl->sa_lock); + + kmem_cache_free(sa_cache, hdl); +} + +int +sa_handle_get_from_db(objset_t *os, dmu_buf_t *db, void *userp, + sa_handle_type_t hdl_type, sa_handle_t **handlepp) +{ + int error = 0; + dmu_object_info_t doi; + sa_handle_t *handle = NULL; + +#ifdef ZFS_DEBUG + dmu_object_info_from_db(db, &doi); + ASSERT(doi.doi_bonus_type == DMU_OT_SA || + doi.doi_bonus_type == DMU_OT_ZNODE); +#endif + /* find handle, if it exists */ + /* if one doesn't exist then create a new one, and initialize it */ + + if (hdl_type == SA_HDL_SHARED) + handle = dmu_buf_get_user(db); + + if (handle == NULL) { + sa_handle_t *winner = NULL; + + handle = kmem_cache_alloc(sa_cache, KM_SLEEP); + handle->sa_dbu.dbu_evict_func_sync = NULL; + handle->sa_dbu.dbu_evict_func_async = NULL; + handle->sa_userp = userp; + handle->sa_bonus = db; + handle->sa_os = os; + handle->sa_spill = NULL; + handle->sa_bonus_tab = NULL; + handle->sa_spill_tab = NULL; + + error = sa_build_index(handle, SA_BONUS); + + if (hdl_type == SA_HDL_SHARED) { + dmu_buf_init_user(&handle->sa_dbu, sa_evict_sync, NULL, + NULL); + winner = dmu_buf_set_user_ie(db, &handle->sa_dbu); + } + + if (winner != NULL) { + kmem_cache_free(sa_cache, handle); + handle = winner; + } + } + *handlepp = handle; + + return (error); +} + +int +sa_handle_get(objset_t *objset, uint64_t objid, void *userp, + sa_handle_type_t hdl_type, sa_handle_t **handlepp) +{ + dmu_buf_t *db; + int error; + + if (error = dmu_bonus_hold(objset, objid, NULL, &db)) + return (error); + + return (sa_handle_get_from_db(objset, db, userp, hdl_type, + handlepp)); +} + +int +sa_buf_hold(objset_t *objset, uint64_t obj_num, void *tag, dmu_buf_t **db) +{ + return (dmu_bonus_hold(objset, obj_num, tag, db)); +} + +void 
+sa_buf_rele(dmu_buf_t *db, void *tag)
+{
+ dmu_buf_rele(db, tag);
+}
+
+int
+sa_lookup_impl(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count)
+{
+ ASSERT(hdl);
+ ASSERT(MUTEX_HELD(&hdl->sa_lock));
+ return (sa_attr_op(hdl, bulk, count, SA_LOOKUP, NULL));
+}
+
+int
+sa_lookup(sa_handle_t *hdl, sa_attr_type_t attr, void *buf, uint32_t buflen)
+{
+ int error;
+ sa_bulk_attr_t bulk;
+
+ bulk.sa_attr = attr;
+ bulk.sa_data = buf;
+ bulk.sa_length = buflen;
+ bulk.sa_data_func = NULL;
+
+ ASSERT(hdl);
+ mutex_enter(&hdl->sa_lock);
+ error = sa_lookup_impl(hdl, &bulk, 1);
+ mutex_exit(&hdl->sa_lock);
+ return (error);
+}
+
+#ifdef _KERNEL
+int
+sa_lookup_uio(sa_handle_t *hdl, sa_attr_type_t attr, uio_t *uio)
+{
+ int error;
+ sa_bulk_attr_t bulk;
+
+ bulk.sa_data = NULL;
+ bulk.sa_attr = attr;
+ bulk.sa_data_func = NULL;
+
+ ASSERT(hdl);
+
+ mutex_enter(&hdl->sa_lock);
+ if ((error = sa_attr_op(hdl, &bulk, 1, SA_LOOKUP, NULL)) == 0) {
+ error = uiomove((void *)bulk.sa_addr, MIN(bulk.sa_size,
+ uio->uio_resid), UIO_READ, uio);
+ }
+ mutex_exit(&hdl->sa_lock);
+ return (error);
+}
+#endif
+
+static sa_idx_tab_t *
+sa_find_idx_tab(objset_t *os, dmu_object_type_t bonustype, sa_hdr_phys_t *hdr)
+{
+ sa_idx_tab_t *idx_tab;
+ sa_os_t *sa = os->os_sa;
+ sa_lot_t *tb, search;
+ avl_index_t loc;
+
+ /*
+ * Determine the layout number. If this is an SA node and the header
+ * is 0, then force the index table to the dummy "1" empty layout.
+ *
+ * The layout number would only be zero for a newly created file
+ * that has not added any attributes yet, or with crypto enabled, which
+ * doesn't write any attributes to the bonus buffer.
+ */
+
+ search.lot_num = SA_LAYOUT_NUM(hdr, bonustype);
+
+ tb = avl_find(&sa->sa_layout_num_tree, &search, &loc);
+
+ /* Verify header size is consistent with layout information */
+ ASSERT(tb);
+ ASSERT(IS_SA_BONUSTYPE(bonustype) &&
+ SA_HDR_SIZE_MATCH_LAYOUT(hdr, tb) || !IS_SA_BONUSTYPE(bonustype) ||
+ (IS_SA_BONUSTYPE(bonustype) && hdr->sa_layout_info == 0));
+
+ /*
+ * See if any of the already existing TOC entries can be reused.
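+ * A cached index table is reusable only when every variable-length
+ * attribute in this header matches the lengths the table was built
+ * with; e.g. (sizes made up for illustration) a header whose two
+ * variable-size attributes are 12 and 20 bytes only matches a TOC
+ * that also recorded 12 and 20.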
+ */
+
+ for (idx_tab = list_head(&tb->lot_idx_tab); idx_tab;
+ idx_tab = list_next(&tb->lot_idx_tab, idx_tab)) {
+ boolean_t valid_idx = B_TRUE;
+ int i;
+
+ if (tb->lot_var_sizes != 0 &&
+ idx_tab->sa_variable_lengths != NULL) {
+ for (i = 0; i != tb->lot_var_sizes; i++) {
+ if (hdr->sa_lengths[i] !=
+ idx_tab->sa_variable_lengths[i]) {
+ valid_idx = B_FALSE;
+ break;
+ }
+ }
+ }
+ if (valid_idx) {
+ sa_idx_tab_hold(os, idx_tab);
+ return (idx_tab);
+ }
+ }
+
+ /* No such luck, create a new entry */
+ idx_tab = kmem_zalloc(sizeof (sa_idx_tab_t), KM_SLEEP);
+ idx_tab->sa_idx_tab =
+ kmem_zalloc(sizeof (uint32_t) * sa->sa_num_attrs, KM_SLEEP);
+ idx_tab->sa_layout = tb;
+ refcount_create(&idx_tab->sa_refcount);
+ if (tb->lot_var_sizes)
+ idx_tab->sa_variable_lengths = kmem_alloc(sizeof (uint16_t) *
+ tb->lot_var_sizes, KM_SLEEP);
+
+ sa_attr_iter(os, hdr, bonustype, sa_build_idx_tab,
+ tb, idx_tab);
+ sa_idx_tab_hold(os, idx_tab); /* one hold for consumer */
+ sa_idx_tab_hold(os, idx_tab); /* one for layout */
+ list_insert_tail(&tb->lot_idx_tab, idx_tab);
+ return (idx_tab);
+}
+
+void
+sa_default_locator(void **dataptr, uint32_t *len, uint32_t total_len,
+ boolean_t start, void *userdata)
+{
+ ASSERT(start);
+
+ *dataptr = userdata;
+ *len = total_len;
+}
+
+static void
+sa_attr_register_sync(sa_handle_t *hdl, dmu_tx_t *tx)
+{
+ uint64_t attr_value = 0;
+ sa_os_t *sa = hdl->sa_os->os_sa;
+ sa_attr_table_t *tb = sa->sa_attr_table;
+ int i;
+
+ mutex_enter(&sa->sa_lock);
+
+ if (!sa->sa_need_attr_registration || sa->sa_master_obj == 0) {
+ mutex_exit(&sa->sa_lock);
+ return;
+ }
+
+ if (sa->sa_reg_attr_obj == 0) {
+ sa->sa_reg_attr_obj = zap_create_link(hdl->sa_os,
+ DMU_OT_SA_ATTR_REGISTRATION,
+ sa->sa_master_obj, SA_REGISTRY, tx);
+ }
+ for (i = 0; i != sa->sa_num_attrs; i++) {
+ if (sa->sa_attr_table[i].sa_registered)
+ continue;
+ ATTR_ENCODE(attr_value, tb[i].sa_attr, tb[i].sa_length,
+ tb[i].sa_byteswap);
+ VERIFY(0 == zap_update(hdl->sa_os, sa->sa_reg_attr_obj,
+ tb[i].sa_name, 8, 1, &attr_value, tx));
+ tb[i].sa_registered = B_TRUE;
+ }
+ sa->sa_need_attr_registration = B_FALSE;
+ mutex_exit(&sa->sa_lock);
+}
+
+/*
+ * Replace all attributes with the attributes specified in the template.
+ * If the dnode had a spill buffer, then those attributes will also be
+ * replaced, possibly with just an empty spill block.
+ *
+ * This interface is intended to only be used for bulk adding of
+ * attributes for a new file. It will also be used by the ZPL
+ * when converting an old-format znode to native SA support.
+ */
+int
+sa_replace_all_by_template_locked(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc,
+ int attr_count, dmu_tx_t *tx)
+{
+ sa_os_t *sa = hdl->sa_os->os_sa;
+
+ if (sa->sa_need_attr_registration)
+ sa_attr_register_sync(hdl, tx);
+ return (sa_build_layouts(hdl, attr_desc, attr_count, tx));
+}
+
+int
+sa_replace_all_by_template(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc,
+ int attr_count, dmu_tx_t *tx)
+{
+ int error;
+
+ mutex_enter(&hdl->sa_lock);
+ error = sa_replace_all_by_template_locked(hdl, attr_desc,
+ attr_count, tx);
+ mutex_exit(&hdl->sa_lock);
+ return (error);
+}
+
+/*
+ * Add/remove a single attribute or replace a variable-sized attribute value
+ * with a value of a different size, and then rewrite the entire set
+ * of attributes.
+ * Same-length attribute value replacement (including fixed-length attributes)
+ * is handled more efficiently by the upper layers.
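+ *
+ * As a sketch of the calling convention (sa_remove() below is the
+ * real consumer), removing an attribute funnels through here as:
+ *
+ *	mutex_enter(&hdl->sa_lock);
+ *	error = sa_modify_attrs(hdl, attr, SA_REMOVE, NULL, NULL, 0, tx);
+ *	mutex_exit(&hdl->sa_lock);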
+ */
+static int
+sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr,
+ sa_data_op_t action, sa_data_locator_t *locator, void *datastart,
+ uint16_t buflen, dmu_tx_t *tx)
+{
+ sa_os_t *sa = hdl->sa_os->os_sa;
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus;
+ dnode_t *dn;
+ sa_bulk_attr_t *attr_desc;
+ void *old_data[2];
+ int bonus_attr_count = 0;
+ int bonus_data_size = 0;
+ int spill_data_size = 0;
+ int spill_attr_count = 0;
+ int error;
+ uint16_t length, reg_length;
+ int i, j, k, length_idx;
+ sa_hdr_phys_t *hdr;
+ sa_idx_tab_t *idx_tab;
+ int attr_count;
+ int count;
+
+ ASSERT(MUTEX_HELD(&hdl->sa_lock));
+
+ /* First make a copy of the old data */
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ if (dn->dn_bonuslen != 0) {
+ bonus_data_size = hdl->sa_bonus->db_size;
+ old_data[0] = kmem_alloc(bonus_data_size, KM_SLEEP);
+ bcopy(hdl->sa_bonus->db_data, old_data[0],
+ hdl->sa_bonus->db_size);
+ bonus_attr_count = hdl->sa_bonus_tab->sa_layout->lot_attr_count;
+ } else {
+ old_data[0] = NULL;
+ }
+ DB_DNODE_EXIT(db);
+
+ /* Bring spill buffer online if it isn't currently */
+
+ if ((error = sa_get_spill(hdl)) == 0) {
+ spill_data_size = hdl->sa_spill->db_size;
+ old_data[1] = kmem_alloc(spill_data_size, KM_SLEEP);
+ bcopy(hdl->sa_spill->db_data, old_data[1],
+ hdl->sa_spill->db_size);
+ spill_attr_count =
+ hdl->sa_spill_tab->sa_layout->lot_attr_count;
+ } else if (error && error != ENOENT) {
+ if (old_data[0])
+ kmem_free(old_data[0], bonus_data_size);
+ return (error);
+ } else {
+ old_data[1] = NULL;
+ }
+
+ /* build descriptor of all attributes */
+
+ attr_count = bonus_attr_count + spill_attr_count;
+ if (action == SA_ADD)
+ attr_count++;
+ else if (action == SA_REMOVE)
+ attr_count--;
+
+ attr_desc = kmem_zalloc(sizeof (sa_bulk_attr_t) * attr_count, KM_SLEEP);
+
+ /*
+ * Loop through the bonus and the spill buffer (if it exists), and
+ * build up a new attr descriptor to reset the attributes.
+ */
+ k = j = 0;
+ count = bonus_attr_count;
+ hdr = SA_GET_HDR(hdl, SA_BONUS);
+ idx_tab = SA_IDX_TAB_GET(hdl, SA_BONUS);
+ for (; k != 2; k++) {
+ /*
+ * Iterate over each attribute in layout. Fetch the
+ * size of variable-length attributes needing rewrite
+ * from sa_lengths[].
+ */
+ for (i = 0, length_idx = 0; i != count; i++) {
+ sa_attr_type_t attr;
+
+ attr = idx_tab->sa_layout->lot_attrs[i];
+ reg_length = SA_REGISTERED_LEN(sa, attr);
+ if (reg_length == 0) {
+ length = hdr->sa_lengths[length_idx];
+ length_idx++;
+ } else {
+ length = reg_length;
+ }
+ if (attr == newattr) {
+ /*
+ * There is nothing to do for SA_REMOVE,
+ * so it is just skipped.
+ */
+ if (action == SA_REMOVE)
+ continue;
+
+ /*
+ * Duplicate attributes are not allowed, so the
+ * action cannot be SA_ADD here.
+ */
+ ASSERT3S(action, ==, SA_REPLACE);
+
+ /*
+ * Only a variable-sized attribute can be
+ * replaced here, and its size must be changing.
+ */ + ASSERT3U(reg_length, ==, 0); + ASSERT3U(length, !=, buflen); + SA_ADD_BULK_ATTR(attr_desc, j, attr, + locator, datastart, buflen); + } else { + SA_ADD_BULK_ATTR(attr_desc, j, attr, + NULL, (void *) + (TOC_OFF(idx_tab->sa_idx_tab[attr]) + + (uintptr_t)old_data[k]), length); + } + } + if (k == 0 && hdl->sa_spill) { + hdr = SA_GET_HDR(hdl, SA_SPILL); + idx_tab = SA_IDX_TAB_GET(hdl, SA_SPILL); + count = spill_attr_count; + } else { + break; + } + } + if (action == SA_ADD) { + reg_length = SA_REGISTERED_LEN(sa, newattr); + IMPLY(reg_length != 0, reg_length == buflen); + SA_ADD_BULK_ATTR(attr_desc, j, newattr, locator, + datastart, buflen); + } + ASSERT3U(j, ==, attr_count); + + error = sa_build_layouts(hdl, attr_desc, attr_count, tx); + + if (old_data[0]) + kmem_free(old_data[0], bonus_data_size); + if (old_data[1]) + kmem_free(old_data[1], spill_data_size); + kmem_free(attr_desc, sizeof (sa_bulk_attr_t) * attr_count); + + return (error); +} + +static int +sa_bulk_update_impl(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count, + dmu_tx_t *tx) +{ + int error; + sa_os_t *sa = hdl->sa_os->os_sa; + dmu_object_type_t bonustype; + + bonustype = SA_BONUSTYPE_FROM_DB(SA_GET_DB(hdl, SA_BONUS)); + + ASSERT(hdl); + ASSERT(MUTEX_HELD(&hdl->sa_lock)); + + /* sync out registration table if necessary */ + if (sa->sa_need_attr_registration) + sa_attr_register_sync(hdl, tx); + + error = sa_attr_op(hdl, bulk, count, SA_UPDATE, tx); + if (error == 0 && !IS_SA_BONUSTYPE(bonustype) && sa->sa_update_cb) + sa->sa_update_cb(hdl, tx); + + return (error); +} + +/* + * update or add new attribute + */ +int +sa_update(sa_handle_t *hdl, sa_attr_type_t type, + void *buf, uint32_t buflen, dmu_tx_t *tx) +{ + int error; + sa_bulk_attr_t bulk; + + bulk.sa_attr = type; + bulk.sa_data_func = NULL; + bulk.sa_length = buflen; + bulk.sa_data = buf; + + mutex_enter(&hdl->sa_lock); + error = sa_bulk_update_impl(hdl, &bulk, 1, tx); + mutex_exit(&hdl->sa_lock); + return (error); +} + +int +sa_update_from_cb(sa_handle_t *hdl, sa_attr_type_t attr, + uint32_t buflen, sa_data_locator_t *locator, void *userdata, dmu_tx_t *tx) +{ + int error; + sa_bulk_attr_t bulk; + + bulk.sa_attr = attr; + bulk.sa_data = userdata; + bulk.sa_data_func = locator; + bulk.sa_length = buflen; + + mutex_enter(&hdl->sa_lock); + error = sa_bulk_update_impl(hdl, &bulk, 1, tx); + mutex_exit(&hdl->sa_lock); + return (error); +} + +/* + * Return size of an attribute + */ + +int +sa_size(sa_handle_t *hdl, sa_attr_type_t attr, int *size) +{ + sa_bulk_attr_t bulk; + int error; + + bulk.sa_data = NULL; + bulk.sa_attr = attr; + bulk.sa_data_func = NULL; + + ASSERT(hdl); + mutex_enter(&hdl->sa_lock); + if ((error = sa_attr_op(hdl, &bulk, 1, SA_LOOKUP, NULL)) != 0) { + mutex_exit(&hdl->sa_lock); + return (error); + } + *size = bulk.sa_size; + + mutex_exit(&hdl->sa_lock); + return (0); +} + +int +sa_bulk_lookup_locked(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count) +{ + ASSERT(hdl); + ASSERT(MUTEX_HELD(&hdl->sa_lock)); + return (sa_lookup_impl(hdl, attrs, count)); +} + +int +sa_bulk_lookup(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count) +{ + int error; + + ASSERT(hdl); + mutex_enter(&hdl->sa_lock); + error = sa_bulk_lookup_locked(hdl, attrs, count); + mutex_exit(&hdl->sa_lock); + return (error); +} + +int +sa_bulk_update(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count, dmu_tx_t *tx) +{ + int error; + + ASSERT(hdl); + mutex_enter(&hdl->sa_lock); + error = sa_bulk_update_impl(hdl, attrs, count, tx); + mutex_exit(&hdl->sa_lock); + return (error); +} + +int 
+sa_remove(sa_handle_t *hdl, sa_attr_type_t attr, dmu_tx_t *tx) +{ + int error; + + mutex_enter(&hdl->sa_lock); + error = sa_modify_attrs(hdl, attr, SA_REMOVE, NULL, + NULL, 0, tx); + mutex_exit(&hdl->sa_lock); + return (error); +} + +void +sa_object_info(sa_handle_t *hdl, dmu_object_info_t *doi) +{ + dmu_object_info_from_db((dmu_buf_t *)hdl->sa_bonus, doi); +} + +void +sa_object_size(sa_handle_t *hdl, uint32_t *blksize, u_longlong_t *nblocks) +{ + dmu_object_size_from_db((dmu_buf_t *)hdl->sa_bonus, + blksize, nblocks); +} + +void +sa_set_userp(sa_handle_t *hdl, void *ptr) +{ + hdl->sa_userp = ptr; +} + +dmu_buf_t * +sa_get_db(sa_handle_t *hdl) +{ + return ((dmu_buf_t *)hdl->sa_bonus); +} + +void * +sa_get_userdata(sa_handle_t *hdl) +{ + return (hdl->sa_userp); +} + +void +sa_register_update_callback_locked(objset_t *os, sa_update_cb_t *func) +{ + ASSERT(MUTEX_HELD(&os->os_sa->sa_lock)); + os->os_sa->sa_update_cb = func; +} + +void +sa_register_update_callback(objset_t *os, sa_update_cb_t *func) +{ + + mutex_enter(&os->os_sa->sa_lock); + sa_register_update_callback_locked(os, func); + mutex_exit(&os->os_sa->sa_lock); +} + +uint64_t +sa_handle_object(sa_handle_t *hdl) +{ + return (hdl->sa_bonus->db_object); +} + +boolean_t +sa_enabled(objset_t *os) +{ + return (os->os_sa == NULL); +} + +int +sa_set_sa_object(objset_t *os, uint64_t sa_object) +{ + sa_os_t *sa = os->os_sa; + + if (sa->sa_master_obj) + return (1); + + sa->sa_master_obj = sa_object; + + return (0); +} + +int +sa_hdrsize(void *arg) +{ + sa_hdr_phys_t *hdr = arg; + + return (SA_HDR_SIZE(hdr)); +} + +void +sa_handle_lock(sa_handle_t *hdl) +{ + ASSERT(hdl); + mutex_enter(&hdl->sa_lock); +} + +void +sa_handle_unlock(sa_handle_t *hdl) +{ + ASSERT(hdl); + mutex_exit(&hdl->sa_lock); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c new file mode 100644 index 000000000000..34c909f0c71a --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c @@ -0,0 +1,105 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright 2013 Saso Kiselkov. All rights reserved. + * Copyright (c) 2016 by Delphix. All rights reserved. 
+ */
+#include <sys/zfs_context.h>
+#include <sys/zio.h>
+#ifdef _KERNEL
+#include <crypto/sha2/sha256.h>
+#include <crypto/sha2/sha512t.h>
+#else
+#include <sha256.h>
+#include <sha512t.h>
+#endif
+#include <sys/abd.h>
+
+static int
+sha256_incremental(void *buf, size_t size, void *arg)
+{
+ SHA256_CTX *ctx = arg;
+ SHA256_Update(ctx, buf, size);
+ return (0);
+}
+
+static int
+sha512_incremental(void *buf, size_t size, void *arg)
+{
+ SHA512_CTX *ctx = arg;
+ SHA512_256_Update(ctx, buf, size);
+ return (0);
+}
+
+/*ARGSUSED*/
+void
+abd_checksum_SHA256(abd_t *abd, uint64_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
+{
+ SHA256_CTX ctx;
+ zio_cksum_t tmp;
+
+ SHA256_Init(&ctx);
+ (void) abd_iterate_func(abd, 0, size, sha256_incremental, &ctx);
+ SHA256_Final((unsigned char *)&tmp, &ctx);
+
+ /*
+ * A prior implementation of this function used a private
+ * SHA256 implementation that always wrote things out in
+ * big endian, and there wasn't a byteswap variant of it.
+ * To preserve on-disk compatibility we need to force that
+ * behavior.
+ */
+ zcp->zc_word[0] = BE_64(tmp.zc_word[0]);
+ zcp->zc_word[1] = BE_64(tmp.zc_word[1]);
+ zcp->zc_word[2] = BE_64(tmp.zc_word[2]);
+ zcp->zc_word[3] = BE_64(tmp.zc_word[3]);
+}
+
+/*ARGSUSED*/
+void
+abd_checksum_SHA512_native(abd_t *abd, uint64_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
+{
+ SHA512_CTX ctx;
+
+ SHA512_256_Init(&ctx);
+ (void) abd_iterate_func(abd, 0, size, sha512_incremental, &ctx);
+ SHA512_256_Final((unsigned char *)zcp, &ctx);
+}
+
+/*ARGSUSED*/
+void
+abd_checksum_SHA512_byteswap(abd_t *abd, uint64_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
+{
+ zio_cksum_t tmp;
+
+ abd_checksum_SHA512_native(abd, size, ctx_template, &tmp);
+ zcp->zc_word[0] = BSWAP_64(tmp.zc_word[0]);
+ zcp->zc_word[1] = BSWAP_64(tmp.zc_word[1]);
+ zcp->zc_word[2] = BSWAP_64(tmp.zc_word[2]);
+ zcp->zc_word[3] = BSWAP_64(tmp.zc_word[3]);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/skein_zfs.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/skein_zfs.c
new file mode 100644
index 000000000000..c30f590a5fdb
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/skein_zfs.c
@@ -0,0 +1,105 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2016 by Delphix. All rights reserved.
+ */
+#include <sys/zfs_context.h>
+#include <sys/zio.h>
+#ifdef _KERNEL
+#include <crypto/skein/skein.h>
+#else
+#include <skein.h>
+#endif
+#include <sys/abd.h>
+
+static int
+skein_incremental(void *buf, size_t size, void *arg)
+{
+ Skein_512_Ctxt_t *ctx = arg;
+ (void) Skein_512_Update(ctx, buf, size);
+ return (0);
+}
+
+/*
+ * Computes a native 256-bit skein MAC checksum.
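+ * A typical sequence, sketched here with hypothetical salt and cksum
+ * variables, is:
+ *
+ *	tmpl = abd_checksum_skein_tmpl_init(&salt);
+ *	abd_checksum_skein_native(abd, size, tmpl, &cksum);
+ *	abd_checksum_skein_tmpl_free(tmpl);
+ *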
Please note that this + * function requires the presence of a ctx_template that should be allocated + * using abd_checksum_skein_tmpl_init. + */ +/*ARGSUSED*/ +void +abd_checksum_skein_native(abd_t *abd, uint64_t size, + const void *ctx_template, zio_cksum_t *zcp) +{ + Skein_512_Ctxt_t ctx; + + ASSERT(ctx_template != NULL); + bcopy(ctx_template, &ctx, sizeof (ctx)); + (void) abd_iterate_func(abd, 0, size, skein_incremental, &ctx); + (void) Skein_512_Final(&ctx, (uint8_t *)zcp); + bzero(&ctx, sizeof (ctx)); +} + +/* + * Byteswapped version of abd_checksum_skein_native. This just invokes + * the native checksum function and byteswaps the resulting checksum (since + * skein is internally endian-insensitive). + */ +void +abd_checksum_skein_byteswap(abd_t *abd, uint64_t size, + const void *ctx_template, zio_cksum_t *zcp) +{ + zio_cksum_t tmp; + + abd_checksum_skein_native(abd, size, ctx_template, &tmp); + zcp->zc_word[0] = BSWAP_64(tmp.zc_word[0]); + zcp->zc_word[1] = BSWAP_64(tmp.zc_word[1]); + zcp->zc_word[2] = BSWAP_64(tmp.zc_word[2]); + zcp->zc_word[3] = BSWAP_64(tmp.zc_word[3]); +} + +/* + * Allocates a skein MAC template suitable for using in skein MAC checksum + * computations and returns a pointer to it. + */ +void * +abd_checksum_skein_tmpl_init(const zio_cksum_salt_t *salt) +{ + Skein_512_Ctxt_t *ctx; + + ctx = kmem_zalloc(sizeof (*ctx), KM_SLEEP); + (void) Skein_512_InitExt(ctx, sizeof (zio_cksum_t) * 8, 0, + salt->zcs_bytes, sizeof (salt->zcs_bytes)); + return (ctx); +} + +/* + * Frees a skein context template previously allocated using + * abd_checksum_skein_tmpl_init. + */ +void +abd_checksum_skein_tmpl_free(void *ctx_template) +{ + Skein_512_Ctxt_t *ctx = ctx_template; + + bzero(ctx, sizeof (*ctx)); + kmem_free(ctx, sizeof (*ctx)); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c new file mode 100644 index 000000000000..31d7ed442820 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c @@ -0,0 +1,8524 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright 2013 Saso Kiselkov. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + * Copyright 2016 Toomas Soome <tsoome@me.com> + * Copyright 2018 Joyent, Inc. + * Copyright (c) 2017 Datto Inc. + * Copyright 2018 OmniOS Community Edition (OmniOSce) Association. 
+ */
+
+/*
+ * SPA: Storage Pool Allocator
+ *
+ * This file contains all the routines used when modifying on-disk SPA state.
+ * This includes opening, importing, destroying, exporting a pool, and syncing a
+ * pool.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/fm/fs/zfs.h>
+#include <sys/spa_impl.h>
+#include <sys/zio.h>
+#include <sys/zio_checksum.h>
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/zap.h>
+#include <sys/zil.h>
+#include <sys/ddt.h>
+#include <sys/vdev_impl.h>
+#include <sys/vdev_removal.h>
+#include <sys/vdev_indirect_mapping.h>
+#include <sys/vdev_indirect_births.h>
+#include <sys/vdev_initialize.h>
+#include <sys/metaslab.h>
+#include <sys/metaslab_impl.h>
+#include <sys/uberblock_impl.h>
+#include <sys/txg.h>
+#include <sys/avl.h>
+#include <sys/bpobj.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dmu_objset.h>
+#include <sys/unique.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_synctask.h>
+#include <sys/fs/zfs.h>
+#include <sys/arc.h>
+#include <sys/callb.h>
+#include <sys/spa_boot.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/dsl_scan.h>
+#include <sys/dmu_send.h>
+#include <sys/dsl_destroy.h>
+#include <sys/dsl_userhold.h>
+#include <sys/zfeature.h>
+#include <sys/zvol.h>
+#include <sys/trim_map.h>
+#include <sys/abd.h>
+
+#ifdef _KERNEL
+#include <sys/callb.h>
+#include <sys/cpupart.h>
+#include <sys/zone.h>
+#endif /* _KERNEL */
+
+#include "zfs_prop.h"
+#include "zfs_comutil.h"
+
+/* Check hostid on import? */
+static int check_hostid = 1;
+
+/*
+ * The interval, in seconds, at which failed configuration cache file writes
+ * should be retried.
+ */
+int zfs_ccw_retry_interval = 300;
+
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, check_hostid, CTLFLAG_RWTUN, &check_hostid, 0,
+ "Check hostid on import?");
+TUNABLE_INT("vfs.zfs.ccw_retry_interval", &zfs_ccw_retry_interval);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, ccw_retry_interval, CTLFLAG_RW,
+ &zfs_ccw_retry_interval, 0,
+ "Configuration cache file write, retry after failure, interval (seconds)");
+
+typedef enum zti_modes {
+ ZTI_MODE_FIXED, /* value is # of threads (min 1) */
+ ZTI_MODE_BATCH, /* cpu-intensive; value is ignored */
+ ZTI_MODE_NULL, /* don't create a taskq */
+ ZTI_NMODES
+} zti_modes_t;
+
+#define ZTI_P(n, q) { ZTI_MODE_FIXED, (n), (q) }
+#define ZTI_BATCH { ZTI_MODE_BATCH, 0, 1 }
+#define ZTI_NULL { ZTI_MODE_NULL, 0, 0 }
+
+#define ZTI_N(n) ZTI_P(n, 1)
+#define ZTI_ONE ZTI_N(1)
+
+typedef struct zio_taskq_info {
+ zti_modes_t zti_mode;
+ uint_t zti_value;
+ uint_t zti_count;
+} zio_taskq_info_t;
+
+static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
+ "issue", "issue_high", "intr", "intr_high"
+};
+
+/*
+ * This table defines the taskq settings for each ZFS I/O type. When
+ * initializing a pool, we use this table to create an appropriately sized
+ * taskq. Some operations are low volume and therefore have a small, static
+ * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE
+ * macros. Other operations process a large amount of data; the ZTI_BATCH
+ * macro causes us to create a taskq oriented for throughput. Some operations
+ * are so high frequency and short-lived that the taskq itself can become a
+ * point of lock contention.
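+ * (For instance, the READ row in the table below uses ZTI_P(12, 8)
+ * for its interrupt context: eight taskqs of twelve threads each.)
+ *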
The ZTI_P(#, #) macro indicates that we need an + * additional degree of parallelism specified by the number of threads per- + * taskq and the number of taskqs; when dispatching an event in this case, the + * particular taskq is chosen at random. + * + * The different taskq priorities are to handle the different contexts (issue + * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that + * need to be handled with minimum delay. + */ +const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { + /* ISSUE ISSUE_HIGH INTR INTR_HIGH */ + { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */ + { ZTI_N(8), ZTI_NULL, ZTI_P(12, 8), ZTI_NULL }, /* READ */ + { ZTI_BATCH, ZTI_N(5), ZTI_N(8), ZTI_N(5) }, /* WRITE */ + { ZTI_P(12, 8), ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FREE */ + { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* CLAIM */ + { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* IOCTL */ +}; + +static void spa_sync_version(void *arg, dmu_tx_t *tx); +static void spa_sync_props(void *arg, dmu_tx_t *tx); +static boolean_t spa_has_active_shared_spare(spa_t *spa); +static int spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport); +static void spa_vdev_resilver_done(spa_t *spa); + +uint_t zio_taskq_batch_pct = 75; /* 1 thread per cpu in pset */ +#ifdef PSRSET_BIND +id_t zio_taskq_psrset_bind = PS_NONE; +#endif +#ifdef SYSDC +boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */ +uint_t zio_taskq_basedc = 80; /* base duty cycle */ +#endif + +boolean_t spa_create_process = B_TRUE; /* no process ==> no sysdc */ +extern int zfs_sync_pass_deferred_free; + +/* + * Report any spa_load_verify errors found, but do not fail spa_load. + * This is used by zdb to analyze non-idle pools. + */ +boolean_t spa_load_verify_dryrun = B_FALSE; + +/* + * This (illegal) pool name is used when temporarily importing a spa_t in order + * to get the vdev stats associated with the imported devices. + */ +#define TRYIMPORT_NAME "$import" + +/* + * For debugging purposes: print out vdev tree during pool import. + */ +int spa_load_print_vdev_tree = B_FALSE; + +/* + * A non-zero value for zfs_max_missing_tvds means that we allow importing + * pools with missing top-level vdevs. This is strictly intended for advanced + * pool recovery cases since missing data is almost inevitable. Pools with + * missing devices can only be imported read-only for safety reasons, and their + * fail-mode will be automatically set to "continue". + * + * With 1 missing vdev we should be able to import the pool and mount all + * datasets. User data that was not modified after the missing device has been + * added should be recoverable. This means that snapshots created prior to the + * addition of that device should be completely intact. + * + * With 2 missing vdevs, some datasets may fail to mount since there are + * dataset statistics that are stored as regular metadata. Some data might be + * recoverable if those vdevs were added recently. + * + * With 3 or more missing vdevs, the pool is severely damaged and MOS entries + * may be missing entirely. Chances of data recovery are very low. Note that + * there are also risks of performing an inadvertent rewind as we might be + * missing all the vdevs with the latest uberblocks. + */ +uint64_t zfs_max_missing_tvds = 0; + +/* + * The parameters below are similar to zfs_max_missing_tvds but are only + * intended for a preliminary open of the pool with an untrusted config which + * might be incomplete or out-dated. 
+ * + * We are more tolerant for pools opened from a cachefile since we could have + * an out-dated cachefile where a device removal was not registered. + * We could have set the limit arbitrarily high but in the case where devices + * are really missing we would want to return the proper error codes; we chose + * SPA_DVAS_PER_BP - 1 so that some copies of the MOS would still be available + * and we get a chance to retrieve the trusted config. + */ +uint64_t zfs_max_missing_tvds_cachefile = SPA_DVAS_PER_BP - 1; + +/* + * In the case where config was assembled by scanning device paths (/dev/dsks + * by default) we are less tolerant since all the existing devices should have + * been detected and we want spa_load to return the right error codes. + */ +uint64_t zfs_max_missing_tvds_scan = 0; + + +SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_print_vdev_tree, CTLFLAG_RWTUN, + &spa_load_print_vdev_tree, 0, + "print out vdev tree during pool import"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, max_missing_tvds, CTLFLAG_RWTUN, + &zfs_max_missing_tvds, 0, + "allow importing pools with missing top-level vdevs"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, max_missing_tvds_cachefile, CTLFLAG_RWTUN, + &zfs_max_missing_tvds_cachefile, 0, + "allow importing pools with missing top-level vdevs in cache file"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, max_missing_tvds_scan, CTLFLAG_RWTUN, + &zfs_max_missing_tvds_scan, 0, + "allow importing pools with missing top-level vdevs during scan"); + +/* + * Debugging aid that pauses spa_sync() towards the end. + */ +boolean_t zfs_pause_spa_sync = B_FALSE; + +/* + * ========================================================================== + * SPA properties routines + * ========================================================================== + */ + +/* + * Add a (source=src, propname=propval) list to an nvlist. + */ +static void +spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval, + uint64_t intval, zprop_source_t src) +{ + const char *propname = zpool_prop_to_name(prop); + nvlist_t *propval; + + VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0); + + if (strval != NULL) + VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0); + else + VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0); + + VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0); + nvlist_free(propval); +} + +/* + * Get property values from the spa configuration. 
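+ *
+ * Each property lands in *nvp as a nested nvlist built by
+ * spa_prop_add_list() above, keyed by the property name and carrying
+ * a ZPROP_SOURCE/ZPROP_VALUE pair.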
+ */ +static void +spa_prop_get_config(spa_t *spa, nvlist_t **nvp) +{ + vdev_t *rvd = spa->spa_root_vdev; + dsl_pool_t *pool = spa->spa_dsl_pool; + uint64_t size, alloc, cap, version; + zprop_source_t src = ZPROP_SRC_NONE; + spa_config_dirent_t *dp; + metaslab_class_t *mc = spa_normal_class(spa); + + ASSERT(MUTEX_HELD(&spa->spa_props_lock)); + + if (rvd != NULL) { + alloc = metaslab_class_get_alloc(spa_normal_class(spa)); + size = metaslab_class_get_space(spa_normal_class(spa)); + spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src); + spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src); + spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src); + spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL, + size - alloc, src); + spa_prop_add_list(*nvp, ZPOOL_PROP_CHECKPOINT, NULL, + spa->spa_checkpoint_info.sci_dspace, src); + + spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL, + metaslab_class_fragmentation(mc), src); + spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL, + metaslab_class_expandable_space(mc), src); + spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL, + (spa_mode(spa) == FREAD), src); + + cap = (size == 0) ? 0 : (alloc * 100 / size); + spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src); + + spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL, + ddt_get_pool_dedup_ratio(spa), src); + + spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL, + rvd->vdev_state, src); + + version = spa_version(spa); + if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) + src = ZPROP_SRC_DEFAULT; + else + src = ZPROP_SRC_LOCAL; + spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src); + } + + if (pool != NULL) { + /* + * The $FREE directory was introduced in SPA_VERSION_DEADLISTS, + * when opening pools before this version freedir will be NULL. + */ + if (pool->dp_free_dir != NULL) { + spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL, + dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes, + src); + } else { + spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, + NULL, 0, src); + } + + if (pool->dp_leak_dir != NULL) { + spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL, + dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes, + src); + } else { + spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, + NULL, 0, src); + } + } + + spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); + + if (spa->spa_comment != NULL) { + spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment, + 0, ZPROP_SRC_LOCAL); + } + + if (spa->spa_root != NULL) + spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root, + 0, ZPROP_SRC_LOCAL); + + if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) { + spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL, + MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE); + } else { + spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL, + SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE); + } + + if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) { + spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL, + DNODE_MAX_SIZE, ZPROP_SRC_NONE); + } else { + spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL, + DNODE_MIN_SIZE, ZPROP_SRC_NONE); + } + + if ((dp = list_head(&spa->spa_config_list)) != NULL) { + if (dp->scd_path == NULL) { + spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, + "none", 0, ZPROP_SRC_LOCAL); + } else if (strcmp(dp->scd_path, spa_config_path) != 0) { + spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, + dp->scd_path, 0, ZPROP_SRC_LOCAL); + } + } +} + +/* + * Get zpool property values. 
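+ *
+ * The list is allocated here and returned through nvp; a minimal
+ * caller (a sketch, assuming a held spa_t) looks like:
+ *
+ *	nvlist_t *nvp = NULL;
+ *	if (spa_prop_get(spa, &nvp) == 0) {
+ *		... consume nvp ...
+ *		nvlist_free(nvp);
+ *	}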
+ */ +int +spa_prop_get(spa_t *spa, nvlist_t **nvp) +{ + objset_t *mos = spa->spa_meta_objset; + zap_cursor_t zc; + zap_attribute_t za; + int err; + + VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + mutex_enter(&spa->spa_props_lock); + + /* + * Get properties from the spa config. + */ + spa_prop_get_config(spa, nvp); + + /* If no pool property object, no more prop to get. */ + if (mos == NULL || spa->spa_pool_props_object == 0) { + mutex_exit(&spa->spa_props_lock); + return (0); + } + + /* + * Get properties from the MOS pool property object. + */ + for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); + (err = zap_cursor_retrieve(&zc, &za)) == 0; + zap_cursor_advance(&zc)) { + uint64_t intval = 0; + char *strval = NULL; + zprop_source_t src = ZPROP_SRC_DEFAULT; + zpool_prop_t prop; + + if ((prop = zpool_name_to_prop(za.za_name)) == ZPOOL_PROP_INVAL) + continue; + + switch (za.za_integer_length) { + case 8: + /* integer property */ + if (za.za_first_integer != + zpool_prop_default_numeric(prop)) + src = ZPROP_SRC_LOCAL; + + if (prop == ZPOOL_PROP_BOOTFS) { + dsl_pool_t *dp; + dsl_dataset_t *ds = NULL; + + dp = spa_get_dsl(spa); + dsl_pool_config_enter(dp, FTAG); + err = dsl_dataset_hold_obj(dp, + za.za_first_integer, FTAG, &ds); + if (err != 0) { + dsl_pool_config_exit(dp, FTAG); + break; + } + + strval = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, + KM_SLEEP); + dsl_dataset_name(ds, strval); + dsl_dataset_rele(ds, FTAG); + dsl_pool_config_exit(dp, FTAG); + } else { + strval = NULL; + intval = za.za_first_integer; + } + + spa_prop_add_list(*nvp, prop, strval, intval, src); + + if (strval != NULL) + kmem_free(strval, ZFS_MAX_DATASET_NAME_LEN); + + break; + + case 1: + /* string property */ + strval = kmem_alloc(za.za_num_integers, KM_SLEEP); + err = zap_lookup(mos, spa->spa_pool_props_object, + za.za_name, 1, za.za_num_integers, strval); + if (err) { + kmem_free(strval, za.za_num_integers); + break; + } + spa_prop_add_list(*nvp, prop, strval, 0, src); + kmem_free(strval, za.za_num_integers); + break; + + default: + break; + } + } + zap_cursor_fini(&zc); + mutex_exit(&spa->spa_props_lock); +out: + if (err && err != ENOENT) { + nvlist_free(*nvp); + *nvp = NULL; + return (err); + } + + return (0); +} + +/* + * Validate the given pool properties nvlist and modify the list + * for the property values to be set. + */ +static int +spa_prop_validate(spa_t *spa, nvlist_t *props) +{ + nvpair_t *elem; + int error = 0, reset_bootfs = 0; + uint64_t objnum = 0; + boolean_t has_feature = B_FALSE; + + elem = NULL; + while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { + uint64_t intval; + char *strval, *slash, *check, *fname; + const char *propname = nvpair_name(elem); + zpool_prop_t prop = zpool_name_to_prop(propname); + + switch (prop) { + case ZPOOL_PROP_INVAL: + if (!zpool_prop_feature(propname)) { + error = SET_ERROR(EINVAL); + break; + } + + /* + * Sanitize the input. 
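+ * A feature property arrives here as, e.g., "feature@async_destroy"
+ * with a uint64 payload that must be 0 ("enabled"); the checks
+ * below reject anything else.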
+ */ + if (nvpair_type(elem) != DATA_TYPE_UINT64) { + error = SET_ERROR(EINVAL); + break; + } + + if (nvpair_value_uint64(elem, &intval) != 0) { + error = SET_ERROR(EINVAL); + break; + } + + if (intval != 0) { + error = SET_ERROR(EINVAL); + break; + } + + fname = strchr(propname, '@') + 1; + if (zfeature_lookup_name(fname, NULL) != 0) { + error = SET_ERROR(EINVAL); + break; + } + + has_feature = B_TRUE; + break; + + case ZPOOL_PROP_VERSION: + error = nvpair_value_uint64(elem, &intval); + if (!error && + (intval < spa_version(spa) || + intval > SPA_VERSION_BEFORE_FEATURES || + has_feature)) + error = SET_ERROR(EINVAL); + break; + + case ZPOOL_PROP_DELEGATION: + case ZPOOL_PROP_AUTOREPLACE: + case ZPOOL_PROP_LISTSNAPS: + case ZPOOL_PROP_AUTOEXPAND: + error = nvpair_value_uint64(elem, &intval); + if (!error && intval > 1) + error = SET_ERROR(EINVAL); + break; + + case ZPOOL_PROP_BOOTFS: + /* + * If the pool version is less than SPA_VERSION_BOOTFS, + * or the pool is still being created (version == 0), + * the bootfs property cannot be set. + */ + if (spa_version(spa) < SPA_VERSION_BOOTFS) { + error = SET_ERROR(ENOTSUP); + break; + } + + /* + * Make sure the vdev config is bootable + */ + if (!vdev_is_bootable(spa->spa_root_vdev)) { + error = SET_ERROR(ENOTSUP); + break; + } + + reset_bootfs = 1; + + error = nvpair_value_string(elem, &strval); + + if (!error) { + objset_t *os; + uint64_t propval; + + if (strval == NULL || strval[0] == '\0') { + objnum = zpool_prop_default_numeric( + ZPOOL_PROP_BOOTFS); + break; + } + + error = dmu_objset_hold(strval, FTAG, &os); + if (error != 0) + break; + + /* + * Must be ZPL, and its property settings + * must be supported. + */ + + if (dmu_objset_type(os) != DMU_OST_ZFS) { + error = SET_ERROR(ENOTSUP); + } else if ((error = + dsl_prop_get_int_ds(dmu_objset_ds(os), + zfs_prop_to_name(ZFS_PROP_COMPRESSION), + &propval)) == 0 && + !BOOTFS_COMPRESS_VALID(propval)) { + error = SET_ERROR(ENOTSUP); + } else { + objnum = dmu_objset_id(os); + } + dmu_objset_rele(os, FTAG); + } + break; + + case ZPOOL_PROP_FAILUREMODE: + error = nvpair_value_uint64(elem, &intval); + if (!error && (intval < ZIO_FAILURE_MODE_WAIT || + intval > ZIO_FAILURE_MODE_PANIC)) + error = SET_ERROR(EINVAL); + + /* + * This is a special case which only occurs when + * the pool has completely failed. This allows + * the user to change the in-core failmode property + * without syncing it out to disk (I/Os might + * currently be blocked). We do this by returning + * EIO to the caller (spa_prop_set) to trick it + * into thinking we encountered a property validation + * error. + */ + if (!error && spa_suspended(spa)) { + spa->spa_failmode = intval; + error = SET_ERROR(EIO); + } + break; + + case ZPOOL_PROP_CACHEFILE: + if ((error = nvpair_value_string(elem, &strval)) != 0) + break; + + if (strval[0] == '\0') + break; + + if (strcmp(strval, "none") == 0) + break; + + if (strval[0] != '/') { + error = SET_ERROR(EINVAL); + break; + } + + slash = strrchr(strval, '/'); + ASSERT(slash != NULL); + + if (slash[1] == '\0' || strcmp(slash, "/.") == 0 || + strcmp(slash, "/..") == 0) + error = SET_ERROR(EINVAL); + break; + + case ZPOOL_PROP_COMMENT: + if ((error = nvpair_value_string(elem, &strval)) != 0) + break; + for (check = strval; *check != '\0'; check++) { + /* + * The kernel doesn't have an easy isprint() + * check. For this kernel check, we merely + * check ASCII apart from DEL. Fix this if + * there is an easy-to-use kernel isprint(). 
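+ * The net effect: a plain-ASCII comment such as "rack 12, row 3"
+ * (a made-up example) passes, while DEL (0x7f) and anything above
+ * it is rejected.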
+ */ + if (*check >= 0x7f) { + error = SET_ERROR(EINVAL); + break; + } + } + if (strlen(strval) > ZPROP_MAX_COMMENT) + error = E2BIG; + break; + + case ZPOOL_PROP_DEDUPDITTO: + if (spa_version(spa) < SPA_VERSION_DEDUP) + error = SET_ERROR(ENOTSUP); + else + error = nvpair_value_uint64(elem, &intval); + if (error == 0 && + intval != 0 && intval < ZIO_DEDUPDITTO_MIN) + error = SET_ERROR(EINVAL); + break; + } + + if (error) + break; + } + + if (!error && reset_bootfs) { + error = nvlist_remove(props, + zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING); + + if (!error) { + error = nvlist_add_uint64(props, + zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum); + } + } + + return (error); +} + +void +spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync) +{ + char *cachefile; + spa_config_dirent_t *dp; + + if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), + &cachefile) != 0) + return; + + dp = kmem_alloc(sizeof (spa_config_dirent_t), + KM_SLEEP); + + if (cachefile[0] == '\0') + dp->scd_path = spa_strdup(spa_config_path); + else if (strcmp(cachefile, "none") == 0) + dp->scd_path = NULL; + else + dp->scd_path = spa_strdup(cachefile); + + list_insert_head(&spa->spa_config_list, dp); + if (need_sync) + spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); +} + +int +spa_prop_set(spa_t *spa, nvlist_t *nvp) +{ + int error; + nvpair_t *elem = NULL; + boolean_t need_sync = B_FALSE; + + if ((error = spa_prop_validate(spa, nvp)) != 0) + return (error); + + while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) { + zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem)); + + if (prop == ZPOOL_PROP_CACHEFILE || + prop == ZPOOL_PROP_ALTROOT || + prop == ZPOOL_PROP_READONLY) + continue; + + if (prop == ZPOOL_PROP_VERSION || prop == ZPOOL_PROP_INVAL) { + uint64_t ver; + + if (prop == ZPOOL_PROP_VERSION) { + VERIFY(nvpair_value_uint64(elem, &ver) == 0); + } else { + ASSERT(zpool_prop_feature(nvpair_name(elem))); + ver = SPA_VERSION_FEATURES; + need_sync = B_TRUE; + } + + /* Save time if the version is already set. */ + if (ver == spa_version(spa)) + continue; + + /* + * In addition to the pool directory object, we might + * create the pool properties object, the features for + * read object, the features for write object, or the + * feature descriptions object. + */ + error = dsl_sync_task(spa->spa_name, NULL, + spa_sync_version, &ver, + 6, ZFS_SPACE_CHECK_RESERVED); + if (error) + return (error); + continue; + } + + need_sync = B_TRUE; + break; + } + + if (need_sync) { + return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props, + nvp, 6, ZFS_SPACE_CHECK_RESERVED)); + } + + return (0); +} + +/* + * If the bootfs property value is dsobj, clear it. + */ +void +spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) +{ + if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { + VERIFY(zap_remove(spa->spa_meta_objset, + spa->spa_pool_props_object, + zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0); + spa->spa_bootfs = 0; + } +} + +/*ARGSUSED*/ +static int +spa_change_guid_check(void *arg, dmu_tx_t *tx) +{ + uint64_t *newguid = arg; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + vdev_t *rvd = spa->spa_root_vdev; + uint64_t vdev_state; + + if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { + int error = (spa_has_checkpoint(spa)) ? 
+ ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; + return (SET_ERROR(error)); + } + + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); + vdev_state = rvd->vdev_state; + spa_config_exit(spa, SCL_STATE, FTAG); + + if (vdev_state != VDEV_STATE_HEALTHY) + return (SET_ERROR(ENXIO)); + + ASSERT3U(spa_guid(spa), !=, *newguid); + + return (0); +} + +static void +spa_change_guid_sync(void *arg, dmu_tx_t *tx) +{ + uint64_t *newguid = arg; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + uint64_t oldguid; + vdev_t *rvd = spa->spa_root_vdev; + + oldguid = spa_guid(spa); + + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); + rvd->vdev_guid = *newguid; + rvd->vdev_guid_sum += (*newguid - oldguid); + vdev_config_dirty(rvd); + spa_config_exit(spa, SCL_STATE, FTAG); + + spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu", + oldguid, *newguid); +} + +/* + * Change the GUID for the pool. This is done so that we can later + * re-import a pool built from a clone of our own vdevs. We will modify + * the root vdev's guid, our own pool guid, and then mark all of our + * vdevs dirty. Note that we must make sure that all our vdevs are + * online when we do this, or else any vdevs that weren't present + * would be orphaned from our pool. We are also going to issue a + * sysevent to update any watchers. + */ +int +spa_change_guid(spa_t *spa) +{ + int error; + uint64_t guid; + + mutex_enter(&spa->spa_vdev_top_lock); + mutex_enter(&spa_namespace_lock); + guid = spa_generate_guid(NULL); + + error = dsl_sync_task(spa->spa_name, spa_change_guid_check, + spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED); + + if (error == 0) { + spa_write_cachefile(spa, B_FALSE, B_TRUE); + spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_REGUID); + } + + mutex_exit(&spa_namespace_lock); + mutex_exit(&spa->spa_vdev_top_lock); + + return (error); +} + +/* + * ========================================================================== + * SPA state manipulation (open/create/destroy/import/export) + * ========================================================================== + */ + +static int +spa_error_entry_compare(const void *a, const void *b) +{ + const spa_error_entry_t *sa = (const spa_error_entry_t *)a; + const spa_error_entry_t *sb = (const spa_error_entry_t *)b; + int ret; + + ret = memcmp(&sa->se_bookmark, &sb->se_bookmark, + sizeof (zbookmark_phys_t)); + + return (AVL_ISIGN(ret)); +} + +/* + * Utility function which retrieves copies of the current logs and + * re-initializes them in the process. 
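+ * Callers must hold spa_errlist_lock (asserted below); the trees
+ * copied out through 'last' and 'scrub' then belong to the caller.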
+ */ +void +spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) +{ + ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); + + bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t)); + bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t)); + + avl_create(&spa->spa_errlist_scrub, + spa_error_entry_compare, sizeof (spa_error_entry_t), + offsetof(spa_error_entry_t, se_avl)); + avl_create(&spa->spa_errlist_last, + spa_error_entry_compare, sizeof (spa_error_entry_t), + offsetof(spa_error_entry_t, se_avl)); +} + +static void +spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) +{ + const zio_taskq_info_t *ztip = &zio_taskqs[t][q]; + enum zti_modes mode = ztip->zti_mode; + uint_t value = ztip->zti_value; + uint_t count = ztip->zti_count; + spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; + char name[32]; + uint_t flags = 0; + boolean_t batch = B_FALSE; + + if (mode == ZTI_MODE_NULL) { + tqs->stqs_count = 0; + tqs->stqs_taskq = NULL; + return; + } + + ASSERT3U(count, >, 0); + + tqs->stqs_count = count; + tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP); + + switch (mode) { + case ZTI_MODE_FIXED: + ASSERT3U(value, >=, 1); + value = MAX(value, 1); + break; + + case ZTI_MODE_BATCH: + batch = B_TRUE; + flags |= TASKQ_THREADS_CPU_PCT; + value = zio_taskq_batch_pct; + break; + + default: + panic("unrecognized mode for %s_%s taskq (%u:%u) in " + "spa_activate()", + zio_type_name[t], zio_taskq_types[q], mode, value); + break; + } + + for (uint_t i = 0; i < count; i++) { + taskq_t *tq; + + if (count > 1) { + (void) snprintf(name, sizeof (name), "%s_%s_%u", + zio_type_name[t], zio_taskq_types[q], i); + } else { + (void) snprintf(name, sizeof (name), "%s_%s", + zio_type_name[t], zio_taskq_types[q]); + } + +#ifdef SYSDC + if (zio_taskq_sysdc && spa->spa_proc != &p0) { + if (batch) + flags |= TASKQ_DC_BATCH; + + tq = taskq_create_sysdc(name, value, 50, INT_MAX, + spa->spa_proc, zio_taskq_basedc, flags); + } else { +#endif + pri_t pri = maxclsyspri; + /* + * The write issue taskq can be extremely CPU + * intensive. Run it at slightly lower priority + * than the other taskqs. + * FreeBSD notes: + * - numerically higher priorities are lower priorities; + * - if priorities divided by four (RQ_PPQ) are equal + * then a difference between them is insignificant. + */ + if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE) +#ifdef illumos + pri--; +#else + pri += 4; +#endif + + tq = taskq_create_proc(name, value, pri, 50, + INT_MAX, spa->spa_proc, flags); +#ifdef SYSDC + } +#endif + + tqs->stqs_taskq[i] = tq; + } +} + +static void +spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q) +{ + spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; + + if (tqs->stqs_taskq == NULL) { + ASSERT0(tqs->stqs_count); + return; + } + + for (uint_t i = 0; i < tqs->stqs_count; i++) { + ASSERT3P(tqs->stqs_taskq[i], !=, NULL); + taskq_destroy(tqs->stqs_taskq[i]); + } + + kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *)); + tqs->stqs_taskq = NULL; +} + +/* + * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority. + * Note that a type may have multiple discrete taskqs to avoid lock contention + * on the taskq itself. In that case we choose which taskq at random by using + * the low bits of gethrtime(). 
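+ * (In the FreeBSD kernel path below, sbinuptime() plus curcpu stands
+ * in for gethrtime(), to the same effect.)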
+ */ +void +spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, + task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent) +{ + spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; + taskq_t *tq; + + ASSERT3P(tqs->stqs_taskq, !=, NULL); + ASSERT3U(tqs->stqs_count, !=, 0); + + if (tqs->stqs_count == 1) { + tq = tqs->stqs_taskq[0]; + } else { +#ifdef _KERNEL + tq = tqs->stqs_taskq[(u_int)(sbinuptime() + curcpu) % + tqs->stqs_count]; +#else + tq = tqs->stqs_taskq[gethrtime() % tqs->stqs_count]; +#endif + } + + taskq_dispatch_ent(tq, func, arg, flags, ent); +} + +static void +spa_create_zio_taskqs(spa_t *spa) +{ + for (int t = 0; t < ZIO_TYPES; t++) { + for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { + spa_taskqs_init(spa, t, q); + } + } +} + +#ifdef _KERNEL +#ifdef SPA_PROCESS +static void +spa_thread(void *arg) +{ + callb_cpr_t cprinfo; + + spa_t *spa = arg; + user_t *pu = PTOU(curproc); + + CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr, + spa->spa_name); + + ASSERT(curproc != &p0); + (void) snprintf(pu->u_psargs, sizeof (pu->u_psargs), + "zpool-%s", spa->spa_name); + (void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm)); + +#ifdef PSRSET_BIND + /* bind this thread to the requested psrset */ + if (zio_taskq_psrset_bind != PS_NONE) { + pool_lock(); + mutex_enter(&cpu_lock); + mutex_enter(&pidlock); + mutex_enter(&curproc->p_lock); + + if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind, + 0, NULL, NULL) == 0) { + curthread->t_bind_pset = zio_taskq_psrset_bind; + } else { + cmn_err(CE_WARN, + "Couldn't bind process for zfs pool \"%s\" to " + "pset %d\n", spa->spa_name, zio_taskq_psrset_bind); + } + + mutex_exit(&curproc->p_lock); + mutex_exit(&pidlock); + mutex_exit(&cpu_lock); + pool_unlock(); + } +#endif + +#ifdef SYSDC + if (zio_taskq_sysdc) { + sysdc_thread_enter(curthread, 100, 0); + } +#endif + + spa->spa_proc = curproc; + spa->spa_did = curthread->t_did; + + spa_create_zio_taskqs(spa); + + mutex_enter(&spa->spa_proc_lock); + ASSERT(spa->spa_proc_state == SPA_PROC_CREATED); + + spa->spa_proc_state = SPA_PROC_ACTIVE; + cv_broadcast(&spa->spa_proc_cv); + + CALLB_CPR_SAFE_BEGIN(&cprinfo); + while (spa->spa_proc_state == SPA_PROC_ACTIVE) + cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); + CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock); + + ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE); + spa->spa_proc_state = SPA_PROC_GONE; + spa->spa_proc = &p0; + cv_broadcast(&spa->spa_proc_cv); + CALLB_CPR_EXIT(&cprinfo); /* drops spa_proc_lock */ + + mutex_enter(&curproc->p_lock); + lwp_exit(); +} +#endif /* SPA_PROCESS */ +#endif + +/* + * Activate an uninitialized pool. + */ +static void +spa_activate(spa_t *spa, int mode) +{ + ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); + + spa->spa_state = POOL_STATE_ACTIVE; + spa->spa_mode = mode; + + spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops); + spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops); + + /* Try to create a covering process */ + mutex_enter(&spa->spa_proc_lock); + ASSERT(spa->spa_proc_state == SPA_PROC_NONE); + ASSERT(spa->spa_proc == &p0); + spa->spa_did = 0; + +#ifdef SPA_PROCESS + /* Only create a process if we're going to be around a while. 
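+	 * ("A while" excludes, for example, the transient spa used to probe
+	 * a pool for "zpool import", whose name is TRYIMPORT_NAME and which
+	 * the strcmp() below filters out; illustrative note, not original.)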
*/ + if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) { + if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri, + NULL, 0) == 0) { + spa->spa_proc_state = SPA_PROC_CREATED; + while (spa->spa_proc_state == SPA_PROC_CREATED) { + cv_wait(&spa->spa_proc_cv, + &spa->spa_proc_lock); + } + ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); + ASSERT(spa->spa_proc != &p0); + ASSERT(spa->spa_did != 0); + } else { +#ifdef _KERNEL + cmn_err(CE_WARN, + "Couldn't create process for zfs pool \"%s\"\n", + spa->spa_name); +#endif + } + } +#endif /* SPA_PROCESS */ + mutex_exit(&spa->spa_proc_lock); + + /* If we didn't create a process, we need to create our taskqs. */ + ASSERT(spa->spa_proc == &p0); + if (spa->spa_proc == &p0) { + spa_create_zio_taskqs(spa); + } + + /* + * Start TRIM thread. + */ + trim_thread_create(spa); + + for (size_t i = 0; i < TXG_SIZE; i++) { + spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL, + ZIO_FLAG_CANFAIL); + } + + list_create(&spa->spa_config_dirty_list, sizeof (vdev_t), + offsetof(vdev_t, vdev_config_dirty_node)); + list_create(&spa->spa_evicting_os_list, sizeof (objset_t), + offsetof(objset_t, os_evicting_node)); + list_create(&spa->spa_state_dirty_list, sizeof (vdev_t), + offsetof(vdev_t, vdev_state_dirty_node)); + + txg_list_create(&spa->spa_vdev_txg_list, spa, + offsetof(struct vdev, vdev_txg_node)); + + avl_create(&spa->spa_errlist_scrub, + spa_error_entry_compare, sizeof (spa_error_entry_t), + offsetof(spa_error_entry_t, se_avl)); + avl_create(&spa->spa_errlist_last, + spa_error_entry_compare, sizeof (spa_error_entry_t), + offsetof(spa_error_entry_t, se_avl)); +} + +/* + * Opposite of spa_activate(). + */ +static void +spa_deactivate(spa_t *spa) +{ + ASSERT(spa->spa_sync_on == B_FALSE); + ASSERT(spa->spa_dsl_pool == NULL); + ASSERT(spa->spa_root_vdev == NULL); + ASSERT(spa->spa_async_zio_root == NULL); + ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); + + /* + * Stop TRIM thread in case spa_unload() wasn't called directly + * before spa_deactivate(). + */ + trim_thread_destroy(spa); + + spa_evicting_os_wait(spa); + + txg_list_destroy(&spa->spa_vdev_txg_list); + + list_destroy(&spa->spa_config_dirty_list); + list_destroy(&spa->spa_evicting_os_list); + list_destroy(&spa->spa_state_dirty_list); + + for (int t = 0; t < ZIO_TYPES; t++) { + for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { + spa_taskqs_fini(spa, t, q); + } + } + + for (size_t i = 0; i < TXG_SIZE; i++) { + ASSERT3P(spa->spa_txg_zio[i], !=, NULL); + VERIFY0(zio_wait(spa->spa_txg_zio[i])); + spa->spa_txg_zio[i] = NULL; + } + + metaslab_class_destroy(spa->spa_normal_class); + spa->spa_normal_class = NULL; + + metaslab_class_destroy(spa->spa_log_class); + spa->spa_log_class = NULL; + + /* + * If this was part of an import or the open otherwise failed, we may + * still have errors left in the queues. Empty them just in case. 
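+	 * (Illustrative note, not in the original: spa_errlog_drain() below
+	 * walks spa_errlist_last and spa_errlist_scrub and frees any
+	 * spa_error_entry_t still queued on them.)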
+ */ + spa_errlog_drain(spa); + + avl_destroy(&spa->spa_errlist_scrub); + avl_destroy(&spa->spa_errlist_last); + + spa->spa_state = POOL_STATE_UNINITIALIZED; + + mutex_enter(&spa->spa_proc_lock); + if (spa->spa_proc_state != SPA_PROC_NONE) { + ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); + spa->spa_proc_state = SPA_PROC_DEACTIVATE; + cv_broadcast(&spa->spa_proc_cv); + while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) { + ASSERT(spa->spa_proc != &p0); + cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); + } + ASSERT(spa->spa_proc_state == SPA_PROC_GONE); + spa->spa_proc_state = SPA_PROC_NONE; + } + ASSERT(spa->spa_proc == &p0); + mutex_exit(&spa->spa_proc_lock); + +#ifdef SPA_PROCESS + /* + * We want to make sure spa_thread() has actually exited the ZFS + * module, so that the module can't be unloaded out from underneath + * it. + */ + if (spa->spa_did != 0) { + thread_join(spa->spa_did); + spa->spa_did = 0; + } +#endif /* SPA_PROCESS */ +} + +/* + * Verify a pool configuration, and construct the vdev tree appropriately. This + * will create all the necessary vdevs in the appropriate layout, with each vdev + * in the CLOSED state. This will prep the pool before open/creation/import. + * All vdev validation is done by the vdev_alloc() routine. + */ +static int +spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, + uint_t id, int atype) +{ + nvlist_t **child; + uint_t children; + int error; + + if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) + return (error); + + if ((*vdp)->vdev_ops->vdev_op_leaf) + return (0); + + error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children); + + if (error == ENOENT) + return (0); + + if (error) { + vdev_free(*vdp); + *vdp = NULL; + return (SET_ERROR(EINVAL)); + } + + for (int c = 0; c < children; c++) { + vdev_t *vd; + if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, + atype)) != 0) { + vdev_free(*vdp); + *vdp = NULL; + return (error); + } + } + + ASSERT(*vdp != NULL); + + return (0); +} + +/* + * Opposite of spa_load(). + */ +static void +spa_unload(spa_t *spa) +{ + int i; + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + spa_load_note(spa, "UNLOADING"); + + /* + * Stop TRIM thread. + */ + trim_thread_destroy(spa); + + /* + * Stop async tasks. + */ + spa_async_suspend(spa); + + if (spa->spa_root_vdev) { + vdev_initialize_stop_all(spa->spa_root_vdev, + VDEV_INITIALIZE_ACTIVE); + } + + /* + * Stop syncing. + */ + if (spa->spa_sync_on) { + txg_sync_stop(spa->spa_dsl_pool); + spa->spa_sync_on = B_FALSE; + } + + /* + * Even though vdev_free() also calls vdev_metaslab_fini, we need + * to call it earlier, before we wait for async i/o to complete. + * This ensures that there is no async metaslab prefetching, by + * calling taskq_wait(mg_taskq). + */ + if (spa->spa_root_vdev != NULL) { + spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); + for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) + vdev_metaslab_fini(spa->spa_root_vdev->vdev_child[c]); + spa_config_exit(spa, SCL_ALL, spa); + } + + /* + * Wait for any outstanding async I/O to complete. 
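+	 * (Illustrative note: spa_async_zio_root holds one "godfather" root
+	 * zio per CPU, which is why each one is waited on individually in
+	 * the loop below.)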
+ */ + if (spa->spa_async_zio_root != NULL) { + for (int i = 0; i < max_ncpus; i++) + (void) zio_wait(spa->spa_async_zio_root[i]); + kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *)); + spa->spa_async_zio_root = NULL; + } + + if (spa->spa_vdev_removal != NULL) { + spa_vdev_removal_destroy(spa->spa_vdev_removal); + spa->spa_vdev_removal = NULL; + } + + if (spa->spa_condense_zthr != NULL) { + ASSERT(!zthr_isrunning(spa->spa_condense_zthr)); + zthr_destroy(spa->spa_condense_zthr); + spa->spa_condense_zthr = NULL; + } + + if (spa->spa_checkpoint_discard_zthr != NULL) { + ASSERT(!zthr_isrunning(spa->spa_checkpoint_discard_zthr)); + zthr_destroy(spa->spa_checkpoint_discard_zthr); + spa->spa_checkpoint_discard_zthr = NULL; + } + + spa_condense_fini(spa); + + bpobj_close(&spa->spa_deferred_bpobj); + + spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); + + /* + * Close all vdevs. + */ + if (spa->spa_root_vdev) + vdev_free(spa->spa_root_vdev); + ASSERT(spa->spa_root_vdev == NULL); + + /* + * Close the dsl pool. + */ + if (spa->spa_dsl_pool) { + dsl_pool_close(spa->spa_dsl_pool); + spa->spa_dsl_pool = NULL; + spa->spa_meta_objset = NULL; + } + + ddt_unload(spa); + + /* + * Drop and purge level 2 cache + */ + spa_l2cache_drop(spa); + + for (i = 0; i < spa->spa_spares.sav_count; i++) + vdev_free(spa->spa_spares.sav_vdevs[i]); + if (spa->spa_spares.sav_vdevs) { + kmem_free(spa->spa_spares.sav_vdevs, + spa->spa_spares.sav_count * sizeof (void *)); + spa->spa_spares.sav_vdevs = NULL; + } + if (spa->spa_spares.sav_config) { + nvlist_free(spa->spa_spares.sav_config); + spa->spa_spares.sav_config = NULL; + } + spa->spa_spares.sav_count = 0; + + for (i = 0; i < spa->spa_l2cache.sav_count; i++) { + vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]); + vdev_free(spa->spa_l2cache.sav_vdevs[i]); + } + if (spa->spa_l2cache.sav_vdevs) { + kmem_free(spa->spa_l2cache.sav_vdevs, + spa->spa_l2cache.sav_count * sizeof (void *)); + spa->spa_l2cache.sav_vdevs = NULL; + } + if (spa->spa_l2cache.sav_config) { + nvlist_free(spa->spa_l2cache.sav_config); + spa->spa_l2cache.sav_config = NULL; + } + spa->spa_l2cache.sav_count = 0; + + spa->spa_async_suspended = 0; + + spa->spa_indirect_vdevs_loaded = B_FALSE; + + if (spa->spa_comment != NULL) { + spa_strfree(spa->spa_comment); + spa->spa_comment = NULL; + } + + spa_config_exit(spa, SCL_ALL, spa); +} + +/* + * Load (or re-load) the current list of vdevs describing the active spares for + * this pool. When this is called, we have some form of basic information in + * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and + * then re-generate a more complete list including status information. + */ +void +spa_load_spares(spa_t *spa) +{ + nvlist_t **spares; + uint_t nspares; + int i; + vdev_t *vd, *tvd; + +#ifndef _KERNEL + /* + * zdb opens both the current state of the pool and the + * checkpointed state (if present), with a different spa_t. + * + * As spare vdevs are shared among open pools, we skip loading + * them when we load the checkpointed state of the pool. + */ + if (!spa_writeable(spa)) + return; +#endif + + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); + + /* + * First, close and free any existing spare vdevs. 
+ */
+	for (i = 0; i < spa->spa_spares.sav_count; i++) {
+		vd = spa->spa_spares.sav_vdevs[i];
+
+		/* Undo the call to spa_activate() below */
+		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
+		    B_FALSE)) != NULL && tvd->vdev_isspare)
+			spa_spare_remove(tvd);
+		vdev_close(vd);
+		vdev_free(vd);
+	}
+
+	if (spa->spa_spares.sav_vdevs)
+		kmem_free(spa->spa_spares.sav_vdevs,
+		    spa->spa_spares.sav_count * sizeof (void *));
+
+	if (spa->spa_spares.sav_config == NULL)
+		nspares = 0;
+	else
+		VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
+		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
+
+	spa->spa_spares.sav_count = (int)nspares;
+	spa->spa_spares.sav_vdevs = NULL;
+
+	if (nspares == 0)
+		return;
+
+	/*
+	 * Construct the array of vdevs, opening them to get status in the
+	 * process. For each spare, there are potentially two different vdev_t
+	 * structures associated with it: one in the list of spares (used only
+	 * for basic validation purposes) and one in the active vdev
+	 * configuration (if it's spared in). During this phase we open and
+	 * validate each vdev on the spare list. If the vdev also exists in the
+	 * active configuration, then we also mark this vdev as an active spare.
+	 */
+	spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
+	    KM_SLEEP);
+	for (i = 0; i < spa->spa_spares.sav_count; i++) {
+		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
+		    VDEV_ALLOC_SPARE) == 0);
+		ASSERT(vd != NULL);
+
+		spa->spa_spares.sav_vdevs[i] = vd;
+
+		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
+		    B_FALSE)) != NULL) {
+			if (!tvd->vdev_isspare)
+				spa_spare_add(tvd);
+
+			/*
+			 * We only mark the spare active if we were successfully
+			 * able to load the vdev. Otherwise, importing a pool
+			 * with a bad active spare would result in strange
+			 * behavior, because multiple pools would think the
+			 * spare is actively in use.
+			 *
+			 * There is a vulnerability here to an equally bizarre
+			 * circumstance, where a dead active spare is later
+			 * brought back to life (onlined or otherwise). Given
+			 * the rarity of this scenario, and the extra complexity
+			 * it adds, we ignore the possibility.
+			 */
+			if (!vdev_is_dead(tvd))
+				spa_spare_activate(tvd);
+		}
+
+		vd->vdev_top = vd;
+		vd->vdev_aux = &spa->spa_spares;
+
+		if (vdev_open(vd) != 0)
+			continue;
+
+		if (vdev_validate_aux(vd) == 0)
+			spa_spare_add(vd);
+	}
+
+	/*
+	 * Recompute the stashed list of spares, with status information
+	 * this time.
+	 */
+	VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
+	    DATA_TYPE_NVLIST_ARRAY) == 0);
+
+	spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
+	    KM_SLEEP);
+	for (i = 0; i < spa->spa_spares.sav_count; i++)
+		spares[i] = vdev_config_generate(spa,
+		    spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE);
+	VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
+	    ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
+	for (i = 0; i < spa->spa_spares.sav_count; i++)
+		nvlist_free(spares[i]);
+	kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
+}
+
+/*
+ * Load (or re-load) the current list of vdevs describing the active l2cache
+ * for this pool. When this is called, we have some form of basic information
+ * in 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them,
+ * and then re-generate a more complete list including status information.
+ * Devices which are already active have their details maintained, and are
+ * not re-opened.
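+ *
+ * Sketch of the retention step performed below (illustrative, not part
+ * of the original comment): each entry of the new ZPOOL_CONFIG_L2CACHE
+ * array is matched by ZPOOL_CONFIG_GUID against the previous vdev list,
+ * and a match is carried over rather than re-created:
+ *
+ *	if (vd != NULL && guid == vd->vdev_guid) {
+ *		newvdevs[i] = vd;
+ *		oldvdevs[j] = NULL;
+ *	}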
+ */ +void +spa_load_l2cache(spa_t *spa) +{ + nvlist_t **l2cache; + uint_t nl2cache; + int i, j, oldnvdevs; + uint64_t guid; + vdev_t *vd, **oldvdevs, **newvdevs; + spa_aux_vdev_t *sav = &spa->spa_l2cache; + +#ifndef _KERNEL + /* + * zdb opens both the current state of the pool and the + * checkpointed state (if present), with a different spa_t. + * + * As L2 caches are part of the ARC which is shared among open + * pools, we skip loading them when we load the checkpointed + * state of the pool. + */ + if (!spa_writeable(spa)) + return; +#endif + + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); + + if (sav->sav_config != NULL) { + VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, + ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); + newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP); + } else { + nl2cache = 0; + newvdevs = NULL; + } + + oldvdevs = sav->sav_vdevs; + oldnvdevs = sav->sav_count; + sav->sav_vdevs = NULL; + sav->sav_count = 0; + + /* + * Process new nvlist of vdevs. + */ + for (i = 0; i < nl2cache; i++) { + VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID, + &guid) == 0); + + newvdevs[i] = NULL; + for (j = 0; j < oldnvdevs; j++) { + vd = oldvdevs[j]; + if (vd != NULL && guid == vd->vdev_guid) { + /* + * Retain previous vdev for add/remove ops. + */ + newvdevs[i] = vd; + oldvdevs[j] = NULL; + break; + } + } + + if (newvdevs[i] == NULL) { + /* + * Create new vdev + */ + VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0, + VDEV_ALLOC_L2CACHE) == 0); + ASSERT(vd != NULL); + newvdevs[i] = vd; + + /* + * Commit this vdev as an l2cache device, + * even if it fails to open. + */ + spa_l2cache_add(vd); + + vd->vdev_top = vd; + vd->vdev_aux = sav; + + spa_l2cache_activate(vd); + + if (vdev_open(vd) != 0) + continue; + + (void) vdev_validate_aux(vd); + + if (!vdev_is_dead(vd)) + l2arc_add_vdev(spa, vd); + } + } + + /* + * Purge vdevs that were dropped + */ + for (i = 0; i < oldnvdevs; i++) { + uint64_t pool; + + vd = oldvdevs[i]; + if (vd != NULL) { + ASSERT(vd->vdev_isl2cache); + + if (spa_l2cache_exists(vd->vdev_guid, &pool) && + pool != 0ULL && l2arc_vdev_present(vd)) + l2arc_remove_vdev(vd); + vdev_clear_stats(vd); + vdev_free(vd); + } + } + + if (oldvdevs) + kmem_free(oldvdevs, oldnvdevs * sizeof (void *)); + + if (sav->sav_config == NULL) + goto out; + + sav->sav_vdevs = newvdevs; + sav->sav_count = (int)nl2cache; + + /* + * Recompute the stashed list of l2cache devices, with status + * information this time. 
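+	 * (Illustrative note: this mirrors the spares path above -- the old
+	 * ZPOOL_CONFIG_L2CACHE array is removed and replaced with entries
+	 * freshly generated by vdev_config_generate().)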
+ */
+	VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
+	    DATA_TYPE_NVLIST_ARRAY) == 0);
+
+	l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
+	for (i = 0; i < sav->sav_count; i++)
+		l2cache[i] = vdev_config_generate(spa,
+		    sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE);
+	VERIFY(nvlist_add_nvlist_array(sav->sav_config,
+	    ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
+out:
+	for (i = 0; i < sav->sav_count; i++)
+		nvlist_free(l2cache[i]);
+	if (sav->sav_count)
+		kmem_free(l2cache, sav->sav_count * sizeof (void *));
+}
+
+static int
+load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
+{
+	dmu_buf_t *db;
+	char *packed = NULL;
+	size_t nvsize = 0;
+	int error;
+	*value = NULL;
+
+	error = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db);
+	if (error != 0)
+		return (error);
+
+	nvsize = *(uint64_t *)db->db_data;
+	dmu_buf_rele(db, FTAG);
+
+	packed = kmem_alloc(nvsize, KM_SLEEP);
+	error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed,
+	    DMU_READ_PREFETCH);
+	if (error == 0)
+		error = nvlist_unpack(packed, nvsize, value, 0);
+	kmem_free(packed, nvsize);
+
+	return (error);
+}
+
+/*
+ * Concrete top-level vdevs that are not missing and are not logs. At every
+ * spa_sync we write new uberblocks to at least SPA_SYNC_MIN_VDEVS core tvds.
+ */
+static uint64_t
+spa_healthy_core_tvds(spa_t *spa)
+{
+	vdev_t *rvd = spa->spa_root_vdev;
+	uint64_t tvds = 0;
+
+	for (uint64_t i = 0; i < rvd->vdev_children; i++) {
+		vdev_t *vd = rvd->vdev_child[i];
+		if (vd->vdev_islog)
+			continue;
+		if (vdev_is_concrete(vd) && !vdev_is_dead(vd))
+			tvds++;
+	}
+
+	return (tvds);
+}
+
+/*
+ * Checks to see if the given vdev could not be opened, in which case we post a
+ * sysevent to notify the autoreplace code that the device has been removed.
+ */
+static void
+spa_check_removed(vdev_t *vd)
+{
+	for (uint64_t c = 0; c < vd->vdev_children; c++)
+		spa_check_removed(vd->vdev_child[c]);
+
+	if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) &&
+	    vdev_is_concrete(vd)) {
+		zfs_post_autoreplace(vd->vdev_spa, vd);
+		spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_CHECK);
+	}
+}
+
+static int
+spa_check_for_missing_logs(spa_t *spa)
+{
+	vdev_t *rvd = spa->spa_root_vdev;
+
+	/*
+	 * If we're doing a normal import, then build up any additional
+	 * diagnostic information about missing log devices.
+	 * We'll pass this up to the user for further processing.
+	 */
+	if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) {
+		nvlist_t **child, *nv;
+		uint64_t idx = 0;
+
+		child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **),
+		    KM_SLEEP);
+		VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+		for (uint64_t c = 0; c < rvd->vdev_children; c++) {
+			vdev_t *tvd = rvd->vdev_child[c];
+
+			/*
+			 * We consider a device as missing only if it failed
+			 * to open (i.e. offline or faulted is not considered
+			 * as missing).
+ */ + if (tvd->vdev_islog && + tvd->vdev_state == VDEV_STATE_CANT_OPEN) { + child[idx++] = vdev_config_generate(spa, tvd, + B_FALSE, VDEV_CONFIG_MISSING); + } + } + + if (idx > 0) { + fnvlist_add_nvlist_array(nv, + ZPOOL_CONFIG_CHILDREN, child, idx); + fnvlist_add_nvlist(spa->spa_load_info, + ZPOOL_CONFIG_MISSING_DEVICES, nv); + + for (uint64_t i = 0; i < idx; i++) + nvlist_free(child[i]); + } + nvlist_free(nv); + kmem_free(child, rvd->vdev_children * sizeof (char **)); + + if (idx > 0) { + spa_load_failed(spa, "some log devices are missing"); + vdev_dbgmsg_print_tree(rvd, 2); + return (SET_ERROR(ENXIO)); + } + } else { + for (uint64_t c = 0; c < rvd->vdev_children; c++) { + vdev_t *tvd = rvd->vdev_child[c]; + + if (tvd->vdev_islog && + tvd->vdev_state == VDEV_STATE_CANT_OPEN) { + spa_set_log_state(spa, SPA_LOG_CLEAR); + spa_load_note(spa, "some log devices are " + "missing, ZIL is dropped."); + vdev_dbgmsg_print_tree(rvd, 2); + break; + } + } + } + + return (0); +} + +/* + * Check for missing log devices + */ +static boolean_t +spa_check_logs(spa_t *spa) +{ + boolean_t rv = B_FALSE; + dsl_pool_t *dp = spa_get_dsl(spa); + + switch (spa->spa_log_state) { + case SPA_LOG_MISSING: + /* need to recheck in case slog has been restored */ + case SPA_LOG_UNKNOWN: + rv = (dmu_objset_find_dp(dp, dp->dp_root_dir_obj, + zil_check_log_chain, NULL, DS_FIND_CHILDREN) != 0); + if (rv) + spa_set_log_state(spa, SPA_LOG_MISSING); + break; + } + return (rv); +} + +static boolean_t +spa_passivate_log(spa_t *spa) +{ + vdev_t *rvd = spa->spa_root_vdev; + boolean_t slog_found = B_FALSE; + + ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); + + if (!spa_has_slogs(spa)) + return (B_FALSE); + + for (int c = 0; c < rvd->vdev_children; c++) { + vdev_t *tvd = rvd->vdev_child[c]; + metaslab_group_t *mg = tvd->vdev_mg; + + if (tvd->vdev_islog) { + metaslab_group_passivate(mg); + slog_found = B_TRUE; + } + } + + return (slog_found); +} + +static void +spa_activate_log(spa_t *spa) +{ + vdev_t *rvd = spa->spa_root_vdev; + + ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); + + for (int c = 0; c < rvd->vdev_children; c++) { + vdev_t *tvd = rvd->vdev_child[c]; + metaslab_group_t *mg = tvd->vdev_mg; + + if (tvd->vdev_islog) + metaslab_group_activate(mg); + } +} + +int +spa_reset_logs(spa_t *spa) +{ + int error; + + error = dmu_objset_find(spa_name(spa), zil_reset, + NULL, DS_FIND_CHILDREN); + if (error == 0) { + /* + * We successfully offlined the log device, sync out the + * current txg so that the "stubby" block can be removed + * by zil_sync(). 
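+		 * (Illustrative note: passing 0 lets txg_wait_synced() pick
+		 * a txg that covers everything currently open, blocking
+		 * until it has made it to disk.)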
+ */ + txg_wait_synced(spa->spa_dsl_pool, 0); + } + return (error); +} + +static void +spa_aux_check_removed(spa_aux_vdev_t *sav) +{ + int i; + + for (i = 0; i < sav->sav_count; i++) + spa_check_removed(sav->sav_vdevs[i]); +} + +void +spa_claim_notify(zio_t *zio) +{ + spa_t *spa = zio->io_spa; + + if (zio->io_error) + return; + + mutex_enter(&spa->spa_props_lock); /* any mutex will do */ + if (spa->spa_claim_max_txg < zio->io_bp->blk_birth) + spa->spa_claim_max_txg = zio->io_bp->blk_birth; + mutex_exit(&spa->spa_props_lock); +} + +typedef struct spa_load_error { + uint64_t sle_meta_count; + uint64_t sle_data_count; +} spa_load_error_t; + +static void +spa_load_verify_done(zio_t *zio) +{ + blkptr_t *bp = zio->io_bp; + spa_load_error_t *sle = zio->io_private; + dmu_object_type_t type = BP_GET_TYPE(bp); + int error = zio->io_error; + spa_t *spa = zio->io_spa; + + abd_free(zio->io_abd); + if (error) { + if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) && + type != DMU_OT_INTENT_LOG) + atomic_inc_64(&sle->sle_meta_count); + else + atomic_inc_64(&sle->sle_data_count); + } + + mutex_enter(&spa->spa_scrub_lock); + spa->spa_load_verify_ios--; + cv_broadcast(&spa->spa_scrub_io_cv); + mutex_exit(&spa->spa_scrub_lock); +} + +/* + * Maximum number of concurrent scrub i/os to create while verifying + * a pool while importing it. + */ +int spa_load_verify_maxinflight = 10000; +boolean_t spa_load_verify_metadata = B_TRUE; +boolean_t spa_load_verify_data = B_TRUE; + +SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_maxinflight, CTLFLAG_RWTUN, + &spa_load_verify_maxinflight, 0, + "Maximum number of concurrent scrub I/Os to create while verifying a " + "pool while importing it"); + +SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_metadata, CTLFLAG_RWTUN, + &spa_load_verify_metadata, 0, + "Check metadata on import?"); + +SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_data, CTLFLAG_RWTUN, + &spa_load_verify_data, 0, + "Check user data on import?"); + +/*ARGSUSED*/ +static int +spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, + const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) +{ + if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) + return (0); + /* + * Note: normally this routine will not be called if + * spa_load_verify_metadata is not set. However, it may be useful + * to manually set the flag after the traversal has begun. 
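+	 * On FreeBSD the knob is exported via the SYSCTL declared above and
+	 * can be toggled at runtime, e.g. (illustrative):
+	 *
+	 *	sysctl vfs.zfs.spa_load_verify_metadata=0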
+ */ + if (!spa_load_verify_metadata) + return (0); + if (!BP_IS_METADATA(bp) && !spa_load_verify_data) + return (0); + + zio_t *rio = arg; + size_t size = BP_GET_PSIZE(bp); + + mutex_enter(&spa->spa_scrub_lock); + while (spa->spa_load_verify_ios >= spa_load_verify_maxinflight) + cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); + spa->spa_load_verify_ios++; + mutex_exit(&spa->spa_scrub_lock); + + zio_nowait(zio_read(rio, spa, bp, abd_alloc_for_io(size, B_FALSE), size, + spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB, + ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL | + ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb)); + return (0); +} + +/* ARGSUSED */ +int +verify_dataset_name_len(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) +{ + if (dsl_dataset_namelen(ds) >= ZFS_MAX_DATASET_NAME_LEN) + return (SET_ERROR(ENAMETOOLONG)); + + return (0); +} + +static int +spa_load_verify(spa_t *spa) +{ + zio_t *rio; + spa_load_error_t sle = { 0 }; + zpool_load_policy_t policy; + boolean_t verify_ok = B_FALSE; + int error = 0; + + zpool_get_load_policy(spa->spa_config, &policy); + + if (policy.zlp_rewind & ZPOOL_NEVER_REWIND) + return (0); + + dsl_pool_config_enter(spa->spa_dsl_pool, FTAG); + error = dmu_objset_find_dp(spa->spa_dsl_pool, + spa->spa_dsl_pool->dp_root_dir_obj, verify_dataset_name_len, NULL, + DS_FIND_CHILDREN); + dsl_pool_config_exit(spa->spa_dsl_pool, FTAG); + if (error != 0) + return (error); + + rio = zio_root(spa, NULL, &sle, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); + + if (spa_load_verify_metadata) { + if (spa->spa_extreme_rewind) { + spa_load_note(spa, "performing a complete scan of the " + "pool since extreme rewind is on. This may take " + "a very long time.\n (spa_load_verify_data=%u, " + "spa_load_verify_metadata=%u)", + spa_load_verify_data, spa_load_verify_metadata); + } + error = traverse_pool(spa, spa->spa_verify_min_txg, + TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, + spa_load_verify_cb, rio); + } + + (void) zio_wait(rio); + + spa->spa_load_meta_errors = sle.sle_meta_count; + spa->spa_load_data_errors = sle.sle_data_count; + + if (sle.sle_meta_count != 0 || sle.sle_data_count != 0) { + spa_load_note(spa, "spa_load_verify found %llu metadata errors " + "and %llu data errors", (u_longlong_t)sle.sle_meta_count, + (u_longlong_t)sle.sle_data_count); + } + + if (spa_load_verify_dryrun || + (!error && sle.sle_meta_count <= policy.zlp_maxmeta && + sle.sle_data_count <= policy.zlp_maxdata)) { + int64_t loss = 0; + + verify_ok = B_TRUE; + spa->spa_load_txg = spa->spa_uberblock.ub_txg; + spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp; + + loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts; + VERIFY(nvlist_add_uint64(spa->spa_load_info, + ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0); + VERIFY(nvlist_add_int64(spa->spa_load_info, + ZPOOL_CONFIG_REWIND_TIME, loss) == 0); + VERIFY(nvlist_add_uint64(spa->spa_load_info, + ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0); + } else { + spa->spa_load_max_txg = spa->spa_uberblock.ub_txg; + } + + if (spa_load_verify_dryrun) + return (0); + + if (error) { + if (error != ENXIO && error != EIO) + error = SET_ERROR(EIO); + return (error); + } + + return (verify_ok ? 0 : EIO); +} + +/* + * Find a value in the pool props object. + */ +static void +spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val) +{ + (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object, + zpool_prop_to_name(prop), sizeof (uint64_t), 1, val); +} + +/* + * Find a value in the pool directory object. 
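+ *
+ * Typical call pattern from the load path below (illustrative):
+ *
+ *	uint64_t obj;
+ *
+ *	if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj, B_TRUE) != 0)
+ *		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));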
+ */ +static int +spa_dir_prop(spa_t *spa, const char *name, uint64_t *val, boolean_t log_enoent) +{ + int error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + name, sizeof (uint64_t), 1, val); + + if (error != 0 && (error != ENOENT || log_enoent)) { + spa_load_failed(spa, "couldn't get '%s' value in MOS directory " + "[error=%d]", name, error); + } + + return (error); +} + +static int +spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err) +{ + vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux); + return (SET_ERROR(err)); +} + +static void +spa_spawn_aux_threads(spa_t *spa) +{ + ASSERT(spa_writeable(spa)); + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + spa_start_indirect_condensing_thread(spa); + + ASSERT3P(spa->spa_checkpoint_discard_zthr, ==, NULL); + spa->spa_checkpoint_discard_zthr = + zthr_create(spa_checkpoint_discard_thread_check, + spa_checkpoint_discard_thread, spa); +} + +/* + * Fix up config after a partly-completed split. This is done with the + * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off + * pool have that entry in their config, but only the splitting one contains + * a list of all the guids of the vdevs that are being split off. + * + * This function determines what to do with that list: either rejoin + * all the disks to the pool, or complete the splitting process. To attempt + * the rejoin, each disk that is offlined is marked online again, and + * we do a reopen() call. If the vdev label for every disk that was + * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL) + * then we call vdev_split() on each disk, and complete the split. + * + * Otherwise we leave the config alone, with all the vdevs in place in + * the original pool. + */ +static void +spa_try_repair(spa_t *spa, nvlist_t *config) +{ + uint_t extracted; + uint64_t *glist; + uint_t i, gcount; + nvlist_t *nvl; + vdev_t **vd; + boolean_t attempt_reopen; + + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0) + return; + + /* check that the config is complete */ + if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, + &glist, &gcount) != 0) + return; + + vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP); + + /* attempt to online all the vdevs & validate */ + attempt_reopen = B_TRUE; + for (i = 0; i < gcount; i++) { + if (glist[i] == 0) /* vdev is hole */ + continue; + + vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE); + if (vd[i] == NULL) { + /* + * Don't bother attempting to reopen the disks; + * just do the split. + */ + attempt_reopen = B_FALSE; + } else { + /* attempt to re-online it */ + vd[i]->vdev_offline = B_FALSE; + } + } + + if (attempt_reopen) { + vdev_reopen(spa->spa_root_vdev); + + /* check each device to see what state it's in */ + for (extracted = 0, i = 0; i < gcount; i++) { + if (vd[i] != NULL && + vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL) + break; + ++extracted; + } + } + + /* + * If every disk has been moved to the new pool, or if we never + * even attempted to look at them, then we split them off for + * good. 
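+	 * (Illustrative note: vdev_split() below detaches the vdev from
+	 * this pool's configuration so that the split-off pool can claim
+	 * it.)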
+ */ + if (!attempt_reopen || gcount == extracted) { + for (i = 0; i < gcount; i++) + if (vd[i] != NULL) + vdev_split(vd[i]); + vdev_reopen(spa->spa_root_vdev); + } + + kmem_free(vd, gcount * sizeof (vdev_t *)); +} + +static int +spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type) +{ + char *ereport = FM_EREPORT_ZFS_POOL; + int error; + + spa->spa_load_state = state; + + gethrestime(&spa->spa_loaded_ts); + error = spa_load_impl(spa, type, &ereport); + + /* + * Don't count references from objsets that are already closed + * and are making their way through the eviction process. + */ + spa_evicting_os_wait(spa); + spa->spa_minref = refcount_count(&spa->spa_refcount); + if (error) { + if (error != EEXIST) { + spa->spa_loaded_ts.tv_sec = 0; + spa->spa_loaded_ts.tv_nsec = 0; + } + if (error != EBADF) { + zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0); + } + } + spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; + spa->spa_ena = 0; + + return (error); +} + +/* + * Count the number of per-vdev ZAPs associated with all of the vdevs in the + * vdev tree rooted in the given vd, and ensure that each ZAP is present in the + * spa's per-vdev ZAP list. + */ +static uint64_t +vdev_count_verify_zaps(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + uint64_t total = 0; + if (vd->vdev_top_zap != 0) { + total++; + ASSERT0(zap_lookup_int(spa->spa_meta_objset, + spa->spa_all_vdev_zaps, vd->vdev_top_zap)); + } + if (vd->vdev_leaf_zap != 0) { + total++; + ASSERT0(zap_lookup_int(spa->spa_meta_objset, + spa->spa_all_vdev_zaps, vd->vdev_leaf_zap)); + } + + for (uint64_t i = 0; i < vd->vdev_children; i++) { + total += vdev_count_verify_zaps(vd->vdev_child[i]); + } + + return (total); +} + +static int +spa_verify_host(spa_t *spa, nvlist_t *mos_config) +{ + uint64_t hostid; + char *hostname; + uint64_t myhostid = 0; + + if (!spa_is_root(spa) && nvlist_lookup_uint64(mos_config, + ZPOOL_CONFIG_HOSTID, &hostid) == 0) { + hostname = fnvlist_lookup_string(mos_config, + ZPOOL_CONFIG_HOSTNAME); + + myhostid = zone_get_hostid(NULL); + + if (hostid != 0 && myhostid != 0 && hostid != myhostid) { + cmn_err(CE_WARN, "pool '%s' could not be " + "loaded as it was last accessed by " + "another system (host: %s hostid: 0x%llx). " + "See: http://illumos.org/msg/ZFS-8000-EY", + spa_name(spa), hostname, (u_longlong_t)hostid); + spa_load_failed(spa, "hostid verification failed: pool " + "last accessed by host: %s (hostid: 0x%llx)", + hostname, (u_longlong_t)hostid); + return (SET_ERROR(EBADF)); + } + } + + return (0); +} + +static int +spa_ld_parse_config(spa_t *spa, spa_import_type_t type) +{ + int error = 0; + nvlist_t *nvtree, *nvl, *config = spa->spa_config; + int parse; + vdev_t *rvd; + uint64_t pool_guid; + char *comment; + + /* + * Versioning wasn't explicitly added to the label until later, so if + * it's not present treat it as the initial version. + */ + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, + &spa->spa_ubsync.ub_version) != 0) + spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; + + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) { + spa_load_failed(spa, "invalid config provided: '%s' missing", + ZPOOL_CONFIG_POOL_GUID); + return (SET_ERROR(EINVAL)); + } + + /* + * If we are doing an import, ensure that the pool is not already + * imported by checking if its pool guid already exists in the + * spa namespace. 
+ * + * The only case that we allow an already imported pool to be + * imported again, is when the pool is checkpointed and we want to + * look at its checkpointed state from userland tools like zdb. + */ +#ifdef _KERNEL + if ((spa->spa_load_state == SPA_LOAD_IMPORT || + spa->spa_load_state == SPA_LOAD_TRYIMPORT) && + spa_guid_exists(pool_guid, 0)) { +#else + if ((spa->spa_load_state == SPA_LOAD_IMPORT || + spa->spa_load_state == SPA_LOAD_TRYIMPORT) && + spa_guid_exists(pool_guid, 0) && + !spa_importing_readonly_checkpoint(spa)) { +#endif + spa_load_failed(spa, "a pool with guid %llu is already open", + (u_longlong_t)pool_guid); + return (SET_ERROR(EEXIST)); + } + + spa->spa_config_guid = pool_guid; + + nvlist_free(spa->spa_load_info); + spa->spa_load_info = fnvlist_alloc(); + + ASSERT(spa->spa_comment == NULL); + if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) + spa->spa_comment = spa_strdup(comment); + + (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, + &spa->spa_config_txg); + + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) == 0) + spa->spa_config_splitting = fnvlist_dup(nvl); + + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtree)) { + spa_load_failed(spa, "invalid config provided: '%s' missing", + ZPOOL_CONFIG_VDEV_TREE); + return (SET_ERROR(EINVAL)); + } + + /* + * Create "The Godfather" zio to hold all async IOs + */ + spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), + KM_SLEEP); + for (int i = 0; i < max_ncpus; i++) { + spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | + ZIO_FLAG_GODFATHER); + } + + /* + * Parse the configuration into a vdev tree. We explicitly set the + * value that will be returned by spa_version() since parsing the + * configuration requires knowing the version number. + */ + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + parse = (type == SPA_IMPORT_EXISTING ? + VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); + error = spa_config_parse(spa, &rvd, nvtree, NULL, 0, parse); + spa_config_exit(spa, SCL_ALL, FTAG); + + if (error != 0) { + spa_load_failed(spa, "unable to parse config [error=%d]", + error); + return (error); + } + + ASSERT(spa->spa_root_vdev == rvd); + ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT); + ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT); + + if (type != SPA_IMPORT_ASSEMBLE) { + ASSERT(spa_guid(spa) == pool_guid); + } + + return (0); +} + +/* + * Recursively open all vdevs in the vdev tree. This function is called twice: + * first with the untrusted config, then with the trusted config. + */ +static int +spa_ld_open_vdevs(spa_t *spa) +{ + int error = 0; + + /* + * spa_missing_tvds_allowed defines how many top-level vdevs can be + * missing/unopenable for the root vdev to be still considered openable. 
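+	 * The limit is derived below from how trustworthy the config source
+	 * is (illustrative summary): a trusted config allows
+	 * zfs_max_missing_tvds, a cachefile allows
+	 * zfs_max_missing_tvds_cachefile, a device scan allows
+	 * zfs_max_missing_tvds_scan, and any other source allows none.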
+ */ + if (spa->spa_trust_config) { + spa->spa_missing_tvds_allowed = zfs_max_missing_tvds; + } else if (spa->spa_config_source == SPA_CONFIG_SRC_CACHEFILE) { + spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_cachefile; + } else if (spa->spa_config_source == SPA_CONFIG_SRC_SCAN) { + spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_scan; + } else { + spa->spa_missing_tvds_allowed = 0; + } + + spa->spa_missing_tvds_allowed = + MAX(zfs_max_missing_tvds, spa->spa_missing_tvds_allowed); + + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + error = vdev_open(spa->spa_root_vdev); + spa_config_exit(spa, SCL_ALL, FTAG); + + if (spa->spa_missing_tvds != 0) { + spa_load_note(spa, "vdev tree has %lld missing top-level " + "vdevs.", (u_longlong_t)spa->spa_missing_tvds); + if (spa->spa_trust_config && (spa->spa_mode & FWRITE)) { + /* + * Although theoretically we could allow users to open + * incomplete pools in RW mode, we'd need to add a lot + * of extra logic (e.g. adjust pool space to account + * for missing vdevs). + * This limitation also prevents users from accidentally + * opening the pool in RW mode during data recovery and + * damaging it further. + */ + spa_load_note(spa, "pools with missing top-level " + "vdevs can only be opened in read-only mode."); + error = SET_ERROR(ENXIO); + } else { + spa_load_note(spa, "current settings allow for maximum " + "%lld missing top-level vdevs at this stage.", + (u_longlong_t)spa->spa_missing_tvds_allowed); + } + } + if (error != 0) { + spa_load_failed(spa, "unable to open vdev tree [error=%d]", + error); + } + if (spa->spa_missing_tvds != 0 || error != 0) + vdev_dbgmsg_print_tree(spa->spa_root_vdev, 2); + + return (error); +} + +/* + * We need to validate the vdev labels against the configuration that + * we have in hand. This function is called twice: first with an untrusted + * config, then with a trusted config. The validation is more strict when the + * config is trusted. + */ +static int +spa_ld_validate_vdevs(spa_t *spa) +{ + int error = 0; + vdev_t *rvd = spa->spa_root_vdev; + + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + error = vdev_validate(rvd); + spa_config_exit(spa, SCL_ALL, FTAG); + + if (error != 0) { + spa_load_failed(spa, "vdev_validate failed [error=%d]", error); + return (error); + } + + if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { + spa_load_failed(spa, "cannot open vdev tree after invalidating " + "some vdevs"); + vdev_dbgmsg_print_tree(rvd, 2); + return (SET_ERROR(ENXIO)); + } + + return (0); +} + +static void +spa_ld_select_uberblock_done(spa_t *spa, uberblock_t *ub) +{ + spa->spa_state = POOL_STATE_ACTIVE; + spa->spa_ubsync = spa->spa_uberblock; + spa->spa_verify_min_txg = spa->spa_extreme_rewind ? + TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1; + spa->spa_first_txg = spa->spa_last_ubsync_txg ? + spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; + spa->spa_claim_max_txg = spa->spa_first_txg; + spa->spa_prev_software_version = ub->ub_software_version; +} + +static int +spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type) +{ + vdev_t *rvd = spa->spa_root_vdev; + nvlist_t *label; + uberblock_t *ub = &spa->spa_uberblock; + + /* + * If we are opening the checkpointed state of the pool by + * rewinding to it, at this point we will have written the + * checkpointed uberblock to the vdev labels, so searching + * the labels will find the right uberblock. However, if + * we are opening the checkpointed state read-only, we have + * not modified the labels. 
Therefore, we must ignore the + * labels and continue using the spa_uberblock that was set + * by spa_ld_checkpoint_rewind. + * + * Note that it would be fine to ignore the labels when + * rewinding (opening writeable) as well. However, if we + * crash just after writing the labels, we will end up + * searching the labels. Doing so in the common case means + * that this code path gets exercised normally, rather than + * just in the edge case. + */ + if (ub->ub_checkpoint_txg != 0 && + spa_importing_readonly_checkpoint(spa)) { + spa_ld_select_uberblock_done(spa, ub); + return (0); + } + + /* + * Find the best uberblock. + */ + vdev_uberblock_load(rvd, ub, &label); + + /* + * If we weren't able to find a single valid uberblock, return failure. + */ + if (ub->ub_txg == 0) { + nvlist_free(label); + spa_load_failed(spa, "no valid uberblock found"); + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); + } + + spa_load_note(spa, "using uberblock with txg=%llu", + (u_longlong_t)ub->ub_txg); + + /* + * If the pool has an unsupported version we can't open it. + */ + if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) { + nvlist_free(label); + spa_load_failed(spa, "version %llu is not supported", + (u_longlong_t)ub->ub_version); + return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); + } + + if (ub->ub_version >= SPA_VERSION_FEATURES) { + nvlist_t *features; + + /* + * If we weren't able to find what's necessary for reading the + * MOS in the label, return failure. + */ + if (label == NULL) { + spa_load_failed(spa, "label config unavailable"); + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, + ENXIO)); + } + + if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_FEATURES_FOR_READ, + &features) != 0) { + nvlist_free(label); + spa_load_failed(spa, "invalid label: '%s' missing", + ZPOOL_CONFIG_FEATURES_FOR_READ); + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, + ENXIO)); + } + + /* + * Update our in-core representation with the definitive values + * from the label. + */ + nvlist_free(spa->spa_label_features); + VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0); + } + + nvlist_free(label); + + /* + * Look through entries in the label nvlist's features_for_read. If + * there is a feature listed there which we don't understand then we + * cannot open a pool. + */ + if (ub->ub_version >= SPA_VERSION_FEATURES) { + nvlist_t *unsup_feat; + + VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) == + 0); + + for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features, + NULL); nvp != NULL; + nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) { + if (!zfeature_is_supported(nvpair_name(nvp))) { + VERIFY(nvlist_add_string(unsup_feat, + nvpair_name(nvp), "") == 0); + } + } + + if (!nvlist_empty(unsup_feat)) { + VERIFY(nvlist_add_nvlist(spa->spa_load_info, + ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0); + nvlist_free(unsup_feat); + spa_load_failed(spa, "some features are unsupported"); + return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, + ENOTSUP)); + } + + nvlist_free(unsup_feat); + } + + if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) { + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + spa_try_repair(spa, spa->spa_config); + spa_config_exit(spa, SCL_ALL, FTAG); + nvlist_free(spa->spa_config_splitting); + spa->spa_config_splitting = NULL; + } + + /* + * Initialize internal SPA structures. 
+ */ + spa_ld_select_uberblock_done(spa, ub); + + return (0); +} + +static int +spa_ld_open_rootbp(spa_t *spa) +{ + int error = 0; + vdev_t *rvd = spa->spa_root_vdev; + + error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool); + if (error != 0) { + spa_load_failed(spa, "unable to open rootbp in dsl_pool_init " + "[error=%d]", error); + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + } + spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; + + return (0); +} + +static int +spa_ld_trusted_config(spa_t *spa, spa_import_type_t type, + boolean_t reloading) +{ + vdev_t *mrvd, *rvd = spa->spa_root_vdev; + nvlist_t *nv, *mos_config, *policy; + int error = 0, copy_error; + uint64_t healthy_tvds, healthy_tvds_mos; + uint64_t mos_config_txg; + + if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object, B_TRUE) + != 0) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + + /* + * If we're assembling a pool from a split, the config provided is + * already trusted so there is nothing to do. + */ + if (type == SPA_IMPORT_ASSEMBLE) + return (0); + + healthy_tvds = spa_healthy_core_tvds(spa); + + if (load_nvlist(spa, spa->spa_config_object, &mos_config) + != 0) { + spa_load_failed(spa, "unable to retrieve MOS config"); + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + } + + /* + * If we are doing an open, pool owner wasn't verified yet, thus do + * the verification here. + */ + if (spa->spa_load_state == SPA_LOAD_OPEN) { + error = spa_verify_host(spa, mos_config); + if (error != 0) { + nvlist_free(mos_config); + return (error); + } + } + + nv = fnvlist_lookup_nvlist(mos_config, ZPOOL_CONFIG_VDEV_TREE); + + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + + /* + * Build a new vdev tree from the trusted config + */ + VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0); + + /* + * Vdev paths in the MOS may be obsolete. If the untrusted config was + * obtained by scanning /dev/dsk, then it will have the right vdev + * paths. We update the trusted MOS config with this information. + * We first try to copy the paths with vdev_copy_path_strict, which + * succeeds only when both configs have exactly the same vdev tree. + * If that fails, we fall back to a more flexible method that has a + * best effort policy. + */ + copy_error = vdev_copy_path_strict(rvd, mrvd); + if (copy_error != 0 || spa_load_print_vdev_tree) { + spa_load_note(spa, "provided vdev tree:"); + vdev_dbgmsg_print_tree(rvd, 2); + spa_load_note(spa, "MOS vdev tree:"); + vdev_dbgmsg_print_tree(mrvd, 2); + } + if (copy_error != 0) { + spa_load_note(spa, "vdev_copy_path_strict failed, falling " + "back to vdev_copy_path_relaxed"); + vdev_copy_path_relaxed(rvd, mrvd); + } + + vdev_close(rvd); + vdev_free(rvd); + spa->spa_root_vdev = mrvd; + rvd = mrvd; + spa_config_exit(spa, SCL_ALL, FTAG); + + /* + * We will use spa_config if we decide to reload the spa or if spa_load + * fails and we rewind. We must thus regenerate the config using the + * MOS information with the updated paths. ZPOOL_LOAD_POLICY is used to + * pass settings on how to load the pool and is not stored in the MOS. + * We copy it over to our new, trusted config. 
+ */ + mos_config_txg = fnvlist_lookup_uint64(mos_config, + ZPOOL_CONFIG_POOL_TXG); + nvlist_free(mos_config); + mos_config = spa_config_generate(spa, NULL, mos_config_txg, B_FALSE); + if (nvlist_lookup_nvlist(spa->spa_config, ZPOOL_LOAD_POLICY, + &policy) == 0) + fnvlist_add_nvlist(mos_config, ZPOOL_LOAD_POLICY, policy); + spa_config_set(spa, mos_config); + spa->spa_config_source = SPA_CONFIG_SRC_MOS; + + /* + * Now that we got the config from the MOS, we should be more strict + * in checking blkptrs and can make assumptions about the consistency + * of the vdev tree. spa_trust_config must be set to true before opening + * vdevs in order for them to be writeable. + */ + spa->spa_trust_config = B_TRUE; + + /* + * Open and validate the new vdev tree + */ + error = spa_ld_open_vdevs(spa); + if (error != 0) + return (error); + + error = spa_ld_validate_vdevs(spa); + if (error != 0) + return (error); + + if (copy_error != 0 || spa_load_print_vdev_tree) { + spa_load_note(spa, "final vdev tree:"); + vdev_dbgmsg_print_tree(rvd, 2); + } + + if (spa->spa_load_state != SPA_LOAD_TRYIMPORT && + !spa->spa_extreme_rewind && zfs_max_missing_tvds == 0) { + /* + * Sanity check to make sure that we are indeed loading the + * latest uberblock. If we missed SPA_SYNC_MIN_VDEVS tvds + * in the config provided and they happened to be the only ones + * to have the latest uberblock, we could involuntarily perform + * an extreme rewind. + */ + healthy_tvds_mos = spa_healthy_core_tvds(spa); + if (healthy_tvds_mos - healthy_tvds >= + SPA_SYNC_MIN_VDEVS) { + spa_load_note(spa, "config provided misses too many " + "top-level vdevs compared to MOS (%lld vs %lld). ", + (u_longlong_t)healthy_tvds, + (u_longlong_t)healthy_tvds_mos); + spa_load_note(spa, "vdev tree:"); + vdev_dbgmsg_print_tree(rvd, 2); + if (reloading) { + spa_load_failed(spa, "config was already " + "provided from MOS. Aborting."); + return (spa_vdev_err(rvd, + VDEV_AUX_CORRUPT_DATA, EIO)); + } + spa_load_note(spa, "spa must be reloaded using MOS " + "config"); + return (SET_ERROR(EAGAIN)); + } + } + + error = spa_check_for_missing_logs(spa); + if (error != 0) + return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); + + if (rvd->vdev_guid_sum != spa->spa_uberblock.ub_guid_sum) { + spa_load_failed(spa, "uberblock guid sum doesn't match MOS " + "guid sum (%llu != %llu)", + (u_longlong_t)spa->spa_uberblock.ub_guid_sum, + (u_longlong_t)rvd->vdev_guid_sum); + return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, + ENXIO)); + } + + return (0); +} + +static int +spa_ld_open_indirect_vdev_metadata(spa_t *spa) +{ + int error = 0; + vdev_t *rvd = spa->spa_root_vdev; + + /* + * Everything that we read before spa_remove_init() must be stored + * on concreted vdevs. Therefore we do this as early as possible. + */ + error = spa_remove_init(spa); + if (error != 0) { + spa_load_failed(spa, "spa_remove_init failed [error=%d]", + error); + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + } + + /* + * Retrieve information needed to condense indirect vdev mappings. 
+ */ + error = spa_condense_init(spa); + if (error != 0) { + spa_load_failed(spa, "spa_condense_init failed [error=%d]", + error); + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); + } + + return (0); +} + +static int +spa_ld_check_features(spa_t *spa, boolean_t *missing_feat_writep) +{ + int error = 0; + vdev_t *rvd = spa->spa_root_vdev; + + if (spa_version(spa) >= SPA_VERSION_FEATURES) { + boolean_t missing_feat_read = B_FALSE; + nvlist_t *unsup_feat, *enabled_feat; + + if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ, + &spa->spa_feat_for_read_obj, B_TRUE) != 0) { + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + } + + if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE, + &spa->spa_feat_for_write_obj, B_TRUE) != 0) { + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + } + + if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS, + &spa->spa_feat_desc_obj, B_TRUE) != 0) { + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + } + + enabled_feat = fnvlist_alloc(); + unsup_feat = fnvlist_alloc(); + + if (!spa_features_check(spa, B_FALSE, + unsup_feat, enabled_feat)) + missing_feat_read = B_TRUE; + + if (spa_writeable(spa) || + spa->spa_load_state == SPA_LOAD_TRYIMPORT) { + if (!spa_features_check(spa, B_TRUE, + unsup_feat, enabled_feat)) { + *missing_feat_writep = B_TRUE; + } + } + + fnvlist_add_nvlist(spa->spa_load_info, + ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat); + + if (!nvlist_empty(unsup_feat)) { + fnvlist_add_nvlist(spa->spa_load_info, + ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat); + } + + fnvlist_free(enabled_feat); + fnvlist_free(unsup_feat); + + if (!missing_feat_read) { + fnvlist_add_boolean(spa->spa_load_info, + ZPOOL_CONFIG_CAN_RDONLY); + } + + /* + * If the state is SPA_LOAD_TRYIMPORT, our objective is + * twofold: to determine whether the pool is available for + * import in read-write mode and (if it is not) whether the + * pool is available for import in read-only mode. If the pool + * is available for import in read-write mode, it is displayed + * as available in userland; if it is not available for import + * in read-only mode, it is displayed as unavailable in + * userland. If the pool is available for import in read-only + * mode but not read-write mode, it is displayed as unavailable + * in userland with a special note that the pool is actually + * available for open in read-only mode. + * + * As a result, if the state is SPA_LOAD_TRYIMPORT and we are + * missing a feature for write, we must first determine whether + * the pool can be opened read-only before returning to + * userland in order to know whether to display the + * abovementioned note. + */ + if (missing_feat_read || (*missing_feat_writep && + spa_writeable(spa))) { + spa_load_failed(spa, "pool uses unsupported features"); + return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, + ENOTSUP)); + } + + /* + * Load refcounts for ZFS features from disk into an in-memory + * cache during SPA initialization. 
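+		 * (Illustrative note: once the cache is populated, later
+		 * queries such as spa_feature_is_active() can be answered
+		 * from spa_feat_refcount_cache[] without another ZAP
+		 * lookup.)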
+ */ + for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { + uint64_t refcount; + + error = feature_get_refcount_from_disk(spa, + &spa_feature_table[i], &refcount); + if (error == 0) { + spa->spa_feat_refcount_cache[i] = refcount; + } else if (error == ENOTSUP) { + spa->spa_feat_refcount_cache[i] = + SPA_FEATURE_DISABLED; + } else { + spa_load_failed(spa, "error getting refcount " + "for feature %s [error=%d]", + spa_feature_table[i].fi_guid, error); + return (spa_vdev_err(rvd, + VDEV_AUX_CORRUPT_DATA, EIO)); + } + } + } + + if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) { + if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG, + &spa->spa_feat_enabled_txg_obj, B_TRUE) != 0) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + } + + return (0); +} + +static int +spa_ld_load_special_directories(spa_t *spa) +{ + int error = 0; + vdev_t *rvd = spa->spa_root_vdev; + + spa->spa_is_initializing = B_TRUE; + error = dsl_pool_open(spa->spa_dsl_pool); + spa->spa_is_initializing = B_FALSE; + if (error != 0) { + spa_load_failed(spa, "dsl_pool_open failed [error=%d]", error); + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + } + + return (0); +} + +static int +spa_ld_get_props(spa_t *spa) +{ + int error = 0; + uint64_t obj; + vdev_t *rvd = spa->spa_root_vdev; + + /* Grab the secret checksum salt from the MOS. */ + error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_CHECKSUM_SALT, 1, + sizeof (spa->spa_cksum_salt.zcs_bytes), + spa->spa_cksum_salt.zcs_bytes); + if (error == ENOENT) { + /* Generate a new salt for subsequent use */ + (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, + sizeof (spa->spa_cksum_salt.zcs_bytes)); + } else if (error != 0) { + spa_load_failed(spa, "unable to retrieve checksum salt from " + "MOS [error=%d]", error); + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + } + + if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj, B_TRUE) != 0) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj); + if (error != 0) { + spa_load_failed(spa, "error opening deferred-frees bpobj " + "[error=%d]", error); + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + } + + /* + * Load the bit that tells us to use the new accounting function + * (raid-z deflation). If we have an older pool, this will not + * be present. + */ + error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate, B_FALSE); + if (error != 0 && error != ENOENT) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + + error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION, + &spa->spa_creation_version, B_FALSE); + if (error != 0 && error != ENOENT) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + + /* + * Load the persistent error log. If we have an older pool, this will + * not be present. + */ + error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last, + B_FALSE); + if (error != 0 && error != ENOENT) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + + error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB, + &spa->spa_errlog_scrub, B_FALSE); + if (error != 0 && error != ENOENT) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + + /* + * Load the history object. If we have an older pool, this + * will not be present. + */ + error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history, B_FALSE); + if (error != 0 && error != ENOENT) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + + /* + * Load the per-vdev ZAP map. 
If we have an older pool, this will not + * be present; in this case, defer its creation to a later time to + * avoid dirtying the MOS this early / out of sync context. See + * spa_sync_config_object. + */ + + /* The sentinel is only available in the MOS config. */ + nvlist_t *mos_config; + if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) { + spa_load_failed(spa, "unable to retrieve MOS config"); + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + } + + error = spa_dir_prop(spa, DMU_POOL_VDEV_ZAP_MAP, + &spa->spa_all_vdev_zaps, B_FALSE); + + if (error == ENOENT) { + VERIFY(!nvlist_exists(mos_config, + ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); + spa->spa_avz_action = AVZ_ACTION_INITIALIZE; + ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); + } else if (error != 0) { + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + } else if (!nvlist_exists(mos_config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)) { + /* + * An older version of ZFS overwrote the sentinel value, so + * we have orphaned per-vdev ZAPs in the MOS. Defer their + * destruction to later; see spa_sync_config_object. + */ + spa->spa_avz_action = AVZ_ACTION_DESTROY; + /* + * We're assuming that no vdevs have had their ZAPs created + * before this. Better be sure of it. + */ + ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); + } + nvlist_free(mos_config); + + spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); + + error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object, + B_FALSE); + if (error && error != ENOENT) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + + if (error == 0) { + uint64_t autoreplace; + + spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); + spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); + spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); + spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); + spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); + spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO, + &spa->spa_dedup_ditto); + + spa->spa_autoreplace = (autoreplace != 0); + } + + /* + * If we are importing a pool with missing top-level vdevs, + * we enforce that the pool doesn't panic or get suspended on + * error since the likelihood of missing data is extremely high. + */ + if (spa->spa_missing_tvds > 0 && + spa->spa_failmode != ZIO_FAILURE_MODE_CONTINUE && + spa->spa_load_state != SPA_LOAD_TRYIMPORT) { + spa_load_note(spa, "forcing failmode to 'continue' " + "as some top level vdevs are missing"); + spa->spa_failmode = ZIO_FAILURE_MODE_CONTINUE; + } + + return (0); +} + +static int +spa_ld_open_aux_vdevs(spa_t *spa, spa_import_type_t type) +{ + int error = 0; + vdev_t *rvd = spa->spa_root_vdev; + + /* + * If we're assembling the pool from the split-off vdevs of + * an existing pool, we don't want to attach the spares & cache + * devices. + */ + + /* + * Load any hot spares for this pool. 
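+ *
+ * Both aux device classes below follow the same two-step MOS idiom
+ * used throughout this file: look up an object number by name in the
+ * pool directory, then unpack the packed nvlist stored in that
+ * object. Schematically (error handling elided, illustration only):
+ *
+ *     uint64_t obj;
+ *     nvlist_t *nvl;
+ *
+ *     (void) spa_dir_prop(spa, DMU_POOL_SPARES, &obj, B_FALSE);
+ *     (void) load_nvlist(spa, obj, &nvl);
+ *     ... consume nvl, then nvlist_free(nvl) ...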
+ */ + error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object, + B_FALSE); + if (error != 0 && error != ENOENT) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { + ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); + if (load_nvlist(spa, spa->spa_spares.sav_object, + &spa->spa_spares.sav_config) != 0) { + spa_load_failed(spa, "error loading spares nvlist"); + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + } + + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + spa_load_spares(spa); + spa_config_exit(spa, SCL_ALL, FTAG); + } else if (error == 0) { + spa->spa_spares.sav_sync = B_TRUE; + } + + /* + * Load any level 2 ARC devices for this pool. + */ + error = spa_dir_prop(spa, DMU_POOL_L2CACHE, + &spa->spa_l2cache.sav_object, B_FALSE); + if (error != 0 && error != ENOENT) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { + ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); + if (load_nvlist(spa, spa->spa_l2cache.sav_object, + &spa->spa_l2cache.sav_config) != 0) { + spa_load_failed(spa, "error loading l2cache nvlist"); + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + } + + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + spa_load_l2cache(spa); + spa_config_exit(spa, SCL_ALL, FTAG); + } else if (error == 0) { + spa->spa_l2cache.sav_sync = B_TRUE; + } + + return (0); +} + +static int +spa_ld_load_vdev_metadata(spa_t *spa) +{ + int error = 0; + vdev_t *rvd = spa->spa_root_vdev; + + /* + * If the 'autoreplace' property is set, then post a resource notifying + * the ZFS DE that it should not issue any faults for unopenable + * devices. We also iterate over the vdevs, and post a sysevent for any + * unopenable vdevs so that the normal autoreplace handler can take + * over. + */ + if (spa->spa_autoreplace && spa->spa_load_state != SPA_LOAD_TRYIMPORT) { + spa_check_removed(spa->spa_root_vdev); + /* + * For the import case, this is done in spa_import(), because + * at this point we're using the spare definitions from + * the MOS config, not necessarily from the userland config. + */ + if (spa->spa_load_state != SPA_LOAD_IMPORT) { + spa_aux_check_removed(&spa->spa_spares); + spa_aux_check_removed(&spa->spa_l2cache); + } + } + + /* + * Load the vdev metadata such as metaslabs, DTLs, spacemap object, etc. + */ + error = vdev_load(rvd); + if (error != 0) { + spa_load_failed(spa, "vdev_load failed [error=%d]", error); + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); + } + + /* + * Propagate the leaf DTLs we just loaded all the way up the vdev tree. 
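+ *
+ * For intuition: vdev_dtl_reassess() recurses depth-first, so every
+ * parent recomputes its DTL only after all of its children have been
+ * visited. A stripped-down sketch of that traversal shape (the real
+ * function, called below, also takes txg and scrub arguments):
+ *
+ *     static void
+ *     walk(vdev_t *vd)
+ *     {
+ *         for (int c = 0; c < vd->vdev_children; c++)
+ *             walk(vd->vdev_child[c]);
+ *         ... fold the children's DTLs into vd ...
+ *     }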
+ */
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+
+ return (0);
+}
+
+static int
+spa_ld_load_dedup_tables(spa_t *spa)
+{
+ int error = 0;
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ error = ddt_load(spa);
+ if (error != 0) {
+ spa_load_failed(spa, "ddt_load failed [error=%d]", error);
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ }
+
+ return (0);
+}
+
+static int
+spa_ld_verify_logs(spa_t *spa, spa_import_type_t type, char **ereport)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ if (type != SPA_IMPORT_ASSEMBLE && spa_writeable(spa)) {
+ boolean_t missing = spa_check_logs(spa);
+ if (missing) {
+ if (spa->spa_missing_tvds != 0) {
+ spa_load_note(spa, "spa_check_logs failed "
+ "so dropping the logs");
+ } else {
+ *ereport = FM_EREPORT_ZFS_LOG_REPLAY;
+ spa_load_failed(spa, "spa_check_logs failed");
+ return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG,
+ ENXIO));
+ }
+ }
+ }
+
+ return (0);
+}
+
+static int
+spa_ld_verify_pool_data(spa_t *spa)
+{
+ int error = 0;
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ /*
+ * We've successfully opened the pool, verify that we're ready
+ * to start pushing transactions.
+ */
+ if (spa->spa_load_state != SPA_LOAD_TRYIMPORT) {
+ error = spa_load_verify(spa);
+ if (error != 0) {
+ spa_load_failed(spa, "spa_load_verify failed "
+ "[error=%d]", error);
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
+ error));
+ }
+ }
+
+ return (0);
+}
+
+static void
+spa_ld_claim_log_blocks(spa_t *spa)
+{
+ dmu_tx_t *tx;
+ dsl_pool_t *dp = spa_get_dsl(spa);
+
+ /*
+ * Claim log blocks that haven't been committed yet.
+ * This must all happen in a single txg.
+ * Note: spa_claim_max_txg is updated by spa_claim_notify(),
+ * invoked from zil_claim_log_block()'s i/o done callback.
+ * Price of rollback is that we abandon the log.
+ */
+ spa->spa_claiming = B_TRUE;
+
+ tx = dmu_tx_create_assigned(dp, spa_first_txg(spa));
+ (void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
+ zil_claim, tx, DS_FIND_CHILDREN);
+ dmu_tx_commit(tx);
+
+ spa->spa_claiming = B_FALSE;
+
+ spa_set_log_state(spa, SPA_LOG_GOOD);
+}
+
+static void
+spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg,
+ boolean_t update_config_cache)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+ int need_update = B_FALSE;
+
+ /*
+ * If the config cache is stale, or we have uninitialized
+ * metaslabs (see spa_vdev_add()), then update the config.
+ *
+ * If this is a verbatim import, trust the current
+ * in-core spa_config and update the disk labels.
+ */
+ if (update_config_cache || config_cache_txg != spa->spa_config_txg ||
+ spa->spa_load_state == SPA_LOAD_IMPORT ||
+ spa->spa_load_state == SPA_LOAD_RECOVER ||
+ (spa->spa_import_flags & ZFS_IMPORT_VERBATIM))
+ need_update = B_TRUE;
+
+ for (int c = 0; c < rvd->vdev_children; c++)
+ if (rvd->vdev_child[c]->vdev_ms_array == 0)
+ need_update = B_TRUE;
+
+ /*
+ * Update the config cache asynchronously in case we're the
+ * root pool, in which case the config cache isn't writable yet.
+ */
+ if (need_update)
+ spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
+}
+
+static void
+spa_ld_prepare_for_reload(spa_t *spa)
+{
+ int mode = spa->spa_mode;
+ int async_suspended = spa->spa_async_suspended;
+
+ spa_unload(spa);
+ spa_deactivate(spa);
+ spa_activate(spa, mode);
+
+ /*
+ * We save the value of spa_async_suspended as it gets reset to 0 by
+ * spa_unload(). We want to restore it to its original value before
+ * returning as we might be calling spa_async_resume() later.
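+ *
+ * (Callers pair this helper with a fresh spa_ld_mos_init() pass; see
+ * spa_ld_mos_with_trusted_config() and the checkpoint-rewind path in
+ * spa_load_impl() below.)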
+ */ + spa->spa_async_suspended = async_suspended; +} + +static int +spa_ld_read_checkpoint_txg(spa_t *spa) +{ + uberblock_t checkpoint; + int error = 0; + + ASSERT0(spa->spa_checkpoint_txg); + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), + sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint); + + if (error == ENOENT) + return (0); + + if (error != 0) + return (error); + + ASSERT3U(checkpoint.ub_txg, !=, 0); + ASSERT3U(checkpoint.ub_checkpoint_txg, !=, 0); + ASSERT3U(checkpoint.ub_timestamp, !=, 0); + spa->spa_checkpoint_txg = checkpoint.ub_txg; + spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp; + + return (0); +} + +static int +spa_ld_mos_init(spa_t *spa, spa_import_type_t type) +{ + int error = 0; + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE); + + /* + * Never trust the config that is provided unless we are assembling + * a pool following a split. + * This means don't trust blkptrs and the vdev tree in general. This + * also effectively puts the spa in read-only mode since + * spa_writeable() checks for spa_trust_config to be true. + * We will later load a trusted config from the MOS. + */ + if (type != SPA_IMPORT_ASSEMBLE) + spa->spa_trust_config = B_FALSE; + + /* + * Parse the config provided to create a vdev tree. + */ + error = spa_ld_parse_config(spa, type); + if (error != 0) + return (error); + + /* + * Now that we have the vdev tree, try to open each vdev. This involves + * opening the underlying physical device, retrieving its geometry and + * probing the vdev with a dummy I/O. The state of each vdev will be set + * based on the success of those operations. After this we'll be ready + * to read from the vdevs. + */ + error = spa_ld_open_vdevs(spa); + if (error != 0) + return (error); + + /* + * Read the label of each vdev and make sure that the GUIDs stored + * there match the GUIDs in the config provided. + * If we're assembling a new pool that's been split off from an + * existing pool, the labels haven't yet been updated so we skip + * validation for now. + */ + if (type != SPA_IMPORT_ASSEMBLE) { + error = spa_ld_validate_vdevs(spa); + if (error != 0) + return (error); + } + + /* + * Read all vdev labels to find the best uberblock (i.e. latest, + * unless spa_load_max_txg is set) and store it in spa_uberblock. We + * get the list of features required to read blkptrs in the MOS from + * the vdev label with the best uberblock and verify that our version + * of zfs supports them all. + */ + error = spa_ld_select_uberblock(spa, type); + if (error != 0) + return (error); + + /* + * Pass that uberblock to the dsl_pool layer which will open the root + * blkptr. This blkptr points to the latest version of the MOS and will + * allow us to read its contents. 
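+ *
+ * To summarize the steps above: spa_ld_mos_init() is a linear
+ * pipeline in which every stage either returns 0 or aborts the load
+ * with an spa_vdev_err() code:
+ *
+ *     parse config -> open vdevs -> validate labels
+ *         -> select uberblock -> open root blkptr (MOS)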
+ */
+ error = spa_ld_open_rootbp(spa);
+ if (error != 0)
+ return (error);
+
+ return (0);
+}
+
+static int
+spa_ld_checkpoint_rewind(spa_t *spa)
+{
+ uberblock_t checkpoint;
+ int error = 0;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
+
+ error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
+ sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint);
+
+ if (error != 0) {
+ spa_load_failed(spa, "unable to retrieve checkpointed "
+ "uberblock from the MOS config [error=%d]", error);
+
+ if (error == ENOENT)
+ error = ZFS_ERR_NO_CHECKPOINT;
+
+ return (error);
+ }
+
+ ASSERT3U(checkpoint.ub_txg, <, spa->spa_uberblock.ub_txg);
+ ASSERT3U(checkpoint.ub_txg, ==, checkpoint.ub_checkpoint_txg);
+
+ /*
+ * We need to update the txg and timestamp of the checkpointed
+ * uberblock to be higher than the latest one. This ensures that
+ * the checkpointed uberblock is selected if we were to close and
+ * reopen the pool right after we've written it in the vdev labels.
+ * (also see block comment in vdev_uberblock_compare)
+ */
+ checkpoint.ub_txg = spa->spa_uberblock.ub_txg + 1;
+ checkpoint.ub_timestamp = gethrestime_sec();
+
+ /*
+ * Set current uberblock to be the checkpointed uberblock.
+ */
+ spa->spa_uberblock = checkpoint;
+
+ /*
+ * If we are doing a normal rewind, then the pool is open for
+ * writing and we sync the "updated" checkpointed uberblock to
+ * disk. Once this is done, we've basically rewound the whole
+ * pool and there is no way back.
+ *
+ * There are cases when we don't want to attempt to sync the
+ * checkpointed uberblock to disk because we are opening a
+ * pool as read-only. Specifically, verifying the checkpointed
+ * state with zdb, and importing the checkpointed state to get
+ * a "preview" of its content.
+ */
+ if (spa_writeable(spa)) {
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL };
+ int svdcount = 0;
+ int children = rvd->vdev_children;
+ int c0 = spa_get_random(children);
+
+ for (int c = 0; c < children; c++) {
+ vdev_t *vd = rvd->vdev_child[(c0 + c) % children];
+
+ /* Stop when revisiting the first vdev */
+ if (c > 0 && svd[0] == vd)
+ break;
+
+ if (vd->vdev_ms_array == 0 || vd->vdev_islog ||
+ !vdev_is_concrete(vd))
+ continue;
+
+ svd[svdcount++] = vd;
+ if (svdcount == SPA_SYNC_MIN_VDEVS)
+ break;
+ }
+ error = vdev_config_sync(svd, svdcount, spa->spa_first_txg);
+ if (error == 0)
+ spa->spa_last_synced_guid = rvd->vdev_guid;
+ spa_config_exit(spa, SCL_ALL, FTAG);
+
+ if (error != 0) {
+ spa_load_failed(spa, "failed to write checkpointed "
+ "uberblock to the vdev labels [error=%d]", error);
+ return (error);
+ }
+ }
+
+ return (0);
+}
+
+static int
+spa_ld_mos_with_trusted_config(spa_t *spa, spa_import_type_t type,
+ boolean_t *update_config_cache)
+{
+ int error;
+
+ /*
+ * Parse the config for the pool, open and validate vdevs,
+ * select an uberblock, and use that uberblock to open
+ * the MOS.
+ */
+ error = spa_ld_mos_init(spa, type);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Retrieve the trusted config stored in the MOS and use it to create
+ * a new, exact version of the vdev tree, then reopen all vdevs.
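+ *
+ * (The B_FALSE argument below marks this as the initial pass, during
+ * which spa_ld_trusted_config() may return EAGAIN to request a full
+ * reload if the trusted config turns out to differ too much from the
+ * one we were given; that reload is handled right below.)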
+ */
+ error = spa_ld_trusted_config(spa, type, B_FALSE);
+ if (error == EAGAIN) {
+ if (update_config_cache != NULL)
+ *update_config_cache = B_TRUE;
+
+ /*
+ * Redo the loading process with the trusted config if it is
+ * too different from the untrusted config.
+ */
+ spa_ld_prepare_for_reload(spa);
+ spa_load_note(spa, "RELOADING");
+ error = spa_ld_mos_init(spa, type);
+ if (error != 0)
+ return (error);
+
+ error = spa_ld_trusted_config(spa, type, B_TRUE);
+ if (error != 0)
+ return (error);
+
+ } else if (error != 0) {
+ return (error);
+ }
+
+ return (0);
+}
+
+/*
+ * Load an existing storage pool, using the config provided. This config
+ * describes which vdevs are part of the pool and is later validated against
+ * partial configs present in each vdev's label and an entire copy of the
+ * config stored in the MOS.
+ */
+static int
+spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport)
+{
+ int error = 0;
+ boolean_t missing_feat_write = B_FALSE;
+ boolean_t checkpoint_rewind =
+ (spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
+ boolean_t update_config_cache = B_FALSE;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE);
+
+ spa_load_note(spa, "LOADING");
+
+ error = spa_ld_mos_with_trusted_config(spa, type, &update_config_cache);
+ if (error != 0)
+ return (error);
+
+ /*
+ * If we are rewinding to the checkpoint then we need to repeat
+ * everything we've done so far in this function but this time
+ * selecting the checkpointed uberblock and using that to open
+ * the MOS.
+ */
+ if (checkpoint_rewind) {
+ /*
+ * If we are rewinding to the checkpoint, update the config
+ * cache anyway.
+ */
+ update_config_cache = B_TRUE;
+
+ /*
+ * Extract the checkpointed uberblock from the current MOS
+ * and use this as the pool's uberblock from now on. If the
+ * pool is imported as writeable we also write the checkpoint
+ * uberblock to the labels, making the rewind permanent.
+ */
+ error = spa_ld_checkpoint_rewind(spa);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Redo the loading process again with the
+ * checkpointed uberblock.
+ */
+ spa_ld_prepare_for_reload(spa);
+ spa_load_note(spa, "LOADING checkpointed uberblock");
+ error = spa_ld_mos_with_trusted_config(spa, type, NULL);
+ if (error != 0)
+ return (error);
+ }
+
+ /*
+ * Retrieve the checkpoint txg if the pool has a checkpoint.
+ */
+ error = spa_ld_read_checkpoint_txg(spa);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Retrieve the mapping of indirect vdevs. Those vdevs were removed
+ * from the pool and their contents were re-mapped to other vdevs. Note
+ * that everything that we read before this step must have been
+ * rewritten on concrete vdevs after the last device removal was
+ * initiated. Otherwise we could be reading from indirect vdevs before
+ * we have loaded their mappings.
+ */
+ error = spa_ld_open_indirect_vdev_metadata(spa);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Retrieve the full list of active features from the MOS and check if
+ * they are all supported.
+ */
+ error = spa_ld_check_features(spa, &missing_feat_write);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Load several special directories from the MOS needed by the dsl_pool
+ * layer.
+ */
+ error = spa_ld_load_special_directories(spa);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Retrieve pool properties from the MOS.
+ */ + error = spa_ld_get_props(spa); + if (error != 0) + return (error); + + /* + * Retrieve the list of auxiliary devices - cache devices and spares - + * and open them. + */ + error = spa_ld_open_aux_vdevs(spa, type); + if (error != 0) + return (error); + + /* + * Load the metadata for all vdevs. Also check if unopenable devices + * should be autoreplaced. + */ + error = spa_ld_load_vdev_metadata(spa); + if (error != 0) + return (error); + + error = spa_ld_load_dedup_tables(spa); + if (error != 0) + return (error); + + /* + * Verify the logs now to make sure we don't have any unexpected errors + * when we claim log blocks later. + */ + error = spa_ld_verify_logs(spa, type, ereport); + if (error != 0) + return (error); + + if (missing_feat_write) { + ASSERT(spa->spa_load_state == SPA_LOAD_TRYIMPORT); + + /* + * At this point, we know that we can open the pool in + * read-only mode but not read-write mode. We now have enough + * information and can return to userland. + */ + return (spa_vdev_err(spa->spa_root_vdev, VDEV_AUX_UNSUP_FEAT, + ENOTSUP)); + } + + /* + * Traverse the last txgs to make sure the pool was left off in a safe + * state. When performing an extreme rewind, we verify the whole pool, + * which can take a very long time. + */ + error = spa_ld_verify_pool_data(spa); + if (error != 0) + return (error); + + /* + * Calculate the deflated space for the pool. This must be done before + * we write anything to the pool because we'd need to update the space + * accounting using the deflated sizes. + */ + spa_update_dspace(spa); + + /* + * We have now retrieved all the information we needed to open the + * pool. If we are importing the pool in read-write mode, a few + * additional steps must be performed to finish the import. + */ + if (spa_writeable(spa) && (spa->spa_load_state == SPA_LOAD_RECOVER || + spa->spa_load_max_txg == UINT64_MAX)) { + uint64_t config_cache_txg = spa->spa_config_txg; + + ASSERT(spa->spa_load_state != SPA_LOAD_TRYIMPORT); + + /* + * In case of a checkpoint rewind, log the original txg + * of the checkpointed uberblock. + */ + if (checkpoint_rewind) { + spa_history_log_internal(spa, "checkpoint rewind", + NULL, "rewound state to txg=%llu", + (u_longlong_t)spa->spa_uberblock.ub_checkpoint_txg); + } + + /* + * Traverse the ZIL and claim all blocks. + */ + spa_ld_claim_log_blocks(spa); + + /* + * Kick-off the syncing thread. + */ + spa->spa_sync_on = B_TRUE; + txg_sync_start(spa->spa_dsl_pool); + + /* + * Wait for all claims to sync. We sync up to the highest + * claimed log block birth time so that claimed log blocks + * don't appear to be from the future. spa_claim_max_txg + * will have been set for us by ZIL traversal operations + * performed above. + */ + txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg); + + /* + * Check if we need to request an update of the config. On the + * next sync, we would update the config stored in vdev labels + * and the cachefile (by default /etc/zfs/zpool.cache). + */ + spa_ld_check_for_config_update(spa, config_cache_txg, + update_config_cache); + + /* + * Check all DTLs to see if anything needs resilvering. + */ + if (!dsl_scan_resilvering(spa->spa_dsl_pool) && + vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) + spa_async_request(spa, SPA_ASYNC_RESILVER); + + /* + * Log the fact that we booted up (so that we can detect if + * we rebooted in the middle of an operation). + */ + spa_history_log_version(spa, "open"); + + /* + * Delete any inconsistent datasets. 
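+ *
+ * dmu_objset_find() applies the callback to every dataset under the
+ * pool; dsl_destroy_inconsistent() simply skips datasets that are
+ * not flagged inconsistent. The same iteration idiom, shown here
+ * with a hypothetical callback purely for illustration:
+ *
+ *     static int
+ *     visit_cb(const char *dsname, void *arg)
+ *     {
+ *         ... inspect dsname, return 0 to keep iterating ...
+ *     }
+ *
+ *     (void) dmu_objset_find(spa_name(spa), visit_cb, NULL,
+ *         DS_FIND_CHILDREN);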
+ */ + (void) dmu_objset_find(spa_name(spa), + dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); + + /* + * Clean up any stale temporary dataset userrefs. + */ + dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); + + spa_restart_removal(spa); + + spa_spawn_aux_threads(spa); + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + vdev_initialize_restart(spa->spa_root_vdev); + spa_config_exit(spa, SCL_CONFIG, FTAG); + } + + spa_load_note(spa, "LOADED"); + + return (0); +} + +static int +spa_load_retry(spa_t *spa, spa_load_state_t state) +{ + int mode = spa->spa_mode; + + spa_unload(spa); + spa_deactivate(spa); + + spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1; + + spa_activate(spa, mode); + spa_async_suspend(spa); + + spa_load_note(spa, "spa_load_retry: rewind, max txg: %llu", + (u_longlong_t)spa->spa_load_max_txg); + + return (spa_load(spa, state, SPA_IMPORT_EXISTING)); +} + +/* + * If spa_load() fails this function will try loading prior txg's. If + * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool + * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this + * function will not rewind the pool and will return the same error as + * spa_load(). + */ +static int +spa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request, + int rewind_flags) +{ + nvlist_t *loadinfo = NULL; + nvlist_t *config = NULL; + int load_error, rewind_error; + uint64_t safe_rewind_txg; + uint64_t min_txg; + + if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) { + spa->spa_load_max_txg = spa->spa_load_txg; + spa_set_log_state(spa, SPA_LOG_CLEAR); + } else { + spa->spa_load_max_txg = max_request; + if (max_request != UINT64_MAX) + spa->spa_extreme_rewind = B_TRUE; + } + + load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING); + if (load_error == 0) + return (0); + if (load_error == ZFS_ERR_NO_CHECKPOINT) { + /* + * When attempting checkpoint-rewind on a pool with no + * checkpoint, we should not attempt to load uberblocks + * from previous txgs when spa_load fails. + */ + ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); + return (load_error); + } + + if (spa->spa_root_vdev != NULL) + config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); + + spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg; + spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp; + + if (rewind_flags & ZPOOL_NEVER_REWIND) { + nvlist_free(config); + return (load_error); + } + + if (state == SPA_LOAD_RECOVER) { + /* Price of rolling back is discarding txgs, including log */ + spa_set_log_state(spa, SPA_LOG_CLEAR); + } else { + /* + * If we aren't rolling back save the load info from our first + * import attempt so that we can restore it after attempting + * to rewind. + */ + loadinfo = spa->spa_load_info; + spa->spa_load_info = fnvlist_alloc(); + } + + spa->spa_load_max_txg = spa->spa_last_ubsync_txg; + safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE; + min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ? 
+ TXG_INITIAL : safe_rewind_txg;
+
+ /*
+ * Continue as long as we're finding errors, we're still within
+ * the acceptable rewind range, and we're still finding uberblocks.
+ */
+ while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg &&
+ spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) {
+ if (spa->spa_load_max_txg < safe_rewind_txg)
+ spa->spa_extreme_rewind = B_TRUE;
+ rewind_error = spa_load_retry(spa, state);
+ }
+
+ spa->spa_extreme_rewind = B_FALSE;
+ spa->spa_load_max_txg = UINT64_MAX;
+
+ if (config && (rewind_error || state != SPA_LOAD_RECOVER))
+ spa_config_set(spa, config);
+ else
+ nvlist_free(config);
+
+ if (state == SPA_LOAD_RECOVER) {
+ ASSERT3P(loadinfo, ==, NULL);
+ return (rewind_error);
+ } else {
+ /* Store the rewind info as part of the initial load info */
+ fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO,
+ spa->spa_load_info);
+
+ /* Restore the initial load info */
+ fnvlist_free(spa->spa_load_info);
+ spa->spa_load_info = loadinfo;
+
+ return (load_error);
+ }
+}
+
+/*
+ * Pool Open/Import
+ *
+ * The import case is identical to an open except that the configuration is sent
+ * down from userland, instead of grabbed from the configuration cache. For the
+ * case of an open, the pool configuration will exist in the
+ * POOL_STATE_UNINITIALIZED state.
+ *
+ * The stats information (gen/count/ustats) is used to gather vdev statistics at
+ * the same time we open the pool, without having to keep around the spa_t in
+ * some ambiguous state.
+ */
+static int
+spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
+ nvlist_t **config)
+{
+ spa_t *spa;
+ spa_load_state_t state = SPA_LOAD_OPEN;
+ int error;
+ int locked = B_FALSE;
+ int firstopen = B_FALSE;
+
+ *spapp = NULL;
+
+ /*
+ * As disgusting as this is, we need to support recursive calls to this
+ * function because dsl_dir_open() is called during spa_load(), and ends
+ * up calling spa_open() again. The real fix is to figure out how to
+ * avoid dsl_dir_open() calling this in the first place.
+ */
+ if (mutex_owner(&spa_namespace_lock) != curthread) {
+ mutex_enter(&spa_namespace_lock);
+ locked = B_TRUE;
+ }
+
+ if ((spa = spa_lookup(pool)) == NULL) {
+ if (locked)
+ mutex_exit(&spa_namespace_lock);
+ return (SET_ERROR(ENOENT));
+ }
+
+ if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
+ zpool_load_policy_t policy;
+
+ firstopen = B_TRUE;
+
+ zpool_get_load_policy(nvpolicy ? nvpolicy : spa->spa_config,
+ &policy);
+ if (policy.zlp_rewind & ZPOOL_DO_REWIND)
+ state = SPA_LOAD_RECOVER;
+
+ spa_activate(spa, spa_mode_global);
+
+ if (state != SPA_LOAD_RECOVER)
+ spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
+ spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE;
+
+ zfs_dbgmsg("spa_open_common: opening %s", pool);
+ error = spa_load_best(spa, state, policy.zlp_txg,
+ policy.zlp_rewind);
+
+ if (error == EBADF) {
+ /*
+ * If vdev_validate() returns failure (indicated by
+ * EBADF), it means that one of the vdevs indicates
+ * that the pool has been exported or destroyed. If
+ * this is the case, the config cache is out of sync and
+ * we should remove the pool from the namespace.
+ */
+ spa_unload(spa);
+ spa_deactivate(spa);
+ spa_write_cachefile(spa, B_TRUE, B_TRUE);
+ spa_remove(spa);
+ if (locked)
+ mutex_exit(&spa_namespace_lock);
+ return (SET_ERROR(ENOENT));
+ }
+
+ if (error) {
+ /*
+ * We can't open the pool, but we still have useful
+ * information: the state of each vdev after the
+ * attempted vdev_open(). Return this to the user.
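+ *
+ * Schematically, the caller then receives the config we already
+ * had plus the load diagnostics attached to it:
+ *
+ *     *config = {
+ *         ... copy of spa->spa_config ...,
+ *         ZPOOL_CONFIG_LOAD_INFO = spa->spa_load_info
+ *     }
+ *
+ * which is how userland can report rewind and missing-device
+ * details for a pool it could not open.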
+ */ + if (config != NULL && spa->spa_config) { + VERIFY(nvlist_dup(spa->spa_config, config, + KM_SLEEP) == 0); + VERIFY(nvlist_add_nvlist(*config, + ZPOOL_CONFIG_LOAD_INFO, + spa->spa_load_info) == 0); + } + spa_unload(spa); + spa_deactivate(spa); + spa->spa_last_open_failed = error; + if (locked) + mutex_exit(&spa_namespace_lock); + *spapp = NULL; + return (error); + } + } + + spa_open_ref(spa, tag); + + if (config != NULL) + *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); + + /* + * If we've recovered the pool, pass back any information we + * gathered while doing the load. + */ + if (state == SPA_LOAD_RECOVER) { + VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, + spa->spa_load_info) == 0); + } + + if (locked) { + spa->spa_last_open_failed = 0; + spa->spa_last_ubsync_txg = 0; + spa->spa_load_txg = 0; + mutex_exit(&spa_namespace_lock); +#ifdef __FreeBSD__ +#ifdef _KERNEL + if (firstopen) + zvol_create_minors(spa->spa_name); +#endif +#endif + } + + *spapp = spa; + + return (0); +} + +int +spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy, + nvlist_t **config) +{ + return (spa_open_common(name, spapp, tag, policy, config)); +} + +int +spa_open(const char *name, spa_t **spapp, void *tag) +{ + return (spa_open_common(name, spapp, tag, NULL, NULL)); +} + +/* + * Lookup the given spa_t, incrementing the inject count in the process, + * preventing it from being exported or destroyed. + */ +spa_t * +spa_inject_addref(char *name) +{ + spa_t *spa; + + mutex_enter(&spa_namespace_lock); + if ((spa = spa_lookup(name)) == NULL) { + mutex_exit(&spa_namespace_lock); + return (NULL); + } + spa->spa_inject_ref++; + mutex_exit(&spa_namespace_lock); + + return (spa); +} + +void +spa_inject_delref(spa_t *spa) +{ + mutex_enter(&spa_namespace_lock); + spa->spa_inject_ref--; + mutex_exit(&spa_namespace_lock); +} + +/* + * Add spares device information to the nvlist. + */ +static void +spa_add_spares(spa_t *spa, nvlist_t *config) +{ + nvlist_t **spares; + uint_t i, nspares; + nvlist_t *nvroot; + uint64_t guid; + vdev_stat_t *vs; + uint_t vsc; + uint64_t pool; + + ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); + + if (spa->spa_spares.sav_count == 0) + return; + + VERIFY(nvlist_lookup_nvlist(config, + ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); + VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, + ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); + if (nspares != 0) { + VERIFY(nvlist_add_nvlist_array(nvroot, + ZPOOL_CONFIG_SPARES, spares, nspares) == 0); + VERIFY(nvlist_lookup_nvlist_array(nvroot, + ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); + + /* + * Go through and find any spares which have since been + * repurposed as an active spare. If this is the case, update + * their status appropriately. + */ + for (i = 0; i < nspares; i++) { + VERIFY(nvlist_lookup_uint64(spares[i], + ZPOOL_CONFIG_GUID, &guid) == 0); + if (spa_spare_exists(guid, &pool, NULL) && + pool != 0ULL) { + VERIFY(nvlist_lookup_uint64_array( + spares[i], ZPOOL_CONFIG_VDEV_STATS, + (uint64_t **)&vs, &vsc) == 0); + vs->vs_state = VDEV_STATE_CANT_OPEN; + vs->vs_aux = VDEV_AUX_SPARED; + } + } + } +} + +/* + * Add l2cache device information to the nvlist, including vdev stats. 
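+ *
+ * The layout produced (abridged) is:
+ *
+ *     config
+ *         ZPOOL_CONFIG_VDEV_TREE
+ *             ZPOOL_CONFIG_L2CACHE = [
+ *                 { ZPOOL_CONFIG_GUID, ZPOOL_CONFIG_VDEV_STATS, ... },
+ *                 ...
+ *             ]
+ *
+ * with each entry's vdev_stat_t refreshed in place via
+ * vdev_get_stats().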
+ */ +static void +spa_add_l2cache(spa_t *spa, nvlist_t *config) +{ + nvlist_t **l2cache; + uint_t i, j, nl2cache; + nvlist_t *nvroot; + uint64_t guid; + vdev_t *vd; + vdev_stat_t *vs; + uint_t vsc; + + ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); + + if (spa->spa_l2cache.sav_count == 0) + return; + + VERIFY(nvlist_lookup_nvlist(config, + ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); + VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, + ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); + if (nl2cache != 0) { + VERIFY(nvlist_add_nvlist_array(nvroot, + ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); + VERIFY(nvlist_lookup_nvlist_array(nvroot, + ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); + + /* + * Update level 2 cache device stats. + */ + + for (i = 0; i < nl2cache; i++) { + VERIFY(nvlist_lookup_uint64(l2cache[i], + ZPOOL_CONFIG_GUID, &guid) == 0); + + vd = NULL; + for (j = 0; j < spa->spa_l2cache.sav_count; j++) { + if (guid == + spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { + vd = spa->spa_l2cache.sav_vdevs[j]; + break; + } + } + ASSERT(vd != NULL); + + VERIFY(nvlist_lookup_uint64_array(l2cache[i], + ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) + == 0); + vdev_get_stats(vd, vs); + } + } +} + +static void +spa_feature_stats_from_disk(spa_t *spa, nvlist_t *features) +{ + zap_cursor_t zc; + zap_attribute_t za; + + /* We may be unable to read features if pool is suspended. */ + if (spa_suspended(spa)) + return; + + if (spa->spa_feat_for_read_obj != 0) { + for (zap_cursor_init(&zc, spa->spa_meta_objset, + spa->spa_feat_for_read_obj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + ASSERT(za.za_integer_length == sizeof (uint64_t) && + za.za_num_integers == 1); + VERIFY0(nvlist_add_uint64(features, za.za_name, + za.za_first_integer)); + } + zap_cursor_fini(&zc); + } + + if (spa->spa_feat_for_write_obj != 0) { + for (zap_cursor_init(&zc, spa->spa_meta_objset, + spa->spa_feat_for_write_obj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + ASSERT(za.za_integer_length == sizeof (uint64_t) && + za.za_num_integers == 1); + VERIFY0(nvlist_add_uint64(features, za.za_name, + za.za_first_integer)); + } + zap_cursor_fini(&zc); + } +} + +static void +spa_feature_stats_from_cache(spa_t *spa, nvlist_t *features) +{ + int i; + + for (i = 0; i < SPA_FEATURES; i++) { + zfeature_info_t feature = spa_feature_table[i]; + uint64_t refcount; + + if (feature_get_refcount(spa, &feature, &refcount) != 0) + continue; + + VERIFY0(nvlist_add_uint64(features, feature.fi_guid, refcount)); + } +} + +/* + * Store a list of pool features and their reference counts in the + * config. + * + * The first time this is called on a spa, allocate a new nvlist, fetch + * the pool features and reference counts from disk, then save the list + * in the spa. In subsequent calls on the same spa use the saved nvlist + * and refresh its values from the cached reference counts. This + * ensures we don't block here on I/O on a suspended pool so 'zpool + * clear' can resume the pool. 
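+ *
+ * Schematically:
+ *
+ *     1st call: spa_feat_stats == NULL, allocate and fill from disk
+ *     Nth call: spa_feat_stats != NULL, refresh from the in-core
+ *               refcount cache only (no I/O)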
+ */
+static void
+spa_add_feature_stats(spa_t *spa, nvlist_t *config)
+{
+ nvlist_t *features;
+
+ ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
+
+ mutex_enter(&spa->spa_feat_stats_lock);
+ features = spa->spa_feat_stats;
+
+ if (features != NULL) {
+ spa_feature_stats_from_cache(spa, features);
+ } else {
+ VERIFY0(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP));
+ spa->spa_feat_stats = features;
+ spa_feature_stats_from_disk(spa, features);
+ }
+
+ VERIFY0(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS,
+ features));
+
+ mutex_exit(&spa->spa_feat_stats_lock);
+}
+
+int
+spa_get_stats(const char *name, nvlist_t **config,
+ char *altroot, size_t buflen)
+{
+ int error;
+ spa_t *spa;
+
+ *config = NULL;
+ error = spa_open_common(name, &spa, FTAG, NULL, config);
+
+ if (spa != NULL) {
+ /*
+ * This still leaves a window of inconsistency where the spares
+ * or l2cache devices could change and the config would be
+ * self-inconsistent.
+ */
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+
+ if (*config != NULL) {
+ uint64_t loadtimes[2];
+
+ loadtimes[0] = spa->spa_loaded_ts.tv_sec;
+ loadtimes[1] = spa->spa_loaded_ts.tv_nsec;
+ VERIFY(nvlist_add_uint64_array(*config,
+ ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0);
+
+ VERIFY(nvlist_add_uint64(*config,
+ ZPOOL_CONFIG_ERRCOUNT,
+ spa_get_errlog_size(spa)) == 0);
+
+ if (spa_suspended(spa))
+ VERIFY(nvlist_add_uint64(*config,
+ ZPOOL_CONFIG_SUSPENDED,
+ spa->spa_failmode) == 0);
+
+ spa_add_spares(spa, *config);
+ spa_add_l2cache(spa, *config);
+ spa_add_feature_stats(spa, *config);
+ }
+ }
+
+ /*
+ * We want to get the alternate root even for faulted pools, so we cheat
+ * and call spa_lookup() directly.
+ */
+ if (altroot) {
+ if (spa == NULL) {
+ mutex_enter(&spa_namespace_lock);
+ spa = spa_lookup(name);
+ if (spa)
+ spa_altroot(spa, altroot, buflen);
+ else
+ altroot[0] = '\0';
+ spa = NULL;
+ mutex_exit(&spa_namespace_lock);
+ } else {
+ spa_altroot(spa, altroot, buflen);
+ }
+ }
+
+ if (spa != NULL) {
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+ spa_close(spa, FTAG);
+ }
+
+ return (error);
+}
+
+/*
+ * Validate that the auxiliary device array is well formed. We must have an
+ * array of nvlists, each of which describes a valid leaf vdev. If this is an
+ * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be
+ * specified, as long as they are well-formed.
+ */
+static int
+spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode,
+ spa_aux_vdev_t *sav, const char *config, uint64_t version,
+ vdev_labeltype_t label)
+{
+ nvlist_t **dev;
+ uint_t i, ndev;
+ vdev_t *vd;
+ int error;
+
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+
+ /*
+ * It's acceptable to have no devs specified.
+ */
+ if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0)
+ return (0);
+
+ if (ndev == 0)
+ return (SET_ERROR(EINVAL));
+
+ /*
+ * Make sure the pool is formatted with a version that supports this
+ * device type.
+ */
+ if (spa_version(spa) < version)
+ return (SET_ERROR(ENOTSUP));
+
+ /*
+ * Set the pending device list so we correctly handle device in-use
+ * checking.
+ */
+ sav->sav_pending = dev;
+ sav->sav_npending = ndev;
+
+ for (i = 0; i < ndev; i++) {
+ if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0,
+ mode)) != 0)
+ goto out;
+
+ if (!vd->vdev_ops->vdev_op_leaf) {
+ vdev_free(vd);
+ error = SET_ERROR(EINVAL);
+ goto out;
+ }
+
+ /*
+ * The L2ARC currently only supports disk devices in
+ * kernel context. For user-level testing, we allow it.
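+ *
+ * (In kernel context the check below therefore rejects, e.g., a
+ * file-backed cache vdev with ENOTBLK, while user-level consumers
+ * such as ztest may still use file vdevs for testing.)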
+ */
+#ifdef _KERNEL
+ if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) &&
+ strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) {
+ error = SET_ERROR(ENOTBLK);
+ vdev_free(vd);
+ goto out;
+ }
+#endif
+ vd->vdev_top = vd;
+
+ if ((error = vdev_open(vd)) == 0 &&
+ (error = vdev_label_init(vd, crtxg, label)) == 0) {
+ VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID,
+ vd->vdev_guid) == 0);
+ }
+
+ vdev_free(vd);
+
+ if (error &&
+ (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE))
+ goto out;
+ else
+ error = 0;
+ }
+
+out:
+ sav->sav_pending = NULL;
+ sav->sav_npending = 0;
+ return (error);
+}
+
+static int
+spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
+{
+ int error;
+
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+
+ if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
+ &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES,
+ VDEV_LABEL_SPARE)) != 0) {
+ return (error);
+ }
+
+ return (spa_validate_aux_devs(spa, nvroot, crtxg, mode,
+ &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE,
+ VDEV_LABEL_L2CACHE));
+}
+
+static void
+spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs,
+ const char *config)
+{
+ int i;
+
+ if (sav->sav_config != NULL) {
+ nvlist_t **olddevs;
+ uint_t oldndevs;
+ nvlist_t **newdevs;
+
+ /*
+ * Generate new dev list by concatenating with the
+ * current dev list.
+ */
+ VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config,
+ &olddevs, &oldndevs) == 0);
+
+ newdevs = kmem_alloc(sizeof (void *) *
+ (ndevs + oldndevs), KM_SLEEP);
+ for (i = 0; i < oldndevs; i++)
+ VERIFY(nvlist_dup(olddevs[i], &newdevs[i],
+ KM_SLEEP) == 0);
+ for (i = 0; i < ndevs; i++)
+ VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs],
+ KM_SLEEP) == 0);
+
+ VERIFY(nvlist_remove(sav->sav_config, config,
+ DATA_TYPE_NVLIST_ARRAY) == 0);
+
+ VERIFY(nvlist_add_nvlist_array(sav->sav_config,
+ config, newdevs, ndevs + oldndevs) == 0);
+ for (i = 0; i < oldndevs + ndevs; i++)
+ nvlist_free(newdevs[i]);
+ kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *));
+ } else {
+ /*
+ * Generate a new dev list.
+ */
+ VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME,
+ KM_SLEEP) == 0);
+ VERIFY(nvlist_add_nvlist_array(sav->sav_config, config,
+ devs, ndevs) == 0);
+ }
+}
+
+/*
+ * Stop and drop level 2 ARC devices.
+ */
+void
+spa_l2cache_drop(spa_t *spa)
+{
+ vdev_t *vd;
+ int i;
+ spa_aux_vdev_t *sav = &spa->spa_l2cache;
+
+ for (i = 0; i < sav->sav_count; i++) {
+ uint64_t pool;
+
+ vd = sav->sav_vdevs[i];
+ ASSERT(vd != NULL);
+
+ if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
+ pool != 0ULL && l2arc_vdev_present(vd))
+ l2arc_remove_vdev(vd);
+ }
+}
+
+/*
+ * Pool Creation
+ */
+int
+spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
+ nvlist_t *zplprops)
+{
+ spa_t *spa;
+ char *altroot = NULL;
+ vdev_t *rvd;
+ dsl_pool_t *dp;
+ dmu_tx_t *tx;
+ int error = 0;
+ uint64_t txg = TXG_INITIAL;
+ nvlist_t **spares, **l2cache;
+ uint_t nspares, nl2cache;
+ uint64_t version, obj;
+ boolean_t has_features;
+ char *poolname;
+ nvlist_t *nvl;
+
+ if (nvlist_lookup_string(props,
+ zpool_prop_to_name(ZPOOL_PROP_TNAME), &poolname) != 0)
+ poolname = (char *)pool;
+
+ /*
+ * If this pool already exists, return failure.
+ */
+ mutex_enter(&spa_namespace_lock);
+ if (spa_lookup(poolname) != NULL) {
+ mutex_exit(&spa_namespace_lock);
+ return (SET_ERROR(EEXIST));
+ }
+
+ /*
+ * Allocate a new spa_t structure.
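+ *
+ * (Note the split below: the spa_t is registered under "poolname",
+ * which may be a temporary name supplied via ZPOOL_PROP_TNAME, while
+ * the config nvlist records the permanent name "pool"; the temporary
+ * name must never end up on disk.)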
+ */ + nvl = fnvlist_alloc(); + fnvlist_add_string(nvl, ZPOOL_CONFIG_POOL_NAME, pool); + (void) nvlist_lookup_string(props, + zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); + spa = spa_add(poolname, nvl, altroot); + fnvlist_free(nvl); + spa_activate(spa, spa_mode_global); + + if (props && (error = spa_prop_validate(spa, props))) { + spa_deactivate(spa); + spa_remove(spa); + mutex_exit(&spa_namespace_lock); + return (error); + } + + /* + * Temporary pool names should never be written to disk. + */ + if (poolname != pool) + spa->spa_import_flags |= ZFS_IMPORT_TEMP_NAME; + + has_features = B_FALSE; + for (nvpair_t *elem = nvlist_next_nvpair(props, NULL); + elem != NULL; elem = nvlist_next_nvpair(props, elem)) { + if (zpool_prop_feature(nvpair_name(elem))) + has_features = B_TRUE; + } + + if (has_features || nvlist_lookup_uint64(props, + zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) { + version = SPA_VERSION; + } + ASSERT(SPA_VERSION_IS_SUPPORTED(version)); + + spa->spa_first_txg = txg; + spa->spa_uberblock.ub_txg = txg - 1; + spa->spa_uberblock.ub_version = version; + spa->spa_ubsync = spa->spa_uberblock; + spa->spa_load_state = SPA_LOAD_CREATE; + spa->spa_removing_phys.sr_state = DSS_NONE; + spa->spa_removing_phys.sr_removing_vdev = -1; + spa->spa_removing_phys.sr_prev_indirect_vdev = -1; + spa->spa_indirect_vdevs_loaded = B_TRUE; + + /* + * Create "The Godfather" zio to hold all async IOs + */ + spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), + KM_SLEEP); + for (int i = 0; i < max_ncpus; i++) { + spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | + ZIO_FLAG_GODFATHER); + } + + /* + * Create the root vdev. + */ + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + + error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); + + ASSERT(error != 0 || rvd != NULL); + ASSERT(error != 0 || spa->spa_root_vdev == rvd); + + if (error == 0 && !zfs_allocatable_devs(nvroot)) + error = SET_ERROR(EINVAL); + + if (error == 0 && + (error = vdev_create(rvd, txg, B_FALSE)) == 0 && + (error = spa_validate_aux(spa, nvroot, txg, + VDEV_ALLOC_ADD)) == 0) { + for (int c = 0; c < rvd->vdev_children; c++) { + vdev_ashift_optimize(rvd->vdev_child[c]); + vdev_metaslab_set_size(rvd->vdev_child[c]); + vdev_expand(rvd->vdev_child[c], txg); + } + } + + spa_config_exit(spa, SCL_ALL, FTAG); + + if (error != 0) { + spa_unload(spa); + spa_deactivate(spa); + spa_remove(spa); + mutex_exit(&spa_namespace_lock); + return (error); + } + + /* + * Get the list of spares, if specified. + */ + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, + &spares, &nspares) == 0) { + VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, + KM_SLEEP) == 0); + VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, + ZPOOL_CONFIG_SPARES, spares, nspares) == 0); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + spa_load_spares(spa); + spa_config_exit(spa, SCL_ALL, FTAG); + spa->spa_spares.sav_sync = B_TRUE; + } + + /* + * Get the list of level 2 cache devices, if specified. 
+ */ + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, + &l2cache, &nl2cache) == 0) { + VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, + NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, + ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + spa_load_l2cache(spa); + spa_config_exit(spa, SCL_ALL, FTAG); + spa->spa_l2cache.sav_sync = B_TRUE; + } + + spa->spa_is_initializing = B_TRUE; + spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg); + spa->spa_meta_objset = dp->dp_meta_objset; + spa->spa_is_initializing = B_FALSE; + + /* + * Create DDTs (dedup tables). + */ + ddt_create(spa); + + spa_update_dspace(spa); + + tx = dmu_tx_create_assigned(dp, txg); + + /* + * Create the pool config object. + */ + spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, + DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, + DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); + + if (zap_add(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, + sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { + cmn_err(CE_PANIC, "failed to add pool config"); + } + + if (spa_version(spa) >= SPA_VERSION_FEATURES) + spa_feature_create_zap_objects(spa, tx); + + if (zap_add(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION, + sizeof (uint64_t), 1, &version, tx) != 0) { + cmn_err(CE_PANIC, "failed to add pool version"); + } + + /* Newly created pools with the right version are always deflated. */ + if (version >= SPA_VERSION_RAIDZ_DEFLATE) { + spa->spa_deflate = TRUE; + if (zap_add(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, + sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { + cmn_err(CE_PANIC, "failed to add deflate"); + } + } + + /* + * Create the deferred-free bpobj. Turn off compression + * because sync-to-convergence takes longer if the blocksize + * keeps changing. + */ + obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx); + dmu_object_set_compress(spa->spa_meta_objset, obj, + ZIO_COMPRESS_OFF, tx); + if (zap_add(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ, + sizeof (uint64_t), 1, &obj, tx) != 0) { + cmn_err(CE_PANIC, "failed to add bpobj"); + } + VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj, + spa->spa_meta_objset, obj)); + + /* + * Create the pool's history object. + */ + if (version >= SPA_VERSION_ZPOOL_HISTORY) + spa_history_create_obj(spa, tx); + + /* + * Generate some random noise for salted checksums to operate on. + */ + (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, + sizeof (spa->spa_cksum_salt.zcs_bytes)); + + /* + * Set pool properties. + */ + spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); + spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); + spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); + spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); + + if (props != NULL) { + spa_configfile_set(spa, props, B_FALSE); + spa_sync_props(props, tx); + } + + dmu_tx_commit(tx); + + spa->spa_sync_on = B_TRUE; + txg_sync_start(spa->spa_dsl_pool); + + /* + * We explicitly wait for the first transaction to complete so that our + * bean counters are appropriately updated. 
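+ *
+ * (txg_wait_synced(dp, txg) blocks until the given txg, here the
+ * pool's creation txg, has been synced to disk, so everything
+ * dirtied by the setup above is durable before we return.)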
+ */
+ txg_wait_synced(spa->spa_dsl_pool, txg);
+
+ spa_spawn_aux_threads(spa);
+
+ spa_write_cachefile(spa, B_FALSE, B_TRUE);
+ spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_CREATE);
+
+ spa_history_log_version(spa, "create");
+
+ /*
+ * Don't count references from objsets that are already closed
+ * and are making their way through the eviction process.
+ */
+ spa_evicting_os_wait(spa);
+ spa->spa_minref = refcount_count(&spa->spa_refcount);
+ spa->spa_load_state = SPA_LOAD_NONE;
+
+ mutex_exit(&spa_namespace_lock);
+
+ return (0);
+}
+
+#ifdef _KERNEL
+#ifdef illumos
+/*
+ * Get the root pool information from the root disk, then import the root pool
+ * during system boot.
+ */
+extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **);
+
+static nvlist_t *
+spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid)
+{
+ nvlist_t *config;
+ nvlist_t *nvtop, *nvroot;
+ uint64_t pgid;
+
+ if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0)
+ return (NULL);
+
+ /*
+ * Add this top-level vdev to the child array.
+ */
+ VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+ &nvtop) == 0);
+ VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
+ &pgid) == 0);
+ VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0);
+
+ /*
+ * Put this pool's top-level vdevs into a root vdev.
+ */
+ VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
+ VDEV_TYPE_ROOT) == 0);
+ VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0);
+ VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0);
+ VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
+ &nvtop, 1) == 0);
+
+ /*
+ * Replace the existing vdev_tree with the new root vdev in
+ * this pool's configuration (remove the old, add the new).
+ */
+ VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
+ nvlist_free(nvroot);
+ return (config);
+}
+
+/*
+ * Walk the vdev tree and see if we can find a device with "better"
+ * configuration. A configuration is "better" if the label on that
+ * device has a more recent txg.
+ */
+static void
+spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg)
+{
+ for (int c = 0; c < vd->vdev_children; c++)
+ spa_alt_rootvdev(vd->vdev_child[c], avd, txg);
+
+ if (vd->vdev_ops->vdev_op_leaf) {
+ nvlist_t *label;
+ uint64_t label_txg;
+
+ if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid,
+ &label) != 0)
+ return;
+
+ VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG,
+ &label_txg) == 0);
+
+ /*
+ * Do we have a better boot device?
+ */
+ if (label_txg > *txg) {
+ *txg = label_txg;
+ *avd = vd;
+ }
+ nvlist_free(label);
+ }
+}
+
+/*
+ * Import a root pool.
+ *
+ * For x86, devpath_list will consist of devid and/or physpath name of
+ * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a").
+ * The GRUB "findroot" command will return the vdev we should boot.
+ *
+ * For Sparc, devpath_list consists of the physpath name of the booting
+ * device, no matter whether the rootpool is a single-device pool or a
+ * mirrored pool, e.g.:
+ * "/pci@1f,0/ide@d/disk@0,0:a"
+ */
+int
+spa_import_rootpool(char *devpath, char *devid)
+{
+ spa_t *spa;
+ vdev_t *rvd, *bvd, *avd = NULL;
+ nvlist_t *config, *nvtop;
+ uint64_t guid, txg;
+ char *pname;
+ int error;
+
+ /*
+ * Read the label from the boot device and generate a configuration.
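+ *
+ * (spa_generate_rootconf(), above, wraps the top-level vdev found in
+ * the device's label in a freshly built "root" vdev, so the config
+ * parsed below always has the shape
+ *
+ *     root -> { boot device's top-level vdev }
+ *
+ * no matter what the label's vdev_tree looked like.)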
+ */ + config = spa_generate_rootconf(devpath, devid, &guid); +#if defined(_OBP) && defined(_KERNEL) + if (config == NULL) { + if (strstr(devpath, "/iscsi/ssd") != NULL) { + /* iscsi boot */ + get_iscsi_bootpath_phy(devpath); + config = spa_generate_rootconf(devpath, devid, &guid); + } + } +#endif + if (config == NULL) { + cmn_err(CE_NOTE, "Cannot read the pool label from '%s'", + devpath); + return (SET_ERROR(EIO)); + } + + VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, + &pname) == 0); + VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); + + mutex_enter(&spa_namespace_lock); + if ((spa = spa_lookup(pname)) != NULL) { + /* + * Remove the existing root pool from the namespace so that we + * can replace it with the correct config we just read in. + */ + spa_remove(spa); + } + + spa = spa_add(pname, config, NULL); + spa->spa_is_root = B_TRUE; + spa->spa_import_flags = ZFS_IMPORT_VERBATIM; + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, + &spa->spa_ubsync.ub_version) != 0) + spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; + + /* + * Build up a vdev tree based on the boot device's label config. + */ + VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvtop) == 0); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, + VDEV_ALLOC_ROOTPOOL); + spa_config_exit(spa, SCL_ALL, FTAG); + if (error) { + mutex_exit(&spa_namespace_lock); + nvlist_free(config); + cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", + pname); + return (error); + } + + /* + * Get the boot vdev. + */ + if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) { + cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu", + (u_longlong_t)guid); + error = SET_ERROR(ENOENT); + goto out; + } + + /* + * Determine if there is a better boot device. + */ + avd = bvd; + spa_alt_rootvdev(rvd, &avd, &txg); + if (avd != bvd) { + cmn_err(CE_NOTE, "The boot device is 'degraded'. Please " + "try booting from '%s'", avd->vdev_path); + error = SET_ERROR(EINVAL); + goto out; + } + + /* + * If the boot device is part of a spare vdev then ensure that + * we're booting off the active spare. + */ + if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops && + !bvd->vdev_isspare) { + cmn_err(CE_NOTE, "The boot device is currently spared. 
Please " + "try booting from '%s'", + bvd->vdev_parent-> + vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path); + error = SET_ERROR(EINVAL); + goto out; + } + + error = 0; +out: + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + vdev_free(rvd); + spa_config_exit(spa, SCL_ALL, FTAG); + mutex_exit(&spa_namespace_lock); + + nvlist_free(config); + return (error); +} + +#else /* !illumos */ + +extern int vdev_geom_read_pool_label(const char *name, nvlist_t ***configs, + uint64_t *count); + +static nvlist_t * +spa_generate_rootconf(const char *name) +{ + nvlist_t **configs, **tops; + nvlist_t *config; + nvlist_t *best_cfg, *nvtop, *nvroot; + uint64_t *holes; + uint64_t best_txg; + uint64_t nchildren; + uint64_t pgid; + uint64_t count; + uint64_t i; + uint_t nholes; + + if (vdev_geom_read_pool_label(name, &configs, &count) != 0) + return (NULL); + + ASSERT3U(count, !=, 0); + best_txg = 0; + for (i = 0; i < count; i++) { + uint64_t txg; + + VERIFY(nvlist_lookup_uint64(configs[i], ZPOOL_CONFIG_POOL_TXG, + &txg) == 0); + if (txg > best_txg) { + best_txg = txg; + best_cfg = configs[i]; + } + } + + nchildren = 1; + nvlist_lookup_uint64(best_cfg, ZPOOL_CONFIG_VDEV_CHILDREN, &nchildren); + holes = NULL; + nvlist_lookup_uint64_array(best_cfg, ZPOOL_CONFIG_HOLE_ARRAY, + &holes, &nholes); + + tops = kmem_zalloc(nchildren * sizeof(void *), KM_SLEEP); + for (i = 0; i < nchildren; i++) { + if (i >= count) + break; + if (configs[i] == NULL) + continue; + VERIFY(nvlist_lookup_nvlist(configs[i], ZPOOL_CONFIG_VDEV_TREE, + &nvtop) == 0); + nvlist_dup(nvtop, &tops[i], KM_SLEEP); + } + for (i = 0; holes != NULL && i < nholes; i++) { + if (i >= nchildren) + continue; + if (tops[holes[i]] != NULL) + continue; + nvlist_alloc(&tops[holes[i]], NV_UNIQUE_NAME, KM_SLEEP); + VERIFY(nvlist_add_string(tops[holes[i]], ZPOOL_CONFIG_TYPE, + VDEV_TYPE_HOLE) == 0); + VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_ID, + holes[i]) == 0); + VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_GUID, + 0) == 0); + } + for (i = 0; i < nchildren; i++) { + if (tops[i] != NULL) + continue; + nvlist_alloc(&tops[i], NV_UNIQUE_NAME, KM_SLEEP); + VERIFY(nvlist_add_string(tops[i], ZPOOL_CONFIG_TYPE, + VDEV_TYPE_MISSING) == 0); + VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_ID, + i) == 0); + VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_GUID, + 0) == 0); + } + + /* + * Create pool config based on the best vdev config. + */ + nvlist_dup(best_cfg, &config, KM_SLEEP); + + /* + * Put this pool's top-level vdevs into a root vdev. + */ + VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, + &pgid) == 0); + VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, + VDEV_TYPE_ROOT) == 0); + VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); + VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); + VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, + tops, nchildren) == 0); + + /* + * Replace the existing vdev_tree with the new root vdev in + * this pool's configuration (remove the old, add the new). + */ + VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); + + /* + * Drop vdev config elements that should not be present at pool level. 
+ */ + nvlist_remove(config, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64); + nvlist_remove(config, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64); + + for (i = 0; i < count; i++) + nvlist_free(configs[i]); + kmem_free(configs, count * sizeof(void *)); + for (i = 0; i < nchildren; i++) + nvlist_free(tops[i]); + kmem_free(tops, nchildren * sizeof(void *)); + nvlist_free(nvroot); + return (config); +} + +int +spa_import_rootpool(const char *name) +{ + spa_t *spa; + vdev_t *rvd, *bvd, *avd = NULL; + nvlist_t *config, *nvtop; + uint64_t txg; + char *pname; + int error; + + /* + * Read the label from the boot device and generate a configuration. + */ + config = spa_generate_rootconf(name); + + mutex_enter(&spa_namespace_lock); + if (config != NULL) { + VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, + &pname) == 0 && strcmp(name, pname) == 0); + VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) + == 0); + + if ((spa = spa_lookup(pname)) != NULL) { + /* + * The pool could already be imported, + * e.g., after reboot -r. + */ + if (spa->spa_state == POOL_STATE_ACTIVE) { + mutex_exit(&spa_namespace_lock); + nvlist_free(config); + return (0); + } + + /* + * Remove the existing root pool from the namespace so + * that we can replace it with the correct config + * we just read in. + */ + spa_remove(spa); + } + spa = spa_add(pname, config, NULL); + + /* + * Set spa_ubsync.ub_version as it can be used in vdev_alloc() + * via spa_version(). + */ + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, + &spa->spa_ubsync.ub_version) != 0) + spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; + } else if ((spa = spa_lookup(name)) == NULL) { + mutex_exit(&spa_namespace_lock); + nvlist_free(config); + cmn_err(CE_NOTE, "Cannot find the pool label for '%s'", + name); + return (EIO); + } else { + VERIFY(nvlist_dup(spa->spa_config, &config, KM_SLEEP) == 0); + } + spa->spa_is_root = B_TRUE; + spa->spa_import_flags = ZFS_IMPORT_VERBATIM; + + /* + * Build up a vdev tree based on the boot device's label config. + */ + VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvtop) == 0); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, + VDEV_ALLOC_ROOTPOOL); + spa_config_exit(spa, SCL_ALL, FTAG); + if (error) { + mutex_exit(&spa_namespace_lock); + nvlist_free(config); + cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", + pname); + return (error); + } + + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + vdev_free(rvd); + spa_config_exit(spa, SCL_ALL, FTAG); + mutex_exit(&spa_namespace_lock); + + nvlist_free(config); + return (0); +} + +#endif /* illumos */ +#endif /* _KERNEL */ + +/* + * Import a non-root pool into the system. + */ +int +spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) +{ + spa_t *spa; + char *altroot = NULL; + spa_load_state_t state = SPA_LOAD_IMPORT; + zpool_load_policy_t policy; + uint64_t mode = spa_mode_global; + uint64_t readonly = B_FALSE; + int error; + nvlist_t *nvroot; + nvlist_t **spares, **l2cache; + uint_t nspares, nl2cache; + + /* + * If a pool with this name exists, return failure. + */ + mutex_enter(&spa_namespace_lock); + if (spa_lookup(pool) != NULL) { + mutex_exit(&spa_namespace_lock); + return (SET_ERROR(EEXIST)); + } + + /* + * Create and initialize the spa structure. 
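+ *
+ * A caller-built 'props' nvlist might look like the following
+ * sketch (illustrative only; the pool name and altroot are
+ * hypothetical and error handling is elided):
+ *
+ *	nvlist_t *props;
+ *	VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ *	VERIFY(nvlist_add_string(props,
+ *	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), "/mnt") == 0);
+ *	VERIFY(nvlist_add_uint64(props,
+ *	    zpool_prop_to_name(ZPOOL_PROP_READONLY), 1) == 0);
+ *	(void) spa_import("tank", config, props, ZFS_IMPORT_NORMAL);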
+ */ + (void) nvlist_lookup_string(props, + zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); + (void) nvlist_lookup_uint64(props, + zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly); + if (readonly) + mode = FREAD; + spa = spa_add(pool, config, altroot); + spa->spa_import_flags = flags; + + /* + * Verbatim import - Take a pool and insert it into the namespace + * as if it had been loaded at boot. + */ + if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) { + if (props != NULL) + spa_configfile_set(spa, props, B_FALSE); + + spa_write_cachefile(spa, B_FALSE, B_TRUE); + spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); + zfs_dbgmsg("spa_import: verbatim import of %s", pool); + mutex_exit(&spa_namespace_lock); + return (0); + } + + spa_activate(spa, mode); + + /* + * Don't start async tasks until we know everything is healthy. + */ + spa_async_suspend(spa); + + zpool_get_load_policy(config, &policy); + if (policy.zlp_rewind & ZPOOL_DO_REWIND) + state = SPA_LOAD_RECOVER; + + spa->spa_config_source = SPA_CONFIG_SRC_TRYIMPORT; + + if (state != SPA_LOAD_RECOVER) { + spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; + zfs_dbgmsg("spa_import: importing %s", pool); + } else { + zfs_dbgmsg("spa_import: importing %s, max_txg=%lld " + "(RECOVERY MODE)", pool, (longlong_t)policy.zlp_txg); + } + error = spa_load_best(spa, state, policy.zlp_txg, policy.zlp_rewind); + + /* + * Propagate anything learned while loading the pool and pass it + * back to caller (i.e. rewind info, missing devices, etc). + */ + VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, + spa->spa_load_info) == 0); + + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + /* + * Toss any existing sparelist, as it doesn't have any validity + * anymore, and conflicts with spa_has_spare(). + */ + if (spa->spa_spares.sav_config) { + nvlist_free(spa->spa_spares.sav_config); + spa->spa_spares.sav_config = NULL; + spa_load_spares(spa); + } + if (spa->spa_l2cache.sav_config) { + nvlist_free(spa->spa_l2cache.sav_config); + spa->spa_l2cache.sav_config = NULL; + spa_load_l2cache(spa); + } + + VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) == 0); + if (error == 0) + error = spa_validate_aux(spa, nvroot, -1ULL, + VDEV_ALLOC_SPARE); + if (error == 0) + error = spa_validate_aux(spa, nvroot, -1ULL, + VDEV_ALLOC_L2CACHE); + spa_config_exit(spa, SCL_ALL, FTAG); + + if (props != NULL) + spa_configfile_set(spa, props, B_FALSE); + + if (error != 0 || (props && spa_writeable(spa) && + (error = spa_prop_set(spa, props)))) { + spa_unload(spa); + spa_deactivate(spa); + spa_remove(spa); + mutex_exit(&spa_namespace_lock); + return (error); + } + + spa_async_resume(spa); + + /* + * Override any spares and level 2 cache devices as specified by + * the user, as these may have correct device names/devids, etc. 
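+ *
+ * Sketch of the nvroot shape consulted below (device paths purely
+ * illustrative):
+ *
+ *	vdev_tree:
+ *	    spares:  [ { type: "disk", path: "/dev/da3" } ]
+ *	    l2cache: [ { type: "disk", path: "/dev/ada2" } ]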
+ */ + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, + &spares, &nspares) == 0) { + if (spa->spa_spares.sav_config) + VERIFY(nvlist_remove(spa->spa_spares.sav_config, + ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); + else + VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, + NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, + ZPOOL_CONFIG_SPARES, spares, nspares) == 0); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + spa_load_spares(spa); + spa_config_exit(spa, SCL_ALL, FTAG); + spa->spa_spares.sav_sync = B_TRUE; + } + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, + &l2cache, &nl2cache) == 0) { + if (spa->spa_l2cache.sav_config) + VERIFY(nvlist_remove(spa->spa_l2cache.sav_config, + ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); + else + VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, + NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, + ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + spa_load_l2cache(spa); + spa_config_exit(spa, SCL_ALL, FTAG); + spa->spa_l2cache.sav_sync = B_TRUE; + } + + /* + * Check for any removed devices. + */ + if (spa->spa_autoreplace) { + spa_aux_check_removed(&spa->spa_spares); + spa_aux_check_removed(&spa->spa_l2cache); + } + + if (spa_writeable(spa)) { + /* + * Update the config cache to include the newly-imported pool. + */ + spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); + } + + /* + * It's possible that the pool was expanded while it was exported. + * We kick off an async task to handle this for us. + */ + spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); + + spa_history_log_version(spa, "import"); + + spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); + + mutex_exit(&spa_namespace_lock); + +#ifdef __FreeBSD__ +#ifdef _KERNEL + zvol_create_minors(pool); +#endif +#endif + return (0); +} + +nvlist_t * +spa_tryimport(nvlist_t *tryconfig) +{ + nvlist_t *config = NULL; + char *poolname, *cachefile; + spa_t *spa; + uint64_t state; + int error; + zpool_load_policy_t policy; + + if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) + return (NULL); + + if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) + return (NULL); + + /* + * Create and initialize the spa structure. + */ + mutex_enter(&spa_namespace_lock); + spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL); + spa_activate(spa, FREAD); + + /* + * Rewind pool if a max txg was provided. + */ + zpool_get_load_policy(spa->spa_config, &policy); + if (policy.zlp_txg != UINT64_MAX) { + spa->spa_load_max_txg = policy.zlp_txg; + spa->spa_extreme_rewind = B_TRUE; + zfs_dbgmsg("spa_tryimport: importing %s, max_txg=%lld", + poolname, (longlong_t)policy.zlp_txg); + } else { + zfs_dbgmsg("spa_tryimport: importing %s", poolname); + } + + if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_CACHEFILE, &cachefile) + == 0) { + zfs_dbgmsg("spa_tryimport: using cachefile '%s'", cachefile); + spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE; + } else { + spa->spa_config_source = SPA_CONFIG_SRC_SCAN; + } + + error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING); + + /* + * If 'tryconfig' was at least parsable, return the current config. 
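+ *
+ * Caller-side sketch (illustrative; pool name hypothetical): the
+ * minimal 'tryconfig' accepted above carries just a name and state:
+ *
+ *	VERIFY(nvlist_add_string(tryconfig,
+ *	    ZPOOL_CONFIG_POOL_NAME, "tank") == 0);
+ *	VERIFY(nvlist_add_uint64(tryconfig,
+ *	    ZPOOL_CONFIG_POOL_STATE, POOL_STATE_EXPORTED) == 0);
+ *	config = spa_tryimport(tryconfig);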
+ */ + if (spa->spa_root_vdev != NULL) { + config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); + VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, + poolname) == 0); + VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, + state) == 0); + VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, + spa->spa_uberblock.ub_timestamp) == 0); + VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, + spa->spa_load_info) == 0); + + /* + * If the bootfs property exists on this pool then we + * copy it out so that external consumers can tell which + * pools are bootable. + */ + if ((!error || error == EEXIST) && spa->spa_bootfs) { + char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); + + /* + * We have to play games with the name since the + * pool was opened as TRYIMPORT_NAME. + */ + if (dsl_dsobj_to_dsname(spa_name(spa), + spa->spa_bootfs, tmpname) == 0) { + char *cp; + char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); + + cp = strchr(tmpname, '/'); + if (cp == NULL) { + (void) strlcpy(dsname, tmpname, + MAXPATHLEN); + } else { + (void) snprintf(dsname, MAXPATHLEN, + "%s/%s", poolname, ++cp); + } + VERIFY(nvlist_add_string(config, + ZPOOL_CONFIG_BOOTFS, dsname) == 0); + kmem_free(dsname, MAXPATHLEN); + } + kmem_free(tmpname, MAXPATHLEN); + } + + /* + * Add the list of hot spares and level 2 cache devices. + */ + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + spa_add_spares(spa, config); + spa_add_l2cache(spa, config); + spa_config_exit(spa, SCL_CONFIG, FTAG); + } + + spa_unload(spa); + spa_deactivate(spa); + spa_remove(spa); + mutex_exit(&spa_namespace_lock); + + return (config); +} + +/* + * Pool export/destroy + * + * The act of destroying or exporting a pool is very simple. We make sure there + * is no more pending I/O and any references to the pool are gone. Then, we + * update the pool state and sync all the labels to disk, removing the + * configuration from the cache afterwards. If the 'hardforce' flag is set, then + * we don't sync the labels or remove the configuration cache. + */ +static int +spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, + boolean_t force, boolean_t hardforce) +{ + spa_t *spa; + + if (oldconfig) + *oldconfig = NULL; + + if (!(spa_mode_global & FWRITE)) + return (SET_ERROR(EROFS)); + + mutex_enter(&spa_namespace_lock); + if ((spa = spa_lookup(pool)) == NULL) { + mutex_exit(&spa_namespace_lock); + return (SET_ERROR(ENOENT)); + } + + /* + * Put a hold on the pool, drop the namespace lock, stop async tasks, + * reacquire the namespace lock, and see if we can export. + */ + spa_open_ref(spa, FTAG); + mutex_exit(&spa_namespace_lock); + spa_async_suspend(spa); + mutex_enter(&spa_namespace_lock); + spa_close(spa, FTAG); + + /* + * The pool will be in core if it's openable, + * in which case we can modify its state. + */ + if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { + + /* + * Objsets may be open only because they're dirty, so we + * have to force it to sync before checking spa_refcnt. + */ + txg_wait_synced(spa->spa_dsl_pool, 0); + spa_evicting_os_wait(spa); + + /* + * A pool cannot be exported or destroyed if there are active + * references. If we are resetting a pool, allow references by + * fault injection handlers. + */ + if (!spa_refcount_zero(spa) || + (spa->spa_inject_ref != 0 && + new_state != POOL_STATE_UNINITIALIZED)) { + spa_async_resume(spa); + mutex_exit(&spa_namespace_lock); + return (SET_ERROR(EBUSY)); + } + + /* + * A pool cannot be exported if it has an active shared spare. 
+ * This is to prevent other pools from stealing the active spare
+ * from an exported pool. If the user insists, such a pool can
+ * still be forcibly exported.
+ */
+ if (!force && new_state == POOL_STATE_EXPORTED &&
+ spa_has_active_shared_spare(spa)) {
+ spa_async_resume(spa);
+ mutex_exit(&spa_namespace_lock);
+ return (SET_ERROR(EXDEV));
+ }
+
+ /*
+ * We're about to export or destroy this pool. Make sure
+ * we stop all initialization activity here before we
+ * set the spa_final_txg. This will ensure that all
+ * dirty data resulting from the initialization is
+ * committed to disk before we unload the pool.
+ */
+ if (spa->spa_root_vdev != NULL) {
+ vdev_initialize_stop_all(spa->spa_root_vdev,
+ VDEV_INITIALIZE_ACTIVE);
+ }
+
+ /*
+ * We want this to be reflected on every label,
+ * so mark them all dirty. spa_unload() will do the
+ * final sync that pushes these changes out.
+ */
+ if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ spa->spa_state = new_state;
+ spa->spa_final_txg = spa_last_synced_txg(spa) +
+ TXG_DEFER_SIZE + 1;
+ vdev_config_dirty(spa->spa_root_vdev);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+ }
+ }
+
+ spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_DESTROY);
+
+ if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
+ spa_unload(spa);
+ spa_deactivate(spa);
+ }
+
+ if (oldconfig && spa->spa_config)
+ VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);
+
+ if (new_state != POOL_STATE_UNINITIALIZED) {
+ if (!hardforce)
+ spa_write_cachefile(spa, B_TRUE, B_TRUE);
+ spa_remove(spa);
+ }
+ mutex_exit(&spa_namespace_lock);
+
+ return (0);
+}
+
+/*
+ * Destroy a storage pool.
+ */
+int
+spa_destroy(char *pool)
+{
+ return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL,
+ B_FALSE, B_FALSE));
+}
+
+/*
+ * Export a storage pool.
+ */
+int
+spa_export(char *pool, nvlist_t **oldconfig, boolean_t force,
+ boolean_t hardforce)
+{
+ return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig,
+ force, hardforce));
+}
+
+/*
+ * Similar to spa_export(), this unloads the spa_t without actually removing it
+ * from the namespace in any way.
+ */
+int
+spa_reset(char *pool)
+{
+ return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL,
+ B_FALSE, B_FALSE));
+}
+
+/*
+ * ==========================================================================
+ * Device manipulation
+ * ==========================================================================
+ */
+
+/*
+ * Add a device to a storage pool.
+ */
+int
+spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
+{
+ uint64_t txg, id;
+ int error;
+ vdev_t *rvd = spa->spa_root_vdev;
+ vdev_t *vd, *tvd;
+ nvlist_t **spares, **l2cache;
+ uint_t nspares, nl2cache;
+
+ ASSERT(spa_writeable(spa));
+
+ txg = spa_vdev_enter(spa);
+
+ if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
+ VDEV_ALLOC_ADD)) != 0)
+ return (spa_vdev_exit(spa, NULL, txg, error));
+
+ spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */
+
+ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares,
+ &nspares) != 0)
+ nspares = 0;
+
+ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache,
+ &nl2cache) != 0)
+ nl2cache = 0;
+
+ if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0)
+ return (spa_vdev_exit(spa, vd, txg, EINVAL));
+
+ if (vd->vdev_children != 0 &&
+ (error = vdev_create(vd, txg, B_FALSE)) != 0)
+ return (spa_vdev_exit(spa, vd, txg, error));
+
+ /*
+ * We must validate the spares and l2cache devices after checking the
+ * children. 
Otherwise, vdev_inuse() will blindly overwrite the spare. + */ + if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) + return (spa_vdev_exit(spa, vd, txg, error)); + + /* + * If we are in the middle of a device removal, we can only add + * devices which match the existing devices in the pool. + * If we are in the middle of a removal, or have some indirect + * vdevs, we can not add raidz toplevels. + */ + if (spa->spa_vdev_removal != NULL || + spa->spa_removing_phys.sr_prev_indirect_vdev != -1) { + for (int c = 0; c < vd->vdev_children; c++) { + tvd = vd->vdev_child[c]; + if (spa->spa_vdev_removal != NULL && + tvd->vdev_ashift != spa->spa_max_ashift) { + return (spa_vdev_exit(spa, vd, txg, EINVAL)); + } + /* Fail if top level vdev is raidz */ + if (tvd->vdev_ops == &vdev_raidz_ops) { + return (spa_vdev_exit(spa, vd, txg, EINVAL)); + } + /* + * Need the top level mirror to be + * a mirror of leaf vdevs only + */ + if (tvd->vdev_ops == &vdev_mirror_ops) { + for (uint64_t cid = 0; + cid < tvd->vdev_children; cid++) { + vdev_t *cvd = tvd->vdev_child[cid]; + if (!cvd->vdev_ops->vdev_op_leaf) { + return (spa_vdev_exit(spa, vd, + txg, EINVAL)); + } + } + } + } + } + + for (int c = 0; c < vd->vdev_children; c++) { + + /* + * Set the vdev id to the first hole, if one exists. + */ + for (id = 0; id < rvd->vdev_children; id++) { + if (rvd->vdev_child[id]->vdev_ishole) { + vdev_free(rvd->vdev_child[id]); + break; + } + } + tvd = vd->vdev_child[c]; + vdev_remove_child(vd, tvd); + tvd->vdev_id = id; + vdev_add_child(rvd, tvd); + vdev_config_dirty(tvd); + } + + if (nspares != 0) { + spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, + ZPOOL_CONFIG_SPARES); + spa_load_spares(spa); + spa->spa_spares.sav_sync = B_TRUE; + } + + if (nl2cache != 0) { + spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, + ZPOOL_CONFIG_L2CACHE); + spa_load_l2cache(spa); + spa->spa_l2cache.sav_sync = B_TRUE; + } + + /* + * We have to be careful when adding new vdevs to an existing pool. + * If other threads start allocating from these vdevs before we + * sync the config cache, and we lose power, then upon reboot we may + * fail to open the pool because there are DVAs that the config cache + * can't translate. Therefore, we first add the vdevs without + * initializing metaslabs; sync the config cache (via spa_vdev_exit()); + * and then let spa_config_update() initialize the new metaslabs. + * + * spa_load() checks for added-but-not-initialized vdevs, so that + * if we lose power at any point in this sequence, the remaining + * steps will be completed the next time we load the pool. + */ + (void) spa_vdev_exit(spa, vd, txg, 0); + + mutex_enter(&spa_namespace_lock); + spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); + spa_event_notify(spa, NULL, NULL, ESC_ZFS_VDEV_ADD); + mutex_exit(&spa_namespace_lock); + + return (0); +} + +/* + * Attach a device to a mirror. The arguments are the path to any device + * in the mirror, and the nvroot for the new device. If the path specifies + * a device that is not mirrored, we automatically insert the mirror vdev. 
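+ * For example (device names hypothetical), attaching da1 to a pool
+ * whose only top-level vdev is the plain disk da0 turns
+ *
+ *	da0		into		mirror
+ *					    da0
+ *					    da1
+ *
+ * and da1 is then resilvered from da0.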
+ * + * If 'replacing' is specified, the new device is intended to replace the + * existing device; in this case the two devices are made into their own + * mirror using the 'replacing' vdev, which is functionally identical to + * the mirror vdev (it actually reuses all the same ops) but has a few + * extra rules: you can't attach to it after it's been created, and upon + * completion of resilvering, the first disk (the one being replaced) + * is automatically detached. + */ +int +spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) +{ + uint64_t txg, dtl_max_txg; + vdev_t *rvd = spa->spa_root_vdev; + vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; + vdev_ops_t *pvops; + char *oldvdpath, *newvdpath; + int newvd_isspare; + int error; + + ASSERT(spa_writeable(spa)); + + txg = spa_vdev_enter(spa); + + oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { + error = (spa_has_checkpoint(spa)) ? + ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; + return (spa_vdev_exit(spa, NULL, txg, error)); + } + + if (spa->spa_vdev_removal != NULL) + return (spa_vdev_exit(spa, NULL, txg, EBUSY)); + + if (oldvd == NULL) + return (spa_vdev_exit(spa, NULL, txg, ENODEV)); + + if (!oldvd->vdev_ops->vdev_op_leaf) + return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); + + pvd = oldvd->vdev_parent; + + if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, + VDEV_ALLOC_ATTACH)) != 0) + return (spa_vdev_exit(spa, NULL, txg, EINVAL)); + + if (newrootvd->vdev_children != 1) + return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); + + newvd = newrootvd->vdev_child[0]; + + if (!newvd->vdev_ops->vdev_op_leaf) + return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); + + if ((error = vdev_create(newrootvd, txg, replacing)) != 0) + return (spa_vdev_exit(spa, newrootvd, txg, error)); + + /* + * Spares can't replace logs + */ + if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) + return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); + + if (!replacing) { + /* + * For attach, the only allowable parent is a mirror or the root + * vdev. + */ + if (pvd->vdev_ops != &vdev_mirror_ops && + pvd->vdev_ops != &vdev_root_ops) + return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); + + pvops = &vdev_mirror_ops; + } else { + /* + * Active hot spares can only be replaced by inactive hot + * spares. + */ + if (pvd->vdev_ops == &vdev_spare_ops && + oldvd->vdev_isspare && + !spa_has_spare(spa, newvd->vdev_guid)) + return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); + + /* + * If the source is a hot spare, and the parent isn't already a + * spare, then we want to create a new hot spare. Otherwise, we + * want to create a replacing vdev. The user is not allowed to + * attach to a spared vdev child unless the 'isspare' state is + * the same (spare replaces spare, non-spare replaces + * non-spare). + */ + if (pvd->vdev_ops == &vdev_replacing_ops && + spa_version(spa) < SPA_VERSION_MULTI_REPLACE) { + return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); + } else if (pvd->vdev_ops == &vdev_spare_ops && + newvd->vdev_isspare != oldvd->vdev_isspare) { + return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); + } + + if (newvd->vdev_isspare) + pvops = &vdev_spare_ops; + else + pvops = &vdev_replacing_ops; + } + + /* + * Make sure the new device is big enough. 
+ */ + if (newvd->vdev_asize < vdev_get_min_asize(oldvd)) + return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); + + /* + * The new device cannot have a higher alignment requirement + * than the top-level vdev. + */ + if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) + return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); + + /* + * If this is an in-place replacement, update oldvd's path and devid + * to make it distinguishable from newvd, and unopenable from now on. + */ + if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { + spa_strfree(oldvd->vdev_path); + oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, + KM_SLEEP); + (void) sprintf(oldvd->vdev_path, "%s/%s", + newvd->vdev_path, "old"); + if (oldvd->vdev_devid != NULL) { + spa_strfree(oldvd->vdev_devid); + oldvd->vdev_devid = NULL; + } + } + + /* mark the device being resilvered */ + newvd->vdev_resilver_txg = txg; + + /* + * If the parent is not a mirror, or if we're replacing, insert the new + * mirror/replacing/spare vdev above oldvd. + */ + if (pvd->vdev_ops != pvops) + pvd = vdev_add_parent(oldvd, pvops); + + ASSERT(pvd->vdev_top->vdev_parent == rvd); + ASSERT(pvd->vdev_ops == pvops); + ASSERT(oldvd->vdev_parent == pvd); + + /* + * Extract the new device from its root and add it to pvd. + */ + vdev_remove_child(newrootvd, newvd); + newvd->vdev_id = pvd->vdev_children; + newvd->vdev_crtxg = oldvd->vdev_crtxg; + vdev_add_child(pvd, newvd); + + tvd = newvd->vdev_top; + ASSERT(pvd->vdev_top == tvd); + ASSERT(tvd->vdev_parent == rvd); + + vdev_config_dirty(tvd); + + /* + * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account + * for any dmu_sync-ed blocks. It will propagate upward when + * spa_vdev_exit() calls vdev_dtl_reassess(). + */ + dtl_max_txg = txg + TXG_CONCURRENT_STATES; + + vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, + dtl_max_txg - TXG_INITIAL); + + if (newvd->vdev_isspare) { + spa_spare_activate(newvd); + spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE); + } + + oldvdpath = spa_strdup(oldvd->vdev_path); + newvdpath = spa_strdup(newvd->vdev_path); + newvd_isspare = newvd->vdev_isspare; + + /* + * Mark newvd's DTL dirty in this txg. + */ + vdev_dirty(tvd, VDD_DTL, newvd, txg); + + /* + * Schedule the resilver to restart in the future. We do this to + * ensure that dmu_sync-ed blocks have been stitched into the + * respective datasets. + */ + dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg); + + if (spa->spa_bootfs) + spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH); + + spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_ATTACH); + + /* + * Commit the config + */ + (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0); + + spa_history_log_internal(spa, "vdev attach", NULL, + "%s vdev=%s %s vdev=%s", + replacing && newvd_isspare ? "spare in" : + replacing ? "replace" : "attach", newvdpath, + replacing ? "for" : "to", oldvdpath); + + spa_strfree(oldvdpath); + spa_strfree(newvdpath); + + return (0); +} + +/* + * Detach a device from a mirror or replacing vdev. + * + * If 'replace_done' is specified, only detach if the parent + * is a replacing vdev. 
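+ *
+ * Caller-side sketch (illustrative): a plain detach of a mirror
+ * child identified by guid, with no parent check and no
+ * replace_done restriction:
+ *
+ *	error = spa_vdev_detach(spa, guid, 0ULL, B_FALSE);
+ *
+ * Passing pguid == 0 skips the parent/child consistency check
+ * described in the function body.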
+ */
+int
+spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
+{
+ uint64_t txg;
+ int error;
+ vdev_t *rvd = spa->spa_root_vdev;
+ vdev_t *vd, *pvd, *cvd, *tvd;
+ boolean_t unspare = B_FALSE;
+ uint64_t unspare_guid = 0;
+ char *vdpath;
+
+ ASSERT(spa_writeable(spa));
+
+ txg = spa_vdev_enter(spa);
+
+ vd = spa_lookup_by_guid(spa, guid, B_FALSE);
+
+ /*
+ * Besides being called directly from userland through the
+ * ioctl interface, spa_vdev_detach() can potentially be called
+ * at the end of spa_vdev_resilver_done().
+ *
+ * In the regular case, when we have a checkpoint this shouldn't
+ * happen as we never empty the DTLs of a vdev during the scrub
+ * [see comment in dsl_scan_done()]. Thus spa_vdev_resilver_done()
+ * should never get here when we have a checkpoint.
+ *
+ * That said, even in the case where we checkpoint the pool exactly
+ * as spa_vdev_resilver_done() calls this function everything
+ * should be fine as the resilver will return right away.
+ */
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
+ error = (spa_has_checkpoint(spa)) ?
+ ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
+ return (spa_vdev_exit(spa, NULL, txg, error));
+ }
+
+ if (vd == NULL)
+ return (spa_vdev_exit(spa, NULL, txg, ENODEV));
+
+ if (!vd->vdev_ops->vdev_op_leaf)
+ return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+
+ pvd = vd->vdev_parent;
+
+ /*
+ * If the parent/child relationship is not as expected, don't do it.
+ * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing
+ * vdev that's replacing B with C. The user's intent in replacing
+ * is to go from M(A,B) to M(A,C). If the user decides to cancel
+ * the replace by detaching C, the expected behavior is to end up
+ * M(A,B). But suppose that right after deciding to detach C,
+ * the replacement of B completes. We would have M(A,C), and then
+ * ask to detach C, which would leave us with just A -- not what
+ * the user wanted. To prevent this, we make sure that the
+ * parent/child relationship hasn't changed -- in this example,
+ * that C's parent is still the replacing vdev R.
+ */
+ if (pvd->vdev_guid != pguid && pguid != 0)
+ return (spa_vdev_exit(spa, NULL, txg, EBUSY));
+
+ /*
+ * Only 'replacing' or 'spare' vdevs can be replaced.
+ */
+ if (replace_done && pvd->vdev_ops != &vdev_replacing_ops &&
+ pvd->vdev_ops != &vdev_spare_ops)
+ return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+
+ ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
+ spa_version(spa) >= SPA_VERSION_SPARES);
+
+ /*
+ * Only mirror, replacing, and spare vdevs support detach.
+ */
+ if (pvd->vdev_ops != &vdev_replacing_ops &&
+ pvd->vdev_ops != &vdev_mirror_ops &&
+ pvd->vdev_ops != &vdev_spare_ops)
+ return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+
+ /*
+ * If this device has the only valid copy of some data,
+ * we cannot safely detach it.
+ */
+ if (vdev_dtl_required(vd))
+ return (spa_vdev_exit(spa, NULL, txg, EBUSY));
+
+ ASSERT(pvd->vdev_children >= 2);
+
+ /*
+ * If we are detaching the second disk from a replacing vdev, then
+ * check to see if we changed the original vdev's path to have "/old"
+ * at the end in spa_vdev_attach(). If so, undo that change now. 
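+ * For example (paths illustrative): if both halves of the replacing
+ * vdev were attached under "/dev/da0", the surviving original disk
+ * was renamed "/dev/da0/old" by spa_vdev_attach(); cancelling the
+ * replacement by detaching the new disk restores it to "/dev/da0".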
+ */ + if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 && + vd->vdev_path != NULL) { + size_t len = strlen(vd->vdev_path); + + for (int c = 0; c < pvd->vdev_children; c++) { + cvd = pvd->vdev_child[c]; + + if (cvd == vd || cvd->vdev_path == NULL) + continue; + + if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && + strcmp(cvd->vdev_path + len, "/old") == 0) { + spa_strfree(cvd->vdev_path); + cvd->vdev_path = spa_strdup(vd->vdev_path); + break; + } + } + } + + /* + * If we are detaching the original disk from a spare, then it implies + * that the spare should become a real disk, and be removed from the + * active spare list for the pool. + */ + if (pvd->vdev_ops == &vdev_spare_ops && + vd->vdev_id == 0 && + pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare) + unspare = B_TRUE; + + /* + * Erase the disk labels so the disk can be used for other things. + * This must be done after all other error cases are handled, + * but before we disembowel vd (so we can still do I/O to it). + * But if we can't do it, don't treat the error as fatal -- + * it may be that the unwritability of the disk is the reason + * it's being detached! + */ + error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); + + /* + * Remove vd from its parent and compact the parent's children. + */ + vdev_remove_child(pvd, vd); + vdev_compact_children(pvd); + + /* + * Remember one of the remaining children so we can get tvd below. + */ + cvd = pvd->vdev_child[pvd->vdev_children - 1]; + + /* + * If we need to remove the remaining child from the list of hot spares, + * do it now, marking the vdev as no longer a spare in the process. + * We must do this before vdev_remove_parent(), because that can + * change the GUID if it creates a new toplevel GUID. For a similar + * reason, we must remove the spare now, in the same txg as the detach; + * otherwise someone could attach a new sibling, change the GUID, and + * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. + */ + if (unspare) { + ASSERT(cvd->vdev_isspare); + spa_spare_remove(cvd); + unspare_guid = cvd->vdev_guid; + (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); + cvd->vdev_unspare = B_TRUE; + } + + /* + * If the parent mirror/replacing vdev only has one child, + * the parent is no longer needed. Remove it from the tree. + */ + if (pvd->vdev_children == 1) { + if (pvd->vdev_ops == &vdev_spare_ops) + cvd->vdev_unspare = B_FALSE; + vdev_remove_parent(cvd); + } + + + /* + * We don't set tvd until now because the parent we just removed + * may have been the previous top-level vdev. + */ + tvd = cvd->vdev_top; + ASSERT(tvd->vdev_parent == rvd); + + /* + * Reevaluate the parent vdev state. + */ + vdev_propagate_state(cvd); + + /* + * If the 'autoexpand' property is set on the pool then automatically + * try to expand the size of the pool. For example if the device we + * just detached was smaller than the others, it may be possible to + * add metaslabs (i.e. grow the pool). We need to reopen the vdev + * first so that we can obtain the updated sizes of the leaf vdevs. + */ + if (spa->spa_autoexpand) { + vdev_reopen(tvd); + vdev_expand(tvd, txg); + } + + vdev_config_dirty(tvd); + + /* + * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that + * vd->vdev_detached is set and free vd's DTL object in syncing context. + * But first make sure we're not on any *other* txg's DTL list, to + * prevent vd from being accessed after it's freed. 
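+ *
+ * (TXG_SIZE is just the handful of in-flight txgs, so the removal
+ * loop below is cheap; afterwards vd remains only on this txg's
+ * dirty list and is freed once vdev_dtl_sync() sees vdev_detached.)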
+ */
+ vdpath = spa_strdup(vd->vdev_path);
+ for (int t = 0; t < TXG_SIZE; t++)
+ (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
+ vd->vdev_detached = B_TRUE;
+ vdev_dirty(tvd, VDD_DTL, vd, txg);
+
+ spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE);
+
+ /* hang on to the spa before we release the lock */
+ spa_open_ref(spa, FTAG);
+
+ error = spa_vdev_exit(spa, vd, txg, 0);
+
+ spa_history_log_internal(spa, "detach", NULL,
+ "vdev=%s", vdpath);
+ spa_strfree(vdpath);
+
+ /*
+ * If this was the removal of the original device in a hot spare vdev,
+ * then we want to go through and remove the device from the hot spare
+ * list of every other pool.
+ */
+ if (unspare) {
+ spa_t *altspa = NULL;
+
+ mutex_enter(&spa_namespace_lock);
+ while ((altspa = spa_next(altspa)) != NULL) {
+ if (altspa->spa_state != POOL_STATE_ACTIVE ||
+ altspa == spa)
+ continue;
+
+ spa_open_ref(altspa, FTAG);
+ mutex_exit(&spa_namespace_lock);
+ (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE);
+ mutex_enter(&spa_namespace_lock);
+ spa_close(altspa, FTAG);
+ }
+ mutex_exit(&spa_namespace_lock);
+
+ /* search the rest of the vdevs for spares to remove */
+ spa_vdev_resilver_done(spa);
+ }
+
+ /* all done with the spa; OK to release */
+ mutex_enter(&spa_namespace_lock);
+ spa_close(spa, FTAG);
+ mutex_exit(&spa_namespace_lock);
+
+ return (error);
+}
+
+int
+spa_vdev_initialize(spa_t *spa, uint64_t guid, uint64_t cmd_type)
+{
+ /*
+ * We hold the namespace lock through the whole function
+ * to prevent any changes to the pool while we're starting or
+ * stopping initialization. The config and state locks are held so that
+ * we can properly assess the vdev state before we commit to
+ * the initializing operation.
+ */
+ mutex_enter(&spa_namespace_lock);
+ spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
+
+ /* Look up vdev and ensure it's a leaf. */
+ vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE);
+ if (vd == NULL || vd->vdev_detached) {
+ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+ mutex_exit(&spa_namespace_lock);
+ return (SET_ERROR(ENODEV));
+ } else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) {
+ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+ mutex_exit(&spa_namespace_lock);
+ return (SET_ERROR(EINVAL));
+ } else if (!vdev_writeable(vd)) {
+ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+ mutex_exit(&spa_namespace_lock);
+ return (SET_ERROR(EROFS));
+ }
+ mutex_enter(&vd->vdev_initialize_lock);
+ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+
+ /*
+ * When we activate an initialize action we check to see
+ * if the vdev_initialize_thread is NULL. We do this instead
+ * of using the vdev_initialize_state since there might be
+ * a previous initialization process which has completed but
+ * whose thread has not yet exited. 
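+ *
+ * Summary of the dispatch below:
+ *
+ *	POOL_INITIALIZE_DO	EBUSY if a thread is already running
+ *				or the top-level vdev is being removed
+ *	POOL_INITIALIZE_CANCEL	ESRCH unless ACTIVE or SUSPENDED
+ *	POOL_INITIALIZE_SUSPEND	ESRCH unless ACTIVE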
+ */ + if (cmd_type == POOL_INITIALIZE_DO && + (vd->vdev_initialize_thread != NULL || + vd->vdev_top->vdev_removing)) { + mutex_exit(&vd->vdev_initialize_lock); + mutex_exit(&spa_namespace_lock); + return (SET_ERROR(EBUSY)); + } else if (cmd_type == POOL_INITIALIZE_CANCEL && + (vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE && + vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED)) { + mutex_exit(&vd->vdev_initialize_lock); + mutex_exit(&spa_namespace_lock); + return (SET_ERROR(ESRCH)); + } else if (cmd_type == POOL_INITIALIZE_SUSPEND && + vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE) { + mutex_exit(&vd->vdev_initialize_lock); + mutex_exit(&spa_namespace_lock); + return (SET_ERROR(ESRCH)); + } + + switch (cmd_type) { + case POOL_INITIALIZE_DO: + vdev_initialize(vd); + break; + case POOL_INITIALIZE_CANCEL: + vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED); + break; + case POOL_INITIALIZE_SUSPEND: + vdev_initialize_stop(vd, VDEV_INITIALIZE_SUSPENDED); + break; + default: + panic("invalid cmd_type %llu", (unsigned long long)cmd_type); + } + mutex_exit(&vd->vdev_initialize_lock); + + /* Sync out the initializing state */ + txg_wait_synced(spa->spa_dsl_pool, 0); + mutex_exit(&spa_namespace_lock); + + return (0); +} + + +/* + * Split a set of devices from their mirrors, and create a new pool from them. + */ +int +spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, + nvlist_t *props, boolean_t exp) +{ + int error = 0; + uint64_t txg, *glist; + spa_t *newspa; + uint_t c, children, lastlog; + nvlist_t **child, *nvl, *tmp; + dmu_tx_t *tx; + char *altroot = NULL; + vdev_t *rvd, **vml = NULL; /* vdev modify list */ + boolean_t activate_slog; + + ASSERT(spa_writeable(spa)); + + txg = spa_vdev_enter(spa); + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { + error = (spa_has_checkpoint(spa)) ? + ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; + return (spa_vdev_exit(spa, NULL, txg, error)); + } + + /* clear the log and flush everything up to now */ + activate_slog = spa_passivate_log(spa); + (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); + error = spa_reset_logs(spa); + txg = spa_vdev_config_enter(spa); + + if (activate_slog) + spa_activate_log(spa); + + if (error != 0) + return (spa_vdev_exit(spa, NULL, txg, error)); + + /* check new spa name before going any further */ + if (spa_lookup(newname) != NULL) + return (spa_vdev_exit(spa, NULL, txg, EEXIST)); + + /* + * scan through all the children to ensure they're all mirrors + */ + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || + nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, + &children) != 0) + return (spa_vdev_exit(spa, NULL, txg, EINVAL)); + + /* first, check to ensure we've got the right child count */ + rvd = spa->spa_root_vdev; + lastlog = 0; + for (c = 0; c < rvd->vdev_children; c++) { + vdev_t *vd = rvd->vdev_child[c]; + + /* don't count the holes & logs as children */ + if (vd->vdev_islog || !vdev_is_concrete(vd)) { + if (lastlog == 0) + lastlog = c; + continue; + } + + lastlog = 0; + } + if (children != (lastlog != 0 ? 
lastlog : rvd->vdev_children)) + return (spa_vdev_exit(spa, NULL, txg, EINVAL)); + + /* next, ensure no spare or cache devices are part of the split */ + if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 || + nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0) + return (spa_vdev_exit(spa, NULL, txg, EINVAL)); + + vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP); + glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP); + + /* then, loop over each vdev and validate it */ + for (c = 0; c < children; c++) { + uint64_t is_hole = 0; + + (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, + &is_hole); + + if (is_hole != 0) { + if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole || + spa->spa_root_vdev->vdev_child[c]->vdev_islog) { + continue; + } else { + error = SET_ERROR(EINVAL); + break; + } + } + + /* which disk is going to be split? */ + if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, + &glist[c]) != 0) { + error = SET_ERROR(EINVAL); + break; + } + + /* look it up in the spa */ + vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE); + if (vml[c] == NULL) { + error = SET_ERROR(ENODEV); + break; + } + + /* make sure there's nothing stopping the split */ + if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops || + vml[c]->vdev_islog || + !vdev_is_concrete(vml[c]) || + vml[c]->vdev_isspare || + vml[c]->vdev_isl2cache || + !vdev_writeable(vml[c]) || + vml[c]->vdev_children != 0 || + vml[c]->vdev_state != VDEV_STATE_HEALTHY || + c != spa->spa_root_vdev->vdev_child[c]->vdev_id) { + error = SET_ERROR(EINVAL); + break; + } + + if (vdev_dtl_required(vml[c])) { + error = SET_ERROR(EBUSY); + break; + } + + /* we need certain info from the top level */ + VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY, + vml[c]->vdev_top->vdev_ms_array) == 0); + VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT, + vml[c]->vdev_top->vdev_ms_shift) == 0); + VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE, + vml[c]->vdev_top->vdev_asize) == 0); + VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT, + vml[c]->vdev_top->vdev_ashift) == 0); + + /* transfer per-vdev ZAPs */ + ASSERT3U(vml[c]->vdev_leaf_zap, !=, 0); + VERIFY0(nvlist_add_uint64(child[c], + ZPOOL_CONFIG_VDEV_LEAF_ZAP, vml[c]->vdev_leaf_zap)); + + ASSERT3U(vml[c]->vdev_top->vdev_top_zap, !=, 0); + VERIFY0(nvlist_add_uint64(child[c], + ZPOOL_CONFIG_VDEV_TOP_ZAP, + vml[c]->vdev_parent->vdev_top_zap)); + } + + if (error != 0) { + kmem_free(vml, children * sizeof (vdev_t *)); + kmem_free(glist, children * sizeof (uint64_t)); + return (spa_vdev_exit(spa, NULL, txg, error)); + } + + /* stop writers from using the disks */ + for (c = 0; c < children; c++) { + if (vml[c] != NULL) + vml[c]->vdev_offline = B_TRUE; + } + vdev_reopen(spa->spa_root_vdev); + + /* + * Temporarily record the splitting vdevs in the spa config. This + * will disappear once the config is regenerated. + */ + VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, + glist, children) == 0); + kmem_free(glist, children * sizeof (uint64_t)); + + mutex_enter(&spa->spa_props_lock); + VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, + nvl) == 0); + mutex_exit(&spa->spa_props_lock); + spa->spa_config_splitting = nvl; + vdev_config_dirty(spa->spa_root_vdev); + + /* configure and create the new pool */ + VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0); + VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, + exp ? 
POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0); + VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, + spa_version(spa)) == 0); + VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, + spa->spa_config_txg) == 0); + VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, + spa_generate_guid(NULL)) == 0); + VERIFY0(nvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); + (void) nvlist_lookup_string(props, + zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); + + /* add the new pool to the namespace */ + newspa = spa_add(newname, config, altroot); + newspa->spa_avz_action = AVZ_ACTION_REBUILD; + newspa->spa_config_txg = spa->spa_config_txg; + spa_set_log_state(newspa, SPA_LOG_CLEAR); + + /* release the spa config lock, retaining the namespace lock */ + spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); + + if (zio_injection_enabled) + zio_handle_panic_injection(spa, FTAG, 1); + + spa_activate(newspa, spa_mode_global); + spa_async_suspend(newspa); + + for (c = 0; c < children; c++) { + if (vml[c] != NULL) { + /* + * Temporarily stop the initializing activity. We set + * the state to ACTIVE so that we know to resume + * the initializing once the split has completed. + */ + mutex_enter(&vml[c]->vdev_initialize_lock); + vdev_initialize_stop(vml[c], VDEV_INITIALIZE_ACTIVE); + mutex_exit(&vml[c]->vdev_initialize_lock); + } + } + +#ifndef illumos + /* mark that we are creating new spa by splitting */ + newspa->spa_splitting_newspa = B_TRUE; +#endif + newspa->spa_config_source = SPA_CONFIG_SRC_SPLIT; + + /* create the new pool from the disks of the original pool */ + error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE); +#ifndef illumos + newspa->spa_splitting_newspa = B_FALSE; +#endif + if (error) + goto out; + + /* if that worked, generate a real config for the new pool */ + if (newspa->spa_root_vdev != NULL) { + VERIFY(nvlist_alloc(&newspa->spa_config_splitting, + NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_uint64(newspa->spa_config_splitting, + ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0); + spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL, + B_TRUE)); + } + + /* set the props */ + if (props != NULL) { + spa_configfile_set(newspa, props, B_FALSE); + error = spa_prop_set(newspa, props); + if (error) + goto out; + } + + /* flush everything */ + txg = spa_vdev_config_enter(newspa); + vdev_config_dirty(newspa->spa_root_vdev); + (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG); + + if (zio_injection_enabled) + zio_handle_panic_injection(spa, FTAG, 2); + + spa_async_resume(newspa); + + /* finally, update the original pool's config */ + txg = spa_vdev_config_enter(spa); + tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error != 0) + dmu_tx_abort(tx); + for (c = 0; c < children; c++) { + if (vml[c] != NULL) { + vdev_split(vml[c]); + if (error == 0) + spa_history_log_internal(spa, "detach", tx, + "vdev=%s", vml[c]->vdev_path); + + vdev_free(vml[c]); + } + } + spa->spa_avz_action = AVZ_ACTION_REBUILD; + vdev_config_dirty(spa->spa_root_vdev); + spa->spa_config_splitting = NULL; + nvlist_free(nvl); + if (error == 0) + dmu_tx_commit(tx); + (void) spa_vdev_exit(spa, NULL, txg, 0); + + if (zio_injection_enabled) + zio_handle_panic_injection(spa, FTAG, 3); + + /* split is complete; log a history record */ + spa_history_log_internal(newspa, "split", NULL, + "from pool %s", spa_name(spa)); + + kmem_free(vml, children * sizeof (vdev_t *)); + + /* if we're not going to mount the filesystems in userland, export */ + if 
(exp) + error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL, + B_FALSE, B_FALSE); + + return (error); + +out: + spa_unload(newspa); + spa_deactivate(newspa); + spa_remove(newspa); + + txg = spa_vdev_config_enter(spa); + + /* re-online all offlined disks */ + for (c = 0; c < children; c++) { + if (vml[c] != NULL) + vml[c]->vdev_offline = B_FALSE; + } + + /* restart initializing disks as necessary */ + spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART); + + vdev_reopen(spa->spa_root_vdev); + + nvlist_free(spa->spa_config_splitting); + spa->spa_config_splitting = NULL; + (void) spa_vdev_exit(spa, NULL, txg, error); + + kmem_free(vml, children * sizeof (vdev_t *)); + return (error); +} + +/* + * Find any device that's done replacing, or a vdev marked 'unspare' that's + * currently spared, so we can detach it. + */ +static vdev_t * +spa_vdev_resilver_done_hunt(vdev_t *vd) +{ + vdev_t *newvd, *oldvd; + + for (int c = 0; c < vd->vdev_children; c++) { + oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); + if (oldvd != NULL) + return (oldvd); + } + + /* + * Check for a completed replacement. We always consider the first + * vdev in the list to be the oldest vdev, and the last one to be + * the newest (see spa_vdev_attach() for how that works). In + * the case where the newest vdev is faulted, we will not automatically + * remove it after a resilver completes. This is OK as it will require + * user intervention to determine which disk the admin wishes to keep. + */ + if (vd->vdev_ops == &vdev_replacing_ops) { + ASSERT(vd->vdev_children > 1); + + newvd = vd->vdev_child[vd->vdev_children - 1]; + oldvd = vd->vdev_child[0]; + + if (vdev_dtl_empty(newvd, DTL_MISSING) && + vdev_dtl_empty(newvd, DTL_OUTAGE) && + !vdev_dtl_required(oldvd)) + return (oldvd); + } + + /* + * Check for a completed resilver with the 'unspare' flag set. + * Also potentially update faulted state. + */ + if (vd->vdev_ops == &vdev_spare_ops) { + vdev_t *first = vd->vdev_child[0]; + vdev_t *last = vd->vdev_child[vd->vdev_children - 1]; + + if (last->vdev_unspare) { + oldvd = first; + newvd = last; + } else if (first->vdev_unspare) { + oldvd = last; + newvd = first; + } else { + oldvd = NULL; + } + + if (oldvd != NULL && + vdev_dtl_empty(newvd, DTL_MISSING) && + vdev_dtl_empty(newvd, DTL_OUTAGE) && + !vdev_dtl_required(oldvd)) + return (oldvd); + + vdev_propagate_state(vd); + + /* + * If there are more than two spares attached to a disk, + * and those spares are not required, then we want to + * attempt to free them up now so that they can be used + * by other pools. Once we're back down to a single + * disk+spare, we stop removing them. + */ + if (vd->vdev_children > 2) { + newvd = vd->vdev_child[1]; + + if (newvd->vdev_isspare && last->vdev_isspare && + vdev_dtl_empty(last, DTL_MISSING) && + vdev_dtl_empty(last, DTL_OUTAGE) && + !vdev_dtl_required(newvd)) + return (newvd); + } + } + + return (NULL); +} + +static void +spa_vdev_resilver_done(spa_t *spa) +{ + vdev_t *vd, *pvd, *ppvd; + uint64_t guid, sguid, pguid, ppguid; + + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + + while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { + pvd = vd->vdev_parent; + ppvd = pvd->vdev_parent; + guid = vd->vdev_guid; + pguid = pvd->vdev_guid; + ppguid = ppvd->vdev_guid; + sguid = 0; + /* + * If we have just finished replacing a hot spared device, then + * we need to detach the parent's first child (the original hot + * spare) as well. 
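+ *
+ * Sketch of the tree in that case (guids elided):
+ *
+ *	spare			<- ppvd
+ *	    replacing		<- pvd, vdev_id == 0
+ *		old disk	<- vd, resilver just completed
+ *		new disk
+ *	    hot spare		<- ppvd->vdev_child[1], detached too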
+ */ + if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 && + ppvd->vdev_children == 2) { + ASSERT(pvd->vdev_ops == &vdev_replacing_ops); + sguid = ppvd->vdev_child[1]->vdev_guid; + } + ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd)); + + spa_config_exit(spa, SCL_ALL, FTAG); + if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) + return; + if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) + return; + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + } + + spa_config_exit(spa, SCL_ALL, FTAG); +} + +/* + * Update the stored path or FRU for this vdev. + */ +int +spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, + boolean_t ispath) +{ + vdev_t *vd; + boolean_t sync = B_FALSE; + + ASSERT(spa_writeable(spa)); + + spa_vdev_state_enter(spa, SCL_ALL); + + if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) + return (spa_vdev_state_exit(spa, NULL, ENOENT)); + + if (!vd->vdev_ops->vdev_op_leaf) + return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); + + if (ispath) { + if (strcmp(value, vd->vdev_path) != 0) { + spa_strfree(vd->vdev_path); + vd->vdev_path = spa_strdup(value); + sync = B_TRUE; + } + } else { + if (vd->vdev_fru == NULL) { + vd->vdev_fru = spa_strdup(value); + sync = B_TRUE; + } else if (strcmp(value, vd->vdev_fru) != 0) { + spa_strfree(vd->vdev_fru); + vd->vdev_fru = spa_strdup(value); + sync = B_TRUE; + } + } + + return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0)); +} + +int +spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) +{ + return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); +} + +int +spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) +{ + return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); +} + +/* + * ========================================================================== + * SPA Scanning + * ========================================================================== + */ +int +spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t cmd) +{ + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); + + if (dsl_scan_resilvering(spa->spa_dsl_pool)) + return (SET_ERROR(EBUSY)); + + return (dsl_scrub_set_pause_resume(spa->spa_dsl_pool, cmd)); +} + +int +spa_scan_stop(spa_t *spa) +{ + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); + if (dsl_scan_resilvering(spa->spa_dsl_pool)) + return (SET_ERROR(EBUSY)); + return (dsl_scan_cancel(spa->spa_dsl_pool)); +} + +int +spa_scan(spa_t *spa, pool_scan_func_t func) +{ + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); + + if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE) + return (SET_ERROR(ENOTSUP)); + + /* + * If a resilver was requested, but there is no DTL on a + * writeable leaf device, we have nothing to do. 
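+ * In that case a call such as
+ *
+ *	(void) spa_scan(spa, POOL_SCAN_RESILVER);
+ *
+ * merely queues SPA_ASYNC_RESILVER_DONE and returns 0 instead of
+ * starting a scan.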
+ */ + if (func == POOL_SCAN_RESILVER && + !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { + spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); + return (0); + } + + return (dsl_scan(spa->spa_dsl_pool, func)); +} + +/* + * ========================================================================== + * SPA async task processing + * ========================================================================== + */ + +static void +spa_async_remove(spa_t *spa, vdev_t *vd) +{ + if (vd->vdev_remove_wanted) { + vd->vdev_remove_wanted = B_FALSE; + vd->vdev_delayed_close = B_FALSE; + vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); + + /* + * We want to clear the stats, but we don't want to do a full + * vdev_clear() as that will cause us to throw away + * degraded/faulted state as well as attempt to reopen the + * device, all of which is a waste. + */ + vd->vdev_stat.vs_read_errors = 0; + vd->vdev_stat.vs_write_errors = 0; + vd->vdev_stat.vs_checksum_errors = 0; + + vdev_state_dirty(vd->vdev_top); + /* Tell userspace that the vdev is gone. */ + zfs_post_remove(spa, vd); + } + + for (int c = 0; c < vd->vdev_children; c++) + spa_async_remove(spa, vd->vdev_child[c]); +} + +static void +spa_async_probe(spa_t *spa, vdev_t *vd) +{ + if (vd->vdev_probe_wanted) { + vd->vdev_probe_wanted = B_FALSE; + vdev_reopen(vd); /* vdev_open() does the actual probe */ + } + + for (int c = 0; c < vd->vdev_children; c++) + spa_async_probe(spa, vd->vdev_child[c]); +} + +static void +spa_async_autoexpand(spa_t *spa, vdev_t *vd) +{ + sysevent_id_t eid; + nvlist_t *attr; + char *physpath; + + if (!spa->spa_autoexpand) + return; + + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + spa_async_autoexpand(spa, cvd); + } + + if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) + return; + + physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP); + (void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath); + + VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0); + + (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS, + ESC_ZFS_VDEV_AUTOEXPAND, attr, &eid, DDI_SLEEP); + + nvlist_free(attr); + kmem_free(physpath, MAXPATHLEN); +} + +static void +spa_async_thread(void *arg) +{ + spa_t *spa = (spa_t *)arg; + int tasks; + + ASSERT(spa->spa_sync_on); + + mutex_enter(&spa->spa_async_lock); + tasks = spa->spa_async_tasks; + spa->spa_async_tasks &= SPA_ASYNC_REMOVE; + mutex_exit(&spa->spa_async_lock); + + /* + * See if the config needs to be updated. + */ + if (tasks & SPA_ASYNC_CONFIG_UPDATE) { + uint64_t old_space, new_space; + + mutex_enter(&spa_namespace_lock); + old_space = metaslab_class_get_space(spa_normal_class(spa)); + spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); + new_space = metaslab_class_get_space(spa_normal_class(spa)); + mutex_exit(&spa_namespace_lock); + + /* + * If the pool grew as a result of the config update, + * then log an internal history event. + */ + if (new_space != old_space) { + spa_history_log_internal(spa, "vdev online", NULL, + "pool '%s' size: %llu(+%llu)", + spa_name(spa), new_space, new_space - old_space); + } + } + + if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + spa_async_autoexpand(spa, spa->spa_root_vdev); + spa_config_exit(spa, SCL_CONFIG, FTAG); + } + + /* + * See if any devices need to be probed. 
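+ * (Probe requests are raised elsewhere via spa_async_request(spa,
+ * SPA_ASYNC_PROBE); servicing one just reopens the flagged vdevs,
+ * and vdev_open() performs the actual probe I/O, as noted in
+ * spa_async_probe() above.)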
+ */ + if (tasks & SPA_ASYNC_PROBE) { + spa_vdev_state_enter(spa, SCL_NONE); + spa_async_probe(spa, spa->spa_root_vdev); + (void) spa_vdev_state_exit(spa, NULL, 0); + } + + /* + * If any devices are done replacing, detach them. + */ + if (tasks & SPA_ASYNC_RESILVER_DONE) + spa_vdev_resilver_done(spa); + + /* + * Kick off a resilver. + */ + if (tasks & SPA_ASYNC_RESILVER) + dsl_resilver_restart(spa->spa_dsl_pool, 0); + + if (tasks & SPA_ASYNC_INITIALIZE_RESTART) { + mutex_enter(&spa_namespace_lock); + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + vdev_initialize_restart(spa->spa_root_vdev); + spa_config_exit(spa, SCL_CONFIG, FTAG); + mutex_exit(&spa_namespace_lock); + } + + /* + * Let the world know that we're done. + */ + mutex_enter(&spa->spa_async_lock); + spa->spa_async_thread = NULL; + cv_broadcast(&spa->spa_async_cv); + mutex_exit(&spa->spa_async_lock); + thread_exit(); +} + +static void +spa_async_thread_vd(void *arg) +{ + spa_t *spa = arg; + int tasks; + + mutex_enter(&spa->spa_async_lock); + tasks = spa->spa_async_tasks; +retry: + spa->spa_async_tasks &= ~SPA_ASYNC_REMOVE; + mutex_exit(&spa->spa_async_lock); + + /* + * See if any devices need to be marked REMOVED. + */ + if (tasks & SPA_ASYNC_REMOVE) { + spa_vdev_state_enter(spa, SCL_NONE); + spa_async_remove(spa, spa->spa_root_vdev); + for (int i = 0; i < spa->spa_l2cache.sav_count; i++) + spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); + for (int i = 0; i < spa->spa_spares.sav_count; i++) + spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); + (void) spa_vdev_state_exit(spa, NULL, 0); + } + + /* + * Let the world know that we're done. + */ + mutex_enter(&spa->spa_async_lock); + tasks = spa->spa_async_tasks; + if ((tasks & SPA_ASYNC_REMOVE) != 0) + goto retry; + spa->spa_async_thread_vd = NULL; + cv_broadcast(&spa->spa_async_cv); + mutex_exit(&spa->spa_async_lock); + thread_exit(); +} + +void +spa_async_suspend(spa_t *spa) +{ + mutex_enter(&spa->spa_async_lock); + spa->spa_async_suspended++; + while (spa->spa_async_thread != NULL || + spa->spa_async_thread_vd != NULL) + cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); + mutex_exit(&spa->spa_async_lock); + + spa_vdev_remove_suspend(spa); + + zthr_t *condense_thread = spa->spa_condense_zthr; + if (condense_thread != NULL && zthr_isrunning(condense_thread)) + VERIFY0(zthr_cancel(condense_thread)); + + zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; + if (discard_thread != NULL && zthr_isrunning(discard_thread)) + VERIFY0(zthr_cancel(discard_thread)); +} + +void +spa_async_resume(spa_t *spa) +{ + mutex_enter(&spa->spa_async_lock); + ASSERT(spa->spa_async_suspended != 0); + spa->spa_async_suspended--; + mutex_exit(&spa->spa_async_lock); + spa_restart_removal(spa); + + zthr_t *condense_thread = spa->spa_condense_zthr; + if (condense_thread != NULL && !zthr_isrunning(condense_thread)) + zthr_resume(condense_thread); + + zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; + if (discard_thread != NULL && !zthr_isrunning(discard_thread)) + zthr_resume(discard_thread); +} + +static boolean_t +spa_async_tasks_pending(spa_t *spa) +{ + uint_t non_config_tasks; + uint_t config_task; + boolean_t config_task_suspended; + + non_config_tasks = spa->spa_async_tasks & ~(SPA_ASYNC_CONFIG_UPDATE | + SPA_ASYNC_REMOVE); + config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE; + if (spa->spa_ccw_fail_time == 0) { + config_task_suspended = B_FALSE; + } else { + config_task_suspended = + (gethrtime() - spa->spa_ccw_fail_time) < + (zfs_ccw_retry_interval * NANOSEC); + 
} + + return (non_config_tasks || (config_task && !config_task_suspended)); +} + +static void +spa_async_dispatch(spa_t *spa) +{ + mutex_enter(&spa->spa_async_lock); + if (spa_async_tasks_pending(spa) && + !spa->spa_async_suspended && + spa->spa_async_thread == NULL && + rootdir != NULL) + spa->spa_async_thread = thread_create(NULL, 0, + spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); + mutex_exit(&spa->spa_async_lock); +} + +static void +spa_async_dispatch_vd(spa_t *spa) +{ + mutex_enter(&spa->spa_async_lock); + if ((spa->spa_async_tasks & SPA_ASYNC_REMOVE) != 0 && + !spa->spa_async_suspended && + spa->spa_async_thread_vd == NULL && + rootdir != NULL) + spa->spa_async_thread_vd = thread_create(NULL, 0, + spa_async_thread_vd, spa, 0, &p0, TS_RUN, maxclsyspri); + mutex_exit(&spa->spa_async_lock); +} + +void +spa_async_request(spa_t *spa, int task) +{ + zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task); + mutex_enter(&spa->spa_async_lock); + spa->spa_async_tasks |= task; + mutex_exit(&spa->spa_async_lock); + spa_async_dispatch_vd(spa); +} + +/* + * ========================================================================== + * SPA syncing routines + * ========================================================================== + */ + +static int +bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +{ + bpobj_t *bpo = arg; + bpobj_enqueue(bpo, bp, tx); + return (0); +} + +static int +spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +{ + zio_t *zio = arg; + + zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp, + BP_GET_PSIZE(bp), zio->io_flags)); + return (0); +} + +/* + * Note: this simple function is not inlined to make it easier to dtrace the + * amount of time spent syncing frees. + */ +static void +spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx) +{ + zio_t *zio = zio_root(spa, NULL, NULL, 0); + bplist_iterate(bpl, spa_free_sync_cb, zio, tx); + VERIFY(zio_wait(zio) == 0); +} + +/* + * Note: this simple function is not inlined to make it easier to dtrace the + * amount of time spent syncing deferred frees. + */ +static void +spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx) +{ + zio_t *zio = zio_root(spa, NULL, NULL, 0); + VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj, + spa_free_sync_cb, zio, tx), ==, 0); + VERIFY0(zio_wait(zio)); +} + + +static void +spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) +{ + char *packed = NULL; + size_t bufsize; + size_t nvsize = 0; + dmu_buf_t *db; + + VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); + + /* + * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration + * information. This avoids the dmu_buf_will_dirty() path and + * saves us a pre-read to get data we don't actually care about. + */ + bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE); + packed = kmem_alloc(bufsize, KM_SLEEP); + + VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, + KM_SLEEP) == 0); + bzero(packed + nvsize, bufsize - nvsize); + + dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); + + kmem_free(packed, bufsize); + + VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); + dmu_buf_will_dirty(db, tx); + *(uint64_t *)db->db_data = nvsize; + dmu_buf_rele(db, FTAG); +} + +static void +spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, + const char *config, const char *entry) +{ + nvlist_t *nvroot; + nvlist_t **list; + int i; + + if (!sav->sav_sync) + return; + + /* + * Update the MOS nvlist describing the list of available devices. 
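+ *
+ * (Editorial aside, hedged: the list lands in a DMU_OT_PACKED_NVLIST
+ * object written through spa_sync_nvlist() above, which pads the packed
+ * buffer up to a multiple of SPA_CONFIG_BLOCKSIZE; e.g., with the usual
+ * 16K block size, a ~3000-byte nvlist goes out as one 16384-byte block
+ * with the tail zeroed, avoiding the dmu_buf_will_dirty() pre-read
+ * noted there.)
+ *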
+ * spa_validate_aux() will have already made sure this nvlist is + * valid and the vdevs are labeled appropriately. + */ + if (sav->sav_object == 0) { + sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, + DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, + sizeof (uint64_t), tx); + VERIFY(zap_update(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, + &sav->sav_object, tx) == 0); + } + + VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); + if (sav->sav_count == 0) { + VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0); + } else { + list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); + for (i = 0; i < sav->sav_count; i++) + list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], + B_FALSE, VDEV_CONFIG_L2CACHE); + VERIFY(nvlist_add_nvlist_array(nvroot, config, list, + sav->sav_count) == 0); + for (i = 0; i < sav->sav_count; i++) + nvlist_free(list[i]); + kmem_free(list, sav->sav_count * sizeof (void *)); + } + + spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); + nvlist_free(nvroot); + + sav->sav_sync = B_FALSE; +} + +/* + * Rebuild spa's all-vdev ZAP from the vdev ZAPs indicated in each vdev_t. + * The all-vdev ZAP must be empty. + */ +static void +spa_avz_build(vdev_t *vd, uint64_t avz, dmu_tx_t *tx) +{ + spa_t *spa = vd->vdev_spa; + if (vd->vdev_top_zap != 0) { + VERIFY0(zap_add_int(spa->spa_meta_objset, avz, + vd->vdev_top_zap, tx)); + } + if (vd->vdev_leaf_zap != 0) { + VERIFY0(zap_add_int(spa->spa_meta_objset, avz, + vd->vdev_leaf_zap, tx)); + } + for (uint64_t i = 0; i < vd->vdev_children; i++) { + spa_avz_build(vd->vdev_child[i], avz, tx); + } +} + +static void +spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) +{ + nvlist_t *config; + + /* + * If the pool is being imported from a pre-per-vdev-ZAP version of ZFS, + * its config may not be dirty but we still need to build per-vdev ZAPs. + * Similarly, if the pool is being assembled (e.g. after a split), we + * need to rebuild the AVZ although the config may not be dirty. 
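+ *
+ * (Editorial summary, hedged: spa_avz_action selects among the paths
+ * below. AVZ_ACTION_REBUILD builds a fresh ZAP via spa_avz_build() and
+ * diffs it against the old all-vdev ZAP; AVZ_ACTION_DESTROY empties and
+ * unlinks the old one; and AVZ_ACTION_INITIALIZE, like any pool whose
+ * spa_all_vdev_zaps is still 0, falls through to the zap_create_link()
+ * call further down.)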
+ */ + if (list_is_empty(&spa->spa_config_dirty_list) && + spa->spa_avz_action == AVZ_ACTION_NONE) + return; + + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); + + ASSERT(spa->spa_avz_action == AVZ_ACTION_NONE || + spa->spa_avz_action == AVZ_ACTION_INITIALIZE || + spa->spa_all_vdev_zaps != 0); + + if (spa->spa_avz_action == AVZ_ACTION_REBUILD) { + /* Make and build the new AVZ */ + uint64_t new_avz = zap_create(spa->spa_meta_objset, + DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx); + spa_avz_build(spa->spa_root_vdev, new_avz, tx); + + /* Diff old AVZ with new one */ + zap_cursor_t zc; + zap_attribute_t za; + + for (zap_cursor_init(&zc, spa->spa_meta_objset, + spa->spa_all_vdev_zaps); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + uint64_t vdzap = za.za_first_integer; + if (zap_lookup_int(spa->spa_meta_objset, new_avz, + vdzap) == ENOENT) { + /* + * ZAP is listed in old AVZ but not in new one; + * destroy it + */ + VERIFY0(zap_destroy(spa->spa_meta_objset, vdzap, + tx)); + } + } + + zap_cursor_fini(&zc); + + /* Destroy the old AVZ */ + VERIFY0(zap_destroy(spa->spa_meta_objset, + spa->spa_all_vdev_zaps, tx)); + + /* Replace the old AVZ in the dir obj with the new one */ + VERIFY0(zap_update(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, + sizeof (new_avz), 1, &new_avz, tx)); + + spa->spa_all_vdev_zaps = new_avz; + } else if (spa->spa_avz_action == AVZ_ACTION_DESTROY) { + zap_cursor_t zc; + zap_attribute_t za; + + /* Walk through the AVZ and destroy all listed ZAPs */ + for (zap_cursor_init(&zc, spa->spa_meta_objset, + spa->spa_all_vdev_zaps); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + uint64_t zap = za.za_first_integer; + VERIFY0(zap_destroy(spa->spa_meta_objset, zap, tx)); + } + + zap_cursor_fini(&zc); + + /* Destroy and unlink the AVZ itself */ + VERIFY0(zap_destroy(spa->spa_meta_objset, + spa->spa_all_vdev_zaps, tx)); + VERIFY0(zap_remove(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, tx)); + spa->spa_all_vdev_zaps = 0; + } + + if (spa->spa_all_vdev_zaps == 0) { + spa->spa_all_vdev_zaps = zap_create_link(spa->spa_meta_objset, + DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_VDEV_ZAP_MAP, tx); + } + spa->spa_avz_action = AVZ_ACTION_NONE; + + /* Create ZAPs for vdevs that don't have them. */ + vdev_construct_zaps(spa->spa_root_vdev, tx); + + config = spa_config_generate(spa, spa->spa_root_vdev, + dmu_tx_get_txg(tx), B_FALSE); + + /* + * If we're upgrading the spa version then make sure that + * the config object gets updated with the correct version. + */ + if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version) + fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, + spa->spa_uberblock.ub_version); + + spa_config_exit(spa, SCL_STATE, FTAG); + + nvlist_free(spa->spa_config_syncing); + spa->spa_config_syncing = config; + + spa_sync_nvlist(spa, spa->spa_config_object, config, tx); +} + +static void +spa_sync_version(void *arg, dmu_tx_t *tx) +{ + uint64_t *versionp = arg; + uint64_t version = *versionp; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + + /* + * Setting the version is special cased when first creating the pool. + */ + ASSERT(tx->tx_txg != TXG_INITIAL); + + ASSERT(SPA_VERSION_IS_SUPPORTED(version)); + ASSERT(version >= spa_version(spa)); + + spa->spa_uberblock.ub_version = version; + vdev_config_dirty(spa->spa_root_vdev); + spa_history_log_internal(spa, "set", tx, "version=%lld", version); +} + +/* + * Set zpool properties. 
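+ *
+ * (Editorial usage sketch, hedged and partly hypothetical: the nvlist
+ * handed to this synctask is the one vetted by spa_prop_validate(), so
+ * a caller setting two properties might build it as
+ *
+ *	nvlist_t *props = fnvlist_alloc();
+ *	fnvlist_add_uint64(props, "autoexpand", 1);
+ *	fnvlist_add_string(props, "comment", "rack 12, shelf 3");
+ *	error = spa_prop_set(spa, props);
+ *
+ * and both pairs would be walked by the loop below; the property values
+ * here are invented for illustration.)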
+ */
+static void
+spa_sync_props(void *arg, dmu_tx_t *tx)
+{
+ nvlist_t *nvp = arg;
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ objset_t *mos = spa->spa_meta_objset;
+ nvpair_t *elem = NULL;
+
+ mutex_enter(&spa->spa_props_lock);
+
+ while ((elem = nvlist_next_nvpair(nvp, elem))) {
+ uint64_t intval;
+ char *strval, *fname;
+ zpool_prop_t prop;
+ const char *propname;
+ zprop_type_t proptype;
+ spa_feature_t fid;
+
+ switch (prop = zpool_name_to_prop(nvpair_name(elem))) {
+ case ZPOOL_PROP_INVAL:
+ /*
+ * We checked this earlier in spa_prop_validate().
+ */
+ ASSERT(zpool_prop_feature(nvpair_name(elem)));
+
+ fname = strchr(nvpair_name(elem), '@') + 1;
+ VERIFY0(zfeature_lookup_name(fname, &fid));
+
+ spa_feature_enable(spa, fid, tx);
+ spa_history_log_internal(spa, "set", tx,
+ "%s=enabled", nvpair_name(elem));
+ break;
+
+ case ZPOOL_PROP_VERSION:
+ intval = fnvpair_value_uint64(elem);
+ /*
+ * The version is synced separately before other
+ * properties and should be correct by now.
+ */
+ ASSERT3U(spa_version(spa), >=, intval);
+ break;
+
+ case ZPOOL_PROP_ALTROOT:
+ /*
+ * 'altroot' is a non-persistent property. It should
+ * have been set temporarily at creation or import time.
+ */
+ ASSERT(spa->spa_root != NULL);
+ break;
+
+ case ZPOOL_PROP_READONLY:
+ case ZPOOL_PROP_CACHEFILE:
+ /*
+ * 'readonly' and 'cachefile' are also non-persistent
+ * properties.
+ */
+ break;
+ case ZPOOL_PROP_COMMENT:
+ strval = fnvpair_value_string(elem);
+ if (spa->spa_comment != NULL)
+ spa_strfree(spa->spa_comment);
+ spa->spa_comment = spa_strdup(strval);
+ /*
+ * We need to dirty the configuration on all the vdevs
+ * so that their labels get updated. It's unnecessary
+ * to do this for pool creation since the vdev's
+ * configuration has already been dirtied.
+ */
+ if (tx->tx_txg != TXG_INITIAL)
+ vdev_config_dirty(spa->spa_root_vdev);
+ spa_history_log_internal(spa, "set", tx,
+ "%s=%s", nvpair_name(elem), strval);
+ break;
+ default:
+ /*
+ * Set pool property values in the poolprops mos object.
+ */
+ if (spa->spa_pool_props_object == 0) {
+ spa->spa_pool_props_object =
+ zap_create_link(mos, DMU_OT_POOL_PROPS,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
+ tx);
+ }
+
+ /* normalize the property name */
+ propname = zpool_prop_to_name(prop);
+ proptype = zpool_prop_get_type(prop);
+
+ if (nvpair_type(elem) == DATA_TYPE_STRING) {
+ ASSERT(proptype == PROP_TYPE_STRING);
+ strval = fnvpair_value_string(elem);
+ VERIFY0(zap_update(mos,
+ spa->spa_pool_props_object, propname,
+ 1, strlen(strval) + 1, strval, tx));
+ spa_history_log_internal(spa, "set", tx,
+ "%s=%s", nvpair_name(elem), strval);
+ } else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
+ intval = fnvpair_value_uint64(elem);
+
+ if (proptype == PROP_TYPE_INDEX) {
+ const char *unused;
+ VERIFY0(zpool_prop_index_to_string(
+ prop, intval, &unused));
+ }
+ VERIFY0(zap_update(mos,
+ spa->spa_pool_props_object, propname,
+ 8, 1, &intval, tx));
+ spa_history_log_internal(spa, "set", tx,
+ "%s=%lld", nvpair_name(elem), intval);
+ } else {
+ ASSERT(0); /* not allowed */
+ }
+
+ switch (prop) {
+ case ZPOOL_PROP_DELEGATION:
+ spa->spa_delegation = intval;
+ break;
+ case ZPOOL_PROP_BOOTFS:
+ spa->spa_bootfs = intval;
+ break;
+ case ZPOOL_PROP_FAILUREMODE:
+ spa->spa_failmode = intval;
+ break;
+ case ZPOOL_PROP_AUTOEXPAND:
+ spa->spa_autoexpand = intval;
+ if (tx->tx_txg != TXG_INITIAL)
+ spa_async_request(spa,
+ SPA_ASYNC_AUTOEXPAND);
+ break;
+ case ZPOOL_PROP_DEDUPDITTO:
+ spa->spa_dedup_ditto = intval;
+ break;
+ default:
+ break;
+ }
+ }
+
+ }
+
+ mutex_exit(&spa->spa_props_lock);
+}
+
+/*
+ * Perform one-time upgrade on-disk changes. spa_version() does not
+ * reflect the new version this txg, so there must be no changes this
+ * txg to anything that the upgrade code depends on after it executes.
+ * Therefore this must be called after dsl_pool_sync() does the sync
+ * tasks.
+ */
+static void
+spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = spa->spa_dsl_pool;
+
+ ASSERT(spa->spa_sync_pass == 1);
+
+ rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
+
+ if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
+ spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
+ dsl_pool_create_origin(dp, tx);
+
+ /* Keeping the origin open increases spa_minref */
+ spa->spa_minref += 3;
+ }
+
+ if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
+ spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
+ dsl_pool_upgrade_clones(dp, tx);
+ }
+
+ if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES &&
+ spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) {
+ dsl_pool_upgrade_dir_clones(dp, tx);
+
+ /* Keeping the freedir open increases spa_minref */
+ spa->spa_minref += 3;
+ }
+
+ if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES &&
+ spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
+ spa_feature_create_zap_objects(spa, tx);
+ }
+
+ /*
+ * The LZ4_COMPRESS feature's behaviour was changed to
+ * activate_on_enable when the possibility to use lz4 compression for
+ * metadata was added. Old pools that have this feature enabled must
+ * be upgraded to have this feature active.
+ */
+ if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
+ boolean_t lz4_en = spa_feature_is_enabled(spa,
+ SPA_FEATURE_LZ4_COMPRESS);
+ boolean_t lz4_ac = spa_feature_is_active(spa,
+ SPA_FEATURE_LZ4_COMPRESS);
+
+ if (lz4_en && !lz4_ac)
+ spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx);
+ }
+
+ /*
+ * If we haven't written the salt, do so now.
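+ *
+ * (Editorial context, hedged: the salt in question is
+ * spa_cksum_salt.zcs_bytes, used to seed the salted checksum
+ * algorithms; writing it once under DMU_POOL_CHECKSUM_SALT keeps it
+ * stable across imports.)
+ *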
Note that the + * feature may not be activated yet, but that's fine since + * the presence of this ZAP entry is backwards compatible. + */ + if (zap_contains(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_CHECKSUM_SALT) == ENOENT) { + VERIFY0(zap_add(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1, + sizeof (spa->spa_cksum_salt.zcs_bytes), + spa->spa_cksum_salt.zcs_bytes, tx)); + } + + rrw_exit(&dp->dp_config_rwlock, FTAG); +} + +static void +vdev_indirect_state_sync_verify(vdev_t *vd) +{ + vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; + vdev_indirect_births_t *vib = vd->vdev_indirect_births; + + if (vd->vdev_ops == &vdev_indirect_ops) { + ASSERT(vim != NULL); + ASSERT(vib != NULL); + } + + if (vdev_obsolete_sm_object(vd) != 0) { + ASSERT(vd->vdev_obsolete_sm != NULL); + ASSERT(vd->vdev_removing || + vd->vdev_ops == &vdev_indirect_ops); + ASSERT(vdev_indirect_mapping_num_entries(vim) > 0); + ASSERT(vdev_indirect_mapping_bytes_mapped(vim) > 0); + + ASSERT3U(vdev_obsolete_sm_object(vd), ==, + space_map_object(vd->vdev_obsolete_sm)); + ASSERT3U(vdev_indirect_mapping_bytes_mapped(vim), >=, + space_map_allocated(vd->vdev_obsolete_sm)); + } + ASSERT(vd->vdev_obsolete_segments != NULL); + + /* + * Since frees / remaps to an indirect vdev can only + * happen in syncing context, the obsolete segments + * tree must be empty when we start syncing. + */ + ASSERT0(range_tree_space(vd->vdev_obsolete_segments)); +} + +/* + * Sync the specified transaction group. New blocks may be dirtied as + * part of the process, so we iterate until it converges. + */ +void +spa_sync(spa_t *spa, uint64_t txg) +{ + dsl_pool_t *dp = spa->spa_dsl_pool; + objset_t *mos = spa->spa_meta_objset; + bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK]; + vdev_t *rvd = spa->spa_root_vdev; + vdev_t *vd; + dmu_tx_t *tx; + int error; + uint32_t max_queue_depth = zfs_vdev_async_write_max_active * + zfs_vdev_queue_depth_pct / 100; + + VERIFY(spa_writeable(spa)); + + /* + * Wait for i/os issued in open context that need to complete + * before this txg syncs. + */ + (void) zio_wait(spa->spa_txg_zio[txg & TXG_MASK]); + spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL, + ZIO_FLAG_CANFAIL); + + /* + * Lock out configuration changes. + */ + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + + spa->spa_syncing_txg = txg; + spa->spa_sync_pass = 0; + + for (int i = 0; i < spa->spa_alloc_count; i++) { + mutex_enter(&spa->spa_alloc_locks[i]); + VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i])); + mutex_exit(&spa->spa_alloc_locks[i]); + } + + /* + * If there are any pending vdev state changes, convert them + * into config changes that go out with this transaction group. + */ + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); + while (list_head(&spa->spa_state_dirty_list) != NULL) { + /* + * We need the write lock here because, for aux vdevs, + * calling vdev_config_dirty() modifies sav_config. + * This is ugly and will become unnecessary when we + * eliminate the aux vdev wart by integrating all vdevs + * into the root vdev tree. 
+ */ + spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); + spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER); + while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { + vdev_state_clean(vd); + vdev_config_dirty(vd); + } + spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); + spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); + } + spa_config_exit(spa, SCL_STATE, FTAG); + + tx = dmu_tx_create_assigned(dp, txg); + + spa->spa_sync_starttime = gethrtime(); +#ifdef illumos + VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, + spa->spa_sync_starttime + spa->spa_deadman_synctime)); +#else /* !illumos */ +#ifdef _KERNEL + callout_schedule(&spa->spa_deadman_cycid, + hz * spa->spa_deadman_synctime / NANOSEC); +#endif +#endif /* illumos */ + + /* + * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, + * set spa_deflate if we have no raid-z vdevs. + */ + if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && + spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { + int i; + + for (i = 0; i < rvd->vdev_children; i++) { + vd = rvd->vdev_child[i]; + if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) + break; + } + if (i == rvd->vdev_children) { + spa->spa_deflate = TRUE; + VERIFY(0 == zap_add(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, + sizeof (uint64_t), 1, &spa->spa_deflate, tx)); + } + } + + /* + * Set the top-level vdev's max queue depth. Evaluate each + * top-level's async write queue depth in case it changed. + * The max queue depth will not change in the middle of syncing + * out this txg. + */ + uint64_t slots_per_allocator = 0; + for (int c = 0; c < rvd->vdev_children; c++) { + vdev_t *tvd = rvd->vdev_child[c]; + metaslab_group_t *mg = tvd->vdev_mg; + + if (mg == NULL || mg->mg_class != spa_normal_class(spa) || + !metaslab_group_initialized(mg)) + continue; + + /* + * It is safe to do a lock-free check here because only async + * allocations look at mg_max_alloc_queue_depth, and async + * allocations all happen from spa_sync(). + */ + for (int i = 0; i < spa->spa_alloc_count; i++) + ASSERT0(refcount_count(&(mg->mg_alloc_queue_depth[i]))); + mg->mg_max_alloc_queue_depth = max_queue_depth; + + for (int i = 0; i < spa->spa_alloc_count; i++) { + mg->mg_cur_max_alloc_queue_depth[i] = + zfs_vdev_def_queue_depth; + } + slots_per_allocator += zfs_vdev_def_queue_depth; + } + metaslab_class_t *mc = spa_normal_class(spa); + for (int i = 0; i < spa->spa_alloc_count; i++) { + ASSERT0(refcount_count(&mc->mc_alloc_slots[i])); + mc->mc_alloc_max_slots[i] = slots_per_allocator; + } + mc->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; + + for (int c = 0; c < rvd->vdev_children; c++) { + vdev_t *vd = rvd->vdev_child[c]; + vdev_indirect_state_sync_verify(vd); + + if (vdev_indirect_should_condense(vd)) { + spa_condense_indirect_start_sync(vd, tx); + break; + } + } + + /* + * Iterate to convergence. + */ + do { + int pass = ++spa->spa_sync_pass; + + spa_sync_config_object(spa, tx); + spa_sync_aux_dev(spa, &spa->spa_spares, tx, + ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES); + spa_sync_aux_dev(spa, &spa->spa_l2cache, tx, + ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE); + spa_errlog_sync(spa, txg); + dsl_pool_sync(dp, txg); + + if (pass < zfs_sync_pass_deferred_free) { + spa_sync_frees(spa, free_bpl, tx); + } else { + /* + * We can not defer frees in pass 1, because + * we sync the deferred frees later in pass 1. 
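+ *
+ * (Editorial clarification, hedged: zfs_sync_pass_deferred_free
+ * defaults to 2, so pass 1 issues its frees directly through
+ * spa_sync_frees() while every later pass enqueues them onto
+ * spa_deferred_bpobj, to be drained by spa_sync_deferred_frees()
+ * during pass 1 of a later txg.)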
+ */ + ASSERT3U(pass, >, 1); + bplist_iterate(free_bpl, bpobj_enqueue_cb, + &spa->spa_deferred_bpobj, tx); + } + + ddt_sync(spa, txg); + dsl_scan_sync(dp, tx); + + if (spa->spa_vdev_removal != NULL) + svr_sync(spa, tx); + + while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) + != NULL) + vdev_sync(vd, txg); + + if (pass == 1) { + spa_sync_upgrades(spa, tx); + ASSERT3U(txg, >=, + spa->spa_uberblock.ub_rootbp.blk_birth); + /* + * Note: We need to check if the MOS is dirty + * because we could have marked the MOS dirty + * without updating the uberblock (e.g. if we + * have sync tasks but no dirty user data). We + * need to check the uberblock's rootbp because + * it is updated if we have synced out dirty + * data (though in this case the MOS will most + * likely also be dirty due to second order + * effects, we don't want to rely on that here). + */ + if (spa->spa_uberblock.ub_rootbp.blk_birth < txg && + !dmu_objset_is_dirty(mos, txg)) { + /* + * Nothing changed on the first pass, + * therefore this TXG is a no-op. Avoid + * syncing deferred frees, so that we + * can keep this TXG as a no-op. + */ + ASSERT(txg_list_empty(&dp->dp_dirty_datasets, + txg)); + ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); + ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg)); + ASSERT(txg_list_empty(&dp->dp_early_sync_tasks, + txg)); + break; + } + spa_sync_deferred_frees(spa, tx); + } + + } while (dmu_objset_is_dirty(mos, txg)); + + if (!list_is_empty(&spa->spa_config_dirty_list)) { + /* + * Make sure that the number of ZAPs for all the vdevs matches + * the number of ZAPs in the per-vdev ZAP list. This only gets + * called if the config is dirty; otherwise there may be + * outstanding AVZ operations that weren't completed in + * spa_sync_config_object. + */ + uint64_t all_vdev_zap_entry_count; + ASSERT0(zap_count(spa->spa_meta_objset, + spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count)); + ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==, + all_vdev_zap_entry_count); + } + + if (spa->spa_vdev_removal != NULL) { + ASSERT0(spa->spa_vdev_removal->svr_bytes_done[txg & TXG_MASK]); + } + + /* + * Rewrite the vdev configuration (which includes the uberblock) + * to commit the transaction group. + * + * If there are no dirty vdevs, we sync the uberblock to a few + * random top-level vdevs that are known to be visible in the + * config cache (see spa_vdev_add() for a complete description). + * If there *are* dirty vdevs, sync the uberblock to all vdevs. + */ + for (;;) { + /* + * We hold SCL_STATE to prevent vdev open/close/etc. + * while we're attempting to write the vdev labels. 
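+ *
+ * (Editorial note, hedged: in the nothing-dirty case below, at most
+ * SPA_SYNC_MIN_VDEVS concrete, non-log top-level vdevs are picked,
+ * starting from the random index c0; if vdev_config_sync() fails, the
+ * pool's I/O is suspended via zio_suspend(), we wait for it to be
+ * resumed, and the whole loop is retried.)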
+ */ + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); + + if (list_is_empty(&spa->spa_config_dirty_list)) { + vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL }; + int svdcount = 0; + int children = rvd->vdev_children; + int c0 = spa_get_random(children); + + for (int c = 0; c < children; c++) { + vd = rvd->vdev_child[(c0 + c) % children]; + + /* Stop when revisiting the first vdev */ + if (c > 0 && svd[0] == vd) + break; + + if (vd->vdev_ms_array == 0 || vd->vdev_islog || + !vdev_is_concrete(vd)) + continue; + + svd[svdcount++] = vd; + if (svdcount == SPA_SYNC_MIN_VDEVS) + break; + } + error = vdev_config_sync(svd, svdcount, txg); + } else { + error = vdev_config_sync(rvd->vdev_child, + rvd->vdev_children, txg); + } + + if (error == 0) + spa->spa_last_synced_guid = rvd->vdev_guid; + + spa_config_exit(spa, SCL_STATE, FTAG); + + if (error == 0) + break; + zio_suspend(spa, NULL); + zio_resume_wait(spa); + } + dmu_tx_commit(tx); + +#ifdef illumos + VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY)); +#else /* !illumos */ +#ifdef _KERNEL + callout_drain(&spa->spa_deadman_cycid); +#endif +#endif /* illumos */ + + /* + * Clear the dirty config list. + */ + while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL) + vdev_config_clean(vd); + + /* + * Now that the new config has synced transactionally, + * let it become visible to the config cache. + */ + if (spa->spa_config_syncing != NULL) { + spa_config_set(spa, spa->spa_config_syncing); + spa->spa_config_txg = txg; + spa->spa_config_syncing = NULL; + } + + dsl_pool_sync_done(dp, txg); + + for (int i = 0; i < spa->spa_alloc_count; i++) { + mutex_enter(&spa->spa_alloc_locks[i]); + VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i])); + mutex_exit(&spa->spa_alloc_locks[i]); + } + + /* + * Update usable space statistics. + */ + while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) + != NULL) + vdev_sync_done(vd, txg); + + spa_update_dspace(spa); + + /* + * It had better be the case that we didn't dirty anything + * since vdev_config_sync(). + */ + ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); + ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); + ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); + + while (zfs_pause_spa_sync) + delay(1); + + spa->spa_sync_pass = 0; + + /* + * Update the last synced uberblock here. We want to do this at + * the end of spa_sync() so that consumers of spa_last_synced_txg() + * will be guaranteed that all the processing associated with + * that txg has been completed. + */ + spa->spa_ubsync = spa->spa_uberblock; + spa_config_exit(spa, SCL_CONFIG, FTAG); + + spa_handle_ignored_writes(spa); + + /* + * If any async tasks have been requested, kick them off. + */ + spa_async_dispatch(spa); + spa_async_dispatch_vd(spa); +} + +/* + * Sync all pools. We don't want to hold the namespace lock across these + * operations, so we take a reference on the spa_t and drop the lock during the + * sync. 
+ */ +void +spa_sync_allpools(void) +{ + spa_t *spa = NULL; + mutex_enter(&spa_namespace_lock); + while ((spa = spa_next(spa)) != NULL) { + if (spa_state(spa) != POOL_STATE_ACTIVE || + !spa_writeable(spa) || spa_suspended(spa)) + continue; + spa_open_ref(spa, FTAG); + mutex_exit(&spa_namespace_lock); + txg_wait_synced(spa_get_dsl(spa), 0); + mutex_enter(&spa_namespace_lock); + spa_close(spa, FTAG); + } + mutex_exit(&spa_namespace_lock); +} + +/* + * ========================================================================== + * Miscellaneous routines + * ========================================================================== + */ + +/* + * Remove all pools in the system. + */ +void +spa_evict_all(void) +{ + spa_t *spa; + + /* + * Remove all cached state. All pools should be closed now, + * so every spa in the AVL tree should be unreferenced. + */ + mutex_enter(&spa_namespace_lock); + while ((spa = spa_next(NULL)) != NULL) { + /* + * Stop async tasks. The async thread may need to detach + * a device that's been replaced, which requires grabbing + * spa_namespace_lock, so we must drop it here. + */ + spa_open_ref(spa, FTAG); + mutex_exit(&spa_namespace_lock); + spa_async_suspend(spa); + mutex_enter(&spa_namespace_lock); + spa_close(spa, FTAG); + + if (spa->spa_state != POOL_STATE_UNINITIALIZED) { + spa_unload(spa); + spa_deactivate(spa); + } + spa_remove(spa); + } + mutex_exit(&spa_namespace_lock); +} + +vdev_t * +spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux) +{ + vdev_t *vd; + int i; + + if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL) + return (vd); + + if (aux) { + for (i = 0; i < spa->spa_l2cache.sav_count; i++) { + vd = spa->spa_l2cache.sav_vdevs[i]; + if (vd->vdev_guid == guid) + return (vd); + } + + for (i = 0; i < spa->spa_spares.sav_count; i++) { + vd = spa->spa_spares.sav_vdevs[i]; + if (vd->vdev_guid == guid) + return (vd); + } + } + + return (NULL); +} + +void +spa_upgrade(spa_t *spa, uint64_t version) +{ + ASSERT(spa_writeable(spa)); + + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + + /* + * This should only be called for a non-faulted pool, and since a + * future version would result in an unopenable pool, this shouldn't be + * possible. + */ + ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version)); + ASSERT3U(version, >=, spa->spa_uberblock.ub_version); + + spa->spa_uberblock.ub_version = version; + vdev_config_dirty(spa->spa_root_vdev); + + spa_config_exit(spa, SCL_ALL, FTAG); + + txg_wait_synced(spa_get_dsl(spa), 0); +} + +boolean_t +spa_has_spare(spa_t *spa, uint64_t guid) +{ + int i; + uint64_t spareguid; + spa_aux_vdev_t *sav = &spa->spa_spares; + + for (i = 0; i < sav->sav_count; i++) + if (sav->sav_vdevs[i]->vdev_guid == guid) + return (B_TRUE); + + for (i = 0; i < sav->sav_npending; i++) { + if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID, + &spareguid) == 0 && spareguid == guid) + return (B_TRUE); + } + + return (B_FALSE); +} + +/* + * Check if a pool has an active shared spare device. 
+ * Note: the reference count of an active spare is 2, as a spare and as a
+ * replacement.
+ */
+static boolean_t
+spa_has_active_shared_spare(spa_t *spa)
+{
+ int i, refcnt;
+ uint64_t pool;
+ spa_aux_vdev_t *sav = &spa->spa_spares;
+
+ for (i = 0; i < sav->sav_count; i++) {
+ if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool,
+ &refcnt) && pool != 0ULL && pool == spa_guid(spa) &&
+ refcnt > 2)
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
+sysevent_t *
+spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name)
+{
+ sysevent_t *ev = NULL;
+#ifdef _KERNEL
+ sysevent_attr_list_t *attr = NULL;
+ sysevent_value_t value;
+
+ ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs",
+ SE_SLEEP);
+ ASSERT(ev != NULL);
+
+ value.value_type = SE_DATA_TYPE_STRING;
+ value.value.sv_string = spa_name(spa);
+ if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0)
+ goto done;
+
+ value.value_type = SE_DATA_TYPE_UINT64;
+ value.value.sv_uint64 = spa_guid(spa);
+ if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0)
+ goto done;
+
+ if (vd) {
+ value.value_type = SE_DATA_TYPE_UINT64;
+ value.value.sv_uint64 = vd->vdev_guid;
+ if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value,
+ SE_SLEEP) != 0)
+ goto done;
+
+ if (vd->vdev_path) {
+ value.value_type = SE_DATA_TYPE_STRING;
+ value.value.sv_string = vd->vdev_path;
+ if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH,
+ &value, SE_SLEEP) != 0)
+ goto done;
+ }
+ }
+
+ if (hist_nvl != NULL) {
+ fnvlist_merge((nvlist_t *)attr, hist_nvl);
+ }
+
+ if (sysevent_attach_attributes(ev, attr) != 0)
+ goto done;
+ attr = NULL;
+
+done:
+ if (attr)
+ sysevent_free_attr(attr);
+
+#endif
+ return (ev);
+}
+
+void
+spa_event_post(sysevent_t *ev)
+{
+#ifdef _KERNEL
+ sysevent_id_t eid;
+
+ (void) log_sysevent(ev, SE_SLEEP, &eid);
+ sysevent_free(ev);
+#endif
+}
+
+void
+spa_event_discard(sysevent_t *ev)
+{
+#ifdef _KERNEL
+ sysevent_free(ev);
+#endif
+}
+
+/*
+ * Post a sysevent corresponding to the given event. The 'name' must be one of
+ * the event definitions in sys/sysevent/eventdefs.h. The payload will be
+ * filled in from the spa and (optionally) the vdev and history nvl. This
+ * doesn't do anything in the userland libzpool, as we don't want consumers to
+ * misinterpret ztest or zdb as real changes.
+ */
+void
+spa_event_notify(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name)
+{
+ spa_event_post(spa_event_create(spa, vd, hist_nvl, name));
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_checkpoint.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_checkpoint.c
new file mode 100644
index 000000000000..db0d2caa6107
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_checkpoint.c
@@ -0,0 +1,625 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2017 by Delphix. All rights reserved.
+ */
+
+/*
+ * Storage Pool Checkpoint
+ *
+ * A storage pool checkpoint can be thought of as a pool-wide snapshot or
+ * a stable version of extreme rewind that guarantees no blocks from the
+ * checkpointed state will have been overwritten. It remembers the entire
+ * state of the storage pool (e.g. snapshots, dataset names, etc.) from the
+ * point that it was taken and the user can rewind back to that point even if
+ * they applied destructive operations on their datasets or even enabled new
+ * zpool on-disk features. If a pool has a checkpoint that is no longer
+ * needed, the user can discard it.
+ *
+ * == On disk data structures used ==
+ *
+ * - The pool has a new feature flag and a new entry in the MOS. The feature
+ * flag is set to active when we create the checkpoint and remains active
+ * until the checkpoint is fully discarded. The entry in the MOS config
+ * (DMU_POOL_ZPOOL_CHECKPOINT) is populated with the uberblock that
+ * references the state of the pool when we take the checkpoint. The entry
+ * remains populated until we start discarding the checkpoint or we rewind
+ * back to it.
+ *
+ * - Each vdev contains a vdev-wide space map while the pool has a checkpoint,
+ * which persists until the checkpoint is fully discarded. The space map
+ * contains entries that have been freed in the current state of the pool
+ * but we want to keep around in case we decide to rewind to the checkpoint.
+ * [see vdev_checkpoint_sm]
+ *
+ * - Each metaslab's ms_sm space map behaves the same as without the
+ * checkpoint, with the only exception being the scenario when we free
+ * blocks that belong to the checkpoint. In this case, these blocks remain
+ * ALLOCATED in the metaslab's space map and they are added as FREE in the
+ * vdev's checkpoint space map.
+ *
+ * - Each uberblock has a field (ub_checkpoint_txg) which holds the txg that
+ * the uberblock was checkpointed. For normal uberblocks this field is 0.
+ *
+ * == Overview of operations ==
+ *
+ * - To create a checkpoint, we first wait for the current TXG to be synced,
+ * so we can use the most recently synced uberblock (spa_ubsync) as the
+ * checkpointed uberblock. Then we use an early synctask to place that
+ * uberblock in MOS config, increment the feature flag for the checkpoint
+ * (marking it active), and set spa_checkpoint_txg (see its use below)
+ * to the TXG of the checkpointed uberblock. We use an early synctask for
+ * the aforementioned operations to ensure that no blocks were dirtied
+ * between the current TXG and the TXG of the checkpointed uberblock
+ * (e.g. the previous txg).
+ *
+ * - When a checkpoint exists, we need to ensure that the blocks that
+ * belong to the checkpoint are freed but never reused. This means that
+ * these blocks should never end up in the ms_allocatable or the ms_freeing
+ * trees of a metaslab. Therefore, whenever there is a checkpoint the new
+ * ms_checkpointing tree is used in addition to the aforementioned ones.
+ *
+ * Whenever a block is freed and we find out that it is referenced by the
+ * checkpoint (we find out by comparing its birth to spa_checkpoint_txg),
+ * we place it in the ms_checkpointing tree instead of the ms_freeing tree.
+ * This way, we divide the blocks that are being freed into checkpointed
+ * and not-checkpointed blocks.
+ *
+ * In order to persist these frees, we write the extents from the
+ * ms_freeing tree to the ms_sm as usual, and the extents from the
+ * ms_checkpointing tree to the vdev_checkpoint_sm. This way, these
+ * checkpointed extents will remain allocated in the metaslab's ms_sm space
+ * map, and therefore won't be reused [see metaslab_sync()]. In addition,
+ * when we discard the checkpoint, we can find the entries that have
+ * actually been freed in vdev_checkpoint_sm.
+ * [see spa_checkpoint_discard_thread_sync()]
+ *
+ * - To discard the checkpoint we use an early synctask to delete the
+ * checkpointed uberblock from the MOS config, set spa_checkpoint_txg to 0,
+ * and wake up the discarding zthr thread (an open-context async thread).
+ * We use an early synctask to ensure that the operation happens before any
+ * new data end up in the checkpoint's data structures.
+ *
+ * Once the synctask is done and the discarding zthr is awake, we discard
+ * the checkpointed data over multiple TXGs by having the zthr prefetch
+ * entries from vdev_checkpoint_sm and then start a synctask that places
+ * them as free blocks into their respective ms_allocatable and ms_sm
+ * structures.
+ * [see spa_checkpoint_discard_thread()]
+ *
+ * When there are no entries left in the vdev_checkpoint_sm of all
+ * top-level vdevs, a final synctask runs that decrements the feature flag.
+ *
+ * - To rewind to the checkpoint, we first use the current uberblock and
+ * open the MOS so we can access the checkpointed uberblock from the MOS
+ * config. After we retrieve the checkpointed uberblock, we use it as the
+ * current uberblock for the pool by writing it to disk with an updated
+ * TXG, opening its version of the MOS, and moving on as usual from there.
+ * [see spa_ld_checkpoint_rewind()]
+ *
+ * An important note on rewinding to the checkpoint has to do with how we
+ * handle ZIL blocks. In the scenario of a rewind, we clear out any ZIL
+ * blocks that have not been claimed by the time we took the checkpoint
+ * as they should no longer be valid.
+ * [see comment in zil_claim()]
+ *
+ * == Miscellaneous information ==
+ *
+ * - In the hypothetical event that we take a checkpoint, remove a vdev,
+ * and attempt to rewind, the rewind would fail as the checkpointed
+ * uberblock would reference data in the removed device. For this reason
+ * and others of similar nature, we disallow the following operations that
+ * can change the config:
+ * vdev removal and attach/detach, mirror splitting, and pool reguid.
+ *
+ * - As most of the checkpoint logic is implemented in the SPA and doesn't
+ * distinguish datasets when it comes to space accounting, having a
+ * checkpoint can potentially break the boundaries set by dataset
+ * reservations.
+ */
+
+#include <sys/dmu_tx.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_synctask.h>
+#include <sys/metaslab_impl.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/spa_checkpoint.h>
+#include <sys/vdev_impl.h>
+#include <sys/zap.h>
+#include <sys/zfeature.h>
+
+/*
+ * The following parameter limits the amount of memory to be used for the
+ * prefetching of the checkpoint space map done on each vdev while
+ * discarding the checkpoint.
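+ *
+ * (Editorial worked example, hedged: with the default 16MB limit and the
+ * worst-case two-word (16-byte) entries assumed by the code, the per-pass
+ * cap computed in spa_checkpoint_discard_thread_sync() below comes out to
+ * 16777216 / 16 / 2 = 524288 entries.)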
+ * + * The reason it exists is because top-level vdevs with long checkpoint + * space maps can potentially take up a lot of memory depending on the + * amount of checkpointed data that has been freed within them while + * the pool had a checkpoint. + */ +uint64_t zfs_spa_discard_memory_limit = 16 * 1024 * 1024; + +int +spa_checkpoint_get_stats(spa_t *spa, pool_checkpoint_stat_t *pcs) +{ + if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) + return (SET_ERROR(ZFS_ERR_NO_CHECKPOINT)); + + bzero(pcs, sizeof (pool_checkpoint_stat_t)); + + int error = zap_contains(spa_meta_objset(spa), + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT); + ASSERT(error == 0 || error == ENOENT); + + if (error == ENOENT) + pcs->pcs_state = CS_CHECKPOINT_DISCARDING; + else + pcs->pcs_state = CS_CHECKPOINT_EXISTS; + + pcs->pcs_space = spa->spa_checkpoint_info.sci_dspace; + pcs->pcs_start_time = spa->spa_checkpoint_info.sci_timestamp; + + return (0); +} + +static void +spa_checkpoint_discard_complete_sync(void *arg, dmu_tx_t *tx) +{ + spa_t *spa = arg; + + spa->spa_checkpoint_info.sci_timestamp = 0; + + spa_feature_decr(spa, SPA_FEATURE_POOL_CHECKPOINT, tx); + + spa_history_log_internal(spa, "spa discard checkpoint", tx, + "finished discarding checkpointed state from the pool"); +} + +typedef struct spa_checkpoint_discard_sync_callback_arg { + vdev_t *sdc_vd; + uint64_t sdc_txg; + uint64_t sdc_entry_limit; +} spa_checkpoint_discard_sync_callback_arg_t; + +static int +spa_checkpoint_discard_sync_callback(space_map_entry_t *sme, void *arg) +{ + spa_checkpoint_discard_sync_callback_arg_t *sdc = arg; + vdev_t *vd = sdc->sdc_vd; + metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift]; + uint64_t end = sme->sme_offset + sme->sme_run; + + if (sdc->sdc_entry_limit == 0) + return (EINTR); + + /* + * Since the space map is not condensed, we know that + * none of its entries is crossing the boundaries of + * its respective metaslab. + * + * That said, there is no fundamental requirement that + * the checkpoint's space map entries should not cross + * metaslab boundaries. So if needed we could add code + * that handles metaslab-crossing segments in the future. + */ + VERIFY3U(sme->sme_type, ==, SM_FREE); + VERIFY3U(sme->sme_offset, >=, ms->ms_start); + VERIFY3U(end, <=, ms->ms_start + ms->ms_size); + + /* + * At this point we should not be processing any + * other frees concurrently, so the lock is technically + * unnecessary. We use the lock anyway though to + * potentially save ourselves from future headaches. 
+ */
+ mutex_enter(&ms->ms_lock);
+ if (range_tree_is_empty(ms->ms_freeing))
+ vdev_dirty(vd, VDD_METASLAB, ms, sdc->sdc_txg);
+ range_tree_add(ms->ms_freeing, sme->sme_offset, sme->sme_run);
+ mutex_exit(&ms->ms_lock);
+
+ ASSERT3U(vd->vdev_spa->spa_checkpoint_info.sci_dspace, >=,
+ sme->sme_run);
+ ASSERT3U(vd->vdev_stat.vs_checkpoint_space, >=, sme->sme_run);
+
+ vd->vdev_spa->spa_checkpoint_info.sci_dspace -= sme->sme_run;
+ vd->vdev_stat.vs_checkpoint_space -= sme->sme_run;
+ sdc->sdc_entry_limit--;
+
+ return (0);
+}
+
+static void
+spa_checkpoint_accounting_verify(spa_t *spa)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+ uint64_t ckpoint_sm_space_sum = 0;
+ uint64_t vs_ckpoint_space_sum = 0;
+
+ for (uint64_t c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *vd = rvd->vdev_child[c];
+
+ if (vd->vdev_checkpoint_sm != NULL) {
+ ckpoint_sm_space_sum +=
+ -vd->vdev_checkpoint_sm->sm_alloc;
+ vs_ckpoint_space_sum +=
+ vd->vdev_stat.vs_checkpoint_space;
+ ASSERT3U(ckpoint_sm_space_sum, ==,
+ vs_ckpoint_space_sum);
+ } else {
+ ASSERT0(vd->vdev_stat.vs_checkpoint_space);
+ }
+ }
+ ASSERT3U(spa->spa_checkpoint_info.sci_dspace, ==, ckpoint_sm_space_sum);
+}
+
+static void
+spa_checkpoint_discard_thread_sync(void *arg, dmu_tx_t *tx)
+{
+ vdev_t *vd = arg;
+ int error;
+
+ /*
+ * The space map callback is applied only to non-debug entries.
+ * Because the number of debug entries is less than or equal to the
+ * number of non-debug entries, we want to ensure that we only
+ * read what we prefetched from open-context.
+ *
+ * Thus, we set the maximum entries that the space map callback
+ * will be applied to be half the entries that could fit in the
+ * imposed memory limit.
+ *
+ * Note that since this is a conservative estimate, we also
+ * assume the worst case scenario in our computation where each
+ * entry is two-word.
+ */
+ uint64_t max_entry_limit =
+ (zfs_spa_discard_memory_limit / (2 * sizeof (uint64_t))) >> 1;
+
+ /*
+ * Iterate from the end of the space map towards the beginning,
+ * placing its entries on ms_freeing and removing them from the
+ * space map. The iteration stops if one of the following
+ * conditions is true:
+ *
+ * 1] We reached the beginning of the space map. At this point
+ * the space map should be completely empty and
+ * space_map_incremental_destroy should have returned 0.
+ * The next step would be to free and close the space map
+ * and remove its entry from its vdev's top zap. This allows
+ * spa_checkpoint_discard_thread() to move on to the next vdev.
+ *
+ * 2] We reached the memory limit (amount of memory used to hold
+ * space map entries in memory) and space_map_incremental_destroy
+ * returned EINTR. This means that there are entries remaining
+ * in the space map that will be cleared in a future invocation
+ * of this function by spa_checkpoint_discard_thread().
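+ *
+ * (Editorial sketch, hedged: the driving loop in
+ * spa_checkpoint_discard_thread() effectively runs, per vdev,
+ *
+ *	while (vd->vdev_checkpoint_sm != NULL)
+ *		VERIFY0(dsl_sync_task(spa->spa_name, NULL,
+ *		    spa_checkpoint_discard_thread_sync, vd,
+ *		    0, ZFS_SPACE_CHECK_NONE));
+ *
+ * so an EINTR here simply means "run me again in a later txg".)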
+ */ + spa_checkpoint_discard_sync_callback_arg_t sdc; + sdc.sdc_vd = vd; + sdc.sdc_txg = tx->tx_txg; + sdc.sdc_entry_limit = max_entry_limit; + + uint64_t words_before = + space_map_length(vd->vdev_checkpoint_sm) / sizeof (uint64_t); + + error = space_map_incremental_destroy(vd->vdev_checkpoint_sm, + spa_checkpoint_discard_sync_callback, &sdc, tx); + + uint64_t words_after = + space_map_length(vd->vdev_checkpoint_sm) / sizeof (uint64_t); + +#ifdef DEBUG + spa_checkpoint_accounting_verify(vd->vdev_spa); +#endif + + zfs_dbgmsg("discarding checkpoint: txg %llu, vdev id %d, " + "deleted %llu words - %llu words are left", + tx->tx_txg, vd->vdev_id, (words_before - words_after), + words_after); + + if (error != EINTR) { + if (error != 0) { + zfs_panic_recover("zfs: error %d was returned " + "while incrementally destroying the checkpoint " + "space map of vdev %llu\n", + error, vd->vdev_id); + } + ASSERT0(words_after); + ASSERT0(vd->vdev_checkpoint_sm->sm_alloc); + ASSERT0(space_map_length(vd->vdev_checkpoint_sm)); + + space_map_free(vd->vdev_checkpoint_sm, tx); + space_map_close(vd->vdev_checkpoint_sm); + vd->vdev_checkpoint_sm = NULL; + + VERIFY0(zap_remove(spa_meta_objset(vd->vdev_spa), + vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, tx)); + } +} + +static boolean_t +spa_checkpoint_discard_is_done(spa_t *spa) +{ + vdev_t *rvd = spa->spa_root_vdev; + + ASSERT(!spa_has_checkpoint(spa)); + ASSERT(spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)); + + for (uint64_t c = 0; c < rvd->vdev_children; c++) { + if (rvd->vdev_child[c]->vdev_checkpoint_sm != NULL) + return (B_FALSE); + ASSERT0(rvd->vdev_child[c]->vdev_stat.vs_checkpoint_space); + } + + return (B_TRUE); +} + +/* ARGSUSED */ +boolean_t +spa_checkpoint_discard_thread_check(void *arg, zthr_t *zthr) +{ + spa_t *spa = arg; + + if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) + return (B_FALSE); + + if (spa_has_checkpoint(spa)) + return (B_FALSE); + + return (B_TRUE); +} + +int +spa_checkpoint_discard_thread(void *arg, zthr_t *zthr) +{ + spa_t *spa = arg; + vdev_t *rvd = spa->spa_root_vdev; + + for (uint64_t c = 0; c < rvd->vdev_children; c++) { + vdev_t *vd = rvd->vdev_child[c]; + + while (vd->vdev_checkpoint_sm != NULL) { + space_map_t *checkpoint_sm = vd->vdev_checkpoint_sm; + int numbufs; + dmu_buf_t **dbp; + + if (zthr_iscancelled(zthr)) + return (0); + + ASSERT3P(vd->vdev_ops, !=, &vdev_indirect_ops); + + uint64_t size = MIN(space_map_length(checkpoint_sm), + zfs_spa_discard_memory_limit); + uint64_t offset = + space_map_length(checkpoint_sm) - size; + + /* + * Ensure that the part of the space map that will + * be destroyed by the synctask, is prefetched in + * memory before the synctask runs. 
+ */ + int error = dmu_buf_hold_array_by_bonus( + checkpoint_sm->sm_dbuf, offset, size, + B_TRUE, FTAG, &numbufs, &dbp); + if (error != 0) { + zfs_panic_recover("zfs: error %d was returned " + "while prefetching checkpoint space map " + "entries of vdev %llu\n", + error, vd->vdev_id); + } + + VERIFY0(dsl_sync_task(spa->spa_name, NULL, + spa_checkpoint_discard_thread_sync, vd, + 0, ZFS_SPACE_CHECK_NONE)); + + dmu_buf_rele_array(dbp, numbufs, FTAG); + } + } + + VERIFY(spa_checkpoint_discard_is_done(spa)); + VERIFY0(spa->spa_checkpoint_info.sci_dspace); + VERIFY0(dsl_sync_task(spa->spa_name, NULL, + spa_checkpoint_discard_complete_sync, spa, + 0, ZFS_SPACE_CHECK_NONE)); + + return (0); +} + + +/* ARGSUSED */ +static int +spa_checkpoint_check(void *arg, dmu_tx_t *tx) +{ + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + + if (!spa_feature_is_enabled(spa, SPA_FEATURE_POOL_CHECKPOINT)) + return (SET_ERROR(ENOTSUP)); + + if (!spa_top_vdevs_spacemap_addressable(spa)) + return (SET_ERROR(ZFS_ERR_VDEV_TOO_BIG)); + + if (spa->spa_vdev_removal != NULL) + return (SET_ERROR(ZFS_ERR_DEVRM_IN_PROGRESS)); + + if (spa->spa_checkpoint_txg != 0) + return (SET_ERROR(ZFS_ERR_CHECKPOINT_EXISTS)); + + if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) + return (SET_ERROR(ZFS_ERR_DISCARDING_CHECKPOINT)); + + return (0); +} + +/* ARGSUSED */ +static void +spa_checkpoint_sync(void *arg, dmu_tx_t *tx) +{ + dsl_pool_t *dp = dmu_tx_pool(tx); + spa_t *spa = dp->dp_spa; + uberblock_t checkpoint = spa->spa_ubsync; + + /* + * At this point, there should not be a checkpoint in the MOS. + */ + ASSERT3U(zap_contains(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_ZPOOL_CHECKPOINT), ==, ENOENT); + + ASSERT0(spa->spa_checkpoint_info.sci_timestamp); + ASSERT0(spa->spa_checkpoint_info.sci_dspace); + + /* + * Since the checkpointed uberblock is the one that just got synced + * (we use spa_ubsync), its txg must be equal to the txg number of + * the txg we are syncing, minus 1. + */ + ASSERT3U(checkpoint.ub_txg, ==, spa->spa_syncing_txg - 1); + + /* + * Once the checkpoint is in place, we need to ensure that none of + * its blocks will be marked for reuse after it has been freed. + * When there is a checkpoint and a block is freed, we compare its + * birth txg to the txg of the checkpointed uberblock to see if the + * block is part of the checkpoint or not. Therefore, we have to set + * spa_checkpoint_txg before any frees happen in this txg (which is + * why this is done as an early_synctask as explained in the comment + * in spa_checkpoint()). + */ + spa->spa_checkpoint_txg = checkpoint.ub_txg; + spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp; + + checkpoint.ub_checkpoint_txg = checkpoint.ub_txg; + VERIFY0(zap_add(spa->spa_dsl_pool->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT, + sizeof (uint64_t), sizeof (uberblock_t) / sizeof (uint64_t), + &checkpoint, tx)); + + /* + * Increment the feature refcount and thus activate the feature. + * Note that the feature will be deactivated when we've + * completely discarded all checkpointed state (both vdev + * space maps and uberblock). + */ + spa_feature_incr(spa, SPA_FEATURE_POOL_CHECKPOINT, tx); + + spa_history_log_internal(spa, "spa checkpoint", tx, + "checkpointed uberblock txg=%llu", checkpoint.ub_txg); +} + +/* + * Create a checkpoint for the pool. 
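+ *
+ * (Editorial usage note, hedged: this is the entry point behind the
+ * "zpool checkpoint <pool>" command; the corresponding discard path,
+ * "zpool checkpoint -d <pool>", ends up in spa_checkpoint_discard()
+ * below.)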
+ */
+int
+spa_checkpoint(const char *pool)
+{
+ int error;
+ spa_t *spa;
+
+ error = spa_open(pool, &spa, FTAG);
+ if (error != 0)
+ return (error);
+
+ mutex_enter(&spa->spa_vdev_top_lock);
+
+ /*
+ * Wait for the current syncing txg to finish so the latest synced
+ * uberblock (spa_ubsync) has all the changes that we expect
+ * to see if we were to revert later to the checkpoint. In other
+ * words, we want the checkpointed uberblock to include/reference
+ * all the changes that were pending at the time that we issued
+ * the checkpoint command.
+ */
+ txg_wait_synced(spa_get_dsl(spa), 0);
+
+ /*
+ * As the checkpointed uberblock references blocks from the previous
+ * txg (spa_ubsync) we want to ensure that we are not freeing any of
+ * these blocks in the same txg that the following synctask will
+ * run. Thus, we run it as an early synctask, so the dirty changes
+ * that are synced to disk afterwards during zios and other synctasks
+ * do not reuse checkpointed blocks.
+ */
+ error = dsl_early_sync_task(pool, spa_checkpoint_check,
+ spa_checkpoint_sync, NULL, 0, ZFS_SPACE_CHECK_NORMAL);
+
+ mutex_exit(&spa->spa_vdev_top_lock);
+
+ spa_close(spa, FTAG);
+ return (error);
+}
+
+/* ARGSUSED */
+static int
+spa_checkpoint_discard_check(void *arg, dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+
+ if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
+ return (SET_ERROR(ZFS_ERR_NO_CHECKPOINT));
+
+ if (spa->spa_checkpoint_txg == 0)
+ return (SET_ERROR(ZFS_ERR_DISCARDING_CHECKPOINT));
+
+ VERIFY0(zap_contains(spa_meta_objset(spa),
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT));
+
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+spa_checkpoint_discard_sync(void *arg, dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+
+ VERIFY0(zap_remove(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_ZPOOL_CHECKPOINT, tx));
+
+ spa->spa_checkpoint_txg = 0;
+
+ zthr_wakeup(spa->spa_checkpoint_discard_zthr);
+
+ spa_history_log_internal(spa, "spa discard checkpoint", tx,
+ "started discarding checkpointed state from the pool");
+}
+
+/*
+ * Discard the checkpoint from a pool.
+ */
+int
+spa_checkpoint_discard(const char *pool)
+{
+ /*
+ * Similarly to spa_checkpoint(), we want our synctask to run
+ * before any pending dirty data are written to disk so they
+ * won't end up in the checkpoint's data structures (e.g.
+ * ms_checkpointing and vdev_checkpoint_sm) and re-create any
+ * space maps that the discarding open-context thread has
+ * deleted.
+ * [see spa_checkpoint_discard_sync and spa_checkpoint_discard_thread]
+ */
+ return (dsl_early_sync_task(pool, spa_checkpoint_discard_check,
+ spa_checkpoint_discard_sync, NULL, 0,
+ ZFS_SPACE_CHECK_DISCARD_CHECKPOINT));
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c
new file mode 100644
index 000000000000..28f1a699ec4c
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c
@@ -0,0 +1,595 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
+ * Copyright 2017 Joyent, Inc.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/fm/fs/zfs.h>
+#include <sys/spa_impl.h>
+#include <sys/nvpair.h>
+#include <sys/uio.h>
+#include <sys/fs/zfs.h>
+#include <sys/vdev_impl.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/utsname.h>
+#include <sys/sunddi.h>
+#include <sys/zfeature.h>
+#ifdef _KERNEL
+#include <sys/kobj.h>
+#include <sys/zone.h>
+#endif
+
+/*
+ * Pool configuration repository.
+ *
+ * Pool configuration is stored as a packed nvlist on the filesystem. By
+ * default, all pools are stored in /etc/zfs/zpool.cache and loaded on boot
+ * (when the ZFS module is loaded). Pools can also have the 'cachefile'
+ * property set that allows them to be stored in an alternate location under
+ * the control of external software.
+ *
+ * For each cache file, we have a single nvlist which holds all the
+ * configuration information. When the module loads, we read this information
+ * from /etc/zfs/zpool.cache and populate the SPA namespace. This namespace is
+ * maintained independently in spa.c. Whenever the namespace is modified, or
+ * the configuration of a pool is changed, we call spa_write_cachefile(), which
+ * walks through all the active pools and writes the configuration to disk.
+ */
+
+static uint64_t spa_config_generation = 1;
+
+/*
+ * This can be overridden in userland to preserve an alternate namespace for
+ * userland pools when doing testing.
+ */
+const char *spa_config_path = ZPOOL_CACHE;
+
+/*
+ * Called when the module is first loaded, this routine loads the configuration
+ * file into the SPA namespace. It does not actually open or load the pools; it
+ * only populates the namespace.
+ */
+void
+spa_config_load(void)
+{
+ void *buf = NULL;
+ nvlist_t *nvlist, *child;
+ nvpair_t *nvpair;
+ char *pathname;
+ struct _buf *file;
+ uint64_t fsize;
+
+ /*
+ * Open the configuration file.
+ */
+ pathname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+
+ (void) snprintf(pathname, MAXPATHLEN, "%s", spa_config_path);
+
+ file = kobj_open_file(pathname);
+
+ kmem_free(pathname, MAXPATHLEN);
+
+ if (file == (struct _buf *)-1)
+ return;
+
+ if (kobj_get_filesize(file, &fsize) != 0)
+ goto out;
+
+ buf = kmem_alloc(fsize, KM_SLEEP);
+
+ /*
+ * Read the nvlist from the file.
+ */
+ if (kobj_read_file(file, buf, fsize, 0) < 0)
+ goto out;
+
+ /*
+ * Unpack the nvlist.
+ */
+ if (nvlist_unpack(buf, fsize, &nvlist, KM_SLEEP) != 0)
+ goto out;
+
+ /*
+ * Iterate over all elements in the nvlist, creating a new spa_t for
+ * each one with the specified configuration.
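+ *
+ * (Editorial aside, hedged: the cachefile is a single packed nvlist
+ * keyed by pool name, roughly
+ *
+ *	"tank" -> { version, name, pool_guid, vdev_tree, ... }
+ *
+ * so every DATA_TYPE_NVLIST pair below becomes one spa_add() call,
+ * unless a pool of that name is already in the namespace.)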
+ */ + mutex_enter(&spa_namespace_lock); + nvpair = NULL; + while ((nvpair = nvlist_next_nvpair(nvlist, nvpair)) != NULL) { + if (nvpair_type(nvpair) != DATA_TYPE_NVLIST) + continue; + + child = fnvpair_value_nvlist(nvpair); + + if (spa_lookup(nvpair_name(nvpair)) != NULL) + continue; + (void) spa_add(nvpair_name(nvpair), child, NULL); + } + mutex_exit(&spa_namespace_lock); + + nvlist_free(nvlist); + +out: + if (buf != NULL) + kmem_free(buf, fsize); + + kobj_close_file(file); +} + +static void +spa_config_clean(nvlist_t *nvl) +{ + nvlist_t **child; + nvlist_t *nvroot = NULL; + uint_t c, children; + + if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, + &children) == 0) { + for (c = 0; c < children; c++) + spa_config_clean(child[c]); + } + + if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0) + spa_config_clean(nvroot); + + nvlist_remove(nvl, ZPOOL_CONFIG_VDEV_STATS, DATA_TYPE_UINT64_ARRAY); + nvlist_remove(nvl, ZPOOL_CONFIG_SCAN_STATS, DATA_TYPE_UINT64_ARRAY); +} + +static int +spa_config_write(spa_config_dirent_t *dp, nvlist_t *nvl) +{ + size_t buflen; + char *buf; + vnode_t *vp; + int oflags = FWRITE | FTRUNC | FCREAT | FOFFMAX; + char *temp; + int err; + + /* + * If the nvlist is empty (NULL), then remove the old cachefile. + */ + if (nvl == NULL) { + err = vn_remove(dp->scd_path, UIO_SYSSPACE, RMFILE); + return (err); + } + + /* + * Pack the configuration into a buffer. + */ + buf = fnvlist_pack(nvl, &buflen); + temp = kmem_zalloc(MAXPATHLEN, KM_SLEEP); + + /* + * Write the configuration to disk. We need to do the traditional + * 'write to temporary file, sync, move over original' to make sure we + * always have a consistent view of the data. + */ + (void) snprintf(temp, MAXPATHLEN, "%s.tmp", dp->scd_path); + + err = vn_open(temp, UIO_SYSSPACE, oflags, 0644, &vp, CRCREAT, 0); + if (err == 0) { + err = vn_rdwr(UIO_WRITE, vp, buf, buflen, 0, UIO_SYSSPACE, + 0, RLIM64_INFINITY, kcred, NULL); + if (err == 0) + err = VOP_FSYNC(vp, FSYNC, kcred, NULL); + if (err == 0) + err = vn_rename(temp, dp->scd_path, UIO_SYSSPACE); + (void) VOP_CLOSE(vp, oflags, 1, 0, kcred, NULL); + } + + (void) vn_remove(temp, UIO_SYSSPACE, RMFILE); + + fnvlist_pack_free(buf, buflen); + kmem_free(temp, MAXPATHLEN); + return (err); +} + +/* + * Synchronize pool configuration to disk. This must be called with the + * namespace lock held. Synchronizing the pool cache is typically done after + * the configuration has been synced to the MOS. This exposes a window where + * the MOS config will have been updated but the cache file has not. If + * the system were to crash at that instant then the cached config may not + * contain the correct information to open the pool and an explicit import + * would be required. + */ +void +spa_write_cachefile(spa_t *target, boolean_t removing, boolean_t postsysevent) +{ + spa_config_dirent_t *dp, *tdp; + nvlist_t *nvl; + boolean_t ccw_failure; + int error; + char *pool_name; + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + if (rootdir == NULL || !(spa_mode_global & FWRITE)) + return; + + /* + * Iterate over all cachefiles for the pool, past or present. When the + * cachefile is changed, the new one is pushed onto this list, allowing + * us to update previous cachefiles that no longer contain this pool. 
+ */ + ccw_failure = B_FALSE; + for (dp = list_head(&target->spa_config_list); dp != NULL; + dp = list_next(&target->spa_config_list, dp)) { + spa_t *spa = NULL; + if (dp->scd_path == NULL) + continue; + + /* + * Iterate over all pools, adding any matching pools to 'nvl'. + */ + nvl = NULL; + while ((spa = spa_next(spa)) != NULL) { + nvlist_t *nvroot = NULL; + /* + * Skip over our own pool if we're about to remove + * ourselves from the spa namespace or any pool that + * is readonly. Since we cannot guarantee that a + * readonly pool would successfully import upon reboot, + * we don't allow them to be written to the cache file. + */ + if ((spa == target && removing) || + (spa_state(spa) == POOL_STATE_ACTIVE && + !spa_writeable(spa))) + continue; + + mutex_enter(&spa->spa_props_lock); + tdp = list_head(&spa->spa_config_list); + if (spa->spa_config == NULL || + tdp->scd_path == NULL || + strcmp(tdp->scd_path, dp->scd_path) != 0) { + mutex_exit(&spa->spa_props_lock); + continue; + } + + if (nvl == NULL) + nvl = fnvlist_alloc(); + + if (spa->spa_import_flags & ZFS_IMPORT_TEMP_NAME) { + pool_name = fnvlist_lookup_string( + spa->spa_config, ZPOOL_CONFIG_POOL_NAME); + } else { + pool_name = spa_name(spa); + } + + fnvlist_add_nvlist(nvl, pool_name, + spa->spa_config); + mutex_exit(&spa->spa_props_lock); + + if (nvlist_lookup_nvlist(nvl, pool_name, &nvroot) == 0) + spa_config_clean(nvroot); + } + + error = spa_config_write(dp, nvl); + if (error != 0) + ccw_failure = B_TRUE; + nvlist_free(nvl); + } + + if (ccw_failure) { + /* + * Keep trying so that configuration data is + * written if/when any temporary filesystem + * resource issues are resolved. + */ + if (target->spa_ccw_fail_time == 0) { + zfs_ereport_post(FM_EREPORT_ZFS_CONFIG_CACHE_WRITE, + target, NULL, NULL, 0, 0); + } + target->spa_ccw_fail_time = gethrtime(); + spa_async_request(target, SPA_ASYNC_CONFIG_UPDATE); + } else { + /* + * Do not rate limit future attempts to update + * the config cache. + */ + target->spa_ccw_fail_time = 0; + } + + /* + * Remove any config entries older than the current one. + */ + dp = list_head(&target->spa_config_list); + while ((tdp = list_next(&target->spa_config_list, dp)) != NULL) { + list_remove(&target->spa_config_list, tdp); + if (tdp->scd_path != NULL) + spa_strfree(tdp->scd_path); + kmem_free(tdp, sizeof (spa_config_dirent_t)); + } + + spa_config_generation++; + + if (postsysevent) + spa_event_notify(target, NULL, NULL, ESC_ZFS_CONFIG_SYNC); +} + +/* + * Sigh. Inside a local zone, we don't have access to /etc/zfs/zpool.cache, + * and we don't want to allow the local zone to see all the pools anyway. + * So we have to invent the ZFS_IOC_CONFIG ioctl to grab the configuration + * information for all pool visible within the zone. 
+ */ +nvlist_t * +spa_all_configs(uint64_t *generation) +{ + nvlist_t *pools; + spa_t *spa = NULL; + + if (*generation == spa_config_generation) + return (NULL); + + pools = fnvlist_alloc(); + + mutex_enter(&spa_namespace_lock); + while ((spa = spa_next(spa)) != NULL) { + if (INGLOBALZONE(curthread) || + zone_dataset_visible(spa_name(spa), NULL)) { + mutex_enter(&spa->spa_props_lock); + fnvlist_add_nvlist(pools, spa_name(spa), + spa->spa_config); + mutex_exit(&spa->spa_props_lock); + } + } + *generation = spa_config_generation; + mutex_exit(&spa_namespace_lock); + + return (pools); +} + +void +spa_config_set(spa_t *spa, nvlist_t *config) +{ + mutex_enter(&spa->spa_props_lock); + if (spa->spa_config != NULL && spa->spa_config != config) + nvlist_free(spa->spa_config); + spa->spa_config = config; + mutex_exit(&spa->spa_props_lock); +} + +/* + * Generate the pool's configuration based on the current in-core state. + * + * We infer whether to generate a complete config or just one top-level config + * based on whether vd is the root vdev. + */ +nvlist_t * +spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) +{ + nvlist_t *config, *nvroot; + vdev_t *rvd = spa->spa_root_vdev; + unsigned long hostid = 0; + boolean_t locked = B_FALSE; + uint64_t split_guid; + char *pool_name; + + if (vd == NULL) { + vd = rvd; + locked = B_TRUE; + spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); + } + + ASSERT(spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_READER) == + (SCL_CONFIG | SCL_STATE)); + + /* + * If txg is -1, report the current value of spa->spa_config_txg. + */ + if (txg == -1ULL) + txg = spa->spa_config_txg; + + /* + * Originally, users had to handle spa namespace collisions by either + * exporting the already imported pool or by specifying a new name for + * the pool with a conflicting name. In the case of root pools from + * virtual guests, neither approach to collision resolution is + * reasonable. This is addressed by extending the new name syntax with + * an option to specify that the new name is temporary. When specified, + * ZFS_IMPORT_TEMP_NAME will be set in spa->spa_import_flags to tell us + * to use the previous name, which we do below. 
+ */ + if (spa->spa_import_flags & ZFS_IMPORT_TEMP_NAME) { + pool_name = fnvlist_lookup_string(spa->spa_config, + ZPOOL_CONFIG_POOL_NAME); + } else { + pool_name = spa_name(spa); + } + + config = fnvlist_alloc(); + + fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, spa_version(spa)); + fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, pool_name); + fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, spa_state(spa)); + fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, txg); + fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, spa_guid(spa)); + if (spa->spa_comment != NULL) { + fnvlist_add_string(config, ZPOOL_CONFIG_COMMENT, + spa->spa_comment); + } + + hostid = zone_get_hostid(NULL); + + if (hostid != 0) { + fnvlist_add_uint64(config, ZPOOL_CONFIG_HOSTID, hostid); + } + fnvlist_add_string(config, ZPOOL_CONFIG_HOSTNAME, utsname.nodename); + + int config_gen_flags = 0; + if (vd != rvd) { + fnvlist_add_uint64(config, ZPOOL_CONFIG_TOP_GUID, + vd->vdev_top->vdev_guid); + fnvlist_add_uint64(config, ZPOOL_CONFIG_GUID, + vd->vdev_guid); + if (vd->vdev_isspare) { + fnvlist_add_uint64(config, + ZPOOL_CONFIG_IS_SPARE, 1ULL); + } + if (vd->vdev_islog) { + fnvlist_add_uint64(config, + ZPOOL_CONFIG_IS_LOG, 1ULL); + } + vd = vd->vdev_top; /* label contains top config */ + } else { + /* + * Only add the (potentially large) split information + * in the mos config, and not in the vdev labels + */ + if (spa->spa_config_splitting != NULL) + fnvlist_add_nvlist(config, ZPOOL_CONFIG_SPLIT, + spa->spa_config_splitting); + fnvlist_add_boolean(config, + ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS); + + config_gen_flags |= VDEV_CONFIG_MOS; + } + + /* + * Add the top-level config. We even add this on pools which + * don't support holes in the namespace. + */ + vdev_top_config_generate(spa, config); + + /* + * If we're splitting, record the original pool's guid. + */ + if (spa->spa_config_splitting != NULL && + nvlist_lookup_uint64(spa->spa_config_splitting, + ZPOOL_CONFIG_SPLIT_GUID, &split_guid) == 0) { + fnvlist_add_uint64(config, ZPOOL_CONFIG_SPLIT_GUID, + split_guid); + } + + nvroot = vdev_config_generate(spa, vd, getstats, config_gen_flags); + fnvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot); + nvlist_free(nvroot); + + /* + * Store what's necessary for reading the MOS in the label. 
+ */ + fnvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ, + spa->spa_label_features); + + if (getstats && spa_load_state(spa) == SPA_LOAD_NONE) { + ddt_histogram_t *ddh; + ddt_stat_t *dds; + ddt_object_t *ddo; + + ddh = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP); + ddt_get_dedup_histogram(spa, ddh); + fnvlist_add_uint64_array(config, + ZPOOL_CONFIG_DDT_HISTOGRAM, + (uint64_t *)ddh, sizeof (*ddh) / sizeof (uint64_t)); + kmem_free(ddh, sizeof (ddt_histogram_t)); + + ddo = kmem_zalloc(sizeof (ddt_object_t), KM_SLEEP); + ddt_get_dedup_object_stats(spa, ddo); + fnvlist_add_uint64_array(config, + ZPOOL_CONFIG_DDT_OBJ_STATS, + (uint64_t *)ddo, sizeof (*ddo) / sizeof (uint64_t)); + kmem_free(ddo, sizeof (ddt_object_t)); + + dds = kmem_zalloc(sizeof (ddt_stat_t), KM_SLEEP); + ddt_get_dedup_stats(spa, dds); + fnvlist_add_uint64_array(config, + ZPOOL_CONFIG_DDT_STATS, + (uint64_t *)dds, sizeof (*dds) / sizeof (uint64_t)); + kmem_free(dds, sizeof (ddt_stat_t)); + } + + if (locked) + spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); + + return (config); +} + +/* + * Update all disk labels, generate a fresh config based on the current + * in-core state, and sync the global config cache (do not sync the config + * cache if this is a booting rootpool). + */ +void +spa_config_update(spa_t *spa, int what) +{ + vdev_t *rvd = spa->spa_root_vdev; + uint64_t txg; + int c; + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + txg = spa_last_synced_txg(spa) + 1; + if (what == SPA_CONFIG_UPDATE_POOL) { + vdev_config_dirty(rvd); + } else { + /* + * If we have top-level vdevs that were added but have + * not yet been prepared for allocation, do that now. + * (It's safe now because the config cache is up to date, + * so it will be able to translate the new DVAs.) + * See comments in spa_vdev_add() for full details. + */ + for (c = 0; c < rvd->vdev_children; c++) { + vdev_t *tvd = rvd->vdev_child[c]; + + /* + * Explicitly skip vdevs that are indirect or + * log vdevs that are being removed. The reason + * is that both of those can have vdev_ms_array + * set to 0 and we wouldn't want to change their + * metaslab size nor call vdev_expand() on them. + */ + if (!vdev_is_concrete(tvd) || + (tvd->vdev_islog && tvd->vdev_removing)) + continue; + + if (tvd->vdev_ms_array == 0) { + vdev_ashift_optimize(tvd); + vdev_metaslab_set_size(tvd); + } + vdev_expand(tvd, txg); + } + } + spa_config_exit(spa, SCL_ALL, FTAG); + + /* + * Wait for the mosconfig to be regenerated and synced. + */ + txg_wait_synced(spa->spa_dsl_pool, txg); + + /* + * Update the global config cache to reflect the new mosconfig. + */ + spa_write_cachefile(spa, B_FALSE, what != SPA_CONFIG_UPDATE_POOL); + + if (what == SPA_CONFIG_UPDATE_POOL) + spa_config_update(spa, SPA_CONFIG_UPDATE_VDEVS); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c new file mode 100644 index 000000000000..8ce780537abb --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c @@ -0,0 +1,406 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
+ */
+
+/*
+ * Routines to manage the on-disk persistent error log.
+ *
+ * Each pool stores a log of all logical data errors seen during normal
+ * operation. This is actually the union of two distinct logs: the last log,
+ * and the current log. All errors seen are logged to the current log. When a
+ * scrub completes, the current log becomes the last log, the last log is thrown
+ * out, and the current log is reinitialized. This way, if an error is somehow
+ * corrected, a new scrub will show that it no longer exists, and it will be
+ * deleted from the log when the scrub completes.
+ *
+ * The log is stored using a ZAP object whose key is a string form of the
+ * zbookmark_phys tuple (objset, object, level, blkid), and whose contents is an
+ * optional 'objset:object' human-readable string describing the data. When an
+ * error is first logged, this string will be empty, indicating that no name is
+ * known. This prevents us from having to issue a potentially large amount of
+ * I/O to discover the object name during an error path. Instead, we do the
+ * calculation when the data is requested, storing the result so future queries
+ * will be faster.
+ *
+ * This log is then shipped into an nvlist where the key is the dataset name and
+ * the value is the object name. Userland is then responsible for uniquifying
+ * this list and displaying it to the user.
+ */
+
+#include <sys/dmu_tx.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/zap.h>
+#include <sys/zio.h>
+
+
+/*
+ * Convert a bookmark to a string.
+ */
+static void
+bookmark_to_name(zbookmark_phys_t *zb, char *buf, size_t len)
+{
+	(void) snprintf(buf, len, "%llx:%llx:%llx:%llx",
+	    (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object,
+	    (u_longlong_t)zb->zb_level, (u_longlong_t)zb->zb_blkid);
+}
+
+/*
+ * Convert a string to a bookmark.
+ */
+#ifdef _KERNEL
+static void
+name_to_bookmark(char *buf, zbookmark_phys_t *zb)
+{
+	zb->zb_objset = zfs_strtonum(buf, &buf);
+	ASSERT(*buf == ':');
+	zb->zb_object = zfs_strtonum(buf + 1, &buf);
+	ASSERT(*buf == ':');
+	zb->zb_level = (int)zfs_strtonum(buf + 1, &buf);
+	ASSERT(*buf == ':');
+	zb->zb_blkid = zfs_strtonum(buf + 1, &buf);
+	ASSERT(*buf == '\0');
+}
+#endif
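A quick illustration of the bookmark round trip, with userland strtoull() standing in for the kernel's zfs_strtonum() and a local struct standing in for zbookmark_phys_t:

```c
/*
 * Userland sketch of the "objset:object:level:blkid" encoding used as
 * the error-log ZAP key; struct bm is a stand-in for zbookmark_phys_t.
 */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

struct bm { uint64_t objset, object, level, blkid; };

static void
bm_to_name(const struct bm *zb, char *buf, size_t len)
{
	(void) snprintf(buf, len, "%llx:%llx:%llx:%llx",
	    (unsigned long long)zb->objset, (unsigned long long)zb->object,
	    (unsigned long long)zb->level, (unsigned long long)zb->blkid);
}

static void
name_to_bm(const char *buf, struct bm *zb)
{
	char *end;

	zb->objset = strtoull(buf, &end, 16);	/* each field is hex */
	zb->object = strtoull(end + 1, &end, 16);
	zb->level = strtoull(end + 1, &end, 16);
	zb->blkid = strtoull(end + 1, &end, 16);
}
```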
+ */ + if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT) + return; + + mutex_enter(&spa->spa_errlist_lock); + + /* + * If we have had a request to rotate the log, log it to the next list + * instead of the current one. + */ + if (spa->spa_scrub_active || spa->spa_scrub_finished) + tree = &spa->spa_errlist_scrub; + else + tree = &spa->spa_errlist_last; + + search.se_bookmark = *zb; + if (avl_find(tree, &search, &where) != NULL) { + mutex_exit(&spa->spa_errlist_lock); + return; + } + + new = kmem_zalloc(sizeof (spa_error_entry_t), KM_SLEEP); + new->se_bookmark = *zb; + avl_insert(tree, new, where); + + mutex_exit(&spa->spa_errlist_lock); +} + +/* + * Return the number of errors currently in the error log. This is actually the + * sum of both the last log and the current log, since we don't know the union + * of these logs until we reach userland. + */ +uint64_t +spa_get_errlog_size(spa_t *spa) +{ + uint64_t total = 0, count; + + mutex_enter(&spa->spa_errlog_lock); + if (spa->spa_errlog_scrub != 0 && + zap_count(spa->spa_meta_objset, spa->spa_errlog_scrub, + &count) == 0) + total += count; + + if (spa->spa_errlog_last != 0 && !spa->spa_scrub_finished && + zap_count(spa->spa_meta_objset, spa->spa_errlog_last, + &count) == 0) + total += count; + mutex_exit(&spa->spa_errlog_lock); + + mutex_enter(&spa->spa_errlist_lock); + total += avl_numnodes(&spa->spa_errlist_last); + total += avl_numnodes(&spa->spa_errlist_scrub); + mutex_exit(&spa->spa_errlist_lock); + + return (total); +} + +#ifdef _KERNEL +static int +process_error_log(spa_t *spa, uint64_t obj, void *addr, size_t *count) +{ + zap_cursor_t zc; + zap_attribute_t za; + zbookmark_phys_t zb; + + if (obj == 0) + return (0); + + for (zap_cursor_init(&zc, spa->spa_meta_objset, obj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + + if (*count == 0) { + zap_cursor_fini(&zc); + return (SET_ERROR(ENOMEM)); + } + + name_to_bookmark(za.za_name, &zb); + + if (copyout(&zb, (char *)addr + + (*count - 1) * sizeof (zbookmark_phys_t), + sizeof (zbookmark_phys_t)) != 0) { + zap_cursor_fini(&zc); + return (SET_ERROR(EFAULT)); + } + + *count -= 1; + } + + zap_cursor_fini(&zc); + + return (0); +} + +static int +process_error_list(avl_tree_t *list, void *addr, size_t *count) +{ + spa_error_entry_t *se; + + for (se = avl_first(list); se != NULL; se = AVL_NEXT(list, se)) { + + if (*count == 0) + return (SET_ERROR(ENOMEM)); + + if (copyout(&se->se_bookmark, (char *)addr + + (*count - 1) * sizeof (zbookmark_phys_t), + sizeof (zbookmark_phys_t)) != 0) + return (SET_ERROR(EFAULT)); + + *count -= 1; + } + + return (0); +} +#endif + +/* + * Copy all known errors to userland as an array of bookmarks. This is + * actually a union of the on-disk last log and current log, as well as any + * pending error requests. + * + * Because the act of reading the on-disk log could cause errors to be + * generated, we have two separate locks: one for the error log and one for the + * in-core error lists. We only need the error list lock to log and error, so + * we grab the error log lock while we read the on-disk logs, and only pick up + * the error list lock when we are finished. 
+ */ +int +spa_get_errlog(spa_t *spa, void *uaddr, size_t *count) +{ + int ret = 0; + +#ifdef _KERNEL + mutex_enter(&spa->spa_errlog_lock); + + ret = process_error_log(spa, spa->spa_errlog_scrub, uaddr, count); + + if (!ret && !spa->spa_scrub_finished) + ret = process_error_log(spa, spa->spa_errlog_last, uaddr, + count); + + mutex_enter(&spa->spa_errlist_lock); + if (!ret) + ret = process_error_list(&spa->spa_errlist_scrub, uaddr, + count); + if (!ret) + ret = process_error_list(&spa->spa_errlist_last, uaddr, + count); + mutex_exit(&spa->spa_errlist_lock); + + mutex_exit(&spa->spa_errlog_lock); +#endif + + return (ret); +} + +/* + * Called when a scrub completes. This simply set a bit which tells which AVL + * tree to add new errors. spa_errlog_sync() is responsible for actually + * syncing the changes to the underlying objects. + */ +void +spa_errlog_rotate(spa_t *spa) +{ + mutex_enter(&spa->spa_errlist_lock); + spa->spa_scrub_finished = B_TRUE; + mutex_exit(&spa->spa_errlist_lock); +} + +/* + * Discard any pending errors from the spa_t. Called when unloading a faulted + * pool, as the errors encountered during the open cannot be synced to disk. + */ +void +spa_errlog_drain(spa_t *spa) +{ + spa_error_entry_t *se; + void *cookie; + + mutex_enter(&spa->spa_errlist_lock); + + cookie = NULL; + while ((se = avl_destroy_nodes(&spa->spa_errlist_last, + &cookie)) != NULL) + kmem_free(se, sizeof (spa_error_entry_t)); + cookie = NULL; + while ((se = avl_destroy_nodes(&spa->spa_errlist_scrub, + &cookie)) != NULL) + kmem_free(se, sizeof (spa_error_entry_t)); + + mutex_exit(&spa->spa_errlist_lock); +} + +/* + * Process a list of errors into the current on-disk log. + */ +static void +sync_error_list(spa_t *spa, avl_tree_t *t, uint64_t *obj, dmu_tx_t *tx) +{ + spa_error_entry_t *se; + char buf[64]; + void *cookie; + + if (avl_numnodes(t) != 0) { + /* create log if necessary */ + if (*obj == 0) + *obj = zap_create(spa->spa_meta_objset, + DMU_OT_ERROR_LOG, DMU_OT_NONE, + 0, tx); + + /* add errors to the current log */ + for (se = avl_first(t); se != NULL; se = AVL_NEXT(t, se)) { + char *name = se->se_name ? se->se_name : ""; + + bookmark_to_name(&se->se_bookmark, buf, sizeof (buf)); + + (void) zap_update(spa->spa_meta_objset, + *obj, buf, 1, strlen(name) + 1, name, tx); + } + + /* purge the error list */ + cookie = NULL; + while ((se = avl_destroy_nodes(t, &cookie)) != NULL) + kmem_free(se, sizeof (spa_error_entry_t)); + } +} + +/* + * Sync the error log out to disk. This is a little tricky because the act of + * writing the error log requires the spa_errlist_lock. So, we need to lock the + * error lists, take a copy of the lists, and then reinitialize them. Then, we + * drop the error list lock and take the error log lock, at which point we + * do the errlog processing. Then, if we encounter an I/O error during this + * process, we can successfully add the error to the list. Note that this will + * result in the perpetual recycling of errors, but it is an unlikely situation + * and not a performance critical operation. + */ +void +spa_errlog_sync(spa_t *spa, uint64_t txg) +{ + dmu_tx_t *tx; + avl_tree_t scrub, last; + int scrub_finished; + + mutex_enter(&spa->spa_errlist_lock); + + /* + * Bail out early under normal circumstances. 
+ */ + if (avl_numnodes(&spa->spa_errlist_scrub) == 0 && + avl_numnodes(&spa->spa_errlist_last) == 0 && + !spa->spa_scrub_finished) { + mutex_exit(&spa->spa_errlist_lock); + return; + } + + spa_get_errlists(spa, &last, &scrub); + scrub_finished = spa->spa_scrub_finished; + spa->spa_scrub_finished = B_FALSE; + + mutex_exit(&spa->spa_errlist_lock); + mutex_enter(&spa->spa_errlog_lock); + + tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); + + /* + * Sync out the current list of errors. + */ + sync_error_list(spa, &last, &spa->spa_errlog_last, tx); + + /* + * Rotate the log if necessary. + */ + if (scrub_finished) { + if (spa->spa_errlog_last != 0) + VERIFY(dmu_object_free(spa->spa_meta_objset, + spa->spa_errlog_last, tx) == 0); + spa->spa_errlog_last = spa->spa_errlog_scrub; + spa->spa_errlog_scrub = 0; + + sync_error_list(spa, &scrub, &spa->spa_errlog_last, tx); + } + + /* + * Sync out any pending scrub errors. + */ + sync_error_list(spa, &scrub, &spa->spa_errlog_scrub, tx); + + /* + * Update the MOS to reflect the new values. + */ + (void) zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_ERRLOG_LAST, sizeof (uint64_t), 1, + &spa->spa_errlog_last, tx); + (void) zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_ERRLOG_SCRUB, sizeof (uint64_t), 1, + &spa->spa_errlog_scrub, tx); + + dmu_tx_commit(tx); + + mutex_exit(&spa->spa_errlog_lock); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c new file mode 100644 index 000000000000..4b080fc48cdf --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c @@ -0,0 +1,628 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2015 by Delphix. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + * Copyright 2017 Joyent, Inc. + */ + +#include <sys/spa.h> +#include <sys/spa_impl.h> +#include <sys/zap.h> +#include <sys/dsl_synctask.h> +#include <sys/dmu_tx.h> +#include <sys/dmu_objset.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_dir.h> +#include <sys/utsname.h> +#include <sys/sunddi.h> +#include <sys/cred.h> +#include "zfs_comutil.h" +#ifdef _KERNEL +#include <sys/cmn_err.h> +#include <sys/zone.h> +#endif + +/* + * Routines to manage the on-disk history log. + * + * The history log is stored as a dmu object containing + * <packed record length, record nvlist> tuples. + * + * Where "record nvlist" is a nvlist containing uint64_ts and strings, and + * "packed record length" is the packed length of the "record nvlist" stored + * as a little endian uint64_t. 
+ *
+ * The log is implemented as a ring buffer, though the original creation
+ * of the pool ('zpool create') is never overwritten.
+ *
+ * The history log is tracked as object 'spa_t::spa_history'. The bonus buffer
+ * of 'spa_history' stores the offsets for logging/retrieving history as
+ * 'spa_history_phys_t'. 'sh_pool_create_len' is the ending offset in bytes of
+ * where the 'zpool create' record is stored. This allows us to never
+ * overwrite the original creation of the pool. 'sh_phys_max_off' is the
+ * physical ending offset in bytes of the log. This tells you the length of
+ * the buffer. 'sh_eof' is the logical EOF (in bytes). Whenever a record
+ * is added, 'sh_eof' is incremented by the size of the record.
+ * 'sh_eof' is never decremented. 'sh_bof' is the logical BOF (in bytes).
+ * This is where the consumer should start reading from after reading in
+ * the 'zpool create' portion of the log.
+ *
+ * 'sh_records_lost' keeps track of how many records have been overwritten
+ * and permanently lost.
+ */
+
+/* convert a logical offset to physical */
+static uint64_t
+spa_history_log_to_phys(uint64_t log_off, spa_history_phys_t *shpp)
+{
+	uint64_t phys_len;
+
+	phys_len = shpp->sh_phys_max_off - shpp->sh_pool_create_len;
+	return ((log_off - shpp->sh_pool_create_len) % phys_len
+	    + shpp->sh_pool_create_len);
+}
+
+void
+spa_history_create_obj(spa_t *spa, dmu_tx_t *tx)
+{
+	dmu_buf_t *dbp;
+	spa_history_phys_t *shpp;
+	objset_t *mos = spa->spa_meta_objset;
+
+	ASSERT(spa->spa_history == 0);
+	spa->spa_history = dmu_object_alloc(mos, DMU_OT_SPA_HISTORY,
+	    SPA_OLD_MAXBLOCKSIZE, DMU_OT_SPA_HISTORY_OFFSETS,
+	    sizeof (spa_history_phys_t), tx);
+
+	VERIFY(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_HISTORY, sizeof (uint64_t), 1,
+	    &spa->spa_history, tx) == 0);
+
+	VERIFY(0 == dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp));
+	ASSERT(dbp->db_size >= sizeof (spa_history_phys_t));
+
+	shpp = dbp->db_data;
+	dmu_buf_will_dirty(dbp, tx);
+
+	/*
+	 * Figure out maximum size of history log. We set it at
+	 * 0.1% of pool size, with a max of 1G and min of 128KB.
+	 */
+	shpp->sh_phys_max_off =
+	    metaslab_class_get_dspace(spa_normal_class(spa)) / 1000;
+	shpp->sh_phys_max_off = MIN(shpp->sh_phys_max_off, 1<<30);
+	shpp->sh_phys_max_off = MAX(shpp->sh_phys_max_off, 128<<10);
+
+	dmu_buf_rele(dbp, FTAG);
+}
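A standalone restatement of the offset mapping, with a worked example: everything below sh_pool_create_len is pinned, and the remainder of the object is treated as a ring. With pool_create_len = 512 and phys_max_off = 4096, logical offset 4100 maps to (4100 - 512) % 3584 + 512 = 516. The helper below is an illustration, not the in-tree function:

```c
/*
 * Logical-to-physical mapping for the history ring buffer, restated
 * outside the kernel for clarity.
 */
#include <stdint.h>

static uint64_t
log_to_phys(uint64_t log_off, uint64_t pool_create_len, uint64_t phys_max_off)
{
	/* the ring excludes the pinned 'zpool create' prefix */
	uint64_t ring_len = phys_max_off - pool_create_len;

	return ((log_off - pool_create_len) % ring_len + pool_create_len);
}
```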
+ */ +static int +spa_history_advance_bof(spa_t *spa, spa_history_phys_t *shpp) +{ + objset_t *mos = spa->spa_meta_objset; + uint64_t firstread, reclen, phys_bof; + char buf[sizeof (reclen)]; + int err; + + phys_bof = spa_history_log_to_phys(shpp->sh_bof, shpp); + firstread = MIN(sizeof (reclen), shpp->sh_phys_max_off - phys_bof); + + if ((err = dmu_read(mos, spa->spa_history, phys_bof, firstread, + buf, DMU_READ_PREFETCH)) != 0) + return (err); + if (firstread != sizeof (reclen)) { + if ((err = dmu_read(mos, spa->spa_history, + shpp->sh_pool_create_len, sizeof (reclen) - firstread, + buf + firstread, DMU_READ_PREFETCH)) != 0) + return (err); + } + + reclen = LE_64(*((uint64_t *)buf)); + shpp->sh_bof += reclen + sizeof (reclen); + shpp->sh_records_lost++; + return (0); +} + +static int +spa_history_write(spa_t *spa, void *buf, uint64_t len, spa_history_phys_t *shpp, + dmu_tx_t *tx) +{ + uint64_t firstwrite, phys_eof; + objset_t *mos = spa->spa_meta_objset; + int err; + + ASSERT(MUTEX_HELD(&spa->spa_history_lock)); + + /* see if we need to reset logical BOF */ + while (shpp->sh_phys_max_off - shpp->sh_pool_create_len - + (shpp->sh_eof - shpp->sh_bof) <= len) { + if ((err = spa_history_advance_bof(spa, shpp)) != 0) { + return (err); + } + } + + phys_eof = spa_history_log_to_phys(shpp->sh_eof, shpp); + firstwrite = MIN(len, shpp->sh_phys_max_off - phys_eof); + shpp->sh_eof += len; + dmu_write(mos, spa->spa_history, phys_eof, firstwrite, buf, tx); + + len -= firstwrite; + if (len > 0) { + /* write out the rest at the beginning of physical file */ + dmu_write(mos, spa->spa_history, shpp->sh_pool_create_len, + len, (char *)buf + firstwrite, tx); + } + + return (0); +} + +static char * +spa_history_zone(void) +{ +#ifdef _KERNEL + /* XXX: pr_hostname can be changed by default from within a jail! */ + if (jailed(curthread->td_ucred)) + return (curthread->td_ucred->cr_prison->pr_hostname); +#endif + return (NULL); +} + +/* + * Post a history sysevent. + * + * The nvlist_t* passed into this function will be transformed into a new + * nvlist where: + * + * 1. Nested nvlists will be flattened to a single level + * 2. Keys will have their names normalized (to remove any problematic + * characters, such as whitespace) + * + * The nvlist_t passed into this function will duplicated and should be freed + * by caller. 
+ * + */ +static void +spa_history_log_notify(spa_t *spa, nvlist_t *nvl) +{ + nvlist_t *hist_nvl = fnvlist_alloc(); + uint64_t uint64; + char *string; + + if (nvlist_lookup_string(nvl, ZPOOL_HIST_CMD, &string) == 0) + fnvlist_add_string(hist_nvl, ZFS_EV_HIST_CMD, string); + + if (nvlist_lookup_string(nvl, ZPOOL_HIST_INT_NAME, &string) == 0) + fnvlist_add_string(hist_nvl, ZFS_EV_HIST_INT_NAME, string); + + if (nvlist_lookup_string(nvl, ZPOOL_HIST_ZONE, &string) == 0) + fnvlist_add_string(hist_nvl, ZFS_EV_HIST_ZONE, string); + + if (nvlist_lookup_string(nvl, ZPOOL_HIST_HOST, &string) == 0) + fnvlist_add_string(hist_nvl, ZFS_EV_HIST_HOST, string); + + if (nvlist_lookup_string(nvl, ZPOOL_HIST_DSNAME, &string) == 0) + fnvlist_add_string(hist_nvl, ZFS_EV_HIST_DSNAME, string); + + if (nvlist_lookup_string(nvl, ZPOOL_HIST_INT_STR, &string) == 0) + fnvlist_add_string(hist_nvl, ZFS_EV_HIST_INT_STR, string); + + if (nvlist_lookup_string(nvl, ZPOOL_HIST_IOCTL, &string) == 0) + fnvlist_add_string(hist_nvl, ZFS_EV_HIST_IOCTL, string); + + if (nvlist_lookup_string(nvl, ZPOOL_HIST_INT_NAME, &string) == 0) + fnvlist_add_string(hist_nvl, ZFS_EV_HIST_INT_NAME, string); + + if (nvlist_lookup_uint64(nvl, ZPOOL_HIST_DSID, &uint64) == 0) + fnvlist_add_uint64(hist_nvl, ZFS_EV_HIST_DSID, uint64); + + if (nvlist_lookup_uint64(nvl, ZPOOL_HIST_TXG, &uint64) == 0) + fnvlist_add_uint64(hist_nvl, ZFS_EV_HIST_TXG, uint64); + + if (nvlist_lookup_uint64(nvl, ZPOOL_HIST_TIME, &uint64) == 0) + fnvlist_add_uint64(hist_nvl, ZFS_EV_HIST_TIME, uint64); + + if (nvlist_lookup_uint64(nvl, ZPOOL_HIST_WHO, &uint64) == 0) + fnvlist_add_uint64(hist_nvl, ZFS_EV_HIST_WHO, uint64); + + if (nvlist_lookup_uint64(nvl, ZPOOL_HIST_INT_EVENT, &uint64) == 0) + fnvlist_add_uint64(hist_nvl, ZFS_EV_HIST_INT_EVENT, uint64); + + spa_event_notify(spa, NULL, hist_nvl, ESC_ZFS_HISTORY_EVENT); + + nvlist_free(hist_nvl); +} + +/* + * Write out a history event. + */ +/*ARGSUSED*/ +static void +spa_history_log_sync(void *arg, dmu_tx_t *tx) +{ + nvlist_t *nvl = arg; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + objset_t *mos = spa->spa_meta_objset; + dmu_buf_t *dbp; + spa_history_phys_t *shpp; + size_t reclen; + uint64_t le_len; + char *record_packed = NULL; + int ret; + + /* + * If we have an older pool that doesn't have a command + * history object, create it now. + */ + mutex_enter(&spa->spa_history_lock); + if (!spa->spa_history) + spa_history_create_obj(spa, tx); + mutex_exit(&spa->spa_history_lock); + + /* + * Get the offset of where we need to write via the bonus buffer. + * Update the offset when the write completes. 
+ */ + VERIFY0(dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)); + shpp = dbp->db_data; + + dmu_buf_will_dirty(dbp, tx); + +#ifdef ZFS_DEBUG + { + dmu_object_info_t doi; + dmu_object_info_from_db(dbp, &doi); + ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_SPA_HISTORY_OFFSETS); + } +#endif + + fnvlist_add_uint64(nvl, ZPOOL_HIST_TIME, gethrestime_sec()); +#ifdef _KERNEL + fnvlist_add_string(nvl, ZPOOL_HIST_HOST, utsname.nodename); +#endif + if (nvlist_exists(nvl, ZPOOL_HIST_CMD)) { + zfs_dbgmsg("command: %s", + fnvlist_lookup_string(nvl, ZPOOL_HIST_CMD)); + } else if (nvlist_exists(nvl, ZPOOL_HIST_INT_NAME)) { + if (nvlist_exists(nvl, ZPOOL_HIST_DSNAME)) { + zfs_dbgmsg("txg %lld %s %s (id %llu) %s", + fnvlist_lookup_uint64(nvl, ZPOOL_HIST_TXG), + fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_NAME), + fnvlist_lookup_string(nvl, ZPOOL_HIST_DSNAME), + fnvlist_lookup_uint64(nvl, ZPOOL_HIST_DSID), + fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_STR)); + } else { + zfs_dbgmsg("txg %lld %s %s", + fnvlist_lookup_uint64(nvl, ZPOOL_HIST_TXG), + fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_NAME), + fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_STR)); + } + /* + * The history sysevent is posted only for internal history + * messages to show what has happened, not how it happened. For + * example, the following command: + * + * # zfs destroy -r tank/foo + * + * will result in one sysevent posted per dataset that is + * destroyed as a result of the command - which could be more + * than one event in total. By contrast, if the sysevent was + * posted as a result of the ZPOOL_HIST_CMD key being present + * it would result in only one sysevent being posted with the + * full command line arguments, requiring the consumer to know + * how to parse and understand zfs(1M) command invocations. + */ + spa_history_log_notify(spa, nvl); + } else if (nvlist_exists(nvl, ZPOOL_HIST_IOCTL)) { + zfs_dbgmsg("ioctl %s", + fnvlist_lookup_string(nvl, ZPOOL_HIST_IOCTL)); + } + + record_packed = fnvlist_pack(nvl, &reclen); + + mutex_enter(&spa->spa_history_lock); + + /* write out the packed length as little endian */ + le_len = LE_64((uint64_t)reclen); + ret = spa_history_write(spa, &le_len, sizeof (le_len), shpp, tx); + if (!ret) + ret = spa_history_write(spa, record_packed, reclen, shpp, tx); + + /* The first command is the create, which we keep forever */ + if (ret == 0 && shpp->sh_pool_create_len == 0 && + nvlist_exists(nvl, ZPOOL_HIST_CMD)) { + shpp->sh_pool_create_len = shpp->sh_bof = shpp->sh_eof; + } + + mutex_exit(&spa->spa_history_lock); + fnvlist_pack_free(record_packed, reclen); + dmu_buf_rele(dbp, FTAG); + fnvlist_free(nvl); +} + +/* + * Write out a history event. 
+ */ +int +spa_history_log(spa_t *spa, const char *msg) +{ + int err; + nvlist_t *nvl = fnvlist_alloc(); + + fnvlist_add_string(nvl, ZPOOL_HIST_CMD, msg); + err = spa_history_log_nvl(spa, nvl); + fnvlist_free(nvl); + return (err); +} + +int +spa_history_log_nvl(spa_t *spa, nvlist_t *nvl) +{ + int err = 0; + dmu_tx_t *tx; + nvlist_t *nvarg; + + if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) + return (EINVAL); + + if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY || !spa_writeable(spa)) + return (SET_ERROR(EINVAL)); + + tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) { + dmu_tx_abort(tx); + return (err); + } + + nvarg = fnvlist_dup(nvl); + if (spa_history_zone() != NULL) { + fnvlist_add_string(nvarg, ZPOOL_HIST_ZONE, + spa_history_zone()); + } + fnvlist_add_uint64(nvarg, ZPOOL_HIST_WHO, crgetruid(CRED())); + + /* Kick this off asynchronously; errors are ignored. */ + dsl_sync_task_nowait(spa_get_dsl(spa), spa_history_log_sync, + nvarg, 0, ZFS_SPACE_CHECK_NONE, tx); + dmu_tx_commit(tx); + + /* spa_history_log_sync will free nvl */ + return (err); + +} + +/* + * Read out the command history. + */ +int +spa_history_get(spa_t *spa, uint64_t *offp, uint64_t *len, char *buf) +{ + objset_t *mos = spa->spa_meta_objset; + dmu_buf_t *dbp; + uint64_t read_len, phys_read_off, phys_eof; + uint64_t leftover = 0; + spa_history_phys_t *shpp; + int err; + + /* + * If the command history doesn't exist (older pool), + * that's ok, just return ENOENT. + */ + if (!spa->spa_history) + return (SET_ERROR(ENOENT)); + + /* + * The history is logged asynchronously, so when they request + * the first chunk of history, make sure everything has been + * synced to disk so that we get it. + */ + if (*offp == 0 && spa_writeable(spa)) + txg_wait_synced(spa_get_dsl(spa), 0); + + if ((err = dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)) != 0) + return (err); + shpp = dbp->db_data; + +#ifdef ZFS_DEBUG + { + dmu_object_info_t doi; + dmu_object_info_from_db(dbp, &doi); + ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_SPA_HISTORY_OFFSETS); + } +#endif + + mutex_enter(&spa->spa_history_lock); + phys_eof = spa_history_log_to_phys(shpp->sh_eof, shpp); + + if (*offp < shpp->sh_pool_create_len) { + /* read in just the zpool create history */ + phys_read_off = *offp; + read_len = MIN(*len, shpp->sh_pool_create_len - + phys_read_off); + } else { + /* + * Need to reset passed in offset to BOF if the passed in + * offset has since been overwritten. + */ + *offp = MAX(*offp, shpp->sh_bof); + phys_read_off = spa_history_log_to_phys(*offp, shpp); + + /* + * Read up to the minimum of what the user passed down or + * the EOF (physical or logical). If we hit physical EOF, + * use 'leftover' to read from the physical BOF. 
+ */ + if (phys_read_off <= phys_eof) { + read_len = MIN(*len, phys_eof - phys_read_off); + } else { + read_len = MIN(*len, + shpp->sh_phys_max_off - phys_read_off); + if (phys_read_off + *len > shpp->sh_phys_max_off) { + leftover = MIN(*len - read_len, + phys_eof - shpp->sh_pool_create_len); + } + } + } + + /* offset for consumer to use next */ + *offp += read_len + leftover; + + /* tell the consumer how much you actually read */ + *len = read_len + leftover; + + if (read_len == 0) { + mutex_exit(&spa->spa_history_lock); + dmu_buf_rele(dbp, FTAG); + return (0); + } + + err = dmu_read(mos, spa->spa_history, phys_read_off, read_len, buf, + DMU_READ_PREFETCH); + if (leftover && err == 0) { + err = dmu_read(mos, spa->spa_history, shpp->sh_pool_create_len, + leftover, buf + read_len, DMU_READ_PREFETCH); + } + mutex_exit(&spa->spa_history_lock); + + dmu_buf_rele(dbp, FTAG); + return (err); +} + +/* + * The nvlist will be consumed by this call. + */ +static void +log_internal(nvlist_t *nvl, const char *operation, spa_t *spa, + dmu_tx_t *tx, const char *fmt, va_list adx) +{ + char *msg; + va_list adx2; + + /* + * If this is part of creating a pool, not everything is + * initialized yet, so don't bother logging the internal events. + * Likewise if the pool is not writeable. + */ + if (tx->tx_txg == TXG_INITIAL || !spa_writeable(spa)) { + fnvlist_free(nvl); + return; + } + + va_copy(adx2, adx); + + msg = kmem_alloc(vsnprintf(NULL, 0, fmt, adx) + 1, KM_SLEEP); + (void) vsprintf(msg, fmt, adx2); + fnvlist_add_string(nvl, ZPOOL_HIST_INT_STR, msg); + strfree(msg); + + va_end(adx2); + + fnvlist_add_string(nvl, ZPOOL_HIST_INT_NAME, operation); + fnvlist_add_uint64(nvl, ZPOOL_HIST_TXG, tx->tx_txg); + + if (dmu_tx_is_syncing(tx)) { + spa_history_log_sync(nvl, tx); + } else { + dsl_sync_task_nowait(spa_get_dsl(spa), + spa_history_log_sync, nvl, 0, ZFS_SPACE_CHECK_NONE, tx); + } + /* spa_history_log_sync() will free nvl */ +} + +void +spa_history_log_internal(spa_t *spa, const char *operation, + dmu_tx_t *tx, const char *fmt, ...) +{ + dmu_tx_t *htx = tx; + va_list adx; + + /* create a tx if we didn't get one */ + if (tx == NULL) { + htx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); + if (dmu_tx_assign(htx, TXG_WAIT) != 0) { + dmu_tx_abort(htx); + return; + } + } + + va_start(adx, fmt); + log_internal(fnvlist_alloc(), operation, spa, htx, fmt, adx); + va_end(adx); + + /* if we didn't get a tx from the caller, commit the one we made */ + if (tx == NULL) + dmu_tx_commit(htx); +} + +void +spa_history_log_internal_ds(dsl_dataset_t *ds, const char *operation, + dmu_tx_t *tx, const char *fmt, ...) +{ + va_list adx; + char namebuf[ZFS_MAX_DATASET_NAME_LEN]; + nvlist_t *nvl = fnvlist_alloc(); + + ASSERT(tx != NULL); + + dsl_dataset_name(ds, namebuf); + fnvlist_add_string(nvl, ZPOOL_HIST_DSNAME, namebuf); + fnvlist_add_uint64(nvl, ZPOOL_HIST_DSID, ds->ds_object); + + va_start(adx, fmt); + log_internal(nvl, operation, dsl_dataset_get_spa(ds), tx, fmt, adx); + va_end(adx); +} + +void +spa_history_log_internal_dd(dsl_dir_t *dd, const char *operation, + dmu_tx_t *tx, const char *fmt, ...) 
+{ + va_list adx; + char namebuf[ZFS_MAX_DATASET_NAME_LEN]; + nvlist_t *nvl = fnvlist_alloc(); + + ASSERT(tx != NULL); + + dsl_dir_name(dd, namebuf); + fnvlist_add_string(nvl, ZPOOL_HIST_DSNAME, namebuf); + fnvlist_add_uint64(nvl, ZPOOL_HIST_DSID, + dsl_dir_phys(dd)->dd_head_dataset_obj); + + va_start(adx, fmt); + log_internal(nvl, operation, dd->dd_pool->dp_spa, tx, fmt, adx); + va_end(adx); +} + +void +spa_history_log_version(spa_t *spa, const char *operation) +{ + spa_history_log_internal(spa, operation, NULL, + "pool version %llu; software version %llu/%llu; uts %s %s %s %s", + (u_longlong_t)spa_version(spa), SPA_VERSION, ZPL_VERSION, + utsname.nodename, utsname.release, utsname.version, + utsname.machine); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c new file mode 100644 index 000000000000..a5816ed180f2 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c @@ -0,0 +1,2427 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright 2015 Nexenta Systems, Inc. All rights reserved. + * Copyright 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright 2013 Saso Kiselkov. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + * Copyright (c) 2017 Datto Inc. 
+ */ + +#include <sys/zfs_context.h> +#include <sys/spa_impl.h> +#include <sys/spa_boot.h> +#include <sys/zio.h> +#include <sys/zio_checksum.h> +#include <sys/zio_compress.h> +#include <sys/dmu.h> +#include <sys/dmu_tx.h> +#include <sys/zap.h> +#include <sys/zil.h> +#include <sys/vdev_impl.h> +#include <sys/vdev_file.h> +#include <sys/vdev_initialize.h> +#include <sys/metaslab.h> +#include <sys/uberblock_impl.h> +#include <sys/txg.h> +#include <sys/avl.h> +#include <sys/unique.h> +#include <sys/dsl_pool.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_prop.h> +#include <sys/dsl_scan.h> +#include <sys/fs/zfs.h> +#include <sys/metaslab_impl.h> +#include <sys/arc.h> +#include <sys/ddt.h> +#include "zfs_prop.h" +#include <sys/zfeature.h> + +#if defined(__FreeBSD__) && defined(_KERNEL) +#include <sys/types.h> +#include <sys/sysctl.h> +#endif + +/* + * SPA locking + * + * There are four basic locks for managing spa_t structures: + * + * spa_namespace_lock (global mutex) + * + * This lock must be acquired to do any of the following: + * + * - Lookup a spa_t by name + * - Add or remove a spa_t from the namespace + * - Increase spa_refcount from non-zero + * - Check if spa_refcount is zero + * - Rename a spa_t + * - add/remove/attach/detach devices + * - Held for the duration of create/destroy/import/export + * + * It does not need to handle recursion. A create or destroy may + * reference objects (files or zvols) in other pools, but by + * definition they must have an existing reference, and will never need + * to lookup a spa_t by name. + * + * spa_refcount (per-spa refcount_t protected by mutex) + * + * This reference count keep track of any active users of the spa_t. The + * spa_t cannot be destroyed or freed while this is non-zero. Internally, + * the refcount is never really 'zero' - opening a pool implicitly keeps + * some references in the DMU. Internally we check against spa_minref, but + * present the image of a zero/non-zero value to consumers. + * + * spa_config_lock[] (per-spa array of rwlocks) + * + * This protects the spa_t from config changes, and must be held in + * the following circumstances: + * + * - RW_READER to perform I/O to the spa + * - RW_WRITER to change the vdev config + * + * The locking order is fairly straightforward: + * + * spa_namespace_lock -> spa_refcount + * + * The namespace lock must be acquired to increase the refcount from 0 + * or to check if it is zero. + * + * spa_refcount -> spa_config_lock[] + * + * There must be at least one valid reference on the spa_t to acquire + * the config lock. + * + * spa_namespace_lock -> spa_config_lock[] + * + * The namespace lock must always be taken before the config lock. + * + * + * The spa_namespace_lock can be acquired directly and is globally visible. + * + * The namespace is manipulated using the following functions, all of which + * require the spa_namespace_lock to be held. + * + * spa_lookup() Lookup a spa_t by name. + * + * spa_add() Create a new spa_t in the namespace. + * + * spa_remove() Remove a spa_t from the namespace. This also + * frees up any memory associated with the spa_t. + * + * spa_next() Returns the next spa_t in the system, or the + * first if NULL is passed. + * + * spa_evict_all() Shutdown and remove all spa_t structures in + * the system. + * + * spa_guid_exists() Determine whether a pool/device guid exists. + * + * The spa_refcount is manipulated using the following functions: + * + * spa_open_ref() Adds a reference to the given spa_t. 
+ *				called with spa_namespace_lock held if the
+ *				refcount is currently zero.
+ *
+ *	spa_close()		Remove a reference from the spa_t. This will
+ *				not free the spa_t or remove it from the
+ *				namespace. No locking is required.
+ *
+ *	spa_refcount_zero()	Returns true if the refcount is currently
+ *				zero. Must be called with spa_namespace_lock
+ *				held.
+ *
+ * The spa_config_lock[] is an array of rwlocks, ordered as follows:
+ * SCL_CONFIG > SCL_STATE > SCL_ALLOC > SCL_ZIO > SCL_FREE > SCL_VDEV.
+ * spa_config_lock[] is manipulated with spa_config_{enter,exit,held}().
+ *
+ * To read the configuration, it suffices to hold one of these locks as reader.
+ * To modify the configuration, you must hold all locks as writer. To modify
+ * vdev state without altering the vdev tree's topology (e.g. online/offline),
+ * you must hold SCL_STATE and SCL_ZIO as writer.
+ *
+ * We use these distinct config locks to avoid recursive lock entry.
+ * For example, spa_sync() (which holds SCL_CONFIG as reader) induces
+ * block allocations (SCL_ALLOC), which may require reading space maps
+ * from disk (dmu_read() -> zio_read() -> SCL_ZIO).
+ *
+ * The spa config locks cannot be normal rwlocks because we need the
+ * ability to hand off ownership. For example, SCL_ZIO is acquired
+ * by the issuing thread and later released by an interrupt thread.
+ * They do, however, obey the usual write-wanted semantics to prevent
+ * writer (i.e. system administrator) starvation.
+ *
+ * The lock acquisition rules are as follows:
+ *
+ * SCL_CONFIG
+ *	Protects changes to the vdev tree topology, such as vdev
+ *	add/remove/attach/detach. Protects the dirty config list
+ *	(spa_config_dirty_list) and the set of spares and l2arc devices.
+ *
+ * SCL_STATE
+ *	Protects changes to pool state and vdev state, such as vdev
+ *	online/offline/fault/degrade/clear. Protects the dirty state list
+ *	(spa_state_dirty_list) and global pool state (spa_state).
+ *
+ * SCL_ALLOC
+ *	Protects changes to metaslab groups and classes.
+ *	Held as reader by metaslab_alloc() and metaslab_claim().
+ *
+ * SCL_ZIO
+ *	Held by bp-level zios (those which have no io_vd upon entry)
+ *	to prevent changes to the vdev tree. The bp-level zio implicitly
+ *	protects all of its vdev child zios, which do not hold SCL_ZIO.
+ *
+ * SCL_FREE
+ *	Protects changes to metaslab groups and classes.
+ *	Held as reader by metaslab_free(). SCL_FREE is distinct from
+ *	SCL_ALLOC, and lower than SCL_ZIO, so that we can safely free
+ *	blocks in zio_done() while another i/o that holds either
+ *	SCL_ALLOC or SCL_ZIO is waiting for this i/o to complete.
+ *
+ * SCL_VDEV
+ *	Held as reader to prevent changes to the vdev tree during trivial
+ *	inquiries such as bp_get_dsize(). SCL_VDEV is distinct from the
+ *	other locks, and lower than all of them, to ensure that it's safe
+ *	to acquire regardless of caller context.
+ *
+ * In addition, the following rules apply:
+ *
+ * (a)	spa_props_lock protects pool properties, spa_config and spa_config_list.
+ *	The lock ordering is SCL_CONFIG > spa_props_lock.
+ *
+ * (b)	I/O operations on leaf vdevs. For any zio operation that takes
+ *	an explicit vdev_t argument -- such as zio_ioctl(), zio_read_phys(),
+ *	or zio_write_phys() -- the caller must ensure that the config cannot
+ *	change in the interim, and that the vdev cannot be reopened.
+ *	SCL_STATE as reader suffices for both.
+ *
+ * The vdev configuration is protected by spa_vdev_enter() / spa_vdev_exit().
+ * + * spa_vdev_enter() Acquire the namespace lock and the config lock + * for writing. + * + * spa_vdev_exit() Release the config lock, wait for all I/O + * to complete, sync the updated configs to the + * cache, and release the namespace lock. + * + * vdev state is protected by spa_vdev_state_enter() / spa_vdev_state_exit(). + * Like spa_vdev_enter/exit, these are convenience wrappers -- the actual + * locking is, always, based on spa_namespace_lock and spa_config_lock[]. + * + * spa_rename() is also implemented within this file since it requires + * manipulation of the namespace. + */ + +static avl_tree_t spa_namespace_avl; +kmutex_t spa_namespace_lock; +static kcondvar_t spa_namespace_cv; +static int spa_active_count; +int spa_max_replication_override = SPA_DVAS_PER_BP; + +static kmutex_t spa_spare_lock; +static avl_tree_t spa_spare_avl; +static kmutex_t spa_l2cache_lock; +static avl_tree_t spa_l2cache_avl; + +kmem_cache_t *spa_buffer_pool; +int spa_mode_global; + +#ifdef ZFS_DEBUG +/* + * Everything except dprintf, spa, and indirect_remap is on by default + * in debug builds. + */ +int zfs_flags = ~(ZFS_DEBUG_DPRINTF | ZFS_DEBUG_INDIRECT_REMAP); +#else +int zfs_flags = 0; +#endif + +/* + * zfs_recover can be set to nonzero to attempt to recover from + * otherwise-fatal errors, typically caused by on-disk corruption. When + * set, calls to zfs_panic_recover() will turn into warning messages. + * This should only be used as a last resort, as it typically results + * in leaked space, or worse. + */ +boolean_t zfs_recover = B_FALSE; + +/* + * If destroy encounters an EIO while reading metadata (e.g. indirect + * blocks), space referenced by the missing metadata can not be freed. + * Normally this causes the background destroy to become "stalled", as + * it is unable to make forward progress. While in this stalled state, + * all remaining space to free from the error-encountering filesystem is + * "temporarily leaked". Set this flag to cause it to ignore the EIO, + * permanently leak the space from indirect blocks that can not be read, + * and continue to free everything else that it can. + * + * The default, "stalling" behavior is useful if the storage partially + * fails (i.e. some but not all i/os fail), and then later recovers. In + * this case, we will be able to continue pool operations while it is + * partially failed, and when it recovers, we can continue to free the + * space, with no leaks. However, note that this case is actually + * fairly rare. + * + * Typically pools either (a) fail completely (but perhaps temporarily, + * e.g. a top-level vdev going offline), or (b) have localized, + * permanent errors (e.g. disk returns the wrong data due to bit flip or + * firmware bug). In case (a), this setting does not matter because the + * pool will be suspended and the sync thread will not be able to make + * forward progress regardless. In case (b), because the error is + * permanent, the best we can do is leak the minimum amount of space, + * which is what setting this flag will do. Therefore, it is reasonable + * for this flag to normally be set, but we chose the more conservative + * approach of not setting it, so that there is no possibility of + * leaking space in the "partial temporary" failure case. + */ +boolean_t zfs_free_leak_on_eio = B_FALSE; + +/* + * Expiration time in milliseconds. This value has two meanings. First it is + * used to determine when the spa_deadman() logic should fire. 
By default the + * spa_deadman() will fire if spa_sync() has not completed in 1000 seconds. + * Secondly, the value determines if an I/O is considered "hung". Any I/O that + * has not completed in zfs_deadman_synctime_ms is considered "hung" resulting + * in a system panic. + */ +uint64_t zfs_deadman_synctime_ms = 1000000ULL; + +/* + * Check time in milliseconds. This defines the frequency at which we check + * for hung I/O. + */ +uint64_t zfs_deadman_checktime_ms = 5000ULL; + +/* + * Default value of -1 for zfs_deadman_enabled is resolved in + * zfs_deadman_init() + */ +int zfs_deadman_enabled = -1; + +/* + * The worst case is single-sector max-parity RAID-Z blocks, in which + * case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1) + * times the size; so just assume that. Add to this the fact that + * we can have up to 3 DVAs per bp, and one more factor of 2 because + * the block may be dittoed with up to 3 DVAs by ddt_sync(). All together, + * the worst case is: + * (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2 == 24 + */ +int spa_asize_inflation = 24; + +#if defined(__FreeBSD__) && defined(_KERNEL) +SYSCTL_DECL(_vfs_zfs); +SYSCTL_INT(_vfs_zfs, OID_AUTO, recover, CTLFLAG_RWTUN, &zfs_recover, 0, + "Try to recover from otherwise-fatal errors."); + +static int +sysctl_vfs_zfs_debug_flags(SYSCTL_HANDLER_ARGS) +{ + int err, val; + + val = zfs_flags; + err = sysctl_handle_int(oidp, &val, 0, req); + if (err != 0 || req->newptr == NULL) + return (err); + + /* + * ZFS_DEBUG_MODIFY must be enabled prior to boot so all + * arc buffers in the system have the necessary additional + * checksum data. However, it is safe to disable at any + * time. + */ + if (!(zfs_flags & ZFS_DEBUG_MODIFY)) + val &= ~ZFS_DEBUG_MODIFY; + zfs_flags = val; + + return (0); +} + +SYSCTL_PROC(_vfs_zfs, OID_AUTO, debugflags, + CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, 0, sizeof(int), + sysctl_vfs_zfs_debug_flags, "IU", "Debug flags for ZFS testing."); + +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, deadman_synctime_ms, CTLFLAG_RWTUN, + &zfs_deadman_synctime_ms, 0, + "Stalled ZFS I/O expiration time in milliseconds"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, deadman_checktime_ms, CTLFLAG_RWTUN, + &zfs_deadman_checktime_ms, 0, + "Period of checks for stalled ZFS I/O in milliseconds"); +SYSCTL_INT(_vfs_zfs, OID_AUTO, deadman_enabled, CTLFLAG_RWTUN, + &zfs_deadman_enabled, 0, "Kernel panic on stalled ZFS I/O"); +SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_asize_inflation, CTLFLAG_RWTUN, + &spa_asize_inflation, 0, "Worst case inflation factor for single sector writes"); +#endif + +#ifndef illumos +#ifdef _KERNEL +static void +zfs_deadman_init() +{ + /* + * If we are not i386 or amd64 or in a virtual machine, + * disable ZFS deadman thread by default + */ + if (zfs_deadman_enabled == -1) { +#if defined(__amd64__) || defined(__i386__) + zfs_deadman_enabled = (vm_guest == VM_GUEST_NO) ? 1 : 0; +#else + zfs_deadman_enabled = 0; +#endif + } +} +#endif /* _KERNEL */ +#endif /* !illumos */ + +/* + * Normally, we don't allow the last 3.2% (1/(2^spa_slop_shift)) of space in + * the pool to be consumed. This ensures that we don't run the pool + * completely out of space, due to unaccounted changes (e.g. to the MOS). + * It also limits the worst-case time to allocate space. If we have + * less than this amount of free space, most ZPL operations (e.g. write, + * create) will return ENOSPC. + * + * Certain operations (e.g. file removal, most administrative actions) can + * use half the slop space. 
They will only return ENOSPC if less than half
+ * the slop space is free. Typically, once the pool has less than the slop
+ * space free, the user will use these operations to free up space in the pool.
+ * These are the operations that call dsl_pool_adjustedsize() with the netfree
+ * argument set to TRUE.
+ *
+ * Operations that are almost guaranteed to free up space in the absence of
+ * a pool checkpoint can use up to three quarters of the slop space
+ * (e.g. zfs destroy).
+ *
+ * A very restricted set of operations is always permitted, regardless of
+ * the amount of free space. These are the operations that call
+ * dsl_sync_task(ZFS_SPACE_CHECK_NONE). If these operations result in a net
+ * increase in the amount of space used, it is possible to run the pool
+ * completely out of space, causing it to be permanently read-only.
+ *
+ * Note that on very small pools, the slop space will be larger than
+ * 3.2%, in an effort to have it be at least spa_min_slop (128MB),
+ * but we never allow it to be more than half the pool size.
+ *
+ * See also the comments in zfs_space_check_t.
+ */
+int spa_slop_shift = 5;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_slop_shift, CTLFLAG_RWTUN,
+    &spa_slop_shift, 0,
+    "Shift value of reserved space (1/(2^spa_slop_shift)).");
+uint64_t spa_min_slop = 128 * 1024 * 1024;
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, spa_min_slop, CTLFLAG_RWTUN,
+    &spa_min_slop, 0,
+    "Minimal value of reserved space");
+
+int spa_allocators = 4;
+
+SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_allocators, CTLFLAG_RWTUN,
+    &spa_allocators, 0,
+    "Number of allocators per metaslab group");
+
+/*PRINTFLIKE2*/
+void
+spa_load_failed(spa_t *spa, const char *fmt, ...)
+{
+	va_list adx;
+	char buf[256];
+
+	va_start(adx, fmt);
+	(void) vsnprintf(buf, sizeof (buf), fmt, adx);
+	va_end(adx);
+
+	zfs_dbgmsg("spa_load(%s, config %s): FAILED: %s", spa->spa_name,
+	    spa->spa_trust_config ? "trusted" : "untrusted", buf);
+}
+
+/*PRINTFLIKE2*/
+void
+spa_load_note(spa_t *spa, const char *fmt, ...)
+{
+	va_list adx;
+	char buf[256];
+
+	va_start(adx, fmt);
+	(void) vsnprintf(buf, sizeof (buf), fmt, adx);
+	va_end(adx);
+
+	zfs_dbgmsg("spa_load(%s, config %s): %s", spa->spa_name,
+	    spa->spa_trust_config ?
"trusted" : "untrusted", buf); +} + +/* + * ========================================================================== + * SPA config locking + * ========================================================================== + */ +static void +spa_config_lock_init(spa_t *spa) +{ + for (int i = 0; i < SCL_LOCKS; i++) { + spa_config_lock_t *scl = &spa->spa_config_lock[i]; + mutex_init(&scl->scl_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&scl->scl_cv, NULL, CV_DEFAULT, NULL); + refcount_create_untracked(&scl->scl_count); + scl->scl_writer = NULL; + scl->scl_write_wanted = 0; + } +} + +static void +spa_config_lock_destroy(spa_t *spa) +{ + for (int i = 0; i < SCL_LOCKS; i++) { + spa_config_lock_t *scl = &spa->spa_config_lock[i]; + mutex_destroy(&scl->scl_lock); + cv_destroy(&scl->scl_cv); + refcount_destroy(&scl->scl_count); + ASSERT(scl->scl_writer == NULL); + ASSERT(scl->scl_write_wanted == 0); + } +} + +int +spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw) +{ + for (int i = 0; i < SCL_LOCKS; i++) { + spa_config_lock_t *scl = &spa->spa_config_lock[i]; + if (!(locks & (1 << i))) + continue; + mutex_enter(&scl->scl_lock); + if (rw == RW_READER) { + if (scl->scl_writer || scl->scl_write_wanted) { + mutex_exit(&scl->scl_lock); + spa_config_exit(spa, locks & ((1 << i) - 1), + tag); + return (0); + } + } else { + ASSERT(scl->scl_writer != curthread); + if (!refcount_is_zero(&scl->scl_count)) { + mutex_exit(&scl->scl_lock); + spa_config_exit(spa, locks & ((1 << i) - 1), + tag); + return (0); + } + scl->scl_writer = curthread; + } + (void) refcount_add(&scl->scl_count, tag); + mutex_exit(&scl->scl_lock); + } + return (1); +} + +void +spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw) +{ + int wlocks_held = 0; + + ASSERT3U(SCL_LOCKS, <, sizeof (wlocks_held) * NBBY); + + for (int i = 0; i < SCL_LOCKS; i++) { + spa_config_lock_t *scl = &spa->spa_config_lock[i]; + if (scl->scl_writer == curthread) + wlocks_held |= (1 << i); + if (!(locks & (1 << i))) + continue; + mutex_enter(&scl->scl_lock); + if (rw == RW_READER) { + while (scl->scl_writer || scl->scl_write_wanted) { + cv_wait(&scl->scl_cv, &scl->scl_lock); + } + } else { + ASSERT(scl->scl_writer != curthread); + while (!refcount_is_zero(&scl->scl_count)) { + scl->scl_write_wanted++; + cv_wait(&scl->scl_cv, &scl->scl_lock); + scl->scl_write_wanted--; + } + scl->scl_writer = curthread; + } + (void) refcount_add(&scl->scl_count, tag); + mutex_exit(&scl->scl_lock); + } + ASSERT3U(wlocks_held, <=, locks); +} + +void +spa_config_exit(spa_t *spa, int locks, void *tag) +{ + for (int i = SCL_LOCKS - 1; i >= 0; i--) { + spa_config_lock_t *scl = &spa->spa_config_lock[i]; + if (!(locks & (1 << i))) + continue; + mutex_enter(&scl->scl_lock); + ASSERT(!refcount_is_zero(&scl->scl_count)); + if (refcount_remove(&scl->scl_count, tag) == 0) { + ASSERT(scl->scl_writer == NULL || + scl->scl_writer == curthread); + scl->scl_writer = NULL; /* OK in either case */ + cv_broadcast(&scl->scl_cv); + } + mutex_exit(&scl->scl_lock); + } +} + +int +spa_config_held(spa_t *spa, int locks, krw_t rw) +{ + int locks_held = 0; + + for (int i = 0; i < SCL_LOCKS; i++) { + spa_config_lock_t *scl = &spa->spa_config_lock[i]; + if (!(locks & (1 << i))) + continue; + if ((rw == RW_READER && !refcount_is_zero(&scl->scl_count)) || + (rw == RW_WRITER && scl->scl_writer == curthread)) + locks_held |= 1 << i; + } + + return (locks_held); +} + +/* + * ========================================================================== + * SPA namespace functions + * 
========================================================================== + */ + +/* + * Lookup the named spa_t in the AVL tree. The spa_namespace_lock must be held. + * Returns NULL if no matching spa_t is found. + */ +spa_t * +spa_lookup(const char *name) +{ + static spa_t search; /* spa_t is large; don't allocate on stack */ + spa_t *spa; + avl_index_t where; + char *cp; + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + (void) strlcpy(search.spa_name, name, sizeof (search.spa_name)); + + /* + * If it's a full dataset name, figure out the pool name and + * just use that. + */ + cp = strpbrk(search.spa_name, "/@#"); + if (cp != NULL) + *cp = '\0'; + + spa = avl_find(&spa_namespace_avl, &search, &where); + + return (spa); +} + +/* + * Fires when spa_sync has not completed within zfs_deadman_synctime_ms. + * If the zfs_deadman_enabled flag is set then it inspects all vdev queues + * looking for potentially hung I/Os. + */ +static void +spa_deadman(void *arg, int pending) +{ + spa_t *spa = arg; + + /* + * Disable the deadman timer if the pool is suspended. + */ + if (spa_suspended(spa)) { +#ifdef illumos + VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY)); +#else + /* Nothing. just don't schedule any future callouts. */ +#endif + return; + } + + zfs_dbgmsg("slow spa_sync: started %llu seconds ago, calls %llu", + (gethrtime() - spa->spa_sync_starttime) / NANOSEC, + ++spa->spa_deadman_calls); + if (zfs_deadman_enabled) + vdev_deadman(spa->spa_root_vdev); +#ifdef __FreeBSD__ +#ifdef _KERNEL + callout_schedule(&spa->spa_deadman_cycid, + hz * zfs_deadman_checktime_ms / MILLISEC); +#endif +#endif +} + +#if defined(__FreeBSD__) && defined(_KERNEL) +static void +spa_deadman_timeout(void *arg) +{ + spa_t *spa = arg; + + taskqueue_enqueue(taskqueue_thread, &spa->spa_deadman_task); +} +#endif + +/* + * Create an uninitialized spa_t with the given name. Requires + * spa_namespace_lock. The caller must ensure that the spa_t doesn't already + * exist by calling spa_lookup() first. 
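+ *
+ * A minimal caller sketch (editor's illustration; "name" and "config"
+ * stand in for values the caller already has):
+ *
+ *	mutex_enter(&spa_namespace_lock);
+ *	if (spa_lookup(name) == NULL)
+ *		spa = spa_add(name, config, NULL);
+ *	mutex_exit(&spa_namespace_lock);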
+ */
+spa_t *
+spa_add(const char *name, nvlist_t *config, const char *altroot)
+{
+	spa_t *spa;
+	spa_config_dirent_t *dp;
+#ifdef illumos
+	cyc_handler_t hdlr;
+	cyc_time_t when;
+#endif
+
+	ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+	spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP);
+
+	mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&spa->spa_evicting_os_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&spa->spa_proc_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&spa->spa_cksum_tmpls_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&spa->spa_feat_stats_lock, NULL, MUTEX_DEFAULT, NULL);
+
+	cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL);
+	cv_init(&spa->spa_evicting_os_cv, NULL, CV_DEFAULT, NULL);
+	cv_init(&spa->spa_proc_cv, NULL, CV_DEFAULT, NULL);
+	cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL);
+	cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL);
+
+	for (int t = 0; t < TXG_SIZE; t++)
+		bplist_create(&spa->spa_free_bplist[t]);
+
+	(void) strlcpy(spa->spa_name, name, sizeof (spa->spa_name));
+	spa->spa_state = POOL_STATE_UNINITIALIZED;
+	spa->spa_freeze_txg = UINT64_MAX;
+	spa->spa_final_txg = UINT64_MAX;
+	spa->spa_load_max_txg = UINT64_MAX;
+	spa->spa_proc = &p0;
+	spa->spa_proc_state = SPA_PROC_NONE;
+	spa->spa_trust_config = B_TRUE;
+
+#ifdef illumos
+	hdlr.cyh_func = spa_deadman;
+	hdlr.cyh_arg = spa;
+	hdlr.cyh_level = CY_LOW_LEVEL;
+#endif
+
+	spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime_ms);
+
+#ifdef illumos
+	/*
+	 * This determines how often we need to check for hung I/Os after
+	 * the cyclic has already fired. Since checking for hung I/Os is
+	 * an expensive operation we don't want to check too frequently.
+	 * Instead wait for 5 seconds before checking again.
+	 */
+	when.cyt_interval = MSEC2NSEC(zfs_deadman_checktime_ms);
+	when.cyt_when = CY_INFINITY;
+	mutex_enter(&cpu_lock);
+	spa->spa_deadman_cycid = cyclic_add(&hdlr, &when);
+	mutex_exit(&cpu_lock);
+#else	/* !illumos */
+#ifdef _KERNEL
+	/*
+	 * callout(9) does not provide a way to initialize a callout with
+	 * a function and an argument, so we use callout_reset() to schedule
+	 * the callout in the very distant future. Even if that event ever
+	 * fires, it should be okay as we won't have any active zios.
+	 * But normally spa_sync() will reschedule the callout with a proper
+	 * timeout.
+	 * callout(9) does not allow the callback function to sleep but
+	 * vdev_deadman() needs to acquire vq_lock and illumos mutexes are
+	 * emulated using sx(9). For this reason spa_deadman_timeout()
+	 * will schedule spa_deadman() as a task on a taskqueue that allows
+	 * sleeping.
+	 */
+	TASK_INIT(&spa->spa_deadman_task, 0, spa_deadman, spa);
+	callout_init(&spa->spa_deadman_cycid, 1);
+	callout_reset_sbt(&spa->spa_deadman_cycid, SBT_MAX, 0,
+	    spa_deadman_timeout, spa, 0);
+#endif
+#endif
+	refcount_create(&spa->spa_refcount);
+	spa_config_lock_init(spa);
+
+	avl_add(&spa_namespace_avl, spa);
+
+	/*
+	 * Set the alternate root, if there is one.
+ */ + if (altroot) { + spa->spa_root = spa_strdup(altroot); + spa_active_count++; + } + + spa->spa_alloc_count = spa_allocators; + spa->spa_alloc_locks = kmem_zalloc(spa->spa_alloc_count * + sizeof (kmutex_t), KM_SLEEP); + spa->spa_alloc_trees = kmem_zalloc(spa->spa_alloc_count * + sizeof (avl_tree_t), KM_SLEEP); + for (int i = 0; i < spa->spa_alloc_count; i++) { + mutex_init(&spa->spa_alloc_locks[i], NULL, MUTEX_DEFAULT, NULL); + avl_create(&spa->spa_alloc_trees[i], zio_bookmark_compare, + sizeof (zio_t), offsetof(zio_t, io_alloc_node)); + } + + /* + * Every pool starts with the default cachefile + */ + list_create(&spa->spa_config_list, sizeof (spa_config_dirent_t), + offsetof(spa_config_dirent_t, scd_link)); + + dp = kmem_zalloc(sizeof (spa_config_dirent_t), KM_SLEEP); + dp->scd_path = altroot ? NULL : spa_strdup(spa_config_path); + list_insert_head(&spa->spa_config_list, dp); + + VERIFY(nvlist_alloc(&spa->spa_load_info, NV_UNIQUE_NAME, + KM_SLEEP) == 0); + + if (config != NULL) { + nvlist_t *features; + + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ, + &features) == 0) { + VERIFY(nvlist_dup(features, &spa->spa_label_features, + 0) == 0); + } + + VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0); + } + + if (spa->spa_label_features == NULL) { + VERIFY(nvlist_alloc(&spa->spa_label_features, NV_UNIQUE_NAME, + KM_SLEEP) == 0); + } + + spa->spa_min_ashift = INT_MAX; + spa->spa_max_ashift = 0; + + /* + * As a pool is being created, treat all features as disabled by + * setting SPA_FEATURE_DISABLED for all entries in the feature + * refcount cache. + */ + for (int i = 0; i < SPA_FEATURES; i++) { + spa->spa_feat_refcount_cache[i] = SPA_FEATURE_DISABLED; + } + + return (spa); +} + +/* + * Removes a spa_t from the namespace, freeing up any memory used. Requires + * spa_namespace_lock. This is called only after the spa_t has been closed and + * deactivated. 
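+ *
+ * A hedged sketch of the expected call ordering (editor's illustration;
+ * the close/deactivate steps live elsewhere in the SPA code):
+ *
+ *	mutex_enter(&spa_namespace_lock);
+ *	if (spa_refcount_zero(spa)) {
+ *		... close and deactivate the pool ...
+ *		spa_remove(spa);	namespace entry and memory freed
+ *	}
+ *	mutex_exit(&spa_namespace_lock);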
+ */ +void +spa_remove(spa_t *spa) +{ + spa_config_dirent_t *dp; + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); + ASSERT3U(refcount_count(&spa->spa_refcount), ==, 0); + + nvlist_free(spa->spa_config_splitting); + + avl_remove(&spa_namespace_avl, spa); + cv_broadcast(&spa_namespace_cv); + + if (spa->spa_root) { + spa_strfree(spa->spa_root); + spa_active_count--; + } + + while ((dp = list_head(&spa->spa_config_list)) != NULL) { + list_remove(&spa->spa_config_list, dp); + if (dp->scd_path != NULL) + spa_strfree(dp->scd_path); + kmem_free(dp, sizeof (spa_config_dirent_t)); + } + + for (int i = 0; i < spa->spa_alloc_count; i++) { + avl_destroy(&spa->spa_alloc_trees[i]); + mutex_destroy(&spa->spa_alloc_locks[i]); + } + kmem_free(spa->spa_alloc_locks, spa->spa_alloc_count * + sizeof (kmutex_t)); + kmem_free(spa->spa_alloc_trees, spa->spa_alloc_count * + sizeof (avl_tree_t)); + + list_destroy(&spa->spa_config_list); + + nvlist_free(spa->spa_label_features); + nvlist_free(spa->spa_load_info); + nvlist_free(spa->spa_feat_stats); + spa_config_set(spa, NULL); + +#ifdef illumos + mutex_enter(&cpu_lock); + if (spa->spa_deadman_cycid != CYCLIC_NONE) + cyclic_remove(spa->spa_deadman_cycid); + mutex_exit(&cpu_lock); + spa->spa_deadman_cycid = CYCLIC_NONE; +#else /* !illumos */ +#ifdef _KERNEL + callout_drain(&spa->spa_deadman_cycid); + taskqueue_drain(taskqueue_thread, &spa->spa_deadman_task); +#endif +#endif + + refcount_destroy(&spa->spa_refcount); + + spa_config_lock_destroy(spa); + + for (int t = 0; t < TXG_SIZE; t++) + bplist_destroy(&spa->spa_free_bplist[t]); + + zio_checksum_templates_free(spa); + + cv_destroy(&spa->spa_async_cv); + cv_destroy(&spa->spa_evicting_os_cv); + cv_destroy(&spa->spa_proc_cv); + cv_destroy(&spa->spa_scrub_io_cv); + cv_destroy(&spa->spa_suspend_cv); + + mutex_destroy(&spa->spa_async_lock); + mutex_destroy(&spa->spa_errlist_lock); + mutex_destroy(&spa->spa_errlog_lock); + mutex_destroy(&spa->spa_evicting_os_lock); + mutex_destroy(&spa->spa_history_lock); + mutex_destroy(&spa->spa_proc_lock); + mutex_destroy(&spa->spa_props_lock); + mutex_destroy(&spa->spa_cksum_tmpls_lock); + mutex_destroy(&spa->spa_scrub_lock); + mutex_destroy(&spa->spa_suspend_lock); + mutex_destroy(&spa->spa_vdev_top_lock); + mutex_destroy(&spa->spa_feat_stats_lock); + + kmem_free(spa, sizeof (spa_t)); +} + +/* + * Given a pool, return the next pool in the namespace, or NULL if there is + * none. If 'prev' is NULL, return the first pool. + */ +spa_t * +spa_next(spa_t *prev) +{ + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + if (prev) + return (AVL_NEXT(&spa_namespace_avl, prev)); + else + return (avl_first(&spa_namespace_avl)); +} + +/* + * ========================================================================== + * SPA refcount functions + * ========================================================================== + */ + +/* + * Add a reference to the given spa_t. Must have at least one reference, or + * have the namespace lock held. + */ +void +spa_open_ref(spa_t *spa, void *tag) +{ + ASSERT(refcount_count(&spa->spa_refcount) >= spa->spa_minref || + MUTEX_HELD(&spa_namespace_lock)); + (void) refcount_add(&spa->spa_refcount, tag); +} + +/* + * Remove a reference to the given spa_t. Must have at least one reference, or + * have the namespace lock held. 
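+ *
+ * References taken with spa_open_ref() are dropped here; a minimal
+ * pairing sketch (editor's illustration, using the file's FTAG
+ * convention):
+ *
+ *	spa_open_ref(spa, FTAG);
+ *	... use the pool ...
+ *	spa_close(spa, FTAG);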
+ */ +void +spa_close(spa_t *spa, void *tag) +{ + ASSERT(refcount_count(&spa->spa_refcount) > spa->spa_minref || + MUTEX_HELD(&spa_namespace_lock)); + (void) refcount_remove(&spa->spa_refcount, tag); +} + +/* + * Remove a reference to the given spa_t held by a dsl dir that is + * being asynchronously released. Async releases occur from a taskq + * performing eviction of dsl datasets and dirs. The namespace lock + * isn't held and the hold by the object being evicted may contribute to + * spa_minref (e.g. dataset or directory released during pool export), + * so the asserts in spa_close() do not apply. + */ +void +spa_async_close(spa_t *spa, void *tag) +{ + (void) refcount_remove(&spa->spa_refcount, tag); +} + +/* + * Check to see if the spa refcount is zero. Must be called with + * spa_namespace_lock held. We really compare against spa_minref, which is the + * number of references acquired when opening a pool + */ +boolean_t +spa_refcount_zero(spa_t *spa) +{ + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + return (refcount_count(&spa->spa_refcount) == spa->spa_minref); +} + +/* + * ========================================================================== + * SPA spare and l2cache tracking + * ========================================================================== + */ + +/* + * Hot spares and cache devices are tracked using the same code below, + * for 'auxiliary' devices. + */ + +typedef struct spa_aux { + uint64_t aux_guid; + uint64_t aux_pool; + avl_node_t aux_avl; + int aux_count; +} spa_aux_t; + +static inline int +spa_aux_compare(const void *a, const void *b) +{ + const spa_aux_t *sa = (const spa_aux_t *)a; + const spa_aux_t *sb = (const spa_aux_t *)b; + + return (AVL_CMP(sa->aux_guid, sb->aux_guid)); +} + +void +spa_aux_add(vdev_t *vd, avl_tree_t *avl) +{ + avl_index_t where; + spa_aux_t search; + spa_aux_t *aux; + + search.aux_guid = vd->vdev_guid; + if ((aux = avl_find(avl, &search, &where)) != NULL) { + aux->aux_count++; + } else { + aux = kmem_zalloc(sizeof (spa_aux_t), KM_SLEEP); + aux->aux_guid = vd->vdev_guid; + aux->aux_count = 1; + avl_insert(avl, aux, where); + } +} + +void +spa_aux_remove(vdev_t *vd, avl_tree_t *avl) +{ + spa_aux_t search; + spa_aux_t *aux; + avl_index_t where; + + search.aux_guid = vd->vdev_guid; + aux = avl_find(avl, &search, &where); + + ASSERT(aux != NULL); + + if (--aux->aux_count == 0) { + avl_remove(avl, aux); + kmem_free(aux, sizeof (spa_aux_t)); + } else if (aux->aux_pool == spa_guid(vd->vdev_spa)) { + aux->aux_pool = 0ULL; + } +} + +boolean_t +spa_aux_exists(uint64_t guid, uint64_t *pool, int *refcnt, avl_tree_t *avl) +{ + spa_aux_t search, *found; + + search.aux_guid = guid; + found = avl_find(avl, &search, NULL); + + if (pool) { + if (found) + *pool = found->aux_pool; + else + *pool = 0ULL; + } + + if (refcnt) { + if (found) + *refcnt = found->aux_count; + else + *refcnt = 0; + } + + return (found != NULL); +} + +void +spa_aux_activate(vdev_t *vd, avl_tree_t *avl) +{ + spa_aux_t search, *found; + avl_index_t where; + + search.aux_guid = vd->vdev_guid; + found = avl_find(avl, &search, &where); + ASSERT(found != NULL); + ASSERT(found->aux_pool == 0ULL); + + found->aux_pool = spa_guid(vd->vdev_spa); +} + +/* + * Spares are tracked globally due to the following constraints: + * + * - A spare may be part of multiple pools. + * - A spare may be added to a pool even if it's actively in use within + * another pool. + * - A spare in use in any pool can only be the source of a replacement if + * the target is a spare in the same pool. 
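+ *
+ * As an editor's illustration of the reference counting described in the
+ * next paragraph, one physical spare shared by two pools is represented
+ * by two per-pool vdev_t's with the same guid and a single spa_aux_t
+ * (vd_a and vd_b are hypothetical):
+ *
+ *	spa_spare_add(vd_a);		aux_count 0 -> 1
+ *	spa_spare_add(vd_b);		same guid: aux_count 1 -> 2
+ *	spa_spare_remove(vd_a);		aux_count 2 -> 1, entry kept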
+ *
+ * We keep track of all spares on the system through the use of a reference
+ * counted AVL tree. When a vdev is added as a spare, or used as a replacement
+ * spare, we bump the reference count in the AVL tree. In addition, we set
+ * the 'vdev_isspare' member to indicate that the device is a spare (active or
+ * inactive). When a spare is made active (used to replace a device in the
+ * pool), we also keep track of which pool it's been made a part of.
+ *
+ * The 'spa_spare_lock' protects the AVL tree. These functions are normally
+ * called under the spa_namespace lock as part of vdev reconfiguration. The
+ * separate spare lock exists for the status query path, which does not need to
+ * be completely consistent with respect to other vdev configuration changes.
+ */
+
+static int
+spa_spare_compare(const void *a, const void *b)
+{
+	return (spa_aux_compare(a, b));
+}
+
+void
+spa_spare_add(vdev_t *vd)
+{
+	mutex_enter(&spa_spare_lock);
+	ASSERT(!vd->vdev_isspare);
+	spa_aux_add(vd, &spa_spare_avl);
+	vd->vdev_isspare = B_TRUE;
+	mutex_exit(&spa_spare_lock);
+}
+
+void
+spa_spare_remove(vdev_t *vd)
+{
+	mutex_enter(&spa_spare_lock);
+	ASSERT(vd->vdev_isspare);
+	spa_aux_remove(vd, &spa_spare_avl);
+	vd->vdev_isspare = B_FALSE;
+	mutex_exit(&spa_spare_lock);
+}
+
+boolean_t
+spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt)
+{
+	boolean_t found;
+
+	mutex_enter(&spa_spare_lock);
+	found = spa_aux_exists(guid, pool, refcnt, &spa_spare_avl);
+	mutex_exit(&spa_spare_lock);
+
+	return (found);
+}
+
+void
+spa_spare_activate(vdev_t *vd)
+{
+	mutex_enter(&spa_spare_lock);
+	ASSERT(vd->vdev_isspare);
+	spa_aux_activate(vd, &spa_spare_avl);
+	mutex_exit(&spa_spare_lock);
+}
+
+/*
+ * Level 2 ARC devices are tracked globally for the same reasons as spares.
+ * Cache devices currently only support one pool per cache device, and so
+ * for these devices the aux reference count is currently unused beyond 1.
+ */
+
+static int
+spa_l2cache_compare(const void *a, const void *b)
+{
+	return (spa_aux_compare(a, b));
+}
+
+void
+spa_l2cache_add(vdev_t *vd)
+{
+	mutex_enter(&spa_l2cache_lock);
+	ASSERT(!vd->vdev_isl2cache);
+	spa_aux_add(vd, &spa_l2cache_avl);
+	vd->vdev_isl2cache = B_TRUE;
+	mutex_exit(&spa_l2cache_lock);
+}
+
+void
+spa_l2cache_remove(vdev_t *vd)
+{
+	mutex_enter(&spa_l2cache_lock);
+	ASSERT(vd->vdev_isl2cache);
+	spa_aux_remove(vd, &spa_l2cache_avl);
+	vd->vdev_isl2cache = B_FALSE;
+	mutex_exit(&spa_l2cache_lock);
+}
+
+boolean_t
+spa_l2cache_exists(uint64_t guid, uint64_t *pool)
+{
+	boolean_t found;
+
+	mutex_enter(&spa_l2cache_lock);
+	found = spa_aux_exists(guid, pool, NULL, &spa_l2cache_avl);
+	mutex_exit(&spa_l2cache_lock);
+
+	return (found);
+}
+
+void
+spa_l2cache_activate(vdev_t *vd)
+{
+	mutex_enter(&spa_l2cache_lock);
+	ASSERT(vd->vdev_isl2cache);
+	spa_aux_activate(vd, &spa_l2cache_avl);
+	mutex_exit(&spa_l2cache_lock);
+}
+
+/*
+ * ==========================================================================
+ * SPA vdev locking
+ * ==========================================================================
+ */
+
+/*
+ * Lock the given spa_t for the purpose of adding or removing a vdev.
+ * Grabs the global spa_namespace_lock plus the spa config lock for writing.
+ * It returns the next transaction group for the spa_t.
+ */
+uint64_t
+spa_vdev_enter(spa_t *spa)
+{
+	mutex_enter(&spa->spa_vdev_top_lock);
+	mutex_enter(&spa_namespace_lock);
+	return (spa_vdev_config_enter(spa));
+}
+
+/*
+ * Internal implementation for spa_vdev_enter(). Used when a vdev
+ * operation requires multiple syncs (i.e. removing a device) while
+ * keeping the spa_namespace_lock held.
+ */
+uint64_t
+spa_vdev_config_enter(spa_t *spa)
+{
+	ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+	spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
+
+	return (spa_last_synced_txg(spa) + 1);
+}
+
+/*
+ * Used in combination with spa_vdev_config_enter() to allow the syncing
+ * of multiple transactions without releasing the spa_namespace_lock.
+ */
+void
+spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag)
+{
+	ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+	int config_changed = B_FALSE;
+
+	ASSERT(txg > spa_last_synced_txg(spa));
+
+	spa->spa_pending_vdev = NULL;
+
+	/*
+	 * Reassess the DTLs.
+	 */
+	vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE);
+
+	if (error == 0 && !list_is_empty(&spa->spa_config_dirty_list)) {
+		config_changed = B_TRUE;
+		spa->spa_config_generation++;
+	}
+
+	/*
+	 * Verify the metaslab classes.
+	 */
+	ASSERT(metaslab_class_validate(spa_normal_class(spa)) == 0);
+	ASSERT(metaslab_class_validate(spa_log_class(spa)) == 0);
+
+	spa_config_exit(spa, SCL_ALL, spa);
+
+	/*
+	 * Panic the system if the specified tag requires it. This
+	 * is useful for ensuring that configurations are updated
+	 * transactionally.
+	 */
+	if (zio_injection_enabled)
+		zio_handle_panic_injection(spa, tag, 0);
+
+	/*
+	 * Note: this txg_wait_synced() is important because it ensures
+	 * that there won't be more than one config change per txg.
+	 * This allows us to use the txg as the generation number.
+	 */
+	if (error == 0)
+		txg_wait_synced(spa->spa_dsl_pool, txg);
+
+	if (vd != NULL) {
+		ASSERT(!vd->vdev_detached || vd->vdev_dtl_sm == NULL);
+		if (vd->vdev_ops->vdev_op_leaf) {
+			mutex_enter(&vd->vdev_initialize_lock);
+			vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED);
+			mutex_exit(&vd->vdev_initialize_lock);
+		}
+
+		spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
+		vdev_free(vd);
+		spa_config_exit(spa, SCL_ALL, spa);
+	}
+
+	/*
+	 * If the config changed, update the config cache.
+	 */
+	if (config_changed)
+		spa_write_cachefile(spa, B_FALSE, B_TRUE);
+}
+
+/*
+ * Unlock the spa_t after adding or removing a vdev. Besides undoing the
+ * locking of spa_vdev_enter(), we also want to make sure the transactions
+ * have synced to disk, and then update the global configuration cache with
+ * the new information.
+ */
+int
+spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
+{
+	spa_vdev_config_exit(spa, vd, txg, error, FTAG);
+	mutex_exit(&spa_namespace_lock);
+	mutex_exit(&spa->spa_vdev_top_lock);
+
+	return (error);
+}
+
+/*
+ * Lock the given spa_t for the purpose of changing vdev state.
+ */
+void
+spa_vdev_state_enter(spa_t *spa, int oplocks)
+{
+	int locks = SCL_STATE_ALL | oplocks;
+
+	/*
+	 * Root pools may need to read from the underlying devfs filesystem
+	 * when opening up a vdev. Unfortunately if we're holding the
+	 * SCL_ZIO lock it will result in a deadlock when we try to issue
+	 * the read from the root filesystem. Instead we "prefetch"
+	 * the associated vnodes that we need prior to opening the
+	 * underlying devices and cache them so that we can prevent
+	 * any I/O when we are doing the actual open.
+ */ + if (spa_is_root(spa)) { + int low = locks & ~(SCL_ZIO - 1); + int high = locks & ~low; + + spa_config_enter(spa, high, spa, RW_WRITER); + vdev_hold(spa->spa_root_vdev); + spa_config_enter(spa, low, spa, RW_WRITER); + } else { + spa_config_enter(spa, locks, spa, RW_WRITER); + } + spa->spa_vdev_locks = locks; +} + +int +spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error) +{ + boolean_t config_changed = B_FALSE; + + if (vd != NULL || error == 0) + vdev_dtl_reassess(vd ? vd->vdev_top : spa->spa_root_vdev, + 0, 0, B_FALSE); + + if (vd != NULL) { + vdev_state_dirty(vd->vdev_top); + config_changed = B_TRUE; + spa->spa_config_generation++; + } + + if (spa_is_root(spa)) + vdev_rele(spa->spa_root_vdev); + + ASSERT3U(spa->spa_vdev_locks, >=, SCL_STATE_ALL); + spa_config_exit(spa, spa->spa_vdev_locks, spa); + + /* + * If anything changed, wait for it to sync. This ensures that, + * from the system administrator's perspective, zpool(1M) commands + * are synchronous. This is important for things like zpool offline: + * when the command completes, you expect no further I/O from ZFS. + */ + if (vd != NULL) + txg_wait_synced(spa->spa_dsl_pool, 0); + + /* + * If the config changed, update the config cache. + */ + if (config_changed) { + mutex_enter(&spa_namespace_lock); + spa_write_cachefile(spa, B_FALSE, B_TRUE); + mutex_exit(&spa_namespace_lock); + } + + return (error); +} + +/* + * ========================================================================== + * Miscellaneous functions + * ========================================================================== + */ + +void +spa_activate_mos_feature(spa_t *spa, const char *feature, dmu_tx_t *tx) +{ + if (!nvlist_exists(spa->spa_label_features, feature)) { + fnvlist_add_boolean(spa->spa_label_features, feature); + /* + * When we are creating the pool (tx_txg==TXG_INITIAL), we can't + * dirty the vdev config because lock SCL_CONFIG is not held. + * Thankfully, in this case we don't need to dirty the config + * because it will be written out anyway when we finish + * creating the pool. + */ + if (tx->tx_txg != TXG_INITIAL) + vdev_config_dirty(spa->spa_root_vdev); + } +} + +void +spa_deactivate_mos_feature(spa_t *spa, const char *feature) +{ + if (nvlist_remove_all(spa->spa_label_features, feature) == 0) + vdev_config_dirty(spa->spa_root_vdev); +} + +/* + * Rename a spa_t. + */ +int +spa_rename(const char *name, const char *newname) +{ + spa_t *spa; + int err; + + /* + * Lookup the spa_t and grab the config lock for writing. We need to + * actually open the pool so that we can sync out the necessary labels. + * It's OK to call spa_open() with the namespace lock held because we + * allow recursive calls for other reasons. + */ + mutex_enter(&spa_namespace_lock); + if ((err = spa_open(name, &spa, FTAG)) != 0) { + mutex_exit(&spa_namespace_lock); + return (err); + } + + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + + avl_remove(&spa_namespace_avl, spa); + (void) strlcpy(spa->spa_name, newname, sizeof (spa->spa_name)); + avl_add(&spa_namespace_avl, spa); + + /* + * Sync all labels to disk with the new names by marking the root vdev + * dirty and waiting for it to sync. It will pick up the new pool name + * during the sync. + */ + vdev_config_dirty(spa->spa_root_vdev); + + spa_config_exit(spa, SCL_ALL, FTAG); + + txg_wait_synced(spa->spa_dsl_pool, 0); + + /* + * Sync the updated config cache. 
+ */ + spa_write_cachefile(spa, B_FALSE, B_TRUE); + + spa_close(spa, FTAG); + + mutex_exit(&spa_namespace_lock); + + return (0); +} + +/* + * Return the spa_t associated with given pool_guid, if it exists. If + * device_guid is non-zero, determine whether the pool exists *and* contains + * a device with the specified device_guid. + */ +spa_t * +spa_by_guid(uint64_t pool_guid, uint64_t device_guid) +{ + spa_t *spa; + avl_tree_t *t = &spa_namespace_avl; + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + for (spa = avl_first(t); spa != NULL; spa = AVL_NEXT(t, spa)) { + if (spa->spa_state == POOL_STATE_UNINITIALIZED) + continue; + if (spa->spa_root_vdev == NULL) + continue; + if (spa_guid(spa) == pool_guid) { + if (device_guid == 0) + break; + + if (vdev_lookup_by_guid(spa->spa_root_vdev, + device_guid) != NULL) + break; + + /* + * Check any devices we may be in the process of adding. + */ + if (spa->spa_pending_vdev) { + if (vdev_lookup_by_guid(spa->spa_pending_vdev, + device_guid) != NULL) + break; + } + } + } + + return (spa); +} + +/* + * Determine whether a pool with the given pool_guid exists. + */ +boolean_t +spa_guid_exists(uint64_t pool_guid, uint64_t device_guid) +{ + return (spa_by_guid(pool_guid, device_guid) != NULL); +} + +char * +spa_strdup(const char *s) +{ + size_t len; + char *new; + + len = strlen(s); + new = kmem_alloc(len + 1, KM_SLEEP); + bcopy(s, new, len); + new[len] = '\0'; + + return (new); +} + +void +spa_strfree(char *s) +{ + kmem_free(s, strlen(s) + 1); +} + +uint64_t +spa_get_random(uint64_t range) +{ + uint64_t r; + + ASSERT(range != 0); + + (void) random_get_pseudo_bytes((void *)&r, sizeof (uint64_t)); + + return (r % range); +} + +uint64_t +spa_generate_guid(spa_t *spa) +{ + uint64_t guid = spa_get_random(-1ULL); + + if (spa != NULL) { + while (guid == 0 || spa_guid_exists(spa_guid(spa), guid)) + guid = spa_get_random(-1ULL); + } else { + while (guid == 0 || spa_guid_exists(guid, 0)) + guid = spa_get_random(-1ULL); + } + + return (guid); +} + +void +snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp) +{ + char type[256]; + char *checksum = NULL; + char *compress = NULL; + + if (bp != NULL) { + if (BP_GET_TYPE(bp) & DMU_OT_NEWTYPE) { + dmu_object_byteswap_t bswap = + DMU_OT_BYTESWAP(BP_GET_TYPE(bp)); + (void) snprintf(type, sizeof (type), "bswap %s %s", + DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) ? + "metadata" : "data", + dmu_ot_byteswap[bswap].ob_name); + } else { + (void) strlcpy(type, dmu_ot[BP_GET_TYPE(bp)].ot_name, + sizeof (type)); + } + if (!BP_IS_EMBEDDED(bp)) { + checksum = + zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name; + } + compress = zio_compress_table[BP_GET_COMPRESS(bp)].ci_name; + } + + SNPRINTF_BLKPTR(snprintf, ' ', buf, buflen, bp, type, checksum, + compress); +} + +void +spa_freeze(spa_t *spa) +{ + uint64_t freeze_txg = 0; + + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + if (spa->spa_freeze_txg == UINT64_MAX) { + freeze_txg = spa_last_synced_txg(spa) + TXG_SIZE; + spa->spa_freeze_txg = freeze_txg; + } + spa_config_exit(spa, SCL_ALL, FTAG); + if (freeze_txg != 0) + txg_wait_synced(spa_get_dsl(spa), freeze_txg); +} + +void +zfs_panic_recover(const char *fmt, ...) +{ + va_list adx; + + va_start(adx, fmt); + vcmn_err(zfs_recover ? CE_WARN : CE_PANIC, fmt, adx); + va_end(adx); +} + +/* + * This is a stripped-down version of strtoull, suitable only for converting + * lowercase hexadecimal numbers that don't overflow. 
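+ *
+ * For example (editor's illustration):
+ *
+ *	char *end;
+ *	zfs_strtonum("1a2b", &end);	returns 0x1a2b, end -> ""
+ *	zfs_strtonum("10-3", &end);	returns 0x10, end -> "-3"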
+ */ +uint64_t +zfs_strtonum(const char *str, char **nptr) +{ + uint64_t val = 0; + char c; + int digit; + + while ((c = *str) != '\0') { + if (c >= '0' && c <= '9') + digit = c - '0'; + else if (c >= 'a' && c <= 'f') + digit = 10 + c - 'a'; + else + break; + + val *= 16; + val += digit; + + str++; + } + + if (nptr) + *nptr = (char *)str; + + return (val); +} + +/* + * ========================================================================== + * Accessor functions + * ========================================================================== + */ + +boolean_t +spa_shutting_down(spa_t *spa) +{ + return (spa->spa_async_suspended); +} + +dsl_pool_t * +spa_get_dsl(spa_t *spa) +{ + return (spa->spa_dsl_pool); +} + +boolean_t +spa_is_initializing(spa_t *spa) +{ + return (spa->spa_is_initializing); +} + +boolean_t +spa_indirect_vdevs_loaded(spa_t *spa) +{ + return (spa->spa_indirect_vdevs_loaded); +} + +blkptr_t * +spa_get_rootblkptr(spa_t *spa) +{ + return (&spa->spa_ubsync.ub_rootbp); +} + +void +spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp) +{ + spa->spa_uberblock.ub_rootbp = *bp; +} + +void +spa_altroot(spa_t *spa, char *buf, size_t buflen) +{ + if (spa->spa_root == NULL) + buf[0] = '\0'; + else + (void) strncpy(buf, spa->spa_root, buflen); +} + +int +spa_sync_pass(spa_t *spa) +{ + return (spa->spa_sync_pass); +} + +char * +spa_name(spa_t *spa) +{ + return (spa->spa_name); +} + +uint64_t +spa_guid(spa_t *spa) +{ + dsl_pool_t *dp = spa_get_dsl(spa); + uint64_t guid; + + /* + * If we fail to parse the config during spa_load(), we can go through + * the error path (which posts an ereport) and end up here with no root + * vdev. We stash the original pool guid in 'spa_config_guid' to handle + * this case. + */ + if (spa->spa_root_vdev == NULL) + return (spa->spa_config_guid); + + guid = spa->spa_last_synced_guid != 0 ? + spa->spa_last_synced_guid : spa->spa_root_vdev->vdev_guid; + + /* + * Return the most recently synced out guid unless we're + * in syncing context. + */ + if (dp && dsl_pool_sync_context(dp)) + return (spa->spa_root_vdev->vdev_guid); + else + return (guid); +} + +uint64_t +spa_load_guid(spa_t *spa) +{ + /* + * This is a GUID that exists solely as a reference for the + * purposes of the arc. It is generated at load time, and + * is never written to persistent storage. + */ + return (spa->spa_load_guid); +} + +uint64_t +spa_last_synced_txg(spa_t *spa) +{ + return (spa->spa_ubsync.ub_txg); +} + +uint64_t +spa_first_txg(spa_t *spa) +{ + return (spa->spa_first_txg); +} + +uint64_t +spa_syncing_txg(spa_t *spa) +{ + return (spa->spa_syncing_txg); +} + +/* + * Return the last txg where data can be dirtied. The final txgs + * will be used to just clear out any deferred frees that remain. + */ +uint64_t +spa_final_dirty_txg(spa_t *spa) +{ + return (spa->spa_final_txg - TXG_DEFER_SIZE); +} + +pool_state_t +spa_state(spa_t *spa) +{ + return (spa->spa_state); +} + +spa_load_state_t +spa_load_state(spa_t *spa) +{ + return (spa->spa_load_state); +} + +uint64_t +spa_freeze_txg(spa_t *spa) +{ + return (spa->spa_freeze_txg); +} + +/* ARGSUSED */ +uint64_t +spa_get_worst_case_asize(spa_t *spa, uint64_t lsize) +{ + return (lsize * spa_asize_inflation); +} + +/* + * Return the amount of slop space in bytes. It is 1/32 of the pool (3.2%), + * or at least 128MB, unless that would cause it to be more than half the + * pool size. + * + * See the comment above spa_slop_shift for details. 
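+ *
+ * Two worked examples with the default spa_slop_shift of 5 (editor's
+ * illustration):
+ *
+ *	10TB of dspace:	MAX(10TB >> 5, MIN(10TB >> 1, 128MB)) = 320GB
+ *	1GB of dspace:	MAX(1GB >> 5, MIN(1GB >> 1, 128MB)) = 128MB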
+ */ +uint64_t +spa_get_slop_space(spa_t *spa) +{ + uint64_t space = spa_get_dspace(spa); + return (MAX(space >> spa_slop_shift, MIN(space >> 1, spa_min_slop))); +} + +uint64_t +spa_get_dspace(spa_t *spa) +{ + return (spa->spa_dspace); +} + +uint64_t +spa_get_checkpoint_space(spa_t *spa) +{ + return (spa->spa_checkpoint_info.sci_dspace); +} + +void +spa_update_dspace(spa_t *spa) +{ + spa->spa_dspace = metaslab_class_get_dspace(spa_normal_class(spa)) + + ddt_get_dedup_dspace(spa); + if (spa->spa_vdev_removal != NULL) { + /* + * We can't allocate from the removing device, so + * subtract its size. This prevents the DMU/DSL from + * filling up the (now smaller) pool while we are in the + * middle of removing the device. + * + * Note that the DMU/DSL doesn't actually know or care + * how much space is allocated (it does its own tracking + * of how much space has been logically used). So it + * doesn't matter that the data we are moving may be + * allocated twice (on the old device and the new + * device). + */ + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + vdev_t *vd = + vdev_lookup_top(spa, spa->spa_vdev_removal->svr_vdev_id); + spa->spa_dspace -= spa_deflate(spa) ? + vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space; + spa_config_exit(spa, SCL_VDEV, FTAG); + } +} + +/* + * Return the failure mode that has been set to this pool. The default + * behavior will be to block all I/Os when a complete failure occurs. + */ +uint8_t +spa_get_failmode(spa_t *spa) +{ + return (spa->spa_failmode); +} + +boolean_t +spa_suspended(spa_t *spa) +{ + return (spa->spa_suspended); +} + +uint64_t +spa_version(spa_t *spa) +{ + return (spa->spa_ubsync.ub_version); +} + +boolean_t +spa_deflate(spa_t *spa) +{ + return (spa->spa_deflate); +} + +metaslab_class_t * +spa_normal_class(spa_t *spa) +{ + return (spa->spa_normal_class); +} + +metaslab_class_t * +spa_log_class(spa_t *spa) +{ + return (spa->spa_log_class); +} + +void +spa_evicting_os_register(spa_t *spa, objset_t *os) +{ + mutex_enter(&spa->spa_evicting_os_lock); + list_insert_head(&spa->spa_evicting_os_list, os); + mutex_exit(&spa->spa_evicting_os_lock); +} + +void +spa_evicting_os_deregister(spa_t *spa, objset_t *os) +{ + mutex_enter(&spa->spa_evicting_os_lock); + list_remove(&spa->spa_evicting_os_list, os); + cv_broadcast(&spa->spa_evicting_os_cv); + mutex_exit(&spa->spa_evicting_os_lock); +} + +void +spa_evicting_os_wait(spa_t *spa) +{ + mutex_enter(&spa->spa_evicting_os_lock); + while (!list_is_empty(&spa->spa_evicting_os_list)) + cv_wait(&spa->spa_evicting_os_cv, &spa->spa_evicting_os_lock); + mutex_exit(&spa->spa_evicting_os_lock); + + dmu_buf_user_evict_wait(); +} + +int +spa_max_replication(spa_t *spa) +{ + /* + * As of SPA_VERSION == SPA_VERSION_DITTO_BLOCKS, we are able to + * handle BPs with more than one DVA allocated. Set our max + * replication level accordingly. 
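+ *
+ * For example (editor's illustration), with SPA_DVAS_PER_BP == 3 and the
+ * default override, a pool older than SPA_VERSION_DITTO_BLOCKS yields 1,
+ * while any newer pool yields MIN(3, 3) == 3.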
+ */ + if (spa_version(spa) < SPA_VERSION_DITTO_BLOCKS) + return (1); + return (MIN(SPA_DVAS_PER_BP, spa_max_replication_override)); +} + +int +spa_prev_software_version(spa_t *spa) +{ + return (spa->spa_prev_software_version); +} + +uint64_t +spa_deadman_synctime(spa_t *spa) +{ + return (spa->spa_deadman_synctime); +} + +uint64_t +dva_get_dsize_sync(spa_t *spa, const dva_t *dva) +{ + uint64_t asize = DVA_GET_ASIZE(dva); + uint64_t dsize = asize; + + ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); + + if (asize != 0 && spa->spa_deflate) { + uint64_t vdev = DVA_GET_VDEV(dva); + vdev_t *vd = vdev_lookup_top(spa, vdev); + if (vd == NULL) { + panic( + "dva_get_dsize_sync(): bad DVA %llu:%llu", + (u_longlong_t)vdev, (u_longlong_t)asize); + } + dsize = (asize >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio; + } + + return (dsize); +} + +uint64_t +bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp) +{ + uint64_t dsize = 0; + + for (int d = 0; d < BP_GET_NDVAS(bp); d++) + dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]); + + return (dsize); +} + +uint64_t +bp_get_dsize(spa_t *spa, const blkptr_t *bp) +{ + uint64_t dsize = 0; + + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + + for (int d = 0; d < BP_GET_NDVAS(bp); d++) + dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]); + + spa_config_exit(spa, SCL_VDEV, FTAG); + + return (dsize); +} + +uint64_t +spa_dirty_data(spa_t *spa) +{ + return (spa->spa_dsl_pool->dp_dirty_total); +} + +/* + * ========================================================================== + * Initialization and Termination + * ========================================================================== + */ + +static int +spa_name_compare(const void *a1, const void *a2) +{ + const spa_t *s1 = a1; + const spa_t *s2 = a2; + int s; + + s = strcmp(s1->spa_name, s2->spa_name); + + return (AVL_ISIGN(s)); +} + +int +spa_busy(void) +{ + return (spa_active_count); +} + +void +spa_boot_init() +{ + spa_config_load(); +} + +#ifdef _KERNEL +EVENTHANDLER_DEFINE(mountroot, spa_boot_init, NULL, 0); +#endif + +void +spa_init(int mode) +{ + mutex_init(&spa_namespace_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa_spare_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa_l2cache_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&spa_namespace_cv, NULL, CV_DEFAULT, NULL); + + avl_create(&spa_namespace_avl, spa_name_compare, sizeof (spa_t), + offsetof(spa_t, spa_avl)); + + avl_create(&spa_spare_avl, spa_spare_compare, sizeof (spa_aux_t), + offsetof(spa_aux_t, aux_avl)); + + avl_create(&spa_l2cache_avl, spa_l2cache_compare, sizeof (spa_aux_t), + offsetof(spa_aux_t, aux_avl)); + + spa_mode_global = mode; + +#ifdef illumos +#ifdef _KERNEL + spa_arch_init(); +#else + if (spa_mode_global != FREAD && dprintf_find_string("watch")) { + arc_procfd = open("/proc/self/ctl", O_WRONLY); + if (arc_procfd == -1) { + perror("could not enable watchpoints: " + "opening /proc/self/ctl failed: "); + } else { + arc_watch = B_TRUE; + } + } +#endif +#endif /* illumos */ + refcount_sysinit(); + unique_init(); + range_tree_init(); + metaslab_alloc_trace_init(); + zio_init(); + lz4_init(); + dmu_init(); + zil_init(); + vdev_cache_stat_init(); + vdev_file_init(); + zfs_prop_init(); + zpool_prop_init(); + zpool_feature_init(); + spa_config_load(); + l2arc_start(); + scan_init(); + dsl_scan_global_init(); +#ifndef illumos +#ifdef _KERNEL + zfs_deadman_init(); +#endif +#endif /* !illumos */ +} + +void +spa_fini(void) +{ + l2arc_stop(); + + spa_evict_all(); + + vdev_file_fini(); + vdev_cache_stat_fini(); + zil_fini(); + 
dmu_fini(); + lz4_fini(); + zio_fini(); + metaslab_alloc_trace_fini(); + range_tree_fini(); + unique_fini(); + refcount_fini(); + scan_fini(); + + avl_destroy(&spa_namespace_avl); + avl_destroy(&spa_spare_avl); + avl_destroy(&spa_l2cache_avl); + + cv_destroy(&spa_namespace_cv); + mutex_destroy(&spa_namespace_lock); + mutex_destroy(&spa_spare_lock); + mutex_destroy(&spa_l2cache_lock); +} + +/* + * Return whether this pool has slogs. No locking needed. + * It's not a problem if the wrong answer is returned as it's only for + * performance and not correctness + */ +boolean_t +spa_has_slogs(spa_t *spa) +{ + return (spa->spa_log_class->mc_rotor != NULL); +} + +spa_log_state_t +spa_get_log_state(spa_t *spa) +{ + return (spa->spa_log_state); +} + +void +spa_set_log_state(spa_t *spa, spa_log_state_t state) +{ + spa->spa_log_state = state; +} + +boolean_t +spa_is_root(spa_t *spa) +{ + return (spa->spa_is_root); +} + +boolean_t +spa_writeable(spa_t *spa) +{ + return (!!(spa->spa_mode & FWRITE) && spa->spa_trust_config); +} + +/* + * Returns true if there is a pending sync task in any of the current + * syncing txg, the current quiescing txg, or the current open txg. + */ +boolean_t +spa_has_pending_synctask(spa_t *spa) +{ + return (!txg_all_lists_empty(&spa->spa_dsl_pool->dp_sync_tasks) || + !txg_all_lists_empty(&spa->spa_dsl_pool->dp_early_sync_tasks)); +} + +int +spa_mode(spa_t *spa) +{ + return (spa->spa_mode); +} + +uint64_t +spa_bootfs(spa_t *spa) +{ + return (spa->spa_bootfs); +} + +uint64_t +spa_delegation(spa_t *spa) +{ + return (spa->spa_delegation); +} + +objset_t * +spa_meta_objset(spa_t *spa) +{ + return (spa->spa_meta_objset); +} + +enum zio_checksum +spa_dedup_checksum(spa_t *spa) +{ + return (spa->spa_dedup_checksum); +} + +/* + * Reset pool scan stat per scan pass (or reboot). + */ +void +spa_scan_stat_init(spa_t *spa) +{ + /* data not stored on disk */ + spa->spa_scan_pass_start = gethrestime_sec(); + if (dsl_scan_is_paused_scrub(spa->spa_dsl_pool->dp_scan)) + spa->spa_scan_pass_scrub_pause = spa->spa_scan_pass_start; + else + spa->spa_scan_pass_scrub_pause = 0; + spa->spa_scan_pass_scrub_spent_paused = 0; + spa->spa_scan_pass_exam = 0; + spa->spa_scan_pass_issued = 0; + vdev_scan_stat_init(spa->spa_root_vdev); +} + +/* + * Get scan stats for zpool status reports + */ +int +spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps) +{ + dsl_scan_t *scn = spa->spa_dsl_pool ? 
spa->spa_dsl_pool->dp_scan : NULL; + + if (scn == NULL || scn->scn_phys.scn_func == POOL_SCAN_NONE) + return (SET_ERROR(ENOENT)); + bzero(ps, sizeof (pool_scan_stat_t)); + + /* data stored on disk */ + ps->pss_func = scn->scn_phys.scn_func; + ps->pss_state = scn->scn_phys.scn_state; + ps->pss_start_time = scn->scn_phys.scn_start_time; + ps->pss_end_time = scn->scn_phys.scn_end_time; + ps->pss_to_examine = scn->scn_phys.scn_to_examine; + ps->pss_to_process = scn->scn_phys.scn_to_process; + ps->pss_processed = scn->scn_phys.scn_processed; + ps->pss_errors = scn->scn_phys.scn_errors; + ps->pss_examined = scn->scn_phys.scn_examined; + ps->pss_issued = + scn->scn_issued_before_pass + spa->spa_scan_pass_issued; + /* data not stored on disk */ + ps->pss_pass_start = spa->spa_scan_pass_start; + ps->pss_pass_exam = spa->spa_scan_pass_exam; + ps->pss_pass_issued = spa->spa_scan_pass_issued; + ps->pss_pass_scrub_pause = spa->spa_scan_pass_scrub_pause; + ps->pss_pass_scrub_spent_paused = spa->spa_scan_pass_scrub_spent_paused; + + return (0); +} + +int +spa_maxblocksize(spa_t *spa) +{ + if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) + return (SPA_MAXBLOCKSIZE); + else + return (SPA_OLD_MAXBLOCKSIZE); +} + +int +spa_maxdnodesize(spa_t *spa) +{ + if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) + return (DNODE_MAX_SIZE); + else + return (DNODE_MIN_SIZE); +} + + +/* + * Returns the txg that the last device removal completed. No indirect mappings + * have been added since this txg. + */ +uint64_t +spa_get_last_removal_txg(spa_t *spa) +{ + uint64_t vdevid; + uint64_t ret = -1ULL; + + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + /* + * sr_prev_indirect_vdev is only modified while holding all the + * config locks, so it is sufficient to hold SCL_VDEV as reader when + * examining it. + */ + vdevid = spa->spa_removing_phys.sr_prev_indirect_vdev; + + while (vdevid != -1ULL) { + vdev_t *vd = vdev_lookup_top(spa, vdevid); + vdev_indirect_births_t *vib = vd->vdev_indirect_births; + + ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); + + /* + * If the removal did not remap any data, we don't care. 
+ */ + if (vdev_indirect_births_count(vib) != 0) { + ret = vdev_indirect_births_last_entry_txg(vib); + break; + } + + vdevid = vd->vdev_indirect_config.vic_prev_indirect_vdev; + } + spa_config_exit(spa, SCL_VDEV, FTAG); + + IMPLY(ret != -1ULL, + spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL)); + + return (ret); +} + +boolean_t +spa_trust_config(spa_t *spa) +{ + return (spa->spa_trust_config); +} + +uint64_t +spa_missing_tvds_allowed(spa_t *spa) +{ + return (spa->spa_missing_tvds_allowed); +} + +void +spa_set_missing_tvds(spa_t *spa, uint64_t missing) +{ + spa->spa_missing_tvds = missing; +} + +boolean_t +spa_top_vdevs_spacemap_addressable(spa_t *spa) +{ + vdev_t *rvd = spa->spa_root_vdev; + for (uint64_t c = 0; c < rvd->vdev_children; c++) { + if (!vdev_is_spacemap_addressable(rvd->vdev_child[c])) + return (B_FALSE); + } + return (B_TRUE); +} + +boolean_t +spa_has_checkpoint(spa_t *spa) +{ + return (spa->spa_checkpoint_txg != 0); +} + +boolean_t +spa_importing_readonly_checkpoint(spa_t *spa) +{ + return ((spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT) && + spa->spa_mode == FREAD); +} + +uint64_t +spa_min_claim_txg(spa_t *spa) +{ + uint64_t checkpoint_txg = spa->spa_uberblock.ub_checkpoint_txg; + + if (checkpoint_txg != 0) + return (checkpoint_txg + 1); + + return (spa->spa_first_txg); +} + +/* + * If there is a checkpoint, async destroys may consume more space from + * the pool instead of freeing it. In an attempt to save the pool from + * getting suspended when it is about to run out of space, we stop + * processing async destroys. + */ +boolean_t +spa_suspend_async_destroy(spa_t *spa) +{ + dsl_pool_t *dp = spa_get_dsl(spa); + + uint64_t unreserved = dsl_pool_unreserved_space(dp, + ZFS_SPACE_CHECK_EXTRA_RESERVED); + uint64_t used = dsl_dir_phys(dp->dp_root_dir)->dd_used_bytes; + uint64_t avail = (unreserved > used) ? (unreserved - used) : 0; + + if (spa_has_checkpoint(spa) && avail == 0) + return (B_TRUE); + + return (B_FALSE); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c new file mode 100644 index 000000000000..7356e3ceea75 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c @@ -0,0 +1,1101 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright (c) 2012, 2017 by Delphix. All rights reserved. 
+ */ + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/dmu.h> +#include <sys/dmu_tx.h> +#include <sys/dnode.h> +#include <sys/dsl_pool.h> +#include <sys/zio.h> +#include <sys/space_map.h> +#include <sys/refcount.h> +#include <sys/zfeature.h> + +SYSCTL_DECL(_vfs_zfs); + +/* + * Note on space map block size: + * + * The data for a given space map can be kept on blocks of any size. + * Larger blocks entail fewer I/O operations, but they also cause the + * DMU to keep more data in-core, and also to waste more I/O bandwidth + * when only a few blocks have changed since the last transaction group. + */ + +/* + * Enabled whenever we want to stress test the use of double-word + * space map entries. + */ +boolean_t zfs_force_some_double_word_sm_entries = B_FALSE; + +/* + * Override the default indirect block size of 128K, instead using 16K for + * spacemaps (2^14 bytes). This dramatically reduces write inflation since + * appending to a spacemap typically has to write one data block (4KB) and one + * or two indirect blocks (16K-32K, rather than 128K). + */ +int space_map_ibs = 14; + +SYSCTL_INT(_vfs_zfs, OID_AUTO, space_map_ibs, CTLFLAG_RWTUN, + &space_map_ibs, 0, "Space map indirect block shift"); + +boolean_t +sm_entry_is_debug(uint64_t e) +{ + return (SM_PREFIX_DECODE(e) == SM_DEBUG_PREFIX); +} + +boolean_t +sm_entry_is_single_word(uint64_t e) +{ + uint8_t prefix = SM_PREFIX_DECODE(e); + return (prefix != SM_DEBUG_PREFIX && prefix != SM2_PREFIX); +} + +boolean_t +sm_entry_is_double_word(uint64_t e) +{ + return (SM_PREFIX_DECODE(e) == SM2_PREFIX); +} + +/* + * Iterate through the space map, invoking the callback on each (non-debug) + * space map entry. + */ +int +space_map_iterate(space_map_t *sm, sm_cb_t callback, void *arg) +{ + uint64_t sm_len = space_map_length(sm); + ASSERT3U(sm->sm_blksz, !=, 0); + + dmu_prefetch(sm->sm_os, space_map_object(sm), 0, 0, sm_len, + ZIO_PRIORITY_SYNC_READ); + + uint64_t blksz = sm->sm_blksz; + int error = 0; + for (uint64_t block_base = 0; block_base < sm_len && error == 0; + block_base += blksz) { + dmu_buf_t *db; + error = dmu_buf_hold(sm->sm_os, space_map_object(sm), + block_base, FTAG, &db, DMU_READ_PREFETCH); + if (error != 0) + return (error); + + uint64_t *block_start = db->db_data; + uint64_t block_length = MIN(sm_len - block_base, blksz); + uint64_t *block_end = block_start + + (block_length / sizeof (uint64_t)); + + VERIFY0(P2PHASE(block_length, sizeof (uint64_t))); + VERIFY3U(block_length, !=, 0); + ASSERT3U(blksz, ==, db->db_size); + + for (uint64_t *block_cursor = block_start; + block_cursor < block_end && error == 0; block_cursor++) { + uint64_t e = *block_cursor; + + if (sm_entry_is_debug(e)) /* Skip debug entries */ + continue; + + uint64_t raw_offset, raw_run, vdev_id; + maptype_t type; + if (sm_entry_is_single_word(e)) { + type = SM_TYPE_DECODE(e); + vdev_id = SM_NO_VDEVID; + raw_offset = SM_OFFSET_DECODE(e); + raw_run = SM_RUN_DECODE(e); + } else { + /* it is a two-word entry */ + ASSERT(sm_entry_is_double_word(e)); + raw_run = SM2_RUN_DECODE(e); + vdev_id = SM2_VDEV_DECODE(e); + + /* move on to the second word */ + block_cursor++; + e = *block_cursor; + VERIFY3P(block_cursor, <=, block_end); + + type = SM2_TYPE_DECODE(e); + raw_offset = SM2_OFFSET_DECODE(e); + } + + uint64_t entry_offset = (raw_offset << sm->sm_shift) + + sm->sm_start; + uint64_t entry_run = raw_run << sm->sm_shift; + + VERIFY0(P2PHASE(entry_offset, 1ULL << sm->sm_shift)); + VERIFY0(P2PHASE(entry_run, 1ULL << sm->sm_shift)); + ASSERT3U(entry_offset, >=, 
sm->sm_start); + ASSERT3U(entry_offset, <, sm->sm_start + sm->sm_size); + ASSERT3U(entry_run, <=, sm->sm_size); + ASSERT3U(entry_offset + entry_run, <=, + sm->sm_start + sm->sm_size); + + space_map_entry_t sme = { + .sme_type = type, + .sme_vdev = vdev_id, + .sme_offset = entry_offset, + .sme_run = entry_run + }; + error = callback(&sme, arg); + } + dmu_buf_rele(db, FTAG); + } + return (error); +} + +/* + * Reads the entries from the last block of the space map into + * buf in reverse order. Populates nwords with number of words + * in the last block. + * + * Refer to block comment within space_map_incremental_destroy() + * to understand why this function is needed. + */ +static int +space_map_reversed_last_block_entries(space_map_t *sm, uint64_t *buf, + uint64_t bufsz, uint64_t *nwords) +{ + int error = 0; + dmu_buf_t *db; + + /* + * Find the offset of the last word in the space map and use + * that to read the last block of the space map with + * dmu_buf_hold(). + */ + uint64_t last_word_offset = + sm->sm_phys->smp_objsize - sizeof (uint64_t); + error = dmu_buf_hold(sm->sm_os, space_map_object(sm), last_word_offset, + FTAG, &db, DMU_READ_NO_PREFETCH); + if (error != 0) + return (error); + + ASSERT3U(sm->sm_object, ==, db->db_object); + ASSERT3U(sm->sm_blksz, ==, db->db_size); + ASSERT3U(bufsz, >=, db->db_size); + ASSERT(nwords != NULL); + + uint64_t *words = db->db_data; + *nwords = + (sm->sm_phys->smp_objsize - db->db_offset) / sizeof (uint64_t); + + ASSERT3U(*nwords, <=, bufsz / sizeof (uint64_t)); + + uint64_t n = *nwords; + uint64_t j = n - 1; + for (uint64_t i = 0; i < n; i++) { + uint64_t entry = words[i]; + if (sm_entry_is_double_word(entry)) { + /* + * Since we are populating the buffer backwards + * we have to be extra careful and add the two + * words of the double-word entry in the right + * order. + */ + ASSERT3U(j, >, 0); + buf[j - 1] = entry; + + i++; + ASSERT3U(i, <, n); + entry = words[i]; + buf[j] = entry; + j -= 2; + } else { + ASSERT(sm_entry_is_debug(entry) || + sm_entry_is_single_word(entry)); + buf[j] = entry; + j--; + } + } + + /* + * Assert that we wrote backwards all the + * way to the beginning of the buffer. + */ + ASSERT3S(j, ==, -1); + + dmu_buf_rele(db, FTAG); + return (error); +} + +/* + * Note: This function performs destructive actions - specifically + * it deletes entries from the end of the space map. Thus, callers + * should ensure that they are holding the appropriate locks for + * the space map that they provide. + */ +int +space_map_incremental_destroy(space_map_t *sm, sm_cb_t callback, void *arg, + dmu_tx_t *tx) +{ + uint64_t bufsz = MAX(sm->sm_blksz, SPA_MINBLOCKSIZE); + uint64_t *buf = zio_buf_alloc(bufsz); + + dmu_buf_will_dirty(sm->sm_dbuf, tx); + + /* + * Ideally we would want to iterate from the beginning of the + * space map to the end in incremental steps. The issue with this + * approach is that we don't have any field on-disk that points + * us where to start between each step. We could try zeroing out + * entries that we've destroyed, but this doesn't work either as + * an entry that is 0 is a valid one (ALLOC for range [0x0:0x200]). + * + * As a result, we destroy its entries incrementally starting from + * the end after applying the callback to each of them. + * + * The problem with this approach is that we cannot literally + * iterate through the words in the space map backwards as we + * can't distinguish two-word space map entries from their second + * word. 
Thus we do the following: + * + * 1] We get all the entries from the last block of the space map + * and put them into a buffer in reverse order. This way the + * last entry comes first in the buffer, the second to last is + * second, etc. + * 2] We iterate through the entries in the buffer and we apply + * the callback to each one. As we move from entry to entry we + * we decrease the size of the space map, deleting effectively + * each entry. + * 3] If there are no more entries in the space map or the callback + * returns a value other than 0, we stop iterating over the + * space map. If there are entries remaining and the callback + * returned 0, we go back to step [1]. + */ + int error = 0; + while (space_map_length(sm) > 0 && error == 0) { + uint64_t nwords = 0; + error = space_map_reversed_last_block_entries(sm, buf, bufsz, + &nwords); + if (error != 0) + break; + + ASSERT3U(nwords, <=, bufsz / sizeof (uint64_t)); + + for (uint64_t i = 0; i < nwords; i++) { + uint64_t e = buf[i]; + + if (sm_entry_is_debug(e)) { + sm->sm_phys->smp_objsize -= sizeof (uint64_t); + space_map_update(sm); + continue; + } + + int words = 1; + uint64_t raw_offset, raw_run, vdev_id; + maptype_t type; + if (sm_entry_is_single_word(e)) { + type = SM_TYPE_DECODE(e); + vdev_id = SM_NO_VDEVID; + raw_offset = SM_OFFSET_DECODE(e); + raw_run = SM_RUN_DECODE(e); + } else { + ASSERT(sm_entry_is_double_word(e)); + words = 2; + + raw_run = SM2_RUN_DECODE(e); + vdev_id = SM2_VDEV_DECODE(e); + + /* move to the second word */ + i++; + e = buf[i]; + + ASSERT3P(i, <=, nwords); + + type = SM2_TYPE_DECODE(e); + raw_offset = SM2_OFFSET_DECODE(e); + } + + uint64_t entry_offset = + (raw_offset << sm->sm_shift) + sm->sm_start; + uint64_t entry_run = raw_run << sm->sm_shift; + + VERIFY0(P2PHASE(entry_offset, 1ULL << sm->sm_shift)); + VERIFY0(P2PHASE(entry_run, 1ULL << sm->sm_shift)); + VERIFY3U(entry_offset, >=, sm->sm_start); + VERIFY3U(entry_offset, <, sm->sm_start + sm->sm_size); + VERIFY3U(entry_run, <=, sm->sm_size); + VERIFY3U(entry_offset + entry_run, <=, + sm->sm_start + sm->sm_size); + + space_map_entry_t sme = { + .sme_type = type, + .sme_vdev = vdev_id, + .sme_offset = entry_offset, + .sme_run = entry_run + }; + error = callback(&sme, arg); + if (error != 0) + break; + + if (type == SM_ALLOC) + sm->sm_phys->smp_alloc -= entry_run; + else + sm->sm_phys->smp_alloc += entry_run; + sm->sm_phys->smp_objsize -= words * sizeof (uint64_t); + space_map_update(sm); + } + } + + if (space_map_length(sm) == 0) { + ASSERT0(error); + ASSERT0(sm->sm_phys->smp_objsize); + ASSERT0(sm->sm_alloc); + } + + zio_buf_free(buf, bufsz); + return (error); +} + +typedef struct space_map_load_arg { + space_map_t *smla_sm; + range_tree_t *smla_rt; + maptype_t smla_type; +} space_map_load_arg_t; + +static int +space_map_load_callback(space_map_entry_t *sme, void *arg) +{ + space_map_load_arg_t *smla = arg; + if (sme->sme_type == smla->smla_type) { + VERIFY3U(range_tree_space(smla->smla_rt) + sme->sme_run, <=, + smla->smla_sm->sm_size); + range_tree_add(smla->smla_rt, sme->sme_offset, sme->sme_run); + } else { + range_tree_remove(smla->smla_rt, sme->sme_offset, sme->sme_run); + } + + return (0); +} + +/* + * Load the space map disk into the specified range tree. Segments of maptype + * are added to the range tree, other segment types are removed. 
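+ *
+ * A minimal usage sketch (hypothetical caller; the tree passed in
+ * must be empty, e.g. fresh from range_tree_create(NULL, NULL)):
+ *
+ *	range_tree_t *rt = range_tree_create(NULL, NULL);
+ *	int err = space_map_load(sm, rt, SM_FREE);
+ *
+ * On success rt holds every free segment in the map; on failure
+ * the tree is vacated before returning.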
+ */ +int +space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype) +{ + uint64_t space; + int err; + space_map_load_arg_t smla; + + VERIFY0(range_tree_space(rt)); + space = space_map_allocated(sm); + + if (maptype == SM_FREE) { + range_tree_add(rt, sm->sm_start, sm->sm_size); + space = sm->sm_size - space; + } + + smla.smla_rt = rt; + smla.smla_sm = sm; + smla.smla_type = maptype; + err = space_map_iterate(sm, space_map_load_callback, &smla); + + if (err == 0) { + VERIFY3U(range_tree_space(rt), ==, space); + } else { + range_tree_vacate(rt, NULL, NULL); + } + + return (err); +} + +void +space_map_histogram_clear(space_map_t *sm) +{ + if (sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) + return; + + bzero(sm->sm_phys->smp_histogram, sizeof (sm->sm_phys->smp_histogram)); +} + +boolean_t +space_map_histogram_verify(space_map_t *sm, range_tree_t *rt) +{ + /* + * Verify that the in-core range tree does not have any + * ranges smaller than our sm_shift size. + */ + for (int i = 0; i < sm->sm_shift; i++) { + if (rt->rt_histogram[i] != 0) + return (B_FALSE); + } + return (B_TRUE); +} + +void +space_map_histogram_add(space_map_t *sm, range_tree_t *rt, dmu_tx_t *tx) +{ + int idx = 0; + + ASSERT(dmu_tx_is_syncing(tx)); + VERIFY3U(space_map_object(sm), !=, 0); + + if (sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) + return; + + dmu_buf_will_dirty(sm->sm_dbuf, tx); + + ASSERT(space_map_histogram_verify(sm, rt)); + /* + * Transfer the content of the range tree histogram to the space + * map histogram. The space map histogram contains 32 buckets ranging + * between 2^sm_shift to 2^(32+sm_shift-1). The range tree, + * however, can represent ranges from 2^0 to 2^63. Since the space + * map only cares about allocatable blocks (minimum of sm_shift) we + * can safely ignore all ranges in the range tree smaller than sm_shift. + */ + for (int i = sm->sm_shift; i < RANGE_TREE_HISTOGRAM_SIZE; i++) { + + /* + * Since the largest histogram bucket in the space map is + * 2^(32+sm_shift-1), we need to normalize the values in + * the range tree for any bucket larger than that size. For + * example given an sm_shift of 9, ranges larger than 2^40 + * would get normalized as if they were 1TB ranges. Assume + * the range tree had a count of 5 in the 2^44 (16TB) bucket, + * the calculation below would normalize this to 5 * 2^4 (16). + */ + ASSERT3U(i, >=, idx + sm->sm_shift); + sm->sm_phys->smp_histogram[idx] += + rt->rt_histogram[i] << (i - idx - sm->sm_shift); + + /* + * Increment the space map's index as long as we haven't + * reached the maximum bucket size. Accumulate all ranges + * larger than the max bucket size into the last bucket. + */ + if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) { + ASSERT3U(idx + sm->sm_shift, ==, i); + idx++; + ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE); + } + } +} + +static void +space_map_write_intro_debug(space_map_t *sm, maptype_t maptype, dmu_tx_t *tx) +{ + dmu_buf_will_dirty(sm->sm_dbuf, tx); + + uint64_t dentry = SM_PREFIX_ENCODE(SM_DEBUG_PREFIX) | + SM_DEBUG_ACTION_ENCODE(maptype) | + SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(tx->tx_pool->dp_spa)) | + SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx)); + + dmu_write(sm->sm_os, space_map_object(sm), sm->sm_phys->smp_objsize, + sizeof (dentry), &dentry, tx); + + sm->sm_phys->smp_objsize += sizeof (dentry); +} + +/* + * Writes one or more entries given a segment. + * + * Note: The function may release the dbuf from the pointer initially + * passed to it, and return a different dbuf. 
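+ * Callers must therefore continue with (and eventually release)
+ * the dbuf returned through dbp, not the one they passed in.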
Also, the space map's + * dbuf must be dirty for the changes in sm_phys to take effect. + */ +static void +space_map_write_seg(space_map_t *sm, range_seg_t *rs, maptype_t maptype, + uint64_t vdev_id, uint8_t words, dmu_buf_t **dbp, void *tag, dmu_tx_t *tx) +{ + ASSERT3U(words, !=, 0); + ASSERT3U(words, <=, 2); + + /* ensure the vdev_id can be represented by the space map */ + ASSERT3U(vdev_id, <=, SM_NO_VDEVID); + + /* + * if this is a single word entry, ensure that no vdev was + * specified. + */ + IMPLY(words == 1, vdev_id == SM_NO_VDEVID); + + dmu_buf_t *db = *dbp; + ASSERT3U(db->db_size, ==, sm->sm_blksz); + + uint64_t *block_base = db->db_data; + uint64_t *block_end = block_base + (sm->sm_blksz / sizeof (uint64_t)); + uint64_t *block_cursor = block_base + + (sm->sm_phys->smp_objsize - db->db_offset) / sizeof (uint64_t); + + ASSERT3P(block_cursor, <=, block_end); + + uint64_t size = (rs->rs_end - rs->rs_start) >> sm->sm_shift; + uint64_t start = (rs->rs_start - sm->sm_start) >> sm->sm_shift; + uint64_t run_max = (words == 2) ? SM2_RUN_MAX : SM_RUN_MAX; + + ASSERT3U(rs->rs_start, >=, sm->sm_start); + ASSERT3U(rs->rs_start, <, sm->sm_start + sm->sm_size); + ASSERT3U(rs->rs_end - rs->rs_start, <=, sm->sm_size); + ASSERT3U(rs->rs_end, <=, sm->sm_start + sm->sm_size); + + while (size != 0) { + ASSERT3P(block_cursor, <=, block_end); + + /* + * If we are at the end of this block, flush it and start + * writing again from the beginning. + */ + if (block_cursor == block_end) { + dmu_buf_rele(db, tag); + + uint64_t next_word_offset = sm->sm_phys->smp_objsize; + VERIFY0(dmu_buf_hold(sm->sm_os, + space_map_object(sm), next_word_offset, + tag, &db, DMU_READ_PREFETCH)); + dmu_buf_will_dirty(db, tx); + + /* update caller's dbuf */ + *dbp = db; + + ASSERT3U(db->db_size, ==, sm->sm_blksz); + + block_base = db->db_data; + block_cursor = block_base; + block_end = block_base + + (db->db_size / sizeof (uint64_t)); + } + + /* + * If we are writing a two-word entry and we only have one + * word left on this block, just pad it with an empty debug + * entry and write the two-word entry in the next block. + */ + uint64_t *next_entry = block_cursor + 1; + if (next_entry == block_end && words > 1) { + ASSERT3U(words, ==, 2); + *block_cursor = SM_PREFIX_ENCODE(SM_DEBUG_PREFIX) | + SM_DEBUG_ACTION_ENCODE(0) | + SM_DEBUG_SYNCPASS_ENCODE(0) | + SM_DEBUG_TXG_ENCODE(0); + block_cursor++; + sm->sm_phys->smp_objsize += sizeof (uint64_t); + ASSERT3P(block_cursor, ==, block_end); + continue; + } + + uint64_t run_len = MIN(size, run_max); + switch (words) { + case 1: + *block_cursor = SM_OFFSET_ENCODE(start) | + SM_TYPE_ENCODE(maptype) | + SM_RUN_ENCODE(run_len); + block_cursor++; + break; + case 2: + /* write the first word of the entry */ + *block_cursor = SM_PREFIX_ENCODE(SM2_PREFIX) | + SM2_RUN_ENCODE(run_len) | + SM2_VDEV_ENCODE(vdev_id); + block_cursor++; + + /* move on to the second word of the entry */ + ASSERT3P(block_cursor, <, block_end); + *block_cursor = SM2_TYPE_ENCODE(maptype) | + SM2_OFFSET_ENCODE(start); + block_cursor++; + break; + default: + panic("%d-word space map entries are not supported", + words); + break; + } + sm->sm_phys->smp_objsize += words * sizeof (uint64_t); + + start += run_len; + size -= run_len; + } + ASSERT0(size); + +} + +/* + * Note: The space map's dbuf must be dirty for the changes in sm_phys to + * take effect. 
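+ *
+ * For example, a segment longer than SM_RUN_MAX << sm_shift bytes
+ * cannot be described by a single one-word entry; it is split into
+ * multiple entries (or, when SPACEMAP_V2 is active, written as a
+ * two-word entry with the larger SM2_RUN_MAX limit).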
+ */ +static void +space_map_write_impl(space_map_t *sm, range_tree_t *rt, maptype_t maptype, + uint64_t vdev_id, dmu_tx_t *tx) +{ + spa_t *spa = tx->tx_pool->dp_spa; + dmu_buf_t *db; + + space_map_write_intro_debug(sm, maptype, tx); + +#ifdef DEBUG + /* + * We do this right after we write the intro debug entry + * because the estimate does not take it into account. + */ + uint64_t initial_objsize = sm->sm_phys->smp_objsize; + uint64_t estimated_growth = + space_map_estimate_optimal_size(sm, rt, SM_NO_VDEVID); + uint64_t estimated_final_objsize = initial_objsize + estimated_growth; +#endif + + /* + * Find the offset right after the last word in the space map + * and use that to get a hold of the last block, so we can + * start appending to it. + */ + uint64_t next_word_offset = sm->sm_phys->smp_objsize; + VERIFY0(dmu_buf_hold(sm->sm_os, space_map_object(sm), + next_word_offset, FTAG, &db, DMU_READ_PREFETCH)); + ASSERT3U(db->db_size, ==, sm->sm_blksz); + + dmu_buf_will_dirty(db, tx); + + avl_tree_t *t = &rt->rt_root; + for (range_seg_t *rs = avl_first(t); rs != NULL; rs = AVL_NEXT(t, rs)) { + uint64_t offset = (rs->rs_start - sm->sm_start) >> sm->sm_shift; + uint64_t length = (rs->rs_end - rs->rs_start) >> sm->sm_shift; + uint8_t words = 1; + + /* + * We only write two-word entries when both of the following + * are true: + * + * [1] The feature is enabled. + * [2] The offset or run is too big for a single-word entry, + * or the vdev_id is set (meaning not equal to + * SM_NO_VDEVID). + * + * Note that for purposes of testing we've added the case that + * we write two-word entries occasionally when the feature is + * enabled and zfs_force_some_double_word_sm_entries has been + * set. + */ + if (spa_feature_is_active(spa, SPA_FEATURE_SPACEMAP_V2) && + (offset >= (1ULL << SM_OFFSET_BITS) || + length > SM_RUN_MAX || + vdev_id != SM_NO_VDEVID || + (zfs_force_some_double_word_sm_entries && + spa_get_random(100) == 0))) + words = 2; + + space_map_write_seg(sm, rs, maptype, vdev_id, words, + &db, FTAG, tx); + } + + dmu_buf_rele(db, FTAG); + +#ifdef DEBUG + /* + * We expect our estimation to be based on the worst case + * scenario [see comment in space_map_estimate_optimal_size()]. + * Therefore we expect the actual objsize to be equal or less + * than whatever we estimated it to be. + */ + ASSERT3U(estimated_final_objsize, >=, sm->sm_phys->smp_objsize); +#endif +} + +/* + * Note: This function manipulates the state of the given space map but + * does not hold any locks implicitly. Thus the caller is responsible + * for synchronizing writes to the space map. + */ +void +space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype, + uint64_t vdev_id, dmu_tx_t *tx) +{ + objset_t *os = sm->sm_os; + + ASSERT(dsl_pool_sync_context(dmu_objset_pool(os))); + VERIFY3U(space_map_object(sm), !=, 0); + + dmu_buf_will_dirty(sm->sm_dbuf, tx); + + /* + * This field is no longer necessary since the in-core space map + * now contains the object number but is maintained for backwards + * compatibility. 
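+ * (Presumably older consumers located the space map through the
+ * on-disk smp_object field, so it is still kept in sync here.)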
+ */ + sm->sm_phys->smp_object = sm->sm_object; + + if (range_tree_is_empty(rt)) { + VERIFY3U(sm->sm_object, ==, sm->sm_phys->smp_object); + return; + } + + if (maptype == SM_ALLOC) + sm->sm_phys->smp_alloc += range_tree_space(rt); + else + sm->sm_phys->smp_alloc -= range_tree_space(rt); + + uint64_t nodes = avl_numnodes(&rt->rt_root); + uint64_t rt_space = range_tree_space(rt); + + space_map_write_impl(sm, rt, maptype, vdev_id, tx); + + /* + * Ensure that the space_map's accounting wasn't changed + * while we were in the middle of writing it out. + */ + VERIFY3U(nodes, ==, avl_numnodes(&rt->rt_root)); + VERIFY3U(range_tree_space(rt), ==, rt_space); +} + +static int +space_map_open_impl(space_map_t *sm) +{ + int error; + u_longlong_t blocks; + + error = dmu_bonus_hold(sm->sm_os, sm->sm_object, sm, &sm->sm_dbuf); + if (error) + return (error); + + dmu_object_size_from_db(sm->sm_dbuf, &sm->sm_blksz, &blocks); + sm->sm_phys = sm->sm_dbuf->db_data; + return (0); +} + +int +space_map_open(space_map_t **smp, objset_t *os, uint64_t object, + uint64_t start, uint64_t size, uint8_t shift) +{ + space_map_t *sm; + int error; + + ASSERT(*smp == NULL); + ASSERT(os != NULL); + ASSERT(object != 0); + + sm = kmem_zalloc(sizeof (space_map_t), KM_SLEEP); + + sm->sm_start = start; + sm->sm_size = size; + sm->sm_shift = shift; + sm->sm_os = os; + sm->sm_object = object; + + error = space_map_open_impl(sm); + if (error != 0) { + space_map_close(sm); + return (error); + } + *smp = sm; + + return (0); +} + +void +space_map_close(space_map_t *sm) +{ + if (sm == NULL) + return; + + if (sm->sm_dbuf != NULL) + dmu_buf_rele(sm->sm_dbuf, sm); + sm->sm_dbuf = NULL; + sm->sm_phys = NULL; + + kmem_free(sm, sizeof (*sm)); +} + +void +space_map_truncate(space_map_t *sm, int blocksize, dmu_tx_t *tx) +{ + objset_t *os = sm->sm_os; + spa_t *spa = dmu_objset_spa(os); + dmu_object_info_t doi; + + ASSERT(dsl_pool_sync_context(dmu_objset_pool(os))); + ASSERT(dmu_tx_is_syncing(tx)); + VERIFY3U(dmu_tx_get_txg(tx), <=, spa_final_dirty_txg(spa)); + + dmu_object_info_from_db(sm->sm_dbuf, &doi); + + /* + * If the space map has the wrong bonus size (because + * SPA_FEATURE_SPACEMAP_HISTOGRAM has recently been enabled), or + * the wrong block size (because space_map_blksz has changed), + * free and re-allocate its object with the updated sizes. + * + * Otherwise, just truncate the current object. + */ + if ((spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) && + doi.doi_bonus_size != sizeof (space_map_phys_t)) || + doi.doi_data_block_size != blocksize || + doi.doi_metadata_block_size != 1 << space_map_ibs) { + zfs_dbgmsg("txg %llu, spa %s, sm %p, reallocating " + "object[%llu]: old bonus %u, old blocksz %u", + dmu_tx_get_txg(tx), spa_name(spa), sm, sm->sm_object, + doi.doi_bonus_size, doi.doi_data_block_size); + + space_map_free(sm, tx); + dmu_buf_rele(sm->sm_dbuf, sm); + + sm->sm_object = space_map_alloc(sm->sm_os, blocksize, tx); + VERIFY0(space_map_open_impl(sm)); + } else { + VERIFY0(dmu_free_range(os, space_map_object(sm), 0, -1ULL, tx)); + + /* + * If the spacemap is reallocated, its histogram + * will be reset. Do the same in the common case so that + * bugs related to the uncommon case do not go unnoticed. + */ + bzero(sm->sm_phys->smp_histogram, + sizeof (sm->sm_phys->smp_histogram)); + } + + dmu_buf_will_dirty(sm->sm_dbuf, tx); + sm->sm_phys->smp_objsize = 0; + sm->sm_phys->smp_alloc = 0; +} + +/* + * Update the in-core space_map allocation and length values. 
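+ * Until this is called, space_map_length() and space_map_allocated()
+ * continue to return the previously captured values.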
+ */ +void +space_map_update(space_map_t *sm) +{ + if (sm == NULL) + return; + + sm->sm_alloc = sm->sm_phys->smp_alloc; + sm->sm_length = sm->sm_phys->smp_objsize; +} + +uint64_t +space_map_alloc(objset_t *os, int blocksize, dmu_tx_t *tx) +{ + spa_t *spa = dmu_objset_spa(os); + uint64_t object; + int bonuslen; + + if (spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) { + spa_feature_incr(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM, tx); + bonuslen = sizeof (space_map_phys_t); + ASSERT3U(bonuslen, <=, dmu_bonus_max()); + } else { + bonuslen = SPACE_MAP_SIZE_V0; + } + + object = dmu_object_alloc_ibs(os, DMU_OT_SPACE_MAP, blocksize, + space_map_ibs, DMU_OT_SPACE_MAP_HEADER, bonuslen, tx); + + return (object); +} + +void +space_map_free_obj(objset_t *os, uint64_t smobj, dmu_tx_t *tx) +{ + spa_t *spa = dmu_objset_spa(os); + if (spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) { + dmu_object_info_t doi; + + VERIFY0(dmu_object_info(os, smobj, &doi)); + if (doi.doi_bonus_size != SPACE_MAP_SIZE_V0) { + spa_feature_decr(spa, + SPA_FEATURE_SPACEMAP_HISTOGRAM, tx); + } + } + + VERIFY0(dmu_object_free(os, smobj, tx)); +} + +void +space_map_free(space_map_t *sm, dmu_tx_t *tx) +{ + if (sm == NULL) + return; + + space_map_free_obj(sm->sm_os, space_map_object(sm), tx); + sm->sm_object = 0; +} + +/* + * Given a range tree, it makes a worst-case estimate of how much + * space would the tree's segments take if they were written to + * the given space map. + */ +uint64_t +space_map_estimate_optimal_size(space_map_t *sm, range_tree_t *rt, + uint64_t vdev_id) +{ + spa_t *spa = dmu_objset_spa(sm->sm_os); + uint64_t shift = sm->sm_shift; + uint64_t *histogram = rt->rt_histogram; + uint64_t entries_for_seg = 0; + + /* + * In order to get a quick estimate of the optimal size that this + * range tree would have on-disk as a space map, we iterate through + * its histogram buckets instead of iterating through its nodes. + * + * Note that this is a highest-bound/worst-case estimate for the + * following reasons: + * + * 1] We assume that we always add a debug padding for each block + * we write and we also assume that we start at the last word + * of a block attempting to write a two-word entry. + * 2] Rounding up errors due to the way segments are distributed + * in the buckets of the range tree's histogram. + * 3] The activation of zfs_force_some_double_word_sm_entries + * (tunable) when testing. + * + * = Math and Rounding Errors = + * + * rt_histogram[i] bucket of a range tree represents the number + * of entries in [2^i, (2^(i+1))-1] of that range_tree. Given + * that, we want to divide the buckets into groups: Buckets that + * can be represented using a single-word entry, ones that can + * be represented with a double-word entry, and ones that can + * only be represented with multiple two-word entries. + * + * [Note that if the new encoding feature is not enabled there + * are only two groups: single-word entry buckets and multiple + * single-word entry buckets. The information below assumes + * two-word entries enabled, but it can easily applied when + * the feature is not enabled] + * + * To find the highest bucket that can be represented with a + * single-word entry we look at the maximum run that such entry + * can have, which is 2^(SM_RUN_BITS + sm_shift) [remember that + * the run of a space map entry is shifted by sm_shift, thus we + * add it to the exponent]. 
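+ * (For instance, with sm_shift = 9 the largest run that a
+ * single-word entry can express is 2^(SM_RUN_BITS + 9) bytes.)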
This way, excluding the value of the + * maximum run that can be represented by a single-word entry, + * all runs that are smaller exist in buckets 0 to + * SM_RUN_BITS + shift - 1. + * + * To find the highest bucket that can be represented with a + * double-word entry, we follow the same approach. Finally, any + * bucket higher than that are represented with multiple two-word + * entries. To be more specific, if the highest bucket whose + * segments can be represented with a single two-word entry is X, + * then bucket X+1 will need 2 two-word entries for each of its + * segments, X+2 will need 4, X+3 will need 8, ...etc. + * + * With all of the above we make our estimation based on bucket + * groups. There is a rounding error though. As we mentioned in + * the example with the one-word entry, the maximum run that can + * be represented in a one-word entry 2^(SM_RUN_BITS + shift) is + * not part of bucket SM_RUN_BITS + shift - 1. Thus, segments of + * that length fall into the next bucket (and bucket group) where + * we start counting two-word entries and this is one more reason + * why the estimated size may end up being bigger than the actual + * size written. + */ + uint64_t size = 0; + uint64_t idx = 0; + + if (!spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2) || + (vdev_id == SM_NO_VDEVID && sm->sm_size < SM_OFFSET_MAX)) { + + /* + * If we are trying to force some double word entries just + * assume the worst-case of every single word entry being + * written as a double word entry. + */ + uint64_t entry_size = + (spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2) && + zfs_force_some_double_word_sm_entries) ? + (2 * sizeof (uint64_t)) : sizeof (uint64_t); + + uint64_t single_entry_max_bucket = SM_RUN_BITS + shift - 1; + for (; idx <= single_entry_max_bucket; idx++) + size += histogram[idx] * entry_size; + + if (!spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2)) { + for (; idx < RANGE_TREE_HISTOGRAM_SIZE; idx++) { + ASSERT3U(idx, >=, single_entry_max_bucket); + entries_for_seg = + 1ULL << (idx - single_entry_max_bucket); + size += histogram[idx] * + entries_for_seg * entry_size; + } + return (size); + } + } + + ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2)); + + uint64_t double_entry_max_bucket = SM2_RUN_BITS + shift - 1; + for (; idx <= double_entry_max_bucket; idx++) + size += histogram[idx] * 2 * sizeof (uint64_t); + + for (; idx < RANGE_TREE_HISTOGRAM_SIZE; idx++) { + ASSERT3U(idx, >=, double_entry_max_bucket); + entries_for_seg = 1ULL << (idx - double_entry_max_bucket); + size += histogram[idx] * + entries_for_seg * 2 * sizeof (uint64_t); + } + + /* + * Assume the worst case where we start with the padding at the end + * of the current block and we add an extra padding entry at the end + * of all subsequent blocks. + */ + size += ((size / sm->sm_blksz) + 1) * sizeof (uint64_t); + + return (size); +} + +uint64_t +space_map_object(space_map_t *sm) +{ + return (sm != NULL ? sm->sm_object : 0); +} + +/* + * Returns the already synced, on-disk allocated space. + */ +uint64_t +space_map_allocated(space_map_t *sm) +{ + return (sm != NULL ? sm->sm_alloc : 0); +} + +/* + * Returns the already synced, on-disk length; + */ +uint64_t +space_map_length(space_map_t *sm) +{ + return (sm != NULL ? sm->sm_length : 0); +} + +/* + * Returns the allocated space that is currently syncing. 
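+ * That is, the difference between the not-yet-synced smp_alloc
+ * and the value last captured by space_map_update().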
+ */ +int64_t +space_map_alloc_delta(space_map_t *sm) +{ + if (sm == NULL) + return (0); + ASSERT(sm->sm_dbuf != NULL); + return (sm->sm_phys->smp_alloc - space_map_allocated(sm)); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_reftree.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_reftree.c new file mode 100644 index 000000000000..aa289ba1061d --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_reftree.c @@ -0,0 +1,149 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright (c) 2013, 2015 by Delphix. All rights reserved. + */ + +#include <sys/zfs_context.h> +#include <sys/range_tree.h> +#include <sys/space_reftree.h> + +/* + * Space reference trees. + * + * A range tree is a collection of integers. Every integer is either + * in the tree, or it's not. A space reference tree generalizes + * the idea: it allows its members to have arbitrary reference counts, + * as opposed to the implicit reference count of 0 or 1 in a range tree. + * This representation comes in handy when computing the union or + * intersection of multiple space maps. For example, the union of + * N range trees is the subset of the reference tree with refcnt >= 1. + * The intersection of N range trees is the subset with refcnt >= N. + * + * [It's very much like a Fourier transform. Unions and intersections + * are hard to perform in the 'range tree domain', so we convert the trees + * into the 'reference count domain', where it's trivial, then invert.] + * + * vdev_dtl_reassess() uses computations of this form to determine + * DTL_MISSING and DTL_OUTAGE for interior vdevs -- e.g. a RAID-Z vdev + * has an outage wherever refcnt >= vdev_nparity + 1, and a mirror vdev + * has an outage wherever refcnt >= vdev_children. 
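+ *
+ * A sketch of computing the union of two range trees rt1 and rt2
+ * into rt, using only the functions below (minref = 2 would yield
+ * the intersection instead):
+ *
+ *	avl_tree_t t;
+ *	space_reftree_create(&t);
+ *	space_reftree_add_map(&t, rt1, 1);
+ *	space_reftree_add_map(&t, rt2, 1);
+ *	space_reftree_generate_map(&t, rt, 1);
+ *	space_reftree_destroy(&t);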
+ */ +static int +space_reftree_compare(const void *x1, const void *x2) +{ + const space_ref_t *sr1 = (const space_ref_t *)x1; + const space_ref_t *sr2 = (const space_ref_t *)x2; + + int cmp = AVL_CMP(sr1->sr_offset, sr2->sr_offset); + if (likely(cmp)) + return (cmp); + + return (AVL_PCMP(sr1, sr2)); +} + +void +space_reftree_create(avl_tree_t *t) +{ + avl_create(t, space_reftree_compare, + sizeof (space_ref_t), offsetof(space_ref_t, sr_node)); +} + +void +space_reftree_destroy(avl_tree_t *t) +{ + space_ref_t *sr; + void *cookie = NULL; + + while ((sr = avl_destroy_nodes(t, &cookie)) != NULL) + kmem_free(sr, sizeof (*sr)); + + avl_destroy(t); +} + +static void +space_reftree_add_node(avl_tree_t *t, uint64_t offset, int64_t refcnt) +{ + space_ref_t *sr; + + sr = kmem_alloc(sizeof (*sr), KM_SLEEP); + sr->sr_offset = offset; + sr->sr_refcnt = refcnt; + + avl_add(t, sr); +} + +void +space_reftree_add_seg(avl_tree_t *t, uint64_t start, uint64_t end, + int64_t refcnt) +{ + space_reftree_add_node(t, start, refcnt); + space_reftree_add_node(t, end, -refcnt); +} + +/* + * Convert (or add) a range tree into a reference tree. + */ +void +space_reftree_add_map(avl_tree_t *t, range_tree_t *rt, int64_t refcnt) +{ + range_seg_t *rs; + + for (rs = avl_first(&rt->rt_root); rs; rs = AVL_NEXT(&rt->rt_root, rs)) + space_reftree_add_seg(t, rs->rs_start, rs->rs_end, refcnt); +} + +/* + * Convert a reference tree into a range tree. The range tree will contain + * all members of the reference tree for which refcnt >= minref. + */ +void +space_reftree_generate_map(avl_tree_t *t, range_tree_t *rt, int64_t minref) +{ + uint64_t start = -1ULL; + int64_t refcnt = 0; + space_ref_t *sr; + + range_tree_vacate(rt, NULL, NULL); + + for (sr = avl_first(t); sr != NULL; sr = AVL_NEXT(t, sr)) { + refcnt += sr->sr_refcnt; + if (refcnt >= minref) { + if (start == -1ULL) { + start = sr->sr_offset; + } + } else { + if (start != -1ULL) { + uint64_t end = sr->sr_offset; + ASSERT(start <= end); + if (end > start) + range_tree_add(rt, start, end - start); + start = -1ULL; + } + } + } + ASSERT(refcnt == 0); + ASSERT(start == -1ULL); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/abd.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/abd.h new file mode 100644 index 000000000000..07da1e13cc7d --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/abd.h @@ -0,0 +1,154 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2014 by Chunwei Chen. All rights reserved. + * Copyright (c) 2016 by Delphix. All rights reserved. + */ + +#ifndef _ABD_H +#define _ABD_H + +#include <sys/isa_defs.h> +#ifdef illumos +#include <sys/int_types.h> +#else +#include <sys/stdint.h> +#endif +#include <sys/debug.h> +#include <sys/refcount.h> +#ifdef _KERNEL +#include <sys/uio.h> +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum abd_flags { + ABD_FLAG_LINEAR = 1 << 0, /* is buffer linear (or scattered)? */ + ABD_FLAG_OWNER = 1 << 1, /* does it own its data buffers? */ + ABD_FLAG_META = 1 << 2 /* does this represent FS metadata? 
*/ +} abd_flags_t; + +typedef struct abd { + abd_flags_t abd_flags; + uint_t abd_size; /* excludes scattered abd_offset */ + struct abd *abd_parent; + refcount_t abd_children; + union { + struct abd_scatter { + uint_t abd_offset; + uint_t abd_chunk_size; + void *abd_chunks[]; + } abd_scatter; + struct abd_linear { + void *abd_buf; + } abd_linear; + } abd_u; +} abd_t; + +typedef int abd_iter_func_t(void *, size_t, void *); +typedef int abd_iter_func2_t(void *, void *, size_t, void *); + +extern boolean_t zfs_abd_scatter_enabled; + +inline boolean_t +abd_is_linear(abd_t *abd) +{ + return ((abd->abd_flags & ABD_FLAG_LINEAR) != 0 ? B_TRUE : B_FALSE); +} + +/* + * Allocations and deallocations + */ + +abd_t *abd_alloc(size_t, boolean_t); +abd_t *abd_alloc_linear(size_t, boolean_t); +abd_t *abd_alloc_for_io(size_t, boolean_t); +abd_t *abd_alloc_sametype(abd_t *, size_t); +void abd_free(abd_t *); +abd_t *abd_get_offset(abd_t *, size_t); +abd_t *abd_get_from_buf(void *, size_t); +void abd_put(abd_t *); + +/* + * Conversion to and from a normal buffer + */ + +void *abd_to_buf(abd_t *); +void *abd_borrow_buf(abd_t *, size_t); +void *abd_borrow_buf_copy(abd_t *, size_t); +void abd_return_buf(abd_t *, void *, size_t); +void abd_return_buf_copy(abd_t *, void *, size_t); +void abd_take_ownership_of_buf(abd_t *, boolean_t); +void abd_release_ownership_of_buf(abd_t *); + +/* + * ABD operations + */ + +int abd_iterate_func(abd_t *, size_t, size_t, abd_iter_func_t *, void *); +int abd_iterate_func2(abd_t *, abd_t *, size_t, size_t, size_t, + abd_iter_func2_t *, void *); +void abd_copy_off(abd_t *, abd_t *, size_t, size_t, size_t); +void abd_copy_from_buf_off(abd_t *, const void *, size_t, size_t); +void abd_copy_to_buf_off(void *, abd_t *, size_t, size_t); +int abd_cmp(abd_t *, abd_t *, size_t); +int abd_cmp_buf_off(abd_t *, const void *, size_t, size_t); +void abd_zero_off(abd_t *, size_t, size_t); + +/* + * Wrappers for calls with offsets of 0 + */ + +inline void +abd_copy(abd_t *dabd, abd_t *sabd, size_t size) +{ + abd_copy_off(dabd, sabd, 0, 0, size); +} + +inline void +abd_copy_from_buf(abd_t *abd, const void *buf, size_t size) +{ + abd_copy_from_buf_off(abd, buf, 0, size); +} + +inline void +abd_copy_to_buf(void* buf, abd_t *abd, size_t size) +{ + abd_copy_to_buf_off(buf, abd, 0, size); +} + +inline int +abd_cmp_buf(abd_t *abd, const void *buf, size_t size) +{ + return (abd_cmp_buf_off(abd, buf, 0, size)); +} + +inline void +abd_zero(abd_t *abd, size_t size) +{ + abd_zero_off(abd, 0, size); +} + +/* + * Module lifecycle + */ + +void abd_init(void); +void abd_fini(void); + +#ifdef __cplusplus +} +#endif + +#endif /* _ABD_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/aggsum.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/aggsum.h new file mode 100644 index 000000000000..4d64e05a1adf --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/aggsum.h @@ -0,0 +1,58 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2017 by Delphix. All rights reserved. 
+ */ + +#ifndef _SYS_AGGSUM_H +#define _SYS_AGGSUM_H + +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct aggsum_bucket { + kmutex_t asc_lock; + int64_t asc_delta; + uint64_t asc_borrowed; + uint64_t asc_pad[2]; /* pad out to cache line (64 bytes) */ +} aggsum_bucket_t __aligned(CACHE_LINE_SIZE); + +/* + * Fan out over FANOUT cpus. + */ +typedef struct aggsum { + kmutex_t as_lock; + int64_t as_lower_bound; + int64_t as_upper_bound; + uint64_t as_numbuckets; + aggsum_bucket_t *as_buckets; +} aggsum_t; + +void aggsum_init(aggsum_t *, uint64_t); +void aggsum_fini(aggsum_t *); +int64_t aggsum_lower_bound(aggsum_t *); +int64_t aggsum_upper_bound(aggsum_t *); +int aggsum_compare(aggsum_t *, uint64_t); +uint64_t aggsum_value(aggsum_t *); +void aggsum_add(aggsum_t *, int64_t); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_AGGSUM_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h new file mode 100644 index 000000000000..7be5f9037dd3 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h @@ -0,0 +1,290 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. + */ + +#ifndef _SYS_ARC_H +#define _SYS_ARC_H + +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/zio.h> +#include <sys/dmu.h> +#include <sys/spa.h> + +/* + * Used by arc_flush() to inform arc_evict_state() that it should evict + * all available buffers from the arc state being passed in. + */ +#define ARC_EVICT_ALL -1ULL + +#define HDR_SET_LSIZE(hdr, x) do { \ + ASSERT(IS_P2ALIGNED(x, 1U << SPA_MINBLOCKSHIFT)); \ + (hdr)->b_lsize = ((x) >> SPA_MINBLOCKSHIFT); \ +_NOTE(CONSTCOND) } while (0) + +#define HDR_SET_PSIZE(hdr, x) do { \ + ASSERT(IS_P2ALIGNED((x), 1U << SPA_MINBLOCKSHIFT)); \ + (hdr)->b_psize = ((x) >> SPA_MINBLOCKSHIFT); \ +_NOTE(CONSTCOND) } while (0) + +#define HDR_GET_LSIZE(hdr) ((hdr)->b_lsize << SPA_MINBLOCKSHIFT) +#define HDR_GET_PSIZE(hdr) ((hdr)->b_psize << SPA_MINBLOCKSHIFT) + +typedef struct arc_buf_hdr arc_buf_hdr_t; +typedef struct arc_buf arc_buf_t; +typedef struct arc_prune arc_prune_t; + +/* + * Because the ARC can store encrypted data, errors (not due to bugs) may arise + * while transforming data into its desired format - specifically, when + * decrypting, the key may not be present, or the HMAC may not be correct + * which signifies deliberate tampering with the on-disk state + * (assuming that the checksum was correct). 
If any error occurs, the "buf" + * parameter will be NULL. + */ +typedef void arc_read_done_func_t(zio_t *zio, const zbookmark_phys_t *zb, + const blkptr_t *bp, arc_buf_t *buf, void *priv); +typedef void arc_write_done_func_t(zio_t *zio, arc_buf_t *buf, void *priv); +typedef void arc_prune_func_t(int64_t bytes, void *priv); + +/* Shared module parameters */ +extern uint64_t zfs_arc_average_blocksize; + +/* generic arc_done_func_t's which you can use */ +arc_read_done_func_t arc_bcopy_func; +arc_read_done_func_t arc_getbuf_func; + +/* generic arc_prune_func_t wrapper for callbacks */ +struct arc_prune { + arc_prune_func_t *p_pfunc; + void *p_private; + uint64_t p_adjust; + list_node_t p_node; + refcount_t p_refcnt; +}; + +typedef enum arc_strategy { + ARC_STRATEGY_META_ONLY = 0, /* Evict only meta data buffers */ + ARC_STRATEGY_META_BALANCED = 1, /* Evict data buffers if needed */ +} arc_strategy_t; + +typedef enum arc_flags +{ + /* + * Public flags that can be passed into the ARC by external consumers. + */ + ARC_FLAG_WAIT = 1 << 0, /* perform sync I/O */ + ARC_FLAG_NOWAIT = 1 << 1, /* perform async I/O */ + ARC_FLAG_PREFETCH = 1 << 2, /* I/O is a prefetch */ + ARC_FLAG_CACHED = 1 << 3, /* I/O was in cache */ + ARC_FLAG_L2CACHE = 1 << 4, /* cache in L2ARC */ + ARC_FLAG_PREDICTIVE_PREFETCH = 1 << 5, /* I/O from zfetch */ + ARC_FLAG_PRESCIENT_PREFETCH = 1 << 6, /* long min lifespan */ + + /* + * Private ARC flags. These flags are private ARC only flags that + * will show up in b_flags in the arc_hdr_buf_t. These flags should + * only be set by ARC code. + */ + ARC_FLAG_IN_HASH_TABLE = 1 << 7, /* buffer is hashed */ + ARC_FLAG_IO_IN_PROGRESS = 1 << 8, /* I/O in progress */ + ARC_FLAG_IO_ERROR = 1 << 9, /* I/O failed for buf */ + ARC_FLAG_INDIRECT = 1 << 10, /* indirect block */ + /* Indicates that block was read with ASYNC priority. */ + ARC_FLAG_PRIO_ASYNC_READ = 1 << 11, + ARC_FLAG_L2_WRITING = 1 << 12, /* write in progress */ + ARC_FLAG_L2_EVICTED = 1 << 13, /* evicted during I/O */ + ARC_FLAG_L2_WRITE_HEAD = 1 << 14, /* head of write list */ + /* indicates that the buffer contains metadata (otherwise, data) */ + ARC_FLAG_BUFC_METADATA = 1 << 15, + + /* Flags specifying whether optional hdr struct fields are defined */ + ARC_FLAG_HAS_L1HDR = 1 << 16, + ARC_FLAG_HAS_L2HDR = 1 << 17, + + /* + * Indicates the arc_buf_hdr_t's b_pdata matches the on-disk data. + * This allows the l2arc to use the blkptr's checksum to verify + * the data without having to store the checksum in the hdr. + */ + ARC_FLAG_COMPRESSED_ARC = 1 << 18, + ARC_FLAG_SHARED_DATA = 1 << 19, + + /* + * The arc buffer's compression mode is stored in the top 7 bits of the + * flags field, so these dummy flags are included so that MDB can + * interpret the enum properly. 
+ */ + ARC_FLAG_COMPRESS_0 = 1 << 24, + ARC_FLAG_COMPRESS_1 = 1 << 25, + ARC_FLAG_COMPRESS_2 = 1 << 26, + ARC_FLAG_COMPRESS_3 = 1 << 27, + ARC_FLAG_COMPRESS_4 = 1 << 28, + ARC_FLAG_COMPRESS_5 = 1 << 29, + ARC_FLAG_COMPRESS_6 = 1 << 30 + +} arc_flags_t; + +typedef enum arc_buf_flags { + ARC_BUF_FLAG_SHARED = 1 << 0, + ARC_BUF_FLAG_COMPRESSED = 1 << 1 +} arc_buf_flags_t; + +struct arc_buf { + arc_buf_hdr_t *b_hdr; + arc_buf_t *b_next; + kmutex_t b_evict_lock; + void *b_data; + arc_buf_flags_t b_flags; +}; + +typedef enum arc_buf_contents { + ARC_BUFC_INVALID, /* invalid type */ + ARC_BUFC_DATA, /* buffer contains data */ + ARC_BUFC_METADATA, /* buffer contains metadata */ + ARC_BUFC_NUMTYPES +} arc_buf_contents_t; + +/* + * The following breakdows of arc_size exist for kstat only. + */ +typedef enum arc_space_type { + ARC_SPACE_DATA, + ARC_SPACE_META, + ARC_SPACE_HDRS, + ARC_SPACE_L2HDRS, + ARC_SPACE_DBUF, + ARC_SPACE_DNODE, + ARC_SPACE_BONUS, + ARC_SPACE_NUMTYPES +} arc_space_type_t; + +typedef enum arc_state_type { + ARC_STATE_ANON, + ARC_STATE_MRU, + ARC_STATE_MRU_GHOST, + ARC_STATE_MFU, + ARC_STATE_MFU_GHOST, + ARC_STATE_L2C_ONLY, + ARC_STATE_NUMTYPES +} arc_state_type_t; + +typedef struct arc_buf_info { + arc_state_type_t abi_state_type; + arc_buf_contents_t abi_state_contents; + uint64_t abi_state_index; + uint32_t abi_flags; + uint32_t abi_bufcnt; + uint64_t abi_size; + uint64_t abi_spa; + uint64_t abi_access; + uint32_t abi_mru_hits; + uint32_t abi_mru_ghost_hits; + uint32_t abi_mfu_hits; + uint32_t abi_mfu_ghost_hits; + uint32_t abi_l2arc_hits; + uint32_t abi_holds; + uint64_t abi_l2arc_dattr; + uint64_t abi_l2arc_asize; + enum zio_compress abi_l2arc_compress; +} arc_buf_info_t; + +void arc_space_consume(uint64_t space, arc_space_type_t type); +void arc_space_return(uint64_t space, arc_space_type_t type); +boolean_t arc_is_metadata(arc_buf_t *buf); +enum zio_compress arc_get_compression(arc_buf_t *buf); +int arc_decompress(arc_buf_t *buf); +arc_buf_t *arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type, + int32_t size); +arc_buf_t *arc_alloc_compressed_buf(spa_t *spa, void *tag, + uint64_t psize, uint64_t lsize, enum zio_compress compression_type); +arc_buf_t *arc_loan_buf(spa_t *spa, boolean_t is_metadata, int size); +arc_buf_t *arc_loan_compressed_buf(spa_t *spa, uint64_t psize, uint64_t lsize, + enum zio_compress compression_type); +void arc_return_buf(arc_buf_t *buf, void *tag); +void arc_loan_inuse_buf(arc_buf_t *buf, void *tag); +void arc_buf_destroy(arc_buf_t *buf, void *tag); +void arc_buf_info(arc_buf_t *buf, arc_buf_info_t *abi, int state_index); +int arc_buf_size(arc_buf_t *buf); +int arc_buf_lsize(arc_buf_t *buf); +void arc_buf_access(arc_buf_t *buf); +void arc_release(arc_buf_t *buf, void *tag); +int arc_released(arc_buf_t *buf); +void arc_buf_freeze(arc_buf_t *buf); +void arc_buf_thaw(arc_buf_t *buf); +#ifdef ZFS_DEBUG +int arc_referenced(arc_buf_t *buf); +#endif + +int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, + arc_read_done_func_t *done, void *priv, zio_priority_t priority, + int flags, arc_flags_t *arc_flags, const zbookmark_phys_t *zb); +zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg, + blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp, + arc_write_done_func_t *ready, arc_write_done_func_t *child_ready, + arc_write_done_func_t *physdone, arc_write_done_func_t *done, + void *priv, zio_priority_t priority, int zio_flags, + const zbookmark_phys_t *zb); + +arc_prune_t *arc_add_prune_callback(arc_prune_func_t *func, void 
*priv); +void arc_remove_prune_callback(arc_prune_t *p); +void arc_freed(spa_t *spa, const blkptr_t *bp); + +void arc_flush(spa_t *spa, boolean_t retry); +void arc_tempreserve_clear(uint64_t reserve); +int arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg); + +uint64_t arc_max_bytes(void); +void arc_init(void); +void arc_fini(void); + +/* + * Level 2 ARC + */ + +void l2arc_add_vdev(spa_t *spa, vdev_t *vd); +void l2arc_remove_vdev(vdev_t *vd); +boolean_t l2arc_vdev_present(vdev_t *vd); +void l2arc_init(void); +void l2arc_fini(void); +void l2arc_start(void); +void l2arc_stop(void); + +#ifdef illumos +#ifndef _KERNEL +extern boolean_t arc_watch; +extern int arc_procfd; +#endif +#endif /* illumos */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ARC_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/blkptr.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/blkptr.h new file mode 100644 index 000000000000..77b1b827ac37 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/blkptr.h @@ -0,0 +1,39 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2013 by Delphix. All rights reserved. + */ + +#ifndef _SYS_BLKPTR_H +#define _SYS_BLKPTR_H + +#include <sys/spa.h> +#include <sys/zio.h> + +#ifdef __cplusplus +extern "C" { +#endif + +void encode_embedded_bp_compressed(blkptr_t *, void *, + enum zio_compress, int, int); +void decode_embedded_bp_compressed(const blkptr_t *, void *); +int decode_embedded_bp(const blkptr_t *, void *, int); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_BLKPTR_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bplist.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bplist.h new file mode 100644 index 000000000000..471be9047ec2 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bplist.h @@ -0,0 +1,57 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
+ */ + +#ifndef _SYS_BPLIST_H +#define _SYS_BPLIST_H + +#include <sys/zfs_context.h> +#include <sys/spa.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct bplist_entry { + blkptr_t bpe_blk; + list_node_t bpe_node; +} bplist_entry_t; + +typedef struct bplist { + kmutex_t bpl_lock; + list_t bpl_list; +} bplist_t; + +typedef int bplist_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx); + +void bplist_create(bplist_t *bpl); +void bplist_destroy(bplist_t *bpl); +void bplist_append(bplist_t *bpl, const blkptr_t *bp); +void bplist_iterate(bplist_t *bpl, bplist_itor_t *func, + void *arg, dmu_tx_t *tx); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_BPLIST_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bpobj.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bpobj.h new file mode 100644 index 000000000000..d425e239f6a6 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bpobj.h @@ -0,0 +1,95 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + */ + +#ifndef _SYS_BPOBJ_H +#define _SYS_BPOBJ_H + +#include <sys/dmu.h> +#include <sys/spa.h> +#include <sys/txg.h> +#include <sys/zio.h> +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct bpobj_phys { + /* + * This is the bonus buffer for the dead lists. The object's + * contents is an array of bpo_entries blkptr_t's, representing + * a total of bpo_bytes physical space. 
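+ * (The "bpo_entries" count lives in the bpo_num_blkptrs field
+ * below.)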
+ */ + uint64_t bpo_num_blkptrs; + uint64_t bpo_bytes; + uint64_t bpo_comp; + uint64_t bpo_uncomp; + uint64_t bpo_subobjs; + uint64_t bpo_num_subobjs; +} bpobj_phys_t; + +#define BPOBJ_SIZE_V0 (2 * sizeof (uint64_t)) +#define BPOBJ_SIZE_V1 (4 * sizeof (uint64_t)) + +typedef struct bpobj { + kmutex_t bpo_lock; + objset_t *bpo_os; + uint64_t bpo_object; + int bpo_epb; + uint8_t bpo_havecomp; + uint8_t bpo_havesubobj; + bpobj_phys_t *bpo_phys; + dmu_buf_t *bpo_dbuf; + dmu_buf_t *bpo_cached_dbuf; +} bpobj_t; + +typedef int bpobj_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx); + +uint64_t bpobj_alloc(objset_t *mos, int blocksize, dmu_tx_t *tx); +uint64_t bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx); +void bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx); +void bpobj_decr_empty(objset_t *os, dmu_tx_t *tx); + +int bpobj_open(bpobj_t *bpo, objset_t *mos, uint64_t object); +void bpobj_close(bpobj_t *bpo); +boolean_t bpobj_is_open(const bpobj_t *bpo); + +int bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx); +int bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *, dmu_tx_t *); + +void bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx); +void bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx); + +int bpobj_space(bpobj_t *bpo, + uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); +int bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg, + uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); +boolean_t bpobj_is_empty(bpobj_t *bpo); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_BPOBJ_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bptree.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bptree.h new file mode 100644 index 000000000000..327c128bf493 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bptree.h @@ -0,0 +1,65 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2012, 2014 by Delphix. All rights reserved. 
+ */ + +#ifndef _SYS_BPTREE_H +#define _SYS_BPTREE_H + +#include <sys/spa.h> +#include <sys/zio.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct bptree_phys { + uint64_t bt_begin; + uint64_t bt_end; + uint64_t bt_bytes; + uint64_t bt_comp; + uint64_t bt_uncomp; +} bptree_phys_t; + +typedef struct bptree_entry_phys { + blkptr_t be_bp; + uint64_t be_birth_txg; /* only delete blocks born after this txg */ + zbookmark_phys_t be_zb; /* holds traversal resume point if needed */ +} bptree_entry_phys_t; + +typedef int bptree_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx); + +uint64_t bptree_alloc(objset_t *os, dmu_tx_t *tx); +int bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx); +boolean_t bptree_is_empty(objset_t *os, uint64_t obj); + +void bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg, + uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx); + +int bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, + bptree_itor_t func, void *arg, dmu_tx_t *tx); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_BPTREE_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bqueue.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bqueue.h new file mode 100644 index 000000000000..63722df1bbf3 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bqueue.h @@ -0,0 +1,54 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2014 by Delphix. All rights reserved. + */ + +#ifndef _BQUEUE_H +#define _BQUEUE_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/zfs_context.h> + +typedef struct bqueue { + list_t bq_list; + kmutex_t bq_lock; + kcondvar_t bq_add_cv; + kcondvar_t bq_pop_cv; + uint64_t bq_size; + uint64_t bq_maxsize; + size_t bq_node_offset; +} bqueue_t; + +typedef struct bqueue_node { + list_node_t bqn_node; + uint64_t bqn_size; +} bqueue_node_t; + + +int bqueue_init(bqueue_t *, uint64_t, size_t); +void bqueue_destroy(bqueue_t *); +void bqueue_enqueue(bqueue_t *, void *, uint64_t); +void *bqueue_dequeue(bqueue_t *); +boolean_t bqueue_empty(bqueue_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _BQUEUE_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/cityhash.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/cityhash.h new file mode 100644 index 000000000000..33c3b7bc2532 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/cityhash.h @@ -0,0 +1,41 @@ +// Copyright (c) 2011 Google, Inc. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + + +/* + * Copyright (c) 2017 by Delphix. All rights reserved. + */ + +#ifndef _SYS_CITYHASH_H +#define _SYS_CITYHASH_H + +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +uint64_t cityhash4(uint64_t, uint64_t, uint64_t, uint64_t); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_CITYHASH_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h new file mode 100644 index 000000000000..af9a0a72159b --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h @@ -0,0 +1,416 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + */ + +#ifndef _SYS_DBUF_H +#define _SYS_DBUF_H + +#include <sys/dmu.h> +#include <sys/spa.h> +#include <sys/txg.h> +#include <sys/zio.h> +#include <sys/arc.h> +#include <sys/zfs_context.h> +#include <sys/refcount.h> +#include <sys/zrlock.h> +#include <sys/multilist.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define IN_DMU_SYNC 2 + +/* + * define flags for dbuf_read + */ + +#define DB_RF_MUST_SUCCEED (1 << 0) +#define DB_RF_CANFAIL (1 << 1) +#define DB_RF_HAVESTRUCT (1 << 2) +#define DB_RF_NOPREFETCH (1 << 3) +#define DB_RF_NEVERWAIT (1 << 4) +#define DB_RF_CACHED (1 << 5) + +/* + * The simplified state transition diagram for dbufs looks like: + * + * +----> READ ----+ + * | | + * | V + * (alloc)-->UNCACHED CACHED-->EVICTING-->(free) + * | ^ ^ + * | | | + * +----> FILL ----+ | + * | | + * | | + * +--------> NOFILL -------+ + * + * DB_SEARCH is an invalid state for a dbuf. It is used by dbuf_free_range + * to find all dbufs in a range of a dnode and must be less than any other + * dbuf_states_t (see comment on dn_dbufs in dnode.h). 
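+ * (Hence its value of -1 in the enum below.)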
+ */
+typedef enum dbuf_states {
+ DB_SEARCH = -1,
+ DB_UNCACHED,
+ DB_FILL,
+ DB_NOFILL,
+ DB_READ,
+ DB_CACHED,
+ DB_EVICTING
+} dbuf_states_t;
+
+typedef enum dbuf_cached_state {
+ DB_NO_CACHE = -1,
+ DB_DBUF_CACHE,
+ DB_DBUF_METADATA_CACHE,
+ DB_CACHE_MAX
+} dbuf_cached_state_t;
+
+struct dnode;
+struct dmu_tx;
+
+/*
+ * level = 0 means the user data
+ * level = 1 means the single indirect block
+ * etc.
+ */
+
+struct dmu_buf_impl;
+
+typedef enum override_states {
+ DR_NOT_OVERRIDDEN,
+ DR_IN_DMU_SYNC,
+ DR_OVERRIDDEN
+} override_states_t;
+
+typedef struct dbuf_dirty_record {
+ /* link on our parent's dirty list */
+ list_node_t dr_dirty_node;
+
+ /* transaction group this data will sync in */
+ uint64_t dr_txg;
+
+ /* zio of outstanding write IO */
+ zio_t *dr_zio;
+
+ /* pointer back to our dbuf */
+ struct dmu_buf_impl *dr_dbuf;
+
+ /* pointer to next dirty record */
+ struct dbuf_dirty_record *dr_next;
+
+ /* pointer to parent dirty record */
+ struct dbuf_dirty_record *dr_parent;
+
+ /* How much space was charged to dsl_pool_dirty_space() for this? */
+ unsigned int dr_accounted;
+
+ /* A copy of the bp that points to us */
+ blkptr_t dr_bp_copy;
+
+ union dirty_types {
+ struct dirty_indirect {
+
+ /* protect access to list */
+ kmutex_t dr_mtx;
+
+ /* Our list of dirty children */
+ list_t dr_children;
+ } di;
+ struct dirty_leaf {
+
+ /*
+ * dr_data is set when we dirty the buffer
+ * so that we can retain the pointer even if it
+ * gets COW'd in a subsequent transaction group.
+ */
+ arc_buf_t *dr_data;
+ blkptr_t dr_overridden_by;
+ override_states_t dr_override_state;
+ uint8_t dr_copies;
+ boolean_t dr_nopwrite;
+ } dl;
+ } dt;
+} dbuf_dirty_record_t;
+
+typedef struct dmu_buf_impl {
+ /*
+ * The following members are immutable, with the exception of
+ * db.db_data, which is protected by db_mtx.
+ */
+
+ /* the publicly visible structure */
+ dmu_buf_t db;
+
+ /* the objset we belong to */
+ struct objset *db_objset;
+
+ /*
+ * handle to safely access the dnode we belong to (NULL when evicted)
+ */
+ struct dnode_handle *db_dnode_handle;
+
+ /*
+ * our parent buffer; if the dnode points to us directly,
+ * db_parent == db_dnode_handle->dnh_dnode->dn_dbuf
+ * only accessed by sync thread ???
+ * (NULL when evicted)
+ * May change from NULL to non-NULL under the protection of db_mtx
+ * (see dbuf_check_blkptr())
+ */
+ struct dmu_buf_impl *db_parent;
+
+ /*
+ * link for hash table of all dmu_buf_impl_t's
+ */
+ struct dmu_buf_impl *db_hash_next;
+
+ /* our block number */
+ uint64_t db_blkid;
+
+ /*
+ * Pointer to the blkptr_t which points to us. May be NULL if we
+ * don't have one yet. (NULL when evicted)
+ */
+ blkptr_t *db_blkptr;
+
+ /*
+ * Our indirection level. Data buffers have db_level==0.
+ * Indirect buffers which point to data buffers have
+ * db_level==1, etc. Buffers which contain dnodes have
+ * db_level==0, since the dnodes are stored in a file.
+ */
+ uint8_t db_level;
+
+ /* db_mtx protects the members below */
+ kmutex_t db_mtx;
+
+ /*
+ * Current state of the buffer
+ */
+ dbuf_states_t db_state;
+
+ /*
+ * Refcount accessed by dmu_buf_{hold,rele}.
+ * If nonzero, the buffer can't be destroyed.
+ * Protected by db_mtx.
+ */
+ refcount_t db_holds;
+
+ /* buffer holding our data */
+ arc_buf_t *db_buf;
+
+ kcondvar_t db_changed;
+ dbuf_dirty_record_t *db_data_pending;
+
+ /* pointer to most recent dirty record for this buffer */
+ dbuf_dirty_record_t *db_last_dirty;
+
+ /*
+ * Our link on the owner dnode's dn_dbufs list.
+ * Protected by its dn_dbufs_mtx.
+ */ + avl_node_t db_link; + + /* Link in dbuf_cache or dbuf_metadata_cache */ + multilist_node_t db_cache_link; + + /* Tells us which dbuf cache this dbuf is in, if any */ + dbuf_cached_state_t db_caching_status; + + /* Data which is unique to data (leaf) blocks: */ + + /* User callback information. */ + dmu_buf_user_t *db_user; + + /* + * Evict user data as soon as the dirty and reference + * counts are equal. + */ + uint8_t db_user_immediate_evict; + + /* + * This block was freed while a read or write was + * active. + */ + uint8_t db_freed_in_flight; + + /* + * dnode_evict_dbufs() or dnode_evict_bonus() tried to + * evict this dbuf, but couldn't due to outstanding + * references. Evict once the refcount drops to 0. + */ + uint8_t db_pending_evict; + + uint8_t db_dirtycnt; +} dmu_buf_impl_t; + +/* Note: the dbuf hash table is exposed only for the mdb module */ +#define DBUF_MUTEXES 256 +#define DBUF_HASH_MUTEX(h, idx) (&(h)->hash_mutexes[(idx) & (DBUF_MUTEXES-1)]) +typedef struct dbuf_hash_table { + uint64_t hash_table_mask; + dmu_buf_impl_t **hash_table; + kmutex_t hash_mutexes[DBUF_MUTEXES]; +} dbuf_hash_table_t; + +uint64_t dbuf_whichblock(struct dnode *di, int64_t level, uint64_t offset); + +dmu_buf_impl_t *dbuf_create_tlib(struct dnode *dn, char *data); +void dbuf_create_bonus(struct dnode *dn); +int dbuf_spill_set_blksz(dmu_buf_t *db, uint64_t blksz, dmu_tx_t *tx); +void dbuf_spill_hold(struct dnode *dn, dmu_buf_impl_t **dbp, void *tag); + +void dbuf_rm_spill(struct dnode *dn, dmu_tx_t *tx); + +dmu_buf_impl_t *dbuf_hold(struct dnode *dn, uint64_t blkid, void *tag); +dmu_buf_impl_t *dbuf_hold_level(struct dnode *dn, int level, uint64_t blkid, + void *tag); +int dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid, + boolean_t fail_sparse, boolean_t fail_uncached, + void *tag, dmu_buf_impl_t **dbp); + +void dbuf_prefetch(struct dnode *dn, int64_t level, uint64_t blkid, + zio_priority_t prio, arc_flags_t aflags); + +void dbuf_add_ref(dmu_buf_impl_t *db, void *tag); +boolean_t dbuf_try_add_ref(dmu_buf_t *db, objset_t *os, uint64_t obj, + uint64_t blkid, void *tag); +uint64_t dbuf_refcount(dmu_buf_impl_t *db); + +void dbuf_rele(dmu_buf_impl_t *db, void *tag); +void dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag, boolean_t evicting); + +dmu_buf_impl_t *dbuf_find(struct objset *os, uint64_t object, uint8_t level, + uint64_t blkid); + +int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags); +void dmu_buf_will_not_fill(dmu_buf_t *db, dmu_tx_t *tx); +void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx); +void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx); +void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx); +dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx); +arc_buf_t *dbuf_loan_arcbuf(dmu_buf_impl_t *db); +void dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data, + bp_embedded_type_t etype, enum zio_compress comp, + int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx); + +void dbuf_destroy(dmu_buf_impl_t *db); + +void dbuf_setdirty(dmu_buf_impl_t *db, dmu_tx_t *tx); +void dbuf_unoverride(dbuf_dirty_record_t *dr); +void dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx); +void dbuf_release_bp(dmu_buf_impl_t *db); + +boolean_t dbuf_can_remap(const dmu_buf_impl_t *buf); + +void dbuf_free_range(struct dnode *dn, uint64_t start, uint64_t end, + struct dmu_tx *); + +void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx); + +void dbuf_stats_init(dbuf_hash_table_t *hash); +void dbuf_stats_destroy(void); + 
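+/*
+ * Illustrative sketch (not part of the original header): the typical
+ * hold/read/release pattern using the routines declared above, with
+ * error handling abbreviated.  It assumes "dn" is a held dnode with
+ * dn_struct_rwlock held as dbuf_hold requires, "offset" is a byte
+ * offset into the object, and FTAG is the usual ZFS hold-tag
+ * convention.
+ *
+ *    dmu_buf_impl_t *db;
+ *    int err;
+ *
+ *    db = dbuf_hold(dn, dbuf_whichblock(dn, 0, offset), FTAG);
+ *    err = dbuf_read(db, NULL, DB_RF_CANFAIL);
+ *    if (err == 0) {
+ *        ... db->db.db_data now holds the block contents ...
+ *    }
+ *    dbuf_rele(db, FTAG);
+ */
+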
+#define DB_DNODE(_db) ((_db)->db_dnode_handle->dnh_dnode) +#define DB_DNODE_LOCK(_db) ((_db)->db_dnode_handle->dnh_zrlock) +#define DB_DNODE_ENTER(_db) (zrl_add(&DB_DNODE_LOCK(_db))) +#define DB_DNODE_EXIT(_db) (zrl_remove(&DB_DNODE_LOCK(_db))) +#define DB_DNODE_HELD(_db) (!zrl_is_zero(&DB_DNODE_LOCK(_db))) + +void dbuf_init(void); +void dbuf_fini(void); + +boolean_t dbuf_is_metadata(dmu_buf_impl_t *db); + +#define DBUF_GET_BUFC_TYPE(_db) \ + (dbuf_is_metadata(_db) ? ARC_BUFC_METADATA : ARC_BUFC_DATA) + +#define DBUF_IS_CACHEABLE(_db) \ + ((_db)->db_objset->os_primary_cache == ZFS_CACHE_ALL || \ + (dbuf_is_metadata(_db) && \ + ((_db)->db_objset->os_primary_cache == ZFS_CACHE_METADATA))) + +#define DBUF_IS_L2CACHEABLE(_db) \ + ((_db)->db_objset->os_secondary_cache == ZFS_CACHE_ALL || \ + (dbuf_is_metadata(_db) && \ + ((_db)->db_objset->os_secondary_cache == ZFS_CACHE_METADATA))) + +#define DNODE_LEVEL_IS_L2CACHEABLE(_dn, _level) \ + ((_dn)->dn_objset->os_secondary_cache == ZFS_CACHE_ALL || \ + (((_level) > 0 || \ + DMU_OT_IS_METADATA((_dn)->dn_handle->dnh_dnode->dn_type)) && \ + ((_dn)->dn_objset->os_secondary_cache == ZFS_CACHE_METADATA))) + +#ifdef ZFS_DEBUG + +/* + * There should be a ## between the string literal and fmt, to make it + * clear that we're joining two strings together, but gcc does not + * support that preprocessor token. + */ +#define dprintf_dbuf(dbuf, fmt, ...) do { \ + if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ + char __db_buf[32]; \ + uint64_t __db_obj = (dbuf)->db.db_object; \ + if (__db_obj == DMU_META_DNODE_OBJECT) \ + (void) strcpy(__db_buf, "mdn"); \ + else \ + (void) snprintf(__db_buf, sizeof (__db_buf), "%lld", \ + (u_longlong_t)__db_obj); \ + dprintf_ds((dbuf)->db_objset->os_dsl_dataset, \ + "obj=%s lvl=%u blkid=%lld " fmt, \ + __db_buf, (dbuf)->db_level, \ + (u_longlong_t)(dbuf)->db_blkid, __VA_ARGS__); \ + } \ +_NOTE(CONSTCOND) } while (0) + +#define dprintf_dbuf_bp(db, bp, fmt, ...) do { \ + if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ + char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP); \ + snprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, bp); \ + dprintf_dbuf(db, fmt " %s\n", __VA_ARGS__, __blkbuf); \ + kmem_free(__blkbuf, BP_SPRINTF_LEN); \ + } \ +_NOTE(CONSTCOND) } while (0) + +#define DBUF_VERIFY(db) dbuf_verify(db) + +#else + +#define dprintf_dbuf(db, fmt, ...) +#define dprintf_dbuf_bp(db, bp, fmt, ...) +#define DBUF_VERIFY(db) + +#endif + + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DBUF_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/ddt.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/ddt.h new file mode 100644 index 000000000000..2468a1485fd3 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/ddt.h @@ -0,0 +1,248 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2016 by Delphix. All rights reserved. + */ + +#ifndef _SYS_DDT_H +#define _SYS_DDT_H + +#include <sys/sysmacros.h> +#include <sys/types.h> +#include <sys/fs/zfs.h> +#include <sys/zio.h> +#include <sys/dmu.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct abd; + +/* + * On-disk DDT formats, in the desired search order (newest version first). + */ +enum ddt_type { + DDT_TYPE_ZAP = 0, + DDT_TYPES +}; + +/* + * DDT classes, in the desired search order (highest replication level first). + */ +enum ddt_class { + DDT_CLASS_DITTO = 0, + DDT_CLASS_DUPLICATE, + DDT_CLASS_UNIQUE, + DDT_CLASSES +}; + +#define DDT_TYPE_CURRENT 0 + +#define DDT_COMPRESS_BYTEORDER_MASK 0x80 +#define DDT_COMPRESS_FUNCTION_MASK 0x7f + +/* + * On-disk ddt entry: key (name) and physical storage (value). + */ +typedef struct ddt_key { + zio_cksum_t ddk_cksum; /* 256-bit block checksum */ + /* + * Encoded with logical & physical size, and compression, as follows: + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * | 0 | 0 | 0 | comp | PSIZE | LSIZE | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + */ + uint64_t ddk_prop; +} ddt_key_t; + +#define DDK_GET_LSIZE(ddk) \ + BF64_GET_SB((ddk)->ddk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1) +#define DDK_SET_LSIZE(ddk, x) \ + BF64_SET_SB((ddk)->ddk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1, x) + +#define DDK_GET_PSIZE(ddk) \ + BF64_GET_SB((ddk)->ddk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1) +#define DDK_SET_PSIZE(ddk, x) \ + BF64_SET_SB((ddk)->ddk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1, x) + +#define DDK_GET_COMPRESS(ddk) BF64_GET((ddk)->ddk_prop, 32, 8) +#define DDK_SET_COMPRESS(ddk, x) BF64_SET((ddk)->ddk_prop, 32, 8, x) + +#define DDT_KEY_WORDS (sizeof (ddt_key_t) / sizeof (uint64_t)) + +typedef struct ddt_phys { + dva_t ddp_dva[SPA_DVAS_PER_BP]; + uint64_t ddp_refcnt; + uint64_t ddp_phys_birth; +} ddt_phys_t; + +enum ddt_phys_type { + DDT_PHYS_DITTO = 0, + DDT_PHYS_SINGLE = 1, + DDT_PHYS_DOUBLE = 2, + DDT_PHYS_TRIPLE = 3, + DDT_PHYS_TYPES +}; + +/* + * In-core ddt entry + */ +struct ddt_entry { + ddt_key_t dde_key; + ddt_phys_t dde_phys[DDT_PHYS_TYPES]; + zio_t *dde_lead_zio[DDT_PHYS_TYPES]; + struct abd *dde_repair_abd; + enum ddt_type dde_type; + enum ddt_class dde_class; + uint8_t dde_loading; + uint8_t dde_loaded; + kcondvar_t dde_cv; + avl_node_t dde_node; +}; + +/* + * In-core ddt + */ +struct ddt { + kmutex_t ddt_lock; + avl_tree_t ddt_tree; + avl_tree_t ddt_repair_tree; + enum zio_checksum ddt_checksum; + spa_t *ddt_spa; + objset_t *ddt_os; + uint64_t ddt_stat_object; + uint64_t ddt_object[DDT_TYPES][DDT_CLASSES]; + ddt_histogram_t ddt_histogram[DDT_TYPES][DDT_CLASSES]; + ddt_histogram_t ddt_histogram_cache[DDT_TYPES][DDT_CLASSES]; + ddt_object_t ddt_object_stats[DDT_TYPES][DDT_CLASSES]; + avl_node_t ddt_node; +}; + +/* + * In-core and on-disk bookmark for DDT walks + */ +typedef struct ddt_bookmark { + uint64_t ddb_class; + uint64_t ddb_type; + uint64_t ddb_checksum; + uint64_t ddb_cursor; +} ddt_bookmark_t; + +/* + * Ops vector to access a specific DDT object type. 
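+ *
+ * DDT_TYPE_ZAP is currently the only type, implemented by the
+ * ddt_zap_ops vector declared at the bottom of this header.  By way of
+ * illustration only (the dispatch table itself lives in ddt.c, not in
+ * this header), a lookup through the vector amounts to roughly:
+ *
+ *    ops->ddt_op_lookup(ddt->ddt_os, ddt->ddt_object[type][class], dde);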
+ */ +typedef struct ddt_ops { + char ddt_op_name[32]; + int (*ddt_op_create)(objset_t *os, uint64_t *object, dmu_tx_t *tx, + boolean_t prehash); + int (*ddt_op_destroy)(objset_t *os, uint64_t object, dmu_tx_t *tx); + int (*ddt_op_lookup)(objset_t *os, uint64_t object, ddt_entry_t *dde); + void (*ddt_op_prefetch)(objset_t *os, uint64_t object, + ddt_entry_t *dde); + int (*ddt_op_update)(objset_t *os, uint64_t object, ddt_entry_t *dde, + dmu_tx_t *tx); + int (*ddt_op_remove)(objset_t *os, uint64_t object, ddt_entry_t *dde, + dmu_tx_t *tx); + int (*ddt_op_walk)(objset_t *os, uint64_t object, ddt_entry_t *dde, + uint64_t *walk); + int (*ddt_op_count)(objset_t *os, uint64_t object, uint64_t *count); +} ddt_ops_t; + +#define DDT_NAMELEN 80 + +extern void ddt_object_name(ddt_t *ddt, enum ddt_type type, + enum ddt_class cls, char *name); +extern int ddt_object_walk(ddt_t *ddt, enum ddt_type type, + enum ddt_class cls, uint64_t *walk, ddt_entry_t *dde); +extern int ddt_object_count(ddt_t *ddt, enum ddt_type type, + enum ddt_class cls, uint64_t *count); +extern int ddt_object_info(ddt_t *ddt, enum ddt_type type, + enum ddt_class cls, dmu_object_info_t *); +extern boolean_t ddt_object_exists(ddt_t *ddt, enum ddt_type type, + enum ddt_class cls); + +extern void ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, + uint64_t txg); +extern void ddt_bp_create(enum zio_checksum checksum, const ddt_key_t *ddk, + const ddt_phys_t *ddp, blkptr_t *bp); + +extern void ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp); + +extern void ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp); +extern void ddt_phys_clear(ddt_phys_t *ddp); +extern void ddt_phys_addref(ddt_phys_t *ddp); +extern void ddt_phys_decref(ddt_phys_t *ddp); +extern void ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, + uint64_t txg); +extern ddt_phys_t *ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp); +extern uint64_t ddt_phys_total_refcnt(const ddt_entry_t *dde); + +extern void ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg); + +extern void ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src); +extern void ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh); +extern boolean_t ddt_histogram_empty(const ddt_histogram_t *ddh); +extern void ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo); +extern void ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh); +extern void ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total); + +extern uint64_t ddt_get_dedup_dspace(spa_t *spa); +extern uint64_t ddt_get_pool_dedup_ratio(spa_t *spa); + +extern int ddt_ditto_copies_needed(ddt_t *ddt, ddt_entry_t *dde, + ddt_phys_t *ddp_willref); +extern int ddt_ditto_copies_present(ddt_entry_t *dde); + +extern size_t ddt_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len); +extern void ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len); + +extern ddt_t *ddt_select(spa_t *spa, const blkptr_t *bp); +extern void ddt_enter(ddt_t *ddt); +extern void ddt_exit(ddt_t *ddt); +extern ddt_entry_t *ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add); +extern void ddt_prefetch(spa_t *spa, const blkptr_t *bp); +extern void ddt_remove(ddt_t *ddt, ddt_entry_t *dde); + +extern boolean_t ddt_class_contains(spa_t *spa, enum ddt_class max_class, + const blkptr_t *bp); + +extern ddt_entry_t *ddt_repair_start(ddt_t *ddt, const blkptr_t *bp); +extern void ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde); + +extern int ddt_entry_compare(const void *x1, const void *x2); + +extern void 
ddt_create(spa_t *spa); +extern int ddt_load(spa_t *spa); +extern void ddt_unload(spa_t *spa); +extern void ddt_sync(spa_t *spa, uint64_t txg); +extern int ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde); +extern int ddt_object_update(ddt_t *ddt, enum ddt_type type, + enum ddt_class cls, ddt_entry_t *dde, dmu_tx_t *tx); + +extern const ddt_ops_t ddt_zap_ops; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DDT_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h new file mode 100644 index 000000000000..b23ce0194378 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h @@ -0,0 +1,1015 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2017 by Delphix. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. + * Copyright 2013 DEY Storage Systems, Inc. + * Copyright 2014 HybridCluster. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright 2013 Saso Kiselkov. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + */ + +/* Portions Copyright 2010 Robert Milkowski */ + +#ifndef _SYS_DMU_H +#define _SYS_DMU_H + +/* + * This file describes the interface that the DMU provides for its + * consumers. + * + * The DMU also interacts with the SPA. That interface is described in + * dmu_spa.h. + */ + +#include <sys/zfs_context.h> +#include <sys/cred.h> +#include <sys/fs/zfs.h> +#include <sys/zio_compress.h> +#include <sys/zio_priority.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct uio; +struct xuio; +struct page; +struct vnode; +struct spa; +struct zilog; +struct zio; +struct blkptr; +struct zap_cursor; +struct dsl_dataset; +struct dsl_pool; +struct dnode; +struct drr_begin; +struct drr_end; +struct zbookmark_phys; +struct spa; +struct nvlist; +struct arc_buf; +struct zio_prop; +struct sa_handle; +struct file; + +typedef struct objset objset_t; +typedef struct dmu_tx dmu_tx_t; +typedef struct dsl_dir dsl_dir_t; +typedef struct dnode dnode_t; + +typedef enum dmu_object_byteswap { + DMU_BSWAP_UINT8, + DMU_BSWAP_UINT16, + DMU_BSWAP_UINT32, + DMU_BSWAP_UINT64, + DMU_BSWAP_ZAP, + DMU_BSWAP_DNODE, + DMU_BSWAP_OBJSET, + DMU_BSWAP_ZNODE, + DMU_BSWAP_OLDACL, + DMU_BSWAP_ACL, + /* + * Allocating a new byteswap type number makes the on-disk format + * incompatible with any other format that uses the same number. + * + * Data can usually be structured to work with one of the + * DMU_BSWAP_UINT* or DMU_BSWAP_ZAP types. 
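+ *
+ * For example, a new object holding an array of 64-bit counters does
+ * not need its own byteswap type: it can be declared with the existing
+ * DMU_OTN_UINT64_DATA type (defined below) instead.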
+ */ + DMU_BSWAP_NUMFUNCS +} dmu_object_byteswap_t; + +#define DMU_OT_NEWTYPE 0x80 +#define DMU_OT_METADATA 0x40 +#define DMU_OT_BYTESWAP_MASK 0x3f + +/* + * Defines a uint8_t object type. Object types specify if the data + * in the object is metadata (boolean) and how to byteswap the data + * (dmu_object_byteswap_t). All of the types created by this method + * are cached in the dbuf metadata cache. + */ +#define DMU_OT(byteswap, metadata) \ + (DMU_OT_NEWTYPE | \ + ((metadata) ? DMU_OT_METADATA : 0) | \ + ((byteswap) & DMU_OT_BYTESWAP_MASK)) + +#define DMU_OT_IS_VALID(ot) (((ot) & DMU_OT_NEWTYPE) ? \ + ((ot) & DMU_OT_BYTESWAP_MASK) < DMU_BSWAP_NUMFUNCS : \ + (ot) < DMU_OT_NUMTYPES) + +#define DMU_OT_IS_METADATA(ot) (((ot) & DMU_OT_NEWTYPE) ? \ + ((ot) & DMU_OT_METADATA) : \ + dmu_ot[(ot)].ot_metadata) + +#define DMU_OT_IS_METADATA_CACHED(ot) (((ot) & DMU_OT_NEWTYPE) ? \ + B_TRUE : dmu_ot[(ot)].ot_dbuf_metadata_cache) + +/* + * These object types use bp_fill != 1 for their L0 bp's. Therefore they can't + * have their data embedded (i.e. use a BP_IS_EMBEDDED() bp), because bp_fill + * is repurposed for embedded BPs. + */ +#define DMU_OT_HAS_FILL(ot) \ + ((ot) == DMU_OT_DNODE || (ot) == DMU_OT_OBJSET) + +#define DMU_OT_BYTESWAP(ot) (((ot) & DMU_OT_NEWTYPE) ? \ + ((ot) & DMU_OT_BYTESWAP_MASK) : \ + dmu_ot[(ot)].ot_byteswap) + +typedef enum dmu_object_type { + DMU_OT_NONE, + /* general: */ + DMU_OT_OBJECT_DIRECTORY, /* ZAP */ + DMU_OT_OBJECT_ARRAY, /* UINT64 */ + DMU_OT_PACKED_NVLIST, /* UINT8 (XDR by nvlist_pack/unpack) */ + DMU_OT_PACKED_NVLIST_SIZE, /* UINT64 */ + DMU_OT_BPOBJ, /* UINT64 */ + DMU_OT_BPOBJ_HDR, /* UINT64 */ + /* spa: */ + DMU_OT_SPACE_MAP_HEADER, /* UINT64 */ + DMU_OT_SPACE_MAP, /* UINT64 */ + /* zil: */ + DMU_OT_INTENT_LOG, /* UINT64 */ + /* dmu: */ + DMU_OT_DNODE, /* DNODE */ + DMU_OT_OBJSET, /* OBJSET */ + /* dsl: */ + DMU_OT_DSL_DIR, /* UINT64 */ + DMU_OT_DSL_DIR_CHILD_MAP, /* ZAP */ + DMU_OT_DSL_DS_SNAP_MAP, /* ZAP */ + DMU_OT_DSL_PROPS, /* ZAP */ + DMU_OT_DSL_DATASET, /* UINT64 */ + /* zpl: */ + DMU_OT_ZNODE, /* ZNODE */ + DMU_OT_OLDACL, /* Old ACL */ + DMU_OT_PLAIN_FILE_CONTENTS, /* UINT8 */ + DMU_OT_DIRECTORY_CONTENTS, /* ZAP */ + DMU_OT_MASTER_NODE, /* ZAP */ + DMU_OT_UNLINKED_SET, /* ZAP */ + /* zvol: */ + DMU_OT_ZVOL, /* UINT8 */ + DMU_OT_ZVOL_PROP, /* ZAP */ + /* other; for testing only! */ + DMU_OT_PLAIN_OTHER, /* UINT8 */ + DMU_OT_UINT64_OTHER, /* UINT64 */ + DMU_OT_ZAP_OTHER, /* ZAP */ + /* new object types: */ + DMU_OT_ERROR_LOG, /* ZAP */ + DMU_OT_SPA_HISTORY, /* UINT8 */ + DMU_OT_SPA_HISTORY_OFFSETS, /* spa_his_phys_t */ + DMU_OT_POOL_PROPS, /* ZAP */ + DMU_OT_DSL_PERMS, /* ZAP */ + DMU_OT_ACL, /* ACL */ + DMU_OT_SYSACL, /* SYSACL */ + DMU_OT_FUID, /* FUID table (Packed NVLIST UINT8) */ + DMU_OT_FUID_SIZE, /* FUID table size UINT64 */ + DMU_OT_NEXT_CLONES, /* ZAP */ + DMU_OT_SCAN_QUEUE, /* ZAP */ + DMU_OT_USERGROUP_USED, /* ZAP */ + DMU_OT_USERGROUP_QUOTA, /* ZAP */ + DMU_OT_USERREFS, /* ZAP */ + DMU_OT_DDT_ZAP, /* ZAP */ + DMU_OT_DDT_STATS, /* ZAP */ + DMU_OT_SA, /* System attr */ + DMU_OT_SA_MASTER_NODE, /* ZAP */ + DMU_OT_SA_ATTR_REGISTRATION, /* ZAP */ + DMU_OT_SA_ATTR_LAYOUTS, /* ZAP */ + DMU_OT_SCAN_XLATE, /* ZAP */ + DMU_OT_DEDUP, /* fake dedup BP from ddt_bp_create() */ + DMU_OT_DEADLIST, /* ZAP */ + DMU_OT_DEADLIST_HDR, /* UINT64 */ + DMU_OT_DSL_CLONES, /* ZAP */ + DMU_OT_BPOBJ_SUBOBJ, /* UINT64 */ + /* + * Do not allocate new object types here. 
Doing so makes the on-disk
+ * format incompatible with any other format that uses the same object
+ * type number.
+ *
+ * When creating an object which does not have one of the above types,
+ * use the DMU_OTN_* type with the correct byteswap and metadata
+ * values.
+ *
+ * The DMU_OTN_* types do not have entries in the dmu_ot table;
+ * use the DMU_OT_IS_METADATA() and DMU_OT_BYTESWAP() macros instead
+ * of indexing into dmu_ot directly (this works for both DMU_OT_* types
+ * and DMU_OTN_* types).
+ */
+ DMU_OT_NUMTYPES,
+
+ /*
+ * Names for valid types declared with DMU_OT().
+ */
+ DMU_OTN_UINT8_DATA = DMU_OT(DMU_BSWAP_UINT8, B_FALSE),
+ DMU_OTN_UINT8_METADATA = DMU_OT(DMU_BSWAP_UINT8, B_TRUE),
+ DMU_OTN_UINT16_DATA = DMU_OT(DMU_BSWAP_UINT16, B_FALSE),
+ DMU_OTN_UINT16_METADATA = DMU_OT(DMU_BSWAP_UINT16, B_TRUE),
+ DMU_OTN_UINT32_DATA = DMU_OT(DMU_BSWAP_UINT32, B_FALSE),
+ DMU_OTN_UINT32_METADATA = DMU_OT(DMU_BSWAP_UINT32, B_TRUE),
+ DMU_OTN_UINT64_DATA = DMU_OT(DMU_BSWAP_UINT64, B_FALSE),
+ DMU_OTN_UINT64_METADATA = DMU_OT(DMU_BSWAP_UINT64, B_TRUE),
+ DMU_OTN_ZAP_DATA = DMU_OT(DMU_BSWAP_ZAP, B_FALSE),
+ DMU_OTN_ZAP_METADATA = DMU_OT(DMU_BSWAP_ZAP, B_TRUE),
+} dmu_object_type_t;
+
+/*
+ * These flags are intended to be used to specify the "txg_how"
+ * parameter when calling the dmu_tx_assign() function. See the comment
+ * above dmu_tx_assign() for more details on the meaning of these flags.
+ */
+#define TXG_NOWAIT (0ULL)
+#define TXG_WAIT (1ULL<<0)
+#define TXG_NOTHROTTLE (1ULL<<1)
+
+void byteswap_uint64_array(void *buf, size_t size);
+void byteswap_uint32_array(void *buf, size_t size);
+void byteswap_uint16_array(void *buf, size_t size);
+void byteswap_uint8_array(void *buf, size_t size);
+void zap_byteswap(void *buf, size_t size);
+void zfs_oldacl_byteswap(void *buf, size_t size);
+void zfs_acl_byteswap(void *buf, size_t size);
+void zfs_znode_byteswap(void *buf, size_t size);
+
+#define DS_FIND_SNAPSHOTS (1<<0)
+#define DS_FIND_CHILDREN (1<<1)
+#define DS_FIND_SERIALIZE (1<<2)
+
+/*
+ * The maximum number of bytes that can be accessed as part of one
+ * operation, including metadata.
+ */
+#define DMU_MAX_ACCESS (32 * 1024 * 1024) /* 32MB */
+#define DMU_MAX_DELETEBLKCNT (20480) /* ~5MB of indirect blocks */
+
+#define DMU_USERUSED_OBJECT (-1ULL)
+#define DMU_GROUPUSED_OBJECT (-2ULL)
+
+/*
+ * artificial blkids for bonus buffer and spill blocks
+ */
+#define DMU_BONUS_BLKID (-1ULL)
+#define DMU_SPILL_BLKID (-2ULL)
+/*
+ * Public routines to create, destroy, open, and close objsets.
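+ *
+ * As an illustrative sketch (the dataset name is hypothetical and FTAG
+ * is the usual ZFS hold-tag convention), a short-lived consumer pairs
+ * hold and release:
+ *
+ *    objset_t *os;
+ *    int err;
+ *
+ *    err = dmu_objset_hold("pool/dataset", FTAG, &os);
+ *    if (err == 0) {
+ *        ... use os ...
+ *        dmu_objset_rele(os, FTAG);
+ *    }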
+ */ +int dmu_objset_hold(const char *name, void *tag, objset_t **osp); +int dmu_objset_own(const char *name, dmu_objset_type_t type, + boolean_t readonly, void *tag, objset_t **osp); +void dmu_objset_rele(objset_t *os, void *tag); +void dmu_objset_disown(objset_t *os, void *tag); +int dmu_objset_open_ds(struct dsl_dataset *ds, objset_t **osp); + +void dmu_objset_evict_dbufs(objset_t *os); +int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags, + void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg); +int dmu_get_recursive_snaps_nvl(char *fsname, const char *snapname, + struct nvlist *snaps); +int dmu_objset_clone(const char *name, const char *origin); +int dsl_destroy_snapshots_nvl(struct nvlist *snaps, boolean_t defer, + struct nvlist *errlist); +int dmu_objset_snapshot_one(const char *fsname, const char *snapname); +int dmu_objset_snapshot_tmp(const char *, const char *, int); +int dmu_objset_find(char *name, int func(const char *, void *), void *arg, + int flags); +void dmu_objset_byteswap(void *buf, size_t size); +int dsl_dataset_rename_snapshot(const char *fsname, + const char *oldsnapname, const char *newsnapname, boolean_t recursive); +int dmu_objset_remap_indirects(const char *fsname); + +typedef struct dmu_buf { + uint64_t db_object; /* object that this buffer is part of */ + uint64_t db_offset; /* byte offset in this object */ + uint64_t db_size; /* size of buffer in bytes */ + void *db_data; /* data in buffer */ +} dmu_buf_t; + +/* + * The names of zap entries in the DIRECTORY_OBJECT of the MOS. + */ +#define DMU_POOL_DIRECTORY_OBJECT 1 +#define DMU_POOL_CONFIG "config" +#define DMU_POOL_FEATURES_FOR_WRITE "features_for_write" +#define DMU_POOL_FEATURES_FOR_READ "features_for_read" +#define DMU_POOL_FEATURE_DESCRIPTIONS "feature_descriptions" +#define DMU_POOL_FEATURE_ENABLED_TXG "feature_enabled_txg" +#define DMU_POOL_ROOT_DATASET "root_dataset" +#define DMU_POOL_SYNC_BPOBJ "sync_bplist" +#define DMU_POOL_ERRLOG_SCRUB "errlog_scrub" +#define DMU_POOL_ERRLOG_LAST "errlog_last" +#define DMU_POOL_SPARES "spares" +#define DMU_POOL_DEFLATE "deflate" +#define DMU_POOL_HISTORY "history" +#define DMU_POOL_PROPS "pool_props" +#define DMU_POOL_L2CACHE "l2cache" +#define DMU_POOL_TMP_USERREFS "tmp_userrefs" +#define DMU_POOL_DDT "DDT-%s-%s-%s" +#define DMU_POOL_DDT_STATS "DDT-statistics" +#define DMU_POOL_CREATION_VERSION "creation_version" +#define DMU_POOL_SCAN "scan" +#define DMU_POOL_FREE_BPOBJ "free_bpobj" +#define DMU_POOL_BPTREE_OBJ "bptree_obj" +#define DMU_POOL_EMPTY_BPOBJ "empty_bpobj" +#define DMU_POOL_CHECKSUM_SALT "org.illumos:checksum_salt" +#define DMU_POOL_VDEV_ZAP_MAP "com.delphix:vdev_zap_map" +#define DMU_POOL_REMOVING "com.delphix:removing" +#define DMU_POOL_OBSOLETE_BPOBJ "com.delphix:obsolete_bpobj" +#define DMU_POOL_CONDENSING_INDIRECT "com.delphix:condensing_indirect" +#define DMU_POOL_ZPOOL_CHECKPOINT "com.delphix:zpool_checkpoint" + +/* + * Allocate an object from this objset. The range of object numbers + * available is (0, DN_MAX_OBJECT). Object 0 is the meta-dnode. + * + * The transaction must be assigned to a txg. The newly allocated + * object will be "held" in the transaction (ie. you can modify the + * newly allocated object in this transaction). + * + * dmu_object_alloc() chooses an object and returns it in *objectp. + * + * dmu_object_claim() allocates a specific object number. If that + * number is already allocated, it fails and returns EEXIST. 
+ *
+ * Return 0 on success, or ENOSPC or EEXIST as specified above.
+ */
+uint64_t dmu_object_alloc(objset_t *os, dmu_object_type_t ot,
+ int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx);
+uint64_t dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot, int blocksize,
+ int indirect_blockshift,
+ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
+uint64_t dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot,
+ int blocksize, dmu_object_type_t bonus_type, int bonus_len,
+ int dnodesize, dmu_tx_t *tx);
+int dmu_object_claim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
+ int blocksize, dmu_object_type_t bonus_type, int bonus_len,
+ int dnodesize, dmu_tx_t *tx);
+int dmu_object_reclaim_dnsize(objset_t *os, uint64_t object,
+ dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype,
+ int bonuslen, int dnodesize, dmu_tx_t *txp);
+int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
+ int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx);
+int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
+ int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *txp);
+
+/*
+ * Free an object from this objset.
+ *
+ * The object's data will be freed as well (ie. you don't need to call
+ * dmu_free(object, 0, -1, tx)).
+ *
+ * The object need not be held in the transaction.
+ *
+ * If there are any holds on this object's buffers (via dmu_buf_hold()),
+ * or tx holds on the object (via dmu_tx_hold_object()), you cannot
+ * free it; it fails and returns EBUSY.
+ *
+ * If the object is not allocated, it fails and returns ENOENT.
+ *
+ * Return 0 on success, or EBUSY or ENOENT as specified above.
+ */
+int dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx);
+
+/*
+ * Find the next allocated or free object.
+ *
+ * The objectp parameter is in-out. It will be updated to be the next
+ * object which is allocated. Ignore objects which have not been
+ * modified since txg.
+ *
+ * XXX Can only be called on an objset with no dirty data.
+ *
+ * Returns 0 on success, or ENOENT if there are no more objects.
+ */
+int dmu_object_next(objset_t *os, uint64_t *objectp,
+ boolean_t hole, uint64_t txg);
+
+/*
+ * Set the data blocksize for an object.
+ *
+ * The object cannot have any blocks allocated beyond the first. If
+ * the first block is allocated already, the new size must be greater
+ * than the current block size. If these conditions are not met,
+ * ENOTSUP will be returned.
+ *
+ * Returns 0 on success, or EBUSY if there are any holds on the object
+ * contents, or ENOTSUP as described above.
+ */
+int dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size,
+ int ibs, dmu_tx_t *tx);
+
+/*
+ * Set the checksum property on a dnode. The new checksum algorithm will
+ * apply to all newly written blocks; existing blocks will not be affected.
+ */
+void dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
+ dmu_tx_t *tx);
+
+/*
+ * Set the compress property on a dnode. The new compression algorithm will
+ * apply to all newly written blocks; existing blocks will not be affected.
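+ *
+ * For example (illustrative only; the ZIO_COMPRESS_* values come from
+ * the zio headers):
+ *
+ *    dmu_object_set_compress(os, object, ZIO_COMPRESS_LZ4, tx);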
+ */
+void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
+ dmu_tx_t *tx);
+
+int dmu_object_remap_indirects(objset_t *os, uint64_t object, uint64_t txg);
+
+void
+dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset,
+ void *data, uint8_t etype, uint8_t comp, int uncompressed_size,
+ int compressed_size, int byteorder, dmu_tx_t *tx);
+
+/*
+ * Decide how to write a block: checksum, compression, number of copies, etc.
+ */
+#define WP_NOFILL 0x1
+#define WP_DMU_SYNC 0x2
+#define WP_SPILL 0x4
+
+void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp,
+ struct zio_prop *zp);
+/*
+ * The bonus data is accessed more or less like a regular buffer.
+ * You must dmu_bonus_hold() to get the buffer, which will give you a
+ * dmu_buf_t with db_offset==-1ULL, and db_size = the size of the bonus
+ * data. As with any normal buffer, you must call dmu_buf_will_dirty()
+ * before modifying it, and the object must be held in an assigned
+ * transaction before calling dmu_buf_will_dirty(). You may use
+ * dmu_buf_set_user() on the bonus buffer as well. You must release
+ * your hold with dmu_buf_rele().
+ *
+ * Returns ENOENT, EIO, or 0.
+ */
+int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **);
+int dmu_bonus_max(void);
+int dmu_set_bonus(dmu_buf_t *, int, dmu_tx_t *);
+int dmu_set_bonustype(dmu_buf_t *, dmu_object_type_t, dmu_tx_t *);
+dmu_object_type_t dmu_get_bonustype(dmu_buf_t *);
+int dmu_rm_spill(objset_t *, uint64_t, dmu_tx_t *);
+
+/*
+ * Special spill buffer support used by "SA" framework
+ */
+
+int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp);
+int dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags,
+ void *tag, dmu_buf_t **dbp);
+int dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp);
+
+/*
+ * Obtain the DMU buffer from the specified object which contains the
+ * specified offset. dmu_buf_hold() puts a "hold" on the buffer, so
+ * that it will remain in memory. You must release the hold with
+ * dmu_buf_rele(). You mustn't access the dmu_buf_t after releasing your
+ * hold. You must have a hold on any dmu_buf_t* you pass to the DMU.
+ *
+ * You must call dmu_buf_read, dmu_buf_will_dirty, or dmu_buf_will_fill
+ * on the returned buffer before reading or writing the buffer's
+ * db_data. The comments for those routines describe what particular
+ * operations are valid after calling them.
+ *
+ * The object number must be a valid, allocated object number.
+ */
+int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
+ void *tag, dmu_buf_t **, int flags);
+int dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset,
+ void *tag, dmu_buf_t **dbp, int flags);
+
+/*
+ * Add a reference to a dmu buffer that has already been held via
+ * dmu_buf_hold() in the current context.
+ */
+void dmu_buf_add_ref(dmu_buf_t *db, void *tag);
+
+/*
+ * Attempt to add a reference to a dmu buffer that is in an unknown state,
+ * using a pointer that may have been invalidated by eviction processing.
+ * The request will succeed if the passed-in dbuf still represents the
+ * same os/object/blkid, is ineligible for eviction, and has at least
+ * one hold by a user other than the syncer.
+ */
+boolean_t dmu_buf_try_add_ref(dmu_buf_t *, objset_t *os, uint64_t object,
+ uint64_t blkid, void *tag);
+
+void dmu_buf_rele(dmu_buf_t *db, void *tag);
+uint64_t dmu_buf_refcount(dmu_buf_t *db);
+
+/*
+ * dmu_buf_hold_array holds the DMU buffers which contain all bytes in a
+ * range of an object.
A pointer to an array of dmu_buf_t*'s is + * returned (in *dbpp). + * + * dmu_buf_rele_array releases the hold on an array of dmu_buf_t*'s, and + * frees the array. The hold on the array of buffers MUST be released + * with dmu_buf_rele_array. You can NOT release the hold on each buffer + * individually with dmu_buf_rele. + */ +int dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset, + uint64_t length, boolean_t read, void *tag, + int *numbufsp, dmu_buf_t ***dbpp); +int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, + boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, + uint32_t flags); +void dmu_buf_rele_array(dmu_buf_t **, int numbufs, void *tag); + +typedef void dmu_buf_evict_func_t(void *user_ptr); + +/* + * A DMU buffer user object may be associated with a dbuf for the + * duration of its lifetime. This allows the user of a dbuf (client) + * to attach private data to a dbuf (e.g. in-core only data such as a + * dnode_children_t, zap_t, or zap_leaf_t) and be optionally notified + * when that dbuf has been evicted. Clients typically respond to the + * eviction notification by freeing their private data, thus ensuring + * the same lifetime for both dbuf and private data. + * + * The mapping from a dmu_buf_user_t to any client private data is the + * client's responsibility. All current consumers of the API with private + * data embed a dmu_buf_user_t as the first member of the structure for + * their private data. This allows conversions between the two types + * with a simple cast. Since the DMU buf user API never needs access + * to the private data, other strategies can be employed if necessary + * or convenient for the client (e.g. using container_of() to do the + * conversion for private data that cannot have the dmu_buf_user_t as + * its first member). + * + * Eviction callbacks are executed without the dbuf mutex held or any + * other type of mechanism to guarantee that the dbuf is still available. + * For this reason, users must assume the dbuf has already been freed + * and not reference the dbuf from the callback context. + * + * Users requesting "immediate eviction" are notified as soon as the dbuf + * is only referenced by dirty records (dirties == holds). Otherwise the + * notification occurs after eviction processing for the dbuf begins. + */ +typedef struct dmu_buf_user { + /* + * Asynchronous user eviction callback state. + */ + taskq_ent_t dbu_tqent; + + /* + * This instance's eviction function pointers. + * + * dbu_evict_func_sync is called synchronously and then + * dbu_evict_func_async is executed asynchronously on a taskq. + */ + dmu_buf_evict_func_t *dbu_evict_func_sync; + dmu_buf_evict_func_t *dbu_evict_func_async; +#ifdef ZFS_DEBUG + /* + * Pointer to user's dbuf pointer. NULL for clients that do + * not associate a dbuf with their user data. + * + * The dbuf pointer is cleared upon eviction so as to catch + * use-after-evict bugs in clients. + */ + dmu_buf_t **dbu_clear_on_evict_dbufp; +#endif +} dmu_buf_user_t; + +/* + * Initialize the given dmu_buf_user_t instance with the eviction function + * evict_func, to be called when the user is evicted. + * + * NOTE: This function should only be called once on a given dmu_buf_user_t. + * To allow enforcement of this, dbu must already be zeroed on entry. 
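+ *
+ * Illustrative sketch of the embedding pattern described above (the
+ * "my_user" and "my_evict_sync" names are hypothetical, not part of
+ * this API):
+ *
+ *    typedef struct my_user {
+ *        dmu_buf_user_t mu_dbu;    (first member, for easy casts)
+ *        void *mu_private;
+ *    } my_user_t;
+ *
+ *    my_user_t *mu = kmem_zalloc(sizeof (my_user_t), KM_SLEEP);
+ *    dmu_buf_init_user(&mu->mu_dbu, my_evict_sync, NULL, NULL);
+ *    if (dmu_buf_set_user(db, &mu->mu_dbu) != NULL) {
+ *        (lost the race to an existing user; free ours)
+ *        kmem_free(mu, sizeof (my_user_t));
+ *    }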
+ */ +/*ARGSUSED*/ +inline void +dmu_buf_init_user(dmu_buf_user_t *dbu, dmu_buf_evict_func_t *evict_func_sync, + dmu_buf_evict_func_t *evict_func_async, dmu_buf_t **clear_on_evict_dbufp) +{ + ASSERT(dbu->dbu_evict_func_sync == NULL); + ASSERT(dbu->dbu_evict_func_async == NULL); + + /* must have at least one evict func */ + IMPLY(evict_func_sync == NULL, evict_func_async != NULL); + dbu->dbu_evict_func_sync = evict_func_sync; + dbu->dbu_evict_func_async = evict_func_async; +#ifdef ZFS_DEBUG + dbu->dbu_clear_on_evict_dbufp = clear_on_evict_dbufp; +#endif +} + +/* + * Attach user data to a dbuf and mark it for normal (when the dbuf's + * data is cleared or its reference count goes to zero) eviction processing. + * + * Returns NULL on success, or the existing user if another user currently + * owns the buffer. + */ +void *dmu_buf_set_user(dmu_buf_t *db, dmu_buf_user_t *user); + +/* + * Attach user data to a dbuf and mark it for immediate (its dirty and + * reference counts are equal) eviction processing. + * + * Returns NULL on success, or the existing user if another user currently + * owns the buffer. + */ +void *dmu_buf_set_user_ie(dmu_buf_t *db, dmu_buf_user_t *user); + +/* + * Replace the current user of a dbuf. + * + * If given the current user of a dbuf, replaces the dbuf's user with + * "new_user" and returns the user data pointer that was replaced. + * Otherwise returns the current, and unmodified, dbuf user pointer. + */ +void *dmu_buf_replace_user(dmu_buf_t *db, + dmu_buf_user_t *old_user, dmu_buf_user_t *new_user); + +/* + * Remove the specified user data for a DMU buffer. + * + * Returns the user that was removed on success, or the current user if + * another user currently owns the buffer. + */ +void *dmu_buf_remove_user(dmu_buf_t *db, dmu_buf_user_t *user); + +/* + * Returns the user data (dmu_buf_user_t *) associated with this dbuf. + */ +void *dmu_buf_get_user(dmu_buf_t *db); + +objset_t *dmu_buf_get_objset(dmu_buf_t *db); +dnode_t *dmu_buf_dnode_enter(dmu_buf_t *db); +void dmu_buf_dnode_exit(dmu_buf_t *db); + +/* Block until any in-progress dmu buf user evictions complete. */ +void dmu_buf_user_evict_wait(void); + +/* + * Returns the blkptr associated with this dbuf, or NULL if not set. + */ +struct blkptr *dmu_buf_get_blkptr(dmu_buf_t *db); + +/* + * Indicate that you are going to modify the buffer's data (db_data). + * + * The transaction (tx) must be assigned to a txg (ie. you've called + * dmu_tx_assign()). The buffer's object must be held in the tx + * (ie. you've called dmu_tx_hold_object(tx, db->db_object)). + */ +void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx); + +/* + * You must create a transaction, then hold the objects which you will + * (or might) modify as part of this transaction. Then you must assign + * the transaction to a transaction group. Once the transaction has + * been assigned, you can modify buffers which belong to held objects as + * part of this transaction. You can't modify buffers before the + * transaction has been assigned; you can't modify buffers which don't + * belong to objects which this transaction holds; you can't hold + * objects once the transaction has been assigned. You may hold an + * object which you are going to free (with dmu_object_free()), but you + * don't have to. + * + * You can abort the transaction before it has been assigned. + * + * Note that you may hold buffers (with dmu_buf_hold) at any time, + * regardless of transaction state. 
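+ *
+ * Illustrative sketch of that lifecycle (error handling abbreviated;
+ * TXG_WAIT and all functions shown are declared in this header):
+ *
+ *    dmu_tx_t *tx;
+ *    int err;
+ *
+ *    tx = dmu_tx_create(os);
+ *    dmu_tx_hold_write(tx, object, offset, size);
+ *    err = dmu_tx_assign(tx, TXG_WAIT);
+ *    if (err != 0) {
+ *        dmu_tx_abort(tx);
+ *        return (err);
+ *    }
+ *    dmu_write(os, object, offset, size, buf, tx);
+ *    dmu_tx_commit(tx);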
+ */ + +#define DMU_NEW_OBJECT (-1ULL) +#define DMU_OBJECT_END (-1ULL) + +dmu_tx_t *dmu_tx_create(objset_t *os); +void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len); +void dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, + int len); +void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, + uint64_t len); +void dmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, + uint64_t len); +void dmu_tx_hold_remap_l1indirect(dmu_tx_t *tx, uint64_t object); +void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name); +void dmu_tx_hold_zap_by_dnode(dmu_tx_t *tx, dnode_t *dn, int add, + const char *name); +void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object); +void dmu_tx_hold_bonus_by_dnode(dmu_tx_t *tx, dnode_t *dn); +void dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object); +void dmu_tx_hold_sa(dmu_tx_t *tx, struct sa_handle *hdl, boolean_t may_grow); +void dmu_tx_hold_sa_create(dmu_tx_t *tx, int total_size); +void dmu_tx_abort(dmu_tx_t *tx); +int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how); +void dmu_tx_wait(dmu_tx_t *tx); +void dmu_tx_commit(dmu_tx_t *tx); +void dmu_tx_mark_netfree(dmu_tx_t *tx); + +/* + * To register a commit callback, dmu_tx_callback_register() must be called. + * + * dcb_data is a pointer to caller private data that is passed on as a + * callback parameter. The caller is responsible for properly allocating and + * freeing it. + * + * When registering a callback, the transaction must be already created, but + * it cannot be committed or aborted. It can be assigned to a txg or not. + * + * The callback will be called after the transaction has been safely written + * to stable storage and will also be called if the dmu_tx is aborted. + * If there is any error which prevents the transaction from being committed to + * disk, the callback will be called with a value of error != 0. + */ +typedef void dmu_tx_callback_func_t(void *dcb_data, int error); + +void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *dcb_func, + void *dcb_data); + +/* + * Free up the data blocks for a defined range of a file. If size is + * -1, the range from offset to end-of-file is freed. + */ +int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, + uint64_t size, dmu_tx_t *tx); +int dmu_free_long_range(objset_t *os, uint64_t object, uint64_t offset, + uint64_t size); +int dmu_free_long_object(objset_t *os, uint64_t object); + +/* + * Convenience functions. + * + * Canfail routines will return 0 on success, or an errno if there is a + * nonrecoverable I/O error. 
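+ *
+ * For example (illustrative), a read into a caller-supplied buffer:
+ *
+ *    err = dmu_read(os, object, offset, size, buf, DMU_READ_PREFETCH);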
+ */ +#define DMU_READ_PREFETCH 0 /* prefetch */ +#define DMU_READ_NO_PREFETCH 1 /* don't prefetch */ +int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, + void *buf, uint32_t flags); +int dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf, + uint32_t flags); +void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, + const void *buf, dmu_tx_t *tx); +void dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, + const void *buf, dmu_tx_t *tx); +void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, + dmu_tx_t *tx); +int dmu_read_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size); +int dmu_read_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size); +int dmu_read_uio_dnode(dnode_t *dn, struct uio *uio, uint64_t size); +int dmu_write_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size, + dmu_tx_t *tx); +int dmu_write_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size, + dmu_tx_t *tx); +int dmu_write_uio_dnode(dnode_t *dn, struct uio *uio, uint64_t size, + dmu_tx_t *tx); +#ifdef _KERNEL +#ifdef illumos +int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, + uint64_t size, struct page *pp, dmu_tx_t *tx); +#else +int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, + uint64_t size, struct vm_page **ppa, dmu_tx_t *tx); +int dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count, + int *rbehind, int *rahead, int last_size); +#endif +#endif +struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size); +void dmu_return_arcbuf(struct arc_buf *buf); +void dmu_assign_arcbuf_dnode(dnode_t *handle, uint64_t offset, + struct arc_buf *buf, dmu_tx_t *tx); +void dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, struct arc_buf *buf, + dmu_tx_t *tx); +int dmu_xuio_init(struct xuio *uio, int niov); +void dmu_xuio_fini(struct xuio *uio); +int dmu_xuio_add(struct xuio *uio, struct arc_buf *abuf, offset_t off, + size_t n); +int dmu_xuio_cnt(struct xuio *uio); +struct arc_buf *dmu_xuio_arcbuf(struct xuio *uio, int i); +void dmu_xuio_clear(struct xuio *uio, int i); +void xuio_stat_wbuf_copied(void); +void xuio_stat_wbuf_nocopy(void); + +extern boolean_t zfs_prefetch_disable; +extern int zfs_max_recordsize; + +/* + * Asynchronously try to read in the data. + */ +void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, + uint64_t len, enum zio_priority pri); + +typedef struct dmu_object_info { + /* All sizes are in bytes unless otherwise indicated. 
*/ + uint32_t doi_data_block_size; + uint32_t doi_metadata_block_size; + dmu_object_type_t doi_type; + dmu_object_type_t doi_bonus_type; + uint64_t doi_bonus_size; + uint8_t doi_indirection; /* 2 = dnode->indirect->data */ + uint8_t doi_checksum; + uint8_t doi_compress; + uint8_t doi_nblkptr; + uint8_t doi_pad[4]; + uint64_t doi_dnodesize; + uint64_t doi_physical_blocks_512; /* data + metadata, 512b blks */ + uint64_t doi_max_offset; + uint64_t doi_fill_count; /* number of non-empty blocks */ +} dmu_object_info_t; + +typedef void arc_byteswap_func_t(void *buf, size_t size); + +typedef struct dmu_object_type_info { + dmu_object_byteswap_t ot_byteswap; + boolean_t ot_metadata; + boolean_t ot_dbuf_metadata_cache; + char *ot_name; +} dmu_object_type_info_t; + +typedef struct dmu_object_byteswap_info { + arc_byteswap_func_t *ob_func; + char *ob_name; +} dmu_object_byteswap_info_t; + +extern const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES]; +extern const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS]; + +/* + * Get information on a DMU object. + * + * Return 0 on success or ENOENT if object is not allocated. + * + * If doi is NULL, just indicates whether the object exists. + */ +int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi); +void __dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi); +/* Like dmu_object_info, but faster if you have a held dnode in hand. */ +void dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi); +/* Like dmu_object_info, but faster if you have a held dbuf in hand. */ +void dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi); +/* + * Like dmu_object_info_from_db, but faster still when you only care about + * the size. This is specifically optimized for zfs_getattr(). + */ +void dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize, + u_longlong_t *nblk512); + +void dmu_object_dnsize_from_db(dmu_buf_t *db, int *dnsize); + +typedef struct dmu_objset_stats { + uint64_t dds_num_clones; /* number of clones of this */ + uint64_t dds_creation_txg; + uint64_t dds_guid; + dmu_objset_type_t dds_type; + uint8_t dds_is_snapshot; + uint8_t dds_inconsistent; + char dds_origin[ZFS_MAX_DATASET_NAME_LEN]; +} dmu_objset_stats_t; + +/* + * Get stats on a dataset. + */ +void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat); + +/* + * Add entries to the nvlist for all the objset's properties. See + * zfs_prop_table[] and zfs(1m) for details on the properties. + */ +void dmu_objset_stats(objset_t *os, struct nvlist *nv); + +/* + * Get the space usage statistics for statvfs(). + * + * refdbytes is the amount of space "referenced" by this objset. + * availbytes is the amount of space available to this objset, taking + * into account quotas & reservations, assuming that no other objsets + * use the space first. These values correspond to the 'referenced' and + * 'available' properties, described in the zfs(1m) manpage. + * + * usedobjs and availobjs are the number of objects currently allocated, + * and available. + */ +void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, + uint64_t *usedobjsp, uint64_t *availobjsp); + +/* + * The fsid_guid is a 56-bit ID that can change to avoid collisions. + * (Contrast with the ds_guid which is a 64-bit ID that will never + * change, so there is a small probability that it will collide.) 
+ */ +uint64_t dmu_objset_fsid_guid(objset_t *os); + +/* + * Get the [cm]time for an objset's snapshot dir + */ +timestruc_t dmu_objset_snap_cmtime(objset_t *os); + +int dmu_objset_is_snapshot(objset_t *os); + +extern struct spa *dmu_objset_spa(objset_t *os); +extern struct zilog *dmu_objset_zil(objset_t *os); +extern struct dsl_pool *dmu_objset_pool(objset_t *os); +extern struct dsl_dataset *dmu_objset_ds(objset_t *os); +extern void dmu_objset_name(objset_t *os, char *buf); +extern dmu_objset_type_t dmu_objset_type(objset_t *os); +extern uint64_t dmu_objset_id(objset_t *os); +extern uint64_t dmu_objset_dnodesize(objset_t *os); +extern zfs_sync_type_t dmu_objset_syncprop(objset_t *os); +extern zfs_logbias_op_t dmu_objset_logbias(objset_t *os); +extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name, + uint64_t *id, uint64_t *offp, boolean_t *case_conflict); +extern int dmu_snapshot_realname(objset_t *os, char *name, char *real, + int maxlen, boolean_t *conflict); +extern int dmu_dir_list_next(objset_t *os, int namelen, char *name, + uint64_t *idp, uint64_t *offp); + +typedef int objset_used_cb_t(dmu_object_type_t bonustype, + void *bonus, uint64_t *userp, uint64_t *groupp); +extern void dmu_objset_register_type(dmu_objset_type_t ost, + objset_used_cb_t *cb); +extern void dmu_objset_set_user(objset_t *os, void *user_ptr); +extern void *dmu_objset_get_user(objset_t *os); + +/* + * Return the txg number for the given assigned transaction. + */ +uint64_t dmu_tx_get_txg(dmu_tx_t *tx); + +/* + * Synchronous write. + * If a parent zio is provided this function initiates a write on the + * provided buffer as a child of the parent zio. + * In the absence of a parent zio, the write is completed synchronously. + * At write completion, blk is filled with the bp of the written block. + * Note that while the data covered by this function will be on stable + * storage when the write completes this new data does not become a + * permanent part of the file until the associated transaction commits. + */ + +/* + * {zfs,zvol,ztest}_get_done() args + */ +typedef struct zgd { + struct lwb *zgd_lwb; + struct blkptr *zgd_bp; + dmu_buf_t *zgd_db; + struct rl *zgd_rl; + void *zgd_private; +} zgd_t; + +typedef void dmu_sync_cb_t(zgd_t *arg, int error); +int dmu_sync(struct zio *zio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd); + +/* + * Find the next hole or data block in file starting at *off + * Return found offset in *off. Return ESRCH for end of file. + */ +int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, + uint64_t *off); + +/* + * Check if a DMU object has any dirty blocks. If so, sync out + * all pending transaction groups. Otherwise, this function + * does not alter DMU state. This could be improved to only sync + * out the necessary transaction groups for this particular + * object. + */ +int dmu_object_wait_synced(objset_t *os, uint64_t object); + +/* + * Initial setup and final teardown. 
+ */ +extern void dmu_init(void); +extern void dmu_fini(void); + +typedef void (*dmu_traverse_cb_t)(objset_t *os, void *arg, struct blkptr *bp, + uint64_t object, uint64_t offset, int len); +void dmu_traverse_objset(objset_t *os, uint64_t txg_start, + dmu_traverse_cb_t cb, void *arg); +int dmu_diff(const char *tosnap_name, const char *fromsnap_name, + struct file *fp, offset_t *offp); + +/* CRC64 table */ +#define ZFS_CRC64_POLY 0xC96C5795D7870F42ULL /* ECMA-182, reflected form */ +extern uint64_t zfs_crc64_table[256]; + +extern int zfs_mdcomp_disable; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DMU_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h new file mode 100644 index 000000000000..44bd6a351772 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h @@ -0,0 +1,315 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. + * Copyright (c) 2012, Martin Matuska <mm@FreeBSD.org>. All rights reserved. + * Copyright (c) 2013, 2015 by Delphix. All rights reserved. + */ + +#ifndef _SYS_DMU_IMPL_H +#define _SYS_DMU_IMPL_H + +#include <sys/txg_impl.h> +#include <sys/zio.h> +#include <sys/dnode.h> +#include <sys/kstat.h> +#include <sys/zfs_context.h> +#include <sys/zfs_ioctl.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * This is the locking strategy for the DMU. Numbers in parenthesis are + * cases that use that lock order, referenced below: + * + * ARC is self-contained + * bplist is self-contained + * refcount is self-contained + * txg is self-contained (hopefully!) + * zst_lock + * zf_rwlock + * + * XXX try to improve evicting path? + * + * dp_config_rwlock > os_obj_lock > dn_struct_rwlock > + * dn_dbufs_mtx > hash_mutexes > db_mtx > dd_lock > leafs + * + * dp_config_rwlock + * must be held before: everything + * protects dd namespace changes + * protects property changes globally + * held from: + * dsl_dir_open/r: + * dsl_dir_create_sync/w: + * dsl_dir_sync_destroy/w: + * dsl_dir_rename_sync/w: + * dsl_prop_changed_notify/r: + * + * os_obj_lock + * must be held before: + * everything except dp_config_rwlock + * protects os_obj_next + * held from: + * dmu_object_alloc: dn_dbufs_mtx, db_mtx, hash_mutexes, dn_struct_rwlock + * + * dn_struct_rwlock + * must be held before: + * everything except dp_config_rwlock and os_obj_lock + * protects structure of dnode (eg. 
nlevels) + * db_blkptr can change when syncing out change to nlevels + * dn_maxblkid + * dn_nlevels + * dn_*blksz* + * phys nlevels, maxblkid, physical blkptr_t's (?) + * held from: + * callers of dbuf_read_impl, dbuf_hold[_impl], dbuf_prefetch + * dmu_object_info_from_dnode: dn_dirty_mtx (dn_datablksz) + * dbuf_read_impl: db_mtx, dmu_zfetch() + * dmu_zfetch: zf_rwlock/r, zst_lock, dbuf_prefetch() + * dbuf_new_size: db_mtx + * dbuf_dirty: db_mtx + * dbuf_findbp: (callers, phys? - the real need) + * dbuf_create: dn_dbufs_mtx, hash_mutexes, db_mtx (phys?) + * dbuf_prefetch: dn_dirty_mtx, hash_mutexes, db_mtx, dn_dbufs_mtx + * dbuf_hold_impl: hash_mutexes, db_mtx, dn_dbufs_mtx, dbuf_findbp() + * dnode_sync/w (increase_indirection): db_mtx (phys) + * dnode_set_blksz/w: dn_dbufs_mtx (dn_*blksz*) + * dnode_new_blkid/w: (dn_maxblkid) + * dnode_free_range/w: dn_dirty_mtx (dn_maxblkid) + * dnode_next_offset: (phys) + * + * dn_dbufs_mtx + * must be held before: + * db_mtx, hash_mutexes + * protects: + * dn_dbufs + * dn_evicted + * held from: + * dmu_evict_user: db_mtx (dn_dbufs) + * dbuf_free_range: db_mtx (dn_dbufs) + * dbuf_remove_ref: db_mtx, callees: + * dbuf_hash_remove: hash_mutexes, db_mtx + * dbuf_create: hash_mutexes, db_mtx (dn_dbufs) + * dnode_set_blksz: (dn_dbufs) + * + * hash_mutexes (global) + * must be held before: + * db_mtx + * protects dbuf_hash_table (global) and db_hash_next + * held from: + * dbuf_find: db_mtx + * dbuf_hash_insert: db_mtx + * dbuf_hash_remove: db_mtx + * + * db_mtx (meta-leaf) + * must be held before: + * dn_mtx, dn_dirty_mtx, dd_lock (leaf mutexes) + * protects: + * db_state + * db_holds + * db_buf + * db_changed + * db_data_pending + * db_dirtied + * db_link + * db_dirty_node (??) + * db_dirtycnt + * db_d.* + * db.* + * held from: + * dbuf_dirty: dn_mtx, dn_dirty_mtx + * dbuf_dirty->dsl_dir_willuse_space: dd_lock + * dbuf_dirty->dbuf_new_block->dsl_dataset_block_freeable: dd_lock + * dbuf_undirty: dn_dirty_mtx (db_d) + * dbuf_write_done: dn_dirty_mtx (db_state) + * dbuf_* + * dmu_buf_update_user: none (db_d) + * dmu_evict_user: none (db_d) (maybe can eliminate) + * dbuf_find: none (db_holds) + * dbuf_hash_insert: none (db_holds) + * dmu_buf_read_array_impl: none (db_state, db_changed) + * dmu_sync: none (db_dirty_node, db_d) + * dnode_reallocate: none (db) + * + * dn_mtx (leaf) + * protects: + * dn_dirty_dbufs + * dn_ranges + * phys accounting + * dn_allocated_txg + * dn_free_txg + * dn_assigned_txg + * dd_assigned_tx + * dn_notxholds + * dn_dirtyctx + * dn_dirtyctx_firstset + * (dn_phys copy fields?) + * (dn_phys contents?) + * held from: + * dnode_* + * dbuf_dirty: none + * dbuf_sync: none (phys accounting) + * dbuf_undirty: none (dn_ranges, dn_dirty_dbufs) + * dbuf_write_done: none (phys accounting) + * dmu_object_info_from_dnode: none (accounting) + * dmu_tx_commit: none + * dmu_tx_hold_object_impl: none + * dmu_tx_try_assign: dn_notxholds(cv) + * dmu_tx_unassign: none + * + * dd_lock + * must be held before: + * ds_lock + * ancestors' dd_lock + * protects: + * dd_prop_cbs + * dd_sync_* + * dd_used_bytes + * dd_tempreserved + * dd_space_towrite + * dd_myname + * dd_phys accounting? 
+ * held from: + * dsl_dir_* + * dsl_prop_changed_notify: none (dd_prop_cbs) + * dsl_prop_register: none (dd_prop_cbs) + * dsl_prop_unregister: none (dd_prop_cbs) + * + * os_lock (leaf) + * protects: + * os_dirty_dnodes + * os_free_dnodes + * os_dnodes + * os_downgraded_dbufs + * dn_dirtyblksz + * dn_dirty_link + * held from: + * dnode_create: none (os_dnodes) + * dnode_destroy: none (os_dnodes) + * dnode_setdirty: none (dn_dirtyblksz, os_*_dnodes) + * dnode_free: none (dn_dirtyblksz, os_*_dnodes) + * + * ds_lock + * protects: + * ds_objset + * ds_open_refcount + * ds_snapname + * ds_phys accounting + * ds_phys userrefs zapobj + * ds_reserved + * held from: + * dsl_dataset_* + * + * dr_mtx (leaf) + * protects: + * dr_children + * held from: + * dbuf_dirty + * dbuf_undirty + * dbuf_sync_indirect + * dnode_new_blkid + */ + +struct objset; +struct dmu_pool; + +typedef struct dmu_xuio { + int next; + int cnt; + struct arc_buf **bufs; + iovec_t *iovp; +} dmu_xuio_t; + +typedef struct xuio_stats { + /* loaned yet not returned arc_buf */ + kstat_named_t xuiostat_onloan_rbuf; + kstat_named_t xuiostat_onloan_wbuf; + /* whether a copy is made when loaning out a read buffer */ + kstat_named_t xuiostat_rbuf_copied; + kstat_named_t xuiostat_rbuf_nocopy; + /* whether a copy is made when assigning a write buffer */ + kstat_named_t xuiostat_wbuf_copied; + kstat_named_t xuiostat_wbuf_nocopy; +} xuio_stats_t; + +static xuio_stats_t xuio_stats = { + { "onloan_read_buf", KSTAT_DATA_UINT64 }, + { "onloan_write_buf", KSTAT_DATA_UINT64 }, + { "read_buf_copied", KSTAT_DATA_UINT64 }, + { "read_buf_nocopy", KSTAT_DATA_UINT64 }, + { "write_buf_copied", KSTAT_DATA_UINT64 }, + { "write_buf_nocopy", KSTAT_DATA_UINT64 } +}; + +#define XUIOSTAT_INCR(stat, val) \ + atomic_add_64(&xuio_stats.stat.value.ui64, (val)) +#define XUIOSTAT_BUMP(stat) XUIOSTAT_INCR(stat, 1) + +/* + * The list of data whose inclusion in a send stream can be pending from + * one call to backup_cb to another. Multiple calls to dump_free() and + * dump_freeobjects() can be aggregated into a single DRR_FREE or + * DRR_FREEOBJECTS replay record. + */ +typedef enum { + PENDING_NONE, + PENDING_FREE, + PENDING_FREEOBJECTS +} dmu_pendop_t; + +typedef struct dmu_sendarg { + list_node_t dsa_link; + dmu_replay_record_t *dsa_drr; + kthread_t *dsa_td; + struct file *dsa_fp; + int dsa_outfd; + struct proc *dsa_proc; + offset_t *dsa_off; + objset_t *dsa_os; + zio_cksum_t dsa_zc; + uint64_t dsa_toguid; + int dsa_err; + dmu_pendop_t dsa_pending_op; + uint64_t dsa_featureflags; + uint64_t dsa_last_data_object; + uint64_t dsa_last_data_offset; + uint64_t dsa_resume_object; + uint64_t dsa_resume_offset; + boolean_t dsa_sent_begin; + boolean_t dsa_sent_end; +} dmu_sendarg_t; + +void dmu_object_zapify(objset_t *, uint64_t, dmu_object_type_t, dmu_tx_t *); +void dmu_object_free_zapified(objset_t *, uint64_t, dmu_tx_t *); +int dmu_buf_hold_noread(objset_t *, uint64_t, uint64_t, + void *, dmu_buf_t **); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DMU_IMPL_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h new file mode 100644 index 000000000000..f692dae90fe1 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h @@ -0,0 +1,212 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). 
+ * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + */ + +/* Portions Copyright 2010 Robert Milkowski */ + +#ifndef _SYS_DMU_OBJSET_H +#define _SYS_DMU_OBJSET_H + +#include <sys/spa.h> +#include <sys/arc.h> +#include <sys/txg.h> +#include <sys/zfs_context.h> +#include <sys/dnode.h> +#include <sys/zio.h> +#include <sys/zil.h> +#include <sys/sa.h> +#include <sys/zfs_ioctl.h> + +#ifdef __cplusplus +extern "C" { +#endif + +extern krwlock_t os_lock; + +struct dsl_pool; +struct dsl_dataset; +struct dmu_tx; + +#define OBJSET_PHYS_SIZE 2048 +#define OBJSET_OLD_PHYS_SIZE 1024 + +#define OBJSET_BUF_HAS_USERUSED(buf) \ + (arc_buf_size(buf) > OBJSET_OLD_PHYS_SIZE) + +#define OBJSET_FLAG_USERACCOUNTING_COMPLETE (1ULL<<0) + +typedef struct objset_phys { + dnode_phys_t os_meta_dnode; + zil_header_t os_zil_header; + uint64_t os_type; + uint64_t os_flags; + char os_pad[OBJSET_PHYS_SIZE - sizeof (dnode_phys_t)*3 - + sizeof (zil_header_t) - sizeof (uint64_t)*2]; + dnode_phys_t os_userused_dnode; + dnode_phys_t os_groupused_dnode; +} objset_phys_t; + +#define OBJSET_PROP_UNINITIALIZED ((uint64_t)-1) +struct objset { + /* Immutable: */ + struct dsl_dataset *os_dsl_dataset; + spa_t *os_spa; + arc_buf_t *os_phys_buf; + objset_phys_t *os_phys; + /* + * The following "special" dnodes have no parent, are exempt + * from dnode_move(), and are not recorded in os_dnodes, but they + * root their descendents in this objset using handles anyway, so + * that all access to dnodes from dbufs consistently uses handles. + */ + dnode_handle_t os_meta_dnode; + dnode_handle_t os_userused_dnode; + dnode_handle_t os_groupused_dnode; + zilog_t *os_zil; + + list_node_t os_evicting_node; + + /* can change, under dsl_dir's locks: */ + uint64_t os_dnodesize; /* default dnode size for new objects */ + enum zio_checksum os_checksum; + enum zio_compress os_compress; + uint8_t os_copies; + enum zio_checksum os_dedup_checksum; + boolean_t os_dedup_verify; + zfs_logbias_op_t os_logbias; + zfs_cache_type_t os_primary_cache; + zfs_cache_type_t os_secondary_cache; + zfs_sync_type_t os_sync; + zfs_redundant_metadata_type_t os_redundant_metadata; + int os_recordsize; + /* + * The next four values are used as a cache of whatever's on disk, and + * are initialized the first time these properties are queried. Before + * being initialized with their real values, their values are + * OBJSET_PROP_UNINITIALIZED. 
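+ *
+ * An illustrative read path for one of them (a sketch of the pattern
+ * only; the real lookup helpers live elsewhere and may differ):
+ *
+ *	if (os->os_version == OBJSET_PROP_UNINITIALIZED)
+ *		os->os_version = ... value fetched from disk ...;
+ *	return (os->os_version);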
+ */ + uint64_t os_version; + uint64_t os_normalization; + uint64_t os_utf8only; + uint64_t os_casesensitivity; + + /* + * Pointer is constant; the blkptr it points to is protected by + * os_dsl_dataset->ds_bp_rwlock + */ + blkptr_t *os_rootbp; + + /* no lock needed: */ + struct dmu_tx *os_synctx; /* XXX sketchy */ + zil_header_t os_zil_header; + multilist_t *os_synced_dnodes; + uint64_t os_flags; + uint64_t os_freed_dnodes; + boolean_t os_rescan_dnodes; + + /* Protected by os_obj_lock */ + kmutex_t os_obj_lock; + uint64_t os_obj_next; + + /* Protected by os_lock */ + kmutex_t os_lock; + multilist_t *os_dirty_dnodes[TXG_SIZE]; + list_t os_dnodes; + list_t os_downgraded_dbufs; + + /* Protects changes to DMU_{USER,GROUP}USED_OBJECT */ + kmutex_t os_userused_lock; + + /* stuff we store for the user */ + kmutex_t os_user_ptr_lock; + void *os_user_ptr; + sa_os_t *os_sa; +}; + +#define DMU_META_OBJSET 0 +#define DMU_META_DNODE_OBJECT 0 +#define DMU_OBJECT_IS_SPECIAL(obj) ((int64_t)(obj) <= 0) +#define DMU_META_DNODE(os) ((os)->os_meta_dnode.dnh_dnode) +#define DMU_USERUSED_DNODE(os) ((os)->os_userused_dnode.dnh_dnode) +#define DMU_GROUPUSED_DNODE(os) ((os)->os_groupused_dnode.dnh_dnode) + +#define DMU_OS_IS_L2CACHEABLE(os) \ + ((os)->os_secondary_cache == ZFS_CACHE_ALL || \ + (os)->os_secondary_cache == ZFS_CACHE_METADATA) + +#define DMU_OS_IS_L2COMPRESSIBLE(os) (zfs_mdcomp_disable == B_FALSE) + +/* called from zpl */ +int dmu_objset_hold(const char *name, void *tag, objset_t **osp); +int dmu_objset_own(const char *name, dmu_objset_type_t type, + boolean_t readonly, void *tag, objset_t **osp); +int dmu_objset_own_obj(struct dsl_pool *dp, uint64_t obj, + dmu_objset_type_t type, boolean_t readonly, void *tag, objset_t **osp); +void dmu_objset_refresh_ownership(struct dsl_dataset *ds, + struct dsl_dataset **newds, void *tag); +void dmu_objset_rele(objset_t *os, void *tag); +void dmu_objset_disown(objset_t *os, void *tag); +int dmu_objset_from_ds(struct dsl_dataset *ds, objset_t **osp); + +void dmu_objset_stats(objset_t *os, nvlist_t *nv); +void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat); +void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, + uint64_t *usedobjsp, uint64_t *availobjsp); +uint64_t dmu_objset_fsid_guid(objset_t *os); +int dmu_objset_find_dp(struct dsl_pool *dp, uint64_t ddobj, + int func(struct dsl_pool *, struct dsl_dataset *, void *), + void *arg, int flags); +int dmu_objset_prefetch(const char *name, void *arg); +void dmu_objset_evict_dbufs(objset_t *os); +timestruc_t dmu_objset_snap_cmtime(objset_t *os); + +/* called from dsl */ +void dmu_objset_sync(objset_t *os, zio_t *zio, dmu_tx_t *tx); +boolean_t dmu_objset_is_dirty(objset_t *os, uint64_t txg); +objset_t *dmu_objset_create_impl(spa_t *spa, struct dsl_dataset *ds, + blkptr_t *bp, dmu_objset_type_t type, dmu_tx_t *tx); +int dmu_objset_open_impl(spa_t *spa, struct dsl_dataset *ds, blkptr_t *bp, + objset_t **osp); +void dmu_objset_evict(objset_t *os); +void dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx); +void dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx); +boolean_t dmu_objset_userused_enabled(objset_t *os); +int dmu_objset_userspace_upgrade(objset_t *os); +boolean_t dmu_objset_userspace_present(objset_t *os); +int dmu_fsname(const char *snapname, char *buf); + +void dmu_objset_evict_done(objset_t *os); +void dmu_objset_willuse_space(objset_t *os, int64_t space, dmu_tx_t *tx); + +void dmu_objset_init(void); +void dmu_objset_fini(void); + 
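+/*
+ * Illustrative hold discipline for the interfaces above (a sketch, not
+ * part of the original header): holds take an arbitrary tag pointer so
+ * that a leaked hold can be attributed to its owner; FTAG (the current
+ * function name) is the conventional tag. "tank/fs" is a hypothetical
+ * dataset name.
+ *
+ *	objset_t *os;
+ *	int error = dmu_objset_hold("tank/fs", FTAG, &os);
+ *	if (error == 0) {
+ *		... inspect os, e.g. dmu_objset_type(os) ...
+ *		dmu_objset_rele(os, FTAG);
+ *	}
+ */
+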
+#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DMU_OBJSET_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h new file mode 100644 index 000000000000..1f4b1f2cde9f --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h @@ -0,0 +1,93 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + */ + +#ifndef _DMU_SEND_H +#define _DMU_SEND_H + +#include <sys/spa.h> + +struct vnode; +struct dsl_dataset; +struct drr_begin; +struct avl_tree; +struct dmu_replay_record; + +extern const char *recv_clone_name; + +int dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok, + boolean_t large_block_ok, boolean_t compressok, int outfd, + uint64_t resumeobj, uint64_t resumeoff, +#ifdef illumos + struct vnode *vp, offset_t *off); +#else + struct file *fp, offset_t *off); +#endif +int dmu_send_estimate(struct dsl_dataset *ds, struct dsl_dataset *fromds, + boolean_t stream_compressed, uint64_t *sizep); +int dmu_send_estimate_from_txg(struct dsl_dataset *ds, uint64_t fromtxg, + boolean_t stream_compressed, uint64_t *sizep); +int dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, + boolean_t embedok, boolean_t large_block_ok, boolean_t compressok, +#ifdef illumos + int outfd, struct vnode *vp, offset_t *off); +#else + int outfd, struct file *fp, offset_t *off); +#endif + +typedef struct dmu_recv_cookie { + struct dsl_dataset *drc_ds; + struct dmu_replay_record *drc_drr_begin; + struct drr_begin *drc_drrb; + const char *drc_tofs; + const char *drc_tosnap; + boolean_t drc_newfs; + boolean_t drc_byteswap; + boolean_t drc_force; + boolean_t drc_resumable; + boolean_t drc_clone; + struct avl_tree *drc_guid_to_ds_map; + zio_cksum_t drc_cksum; + uint64_t drc_newsnapobj; + void *drc_owner; + cred_t *drc_cred; +} dmu_recv_cookie_t; + +int dmu_recv_begin(char *tofs, char *tosnap, + struct dmu_replay_record *drr_begin, + boolean_t force, boolean_t resumable, char *origin, dmu_recv_cookie_t *drc); +#ifdef illumos +int dmu_recv_stream(dmu_recv_cookie_t *drc, struct vnode *vp, offset_t *voffp, +#else +int dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp, +#endif + int cleanup_fd, uint64_t *action_handlep); +int dmu_recv_end(dmu_recv_cookie_t *drc, void *owner); +boolean_t dmu_objset_is_receiving(objset_t *os); + +#endif /* _DMU_SEND_H */ diff --git 
a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h new file mode 100644 index 000000000000..c010edd440d9 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h @@ -0,0 +1,69 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + */ + +#ifndef _SYS_DMU_TRAVERSE_H +#define _SYS_DMU_TRAVERSE_H + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/zio.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct dnode_phys; +struct dsl_dataset; +struct zilog; +struct arc_buf; + +typedef int (blkptr_cb_t)(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, + const zbookmark_phys_t *zb, const struct dnode_phys *dnp, void *arg); + +#define TRAVERSE_PRE (1<<0) +#define TRAVERSE_POST (1<<1) +#define TRAVERSE_PREFETCH_METADATA (1<<2) +#define TRAVERSE_PREFETCH_DATA (1<<3) +#define TRAVERSE_PREFETCH (TRAVERSE_PREFETCH_METADATA | TRAVERSE_PREFETCH_DATA) +#define TRAVERSE_HARD (1<<4) + +/* Special traverse error return value to indicate skipping of children */ +#define TRAVERSE_VISIT_NO_CHILDREN -1 + +int traverse_dataset(struct dsl_dataset *ds, + uint64_t txg_start, int flags, blkptr_cb_t func, void *arg); +int traverse_dataset_resume(struct dsl_dataset *ds, uint64_t txg_start, + zbookmark_phys_t *resume, int flags, blkptr_cb_t func, void *arg); +int traverse_dataset_destroyed(spa_t *spa, blkptr_t *blkptr, + uint64_t txg_start, zbookmark_phys_t *resume, int flags, + blkptr_cb_t func, void *arg); +int traverse_pool(spa_t *spa, + uint64_t txg_start, int flags, blkptr_cb_t func, void *arg); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DMU_TRAVERSE_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h new file mode 100644 index 000000000000..e1d0f4099734 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h @@ -0,0 +1,152 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright (c) 2012, 2016 by Delphix. All rights reserved. + */ + +#ifndef _SYS_DMU_TX_H +#define _SYS_DMU_TX_H + +#include <sys/dmu.h> +#include <sys/txg.h> +#include <sys/refcount.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct dmu_buf_impl; +struct dmu_tx_hold; +struct dnode_link; +struct dsl_pool; +struct dnode; +struct dsl_dir; + +struct dmu_tx { + /* + * No synchronization is needed because a tx can only be handled + * by one thread. + */ + list_t tx_holds; /* list of dmu_tx_hold_t */ + objset_t *tx_objset; + struct dsl_dir *tx_dir; + struct dsl_pool *tx_pool; + uint64_t tx_txg; + uint64_t tx_lastsnap_txg; + uint64_t tx_lasttried_txg; + txg_handle_t tx_txgh; + void *tx_tempreserve_cookie; + struct dmu_tx_hold *tx_needassign_txh; + + /* list of dmu_tx_callback_t on this dmu_tx */ + list_t tx_callbacks; + + /* placeholder for syncing context, doesn't need specific holds */ + boolean_t tx_anyobj; + + /* transaction is marked as being a "net free" of space */ + boolean_t tx_netfree; + + /* time this transaction was created */ + hrtime_t tx_start; + + /* need to wait for sufficient dirty space */ + boolean_t tx_wait_dirty; + + /* has this transaction already been delayed? */ + boolean_t tx_dirty_delayed; + + int tx_err; +}; + +enum dmu_tx_hold_type { + THT_NEWOBJECT, + THT_WRITE, + THT_BONUS, + THT_FREE, + THT_ZAP, + THT_SPACE, + THT_SPILL, + THT_NUMTYPES +}; + +typedef struct dmu_tx_hold { + dmu_tx_t *txh_tx; + list_node_t txh_node; + struct dnode *txh_dnode; + refcount_t txh_space_towrite; + refcount_t txh_memory_tohold; + enum dmu_tx_hold_type txh_type; + uint64_t txh_arg1; + uint64_t txh_arg2; +} dmu_tx_hold_t; + +typedef struct dmu_tx_callback { + list_node_t dcb_node; /* linked to tx_callbacks list */ + dmu_tx_callback_func_t *dcb_func; /* caller function pointer */ + void *dcb_data; /* caller private data */ +} dmu_tx_callback_t; + +/* + * These routines are defined in dmu.h, and are called by the user. + */ +dmu_tx_t *dmu_tx_create(objset_t *dd); +int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how); +void dmu_tx_commit(dmu_tx_t *tx); +void dmu_tx_abort(dmu_tx_t *tx); +uint64_t dmu_tx_get_txg(dmu_tx_t *tx); +struct dsl_pool *dmu_tx_pool(dmu_tx_t *tx); +void dmu_tx_wait(dmu_tx_t *tx); + +void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *dcb_func, + void *dcb_data); +void dmu_tx_do_callbacks(list_t *cb_list, int error); + +/* + * These routines are defined in dmu_spa.h, and are called by the SPA. + */ +extern dmu_tx_t *dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg); + +/* + * These routines are only called by the DMU. 
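+ *
+ * For contrast, the consumer-side lifecycle of the public routines
+ * declared above follows this pattern (an illustrative sketch; dmu.h
+ * carries the authoritative contract, and dmu_tx_hold_write() is
+ * declared there rather than here):
+ *
+ *	dmu_tx_t *tx = dmu_tx_create(os);
+ *	dmu_tx_hold_write(tx, object, off, len);
+ *	error = dmu_tx_assign(tx, TXG_WAIT);
+ *	if (error != 0) {
+ *		dmu_tx_abort(tx);
+ *		return (error);
+ *	}
+ *	dmu_write(os, object, off, len, buf, tx);
+ *	dmu_tx_commit(tx);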
+ */ +dmu_tx_t *dmu_tx_create_dd(dsl_dir_t *dd); +int dmu_tx_is_syncing(dmu_tx_t *tx); +int dmu_tx_private_ok(dmu_tx_t *tx); +void dmu_tx_add_new_object(dmu_tx_t *tx, dnode_t *dn); +void dmu_tx_dirty_buf(dmu_tx_t *tx, struct dmu_buf_impl *db); +void dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space); + +#ifdef ZFS_DEBUG +#define DMU_TX_DIRTY_BUF(tx, db) dmu_tx_dirty_buf(tx, db) +#else +#define DMU_TX_DIRTY_BUF(tx, db) +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DMU_TX_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_zfetch.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_zfetch.h new file mode 100644 index 000000000000..21a3ff3a2032 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_zfetch.h @@ -0,0 +1,76 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 2014 by Delphix. All rights reserved. + */ + +#ifndef _DMU_ZFETCH_H +#define _DMU_ZFETCH_H + +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +extern uint64_t zfetch_array_rd_sz; + +struct dnode; /* so we can reference dnode */ + +typedef struct zstream { + uint64_t zs_blkid; /* expect next access at this blkid */ + uint64_t zs_pf_blkid; /* next block to prefetch */ + + /* + * We will next prefetch the L1 indirect block of this level-0 + * block id. + */ + uint64_t zs_ipf_blkid; + + kmutex_t zs_lock; /* protects stream */ + hrtime_t zs_atime; /* time last prefetch issued */ + list_node_t zs_node; /* link for zf_stream */ +} zstream_t; + +typedef struct zfetch { + krwlock_t zf_rwlock; /* protects zfetch structure */ + list_t zf_stream; /* list of zstream_t's */ + struct dnode *zf_dnode; /* dnode that owns this zfetch */ +} zfetch_t; + +void zfetch_init(void); +void zfetch_fini(void); + +void dmu_zfetch_init(zfetch_t *, struct dnode *); +void dmu_zfetch_fini(zfetch_t *); +void dmu_zfetch(zfetch_t *, uint64_t, uint64_t, boolean_t); + + +#ifdef __cplusplus +} +#endif + +#endif /* _DMU_ZFETCH_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h new file mode 100644 index 000000000000..74acef0ae194 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h @@ -0,0 +1,411 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + */ + +#ifndef _SYS_DNODE_H +#define _SYS_DNODE_H + +#include <sys/zfs_context.h> +#include <sys/avl.h> +#include <sys/spa.h> +#include <sys/txg.h> +#include <sys/zio.h> +#include <sys/refcount.h> +#include <sys/dmu_zfetch.h> +#include <sys/zrlock.h> +#include <sys/multilist.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * dnode_hold() flags. + */ +#define DNODE_MUST_BE_ALLOCATED 1 +#define DNODE_MUST_BE_FREE 2 + +/* + * dnode_next_offset() flags. + */ +#define DNODE_FIND_HOLE 1 +#define DNODE_FIND_BACKWARDS 2 +#define DNODE_FIND_HAVELOCK 4 + +/* + * Fixed constants. + */ +#define DNODE_SHIFT 9 /* 512 bytes */ +#define DN_MIN_INDBLKSHIFT 12 /* 4k */ +/* + * If we ever increase this value beyond 20, we need to revisit all logic that + * does x << level * ebps to handle overflow. With a 1M indirect block size, + * 4 levels of indirect blocks would not be able to guarantee addressing an + * entire object, so 5 levels will be used, but 5 * (20 - 7) = 65. + */ +#define DN_MAX_INDBLKSHIFT 17 /* 128k */ +#define DNODE_BLOCK_SHIFT 14 /* 16k */ +#define DNODE_CORE_SIZE 64 /* 64 bytes for dnode sans blkptrs */ +#define DN_MAX_OBJECT_SHIFT 48 /* 256 trillion (zfs_fid_t limit) */ +#define DN_MAX_OFFSET_SHIFT 64 /* 2^64 bytes in a dnode */ + +/* + * dnode id flags + * + * Note: a file will never ever have its + * ids moved from bonus->spill + * and only in a crypto environment would it be on spill + */ +#define DN_ID_CHKED_BONUS 0x1 +#define DN_ID_CHKED_SPILL 0x2 +#define DN_ID_OLD_EXIST 0x4 +#define DN_ID_NEW_EXIST 0x8 + +/* + * Derived constants. + */ +#define DNODE_MIN_SIZE (1 << DNODE_SHIFT) +#define DNODE_MAX_SIZE (1 << DNODE_BLOCK_SHIFT) +#define DNODE_BLOCK_SIZE (1 << DNODE_BLOCK_SHIFT) +#define DNODE_MIN_SLOTS (DNODE_MIN_SIZE >> DNODE_SHIFT) +#define DNODE_MAX_SLOTS (DNODE_MAX_SIZE >> DNODE_SHIFT) +#define DN_BONUS_SIZE(dnsize) ((dnsize) - DNODE_CORE_SIZE - \ + (1 << SPA_BLKPTRSHIFT)) +#define DN_SLOTS_TO_BONUSLEN(slots) DN_BONUS_SIZE((slots) << DNODE_SHIFT) +#define DN_OLD_MAX_BONUSLEN (DN_BONUS_SIZE(DNODE_MIN_SIZE)) +#define DN_MAX_NBLKPTR ((DNODE_MIN_SIZE - DNODE_CORE_SIZE) >> SPA_BLKPTRSHIFT) +#define DN_MAX_OBJECT (1ULL << DN_MAX_OBJECT_SHIFT) +#define DN_ZERO_BONUSLEN (DN_BONUS_SIZE(DNODE_MAX_SIZE) + 1) +#define DN_KILL_SPILLBLK (1) + +#define DNODES_PER_BLOCK_SHIFT (DNODE_BLOCK_SHIFT - DNODE_SHIFT) +#define DNODES_PER_BLOCK (1ULL << DNODES_PER_BLOCK_SHIFT) + +/* + * This is inaccurate if the indblkshift of the particular object is not the + * max. But it's only used by userland to calculate the zvol reservation. 
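+ *
+ * Worked example: with SPA_BLKPTRSHIFT == 7 (128-byte block pointers)
+ * and DN_MAX_INDBLKSHIFT == 17 (128K indirect blocks), the shift below
+ * is 17 - 7 == 10, i.e. a fan-out of 2^10 == 1024 block pointers per
+ * indirect block; an object that actually uses 16K indirect blocks
+ * (shift 14) fans out to only 2^(14 - 7) == 128, so the constant can
+ * over-estimate the per-level fan-out.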
+ */ +#define DNODES_PER_LEVEL_SHIFT (DN_MAX_INDBLKSHIFT - SPA_BLKPTRSHIFT) +#define DNODES_PER_LEVEL (1ULL << DNODES_PER_LEVEL_SHIFT) + +/* The +2 here is a cheesy way to round up */ +#define DN_MAX_LEVELS (2 + ((DN_MAX_OFFSET_SHIFT - SPA_MINBLOCKSHIFT) / \ + (DN_MIN_INDBLKSHIFT - SPA_BLKPTRSHIFT))) + +#define DN_BONUS(dnp) ((void*)((dnp)->dn_bonus + \ + (((dnp)->dn_nblkptr - 1) * sizeof (blkptr_t)))) +#define DN_MAX_BONUS_LEN(dnp) \ + ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) ? \ + (uint8_t *)DN_SPILL_BLKPTR(dnp) - (uint8_t *)DN_BONUS(dnp) : \ + (uint8_t *)(dnp + (dnp->dn_extra_slots + 1)) - (uint8_t *)DN_BONUS(dnp)) + +#define DN_USED_BYTES(dnp) (((dnp)->dn_flags & DNODE_FLAG_USED_BYTES) ? \ + (dnp)->dn_used : (dnp)->dn_used << SPA_MINBLOCKSHIFT) + +#define EPB(blkshift, typeshift) (1 << (blkshift - typeshift)) + +struct dmu_buf_impl; +struct objset; +struct zio; + +enum dnode_dirtycontext { + DN_UNDIRTIED, + DN_DIRTY_OPEN, + DN_DIRTY_SYNC +}; + +/* Is dn_used in bytes? if not, it's in multiples of SPA_MINBLOCKSIZE */ +#define DNODE_FLAG_USED_BYTES (1<<0) +#define DNODE_FLAG_USERUSED_ACCOUNTED (1<<1) + +/* Does dnode have a SA spill blkptr in bonus? */ +#define DNODE_FLAG_SPILL_BLKPTR (1<<2) + +typedef struct dnode_phys { + uint8_t dn_type; /* dmu_object_type_t */ + uint8_t dn_indblkshift; /* ln2(indirect block size) */ + uint8_t dn_nlevels; /* 1=dn_blkptr->data blocks */ + uint8_t dn_nblkptr; /* length of dn_blkptr */ + uint8_t dn_bonustype; /* type of data in bonus buffer */ + uint8_t dn_checksum; /* ZIO_CHECKSUM type */ + uint8_t dn_compress; /* ZIO_COMPRESS type */ + uint8_t dn_flags; /* DNODE_FLAG_* */ + uint16_t dn_datablkszsec; /* data block size in 512b sectors */ + uint16_t dn_bonuslen; /* length of dn_bonus */ + uint8_t dn_extra_slots; /* # of subsequent slots consumed */ + uint8_t dn_pad2[3]; + + /* accounting is protected by dn_dirty_mtx */ + uint64_t dn_maxblkid; /* largest allocated block ID */ + uint64_t dn_used; /* bytes (or sectors) of disk space */ + + /* + * Both dn_pad2 and dn_pad3 are protected by the block's MAC. This + * allows us to protect any fields that might be added here in the + * future. In either case, developers will want to check + * zio_crypt_init_uios_dnode() to ensure the new field is being + * protected properly. + */ + uint64_t dn_pad3[4]; + /* + * The tail region is 448 bytes for a 512 byte dnode, and + * correspondingly larger for larger dnode sizes. The spill + * block pointer, when present, is always at the end of the tail + * region. 
There are three ways this space may be used, using + * a 512 byte dnode for this diagram: + * + * 0 64 128 192 256 320 384 448 (offset) + * +---------------+---------------+---------------+-------+ + * | dn_blkptr[0] | dn_blkptr[1] | dn_blkptr[2] | / | + * +---------------+---------------+---------------+-------+ + * | dn_blkptr[0] | dn_bonus[0..319] | + * +---------------+-----------------------+---------------+ + * | dn_blkptr[0] | dn_bonus[0..191] | dn_spill | + * +---------------+-----------------------+---------------+ + */ + union { + blkptr_t dn_blkptr[1+DN_OLD_MAX_BONUSLEN/sizeof (blkptr_t)]; + struct { + blkptr_t __dn_ignore1; + uint8_t dn_bonus[DN_OLD_MAX_BONUSLEN]; + }; + struct { + blkptr_t __dn_ignore2; + uint8_t __dn_ignore3[DN_OLD_MAX_BONUSLEN - + sizeof (blkptr_t)]; + blkptr_t dn_spill; + }; + }; +} dnode_phys_t; + +#define DN_SPILL_BLKPTR(dnp) (blkptr_t *)((char *)(dnp) + \ + (((dnp)->dn_extra_slots + 1) << DNODE_SHIFT) - (1 << SPA_BLKPTRSHIFT)) + +struct dnode { + /* + * Protects the structure of the dnode, including the number of levels + * of indirection (dn_nlevels), dn_maxblkid, and dn_next_* + */ + krwlock_t dn_struct_rwlock; + + /* Our link on dn_objset->os_dnodes list; protected by os_lock. */ + list_node_t dn_link; + + /* immutable: */ + struct objset *dn_objset; + uint64_t dn_object; + struct dmu_buf_impl *dn_dbuf; + struct dnode_handle *dn_handle; + dnode_phys_t *dn_phys; /* pointer into dn->dn_dbuf->db.db_data */ + + /* + * Copies of stuff in dn_phys. They're valid in the open + * context (eg. even before the dnode is first synced). + * Where necessary, these are protected by dn_struct_rwlock. + */ + dmu_object_type_t dn_type; /* object type */ + uint16_t dn_bonuslen; /* bonus length */ + uint8_t dn_bonustype; /* bonus type */ + uint8_t dn_nblkptr; /* number of blkptrs (immutable) */ + uint8_t dn_checksum; /* ZIO_CHECKSUM type */ + uint8_t dn_compress; /* ZIO_COMPRESS type */ + uint8_t dn_nlevels; + uint8_t dn_indblkshift; + uint8_t dn_datablkshift; /* zero if blksz not power of 2! */ + uint8_t dn_moved; /* Has this dnode been moved? */ + uint16_t dn_datablkszsec; /* in 512b sectors */ + uint32_t dn_datablksz; /* in bytes */ + uint64_t dn_maxblkid; + uint8_t dn_next_type[TXG_SIZE]; + uint8_t dn_num_slots; /* metadnode slots consumed on disk */ + uint8_t dn_next_nblkptr[TXG_SIZE]; + uint8_t dn_next_nlevels[TXG_SIZE]; + uint8_t dn_next_indblkshift[TXG_SIZE]; + uint8_t dn_next_bonustype[TXG_SIZE]; + uint8_t dn_rm_spillblk[TXG_SIZE]; /* for removing spill blk */ + uint16_t dn_next_bonuslen[TXG_SIZE]; + uint32_t dn_next_blksz[TXG_SIZE]; /* next block size in bytes */ + + /* protected by dn_dbufs_mtx; declared here to fill 32-bit hole */ + uint32_t dn_dbufs_count; /* count of dn_dbufs */ + + /* protected by os_lock: */ + multilist_node_t dn_dirty_link[TXG_SIZE]; /* next on dataset's dirty */ + + /* protected by dn_mtx: */ + kmutex_t dn_mtx; + list_t dn_dirty_records[TXG_SIZE]; + struct range_tree *dn_free_ranges[TXG_SIZE]; + uint64_t dn_allocated_txg; + uint64_t dn_free_txg; + uint64_t dn_assigned_txg; + kcondvar_t dn_notxholds; + enum dnode_dirtycontext dn_dirtyctx; + uint8_t *dn_dirtyctx_firstset; /* dbg: contents meaningless */ + + /* protected by own devices */ + refcount_t dn_tx_holds; + refcount_t dn_holds; + + kmutex_t dn_dbufs_mtx; + /* + * Descendent dbufs, ordered by dbuf_compare. Note that dn_dbufs + * can contain multiple dbufs of the same (level, blkid) when a + * dbuf is marked DB_EVICTING without being removed from + * dn_dbufs. 
To maintain the avl invariant that there cannot be + * duplicate entries, we order the dbufs by an arbitrary value - + * their address in memory. This means that dn_dbufs cannot be used to + * directly look up a dbuf. Instead, callers must use avl_walk, have + * a reference to the dbuf, or look up a non-existant node with + * db_state = DB_SEARCH (see dbuf_free_range for an example). + */ + avl_tree_t dn_dbufs; + + /* protected by dn_struct_rwlock */ + struct dmu_buf_impl *dn_bonus; /* bonus buffer dbuf */ + + boolean_t dn_have_spill; /* have spill or are spilling */ + + /* parent IO for current sync write */ + zio_t *dn_zio; + + /* used in syncing context */ + uint64_t dn_oldused; /* old phys used bytes */ + uint64_t dn_oldflags; /* old phys dn_flags */ + uint64_t dn_olduid, dn_oldgid; + uint64_t dn_newuid, dn_newgid; + int dn_id_flags; + + /* holds prefetch structure */ + struct zfetch dn_zfetch; +}; + +/* + * Adds a level of indirection between the dbuf and the dnode to avoid + * iterating descendent dbufs in dnode_move(). Handles are not allocated + * individually, but as an array of child dnodes in dnode_hold_impl(). + */ +typedef struct dnode_handle { + /* Protects dnh_dnode from modification by dnode_move(). */ + zrlock_t dnh_zrlock; + dnode_t *dnh_dnode; +} dnode_handle_t; + +typedef struct dnode_children { + dmu_buf_user_t dnc_dbu; /* User evict data */ + size_t dnc_count; /* number of children */ + dnode_handle_t dnc_children[]; /* sized dynamically */ +} dnode_children_t; + +typedef struct free_range { + avl_node_t fr_node; + uint64_t fr_blkid; + uint64_t fr_nblks; +} free_range_t; + +void dnode_special_open(struct objset *dd, dnode_phys_t *dnp, + uint64_t object, dnode_handle_t *dnh); +void dnode_special_close(dnode_handle_t *dnh); + +void dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx); +void dnode_setbonus_type(dnode_t *dn, dmu_object_type_t, dmu_tx_t *tx); +void dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx); + +int dnode_hold(struct objset *dd, uint64_t object, + void *ref, dnode_t **dnp); +int dnode_hold_impl(struct objset *dd, uint64_t object, int flag, int dn_slots, + void *ref, dnode_t **dnp); +boolean_t dnode_add_ref(dnode_t *dn, void *ref); +void dnode_rele(dnode_t *dn, void *ref); +void dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting); +void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx); +void dnode_sync(dnode_t *dn, dmu_tx_t *tx); +void dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, + dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx); +void dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, + dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx); +void dnode_free(dnode_t *dn, dmu_tx_t *tx); +void dnode_byteswap(dnode_phys_t *dnp); +void dnode_buf_byteswap(void *buf, size_t size); +void dnode_verify(dnode_t *dn); +int dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx); +void dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx); +void dnode_diduse_space(dnode_t *dn, int64_t space); +void dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t); +uint64_t dnode_block_freed(dnode_t *dn, uint64_t blkid); +void dnode_init(void); +void dnode_fini(void); +int dnode_next_offset(dnode_t *dn, int flags, uint64_t *off, + int minlvl, uint64_t blkfill, uint64_t txg); +void dnode_evict_dbufs(dnode_t *dn); +void dnode_evict_bonus(dnode_t *dn); +boolean_t dnode_needs_remap(const dnode_t *dn); + +#define DNODE_IS_CACHEABLE(_dn) \ + 
((_dn)->dn_objset->os_primary_cache == ZFS_CACHE_ALL || \ + (DMU_OT_IS_METADATA((_dn)->dn_type) && \ + (_dn)->dn_objset->os_primary_cache == ZFS_CACHE_METADATA)) + +#define DNODE_META_IS_CACHEABLE(_dn) \ + ((_dn)->dn_objset->os_primary_cache == ZFS_CACHE_ALL || \ + (_dn)->dn_objset->os_primary_cache == ZFS_CACHE_METADATA) + +#ifdef ZFS_DEBUG + +/* + * There should be a ## between the string literal and fmt, to make it + * clear that we're joining two strings together, but that piece of shit + * gcc doesn't support that preprocessor token. + */ +#define dprintf_dnode(dn, fmt, ...) do { \ + if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ + char __db_buf[32]; \ + uint64_t __db_obj = (dn)->dn_object; \ + if (__db_obj == DMU_META_DNODE_OBJECT) \ + (void) strcpy(__db_buf, "mdn"); \ + else \ + (void) snprintf(__db_buf, sizeof (__db_buf), "%lld", \ + (u_longlong_t)__db_obj);\ + dprintf_ds((dn)->dn_objset->os_dsl_dataset, "obj=%s " fmt, \ + __db_buf, __VA_ARGS__); \ + } \ +_NOTE(CONSTCOND) } while (0) + +#define DNODE_VERIFY(dn) dnode_verify(dn) +#define FREE_VERIFY(db, start, end, tx) free_verify(db, start, end, tx) + +#else + +#define dprintf_dnode(db, fmt, ...) +#define DNODE_VERIFY(dn) +#define FREE_VERIFY(db, start, end, tx) + +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DNODE_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_bookmark.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_bookmark.h new file mode 100644 index 000000000000..3591986d7bd7 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_bookmark.h @@ -0,0 +1,51 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2013 by Delphix. All rights reserved. + */ + +#ifndef _SYS_DSL_BOOKMARK_H +#define _SYS_DSL_BOOKMARK_H + +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct dsl_pool; +struct dsl_dataset; + +/* + * On disk zap object. + */ +typedef struct zfs_bookmark_phys { + uint64_t zbm_guid; /* guid of bookmarked dataset */ + uint64_t zbm_creation_txg; /* birth transaction group */ + uint64_t zbm_creation_time; /* bookmark creation time */ +} zfs_bookmark_phys_t; + +int dsl_bookmark_create(nvlist_t *, nvlist_t *); +int dsl_get_bookmarks(const char *, nvlist_t *, nvlist_t *); +int dsl_get_bookmarks_impl(dsl_dataset_t *, nvlist_t *, nvlist_t *); +int dsl_bookmark_destroy(nvlist_t *, nvlist_t *); +int dsl_bookmark_lookup(struct dsl_pool *, const char *, + struct dsl_dataset *, zfs_bookmark_phys_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DSL_BOOKMARK_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h new file mode 100644 index 000000000000..15a64a832630 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h @@ -0,0 +1,457 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2017 by Delphix. All rights reserved. + * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2013 Steven Hartland. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + */ + +#ifndef _SYS_DSL_DATASET_H +#define _SYS_DSL_DATASET_H + +#include <sys/dmu.h> +#include <sys/spa.h> +#include <sys/txg.h> +#include <sys/zio.h> +#include <sys/bplist.h> +#include <sys/dsl_synctask.h> +#include <sys/zfs_context.h> +#include <sys/dsl_deadlist.h> +#include <sys/refcount.h> +#include <sys/rrwlock.h> +#include <zfeature_common.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct dsl_dataset; +struct dsl_dir; +struct dsl_pool; + +#define DS_FLAG_INCONSISTENT (1ULL<<0) +#define DS_IS_INCONSISTENT(ds) \ + (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_INCONSISTENT) + +/* + * Do not allow this dataset to be promoted. + */ +#define DS_FLAG_NOPROMOTE (1ULL<<1) + +/* + * DS_FLAG_UNIQUE_ACCURATE is set if ds_unique_bytes has been correctly + * calculated for head datasets (starting with SPA_VERSION_UNIQUE_ACCURATE, + * refquota/refreservations). + */ +#define DS_FLAG_UNIQUE_ACCURATE (1ULL<<2) + +/* + * DS_FLAG_DEFER_DESTROY is set after 'zfs destroy -d' has been called + * on a dataset. This allows the dataset to be destroyed using 'zfs release'. + */ +#define DS_FLAG_DEFER_DESTROY (1ULL<<3) +#define DS_IS_DEFER_DESTROY(ds) \ + (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_DEFER_DESTROY) + +/* + * DS_FIELD_* are strings that are used in the "extensified" dataset zap object. + * They should be of the format <reverse-dns>:<field>. + */ + +/* + * This field's value is the object ID of a zap object which contains the + * bookmarks of this dataset. If it is present, then this dataset is counted + * in the refcount of the SPA_FEATURES_BOOKMARKS feature. + */ +#define DS_FIELD_BOOKMARK_NAMES "com.delphix:bookmarks" + +/* + * This field is present (with value=0) if this dataset may contain large + * dnodes (>512B). If it is present, then this dataset is counted in the + * refcount of the SPA_FEATURE_LARGE_DNODE feature. + */ +#define DS_FIELD_LARGE_DNODE "org.zfsonlinux:large_dnode" + +/* + * These fields are set on datasets that are in the middle of a resumable + * receive, and allow the sender to resume the send if it is interrupted. 
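+ *
+ * Each DS_FIELD_* below names an entry in the dataset's zapified
+ * object in the MOS. An illustrative lookup of one of them (a sketch;
+ * "mos" stands in for the pool's meta objset):
+ *
+ *	uint64_t val;
+ *	error = zap_lookup(mos, ds->ds_object, DS_FIELD_RESUME_TOGUID,
+ *	    sizeof (uint64_t), 1, &val);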
+ */ +#define DS_FIELD_RESUME_FROMGUID "com.delphix:resume_fromguid" +#define DS_FIELD_RESUME_TONAME "com.delphix:resume_toname" +#define DS_FIELD_RESUME_TOGUID "com.delphix:resume_toguid" +#define DS_FIELD_RESUME_OBJECT "com.delphix:resume_object" +#define DS_FIELD_RESUME_OFFSET "com.delphix:resume_offset" +#define DS_FIELD_RESUME_BYTES "com.delphix:resume_bytes" +#define DS_FIELD_RESUME_LARGEBLOCK "com.delphix:resume_largeblockok" +#define DS_FIELD_RESUME_EMBEDOK "com.delphix:resume_embedok" +#define DS_FIELD_RESUME_COMPRESSOK "com.delphix:resume_compressok" + +/* + * This field is set to the object number of the remap deadlist if one exists. + */ +#define DS_FIELD_REMAP_DEADLIST "com.delphix:remap_deadlist" + +/* + * DS_FLAG_CI_DATASET is set if the dataset contains a file system whose + * name lookups should be performed case-insensitively. + */ +#define DS_FLAG_CI_DATASET (1ULL<<16) + +#define DS_CREATE_FLAG_NODIRTY (1ULL<<24) + +typedef struct dsl_dataset_phys { + uint64_t ds_dir_obj; /* DMU_OT_DSL_DIR */ + uint64_t ds_prev_snap_obj; /* DMU_OT_DSL_DATASET */ + uint64_t ds_prev_snap_txg; + uint64_t ds_next_snap_obj; /* DMU_OT_DSL_DATASET */ + uint64_t ds_snapnames_zapobj; /* DMU_OT_DSL_DS_SNAP_MAP 0 for snaps */ + uint64_t ds_num_children; /* clone/snap children; ==0 for head */ + uint64_t ds_creation_time; /* seconds since 1970 */ + uint64_t ds_creation_txg; + uint64_t ds_deadlist_obj; /* DMU_OT_DEADLIST */ + /* + * ds_referenced_bytes, ds_compressed_bytes, and ds_uncompressed_bytes + * include all blocks referenced by this dataset, including those + * shared with any other datasets. + */ + uint64_t ds_referenced_bytes; + uint64_t ds_compressed_bytes; + uint64_t ds_uncompressed_bytes; + uint64_t ds_unique_bytes; /* only relevant to snapshots */ + /* + * The ds_fsid_guid is a 56-bit ID that can change to avoid + * collisions. The ds_guid is a 64-bit ID that will never + * change, so there is a small probability that it will collide. + */ + uint64_t ds_fsid_guid; + uint64_t ds_guid; + uint64_t ds_flags; /* DS_FLAG_* */ + blkptr_t ds_bp; + uint64_t ds_next_clones_obj; /* DMU_OT_DSL_CLONES */ + uint64_t ds_props_obj; /* DMU_OT_DSL_PROPS for snaps */ + uint64_t ds_userrefs_obj; /* DMU_OT_USERREFS */ + uint64_t ds_pad[5]; /* pad out to 320 bytes for good measure */ +} dsl_dataset_phys_t; + +typedef struct dsl_dataset { + dmu_buf_user_t ds_dbu; + rrwlock_t ds_bp_rwlock; /* Protects ds_phys->ds_bp */ + + /* Immutable: */ + struct dsl_dir *ds_dir; + dmu_buf_t *ds_dbuf; + uint64_t ds_object; + uint64_t ds_fsid_guid; + boolean_t ds_is_snapshot; + + /* only used in syncing context, only valid for non-snapshots: */ + struct dsl_dataset *ds_prev; + uint64_t ds_bookmarks; /* DMU_OTN_ZAP_METADATA */ + + /* has internal locking: */ + dsl_deadlist_t ds_deadlist; + bplist_t ds_pending_deadlist; + + /* + * The remap deadlist contains blocks (DVA's, really) that are + * referenced by the previous snapshot and point to indirect vdevs, + * but in this dataset they have been remapped to point to concrete + * (or at least, less-indirect) vdevs. In other words, the + * physical DVA is referenced by the previous snapshot but not by + * this dataset. Logically, the DVA continues to be referenced, + * but we are using a different (less indirect) physical DVA. + * This deadlist is used to determine when physical DVAs that + * point to indirect vdevs are no longer referenced anywhere, + * and thus should be marked obsolete. + * + * This is only used if SPA_FEATURE_OBSOLETE_COUNTS is enabled. 
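+ *
+ * An illustrative gate for consulting it (a sketch of the pattern
+ * only; the authoritative callers live in dsl_dataset.c, and "spa"
+ * and "ds" are hypothetical locals):
+ *
+ *	if (spa_feature_is_active(spa, SPA_FEATURE_OBSOLETE_COUNTS) &&
+ *	    dsl_dataset_remap_deadlist_exists(ds))
+ *		... the remapped DVAs can be tracked as obsolete ...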
+ */ + dsl_deadlist_t ds_remap_deadlist; + /* protects creation of the ds_remap_deadlist */ + kmutex_t ds_remap_deadlist_lock; + + /* protected by lock on pool's dp_dirty_datasets list */ + txg_node_t ds_dirty_link; + list_node_t ds_synced_link; + + /* + * ds_phys->ds_<accounting> is also protected by ds_lock. + * Protected by ds_lock: + */ + kmutex_t ds_lock; + objset_t *ds_objset; + uint64_t ds_userrefs; + void *ds_owner; + + /* + * Long holds prevent the ds from being destroyed; they allow the + * ds to remain held even after dropping the dp_config_rwlock. + * Owning counts as a long hold. See the comments above + * dsl_pool_hold() for details. + */ + refcount_t ds_longholds; + + /* no locking; only for making guesses */ + uint64_t ds_trysnap_txg; + + /* for objset_open() */ + kmutex_t ds_opening_lock; + + uint64_t ds_reserved; /* cached refreservation */ + uint64_t ds_quota; /* cached refquota */ + + kmutex_t ds_sendstream_lock; + list_t ds_sendstreams; + + /* + * When in the middle of a resumable receive, tracks how much + * progress we have made. + */ + uint64_t ds_resume_object[TXG_SIZE]; + uint64_t ds_resume_offset[TXG_SIZE]; + uint64_t ds_resume_bytes[TXG_SIZE]; + + /* Protected by our dsl_dir's dd_lock */ + list_t ds_prop_cbs; + + /* + * For ZFEATURE_FLAG_PER_DATASET features, set if this dataset + * uses this feature. + */ + uint8_t ds_feature_inuse[SPA_FEATURES]; + + /* + * Set if we need to activate the feature on this dataset this txg + * (used only in syncing context). + */ + uint8_t ds_feature_activation_needed[SPA_FEATURES]; + + /* Protected by ds_lock; keep at end of struct for better locality */ + char ds_snapname[ZFS_MAX_DATASET_NAME_LEN]; +} dsl_dataset_t; + +inline dsl_dataset_phys_t * +dsl_dataset_phys(dsl_dataset_t *ds) +{ + return (ds->ds_dbuf->db_data); +} + +typedef struct dsl_dataset_promote_arg { + const char *ddpa_clonename; + dsl_dataset_t *ddpa_clone; + list_t shared_snaps, origin_snaps, clone_snaps; + dsl_dataset_t *origin_origin; /* origin of the origin */ + uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap; + nvlist_t *err_ds; + cred_t *cr; +} dsl_dataset_promote_arg_t; + +typedef struct dsl_dataset_rollback_arg { + const char *ddra_fsname; + const char *ddra_tosnap; + void *ddra_owner; + nvlist_t *ddra_result; +} dsl_dataset_rollback_arg_t; + +typedef struct dsl_dataset_snapshot_arg { + nvlist_t *ddsa_snaps; + nvlist_t *ddsa_props; + nvlist_t *ddsa_errors; + cred_t *ddsa_cr; +} dsl_dataset_snapshot_arg_t; + +/* + * The max length of a temporary tag prefix is the number of hex digits + * required to express UINT64_MAX plus one for the hyphen. 
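+ * That is, UINT64_MAX is 0xFFFFFFFFFFFFFFFF -- 16 hex digits -- so a
+ * prefix such as "ffffffffffffffff-" requires 16 + 1 == 17 bytes,
+ * matching the definition below.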
+ */ +#define MAX_TAG_PREFIX_LEN 17 + +#define dsl_dataset_is_snapshot(ds) \ + (dsl_dataset_phys(ds)->ds_num_children != 0) + +#define DS_UNIQUE_IS_ACCURATE(ds) \ + ((dsl_dataset_phys(ds)->ds_flags & DS_FLAG_UNIQUE_ACCURATE) != 0) + +int dsl_dataset_hold(struct dsl_pool *dp, const char *name, void *tag, + dsl_dataset_t **dsp); +boolean_t dsl_dataset_try_add_ref(struct dsl_pool *dp, dsl_dataset_t *ds, + void *tag); +int dsl_dataset_hold_obj(struct dsl_pool *dp, uint64_t dsobj, void *tag, + dsl_dataset_t **); +void dsl_dataset_rele(dsl_dataset_t *ds, void *tag); +int dsl_dataset_own(struct dsl_pool *dp, const char *name, + void *tag, dsl_dataset_t **dsp); +int dsl_dataset_own_obj(struct dsl_pool *dp, uint64_t dsobj, + void *tag, dsl_dataset_t **dsp); +void dsl_dataset_disown(dsl_dataset_t *ds, void *tag); +void dsl_dataset_name(dsl_dataset_t *ds, char *name); +boolean_t dsl_dataset_tryown(dsl_dataset_t *ds, void *tag); +int dsl_dataset_namelen(dsl_dataset_t *ds); +boolean_t dsl_dataset_has_owner(dsl_dataset_t *ds); +uint64_t dsl_dataset_create_sync(dsl_dir_t *pds, const char *lastname, + dsl_dataset_t *origin, uint64_t flags, cred_t *, dmu_tx_t *); +uint64_t dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, + uint64_t flags, dmu_tx_t *tx); +void dsl_dataset_snapshot_sync(void *arg, dmu_tx_t *tx); +int dsl_dataset_snapshot_check(void *arg, dmu_tx_t *tx); +int dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors); +void dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx); +int dsl_dataset_promote_check(void *arg, dmu_tx_t *tx); +int dsl_dataset_promote(const char *name, char *conflsnap); +int dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head, + boolean_t force); +int dsl_dataset_rename_snapshot(const char *fsname, + const char *oldsnapname, const char *newsnapname, boolean_t recursive); +int dsl_dataset_snapshot_tmp(const char *fsname, const char *snapname, + minor_t cleanup_minor, const char *htag); + +blkptr_t *dsl_dataset_get_blkptr(dsl_dataset_t *ds); + +spa_t *dsl_dataset_get_spa(dsl_dataset_t *ds); + +boolean_t dsl_dataset_modified_since_snap(dsl_dataset_t *ds, + dsl_dataset_t *snap); + +void dsl_dataset_sync(dsl_dataset_t *os, zio_t *zio, dmu_tx_t *tx); +void dsl_dataset_sync_done(dsl_dataset_t *os, dmu_tx_t *tx); + +void dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, + dmu_tx_t *tx); +int dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, + dmu_tx_t *tx, boolean_t async); +void dsl_dataset_block_remapped(dsl_dataset_t *ds, uint64_t vdev, + uint64_t offset, uint64_t size, uint64_t birth, dmu_tx_t *tx); + +void dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx); + +int get_clones_stat_impl(dsl_dataset_t *ds, nvlist_t *val); +char *get_receive_resume_stats_impl(dsl_dataset_t *ds); +char *get_child_receive_stats(dsl_dataset_t *ds); +uint64_t dsl_get_refratio(dsl_dataset_t *ds); +uint64_t dsl_get_logicalreferenced(dsl_dataset_t *ds); +uint64_t dsl_get_compressratio(dsl_dataset_t *ds); +uint64_t dsl_get_used(dsl_dataset_t *ds); +uint64_t dsl_get_creation(dsl_dataset_t *ds); +uint64_t dsl_get_creationtxg(dsl_dataset_t *ds); +uint64_t dsl_get_refquota(dsl_dataset_t *ds); +uint64_t dsl_get_refreservation(dsl_dataset_t *ds); +uint64_t dsl_get_guid(dsl_dataset_t *ds); +uint64_t dsl_get_unique(dsl_dataset_t *ds); +uint64_t dsl_get_objsetid(dsl_dataset_t *ds); +uint64_t dsl_get_userrefs(dsl_dataset_t *ds); +uint64_t dsl_get_defer_destroy(dsl_dataset_t *ds); +uint64_t dsl_get_referenced(dsl_dataset_t *ds); +uint64_t 
dsl_get_numclones(dsl_dataset_t *ds); +uint64_t dsl_get_inconsistent(dsl_dataset_t *ds); +uint64_t dsl_get_available(dsl_dataset_t *ds); +int dsl_get_written(dsl_dataset_t *ds, uint64_t *written); +int dsl_get_prev_snap(dsl_dataset_t *ds, char *snap); +int dsl_get_mountpoint(dsl_dataset_t *ds, const char *dsname, char *value, + char *source); + +void get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv); + +void dsl_dataset_stats(dsl_dataset_t *os, nvlist_t *nv); + +void dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat); +void dsl_dataset_space(dsl_dataset_t *ds, + uint64_t *refdbytesp, uint64_t *availbytesp, + uint64_t *usedobjsp, uint64_t *availobjsp); +uint64_t dsl_dataset_fsid_guid(dsl_dataset_t *ds); +int dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new, + uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); +int dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap, dsl_dataset_t *last, + uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); +boolean_t dsl_dataset_is_dirty(dsl_dataset_t *ds); + +int dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf); + +int dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota, + uint64_t asize, uint64_t inflight, uint64_t *used, + uint64_t *ref_rsrv); +int dsl_dataset_set_refquota(const char *dsname, zprop_source_t source, + uint64_t quota); +int dsl_dataset_set_refreservation(const char *dsname, zprop_source_t source, + uint64_t reservation); + +boolean_t dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier, + uint64_t earlier_txg); +void dsl_dataset_long_hold(dsl_dataset_t *ds, void *tag); +void dsl_dataset_long_rele(dsl_dataset_t *ds, void *tag); +boolean_t dsl_dataset_long_held(dsl_dataset_t *ds); + +int dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone, + dsl_dataset_t *origin_head, boolean_t force, void *owner, dmu_tx_t *tx); +void dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone, + dsl_dataset_t *origin_head, dmu_tx_t *tx); +int dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname, + dmu_tx_t *tx, boolean_t recv, uint64_t cnt, cred_t *cr); +void dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname, + dmu_tx_t *tx); + +void dsl_dataset_remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj, + dmu_tx_t *tx); +void dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds); +int dsl_dataset_get_snapname(dsl_dataset_t *ds); +int dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, + uint64_t *value); +int dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx, + boolean_t adj_cnt); +void dsl_dataset_set_refreservation_sync_impl(dsl_dataset_t *ds, + zprop_source_t source, uint64_t value, dmu_tx_t *tx); +void dsl_dataset_zapify(dsl_dataset_t *ds, dmu_tx_t *tx); +boolean_t dsl_dataset_is_zapified(dsl_dataset_t *ds); +boolean_t dsl_dataset_has_resume_receive_state(dsl_dataset_t *ds); + +int dsl_dataset_rollback_check(void *arg, dmu_tx_t *tx); +void dsl_dataset_rollback_sync(void *arg, dmu_tx_t *tx); +int dsl_dataset_rollback(const char *fsname, const char *tosnap, void *owner, + nvlist_t *result); + +uint64_t dsl_dataset_get_remap_deadlist_object(dsl_dataset_t *ds); +void dsl_dataset_create_remap_deadlist(dsl_dataset_t *ds, dmu_tx_t *tx); +boolean_t dsl_dataset_remap_deadlist_exists(dsl_dataset_t *ds); +void dsl_dataset_destroy_remap_deadlist(dsl_dataset_t *ds, dmu_tx_t *tx); + +void dsl_dataset_deactivate_feature(uint64_t dsobj, + spa_feature_t f, dmu_tx_t *tx); + +#ifdef ZFS_DEBUG +#define dprintf_ds(ds, fmt, ...) 
do { \
+	if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
+	char *__ds_name = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); \
+	dsl_dataset_name(ds, __ds_name); \
+	dprintf("ds=%s " fmt, __ds_name, __VA_ARGS__); \
+	kmem_free(__ds_name, ZFS_MAX_DATASET_NAME_LEN); \
+	} \
+_NOTE(CONSTCOND) } while (0)
+#else
+#define	dprintf_ds(ds, fmt, ...)
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DSL_DATASET_H */
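/*
 * Illustrative sketch (editorial, not part of this commit): the hold/rele
 * interfaces above track each reference with an arbitrary tag pointer, and
 * holds are taken while the pool's config lock is held. The function name
 * below is hypothetical and error handling is minimal.
 */
static int
print_dataset_name(dsl_pool_t *dp, const char *name)
{
	dsl_dataset_t *ds;
	char buf[ZFS_MAX_DATASET_NAME_LEN];
	int error;

	dsl_pool_config_enter(dp, FTAG);
	error = dsl_dataset_hold(dp, name, FTAG, &ds);
	if (error == 0) {
		dsl_dataset_name(ds, buf);	/* buf now holds the full name */
		dsl_dataset_rele(ds, FTAG);	/* tag must match the hold */
	}
	dsl_pool_config_exit(dp, FTAG);
	return (error);
}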
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deadlist.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deadlist.h
new file mode 100644
index 000000000000..08f38233d7ab
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deadlist.h
@@ -0,0 +1,89 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_DSL_DEADLIST_H
+#define	_SYS_DSL_DEADLIST_H
+
+#include <sys/bpobj.h>
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct dmu_buf;
+struct dsl_dataset;
+
+typedef struct dsl_deadlist_phys {
+	uint64_t dl_used;
+	uint64_t dl_comp;
+	uint64_t dl_uncomp;
+	uint64_t dl_pad[37]; /* pad out to 320 bytes for future expansion */
+} dsl_deadlist_phys_t;
+
+typedef struct dsl_deadlist {
+	objset_t *dl_os;
+	uint64_t dl_object;
+	avl_tree_t dl_tree;
+	boolean_t dl_havetree;
+	struct dmu_buf *dl_dbuf;
+	dsl_deadlist_phys_t *dl_phys;
+	kmutex_t dl_lock;
+
+	/* if it's the old on-disk format: */
+	bpobj_t dl_bpobj;
+	boolean_t dl_oldfmt;
+} dsl_deadlist_t;
+
+typedef struct dsl_deadlist_entry {
+	avl_node_t dle_node;
+	uint64_t dle_mintxg;
+	bpobj_t dle_bpobj;
+} dsl_deadlist_entry_t;
+
+void dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object);
+void dsl_deadlist_close(dsl_deadlist_t *dl);
+uint64_t dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx);
+void dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx);
+void dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx);
+void dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx);
+void dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx);
+uint64_t dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg,
+    uint64_t mrs_obj, dmu_tx_t *tx);
+void dsl_deadlist_space(dsl_deadlist_t *dl,
+    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
+void dsl_deadlist_space_range(dsl_deadlist_t *dl,
+    uint64_t mintxg, uint64_t maxtxg,
+    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
+void dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx);
+void dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg,
+    dmu_tx_t *tx);
+boolean_t dsl_deadlist_is_open(dsl_deadlist_t *dl);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DSL_DEADLIST_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deleg.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deleg.h
new file mode 100644
index 000000000000..6fb6a121ade6
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deleg.h
@@ -0,0 +1,81 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_DSL_DELEG_H
+#define	_SYS_DSL_DELEG_H
+
+#include <sys/dmu.h>
+#include <sys/dsl_pool.h>
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define	ZFS_DELEG_PERM_NONE		""
+#define	ZFS_DELEG_PERM_CREATE		"create"
+#define	ZFS_DELEG_PERM_DESTROY		"destroy"
+#define	ZFS_DELEG_PERM_SNAPSHOT	"snapshot"
+#define	ZFS_DELEG_PERM_ROLLBACK	"rollback"
+#define	ZFS_DELEG_PERM_CLONE		"clone"
+#define	ZFS_DELEG_PERM_PROMOTE		"promote"
+#define	ZFS_DELEG_PERM_RENAME		"rename"
+#define	ZFS_DELEG_PERM_MOUNT		"mount"
+#define	ZFS_DELEG_PERM_SHARE		"share"
+#define	ZFS_DELEG_PERM_SEND		"send"
+#define	ZFS_DELEG_PERM_RECEIVE		"receive"
+#define	ZFS_DELEG_PERM_ALLOW		"allow"
+#define	ZFS_DELEG_PERM_USERPROP	"userprop"
+#define	ZFS_DELEG_PERM_VSCAN		"vscan"
+#define	ZFS_DELEG_PERM_USERQUOTA	"userquota"
+#define	ZFS_DELEG_PERM_GROUPQUOTA	"groupquota"
+#define	ZFS_DELEG_PERM_USERUSED	"userused"
+#define	ZFS_DELEG_PERM_GROUPUSED	"groupused"
+#define	ZFS_DELEG_PERM_HOLD		"hold"
+#define	ZFS_DELEG_PERM_RELEASE		"release"
+#define	ZFS_DELEG_PERM_DIFF		"diff"
+#define	ZFS_DELEG_PERM_BOOKMARK	"bookmark"
+#define	ZFS_DELEG_PERM_REMAP		"remap"
+
+/*
+ * Note: the names of properties that are marked delegatable are also
+ * valid delegated permissions.
+ */
+
+int dsl_deleg_get(const char *ddname, nvlist_t **nvp);
+int dsl_deleg_set(const char *ddname, nvlist_t *nvp, boolean_t unset);
+int dsl_deleg_access(const char *ddname, const char *perm, cred_t *cr);
+int dsl_deleg_access_impl(struct dsl_dataset *ds, const char *perm, cred_t *cr);
+void dsl_deleg_set_create_perms(dsl_dir_t *dd, dmu_tx_t *tx, cred_t *cr);
+int dsl_deleg_can_allow(char *ddname, nvlist_t *nvp, cred_t *cr);
+int dsl_deleg_can_unallow(char *ddname, nvlist_t *nvp, cred_t *cr);
+int dsl_deleg_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx);
+boolean_t dsl_delegation_on(objset_t *os);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DSL_DELEG_H */
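/*
 * Illustrative sketch (editorial, not part of this commit): asking whether
 * a credential holds the delegated "snapshot" permission on a dataset.
 * dsl_deleg_access() returns 0 when access is allowed; the wrapper name
 * below is hypothetical.
 */
static int
can_snapshot(const char *dsname, cred_t *cr)
{
	return (dsl_deleg_access(dsname, ZFS_DELEG_PERM_SNAPSHOT, cr));
}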
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_destroy.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_destroy.h
new file mode 100644
index 000000000000..ae3ca0cfbd5e
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_destroy.h
@@ -0,0 +1,68 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ */
+
+#ifndef _SYS_DSL_DESTROY_H
+#define	_SYS_DSL_DESTROY_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct nvlist;
+struct dsl_dataset;
+struct dmu_tx;
+
+int dsl_destroy_snapshots_nvl(struct nvlist *, boolean_t,
+    struct nvlist *);
+int dsl_destroy_snapshot(const char *, boolean_t);
+int dsl_destroy_head(const char *);
+int dsl_destroy_head_check_impl(struct dsl_dataset *, int);
+void dsl_destroy_head_sync_impl(struct dsl_dataset *, struct dmu_tx *);
+int dsl_destroy_inconsistent(const char *, void *);
+int dsl_destroy_snapshot_check_impl(struct dsl_dataset *, boolean_t);
+void dsl_destroy_snapshot_sync_impl(struct dsl_dataset *,
+    boolean_t, struct dmu_tx *);
+
+typedef struct dsl_destroy_snapshot_arg {
+	const char *ddsa_name;
+	boolean_t ddsa_defer;
+} dsl_destroy_snapshot_arg_t;
+
+int dsl_destroy_snapshot_check(void *, dmu_tx_t *);
+void dsl_destroy_snapshot_sync(void *, dmu_tx_t *);
+
+typedef struct dsl_destroy_head_arg {
+	const char *ddha_name;
+} dsl_destroy_head_arg_t;
+
+int dsl_destroy_head_check(void *, dmu_tx_t *);
+void dsl_destroy_head_sync(void *, dmu_tx_t *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DSL_DESTROY_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h
new file mode 100644
index 000000000000..21d953cb6013
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h
@@ -0,0 +1,209 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + */ + +#ifndef _SYS_DSL_DIR_H +#define _SYS_DSL_DIR_H + +#include <sys/dmu.h> +#include <sys/dsl_pool.h> +#include <sys/dsl_synctask.h> +#include <sys/refcount.h> +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct dsl_dataset; + +/* + * DD_FIELD_* are strings that are used in the "extensified" dsl_dir zap object. + * They should be of the format <reverse-dns>:<field>. + */ + +#define DD_FIELD_FILESYSTEM_COUNT "com.joyent:filesystem_count" +#define DD_FIELD_SNAPSHOT_COUNT "com.joyent:snapshot_count" +#define DD_FIELD_LAST_REMAP_TXG "com.delphix:last_remap_txg" + +typedef enum dd_used { + DD_USED_HEAD, + DD_USED_SNAP, + DD_USED_CHILD, + DD_USED_CHILD_RSRV, + DD_USED_REFRSRV, + DD_USED_NUM +} dd_used_t; + +#define DD_FLAG_USED_BREAKDOWN (1<<0) + +typedef struct dsl_dir_phys { + uint64_t dd_creation_time; /* not actually used */ + uint64_t dd_head_dataset_obj; + uint64_t dd_parent_obj; + uint64_t dd_origin_obj; + uint64_t dd_child_dir_zapobj; + /* + * how much space our children are accounting for; for leaf + * datasets, == physical space used by fs + snaps + */ + uint64_t dd_used_bytes; + uint64_t dd_compressed_bytes; + uint64_t dd_uncompressed_bytes; + /* Administrative quota setting */ + uint64_t dd_quota; + /* Administrative reservation setting */ + uint64_t dd_reserved; + uint64_t dd_props_zapobj; + uint64_t dd_deleg_zapobj; /* dataset delegation permissions */ + uint64_t dd_flags; + uint64_t dd_used_breakdown[DD_USED_NUM]; + uint64_t dd_clones; /* dsl_dir objects */ + uint64_t dd_pad[13]; /* pad out to 256 bytes for good measure */ +} dsl_dir_phys_t; + +struct dsl_dir { + dmu_buf_user_t dd_dbu; + + /* These are immutable; no lock needed: */ + uint64_t dd_object; + dsl_pool_t *dd_pool; + + /* Stable until user eviction; no lock needed: */ + dmu_buf_t *dd_dbuf; + + /* protected by lock on pool's dp_dirty_dirs list */ + txg_node_t dd_dirty_link; + + /* protected by dp_config_rwlock */ + dsl_dir_t *dd_parent; + + /* Protected by dd_lock */ + kmutex_t dd_lock; + list_t dd_props; /* list of dsl_prop_record_t's */ + timestruc_t dd_snap_cmtime; /* last time snapshot namespace changed */ + uint64_t dd_origin_txg; + + /* gross estimate of space used by in-flight tx's */ + uint64_t dd_tempreserved[TXG_SIZE]; + /* amount of space we expect to write; == amount of dirty data */ + int64_t dd_space_towrite[TXG_SIZE]; + + /* protected by dd_lock; keep at end of struct for better locality */ + char dd_myname[ZFS_MAX_DATASET_NAME_LEN]; +}; + +inline dsl_dir_phys_t * +dsl_dir_phys(dsl_dir_t *dd) +{ + return (dd->dd_dbuf->db_data); +} + +void dsl_dir_rele(dsl_dir_t *dd, void *tag); +void dsl_dir_async_rele(dsl_dir_t *dd, void *tag); +int dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag, + dsl_dir_t **, const char **tail); +int dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, + const char *tail, void *tag, dsl_dir_t **); +void dsl_dir_name(dsl_dir_t *dd, char *buf); +int dsl_dir_namelen(dsl_dir_t *dd); +uint64_t dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, + const char *name, dmu_tx_t *tx); + +uint64_t dsl_dir_get_used(dsl_dir_t *dd); +uint64_t dsl_dir_get_compressed(dsl_dir_t *dd); +uint64_t dsl_dir_get_quota(dsl_dir_t *dd); +uint64_t dsl_dir_get_reservation(dsl_dir_t *dd); +uint64_t dsl_dir_get_compressratio(dsl_dir_t *dd); +uint64_t dsl_dir_get_logicalused(dsl_dir_t 
*dd); +uint64_t dsl_dir_get_usedsnap(dsl_dir_t *dd); +uint64_t dsl_dir_get_usedds(dsl_dir_t *dd); +uint64_t dsl_dir_get_usedrefreserv(dsl_dir_t *dd); +uint64_t dsl_dir_get_usedchild(dsl_dir_t *dd); +void dsl_dir_get_origin(dsl_dir_t *dd, char *buf); +int dsl_dir_get_filesystem_count(dsl_dir_t *dd, uint64_t *count); +int dsl_dir_get_snapshot_count(dsl_dir_t *dd, uint64_t *count); +int dsl_dir_get_remaptxg(dsl_dir_t *dd, uint64_t *count); + +void dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv); +uint64_t dsl_dir_space_available(dsl_dir_t *dd, + dsl_dir_t *ancestor, int64_t delta, int ondiskonly); +void dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx); +void dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx); +int dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t mem, + uint64_t asize, boolean_t netfree, void **tr_cookiep, dmu_tx_t *tx); +void dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx); +void dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx); +void dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type, + int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx); +void dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta, + dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx); +int dsl_dir_set_quota(const char *ddname, zprop_source_t source, + uint64_t quota); +int dsl_dir_set_reservation(const char *ddname, zprop_source_t source, + uint64_t reservation); +int dsl_dir_activate_fs_ss_limit(const char *); +int dsl_fs_ss_limit_check(dsl_dir_t *, uint64_t, zfs_prop_t, dsl_dir_t *, + cred_t *); +void dsl_fs_ss_count_adjust(dsl_dir_t *, int64_t, const char *, dmu_tx_t *); +int dsl_dir_update_last_remap_txg(dsl_dir_t *, uint64_t); +int dsl_dir_rename(const char *oldname, const char *newname); +int dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, + uint64_t fs_cnt, uint64_t ss_cnt, uint64_t space, cred_t *); +boolean_t dsl_dir_is_clone(dsl_dir_t *dd); +void dsl_dir_new_refreservation(dsl_dir_t *dd, struct dsl_dataset *ds, + uint64_t reservation, cred_t *cr, dmu_tx_t *tx); +void dsl_dir_snap_cmtime_update(dsl_dir_t *dd); +timestruc_t dsl_dir_snap_cmtime(dsl_dir_t *dd); +void dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value, + dmu_tx_t *tx); +void dsl_dir_zapify(dsl_dir_t *dd, dmu_tx_t *tx); +boolean_t dsl_dir_is_zapified(dsl_dir_t *dd); + +/* internal reserved dir name */ +#define MOS_DIR_NAME "$MOS" +#define ORIGIN_DIR_NAME "$ORIGIN" +#define FREE_DIR_NAME "$FREE" +#define LEAK_DIR_NAME "$LEAK" + +#ifdef ZFS_DEBUG +#define dprintf_dd(dd, fmt, ...) do { \ + if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ + char *__ds_name = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); \ + dsl_dir_name(dd, __ds_name); \ + dprintf("dd=%s " fmt, __ds_name, __VA_ARGS__); \ + kmem_free(__ds_name, ZFS_MAX_DATASET_NAME_LEN); \ + } \ +_NOTE(CONSTCOND) } while (0) +#else +#define dprintf_dd(dd, fmt, ...) +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DSL_DIR_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h new file mode 100644 index 000000000000..40bec9ab6ad8 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h @@ -0,0 +1,190 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013, 2017 by Delphix. All rights reserved. + * Copyright 2016 Nexenta Systems, Inc. All rights reserved. + */ + +#ifndef _SYS_DSL_POOL_H +#define _SYS_DSL_POOL_H + +#include <sys/spa.h> +#include <sys/txg.h> +#include <sys/txg_impl.h> +#include <sys/zfs_context.h> +#include <sys/zio.h> +#include <sys/dnode.h> +#include <sys/ddt.h> +#include <sys/arc.h> +#include <sys/bpobj.h> +#include <sys/bptree.h> +#include <sys/rrwlock.h> +#include <sys/dsl_synctask.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct objset; +struct dsl_dir; +struct dsl_dataset; +struct dsl_pool; +struct dmu_tx; +struct dsl_scan; + +extern uint64_t zfs_dirty_data_max; +extern uint64_t zfs_dirty_data_max_max; +extern uint64_t zfs_dirty_data_sync; +extern int zfs_dirty_data_max_percent; +extern int zfs_delay_min_dirty_percent; +extern uint64_t zfs_delay_scale; + +/* These macros are for indexing into the zfs_all_blkstats_t. */ +#define DMU_OT_DEFERRED DMU_OT_NONE +#define DMU_OT_OTHER DMU_OT_NUMTYPES /* place holder for DMU_OT() types */ +#define DMU_OT_TOTAL (DMU_OT_NUMTYPES + 1) + +typedef struct zfs_blkstat { + uint64_t zb_count; + uint64_t zb_asize; + uint64_t zb_lsize; + uint64_t zb_psize; + uint64_t zb_gangs; + uint64_t zb_ditto_2_of_2_samevdev; + uint64_t zb_ditto_2_of_3_samevdev; + uint64_t zb_ditto_3_of_3_samevdev; +} zfs_blkstat_t; + +typedef struct zfs_all_blkstats { + zfs_blkstat_t zab_type[DN_MAX_LEVELS + 1][DMU_OT_TOTAL + 1]; + kmutex_t zab_lock; +} zfs_all_blkstats_t; + + +typedef struct dsl_pool { + /* Immutable */ + spa_t *dp_spa; + struct objset *dp_meta_objset; + struct dsl_dir *dp_root_dir; + struct dsl_dir *dp_mos_dir; + struct dsl_dir *dp_free_dir; + struct dsl_dir *dp_leak_dir; + struct dsl_dataset *dp_origin_snap; + uint64_t dp_root_dir_obj; + struct taskq *dp_vnrele_taskq; + + /* No lock needed - sync context only */ + blkptr_t dp_meta_rootbp; + uint64_t dp_tmp_userrefs_obj; + bpobj_t dp_free_bpobj; + uint64_t dp_bptree_obj; + uint64_t dp_empty_bpobj; + bpobj_t dp_obsolete_bpobj; + + struct dsl_scan *dp_scan; + + /* Uses dp_lock */ + kmutex_t dp_lock; + kcondvar_t dp_spaceavail_cv; + uint64_t dp_dirty_pertxg[TXG_SIZE]; + uint64_t dp_dirty_total; + uint64_t dp_long_free_dirty_pertxg[TXG_SIZE]; + uint64_t dp_mos_used_delta; + uint64_t dp_mos_compressed_delta; + uint64_t dp_mos_uncompressed_delta; + + /* + * Time of most recently scheduled (furthest in the future) + * wakeup for delayed transactions. 
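+	 *
+	 * (Illustrative note, not from the original file: the wakeup time
+	 * is driven by the write-throttle delay curve. Roughly, once the
+	 * amount of dirty data exceeds zfs_delay_min_dirty_percent of
+	 * zfs_dirty_data_max, each transaction is delayed by about
+	 *
+	 *	zfs_delay_scale * (dirty - min_dirty) / (max_dirty - dirty)
+	 *
+	 * so the delay rises smoothly toward infinity as dirty data
+	 * approaches zfs_dirty_data_max.)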
+ */ + hrtime_t dp_last_wakeup; + + /* Has its own locking */ + tx_state_t dp_tx; + txg_list_t dp_dirty_datasets; + txg_list_t dp_dirty_zilogs; + txg_list_t dp_dirty_dirs; + txg_list_t dp_sync_tasks; + txg_list_t dp_early_sync_tasks; + taskq_t *dp_sync_taskq; + taskq_t *dp_zil_clean_taskq; + + /* + * Protects administrative changes (properties, namespace) + * + * It is only held for write in syncing context. Therefore + * syncing context does not need to ever have it for read, since + * nobody else could possibly have it for write. + */ + rrwlock_t dp_config_rwlock; + + zfs_all_blkstats_t *dp_blkstats; +} dsl_pool_t; + +int dsl_pool_init(spa_t *spa, uint64_t txg, dsl_pool_t **dpp); +int dsl_pool_open(dsl_pool_t *dp); +void dsl_pool_close(dsl_pool_t *dp); +dsl_pool_t *dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg); +void dsl_pool_sync(dsl_pool_t *dp, uint64_t txg); +void dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg); +int dsl_pool_sync_context(dsl_pool_t *dp); +uint64_t dsl_pool_adjustedsize(dsl_pool_t *dp, zfs_space_check_t slop_policy); +uint64_t dsl_pool_unreserved_space(dsl_pool_t *dp, + zfs_space_check_t slop_policy); +void dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx); +void dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg); +void dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp); +void dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, + const blkptr_t *bpp); +void dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx); +void dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx); +void dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx); +void dsl_pool_mos_diduse_space(dsl_pool_t *dp, + int64_t used, int64_t comp, int64_t uncomp); +void dsl_pool_ckpoint_diduse_space(dsl_pool_t *dp, + int64_t used, int64_t comp, int64_t uncomp); +void dsl_pool_config_enter(dsl_pool_t *dp, void *tag); +void dsl_pool_config_enter_prio(dsl_pool_t *dp, void *tag); +void dsl_pool_config_exit(dsl_pool_t *dp, void *tag); +boolean_t dsl_pool_config_held(dsl_pool_t *dp); +boolean_t dsl_pool_config_held_writer(dsl_pool_t *dp); +boolean_t dsl_pool_need_dirty_delay(dsl_pool_t *dp); + +taskq_t *dsl_pool_vnrele_taskq(dsl_pool_t *dp); + +int dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, + const char *tag, uint64_t now, dmu_tx_t *tx); +int dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, + const char *tag, dmu_tx_t *tx); +void dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp); +int dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **); +int dsl_pool_hold(const char *name, void *tag, dsl_pool_t **dp); +void dsl_pool_rele(dsl_pool_t *dp, void *tag); + +void dsl_pool_create_obsolete_bpobj(dsl_pool_t *dp, dmu_tx_t *tx); +void dsl_pool_destroy_obsolete_bpobj(dsl_pool_t *dp, dmu_tx_t *tx); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DSL_POOL_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h new file mode 100644 index 000000000000..21e6f4674be9 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h @@ -0,0 +1,115 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + +#ifndef _SYS_DSL_PROP_H +#define _SYS_DSL_PROP_H + +#include <sys/dmu.h> +#include <sys/dsl_pool.h> +#include <sys/zfs_context.h> +#include <sys/dsl_synctask.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct dsl_dataset; +struct dsl_dir; + +/* The callback func may not call into the DMU or DSL! */ +typedef void (dsl_prop_changed_cb_t)(void *arg, uint64_t newval); + +typedef struct dsl_prop_record { + list_node_t pr_node; /* link on dd_props */ + const char *pr_propname; + list_t pr_cbs; +} dsl_prop_record_t; + +typedef struct dsl_prop_cb_record { + list_node_t cbr_pr_node; /* link on pr_cbs */ + list_node_t cbr_ds_node; /* link on ds_prop_cbs */ + dsl_prop_record_t *cbr_pr; + struct dsl_dataset *cbr_ds; + dsl_prop_changed_cb_t *cbr_func; + void *cbr_arg; +} dsl_prop_cb_record_t; + +typedef struct dsl_props_arg { + nvlist_t *pa_props; + zprop_source_t pa_source; +} dsl_props_arg_t; + +void dsl_prop_init(dsl_dir_t *dd); +void dsl_prop_fini(dsl_dir_t *dd); +int dsl_prop_register(struct dsl_dataset *ds, const char *propname, + dsl_prop_changed_cb_t *callback, void *cbarg); +void dsl_prop_unregister_all(struct dsl_dataset *ds, void *cbarg); +void dsl_prop_notify_all(struct dsl_dir *dd); +boolean_t dsl_prop_hascb(struct dsl_dataset *ds); + +int dsl_prop_get(const char *ddname, const char *propname, + int intsz, int numints, void *buf, char *setpoint); +int dsl_prop_get_integer(const char *ddname, const char *propname, + uint64_t *valuep, char *setpoint); +int dsl_prop_get_all(objset_t *os, nvlist_t **nvp); +int dsl_prop_get_received(const char *dsname, nvlist_t **nvp); +int dsl_prop_get_ds(struct dsl_dataset *ds, const char *propname, + int intsz, int numints, void *buf, char *setpoint); +int dsl_prop_get_int_ds(struct dsl_dataset *ds, const char *propname, + uint64_t *valuep); +int dsl_prop_get_dd(struct dsl_dir *dd, const char *propname, + int intsz, int numints, void *buf, char *setpoint, + boolean_t snapshot); + +void dsl_props_set_sync_impl(struct dsl_dataset *ds, zprop_source_t source, + nvlist_t *props, dmu_tx_t *tx); +void dsl_prop_set_sync_impl(struct dsl_dataset *ds, const char *propname, + zprop_source_t source, int intsz, int numints, const void *value, + dmu_tx_t *tx); +int dsl_props_set(const char *dsname, zprop_source_t source, nvlist_t *nvl); +int dsl_prop_set_int(const char *dsname, const char *propname, + zprop_source_t source, uint64_t value); +int dsl_prop_set_string(const char *dsname, const char *propname, + zprop_source_t source, const char *value); +int dsl_prop_inherit(const char *dsname, const char *propname, + zprop_source_t source); + +int dsl_prop_predict(dsl_dir_t *dd, const char *propname, + zprop_source_t source, uint64_t value, uint64_t *newvalp); + +/* flag first receive on or after SPA_VERSION_RECVD_PROPS */ +boolean_t dsl_prop_get_hasrecvd(const char *dsname); +int dsl_prop_set_hasrecvd(const char *dsname); +void 
dsl_prop_unset_hasrecvd(const char *dsname); + +void dsl_prop_nvlist_add_uint64(nvlist_t *nv, zfs_prop_t prop, uint64_t value); +void dsl_prop_nvlist_add_string(nvlist_t *nv, + zfs_prop_t prop, const char *value); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DSL_PROP_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_scan.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_scan.h new file mode 100644 index 000000000000..5ddffe57bf97 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_scan.h @@ -0,0 +1,188 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2017 Datto Inc. + */ + +#ifndef _SYS_DSL_SCAN_H +#define _SYS_DSL_SCAN_H + +#include <sys/zfs_context.h> +#include <sys/zio.h> +#include <sys/ddt.h> +#include <sys/bplist.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct objset; +struct dsl_dir; +struct dsl_dataset; +struct dsl_pool; +struct dmu_tx; + +/* + * All members of this structure must be uint64_t, for byteswap + * purposes. + */ +typedef struct dsl_scan_phys { + uint64_t scn_func; /* pool_scan_func_t */ + uint64_t scn_state; /* dsl_scan_state_t */ + uint64_t scn_queue_obj; + uint64_t scn_min_txg; + uint64_t scn_max_txg; + uint64_t scn_cur_min_txg; + uint64_t scn_cur_max_txg; + uint64_t scn_start_time; + uint64_t scn_end_time; + uint64_t scn_to_examine; /* total bytes to be scanned */ + uint64_t scn_examined; /* bytes scanned so far */ + uint64_t scn_to_process; + uint64_t scn_processed; + uint64_t scn_errors; /* scan I/O error count */ + uint64_t scn_ddt_class_max; + ddt_bookmark_t scn_ddt_bookmark; + zbookmark_phys_t scn_bookmark; + uint64_t scn_flags; /* dsl_scan_flags_t */ +} dsl_scan_phys_t; + +#define SCAN_PHYS_NUMINTS (sizeof (dsl_scan_phys_t) / sizeof (uint64_t)) + +typedef enum dsl_scan_flags { + DSF_VISIT_DS_AGAIN = 1<<0, + DSF_SCRUB_PAUSED = 1<<1, +} dsl_scan_flags_t; + +/* + * Every pool will have one dsl_scan_t and this structure will contain + * in-memory information about the scan and a pointer to the on-disk + * representation (i.e. dsl_scan_phys_t). Most of the state of the scan + * is contained on-disk to allow the scan to resume in the event of a reboot + * or panic. This structure maintains information about the behavior of a + * running scan, some caching information, and how it should traverse the pool. + * + * The following members of this structure direct the behavior of the scan: + * + * scn_suspending - a scan that cannot be completed in a single txg or + * has exceeded its allotted time will need to suspend. 
+ *	When this flag is set the scanner will stop traversing
+ *	the pool and write out the current state to disk.
+ *
+ * scn_restart_txg - directs the scanner to either restart or start
+ *	a scan at the specified txg value.
+ *
+ * scn_done_txg - when a scan completes its traversal it will set
+ *	the completion txg to the next txg. This is necessary
+ *	to ensure that any blocks that were freed during
+ *	the scan but have not yet been processed (i.e. deferred
+ *	frees) are accounted for.
+ *
+ * This structure also maintains information about deferred frees which are
+ * a special kind of traversal. Deferred free can exist in either a bptree or
+ * a bpobj structure. The scn_is_bptree flag will indicate the type of
+ * deferred free that is in progress. If the deferred free is part of an
+ * asynchronous destroy then the scn_async_destroying flag will be set.
+ */
+typedef struct dsl_scan {
+	struct dsl_pool *scn_dp;
+
+	uint64_t scn_restart_txg;
+	uint64_t scn_done_txg;
+	uint64_t scn_sync_start_time;
+	uint64_t scn_issued_before_pass;
+
+	/* for freeing blocks */
+	boolean_t scn_is_bptree;
+	boolean_t scn_async_destroying;
+	boolean_t scn_async_stalled;
+	uint64_t scn_async_block_min_time_ms;
+	/* flags and stats for controlling scan state */
+	boolean_t scn_is_sorted;	/* doing sequential scan */
+	boolean_t scn_clearing;	/* scan is issuing sequential extents */
+	boolean_t scn_checkpointing;	/* scan is issuing all queued extents */
+	boolean_t scn_suspending;	/* scan is suspending until next txg */
+	uint64_t scn_last_checkpoint;	/* time of last checkpoint */
+
+	/* members for thread synchronization */
+	zio_t *scn_zio_root;	/* root zio for waiting on IO */
+	taskq_t *scn_taskq;	/* task queue for issuing extents */
+
+	/* for controlling scan prefetch, protected by spa_scrub_lock */
+	boolean_t scn_prefetch_stop;	/* prefetch should stop */
+	zbookmark_phys_t scn_prefetch_bookmark;	/* prefetch start bookmark */
+	avl_tree_t scn_prefetch_queue;	/* priority queue of prefetch IOs */
+	uint64_t scn_maxinflight_bytes;	/* max bytes in flight for pool */
+
+	/* per txg statistics */
+	uint64_t scn_visited_this_txg;	/* total bps visited this txg */
+	uint64_t scn_holes_this_txg;
+	uint64_t scn_lt_min_this_txg;
+	uint64_t scn_gt_max_this_txg;
+	uint64_t scn_ddt_contained_this_txg;
+	uint64_t scn_objsets_visited_this_txg;
+	uint64_t scn_avg_seg_size_this_txg;
+	uint64_t scn_segs_this_txg;
+	uint64_t scn_avg_zio_size_this_txg;
+	uint64_t scn_zios_this_txg;
+
+	/* members needed for syncing scan status to disk */
+	dsl_scan_phys_t scn_phys;	/* on disk representation of scan */
+	dsl_scan_phys_t scn_phys_cached;
+	avl_tree_t scn_queue;	/* queue of datasets to scan */
+	uint64_t scn_bytes_pending;	/* outstanding data to issue */
+} dsl_scan_t;
+
+typedef struct dsl_scan_io_queue dsl_scan_io_queue_t;
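/*
 * Illustrative sketch (editorial, not part of this commit): because every
 * field of dsl_scan_phys_t is a uint64_t, the on-disk scan state can be
 * treated as a flat array of SCAN_PHYS_NUMINTS words, e.g. when byteswapping
 * state read from the MOS. The helper below is hypothetical and elides how
 * the state is actually fetched.
 */
static void
scan_phys_byteswap(dsl_scan_phys_t *scn_phys)
{
	uint64_t *words = (uint64_t *)scn_phys;
	int i;

	for (i = 0; i < SCAN_PHYS_NUMINTS; i++)
		words[i] = BSWAP_64(words[i]);
}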
+
+void dsl_scan_global_init(void);
+
+void scan_init(void);
+void scan_fini(void);
+int dsl_scan_init(struct dsl_pool *dp, uint64_t txg);
+void dsl_scan_fini(struct dsl_pool *dp);
+void dsl_scan_sync(struct dsl_pool *, dmu_tx_t *);
+int dsl_scan_cancel(struct dsl_pool *);
+int dsl_scan(struct dsl_pool *, pool_scan_func_t);
+boolean_t dsl_scan_scrubbing(const struct dsl_pool *dp);
+int dsl_scrub_set_pause_resume(const struct dsl_pool *dp, pool_scrub_cmd_t cmd);
+void dsl_resilver_restart(struct dsl_pool *, uint64_t txg);
+boolean_t dsl_scan_resilvering(struct dsl_pool *dp);
+boolean_t dsl_dataset_unstable(struct dsl_dataset *ds);
+void dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
+    ddt_entry_t *dde, dmu_tx_t *tx);
+void dsl_scan_ds_destroyed(struct dsl_dataset *ds, struct dmu_tx *tx);
+void dsl_scan_ds_snapshotted(struct dsl_dataset *ds, struct dmu_tx *tx);
+void dsl_scan_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2,
+    struct dmu_tx *tx);
+boolean_t dsl_scan_active(dsl_scan_t *scn);
+boolean_t dsl_scan_is_paused_scrub(const dsl_scan_t *scn);
+void dsl_scan_freed(spa_t *spa, const blkptr_t *bp);
+void dsl_scan_io_queue_destroy(dsl_scan_io_queue_t *queue);
+void dsl_scan_io_queue_vdev_xfer(vdev_t *svd, vdev_t *tvd);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DSL_SCAN_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_synctask.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_synctask.h
new file mode 100644
index 000000000000..da6c7a40daca
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_synctask.h
@@ -0,0 +1,124 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_DSL_SYNCTASK_H
+#define	_SYS_DSL_SYNCTASK_H
+
+#include <sys/txg.h>
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct dsl_pool;
+
+typedef int (dsl_checkfunc_t)(void *, dmu_tx_t *);
+typedef void (dsl_syncfunc_t)(void *, dmu_tx_t *);
+
+typedef enum zfs_space_check {
+	/*
+	 * Normal space check: if there is less than 3.2% free space,
+	 * the operation will fail. Operations which are logically
+	 * creating things should use this (e.g. "zfs create", "zfs snapshot").
+	 * User writes (via the ZPL / ZVOL) also fail at this point.
+	 */
+	ZFS_SPACE_CHECK_NORMAL,
+
+	/*
+	 * Space check allows use of half the slop space. If there
+	 * is less than 1.6% free space, the operation will fail. Most
+	 * operations should use this (e.g. "zfs set", "zfs rename"),
+	 * because we want them to succeed even after user writes are failing,
+	 * so that they can be used as part of the space recovery process.
+	 */
+	ZFS_SPACE_CHECK_RESERVED,
+
+	/*
+	 * Space check allows use of three quarters of the slop space.
+	 * If there is less than 0.8% free space, the operation will
+	 * fail.
+	 */
+	ZFS_SPACE_CHECK_EXTRA_RESERVED,
+
+	/*
+	 * In all cases "zfs destroy" is expected to result in a net
+	 * reduction of space, except one. When the pool has a
+	 * checkpoint, space freed by "zfs destroy" will not actually
+	 * free anything internally. Thus, it starts failing after
+	 * three quarters of the slop space is exceeded.
+	 */
+	ZFS_SPACE_CHECK_DESTROY = ZFS_SPACE_CHECK_EXTRA_RESERVED,
+
+	/*
+	 * A channel program can run a "zfs destroy" as part of its
+	 * script and therefore has the same space_check policy when
+	 * being evaluated.
+	 */
+	ZFS_SPACE_CHECK_ZCP_EVAL = ZFS_SPACE_CHECK_DESTROY,
+
+	/*
+	 * No space check is performed. This level of space check should
+	 * be used cautiously as operations that use it can even run when
+	 * 0.8% capacity is left for use. In this scenario, if there is a
+	 * checkpoint, async destroys are suspended and any kind of freeing
+	 * can potentially add space instead of freeing it.
+	 *
+	 * See also the comments above spa_slop_shift.
+	 */
+	ZFS_SPACE_CHECK_NONE,
+
+	ZFS_SPACE_CHECK_DISCARD_CHECKPOINT = ZFS_SPACE_CHECK_NONE,
+
+} zfs_space_check_t;
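/*
 * Illustrative sketch (editorial, not part of this commit): how the three
 * thresholds above relate to the pool's slop space, assuming the default
 * spa_slop_shift of 5 (slop = 1/32 of capacity, ~3.2%). The helper name is
 * hypothetical; the real logic lives in the SPA code.
 */
static uint64_t
space_check_min_free(uint64_t poolsize, zfs_space_check_t check)
{
	uint64_t slop = poolsize >> 5;		/* ~3.2% of capacity */

	switch (check) {
	case ZFS_SPACE_CHECK_NORMAL:
		return (slop);			/* fail below ~3.2% free */
	case ZFS_SPACE_CHECK_RESERVED:
		return (slop / 2);		/* fail below ~1.6% free */
	case ZFS_SPACE_CHECK_EXTRA_RESERVED:
		return (slop / 4);		/* fail below ~0.8% free */
	default:
		return (0);			/* ZFS_SPACE_CHECK_NONE */
	}
}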
+
+typedef struct dsl_sync_task {
+	txg_node_t dst_node;
+	struct dsl_pool *dst_pool;
+	uint64_t dst_txg;
+	int dst_space;
+	zfs_space_check_t dst_space_check;
+	dsl_checkfunc_t *dst_checkfunc;
+	dsl_syncfunc_t *dst_syncfunc;
+	void *dst_arg;
+	int dst_error;
+	boolean_t dst_nowaiter;
+} dsl_sync_task_t;
+
+void dsl_sync_task_sync(dsl_sync_task_t *, dmu_tx_t *);
+int dsl_sync_task(const char *, dsl_checkfunc_t *,
+    dsl_syncfunc_t *, void *, int, zfs_space_check_t);
+void dsl_sync_task_nowait(struct dsl_pool *, dsl_syncfunc_t *,
+    void *, int, zfs_space_check_t, dmu_tx_t *);
+int dsl_early_sync_task(const char *, dsl_checkfunc_t *,
+    dsl_syncfunc_t *, void *, int, zfs_space_check_t);
+void dsl_early_sync_task_nowait(struct dsl_pool *, dsl_syncfunc_t *,
+    void *, int, zfs_space_check_t, dmu_tx_t *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DSL_SYNCTASK_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_userhold.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_userhold.h
new file mode 100644
index 000000000000..071aeb86d1f1
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_userhold.h
@@ -0,0 +1,57 @@
+
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2013 Steven Hartland. All rights reserved.
+ */ + +#ifndef _SYS_DSL_USERHOLD_H +#define _SYS_DSL_USERHOLD_H + +#include <sys/nvpair.h> +#include <sys/types.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct dsl_pool; +struct dsl_dataset; +struct dmu_tx; + +int dsl_dataset_user_hold(nvlist_t *holds, minor_t cleanup_minor, + nvlist_t *errlist); +int dsl_dataset_user_release(nvlist_t *holds, nvlist_t *errlist); +int dsl_dataset_get_holds(const char *dsname, nvlist_t *nvl); +void dsl_dataset_user_release_tmp(struct dsl_pool *dp, nvlist_t *holds); +int dsl_dataset_user_hold_check_one(struct dsl_dataset *ds, const char *htag, + boolean_t temphold, struct dmu_tx *tx); +void dsl_dataset_user_hold_sync_one(struct dsl_dataset *ds, const char *htag, + minor_t minor, uint64_t now, struct dmu_tx *tx); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DSL_USERHOLD_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h new file mode 100644 index 000000000000..0509e95b1587 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h @@ -0,0 +1,121 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. 
+ */ + +#ifndef _SYS_METASLAB_H +#define _SYS_METASLAB_H + +#include <sys/spa.h> +#include <sys/space_map.h> +#include <sys/txg.h> +#include <sys/zio.h> +#include <sys/avl.h> + +#ifdef __cplusplus +extern "C" { +#endif + + +typedef struct metaslab_ops { + uint64_t (*msop_alloc)(metaslab_t *, uint64_t); +} metaslab_ops_t; + + +extern metaslab_ops_t *zfs_metaslab_ops; + +int metaslab_init(metaslab_group_t *, uint64_t, uint64_t, uint64_t, + metaslab_t **); +void metaslab_fini(metaslab_t *); + +void metaslab_load_wait(metaslab_t *); +int metaslab_load(metaslab_t *); +void metaslab_unload(metaslab_t *); + +void metaslab_sync(metaslab_t *, uint64_t); +void metaslab_sync_done(metaslab_t *, uint64_t); +void metaslab_sync_reassess(metaslab_group_t *); +uint64_t metaslab_block_maxsize(metaslab_t *); + +#define METASLAB_HINTBP_FAVOR 0x0 +#define METASLAB_HINTBP_AVOID 0x1 +#define METASLAB_GANG_HEADER 0x2 +#define METASLAB_GANG_CHILD 0x4 +#define METASLAB_ASYNC_ALLOC 0x8 +#define METASLAB_DONT_THROTTLE 0x10 + +int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t, + blkptr_t *, int, uint64_t, blkptr_t *, int, zio_alloc_list_t *, zio_t *, + int); +int metaslab_alloc_dva(spa_t *, metaslab_class_t *, uint64_t, + dva_t *, int, dva_t *, uint64_t, int, zio_alloc_list_t *, int); +void metaslab_free(spa_t *, const blkptr_t *, uint64_t, boolean_t); +void metaslab_free_concrete(vdev_t *, uint64_t, uint64_t, boolean_t); +void metaslab_free_dva(spa_t *, const dva_t *, boolean_t); +void metaslab_free_impl_cb(uint64_t, vdev_t *, uint64_t, uint64_t, void *); +void metaslab_unalloc_dva(spa_t *, const dva_t *, uint64_t); +int metaslab_claim(spa_t *, const blkptr_t *, uint64_t); +int metaslab_claim_impl(vdev_t *, uint64_t, uint64_t, uint64_t); +void metaslab_check_free(spa_t *, const blkptr_t *); + +void metaslab_alloc_trace_init(void); +void metaslab_alloc_trace_fini(void); +void metaslab_trace_init(zio_alloc_list_t *); +void metaslab_trace_fini(zio_alloc_list_t *); + +metaslab_class_t *metaslab_class_create(spa_t *, metaslab_ops_t *); +void metaslab_class_destroy(metaslab_class_t *); +int metaslab_class_validate(metaslab_class_t *); +void metaslab_class_histogram_verify(metaslab_class_t *); +uint64_t metaslab_class_fragmentation(metaslab_class_t *); +uint64_t metaslab_class_expandable_space(metaslab_class_t *); +boolean_t metaslab_class_throttle_reserve(metaslab_class_t *, int, int, + zio_t *, int); +void metaslab_class_throttle_unreserve(metaslab_class_t *, int, int, zio_t *); + +void metaslab_class_space_update(metaslab_class_t *, int64_t, int64_t, + int64_t, int64_t); +uint64_t metaslab_class_get_alloc(metaslab_class_t *); +uint64_t metaslab_class_get_space(metaslab_class_t *); +uint64_t metaslab_class_get_dspace(metaslab_class_t *); +uint64_t metaslab_class_get_deferred(metaslab_class_t *); +uint64_t metaslab_class_get_minblocksize(metaslab_class_t *mc); + +metaslab_group_t *metaslab_group_create(metaslab_class_t *, vdev_t *, int); +void metaslab_group_destroy(metaslab_group_t *); +void metaslab_group_activate(metaslab_group_t *); +void metaslab_group_passivate(metaslab_group_t *); +boolean_t metaslab_group_initialized(metaslab_group_t *); +uint64_t metaslab_group_get_space(metaslab_group_t *); +void metaslab_group_histogram_verify(metaslab_group_t *); +uint64_t metaslab_group_fragmentation(metaslab_group_t *); +void metaslab_group_histogram_remove(metaslab_group_t *, metaslab_t *); +void metaslab_group_alloc_decrement(spa_t *, uint64_t, void *, int, int, + boolean_t); +void metaslab_group_alloc_verify(spa_t 
*, const blkptr_t *, void *, int); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_METASLAB_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h new file mode 100644 index 000000000000..5eb59df37e51 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h @@ -0,0 +1,422 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + */ + +#ifndef _SYS_METASLAB_IMPL_H +#define _SYS_METASLAB_IMPL_H + +#include <sys/metaslab.h> +#include <sys/space_map.h> +#include <sys/range_tree.h> +#include <sys/vdev.h> +#include <sys/txg.h> +#include <sys/avl.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Metaslab allocation tracing record. + */ +typedef struct metaslab_alloc_trace { + list_node_t mat_list_node; + metaslab_group_t *mat_mg; + metaslab_t *mat_msp; + uint64_t mat_size; + uint64_t mat_weight; + uint32_t mat_dva_id; + uint64_t mat_offset; + int mat_allocator; +} metaslab_alloc_trace_t; + +/* + * Used by the metaslab allocation tracing facility to indicate + * error conditions. These errors are stored to the offset member + * of the metaslab_alloc_trace_t record and displayed by mdb. + */ +typedef enum trace_alloc_type { + TRACE_ALLOC_FAILURE = -1ULL, + TRACE_TOO_SMALL = -2ULL, + TRACE_FORCE_GANG = -3ULL, + TRACE_NOT_ALLOCATABLE = -4ULL, + TRACE_GROUP_FAILURE = -5ULL, + TRACE_ENOSPC = -6ULL, + TRACE_CONDENSING = -7ULL, + TRACE_VDEV_ERROR = -8ULL, + TRACE_INITIALIZING = -9ULL +} trace_alloc_type_t; + +#define METASLAB_WEIGHT_PRIMARY (1ULL << 63) +#define METASLAB_WEIGHT_SECONDARY (1ULL << 62) +#define METASLAB_WEIGHT_CLAIM (1ULL << 61) +#define METASLAB_WEIGHT_TYPE (1ULL << 60) +#define METASLAB_ACTIVE_MASK \ + (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY | \ + METASLAB_WEIGHT_CLAIM) + +/* + * The metaslab weight is used to encode the amount of free space in a + * metaslab, such that the "best" metaslab appears first when sorting the + * metaslabs by weight. The weight (and therefore the "best" metaslab) can + * be determined in two different ways: by computing a weighted sum of all + * the free space in the metaslab (a space based weight) or by counting only + * the free segments of the largest size (a segment based weight). We prefer + * the segment based weight because it reflects how the free space is + * comprised, but we cannot always use it -- legacy pools do not have the + * space map histogram information necessary to determine the largest + * contiguous regions. 
Pools that have the space map histogram determine
+ * the segment weight by looking at each bucket in the histogram and
+ * determining the free space whose size in bytes is in the range:
+ *	[2^i, 2^(i+1))
+ * We then encode the largest index, i, that contains regions into the
+ * segment-weighted value.
+ *
+ * Space-based weight:
+ *
+ *	64	56	48	40	32	24	16	8	0
+ *	+-------+-------+-------+-------+-------+-------+-------+-------+
+ *	|PSC1|		  weighted-free space				|
+ *	+-------+-------+-------+-------+-------+-------+-------+-------+
+ *
+ *	PS - indicates primary and secondary activation
+ *	C - indicates activation for claimed block zio
+ *	space - the fragmentation-weighted space
+ *
+ * Segment-based weight:
+ *
+ *	64	56	48	40	32	24	16	8	0
+ *	+-------+-------+-------+-------+-------+-------+-------+-------+
+ *	|PSC0| idx|	   count of segments in region			|
+ *	+-------+-------+-------+-------+-------+-------+-------+-------+
+ *
+ *	PS - indicates primary and secondary activation
+ *	C - indicates activation for claimed block zio
+ *	idx - index for the highest bucket in the histogram
+ *	count - number of segments in the specified bucket
+ */
+#define	WEIGHT_GET_ACTIVE(weight)	BF64_GET((weight), 61, 3)
+#define	WEIGHT_SET_ACTIVE(weight, x)	BF64_SET((weight), 61, 3, x)
+
+#define	WEIGHT_IS_SPACEBASED(weight)	\
+	((weight) == 0 || BF64_GET((weight), 60, 1))
+#define	WEIGHT_SET_SPACEBASED(weight)	BF64_SET((weight), 60, 1, 1)
+
+/*
+ * These macros are only applicable to segment-based weighting.
+ */
+#define	WEIGHT_GET_INDEX(weight)	BF64_GET((weight), 54, 6)
+#define	WEIGHT_SET_INDEX(weight, x)	BF64_SET((weight), 54, 6, x)
+#define	WEIGHT_GET_COUNT(weight)	BF64_GET((weight), 0, 54)
+#define	WEIGHT_SET_COUNT(weight, x)	BF64_SET((weight), 0, 54, x)
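/*
 * Worked example (editorial, not part of this commit): encoding a
 * segment-based weight for a metaslab whose largest free segments fall in
 * the 2^19 (512K) bucket, with 1000 such segments, activated as a primary.
 * Bit 60 stays clear, so WEIGHT_IS_SPACEBASED() is false for the result.
 */
static uint64_t
example_segment_weight(void)
{
	uint64_t weight = 0;

	WEIGHT_SET_INDEX(weight, 19);		/* highest occupied bucket */
	WEIGHT_SET_COUNT(weight, 1000);		/* segments in that bucket */
	weight |= METASLAB_WEIGHT_PRIMARY;	/* active as primary */

	return (weight);
}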
+
+/*
+ * A metaslab class encompasses a category of allocatable top-level vdevs.
+ * Each top-level vdev is associated with a metaslab group which defines
+ * the allocatable region for that vdev. Examples of these categories include
+ * "normal" for data block allocations (i.e. main pool allocations) or "log"
+ * for allocations designated for intent log devices (i.e. slog devices).
+ * When a block allocation is requested from the SPA it is associated with a
+ * metaslab_class_t, and only top-level vdevs (i.e. metaslab groups) belonging
+ * to the class can be used to satisfy that request. Allocations are done
+ * by traversing the metaslab groups that are linked off of the mc_rotor field.
+ * This rotor points to the next metaslab group where allocations will be
+ * attempted. Allocating a block is a 3 step process -- select the metaslab
+ * group, select the metaslab, and then allocate the block. The metaslab
+ * class defines the low-level block allocator that will be used as the
+ * final step in allocation. These allocators are pluggable allowing each class
+ * to use a block allocator that best suits that class.
+ */
+struct metaslab_class {
+	kmutex_t mc_lock;
+	spa_t *mc_spa;
+	metaslab_group_t *mc_rotor;
+	metaslab_ops_t *mc_ops;
+	uint64_t mc_aliquot;
+
+	/*
+	 * Track the number of metaslab groups that have been initialized
+	 * and can accept allocations. An initialized metaslab group is
+	 * one that has been completely added to the config (i.e. we have
+	 * updated the MOS config and the space has been added to the pool).
+	 */
+	uint64_t mc_groups;
+
+	/*
+	 * Toggle to enable/disable the allocation throttle.
+	 */
+	boolean_t mc_alloc_throttle_enabled;
+
+	/*
+	 * The allocation throttle works on a reservation system. Whenever
+	 * an asynchronous zio wants to perform an allocation it must
+	 * first reserve the number of blocks that it wants to allocate.
+	 * If there aren't sufficient slots available for the pending zio
+	 * then that I/O is throttled until more slots free up. The current
+	 * number of reserved allocations is maintained by the mc_alloc_slots
+	 * refcount. The mc_alloc_max_slots value determines the maximum
+	 * number of allocations that the system allows. Gang blocks are
+	 * allowed to reserve slots even if we've reached the maximum
+	 * number of allocations allowed.
+	 */
+	uint64_t *mc_alloc_max_slots;
+	refcount_t *mc_alloc_slots;
+
+	uint64_t mc_alloc_groups; /* # of allocatable groups */
+
+	uint64_t mc_alloc;	/* total allocated space */
+	uint64_t mc_deferred;	/* total deferred frees */
+	uint64_t mc_space;	/* total space (alloc + free) */
+	uint64_t mc_dspace;	/* total deflated space */
+	uint64_t mc_minblocksize;
+	uint64_t mc_histogram[RANGE_TREE_HISTOGRAM_SIZE];
+};
+
+/*
+ * Metaslab groups encapsulate all the allocatable regions (i.e. metaslabs)
+ * of a top-level vdev. They are linked together to form a circular linked
+ * list and can belong to only one metaslab class. Metaslab groups may become
+ * ineligible for allocations for a number of reasons such as limited free
+ * space, fragmentation, or going offline. When this happens the allocator will
+ * simply find the next metaslab group in the linked list and attempt
+ * to allocate from that group instead.
+ */
+struct metaslab_group {
+	kmutex_t mg_lock;
+	metaslab_t **mg_primaries;
+	metaslab_t **mg_secondaries;
+	avl_tree_t mg_metaslab_tree;
+	uint64_t mg_aliquot;
+	boolean_t mg_allocatable;	/* can we allocate? */
+	uint64_t mg_ms_ready;
+
+	/*
+	 * A metaslab group is considered to be initialized only after
+	 * we have updated the MOS config and added the space to the pool.
+	 * We only allow allocation attempts to a metaslab group if it
+	 * has been initialized.
+	 */
+	boolean_t mg_initialized;
+
+	uint64_t mg_free_capacity;	/* percentage free */
+	int64_t mg_bias;
+	int64_t mg_activation_count;
+	metaslab_class_t *mg_class;
+	vdev_t *mg_vd;
+	taskq_t *mg_taskq;
+	metaslab_group_t *mg_prev;
+	metaslab_group_t *mg_next;
+
+	/*
+	 * In order for the allocation throttle to function properly, we cannot
+	 * have too many IOs going to each disk by default; the throttle
+	 * operates by allocating more work to disks that finish quickly, so
+	 * allocating larger chunks to each disk reduces its effectiveness.
+	 * However, if the number of IOs going to each allocator is too small,
+	 * we will not perform proper aggregation at the vdev_queue layer,
+	 * also resulting in decreased performance. Therefore, we will use a
+	 * ramp-up strategy.
+	 *
+	 * Each allocator in each metaslab group has a current queue depth
+	 * (mg_alloc_queue_depth[allocator]) and a current max queue depth
+	 * (mg_cur_max_alloc_queue_depth[allocator]), and each metaslab group
+	 * has an absolute max queue depth (mg_max_alloc_queue_depth). We
+	 * add IOs to an allocator until the mg_alloc_queue_depth for that
+	 * allocator hits the cur_max. Every time an IO completes for a given
+	 * allocator on a given metaslab group, we increment its cur_max until
+	 * it reaches mg_max_alloc_queue_depth. The cur_max resets every txg to
+	 * help protect against disks that decrease in performance over time.
+	 *
+	 * It's possible for an allocator to handle more allocations than
+	 * its max. This can occur when gang blocks are required or when other
+	 * groups are unable to handle their share of allocations.
+	 */
+	uint64_t mg_max_alloc_queue_depth;
+	uint64_t *mg_cur_max_alloc_queue_depth;
+	refcount_t *mg_alloc_queue_depth;
+	int mg_allocators;
+	/*
+	 * A metaslab group that can no longer allocate the minimum block
+	 * size will set mg_no_free_space. Once a metaslab group is out
+	 * of space then its share of work must be distributed to other
+	 * groups.
+	 */
+	boolean_t mg_no_free_space;
+
+	uint64_t mg_allocations;
+	uint64_t mg_failed_allocations;
+	uint64_t mg_fragmentation;
+	uint64_t mg_histogram[RANGE_TREE_HISTOGRAM_SIZE];
+
+	int mg_ms_initializing;
+	boolean_t mg_initialize_updating;
+	kmutex_t mg_ms_initialize_lock;
+	kcondvar_t mg_ms_initialize_cv;
+};
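/*
 * Illustrative sketch (editorial, not part of this commit) of the cur_max
 * ramp-up described above: on each I/O completion for an allocator, its
 * current max queue depth grows toward the group's absolute max. The helper
 * is hypothetical; locking and the per-txg reset are elided.
 */
static void
example_ramp_up_queue_depth(metaslab_group_t *mg, int allocator)
{
	uint64_t *cur_max = &mg->mg_cur_max_alloc_queue_depth[allocator];

	if (*cur_max < mg->mg_max_alloc_queue_depth)
		(*cur_max)++;
}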
This can occur when gang blocks are required or when other + * groups are unable to handle their share of allocations. + */ + uint64_t mg_max_alloc_queue_depth; + uint64_t *mg_cur_max_alloc_queue_depth; + refcount_t *mg_alloc_queue_depth; + int mg_allocators; + /* + * A metaslab group that can no longer allocate the minimum block + * size will set mg_no_free_space. Once a metaslab group is out + * of space then its share of work must be distributed to other + * groups. + */ + boolean_t mg_no_free_space; + + uint64_t mg_allocations; + uint64_t mg_failed_allocations; + uint64_t mg_fragmentation; + uint64_t mg_histogram[RANGE_TREE_HISTOGRAM_SIZE]; + + int mg_ms_initializing; + boolean_t mg_initialize_updating; + kmutex_t mg_ms_initialize_lock; + kcondvar_t mg_ms_initialize_cv; +}; + +/* + * This value defines the number of elements in the ms_lbas array. The value + * of 64 was chosen as it covers all power-of-2 buckets up to UINT64_MAX. + * This is the equivalent of highbit(UINT64_MAX). + */ +#define MAX_LBAS 64 + +/* + * Each metaslab maintains a set of in-core trees to track metaslab + * operations. The in-core free tree (ms_allocatable) contains the list of + * free segments which are eligible for allocation. As blocks are + * allocated, the allocated segments are removed from the ms_allocatable and + * added to a per-txg allocation tree (ms_allocating). As blocks are + * freed, they are added to the free tree (ms_freeing). These trees + * allow us to process all allocations and frees in syncing context + * where it is safe to update the on-disk space maps. An additional set + * of in-core trees is maintained to track deferred frees + * (ms_defer). Once a block is freed it will move from the + * ms_freed to the ms_defer tree. A deferred free means that a block + * has been freed but cannot be used by the pool until TXG_DEFER_SIZE + * transaction groups later. For example, a block that is freed in txg + * 50 will not be available for reallocation until txg 52 (50 + + * TXG_DEFER_SIZE). This provides a safety net for uberblock rollback. + * A pool could be safely rolled back TXG_DEFER_SIZE transaction + * groups and ensure that no block has been reallocated. + * + * The simplified transition diagram looks like this: + * + * + * ALLOCATE + * | + * V + * free segment (ms_allocatable) -> ms_allocating[4] -> (write to space map) + * ^ + * | ms_freeing <--- FREE + * | | + * | v + * | ms_freed + * | | + * +-------- ms_defer[2] <-------+-------> (write to space map) + * + * + * Each metaslab's space is tracked in a single space map in the MOS, + * which is only updated in syncing context. Each time we sync a txg, + * we append the allocs and frees from that txg to the space map. The + * pool space is only updated once all metaslabs have finished syncing. + * + * To load the in-core free tree we read the space map from disk. This + * object contains a series of alloc and free records that are combined + * to make up the list of all free segments in this metaslab. These + * segments are represented in-core by the ms_allocatable and are stored + * in an AVL tree. + * + * As the space map grows (as a result of the appends) it will + * eventually become space-inefficient. When the metaslab's in-core + * free tree is zfs_condense_pct/100 times the size of the minimal + * on-disk representation, we rewrite it in its minimized form. If a + * metaslab needs to condense then we must set the ms_condensing flag to + * ensure that allocations are not performed on the metaslab that is + * being written.
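+ * + * As a simplified sketch (not the actual sync code), a block freed in + * txg N migrates through the trees roughly as follows: + * + * range_tree_add(msp->ms_freeing, offset, size); (freed in txg N) + * (sync of txg N: the FREE is appended to the space map and the + * segment moves through ms_freed into ms_defer[]) + * (sync of txg N+TXG_DEFER_SIZE: the deferred segment is returned + * to ms_allocatable and becomes allocatable again)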
+ */ +struct metaslab { + kmutex_t ms_lock; + kmutex_t ms_sync_lock; + kcondvar_t ms_load_cv; + space_map_t *ms_sm; + uint64_t ms_id; + uint64_t ms_start; + uint64_t ms_size; + uint64_t ms_fragmentation; + + range_tree_t *ms_allocating[TXG_SIZE]; + range_tree_t *ms_allocatable; + + /* + * The following range trees are accessed only from syncing context. + * The ms_free* trees only have entries while syncing, and are empty + * between syncs. + */ + range_tree_t *ms_freeing; /* to free this syncing txg */ + range_tree_t *ms_freed; /* already freed this syncing txg */ + range_tree_t *ms_defer[TXG_DEFER_SIZE]; + range_tree_t *ms_checkpointing; /* to add to the checkpoint */ + + boolean_t ms_condensing; /* condensing? */ + boolean_t ms_condense_wanted; + uint64_t ms_condense_checked_txg; + + uint64_t ms_initializing; /* leaves initializing this ms */ + + /* + * We must hold both ms_lock and ms_group->mg_lock in order to + * modify ms_loaded. + */ + boolean_t ms_loaded; + boolean_t ms_loading; + + int64_t ms_deferspace; /* sum of ms_defermap[] space */ + uint64_t ms_weight; /* weight vs. others in group */ + uint64_t ms_activation_weight; /* activation weight */ + + /* + * Tracks the txg in which this metaslab was last selected for + * loading or allocation. We use this value to determine how long + * the metaslab should stay cached. + */ + uint64_t ms_selected_txg; + + uint64_t ms_alloc_txg; /* last successful alloc (debug only) */ + uint64_t ms_max_size; /* maximum allocatable size */ + + /* + * -1 if it's not active in an allocator, otherwise set to the allocator + * this metaslab is active for. + */ + int ms_allocator; + boolean_t ms_primary; /* Only valid if ms_allocator is not -1 */ + + /* + * The metaslab block allocators can optionally use a size-ordered + * range tree and/or an array of LBAs. Not all allocators use + * this functionality. The ms_allocatable_by_size should always + * contain the same number of segments as the ms_allocatable. The + * only difference is that the ms_allocatable_by_size is ordered by + * segment sizes. + */ + avl_tree_t ms_allocatable_by_size; + uint64_t ms_lbas[MAX_LBAS]; + + metaslab_group_t *ms_group; /* metaslab group */ + avl_node_t ms_group_node; /* node in metaslab group tree */ + txg_node_t ms_txg_node; /* per-txg dirty metaslab links */ + + boolean_t ms_new; +}; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_METASLAB_IMPL_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/multilist.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/multilist.h new file mode 100644 index 000000000000..a2031da77daa --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/multilist.h @@ -0,0 +1,106 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
+ */ + +#ifndef _SYS_MULTILIST_H +#define _SYS_MULTILIST_H + +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef list_node_t multilist_node_t; +typedef struct multilist multilist_t; +typedef struct multilist_sublist multilist_sublist_t; +typedef unsigned int multilist_sublist_index_func_t(multilist_t *, void *); + +struct multilist_sublist { + /* + * The mutex used internally to implement thread safe insertions + * and removals to this individual sublist. It can also be locked + * by a consumer using multilist_sublist_{lock,unlock}, which is + * useful if a consumer needs to traverse the list in a thread + * safe manner. + */ + kmutex_t mls_lock; + /* + * The actual list object containing all objects in this sublist. + */ + list_t mls_list; + /* + * Pad to cache line (64 bytes), to help prevent + * cache line contention. + */ + uint8_t mls_pad[24]; +}; + +struct multilist { + /* + * This is used to get to the multilist_node_t structure given + * the void *object contained on the list. + */ + size_t ml_offset; + /* + * The number of sublists used internally by this multilist. + */ + uint64_t ml_num_sublists; + /* + * The array of pointers to the actual sublists. + */ + multilist_sublist_t *ml_sublists; + /* + * Pointer to function which determines the sublist to use + * when inserting and removing objects from this multilist. + * Please see the comment above multilist_create for details. + */ + multilist_sublist_index_func_t *ml_index_func; +}; + +void multilist_destroy(multilist_t *); +multilist_t *multilist_create(size_t, size_t, multilist_sublist_index_func_t *); + +void multilist_insert(multilist_t *, void *); +void multilist_remove(multilist_t *, void *); +int multilist_is_empty(multilist_t *); + +unsigned int multilist_get_num_sublists(multilist_t *); +unsigned int multilist_get_random_index(multilist_t *); + +multilist_sublist_t *multilist_sublist_lock(multilist_t *, unsigned int); +multilist_sublist_t *multilist_sublist_lock_obj(multilist_t *, void *); +void multilist_sublist_unlock(multilist_sublist_t *); + +void multilist_sublist_insert_head(multilist_sublist_t *, void *); +void multilist_sublist_insert_tail(multilist_sublist_t *, void *); +void multilist_sublist_move_forward(multilist_sublist_t *mls, void *obj); +void multilist_sublist_remove(multilist_sublist_t *, void *); + +void *multilist_sublist_head(multilist_sublist_t *); +void *multilist_sublist_tail(multilist_sublist_t *); +void *multilist_sublist_next(multilist_sublist_t *, void *); +void *multilist_sublist_prev(multilist_sublist_t *, void *); + +void multilist_link_init(multilist_node_t *); +int multilist_link_active(multilist_node_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_MULTILIST_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/range_tree.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/range_tree.h new file mode 100644 index 000000000000..961202bb6cf2 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/range_tree.h @@ -0,0 +1,123 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License.
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 2013, 2017 by Delphix. All rights reserved. + */ + +#ifndef _SYS_RANGE_TREE_H +#define _SYS_RANGE_TREE_H + +#include <sys/avl.h> +#include <sys/dmu.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define RANGE_TREE_HISTOGRAM_SIZE 64 + +typedef struct range_tree_ops range_tree_ops_t; + +/* + * Note: the range_tree may not be accessed concurrently; consumers + * must provide external locking if required. + */ +typedef struct range_tree { + avl_tree_t rt_root; /* offset-ordered segment AVL tree */ + uint64_t rt_space; /* sum of all segments in the map */ + range_tree_ops_t *rt_ops; + void *rt_arg; + + uint64_t rt_gap; /* allowable inter-segment gap */ + + /* rt_avl_compare should only be set if rt_arg is an AVL tree */ + int (*rt_avl_compare)(const void *, const void *); + /* + * The rt_histogram maintains a histogram of ranges. Each bucket, + * rt_histogram[i], contains the number of ranges whose size is: + * 2^i <= size of range in bytes < 2^(i+1) + */ + uint64_t rt_histogram[RANGE_TREE_HISTOGRAM_SIZE]; +} range_tree_t; + +typedef struct range_seg { + avl_node_t rs_node; /* AVL node */ + avl_node_t rs_pp_node; /* AVL picker-private node */ + uint64_t rs_start; /* starting offset of this segment */ + uint64_t rs_end; /* ending offset (non-inclusive) */ + uint64_t rs_fill; /* actual fill if gap mode is on */ +} range_seg_t; + +struct range_tree_ops { + void (*rtop_create)(range_tree_t *rt, void *arg); + void (*rtop_destroy)(range_tree_t *rt, void *arg); + void (*rtop_add)(range_tree_t *rt, range_seg_t *rs, void *arg); + void (*rtop_remove)(range_tree_t *rt, range_seg_t *rs, void *arg); + void (*rtop_vacate)(range_tree_t *rt, void *arg); +}; + +typedef void range_tree_func_t(void *arg, uint64_t start, uint64_t size); + +void range_tree_init(void); +void range_tree_fini(void); +range_tree_t *range_tree_create_impl(range_tree_ops_t *ops, void *arg, + int (*avl_compare)(const void*, const void*), uint64_t gap); +range_tree_t *range_tree_create(range_tree_ops_t *ops, void *arg); +void range_tree_destroy(range_tree_t *rt); +boolean_t range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size); +range_seg_t *range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size); +void range_tree_resize_segment(range_tree_t *rt, range_seg_t *rs, + uint64_t newstart, uint64_t newsize); +uint64_t range_tree_space(range_tree_t *rt); +boolean_t range_tree_is_empty(range_tree_t *rt); +void range_tree_verify(range_tree_t *rt, uint64_t start, uint64_t size); +void range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst); +void range_tree_stat_verify(range_tree_t *rt); +uint64_t range_tree_min(range_tree_t *rt); +uint64_t range_tree_max(range_tree_t *rt); +uint64_t range_tree_span(range_tree_t *rt); + +void range_tree_add(void *arg, uint64_t start, uint64_t size); +void range_tree_remove(void *arg, uint64_t start, uint64_t size); +void range_tree_remove_fill(range_tree_t *rt, uint64_t start, uint64_t size); +void range_tree_adjust_fill(range_tree_t *rt, range_seg_t *rs, int64_t delta); +void
range_tree_clear(range_tree_t *rt, uint64_t start, uint64_t size); + +void range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg); +void range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg); +range_seg_t *range_tree_first(range_tree_t *rt); + +void rt_avl_create(range_tree_t *rt, void *arg); +void rt_avl_destroy(range_tree_t *rt, void *arg); +void rt_avl_add(range_tree_t *rt, range_seg_t *rs, void *arg); +void rt_avl_remove(range_tree_t *rt, range_seg_t *rs, void *arg); +void rt_avl_vacate(range_tree_t *rt, void *arg); +extern struct range_tree_ops rt_avl_ops; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_RANGE_TREE_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h new file mode 100644 index 000000000000..a806678d8fdc --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h @@ -0,0 +1,121 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + */ + +#ifndef _SYS_REFCOUNT_H +#define _SYS_REFCOUNT_H + +#include <sys/cdefs.h> +#include <sys/types.h> +#include_next <sys/refcount.h> +#include <sys/list.h> +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * If the reference is held only by the calling function and not any + * particular object, use FTAG (which is a string) for the holder_tag. + * Otherwise, use the object that holds the reference. 
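+ * + * For example (an illustrative sketch of the common pattern; 'rc' here is + * a hypothetical refcount_t embedded in some object): + * + * (void) refcount_add(&rc, FTAG); + * ... use the object ... + * (void) refcount_remove(&rc, FTAG);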
+ */ +#define FTAG ((char *)(uintptr_t)__func__) + +#ifdef ZFS_DEBUG +typedef struct reference { + list_node_t ref_link; + void *ref_holder; + uint64_t ref_number; + uint8_t *ref_removed; +} reference_t; + +typedef struct refcount { + kmutex_t rc_mtx; + boolean_t rc_tracked; + list_t rc_list; + list_t rc_removed; + uint64_t rc_count; + uint64_t rc_removed_count; +} refcount_t; + +/* Note: refcount_t must be initialized with refcount_create[_untracked]() */ + +void refcount_create(refcount_t *rc); +void refcount_create_untracked(refcount_t *rc); +void refcount_create_tracked(refcount_t *rc); +void refcount_destroy(refcount_t *rc); +void refcount_destroy_many(refcount_t *rc, uint64_t number); +int refcount_is_zero(refcount_t *rc); +int64_t refcount_count(refcount_t *rc); +int64_t refcount_add(refcount_t *rc, void *holder_tag); +int64_t refcount_remove(refcount_t *rc, void *holder_tag); +int64_t refcount_add_many(refcount_t *rc, uint64_t number, void *holder_tag); +int64_t refcount_remove_many(refcount_t *rc, uint64_t number, void *holder_tag); +void refcount_transfer(refcount_t *dst, refcount_t *src); +void refcount_transfer_ownership(refcount_t *, void *, void *); +boolean_t refcount_held(refcount_t *, void *); +boolean_t refcount_not_held(refcount_t *, void *); + +void refcount_sysinit(void); +void refcount_fini(void); + +#else /* ZFS_DEBUG */ + +typedef struct refcount { + uint64_t rc_count; +} refcount_t; + +#define refcount_create(rc) ((rc)->rc_count = 0) +#define refcount_create_untracked(rc) ((rc)->rc_count = 0) +#define refcount_create_tracked(rc) ((rc)->rc_count = 0) +#define refcount_destroy(rc) ((rc)->rc_count = 0) +#define refcount_destroy_many(rc, number) ((rc)->rc_count = 0) +#define refcount_is_zero(rc) ((rc)->rc_count == 0) +#define refcount_count(rc) ((rc)->rc_count) +#define refcount_add(rc, holder) atomic_inc_64_nv(&(rc)->rc_count) +#define refcount_remove(rc, holder) atomic_dec_64_nv(&(rc)->rc_count) +#define refcount_add_many(rc, number, holder) \ + atomic_add_64_nv(&(rc)->rc_count, number) +#define refcount_remove_many(rc, number, holder) \ + atomic_add_64_nv(&(rc)->rc_count, -(number)) +#define refcount_transfer(dst, src) { \ + uint64_t __tmp = (src)->rc_count; \ + atomic_add_64(&(src)->rc_count, -__tmp); \ + atomic_add_64(&(dst)->rc_count, __tmp); \ +} +#define refcount_transfer_ownership(rc, current_holder, new_holder) (void)0 +#define refcount_held(rc, holder) ((rc)->rc_count > 0) +#define refcount_not_held(rc, holder) (B_TRUE) + +#define refcount_sysinit() +#define refcount_fini() + +#endif /* ZFS_DEBUG */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_REFCOUNT_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/rrwlock.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/rrwlock.h new file mode 100644 index 000000000000..a1844f01aa2b --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/rrwlock.h @@ -0,0 +1,112 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + +#ifndef _SYS_RR_RW_LOCK_H +#define _SYS_RR_RW_LOCK_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/zfs_context.h> +#include <sys/refcount.h> + +/* + * A reader-writer lock implementation that allows re-entrant reads, but + * still gives writers priority on "new" reads. + * + * See rrwlock.c for more details about the implementation. + * + * Fields of the rrwlock_t structure: + * - rr_lock: protects modification and reading of rrwlock_t fields + * - rr_cv: cv for waking up readers or waiting writers + * - rr_writer: thread id of the current writer + * - rr_anon_rcount: number of active anonymous readers + * - rr_linked_rcount: total number of non-anonymous active readers + * - rr_writer_wanted: a writer wants the lock + */ +typedef struct rrwlock { + kmutex_t rr_lock; + kcondvar_t rr_cv; + kthread_t *rr_writer; + refcount_t rr_anon_rcount; + refcount_t rr_linked_rcount; + boolean_t rr_writer_wanted; + boolean_t rr_track_all; +} rrwlock_t; + +/* + * 'tag' is used in reference counting tracking. The + * 'tag' must be the same in a rrw_enter() as in its + * corresponding rrw_exit(). + */ +void rrw_init(rrwlock_t *rrl, boolean_t track_all); +void rrw_destroy(rrwlock_t *rrl); +void rrw_enter(rrwlock_t *rrl, krw_t rw, void *tag); +void rrw_enter_read(rrwlock_t *rrl, void *tag); +void rrw_enter_read_prio(rrwlock_t *rrl, void *tag); +void rrw_enter_write(rrwlock_t *rrl); +void rrw_exit(rrwlock_t *rrl, void *tag); +boolean_t rrw_held(rrwlock_t *rrl, krw_t rw); +void rrw_tsd_destroy(void *arg); + +#define RRW_READ_HELD(x) rrw_held(x, RW_READER) +#define RRW_WRITE_HELD(x) rrw_held(x, RW_WRITER) +#define RRW_LOCK_HELD(x) \ + (rrw_held(x, RW_WRITER) || rrw_held(x, RW_READER)) + +/* + * A reader-mostly lock implementation, tuned for highly parallel read + * acquisitions at the cost of more expensive write acquisitions. + * + * RRM_NUM_LOCKS should be a prime number. See comment in rrwlock.c near + * RRM_TD_LOCK() for details. + */ +#define RRM_NUM_LOCKS 17 +typedef struct rrmlock { + rrwlock_t locks[RRM_NUM_LOCKS]; +} rrmlock_t; + +void rrm_init(rrmlock_t *rrl, boolean_t track_all); +void rrm_destroy(rrmlock_t *rrl); +void rrm_enter(rrmlock_t *rrl, krw_t rw, void *tag); +void rrm_enter_read(rrmlock_t *rrl, void *tag); +void rrm_enter_write(rrmlock_t *rrl); +void rrm_exit(rrmlock_t *rrl, void *tag); +boolean_t rrm_held(rrmlock_t *rrl, krw_t rw); + +#define RRM_READ_HELD(x) rrm_held(x, RW_READER) +#define RRM_WRITE_HELD(x) rrm_held(x, RW_WRITER) +#define RRM_LOCK_HELD(x) \ + (rrm_held(x, RW_WRITER) || rrm_held(x, RW_READER)) + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_RR_RW_LOCK_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa.h new file mode 100644 index 000000000000..62332ea126a0 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa.h @@ -0,0 +1,170 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License.
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#ifndef _SYS_SA_H +#define _SYS_SA_H + +#include <sys/dmu.h> +#include <sys/uio.h> + +/* + * Currently available byteswap functions. + * If at all possible, new attributes should use + * one of the already defined byteswap functions. + * If a new byteswap function is added then the + * ZPL/Pool version will need to be bumped. + */ + +typedef enum sa_bswap_type { + SA_UINT64_ARRAY, + SA_UINT32_ARRAY, + SA_UINT16_ARRAY, + SA_UINT8_ARRAY, + SA_ACL, +} sa_bswap_type_t; + +typedef uint16_t sa_attr_type_t; + +/* + * Attribute to register support for. + */ +typedef struct sa_attr_reg { + char *sa_name; /* attribute name */ + uint16_t sa_length; + sa_bswap_type_t sa_byteswap; /* bswap function enum */ + sa_attr_type_t sa_attr; /* filled in during registration */ +} sa_attr_reg_t; + + +typedef void (sa_data_locator_t)(void **, uint32_t *, uint32_t, + boolean_t, void *userptr); + +/* + * Array of attributes to store. + * + * This array should be treated as opaque/private data. + * The SA_ADD_BULK_ATTR() macro should be used for manipulating + * the array. + * + * When sa_replace_all_by_template() is used the attributes + * will be stored in the order defined in the array, except that + * the attributes may be split between the bonus and the spill buffer. + * + */ +typedef struct sa_bulk_attr { + void *sa_data; + sa_data_locator_t *sa_data_func; + uint16_t sa_length; + sa_attr_type_t sa_attr; + /* the following are private to the sa framework */ + void *sa_addr; + uint16_t sa_buftype; + uint16_t sa_size; +} sa_bulk_attr_t; + + +/* + * Special macro for adding entries for bulk attr support. + * b - sa_bulk_attr_t array + * idx - index into the array; incremented during each add + * attr - attribute to manipulate + * func - function for accessing data. + * data - pointer to data.
+ * len - length of data + */ + +#define SA_ADD_BULK_ATTR(b, idx, attr, func, data, len) \ +{ \ + b[idx].sa_attr = attr;\ + b[idx].sa_data_func = func; \ + b[idx].sa_data = data; \ + b[idx++].sa_length = len; \ +} + +typedef struct sa_os sa_os_t; + +typedef enum sa_handle_type { + SA_HDL_SHARED, + SA_HDL_PRIVATE +} sa_handle_type_t; + +struct sa_handle; +typedef void *sa_lookup_tab_t; +typedef struct sa_handle sa_handle_t; + +typedef void (sa_update_cb_t)(sa_handle_t *, dmu_tx_t *tx); + +int sa_handle_get(objset_t *, uint64_t, void *userp, + sa_handle_type_t, sa_handle_t **); +int sa_handle_get_from_db(objset_t *, dmu_buf_t *, void *userp, + sa_handle_type_t, sa_handle_t **); +void sa_handle_destroy(sa_handle_t *); +int sa_buf_hold(objset_t *, uint64_t, void *, dmu_buf_t **); +void sa_buf_rele(dmu_buf_t *, void *); +int sa_lookup(sa_handle_t *, sa_attr_type_t, void *buf, uint32_t buflen); +int sa_update(sa_handle_t *, sa_attr_type_t, void *buf, + uint32_t buflen, dmu_tx_t *); +int sa_remove(sa_handle_t *, sa_attr_type_t, dmu_tx_t *); +int sa_bulk_lookup(sa_handle_t *, sa_bulk_attr_t *, int count); +int sa_bulk_lookup_locked(sa_handle_t *, sa_bulk_attr_t *, int count); +int sa_bulk_update(sa_handle_t *, sa_bulk_attr_t *, int count, dmu_tx_t *); +int sa_size(sa_handle_t *, sa_attr_type_t, int *); +int sa_update_from_cb(sa_handle_t *, sa_attr_type_t, + uint32_t buflen, sa_data_locator_t *, void *userdata, dmu_tx_t *); +void sa_object_info(sa_handle_t *, dmu_object_info_t *); +void sa_object_size(sa_handle_t *, uint32_t *, u_longlong_t *); +void *sa_get_userdata(sa_handle_t *); +void sa_set_userp(sa_handle_t *, void *); +dmu_buf_t *sa_get_db(sa_handle_t *); +uint64_t sa_handle_object(sa_handle_t *); +boolean_t sa_attr_would_spill(sa_handle_t *, sa_attr_type_t, int size); +void sa_register_update_callback(objset_t *, sa_update_cb_t *); +int sa_setup(objset_t *, uint64_t, sa_attr_reg_t *, int, sa_attr_type_t **); +void sa_tear_down(objset_t *); +int sa_replace_all_by_template(sa_handle_t *, sa_bulk_attr_t *, + int, dmu_tx_t *); +int sa_replace_all_by_template_locked(sa_handle_t *, sa_bulk_attr_t *, + int, dmu_tx_t *); +boolean_t sa_enabled(objset_t *); +void sa_cache_init(void); +void sa_cache_fini(void); +int sa_set_sa_object(objset_t *, uint64_t); +int sa_hdrsize(void *); +void sa_handle_lock(sa_handle_t *); +void sa_handle_unlock(sa_handle_t *); + +#ifdef _KERNEL +int sa_lookup_uio(sa_handle_t *, sa_attr_type_t, uio_t *); +#endif + +#ifdef __cplusplus +extern "C" { +#endif + + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SA_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa_impl.h new file mode 100644 index 000000000000..e444e2fb5723 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa_impl.h @@ -0,0 +1,291 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + */ + +#ifndef _SYS_SA_IMPL_H +#define _SYS_SA_IMPL_H + +#include <sys/dmu.h> +#include <sys/refcount.h> +#include <sys/list.h> + +/* + * Array of known attributes and their + * various characteristics. + */ +typedef struct sa_attr_table { + sa_attr_type_t sa_attr; + uint8_t sa_registered; + uint16_t sa_length; + sa_bswap_type_t sa_byteswap; + char *sa_name; +} sa_attr_table_t; + +/* + * ZAP attribute format for attribute registration + * + * 64 56 48 40 32 24 16 8 0 + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * | unused | len | bswap | attr num | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * + * ZAP attribute format for layout information. + * + * Layout information is stored as an array of attribute numbers. + * The name of the attribute is the layout number (0, 1, 2, ...) + * + * 16 0 + * +---- ---+ + * | attr # | + * +--------+ + * | attr # | + * +--- ----+ + * ...... + * + */ + +#define ATTR_BSWAP(x) BF32_GET(x, 16, 8) +#define ATTR_LENGTH(x) BF32_GET(x, 24, 16) +#define ATTR_NUM(x) BF32_GET(x, 0, 16) +#define ATTR_ENCODE(x, attr, length, bswap) \ +{ \ + BF64_SET(x, 24, 16, length); \ + BF64_SET(x, 16, 8, bswap); \ + BF64_SET(x, 0, 16, attr); \ +} + +#define TOC_OFF(x) BF32_GET(x, 0, 23) +#define TOC_ATTR_PRESENT(x) BF32_GET(x, 31, 1) +#define TOC_LEN_IDX(x) BF32_GET(x, 24, 4) +#define TOC_ATTR_ENCODE(x, len_idx, offset) \ +{ \ + BF32_SET(x, 31, 1, 1); \ + BF32_SET(x, 24, 7, len_idx); \ + BF32_SET(x, 0, 24, offset); \ +} + +#define SA_LAYOUTS "LAYOUTS" +#define SA_REGISTRY "REGISTRY" + +/* + * Each unique layout will have its own table + * sa_lot (layout_table) + */ +typedef struct sa_lot { + avl_node_t lot_num_node; + avl_node_t lot_hash_node; + uint64_t lot_num; + uint64_t lot_hash; + sa_attr_type_t *lot_attrs; /* array of attr #'s */ + uint32_t lot_var_sizes; /* how many aren't fixed size */ + uint32_t lot_attr_count; /* total attr count */ + list_t lot_idx_tab; /* should be only a couple of entries */ + int lot_instance; /* used with lot_hash to identify entry */ +} sa_lot_t; + +/* index table of offsets */ +typedef struct sa_idx_tab { + list_node_t sa_next; + sa_lot_t *sa_layout; + uint16_t *sa_variable_lengths; + refcount_t sa_refcount; + uint32_t *sa_idx_tab; /* array of offsets */ +} sa_idx_tab_t; + +/* + * Since the offset/index information into the actual data + * will usually be identical we can share that information with + * all handles that have the exact same offsets. + * + * You would typically only have a large number of different tables of + * contents if you had several variable-sized attributes. + * + * Two AVL trees are used to track the attribute layout numbers. + * One is keyed by number and will be consulted when a DMU_OT_SA + * object is first read. The second tree is keyed by the hash signature + * of the attributes and will be consulted when an attribute is added + * to determine if we already have an instance of that layout. Both + * of these trees are interconnected.
The only difference is that + * when an entry is found in the "hash" tree the list of attributes will + * need to be compared against the list of attributes you have in hand. + * The assumption is that typically attributes will just be updated and + * adding a completely new attribute is a very rare operation. + */ +struct sa_os { + kmutex_t sa_lock; + boolean_t sa_need_attr_registration; + boolean_t sa_force_spill; + uint64_t sa_master_obj; + uint64_t sa_reg_attr_obj; + uint64_t sa_layout_attr_obj; + int sa_num_attrs; + sa_attr_table_t *sa_attr_table; /* private attr table */ + sa_update_cb_t *sa_update_cb; + avl_tree_t sa_layout_num_tree; /* keyed by layout number */ + avl_tree_t sa_layout_hash_tree; /* keyed by layout hash value */ + int sa_user_table_sz; + sa_attr_type_t *sa_user_table; /* user name->attr mapping table */ +}; + +/* + * Header for all bonus and spill buffers. + * + * The header has a fixed portion with a variable number + * of "lengths" depending on the number of variable sized + * attributes which are determined by the "layout number" + */ + +#define SA_MAGIC 0x2F505A /* ZFS SA */ +typedef struct sa_hdr_phys { + uint32_t sa_magic; + /* BEGIN CSTYLED */ + /* + * Encoded with hdrsize and layout number as follows: + * 16 10 0 + * +--------+-------+ + * | hdrsz |layout | + * +--------+-------+ + * + * Bits 0-9 are the layout number. + * Bits 10-15 are the size of the header. + * The header size is this value * 8. + * + * For example: + * hdrsz of 1 ==> 8 byte header + * 2 ==> 16 byte header + * + */ + /* END CSTYLED */ + uint16_t sa_layout_info; + uint16_t sa_lengths[1]; /* optional sizes for variable length attrs */ + /* ... Data follows the lengths. */ +} sa_hdr_phys_t; + +#define SA_HDR_LAYOUT_NUM(hdr) BF32_GET(hdr->sa_layout_info, 0, 10) +#define SA_HDR_SIZE(hdr) BF32_GET_SB(hdr->sa_layout_info, 10, 6, 3, 0) +#define SA_HDR_LAYOUT_INFO_ENCODE(x, num, size) \ +{ \ + BF32_SET_SB(x, 10, 6, 3, 0, size); \ + BF32_SET(x, 0, 10, num); \ +} + +typedef enum sa_buf_type { + SA_BONUS = 1, + SA_SPILL = 2 +} sa_buf_type_t; + +typedef enum sa_data_op { + SA_LOOKUP, + SA_UPDATE, + SA_ADD, + SA_REPLACE, + SA_REMOVE +} sa_data_op_t; + +/* + * Opaque handle used for most sa functions + * + * This needs to be kept as small as possible. + */ + +struct sa_handle { + dmu_buf_user_t sa_dbu; + kmutex_t sa_lock; + dmu_buf_t *sa_bonus; + dmu_buf_t *sa_spill; + objset_t *sa_os; + void *sa_userp; + sa_idx_tab_t *sa_bonus_tab; /* idx of bonus */ + sa_idx_tab_t *sa_spill_tab; /* only present if spill activated */ +}; + +#define SA_GET_DB(hdl, type) \ + (dmu_buf_impl_t *)((type == SA_BONUS) ? hdl->sa_bonus : hdl->sa_spill) + +#define SA_GET_HDR(hdl, type) \ + ((sa_hdr_phys_t *)((dmu_buf_impl_t *)(SA_GET_DB(hdl, \ + type))->db.db_data)) + +#define SA_IDX_TAB_GET(hdl, type) \ + (type == SA_BONUS ? hdl->sa_bonus_tab : hdl->sa_spill_tab) + +#define IS_SA_BONUSTYPE(a) \ + ((a == DMU_OT_SA) ? B_TRUE : B_FALSE) + +#define SA_BONUSTYPE_FROM_DB(db) \ + (dmu_get_bonustype((dmu_buf_t *)db)) + +#define SA_BLKPTR_SPACE (DN_OLD_MAX_BONUSLEN - sizeof (blkptr_t)) + +#define SA_LAYOUT_NUM(x, type) \ + ((!IS_SA_BONUSTYPE(type) ? 0 : (((IS_SA_BONUSTYPE(type)) && \ + ((SA_HDR_LAYOUT_NUM(x)) == 0)) ?
1 : SA_HDR_LAYOUT_NUM(x)))) + + +#define SA_REGISTERED_LEN(sa, attr) sa->sa_attr_table[attr].sa_length + +#define SA_ATTR_LEN(sa, idx, attr, hdr) ((SA_REGISTERED_LEN(sa, attr) == 0) ?\ + hdr->sa_lengths[TOC_LEN_IDX(idx->sa_idx_tab[attr])] : \ + SA_REGISTERED_LEN(sa, attr)) + +#define SA_SET_HDR(hdr, num, size) \ + { \ + hdr->sa_magic = SA_MAGIC; \ + SA_HDR_LAYOUT_INFO_ENCODE(hdr->sa_layout_info, num, size); \ + } + +#define SA_ATTR_INFO(sa, idx, hdr, attr, bulk, type, hdl) \ + { \ + bulk.sa_size = SA_ATTR_LEN(sa, idx, attr, hdr); \ + bulk.sa_buftype = type; \ + bulk.sa_addr = \ + (void *)((uintptr_t)TOC_OFF(idx->sa_idx_tab[attr]) + \ + (uintptr_t)hdr); \ +} + +#define SA_HDR_SIZE_MATCH_LAYOUT(hdr, tb) \ + (SA_HDR_SIZE(hdr) == (sizeof (sa_hdr_phys_t) + \ + (tb->lot_var_sizes > 1 ? P2ROUNDUP((tb->lot_var_sizes - 1) * \ + sizeof (uint16_t), 8) : 0))) + +int sa_add_impl(sa_handle_t *, sa_attr_type_t, + uint32_t, sa_data_locator_t, void *, dmu_tx_t *); + +void sa_register_update_callback_locked(objset_t *, sa_update_cb_t *); +int sa_size_locked(sa_handle_t *, sa_attr_type_t, int *); + +void sa_default_locator(void **, uint32_t *, uint32_t, boolean_t, void *); +int sa_attr_size(sa_os_t *, sa_idx_tab_t *, sa_attr_type_t, + uint16_t *, sa_hdr_phys_t *); + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SA_IMPL_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h new file mode 100644 index 000000000000..8e71f64efc23 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h @@ -0,0 +1,960 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright 2013 Saso Kiselkov. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + * Copyright 2017 Joyent, Inc. + * Copyright (c) 2017 Datto Inc. + */ + +#ifndef _SYS_SPA_H +#define _SYS_SPA_H + +#include <sys/avl.h> +#include <sys/zfs_context.h> +#include <sys/nvpair.h> +#include <sys/sysevent.h> +#include <sys/sysmacros.h> +#include <sys/types.h> +#include <sys/fs/zfs.h> +#include <sys/dmu.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Forward references that lots of things need. 
+ */ +typedef struct spa spa_t; +typedef struct vdev vdev_t; +typedef struct metaslab metaslab_t; +typedef struct metaslab_group metaslab_group_t; +typedef struct metaslab_class metaslab_class_t; +typedef struct zio zio_t; +typedef struct zilog zilog_t; +typedef struct spa_aux_vdev spa_aux_vdev_t; +typedef struct ddt ddt_t; +typedef struct ddt_entry ddt_entry_t; +struct dsl_pool; +struct dsl_dataset; + +/* + * General-purpose 32-bit and 64-bit bitfield encodings. + */ +#define BF32_DECODE(x, low, len) P2PHASE((x) >> (low), 1U << (len)) +#define BF64_DECODE(x, low, len) P2PHASE((x) >> (low), 1ULL << (len)) +#define BF32_ENCODE(x, low, len) (P2PHASE((x), 1U << (len)) << (low)) +#define BF64_ENCODE(x, low, len) (P2PHASE((x), 1ULL << (len)) << (low)) + +#define BF32_GET(x, low, len) BF32_DECODE(x, low, len) +#define BF64_GET(x, low, len) BF64_DECODE(x, low, len) + +#define BF32_SET(x, low, len, val) do { \ + ASSERT3U(val, <, 1U << (len)); \ + ASSERT3U(low + len, <=, 32); \ + (x) ^= BF32_ENCODE((x >> low) ^ (val), low, len); \ +_NOTE(CONSTCOND) } while (0) + +#define BF64_SET(x, low, len, val) do { \ + ASSERT3U(val, <, 1ULL << (len)); \ + ASSERT3U(low + len, <=, 64); \ + ((x) ^= BF64_ENCODE((x >> low) ^ (val), low, len)); \ +_NOTE(CONSTCOND) } while (0) + +#define BF32_GET_SB(x, low, len, shift, bias) \ + ((BF32_GET(x, low, len) + (bias)) << (shift)) +#define BF64_GET_SB(x, low, len, shift, bias) \ + ((BF64_GET(x, low, len) + (bias)) << (shift)) + +#define BF32_SET_SB(x, low, len, shift, bias, val) do { \ + ASSERT(IS_P2ALIGNED(val, 1U << shift)); \ + ASSERT3S((val) >> (shift), >=, bias); \ + BF32_SET(x, low, len, ((val) >> (shift)) - (bias)); \ +_NOTE(CONSTCOND) } while (0) +#define BF64_SET_SB(x, low, len, shift, bias, val) do { \ + ASSERT(IS_P2ALIGNED(val, 1ULL << shift)); \ + ASSERT3S((val) >> (shift), >=, bias); \ + BF64_SET(x, low, len, ((val) >> (shift)) - (bias)); \ +_NOTE(CONSTCOND) } while (0) + +/* + * We currently support block sizes from 512 bytes to 16MB. + * The benefits of larger blocks, and thus larger IO, need to be weighed + * against the cost of COWing a giant block to modify one byte, and the + * large latency of reading or writing a large block. + * + * Note that although blocks up to 16MB are supported, the recordsize + * property can not be set larger than zfs_max_recordsize (default 1MB). + * See the comment near zfs_max_recordsize in dsl_dataset.c for details. + * + * Note that although the LSIZE field of the blkptr_t can store sizes up + * to 32MB, the dnode's dn_datablkszsec can only store sizes up to + * 32MB - 512 bytes. Therefore, we limit SPA_MAXBLOCKSIZE to 16MB. + */ +#define SPA_MINBLOCKSHIFT 9 +#define SPA_OLD_MAXBLOCKSHIFT 17 +#define SPA_MAXBLOCKSHIFT 24 +#define SPA_MINBLOCKSIZE (1ULL << SPA_MINBLOCKSHIFT) +#define SPA_OLD_MAXBLOCKSIZE (1ULL << SPA_OLD_MAXBLOCKSHIFT) +#define SPA_MAXBLOCKSIZE (1ULL << SPA_MAXBLOCKSHIFT) + +/* + * Default maximum supported logical ashift. + * + * The current 8k allocation block size limit is due to the 8k + * aligned/sized operations performed by vdev_probe() on + * vdev_label->vl_pad2. Using another "safe region" for these tests + * would allow the limit to be raised to 16k, at the expense of + * only having 8 available uberblocks in the label area. + */ +#define SPA_MAXASHIFT 13 + +/* + * Default minimum supported logical ashift. 
+ */ +#define SPA_MINASHIFT SPA_MINBLOCKSHIFT + +/* + * Size of block to hold the configuration data (a packed nvlist) + */ +#define SPA_CONFIG_BLOCKSIZE (1ULL << 14) + +/* + * The DVA size encodings for LSIZE and PSIZE support blocks up to 32MB. + * The ASIZE encoding should be at least 64 times larger (6 more bits) + * to support up to 4-way RAID-Z mirror mode with worst-case gang block + * overhead, three DVAs per bp, plus one more bit in case we do anything + * else that expands the ASIZE. + */ +#define SPA_LSIZEBITS 16 /* LSIZE up to 32M (2^16 * 512) */ +#define SPA_PSIZEBITS 16 /* PSIZE up to 32M (2^16 * 512) */ +#define SPA_ASIZEBITS 24 /* ASIZE up to 64 times larger */ + +#define SPA_COMPRESSBITS 7 +#define SPA_VDEVBITS 24 + +/* + * All SPA data is represented by 128-bit data virtual addresses (DVAs). + * The members of the dva_t should be considered opaque outside the SPA. + */ +typedef struct dva { + uint64_t dva_word[2]; +} dva_t; + +/* + * Each block has a 256-bit checksum -- strong enough for cryptographic hashes. + */ +typedef struct zio_cksum { + uint64_t zc_word[4]; +} zio_cksum_t; + +/* + * Some checksums/hashes need a 256-bit initialization salt. This salt is kept + * secret and is suitable for use in MAC algorithms as the key. + */ +typedef struct zio_cksum_salt { + uint8_t zcs_bytes[32]; +} zio_cksum_salt_t; + +/* + * Each block is described by its DVAs, time of birth, checksum, etc. + * The word-by-word, bit-by-bit layout of the blkptr is as follows: + * + * 64 56 48 40 32 24 16 8 0 + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 0 | pad | vdev1 | GRID | ASIZE | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 1 |G| offset1 | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 2 | pad | vdev2 | GRID | ASIZE | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 3 |G| offset2 | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 4 | pad | vdev3 | GRID | ASIZE | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 5 |G| offset3 | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 6 |BDX|lvl| type | cksum |E| comp| PSIZE | LSIZE | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 7 | padding | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 8 | padding | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 9 | physical birth txg | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * a | logical birth txg | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * b | fill count | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * c | checksum[0] | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * d | checksum[1] | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * e | checksum[2] | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * f | checksum[3] | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * + * Legend: + * + * vdev virtual device ID + * offset offset into virtual device + * LSIZE logical size + * PSIZE physical size (after compression) + * ASIZE allocated size (including RAID-Z parity and gang block headers) + * GRID RAID-Z layout information (reserved for future use) + * cksum checksum function + * comp compression function + * G gang block indicator + * B byteorder (endianness) + * D 
dedup + * X encryption (on version 30, which is not supported) + * E blkptr_t contains embedded data (see below) + * lvl level of indirection + * type DMU object type + * phys birth txg when dva[0] was written; zero if same as logical birth txg + * note that typically all the dva's would be written in this + * txg, but they could be different if they were moved by + * device removal. + * log. birth transaction group in which the block was logically born + * fill count number of non-zero blocks under this bp + * checksum[4] 256-bit checksum of the data this bp describes + */ + +/* + * "Embedded" blkptr_t's don't actually point to a block, instead they + * have a data payload embedded in the blkptr_t itself. See the comment + * in blkptr.c for more details. + * + * The blkptr_t is laid out as follows: + * + * 64 56 48 40 32 24 16 8 0 + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 0 | payload | + * 1 | payload | + * 2 | payload | + * 3 | payload | + * 4 | payload | + * 5 | payload | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 6 |BDX|lvl| type | etype |E| comp| PSIZE| LSIZE | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 7 | payload | + * 8 | payload | + * 9 | payload | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * a | logical birth txg | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * b | payload | + * c | payload | + * d | payload | + * e | payload | + * f | payload | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * + * Legend: + * + * payload contains the embedded data + * B (byteorder) byteorder (endianness) + * D (dedup) padding (set to zero) + * X encryption (set to zero; see above) + * E (embedded) set to one + * lvl indirection level + * type DMU object type + * etype how to interpret embedded data (BP_EMBEDDED_TYPE_*) + * comp compression function of payload + * PSIZE size of payload after compression, in bytes + * LSIZE logical size of payload, in bytes + * note that 25 bits is enough to store the largest + * "normal" BP's LSIZE (2^16 * 2^9) in bytes + * log. birth transaction group in which the block was logically born + * + * Note that LSIZE and PSIZE are stored in bytes, whereas for non-embedded + * bp's they are stored in units of SPA_MINBLOCKSHIFT. + * Generally, the generic BP_GET_*() macros can be used on embedded BP's. + * The B, D, X, lvl, type, and comp fields are stored the same as with normal + * BP's so the BP_SET_* macros can be used with them. etype, PSIZE, LSIZE must + * be set with the BPE_SET_* macros. BP_SET_EMBEDDED() should be called before + * other macros, as they assert that they are only used on BP's of the correct + * "embedded-ness". 
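+ * + * As an illustrative sketch (hypothetical caller; the real decode logic + * lives in blkptr.c), reading an embedded bp might look like: + * + * if (BP_IS_EMBEDDED(bp) && + * BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) { + * uint64_t lsize = BPE_GET_LSIZE(bp); (in bytes) + * ... decompress the BPE_GET_PSIZE(bp)-byte payload into + * an lsize-byte buffer ... + * }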
+ */ + +#define BPE_GET_ETYPE(bp) \ + (ASSERT(BP_IS_EMBEDDED(bp)), \ + BF64_GET((bp)->blk_prop, 40, 8)) +#define BPE_SET_ETYPE(bp, t) do { \ + ASSERT(BP_IS_EMBEDDED(bp)); \ + BF64_SET((bp)->blk_prop, 40, 8, t); \ +_NOTE(CONSTCOND) } while (0) + +#define BPE_GET_LSIZE(bp) \ + (ASSERT(BP_IS_EMBEDDED(bp)), \ + BF64_GET_SB((bp)->blk_prop, 0, 25, 0, 1)) +#define BPE_SET_LSIZE(bp, x) do { \ + ASSERT(BP_IS_EMBEDDED(bp)); \ + BF64_SET_SB((bp)->blk_prop, 0, 25, 0, 1, x); \ +_NOTE(CONSTCOND) } while (0) + +#define BPE_GET_PSIZE(bp) \ + (ASSERT(BP_IS_EMBEDDED(bp)), \ + BF64_GET_SB((bp)->blk_prop, 25, 7, 0, 1)) +#define BPE_SET_PSIZE(bp, x) do { \ + ASSERT(BP_IS_EMBEDDED(bp)); \ + BF64_SET_SB((bp)->blk_prop, 25, 7, 0, 1, x); \ +_NOTE(CONSTCOND) } while (0) + +typedef enum bp_embedded_type { + BP_EMBEDDED_TYPE_DATA, + BP_EMBEDDED_TYPE_RESERVED, /* Reserved for an unintegrated feature. */ + NUM_BP_EMBEDDED_TYPES = BP_EMBEDDED_TYPE_RESERVED +} bp_embedded_type_t; + +#define BPE_NUM_WORDS 14 +#define BPE_PAYLOAD_SIZE (BPE_NUM_WORDS * sizeof (uint64_t)) +#define BPE_IS_PAYLOADWORD(bp, wp) \ + ((wp) != &(bp)->blk_prop && (wp) != &(bp)->blk_birth) + +#define SPA_BLKPTRSHIFT 7 /* blkptr_t is 128 bytes */ +#define SPA_DVAS_PER_BP 3 /* Number of DVAs in a bp */ +#define SPA_SYNC_MIN_VDEVS 3 /* min vdevs to update during sync */ + +/* + * A block is a hole when it has either 1) never been written to, or + * 2) been zero-filled. In both cases, ZFS can return all zeroes for all reads + * without physically allocating disk space. Holes are represented in the + * blkptr_t structure by zeroed blk_dva. Correct checking for holes is + * done through the BP_IS_HOLE macro. The logical size, level, + * DMU object type, and birth times are also stored for holes that + * were written to at some point (i.e. were punched after having been filled). + */ +typedef struct blkptr { + dva_t blk_dva[SPA_DVAS_PER_BP]; /* Data Virtual Addresses */ + uint64_t blk_prop; /* size, compression, type, etc */ + uint64_t blk_pad[2]; /* Extra space for the future */ + uint64_t blk_phys_birth; /* txg when block was allocated */ + uint64_t blk_birth; /* transaction group at birth */ + uint64_t blk_fill; /* fill count */ + zio_cksum_t blk_cksum; /* 256-bit checksum */ +} blkptr_t; + +/* + * Macros to get and set fields in a bp or DVA. + */ +#define DVA_GET_ASIZE(dva) \ + BF64_GET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, SPA_MINBLOCKSHIFT, 0) +#define DVA_SET_ASIZE(dva, x) \ + BF64_SET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, \ + SPA_MINBLOCKSHIFT, 0, x) + +#define DVA_GET_GRID(dva) BF64_GET((dva)->dva_word[0], 24, 8) +#define DVA_SET_GRID(dva, x) BF64_SET((dva)->dva_word[0], 24, 8, x) + +#define DVA_GET_VDEV(dva) BF64_GET((dva)->dva_word[0], 32, SPA_VDEVBITS) +#define DVA_SET_VDEV(dva, x) \ + BF64_SET((dva)->dva_word[0], 32, SPA_VDEVBITS, x) + +#define DVA_GET_OFFSET(dva) \ + BF64_GET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0) +#define DVA_SET_OFFSET(dva, x) \ + BF64_SET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0, x) + +#define DVA_GET_GANG(dva) BF64_GET((dva)->dva_word[1], 63, 1) +#define DVA_SET_GANG(dva, x) BF64_SET((dva)->dva_word[1], 63, 1, x) + +#define BP_GET_LSIZE(bp) \ + (BP_IS_EMBEDDED(bp) ? \ + (BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA ?
BPE_GET_LSIZE(bp) : 0): \ + BF64_GET_SB((bp)->blk_prop, 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1)) +#define BP_SET_LSIZE(bp, x) do { \ + ASSERT(!BP_IS_EMBEDDED(bp)); \ + BF64_SET_SB((bp)->blk_prop, \ + 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \ +_NOTE(CONSTCOND) } while (0) + +#define BP_GET_PSIZE(bp) \ + (BP_IS_EMBEDDED(bp) ? 0 : \ + BF64_GET_SB((bp)->blk_prop, 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1)) +#define BP_SET_PSIZE(bp, x) do { \ + ASSERT(!BP_IS_EMBEDDED(bp)); \ + BF64_SET_SB((bp)->blk_prop, \ + 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \ +_NOTE(CONSTCOND) } while (0) + +#define BP_GET_COMPRESS(bp) \ + BF64_GET((bp)->blk_prop, 32, SPA_COMPRESSBITS) +#define BP_SET_COMPRESS(bp, x) \ + BF64_SET((bp)->blk_prop, 32, SPA_COMPRESSBITS, x) + +#define BP_IS_EMBEDDED(bp) BF64_GET((bp)->blk_prop, 39, 1) +#define BP_SET_EMBEDDED(bp, x) BF64_SET((bp)->blk_prop, 39, 1, x) + +#define BP_GET_CHECKSUM(bp) \ + (BP_IS_EMBEDDED(bp) ? ZIO_CHECKSUM_OFF : \ + BF64_GET((bp)->blk_prop, 40, 8)) +#define BP_SET_CHECKSUM(bp, x) do { \ + ASSERT(!BP_IS_EMBEDDED(bp)); \ + BF64_SET((bp)->blk_prop, 40, 8, x); \ +_NOTE(CONSTCOND) } while (0) + +#define BP_GET_TYPE(bp) BF64_GET((bp)->blk_prop, 48, 8) +#define BP_SET_TYPE(bp, x) BF64_SET((bp)->blk_prop, 48, 8, x) + +#define BP_GET_LEVEL(bp) BF64_GET((bp)->blk_prop, 56, 5) +#define BP_SET_LEVEL(bp, x) BF64_SET((bp)->blk_prop, 56, 5, x) + +#define BP_GET_DEDUP(bp) BF64_GET((bp)->blk_prop, 62, 1) +#define BP_SET_DEDUP(bp, x) BF64_SET((bp)->blk_prop, 62, 1, x) + +#define BP_GET_BYTEORDER(bp) BF64_GET((bp)->blk_prop, 63, 1) +#define BP_SET_BYTEORDER(bp, x) BF64_SET((bp)->blk_prop, 63, 1, x) + +#define BP_PHYSICAL_BIRTH(bp) \ + (BP_IS_EMBEDDED(bp) ? 0 : \ + (bp)->blk_phys_birth ? (bp)->blk_phys_birth : (bp)->blk_birth) + +#define BP_SET_BIRTH(bp, logical, physical) \ +{ \ + ASSERT(!BP_IS_EMBEDDED(bp)); \ + (bp)->blk_birth = (logical); \ + (bp)->blk_phys_birth = ((logical) == (physical) ? 0 : (physical)); \ +} + +#define BP_GET_FILL(bp) (BP_IS_EMBEDDED(bp) ? 1 : (bp)->blk_fill) + +#define BP_IS_METADATA(bp) \ + (BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp))) + +#define BP_GET_ASIZE(bp) \ + (BP_IS_EMBEDDED(bp) ? 0 : \ + DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \ + DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \ + DVA_GET_ASIZE(&(bp)->blk_dva[2])) + +#define BP_GET_UCSIZE(bp) \ + (BP_IS_METADATA(bp) ? BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp)) + +#define BP_GET_NDVAS(bp) \ + (BP_IS_EMBEDDED(bp) ? 0 : \ + !!DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \ + !!DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \ + !!DVA_GET_ASIZE(&(bp)->blk_dva[2])) + +#define BP_COUNT_GANG(bp) \ + (BP_IS_EMBEDDED(bp) ? 
0 : \ + (DVA_GET_GANG(&(bp)->blk_dva[0]) + \ + DVA_GET_GANG(&(bp)->blk_dva[1]) + \ + DVA_GET_GANG(&(bp)->blk_dva[2]))) + +#define DVA_EQUAL(dva1, dva2) \ + ((dva1)->dva_word[1] == (dva2)->dva_word[1] && \ + (dva1)->dva_word[0] == (dva2)->dva_word[0]) + +#define BP_EQUAL(bp1, bp2) \ + (BP_PHYSICAL_BIRTH(bp1) == BP_PHYSICAL_BIRTH(bp2) && \ + (bp1)->blk_birth == (bp2)->blk_birth && \ + DVA_EQUAL(&(bp1)->blk_dva[0], &(bp2)->blk_dva[0]) && \ + DVA_EQUAL(&(bp1)->blk_dva[1], &(bp2)->blk_dva[1]) && \ + DVA_EQUAL(&(bp1)->blk_dva[2], &(bp2)->blk_dva[2])) + +#define ZIO_CHECKSUM_EQUAL(zc1, zc2) \ + (0 == (((zc1).zc_word[0] - (zc2).zc_word[0]) | \ + ((zc1).zc_word[1] - (zc2).zc_word[1]) | \ + ((zc1).zc_word[2] - (zc2).zc_word[2]) | \ + ((zc1).zc_word[3] - (zc2).zc_word[3]))) + +#define ZIO_CHECKSUM_IS_ZERO(zc) \ + (0 == ((zc)->zc_word[0] | (zc)->zc_word[1] | \ + (zc)->zc_word[2] | (zc)->zc_word[3])) + +#define ZIO_CHECKSUM_BSWAP(zcp) \ +{ \ + (zcp)->zc_word[0] = BSWAP_64((zcp)->zc_word[0]); \ + (zcp)->zc_word[1] = BSWAP_64((zcp)->zc_word[1]); \ + (zcp)->zc_word[2] = BSWAP_64((zcp)->zc_word[2]); \ + (zcp)->zc_word[3] = BSWAP_64((zcp)->zc_word[3]); \ +} + + +#define DVA_IS_VALID(dva) (DVA_GET_ASIZE(dva) != 0) + +#define ZIO_SET_CHECKSUM(zcp, w0, w1, w2, w3) \ +{ \ + (zcp)->zc_word[0] = w0; \ + (zcp)->zc_word[1] = w1; \ + (zcp)->zc_word[2] = w2; \ + (zcp)->zc_word[3] = w3; \ +} + +#define BP_IDENTITY(bp) (ASSERT(!BP_IS_EMBEDDED(bp)), &(bp)->blk_dva[0]) +#define BP_IS_GANG(bp) \ + (BP_IS_EMBEDDED(bp) ? B_FALSE : DVA_GET_GANG(BP_IDENTITY(bp))) +#define DVA_IS_EMPTY(dva) ((dva)->dva_word[0] == 0ULL && \ + (dva)->dva_word[1] == 0ULL) +#define BP_IS_HOLE(bp) \ + (!BP_IS_EMBEDDED(bp) && DVA_IS_EMPTY(BP_IDENTITY(bp))) + +/* BP_IS_RAIDZ(bp) assumes no block compression */ +#define BP_IS_RAIDZ(bp) (DVA_GET_ASIZE(&(bp)->blk_dva[0]) > \ + BP_GET_PSIZE(bp)) + +#define BP_ZERO(bp) \ +{ \ + (bp)->blk_dva[0].dva_word[0] = 0; \ + (bp)->blk_dva[0].dva_word[1] = 0; \ + (bp)->blk_dva[1].dva_word[0] = 0; \ + (bp)->blk_dva[1].dva_word[1] = 0; \ + (bp)->blk_dva[2].dva_word[0] = 0; \ + (bp)->blk_dva[2].dva_word[1] = 0; \ + (bp)->blk_prop = 0; \ + (bp)->blk_pad[0] = 0; \ + (bp)->blk_pad[1] = 0; \ + (bp)->blk_phys_birth = 0; \ + (bp)->blk_birth = 0; \ + (bp)->blk_fill = 0; \ + ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0); \ +} + +#if BYTE_ORDER == _BIG_ENDIAN +#define ZFS_HOST_BYTEORDER (0ULL) +#else +#define ZFS_HOST_BYTEORDER (1ULL) +#endif + +#define BP_SHOULD_BYTESWAP(bp) (BP_GET_BYTEORDER(bp) != ZFS_HOST_BYTEORDER) + +#define BP_SPRINTF_LEN 320 + +/* + * This macro allows code sharing between zfs, libzpool, and mdb. + * 'func' is either snprintf() or mdb_snprintf(). + * 'ws' (whitespace) can be ' ' for single-line format, '\n' for multi-line. 
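+ * + * Typical usage would be along these lines (a sketch; 'type', 'checksum' + * and 'compress' are preformatted name strings supplied by the caller): + * + * char blkbuf[BP_SPRINTF_LEN]; + * SNPRINTF_BLKPTR(snprintf, ' ', blkbuf, sizeof (blkbuf), bp, + * type, checksum, compress);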
+ */ +#define SNPRINTF_BLKPTR(func, ws, buf, size, bp, type, checksum, compress) \ +{ \ + static const char *copyname[] = \ + { "zero", "single", "double", "triple" }; \ + int len = 0; \ + int copies = 0; \ + \ + if (bp == NULL) { \ + len += func(buf + len, size - len, "<NULL>"); \ + } else if (BP_IS_HOLE(bp)) { \ + len += func(buf + len, size - len, \ + "HOLE [L%llu %s] " \ + "size=%llxL birth=%lluL", \ + (u_longlong_t)BP_GET_LEVEL(bp), \ + type, \ + (u_longlong_t)BP_GET_LSIZE(bp), \ + (u_longlong_t)bp->blk_birth); \ + } else if (BP_IS_EMBEDDED(bp)) { \ + len = func(buf + len, size - len, \ + "EMBEDDED [L%llu %s] et=%u %s " \ + "size=%llxL/%llxP birth=%lluL", \ + (u_longlong_t)BP_GET_LEVEL(bp), \ + type, \ + (int)BPE_GET_ETYPE(bp), \ + compress, \ + (u_longlong_t)BPE_GET_LSIZE(bp), \ + (u_longlong_t)BPE_GET_PSIZE(bp), \ + (u_longlong_t)bp->blk_birth); \ + } else { \ + for (int d = 0; d < BP_GET_NDVAS(bp); d++) { \ + const dva_t *dva = &bp->blk_dva[d]; \ + if (DVA_IS_VALID(dva)) \ + copies++; \ + len += func(buf + len, size - len, \ + "DVA[%d]=<%llu:%llx:%llx>%c", d, \ + (u_longlong_t)DVA_GET_VDEV(dva), \ + (u_longlong_t)DVA_GET_OFFSET(dva), \ + (u_longlong_t)DVA_GET_ASIZE(dva), \ + ws); \ + } \ + if (BP_IS_GANG(bp) && \ + DVA_GET_ASIZE(&bp->blk_dva[2]) <= \ + DVA_GET_ASIZE(&bp->blk_dva[1]) / 2) \ + copies--; \ + len += func(buf + len, size - len, \ + "[L%llu %s] %s %s %s %s %s %s%c" \ + "size=%llxL/%llxP birth=%lluL/%lluP fill=%llu%c" \ + "cksum=%llx:%llx:%llx:%llx", \ + (u_longlong_t)BP_GET_LEVEL(bp), \ + type, \ + checksum, \ + compress, \ + BP_GET_BYTEORDER(bp) == 0 ? "BE" : "LE", \ + BP_IS_GANG(bp) ? "gang" : "contiguous", \ + BP_GET_DEDUP(bp) ? "dedup" : "unique", \ + copyname[copies], \ + ws, \ + (u_longlong_t)BP_GET_LSIZE(bp), \ + (u_longlong_t)BP_GET_PSIZE(bp), \ + (u_longlong_t)bp->blk_birth, \ + (u_longlong_t)BP_PHYSICAL_BIRTH(bp), \ + (u_longlong_t)BP_GET_FILL(bp), \ + ws, \ + (u_longlong_t)bp->blk_cksum.zc_word[0], \ + (u_longlong_t)bp->blk_cksum.zc_word[1], \ + (u_longlong_t)bp->blk_cksum.zc_word[2], \ + (u_longlong_t)bp->blk_cksum.zc_word[3]); \ + } \ + ASSERT(len < size); \ +} + +#define BP_GET_BUFC_TYPE(bp) \ + (BP_IS_METADATA(bp) ? 
ARC_BUFC_METADATA : ARC_BUFC_DATA) + +typedef enum spa_import_type { + SPA_IMPORT_EXISTING, + SPA_IMPORT_ASSEMBLE +} spa_import_type_t; + +/* state manipulation functions */ +extern int spa_open(const char *pool, spa_t **, void *tag); +extern int spa_open_rewind(const char *pool, spa_t **, void *tag, + nvlist_t *policy, nvlist_t **config); +extern int spa_get_stats(const char *pool, nvlist_t **config, char *altroot, + size_t buflen); +extern int spa_create(const char *pool, nvlist_t *config, nvlist_t *props, + nvlist_t *zplprops); +#ifdef illumos +extern int spa_import_rootpool(char *devpath, char *devid); +#else +extern int spa_import_rootpool(const char *name); +#endif +extern int spa_import(const char *pool, nvlist_t *config, nvlist_t *props, + uint64_t flags); +extern nvlist_t *spa_tryimport(nvlist_t *tryconfig); +extern int spa_destroy(char *pool); +extern int spa_checkpoint(const char *pool); +extern int spa_checkpoint_discard(const char *pool); +extern int spa_export(char *pool, nvlist_t **oldconfig, boolean_t force, + boolean_t hardforce); +extern int spa_reset(char *pool); +extern void spa_async_request(spa_t *spa, int flag); +extern void spa_async_unrequest(spa_t *spa, int flag); +extern void spa_async_suspend(spa_t *spa); +extern void spa_async_resume(spa_t *spa); +extern spa_t *spa_inject_addref(char *pool); +extern void spa_inject_delref(spa_t *spa); +extern void spa_scan_stat_init(spa_t *spa); +extern int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps); + +#define SPA_ASYNC_CONFIG_UPDATE 0x01 +#define SPA_ASYNC_REMOVE 0x02 +#define SPA_ASYNC_PROBE 0x04 +#define SPA_ASYNC_RESILVER_DONE 0x08 +#define SPA_ASYNC_RESILVER 0x10 +#define SPA_ASYNC_AUTOEXPAND 0x20 +#define SPA_ASYNC_REMOVE_DONE 0x40 +#define SPA_ASYNC_REMOVE_STOP 0x80 +#define SPA_ASYNC_INITIALIZE_RESTART 0x100 + +/* + * Controls the behavior of spa_vdev_remove(). 
+ */ +#define SPA_REMOVE_UNSPARE 0x01 +#define SPA_REMOVE_DONE 0x02 + +/* device manipulation */ +extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot); +extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, + int replacing); +extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, + int replace_done); +extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare); +extern boolean_t spa_vdev_remove_active(spa_t *spa); +extern int spa_vdev_initialize(spa_t *spa, uint64_t guid, uint64_t cmd_type); +extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath); +extern int spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru); +extern int spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, + nvlist_t *props, boolean_t exp); + +/* spare state (which is global across all pools) */ +extern void spa_spare_add(vdev_t *vd); +extern void spa_spare_remove(vdev_t *vd); +extern boolean_t spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt); +extern void spa_spare_activate(vdev_t *vd); + +/* L2ARC state (which is global across all pools) */ +extern void spa_l2cache_add(vdev_t *vd); +extern void spa_l2cache_remove(vdev_t *vd); +extern boolean_t spa_l2cache_exists(uint64_t guid, uint64_t *pool); +extern void spa_l2cache_activate(vdev_t *vd); +extern void spa_l2cache_drop(spa_t *spa); + +/* scanning */ +extern int spa_scan(spa_t *spa, pool_scan_func_t func); +extern int spa_scan_stop(spa_t *spa); +extern int spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t flag); + +/* spa syncing */ +extern void spa_sync(spa_t *spa, uint64_t txg); /* only for DMU use */ +extern void spa_sync_allpools(void); + +/* spa namespace global mutex */ +extern kmutex_t spa_namespace_lock; + +/* + * SPA configuration functions in spa_config.c + */ + +#define SPA_CONFIG_UPDATE_POOL 0 +#define SPA_CONFIG_UPDATE_VDEVS 1 + +extern void spa_write_cachefile(spa_t *, boolean_t, boolean_t); +extern void spa_config_load(void); +extern nvlist_t *spa_all_configs(uint64_t *); +extern void spa_config_set(spa_t *spa, nvlist_t *config); +extern nvlist_t *spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, + int getstats); +extern void spa_config_update(spa_t *spa, int what); + +/* + * Miscellaneous SPA routines in spa_misc.c + */ + +/* Namespace manipulation */ +extern spa_t *spa_lookup(const char *name); +extern spa_t *spa_add(const char *name, nvlist_t *config, const char *altroot); +extern void spa_remove(spa_t *spa); +extern spa_t *spa_next(spa_t *prev); + +/* Refcount functions */ +extern void spa_open_ref(spa_t *spa, void *tag); +extern void spa_close(spa_t *spa, void *tag); +extern void spa_async_close(spa_t *spa, void *tag); +extern boolean_t spa_refcount_zero(spa_t *spa); + +#define SCL_NONE 0x00 +#define SCL_CONFIG 0x01 +#define SCL_STATE 0x02 +#define SCL_L2ARC 0x04 /* hack until L2ARC 2.0 */ +#define SCL_ALLOC 0x08 +#define SCL_ZIO 0x10 +#define SCL_FREE 0x20 +#define SCL_VDEV 0x40 +#define SCL_LOCKS 7 +#define SCL_ALL ((1 << SCL_LOCKS) - 1) +#define SCL_STATE_ALL (SCL_STATE | SCL_L2ARC | SCL_ZIO) + +/* Pool configuration locks */ +extern int spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw); +extern void spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw); +extern void spa_config_exit(spa_t *spa, int locks, void *tag); +extern int spa_config_held(spa_t *spa, int locks, krw_t rw); + +/* Pool vdev add/remove lock */ +extern uint64_t spa_vdev_enter(spa_t *spa); +extern uint64_t spa_vdev_config_enter(spa_t *spa); 
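The vdev add/remove lock declared just above is used as a strict enter/exit bracket: spa_vdev_enter() returns the txg in which the configuration change will be committed, and spa_vdev_exit() tears the bracket down and hands the caller's error back (in the real spa.c it also waits for that txg to sync). A hedged sketch of the calling convention; my_add_vdev is a hypothetical caller, not a real ZFS routine:

/* Hypothetical in-kernel caller illustrating the bracket. */
static int
my_add_vdev(spa_t *spa, nvlist_t *nvroot)
{
	uint64_t txg;
	int error = 0;

	txg = spa_vdev_enter(spa);	/* config locks taken as writer */

	/* ... build and graft the new top-level vdev here ... */

	/* Drops the locks and returns the error to the caller. */
	return (spa_vdev_exit(spa, NULL, txg, error));
}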
+extern void spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, + int error, char *tag); +extern int spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error); + +/* Pool vdev state change lock */ +extern void spa_vdev_state_enter(spa_t *spa, int oplock); +extern int spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error); + +/* Log state */ +typedef enum spa_log_state { + SPA_LOG_UNKNOWN = 0, /* unknown log state */ + SPA_LOG_MISSING, /* missing log(s) */ + SPA_LOG_CLEAR, /* clear the log(s) */ + SPA_LOG_GOOD, /* log(s) are good */ +} spa_log_state_t; + +extern spa_log_state_t spa_get_log_state(spa_t *spa); +extern void spa_set_log_state(spa_t *spa, spa_log_state_t state); +extern int spa_reset_logs(spa_t *spa); + +/* Log claim callback */ +extern void spa_claim_notify(zio_t *zio); + +/* Accessor functions */ +extern boolean_t spa_shutting_down(spa_t *spa); +extern struct dsl_pool *spa_get_dsl(spa_t *spa); +extern boolean_t spa_is_initializing(spa_t *spa); +extern boolean_t spa_indirect_vdevs_loaded(spa_t *spa); +extern blkptr_t *spa_get_rootblkptr(spa_t *spa); +extern void spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp); +extern void spa_altroot(spa_t *, char *, size_t); +extern int spa_sync_pass(spa_t *spa); +extern char *spa_name(spa_t *spa); +extern uint64_t spa_guid(spa_t *spa); +extern uint64_t spa_load_guid(spa_t *spa); +extern uint64_t spa_last_synced_txg(spa_t *spa); +extern uint64_t spa_first_txg(spa_t *spa); +extern uint64_t spa_syncing_txg(spa_t *spa); +extern uint64_t spa_final_dirty_txg(spa_t *spa); +extern uint64_t spa_version(spa_t *spa); +extern pool_state_t spa_state(spa_t *spa); +extern spa_load_state_t spa_load_state(spa_t *spa); +extern uint64_t spa_freeze_txg(spa_t *spa); +extern uint64_t spa_get_worst_case_asize(spa_t *spa, uint64_t lsize); +extern uint64_t spa_get_dspace(spa_t *spa); +extern uint64_t spa_get_checkpoint_space(spa_t *spa); +extern uint64_t spa_get_slop_space(spa_t *spa); +extern void spa_update_dspace(spa_t *spa); +extern uint64_t spa_version(spa_t *spa); +extern boolean_t spa_deflate(spa_t *spa); +extern metaslab_class_t *spa_normal_class(spa_t *spa); +extern metaslab_class_t *spa_log_class(spa_t *spa); +extern void spa_evicting_os_register(spa_t *, objset_t *os); +extern void spa_evicting_os_deregister(spa_t *, objset_t *os); +extern void spa_evicting_os_wait(spa_t *spa); +extern int spa_max_replication(spa_t *spa); +extern int spa_prev_software_version(spa_t *spa); +extern int spa_busy(void); +extern uint8_t spa_get_failmode(spa_t *spa); +extern boolean_t spa_suspended(spa_t *spa); +extern uint64_t spa_bootfs(spa_t *spa); +extern uint64_t spa_delegation(spa_t *spa); +extern objset_t *spa_meta_objset(spa_t *spa); +extern uint64_t spa_deadman_synctime(spa_t *spa); +extern uint64_t spa_dirty_data(spa_t *spa); + +/* Miscellaneous support routines */ +extern void spa_load_failed(spa_t *spa, const char *fmt, ...); +extern void spa_load_note(spa_t *spa, const char *fmt, ...); +extern void spa_activate_mos_feature(spa_t *spa, const char *feature, + dmu_tx_t *tx); +extern void spa_deactivate_mos_feature(spa_t *spa, const char *feature); +extern int spa_rename(const char *oldname, const char *newname); +extern spa_t *spa_by_guid(uint64_t pool_guid, uint64_t device_guid); +extern boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid); +extern char *spa_strdup(const char *); +extern void spa_strfree(char *); +extern uint64_t spa_get_random(uint64_t range); +extern uint64_t spa_generate_guid(spa_t *spa); +extern void 
snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp); +extern void spa_freeze(spa_t *spa); +extern int spa_change_guid(spa_t *spa); +extern void spa_upgrade(spa_t *spa, uint64_t version); +extern void spa_evict_all(void); +extern vdev_t *spa_lookup_by_guid(spa_t *spa, uint64_t guid, + boolean_t l2cache); +extern boolean_t spa_has_spare(spa_t *, uint64_t guid); +extern uint64_t dva_get_dsize_sync(spa_t *spa, const dva_t *dva); +extern uint64_t bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp); +extern uint64_t bp_get_dsize(spa_t *spa, const blkptr_t *bp); +extern boolean_t spa_has_slogs(spa_t *spa); +extern boolean_t spa_is_root(spa_t *spa); +extern boolean_t spa_writeable(spa_t *spa); +extern boolean_t spa_has_pending_synctask(spa_t *spa); +extern int spa_maxblocksize(spa_t *spa); +extern int spa_maxdnodesize(spa_t *spa); +extern boolean_t spa_has_checkpoint(spa_t *spa); +extern boolean_t spa_importing_readonly_checkpoint(spa_t *spa); +extern boolean_t spa_suspend_async_destroy(spa_t *spa); +extern uint64_t spa_min_claim_txg(spa_t *spa); +extern void zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp); +extern boolean_t zfs_dva_valid(spa_t *spa, const dva_t *dva, + const blkptr_t *bp); +typedef void (*spa_remap_cb_t)(uint64_t vdev, uint64_t offset, uint64_t size, + void *arg); +extern boolean_t spa_remap_blkptr(spa_t *spa, blkptr_t *bp, + spa_remap_cb_t callback, void *arg); +extern uint64_t spa_get_last_removal_txg(spa_t *spa); +extern boolean_t spa_trust_config(spa_t *spa); +extern uint64_t spa_missing_tvds_allowed(spa_t *spa); +extern void spa_set_missing_tvds(spa_t *spa, uint64_t missing); +extern boolean_t spa_top_vdevs_spacemap_addressable(spa_t *spa); + +extern int spa_mode(spa_t *spa); +extern uint64_t zfs_strtonum(const char *str, char **nptr); + +extern char *spa_his_ievent_table[]; + +extern void spa_history_create_obj(spa_t *spa, dmu_tx_t *tx); +extern int spa_history_get(spa_t *spa, uint64_t *offset, uint64_t *len_read, + char *his_buf); +extern int spa_history_log(spa_t *spa, const char *his_buf); +extern int spa_history_log_nvl(spa_t *spa, nvlist_t *nvl); +extern void spa_history_log_version(spa_t *spa, const char *operation); +extern void spa_history_log_internal(spa_t *spa, const char *operation, + dmu_tx_t *tx, const char *fmt, ...); +extern void spa_history_log_internal_ds(struct dsl_dataset *ds, const char *op, + dmu_tx_t *tx, const char *fmt, ...); +extern void spa_history_log_internal_dd(dsl_dir_t *dd, const char *operation, + dmu_tx_t *tx, const char *fmt, ...); + +/* error handling */ +struct zbookmark_phys; +extern void spa_log_error(spa_t *spa, zio_t *zio); +extern void zfs_ereport_post(const char *cls, spa_t *spa, vdev_t *vd, + zio_t *zio, uint64_t stateoroffset, uint64_t length); +extern void zfs_post_remove(spa_t *spa, vdev_t *vd); +extern void zfs_post_state_change(spa_t *spa, vdev_t *vd); +extern void zfs_post_autoreplace(spa_t *spa, vdev_t *vd); +extern uint64_t spa_get_errlog_size(spa_t *spa); +extern int spa_get_errlog(spa_t *spa, void *uaddr, size_t *count); +extern void spa_errlog_rotate(spa_t *spa); +extern void spa_errlog_drain(spa_t *spa); +extern void spa_errlog_sync(spa_t *spa, uint64_t txg); +extern void spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub); + +/* vdev cache */ +extern void vdev_cache_stat_init(void); +extern void vdev_cache_stat_fini(void); + +/* Initialization and termination */ +extern void spa_init(int flags); +extern void spa_fini(void); +extern void spa_boot_init(void); + +/* properties */ +extern int 
spa_prop_set(spa_t *spa, nvlist_t *nvp); +extern int spa_prop_get(spa_t *spa, nvlist_t **nvp); +extern void spa_prop_clear_bootfs(spa_t *spa, uint64_t obj, dmu_tx_t *tx); +extern void spa_configfile_set(spa_t *, nvlist_t *, boolean_t); + +/* asynchronous event notification */ +extern void spa_event_notify(spa_t *spa, vdev_t *vdev, nvlist_t *hist_nvl, + const char *name); +extern sysevent_t *spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, + const char *name); +extern void spa_event_post(sysevent_t *ev); +extern void spa_event_discard(sysevent_t *ev); + +#ifdef ZFS_DEBUG +#define dprintf_bp(bp, fmt, ...) do { \ + if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ + char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP); \ + snprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, (bp)); \ + dprintf(fmt " %s\n", __VA_ARGS__, __blkbuf); \ + kmem_free(__blkbuf, BP_SPRINTF_LEN); \ + } \ +_NOTE(CONSTCOND) } while (0) +#else +#define dprintf_bp(bp, fmt, ...) +#endif + +extern int spa_mode_global; /* mode, e.g. FREAD | FWRITE */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SPA_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_boot.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_boot.h new file mode 100644 index 000000000000..8df5072a55ef --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_boot.h @@ -0,0 +1,48 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + +#ifndef _SYS_SPA_BOOT_H +#define _SYS_SPA_BOOT_H + +#include <sys/nvpair.h> + +#ifdef __cplusplus +extern "C" { +#endif + +extern char *spa_get_bootprop(char *prop); +extern void spa_free_bootprop(char *prop); + +extern void spa_arch_init(void); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SPA_BOOT_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_checkpoint.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_checkpoint.h new file mode 100644 index 000000000000..a5c8560149a4 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_checkpoint.h @@ -0,0 +1,44 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2017 by Delphix. All rights reserved. + */ + +#ifndef _SYS_SPA_CHECKPOINT_H +#define _SYS_SPA_CHECKPOINT_H + +#include <sys/zthr.h> + +typedef struct spa_checkpoint_info { + uint64_t sci_timestamp; /* when checkpointed uberblock was synced */ + uint64_t sci_dspace; /* disk space used by checkpoint in bytes */ +} spa_checkpoint_info_t; + +int spa_checkpoint(const char *); +int spa_checkpoint_discard(const char *); + +boolean_t spa_checkpoint_discard_thread_check(void *, zthr_t *); +int spa_checkpoint_discard_thread(void *, zthr_t *); + +int spa_checkpoint_get_stats(spa_t *, pool_checkpoint_stat_t *); + +#endif /* _SYS_SPA_CHECKPOINT_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h new file mode 100644 index 000000000000..f69dde66dd9b --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h @@ -0,0 +1,423 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright 2013 Saso Kiselkov. All rights reserved. + * Copyright (c) 2017 Datto Inc. 
+ */ + +#ifndef _SYS_SPA_IMPL_H +#define _SYS_SPA_IMPL_H + +#include <sys/spa.h> +#include <sys/spa_checkpoint.h> +#include <sys/vdev.h> +#include <sys/vdev_removal.h> +#include <sys/metaslab.h> +#include <sys/dmu.h> +#include <sys/dsl_pool.h> +#include <sys/uberblock_impl.h> +#include <sys/zfs_context.h> +#include <sys/avl.h> +#include <sys/refcount.h> +#include <sys/bplist.h> +#include <sys/bpobj.h> +#include <sys/zfeature.h> +#include <sys/zthr.h> +#include <zfeature_common.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct spa_error_entry { + zbookmark_phys_t se_bookmark; + char *se_name; + avl_node_t se_avl; +} spa_error_entry_t; + +typedef struct spa_history_phys { + uint64_t sh_pool_create_len; /* ending offset of zpool create */ + uint64_t sh_phys_max_off; /* physical EOF */ + uint64_t sh_bof; /* logical BOF */ + uint64_t sh_eof; /* logical EOF */ + uint64_t sh_records_lost; /* num of records overwritten */ +} spa_history_phys_t; + +/* + * All members must be uint64_t, for byteswap purposes. + */ +typedef struct spa_removing_phys { + uint64_t sr_state; /* dsl_scan_state_t */ + + /* + * The vdev ID that we most recently attempted to remove, + * or -1 if no removal has been attempted. + */ + uint64_t sr_removing_vdev; + + /* + * The vdev ID that we most recently successfully removed, + * or -1 if no devices have been removed. + */ + uint64_t sr_prev_indirect_vdev; + + uint64_t sr_start_time; + uint64_t sr_end_time; + + /* + * Note that we can not use the space map's or indirect mapping's + * accounting as a substitute for these values, because we need to + * count frees of not-yet-copied data as though it did the copy. + * Otherwise, we could get into a situation where copied > to_copy, + * or we complete before copied == to_copy. + */ + uint64_t sr_to_copy; /* bytes that need to be copied */ + uint64_t sr_copied; /* bytes that have been copied or freed */ +} spa_removing_phys_t; + +/* + * This struct is stored as an entry in the DMU_POOL_DIRECTORY_OBJECT + * (with key DMU_POOL_CONDENSING_INDIRECT). It is present if a condense + * of an indirect vdev's mapping object is in progress. + */ +typedef struct spa_condensing_indirect_phys { + /* + * The vdev ID of the indirect vdev whose indirect mapping is + * being condensed. + */ + uint64_t scip_vdev; + + /* + * The vdev's old obsolete spacemap. This spacemap's contents are + * being integrated into the new mapping. + */ + uint64_t scip_prev_obsolete_sm_object; + + /* + * The new mapping object that is being created. + */ + uint64_t scip_next_mapping_object; +} spa_condensing_indirect_phys_t; + +struct spa_aux_vdev { + uint64_t sav_object; /* MOS object for device list */ + nvlist_t *sav_config; /* cached device config */ + vdev_t **sav_vdevs; /* devices */ + int sav_count; /* number devices */ + boolean_t sav_sync; /* sync the device list */ + nvlist_t **sav_pending; /* pending device additions */ + uint_t sav_npending; /* # pending devices */ +}; + +typedef struct spa_config_lock { + kmutex_t scl_lock; + kthread_t *scl_writer; + int scl_write_wanted; + kcondvar_t scl_cv; + refcount_t scl_count; +} spa_config_lock_t; + +typedef struct spa_config_dirent { + list_node_t scd_link; + char *scd_path; +} spa_config_dirent_t; + +typedef enum zio_taskq_type { + ZIO_TASKQ_ISSUE = 0, + ZIO_TASKQ_ISSUE_HIGH, + ZIO_TASKQ_INTERRUPT, + ZIO_TASKQ_INTERRUPT_HIGH, + ZIO_TASKQ_TYPES +} zio_taskq_type_t; + +/* + * State machine for the zpool-poolname process. 
The state transitions + * are done as follows: + * + * From To Routine + * PROC_NONE -> PROC_CREATED spa_activate() + * PROC_CREATED -> PROC_ACTIVE spa_thread() + * PROC_ACTIVE -> PROC_DEACTIVATE spa_deactivate() + * PROC_DEACTIVATE -> PROC_GONE spa_thread() + * PROC_GONE -> PROC_NONE spa_deactivate() + */ +typedef enum spa_proc_state { + SPA_PROC_NONE, /* spa_proc = &p0, no process created */ + SPA_PROC_CREATED, /* spa_activate() has proc, is waiting */ + SPA_PROC_ACTIVE, /* taskqs created, spa_proc set */ + SPA_PROC_DEACTIVATE, /* spa_deactivate() requests process exit */ + SPA_PROC_GONE /* spa_thread() is exiting, spa_proc = &p0 */ +} spa_proc_state_t; + +typedef struct spa_taskqs { + uint_t stqs_count; + taskq_t **stqs_taskq; +} spa_taskqs_t; + +typedef enum spa_all_vdev_zap_action { + AVZ_ACTION_NONE = 0, + AVZ_ACTION_DESTROY, /* Destroy all per-vdev ZAPs and the AVZ. */ + AVZ_ACTION_REBUILD, /* Populate the new AVZ, see spa_avz_rebuild */ + AVZ_ACTION_INITIALIZE +} spa_avz_action_t; + +typedef enum spa_config_source { + SPA_CONFIG_SRC_NONE = 0, + SPA_CONFIG_SRC_SCAN, /* scan of path (default: /dev/dsk) */ + SPA_CONFIG_SRC_CACHEFILE, /* any cachefile */ + SPA_CONFIG_SRC_TRYIMPORT, /* returned from call to tryimport */ + SPA_CONFIG_SRC_SPLIT, /* new pool in a pool split */ + SPA_CONFIG_SRC_MOS /* MOS, but not always from right txg */ +} spa_config_source_t; + +struct spa { + /* + * Fields protected by spa_namespace_lock. + */ + char spa_name[ZFS_MAX_DATASET_NAME_LEN]; /* pool name */ + char *spa_comment; /* comment */ + avl_node_t spa_avl; /* node in spa_namespace_avl */ + nvlist_t *spa_config; /* last synced config */ + nvlist_t *spa_config_syncing; /* currently syncing config */ + nvlist_t *spa_config_splitting; /* config for splitting */ + nvlist_t *spa_load_info; /* info and errors from load */ + uint64_t spa_config_txg; /* txg of last config change */ + int spa_sync_pass; /* iterate-to-convergence */ + pool_state_t spa_state; /* pool state */ + int spa_inject_ref; /* injection references */ + uint8_t spa_sync_on; /* sync threads are running */ + spa_load_state_t spa_load_state; /* current load operation */ + boolean_t spa_indirect_vdevs_loaded; /* mappings loaded? */ + boolean_t spa_trust_config; /* do we trust vdev tree? */ + spa_config_source_t spa_config_source; /* where config comes from? */ + uint64_t spa_import_flags; /* import specific flags */ + spa_taskqs_t spa_zio_taskq[ZIO_TYPES][ZIO_TASKQ_TYPES]; + dsl_pool_t *spa_dsl_pool; + boolean_t spa_is_initializing; /* true while opening pool */ + metaslab_class_t *spa_normal_class; /* normal data class */ + metaslab_class_t *spa_log_class; /* intent log data class */ + uint64_t spa_first_txg; /* first txg after spa_open() */ + uint64_t spa_final_txg; /* txg of export/destroy */ + uint64_t spa_freeze_txg; /* freeze pool at this txg */ + uint64_t spa_load_max_txg; /* best initial ub_txg */ + uint64_t spa_claim_max_txg; /* highest claimed birth txg */ + timespec_t spa_loaded_ts; /* 1st successful open time */ + objset_t *spa_meta_objset; /* copy of dp->dp_meta_objset */ + kmutex_t spa_evicting_os_lock; /* Evicting objset list lock */ + list_t spa_evicting_os_list; /* Objsets being evicted. 
*/ + kcondvar_t spa_evicting_os_cv; /* Objset Eviction Completion */ + txg_list_t spa_vdev_txg_list; /* per-txg dirty vdev list */ + vdev_t *spa_root_vdev; /* top-level vdev container */ + int spa_min_ashift; /* of vdevs in normal class */ + int spa_max_ashift; /* of vdevs in normal class */ + uint64_t spa_config_guid; /* config pool guid */ + uint64_t spa_load_guid; /* spa_load initialized guid */ + uint64_t spa_last_synced_guid; /* last synced guid */ + list_t spa_config_dirty_list; /* vdevs with dirty config */ + list_t spa_state_dirty_list; /* vdevs with dirty state */ + /* + * spa_alloc_locks and spa_alloc_trees are arrays, whose lengths are + * stored in spa_alloc_count. There is one tree and one lock for each + * allocator, to help improve allocation performance in write-heavy + * workloads. + */ + kmutex_t *spa_alloc_locks; + avl_tree_t *spa_alloc_trees; + int spa_alloc_count; + + spa_aux_vdev_t spa_spares; /* hot spares */ + spa_aux_vdev_t spa_l2cache; /* L2ARC cache devices */ + nvlist_t *spa_label_features; /* Features for reading MOS */ + uint64_t spa_config_object; /* MOS object for pool config */ + uint64_t spa_config_generation; /* config generation number */ + uint64_t spa_syncing_txg; /* txg currently syncing */ + bpobj_t spa_deferred_bpobj; /* deferred-free bplist */ + bplist_t spa_free_bplist[TXG_SIZE]; /* bplist of stuff to free */ + zio_cksum_salt_t spa_cksum_salt; /* secret salt for cksum */ + /* checksum context templates */ + kmutex_t spa_cksum_tmpls_lock; + void *spa_cksum_tmpls[ZIO_CHECKSUM_FUNCTIONS]; + uberblock_t spa_ubsync; /* last synced uberblock */ + uberblock_t spa_uberblock; /* current uberblock */ + boolean_t spa_extreme_rewind; /* rewind past deferred frees */ + uint64_t spa_last_io; /* lbolt of last non-scan I/O */ + kmutex_t spa_scrub_lock; /* resilver/scrub lock */ + uint64_t spa_scrub_inflight; /* in-flight scrub bytes */ + uint64_t spa_load_verify_ios; /* in-flight verifications IOs */ + kcondvar_t spa_scrub_io_cv; /* scrub I/O completion */ + uint8_t spa_scrub_active; /* active or suspended? */ + uint8_t spa_scrub_type; /* type of scrub we're doing */ + uint8_t spa_scrub_finished; /* indicator to rotate logs */ + uint8_t spa_scrub_started; /* started since last boot */ + uint8_t spa_scrub_reopen; /* scrub doing vdev_reopen */ + uint64_t spa_scan_pass_start; /* start time per pass/reboot */ + uint64_t spa_scan_pass_scrub_pause; /* scrub pause time */ + uint64_t spa_scan_pass_scrub_spent_paused; /* total paused */ + uint64_t spa_scan_pass_exam; /* examined bytes per pass */ + uint64_t spa_scan_pass_issued; /* issued bytes per pass */ + kmutex_t spa_async_lock; /* protect async state */ + kthread_t *spa_async_thread; /* thread doing async task */ + kthread_t *spa_async_thread_vd; /* thread doing vd async task */ + int spa_async_suspended; /* async tasks suspended */ + kcondvar_t spa_async_cv; /* wait for thread_exit() */ + uint16_t spa_async_tasks; /* async task mask */ + uint64_t spa_missing_tvds; /* unopenable tvds on load */ + uint64_t spa_missing_tvds_allowed; /* allow loading spa? */ + + spa_removing_phys_t spa_removing_phys; + spa_vdev_removal_t *spa_vdev_removal; + + spa_condensing_indirect_phys_t spa_condensing_indirect_phys; + spa_condensing_indirect_t *spa_condensing_indirect; + zthr_t *spa_condense_zthr; /* zthr doing condense. 
*/ + + uint64_t spa_checkpoint_txg; /* the txg of the checkpoint */ + spa_checkpoint_info_t spa_checkpoint_info; /* checkpoint accounting */ + zthr_t *spa_checkpoint_discard_zthr; + + char *spa_root; /* alternate root directory */ + uint64_t spa_ena; /* spa-wide ereport ENA */ + int spa_last_open_failed; /* error if last open failed */ + uint64_t spa_last_ubsync_txg; /* "best" uberblock txg */ + uint64_t spa_last_ubsync_txg_ts; /* timestamp from that ub */ + uint64_t spa_load_txg; /* ub txg that loaded */ + uint64_t spa_load_txg_ts; /* timestamp from that ub */ + uint64_t spa_load_meta_errors; /* verify metadata err count */ + uint64_t spa_load_data_errors; /* verify data err count */ + uint64_t spa_verify_min_txg; /* start txg of verify scrub */ + kmutex_t spa_errlog_lock; /* error log lock */ + uint64_t spa_errlog_last; /* last error log object */ + uint64_t spa_errlog_scrub; /* scrub error log object */ + kmutex_t spa_errlist_lock; /* error list/ereport lock */ + avl_tree_t spa_errlist_last; /* last error list */ + avl_tree_t spa_errlist_scrub; /* scrub error list */ + uint64_t spa_deflate; /* should we deflate? */ + uint64_t spa_history; /* history object */ + kmutex_t spa_history_lock; /* history lock */ + vdev_t *spa_pending_vdev; /* pending vdev additions */ + kmutex_t spa_props_lock; /* property lock */ + uint64_t spa_pool_props_object; /* object for properties */ + uint64_t spa_bootfs; /* default boot filesystem */ + uint64_t spa_failmode; /* failure mode for the pool */ + uint64_t spa_delegation; /* delegation on/off */ + list_t spa_config_list; /* previous cache file(s) */ + /* per-CPU array of root of async I/O: */ + zio_t **spa_async_zio_root; + zio_t *spa_suspend_zio_root; /* root of all suspended I/O */ + zio_t *spa_txg_zio[TXG_SIZE]; /* spa_sync() waits for this */ + kmutex_t spa_suspend_lock; /* protects suspend_zio_root */ + kcondvar_t spa_suspend_cv; /* notification of resume */ + uint8_t spa_suspended; /* pool is suspended */ + uint8_t spa_claiming; /* pool is doing zil_claim() */ + boolean_t spa_is_root; /* pool is root */ + int spa_minref; /* num refs when first opened */ + int spa_mode; /* FREAD | FWRITE */ + spa_log_state_t spa_log_state; /* log state */ + uint64_t spa_autoexpand; /* lun expansion on/off */ + uint64_t spa_bootsize; /* efi system partition size */ + ddt_t *spa_ddt[ZIO_CHECKSUM_FUNCTIONS]; /* in-core DDTs */ + uint64_t spa_ddt_stat_object; /* DDT statistics */ + uint64_t spa_dedup_ditto; /* dedup ditto threshold */ + uint64_t spa_dedup_checksum; /* default dedup checksum */ + uint64_t spa_dspace; /* dspace in normal class */ + kmutex_t spa_vdev_top_lock; /* dueling offline/remove */ + kmutex_t spa_proc_lock; /* protects spa_proc* */ + kcondvar_t spa_proc_cv; /* spa_proc_state transitions */ + spa_proc_state_t spa_proc_state; /* see definition */ + struct proc *spa_proc; /* "zpool-poolname" process */ + uint64_t spa_did; /* if procp != p0, did of t1 */ + kthread_t *spa_trim_thread; /* thread sending TRIM I/Os */ + kmutex_t spa_trim_lock; /* protects spa_trim_cv */ + kcondvar_t spa_trim_cv; /* used to notify TRIM thread */ + boolean_t spa_autoreplace; /* autoreplace set in open */ + int spa_vdev_locks; /* locks grabbed */ + uint64_t spa_creation_version; /* version at pool creation */ + uint64_t spa_prev_software_version; /* See ub_software_version */ + uint64_t spa_feat_for_write_obj; /* required to write to pool */ + uint64_t spa_feat_for_read_obj; /* required to read from pool */ + uint64_t spa_feat_desc_obj; /* Feature descriptions */ + uint64_t 
spa_feat_enabled_txg_obj; /* Feature enabled txg */ + kmutex_t spa_feat_stats_lock; /* protects spa_feat_stats */ + nvlist_t *spa_feat_stats; /* Cache of enabled features */ + /* cache feature refcounts */ + uint64_t spa_feat_refcount_cache[SPA_FEATURES]; +#ifdef illumos + cyclic_id_t spa_deadman_cycid; /* cyclic id */ +#else /* !illumos */ +#ifdef _KERNEL + struct callout spa_deadman_cycid; /* callout id */ + struct task spa_deadman_task; +#endif +#endif /* illumos */ + uint64_t spa_deadman_calls; /* number of deadman calls */ + hrtime_t spa_sync_starttime; /* starting time of spa_sync */ + uint64_t spa_deadman_synctime; /* deadman expiration timer */ + uint64_t spa_all_vdev_zaps; /* ZAP of per-vd ZAP obj #s */ + spa_avz_action_t spa_avz_action; /* destroy/rebuild AVZ? */ + +#ifdef illumos + /* + * spa_iokstat_lock protects spa_iokstat and + * spa_queue_stats[]. + */ + kmutex_t spa_iokstat_lock; + struct kstat *spa_iokstat; /* kstat of io to this pool */ + struct { + int spa_active; + int spa_queued; + } spa_queue_stats[ZIO_PRIORITY_NUM_QUEUEABLE]; +#endif + /* arc_memory_throttle() parameters during low memory condition */ + uint64_t spa_lowmem_page_load; /* memory load during txg */ + uint64_t spa_lowmem_last_txg; /* txg window start */ + + hrtime_t spa_ccw_fail_time; /* Conf cache write fail time */ + + /* + * spa_refcount & spa_config_lock must be the last elements + * because refcount_t changes size based on compilation options. + * In order for the MDB module to function correctly, the other + * fields must remain in the same location. + */ + spa_config_lock_t spa_config_lock[SCL_LOCKS]; /* config changes */ + refcount_t spa_refcount; /* number of opens */ +#ifndef illumos + boolean_t spa_splitting_newspa; /* creating new spa in split */ +#endif +}; + +extern const char *spa_config_path; + +extern void spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, + task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent); +extern void spa_load_spares(spa_t *spa); +extern void spa_load_l2cache(spa_t *spa); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SPA_IMPL_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h new file mode 100644 index 000000000000..d3d852978a57 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h @@ -0,0 +1,225 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 2012, 2017 by Delphix. All rights reserved. 
+ */ + +#ifndef _SYS_SPACE_MAP_H +#define _SYS_SPACE_MAP_H + +#include <sys/avl.h> +#include <sys/range_tree.h> +#include <sys/dmu.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * The size of the space map object has increased to include a histogram. + * The SPACE_MAP_SIZE_V0 designates the original size and is used to + * maintain backward compatibility. + */ +#define SPACE_MAP_SIZE_V0 (3 * sizeof (uint64_t)) +#define SPACE_MAP_HISTOGRAM_SIZE 32 + +/* + * The space_map_phys is the on-disk representation of the space map. + * Consumers of space maps should never reference any of the members of this + * structure directly. These members may only be updated in syncing context. + * + * Note the smp_object is no longer used but remains in the structure + * for backward compatibility. + */ +typedef struct space_map_phys { + uint64_t smp_object; /* on-disk space map object */ + uint64_t smp_objsize; /* size of the object */ + int64_t smp_alloc; /* space allocated from the map */ + uint64_t smp_pad[5]; /* reserved */ + + /* + * The smp_histogram maintains a histogram of free regions. Each + * bucket, smp_histogram[i], contains the number of free regions + * whose size is: + * 2^(i+sm_shift) <= size of free region in bytes < 2^(i+sm_shift+1) + */ + uint64_t smp_histogram[SPACE_MAP_HISTOGRAM_SIZE]; +} space_map_phys_t; + +/* + * The space map object defines a region of space, its size, how much is + * allocated, and the on-disk object that stores this information. + * Consumers of space maps may only access the members of this structure. + * + * Note: the space_map may not be accessed concurrently; consumers + * must provide external locking if required. + */ +typedef struct space_map { + uint64_t sm_start; /* start of map */ + uint64_t sm_size; /* size of map */ + uint8_t sm_shift; /* unit shift */ + uint64_t sm_length; /* synced length */ + int64_t sm_alloc; /* synced space allocated */ + objset_t *sm_os; /* objset for this map */ + uint64_t sm_object; /* object id for this map */ + uint32_t sm_blksz; /* block size for space map */ + dmu_buf_t *sm_dbuf; /* space_map_phys_t dbuf */ + space_map_phys_t *sm_phys; /* on-disk space map */ +} space_map_t; + +/* + * debug entry + * + * 2 2 10 50 + * +-----+-----+------------+----------------------------------+ + * | 1 0 | act | syncpass | txg (lower bits) | + * +-----+-----+------------+----------------------------------+ + * 63 62 61 60 59 50 49 0 + * + * + * one-word entry + * + * 1 47 1 15 + * +-----------------------------------------------------------+ + * | 0 | offset (sm_shift units) | type | run | + * +-----------------------------------------------------------+ + * 63 62 16 15 14 0 + * + * + * two-word entry + * + * 2 2 36 24 + * +-----+-----+---------------------------+-------------------+ + * | 1 1 | pad | run | vdev | + * +-----+-----+---------------------------+-------------------+ + * 63 62 61 60 59 24 23 0 + * + * 1 63 + * +------+----------------------------------------------------+ + * | type | offset | + * +------+----------------------------------------------------+ + * 63 62 0 + * + * Note that a two-word entry will not straddle a block boundary. + * If necessary, the last word of a block will be padded with a + * debug entry (with act = syncpass = txg = 0). 
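To make the one-word layout above concrete, the following standalone sketch packs and unpacks a hypothetical entry: offset 0x1234 in sm_shift units and a 10-block run, with the type bit assumed to follow the maptype_t enum order defined below (SM_ALLOC = 0, SM_FREE = 1) and the run stored biased by minus one:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t off = 0x1234;	/* offset, in units of sm_shift */
	uint64_t run = 10;	/* run length; stored as run - 1 */
	uint64_t e;

	/* bit 63 = 0 (one-word), offset in 62..16, type in 15, run in 14..0 */
	e = (off << 16) | (1ULL << 15) | (run - 1);

	/* prints "offset=1234 type=1 run=10" */
	printf("offset=%llx type=%llu run=%llu\n",
	    (unsigned long long)((e >> 16) & ((1ULL << 47) - 1)),
	    (unsigned long long)((e >> 15) & 1),
	    (unsigned long long)((e & 0x7fff) + 1));
	return (0);
}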
+ */ + +typedef enum { + SM_ALLOC, + SM_FREE +} maptype_t; + +typedef struct space_map_entry { + maptype_t sme_type; + uint32_t sme_vdev; /* max is 2^24-1; SM_NO_VDEVID if not present */ + uint64_t sme_offset; /* max is 2^63-1; units of sm_shift */ + uint64_t sme_run; /* max is 2^36; units of sm_shift */ +} space_map_entry_t; + +#define SM_NO_VDEVID (1 << SPA_VDEVBITS) + +/* one-word entry constants */ +#define SM_DEBUG_PREFIX 2 +#define SM_OFFSET_BITS 47 +#define SM_RUN_BITS 15 + +/* two-word entry constants */ +#define SM2_PREFIX 3 +#define SM2_OFFSET_BITS 63 +#define SM2_RUN_BITS 36 + +#define SM_PREFIX_DECODE(x) BF64_DECODE(x, 62, 2) +#define SM_PREFIX_ENCODE(x) BF64_ENCODE(x, 62, 2) + +#define SM_DEBUG_ACTION_DECODE(x) BF64_DECODE(x, 60, 2) +#define SM_DEBUG_ACTION_ENCODE(x) BF64_ENCODE(x, 60, 2) +#define SM_DEBUG_SYNCPASS_DECODE(x) BF64_DECODE(x, 50, 10) +#define SM_DEBUG_SYNCPASS_ENCODE(x) BF64_ENCODE(x, 50, 10) +#define SM_DEBUG_TXG_DECODE(x) BF64_DECODE(x, 0, 50) +#define SM_DEBUG_TXG_ENCODE(x) BF64_ENCODE(x, 0, 50) + +#define SM_OFFSET_DECODE(x) BF64_DECODE(x, 16, SM_OFFSET_BITS) +#define SM_OFFSET_ENCODE(x) BF64_ENCODE(x, 16, SM_OFFSET_BITS) +#define SM_TYPE_DECODE(x) BF64_DECODE(x, 15, 1) +#define SM_TYPE_ENCODE(x) BF64_ENCODE(x, 15, 1) +#define SM_RUN_DECODE(x) (BF64_DECODE(x, 0, SM_RUN_BITS) + 1) +#define SM_RUN_ENCODE(x) BF64_ENCODE((x) - 1, 0, SM_RUN_BITS) +#define SM_RUN_MAX SM_RUN_DECODE(~0ULL) +#define SM_OFFSET_MAX SM_OFFSET_DECODE(~0ULL) + +#define SM2_RUN_DECODE(x) (BF64_DECODE(x, SPA_VDEVBITS, SM2_RUN_BITS) + 1) +#define SM2_RUN_ENCODE(x) BF64_ENCODE((x) - 1, SPA_VDEVBITS, SM2_RUN_BITS) +#define SM2_VDEV_DECODE(x) BF64_DECODE(x, 0, SPA_VDEVBITS) +#define SM2_VDEV_ENCODE(x) BF64_ENCODE(x, 0, SPA_VDEVBITS) +#define SM2_TYPE_DECODE(x) BF64_DECODE(x, SM2_OFFSET_BITS, 1) +#define SM2_TYPE_ENCODE(x) BF64_ENCODE(x, SM2_OFFSET_BITS, 1) +#define SM2_OFFSET_DECODE(x) BF64_DECODE(x, 0, SM2_OFFSET_BITS) +#define SM2_OFFSET_ENCODE(x) BF64_ENCODE(x, 0, SM2_OFFSET_BITS) +#define SM2_RUN_MAX SM2_RUN_DECODE(~0ULL) +#define SM2_OFFSET_MAX SM2_OFFSET_DECODE(~0ULL) + +boolean_t sm_entry_is_debug(uint64_t e); +boolean_t sm_entry_is_single_word(uint64_t e); +boolean_t sm_entry_is_double_word(uint64_t e); + +typedef int (*sm_cb_t)(space_map_entry_t *sme, void *arg); + +int space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype); +int space_map_iterate(space_map_t *sm, sm_cb_t callback, void *arg); +int space_map_incremental_destroy(space_map_t *sm, sm_cb_t callback, void *arg, + dmu_tx_t *tx); + +void space_map_histogram_clear(space_map_t *sm); +void space_map_histogram_add(space_map_t *sm, range_tree_t *rt, + dmu_tx_t *tx); + +void space_map_update(space_map_t *sm); + +uint64_t space_map_object(space_map_t *sm); +uint64_t space_map_allocated(space_map_t *sm); +uint64_t space_map_length(space_map_t *sm); + +void space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype, + uint64_t vdev_id, dmu_tx_t *tx); +uint64_t space_map_estimate_optimal_size(space_map_t *sm, range_tree_t *rt, + uint64_t vdev_id); +void space_map_truncate(space_map_t *sm, int blocksize, dmu_tx_t *tx); +uint64_t space_map_alloc(objset_t *os, int blocksize, dmu_tx_t *tx); +void space_map_free(space_map_t *sm, dmu_tx_t *tx); +void space_map_free_obj(objset_t *os, uint64_t smobj, dmu_tx_t *tx); + +int space_map_open(space_map_t **smp, objset_t *os, uint64_t object, + uint64_t start, uint64_t size, uint8_t shift); +void space_map_close(space_map_t *sm); + +int64_t space_map_alloc_delta(space_map_t 
*sm); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SPACE_MAP_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_reftree.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_reftree.h new file mode 100644 index 000000000000..249b15be6729 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_reftree.h @@ -0,0 +1,57 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 2013 by Delphix. All rights reserved. + */ + +#ifndef _SYS_SPACE_REFTREE_H +#define _SYS_SPACE_REFTREE_H + +#include <sys/range_tree.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct space_ref { + avl_node_t sr_node; /* AVL node */ + uint64_t sr_offset; /* range offset (start or end) */ + int64_t sr_refcnt; /* associated reference count */ +} space_ref_t; + +void space_reftree_create(avl_tree_t *t); +void space_reftree_destroy(avl_tree_t *t); +void space_reftree_add_seg(avl_tree_t *t, uint64_t start, uint64_t end, + int64_t refcnt); +void space_reftree_add_map(avl_tree_t *t, range_tree_t *rt, int64_t refcnt); +void space_reftree_generate_map(avl_tree_t *t, range_tree_t *rt, + int64_t minref); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SPACE_REFTREE_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/trim_map.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/trim_map.h new file mode 100644 index 000000000000..f228d0766631 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/trim_map.h @@ -0,0 +1,51 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2012 Pawel Jakub Dawidek <pawel@dawidek.net>. + * All rights reserved. 
+ */ + +#ifndef _SYS_TRIM_MAP_H +#define _SYS_TRIM_MAP_H + +#include <sys/avl.h> +#include <sys/list.h> +#include <sys/spa.h> + +#ifdef __cplusplus +extern "C" { +#endif + +extern void trim_map_create(vdev_t *vd); +extern void trim_map_destroy(vdev_t *vd); +extern void trim_map_free(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg); +extern boolean_t trim_map_write_start(zio_t *zio); +extern void trim_map_write_done(zio_t *zio); + +extern void trim_thread_create(spa_t *spa); +extern void trim_thread_destroy(spa_t *spa); +extern void trim_thread_wakeup(spa_t *spa); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_TRIM_MAP_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg.h new file mode 100644 index 000000000000..272108ea19f3 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg.h @@ -0,0 +1,131 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + */ + +#ifndef _SYS_TXG_H +#define _SYS_TXG_H + +#include <sys/spa.h> +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define TXG_CONCURRENT_STATES 3 /* open, quiescing, syncing */ +#define TXG_SIZE 4 /* next power of 2 */ +#define TXG_MASK (TXG_SIZE - 1) /* mask for size */ +#define TXG_INITIAL TXG_SIZE /* initial txg */ +#define TXG_IDX (txg & TXG_MASK) + +/* Number of txgs worth of frees we defer adding to in-core spacemaps */ +#define TXG_DEFER_SIZE 2 + +typedef struct tx_cpu tx_cpu_t; + +typedef struct txg_handle { + tx_cpu_t *th_cpu; + uint64_t th_txg; +} txg_handle_t; + +typedef struct txg_node { + struct txg_node *tn_next[TXG_SIZE]; + uint8_t tn_member[TXG_SIZE]; +} txg_node_t; + +typedef struct txg_list { + kmutex_t tl_lock; + size_t tl_offset; + spa_t *tl_spa; + txg_node_t *tl_head[TXG_SIZE]; +} txg_list_t; + +struct dsl_pool; + +extern void txg_init(struct dsl_pool *dp, uint64_t txg); +extern void txg_fini(struct dsl_pool *dp); +extern void txg_sync_start(struct dsl_pool *dp); +extern void txg_sync_stop(struct dsl_pool *dp); +extern uint64_t txg_hold_open(struct dsl_pool *dp, txg_handle_t *txghp); +extern void txg_rele_to_quiesce(txg_handle_t *txghp); +extern void txg_rele_to_sync(txg_handle_t *txghp); +extern void txg_register_callbacks(txg_handle_t *txghp, list_t *tx_callbacks); + +extern void txg_delay(struct dsl_pool *dp, uint64_t txg, hrtime_t delta, + hrtime_t resolution); +extern void txg_kick(struct dsl_pool *dp); + +/* + * Wait until the given transaction group has finished syncing. + * Try to make this happen as soon as possible (eg. 
kick off any + * necessary syncs immediately). If txg==0, wait for the currently open + * txg to finish syncing. + */ +extern void txg_wait_synced(struct dsl_pool *dp, uint64_t txg); + +/* + * Wait until the given transaction group, or one after it, is + * the open transaction group. Try to make this happen as soon + * as possible (eg. kick off any necessary syncs immediately). + * If txg == 0, wait for the next open txg. + */ +extern void txg_wait_open(struct dsl_pool *dp, uint64_t txg); + +/* + * Returns TRUE if we are "backed up" waiting for the syncing + * transaction to complete; otherwise returns FALSE. + */ +extern boolean_t txg_stalled(struct dsl_pool *dp); + +/* returns TRUE if someone is waiting for the next txg to sync */ +extern boolean_t txg_sync_waiting(struct dsl_pool *dp); + +extern void txg_verify(spa_t *spa, uint64_t txg); + +/* + * Per-txg object lists. + */ + +#define TXG_CLEAN(txg) ((txg) - 1) + +extern void txg_list_create(txg_list_t *tl, spa_t *spa, size_t offset); +extern void txg_list_destroy(txg_list_t *tl); +extern boolean_t txg_list_empty(txg_list_t *tl, uint64_t txg); +extern boolean_t txg_all_lists_empty(txg_list_t *tl); +extern boolean_t txg_list_add(txg_list_t *tl, void *p, uint64_t txg); +extern boolean_t txg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg); +extern void *txg_list_remove(txg_list_t *tl, uint64_t txg); +extern void *txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg); +extern boolean_t txg_list_member(txg_list_t *tl, void *p, uint64_t txg); +extern void *txg_list_head(txg_list_t *tl, uint64_t txg); +extern void *txg_list_next(txg_list_t *tl, void *p, uint64_t txg); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_TXG_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h new file mode 100644 index 000000000000..bf3b269d707d --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h @@ -0,0 +1,125 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 2013, 2017 by Delphix. All rights reserved. + */ + +#ifndef _SYS_TXG_IMPL_H +#define _SYS_TXG_IMPL_H + +#include <sys/spa.h> +#include <sys/txg.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * The tx_cpu structure is a per-cpu structure that is used to track + * the number of active transaction holds (tc_count). As transactions + * are assigned into a transaction group the appropriate tc_count is + * incremented to indicate that there are pending changes that have yet + * to quiesce. Consumers eventually call txg_rele_to_sync() to decrement + * the tc_count. A transaction group is not considered quiesced until all + * tx_cpu structures have reached a tc_count of zero. + * + * This structure is a per-cpu structure by design. Updates to this structure + * are frequent and concurrent. Having a single structure would result in + * heavy lock contention so a per-cpu design was implemented. With the fanned + * out mutex design, consumers only need to lock the mutex associated with + * the thread's cpu. + * + * The tx_cpu contains two locks, the tc_lock and tc_open_lock. + * The tc_lock is used to protect all members of the tx_cpu structure with + * the exception of the tc_open_lock. This lock should only be held for a + * short period of time, typically when updating the value of tc_count. + * + * The tc_open_lock protects the tx_open_txg member of the tx_state structure. + * This lock is used to ensure that transactions are only assigned into + * the current open transaction group. In order to move the current open + * transaction group to the quiesce phase, the txg_quiesce thread must + * grab all tc_open_locks, increment the tx_open_txg, and drop the locks. + * The tc_open_lock is held until the transaction is assigned into the + * transaction group. Typically, this is a short operation but if throttling + * is occurring it may be held for longer periods of time. + */ +struct tx_cpu { + kmutex_t tc_open_lock; /* protects tx_open_txg */ + kmutex_t tc_lock; /* protects the rest of this struct */ + kcondvar_t tc_cv[TXG_SIZE]; + uint64_t tc_count[TXG_SIZE]; /* tx hold count on each txg */ + list_t tc_callbacks[TXG_SIZE]; /* commit cb list */ + char tc_pad[8]; /* pad to fill 3 cache lines */ +};
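The hold protocol described in the comment above can be modeled in a few lines of userland code: a hold pins the currently open txg through one cpu's bucket and keeps that bucket's open lock held until the transaction is assigned, mirroring the txg_hold_open() / txg_rele_to_quiesce() / txg_rele_to_sync() sequence. This is a simplified sketch; struct mcpu and the function names are illustrative, and the condvar signaling done by the real txg.c is omitted:

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

#define	NCPU	4
#define	TXG_SZ	4			/* model's TXG_SIZE */

struct mcpu {
	pthread_mutex_t	open_lock;	/* plays the role of tc_open_lock */
	pthread_mutex_t	lock;		/* plays the role of tc_lock */
	uint64_t	count[TXG_SZ];	/* plays the role of tc_count */
};

static struct mcpu cpus[NCPU];
static uint64_t open_txg = 4;		/* plays the role of tx_open_txg */

/* Like txg_hold_open(): returns with this cpu's open_lock still held. */
static uint64_t
hold_open(struct mcpu *c)
{
	uint64_t txg;

	pthread_mutex_lock(&c->open_lock);
	txg = open_txg;			/* stable while open_lock is held */
	pthread_mutex_lock(&c->lock);
	c->count[txg & (TXG_SZ - 1)]++;
	pthread_mutex_unlock(&c->lock);
	return (txg);
}

/* Like txg_rele_to_quiesce(): the tx is assigned, let quiescing proceed. */
static void
rele_to_quiesce(struct mcpu *c)
{
	pthread_mutex_unlock(&c->open_lock);
}

/* Like txg_rele_to_sync(): drop the hold taken in hold_open(). */
static void
rele_to_sync(struct mcpu *c, uint64_t txg)
{
	pthread_mutex_lock(&c->lock);
	c->count[txg & (TXG_SZ - 1)]--;
	pthread_mutex_unlock(&c->lock);
}

int
main(void)
{
	for (int i = 0; i < NCPU; i++) {
		pthread_mutex_init(&cpus[i].open_lock, NULL);
		pthread_mutex_init(&cpus[i].lock, NULL);
	}

	uint64_t txg = hold_open(&cpus[0]);
	rele_to_quiesce(&cpus[0]);
	/* ... dirty data for txg here ... */
	rele_to_sync(&cpus[0], txg);
	printf("held and released txg %llu\n", (unsigned long long)txg);
	return (0);
}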
 + +/* + * The tx_state structure maintains the state information about the different + * stages of the pool's transaction groups. A per-pool tx_state structure + * is used to track this information. The tx_state structure also points to + * an array of tx_cpu structures (described above). Although the tx_sync_lock + * is used to protect the members of this structure, it is not used to + * protect the tx_open_txg. Instead a special lock in the tx_cpu structure + * is used. Readers of tx_open_txg must grab the per-cpu tc_open_lock. + * Any thread wishing to update tx_open_txg must grab the tc_open_lock on + * every cpu (see txg_quiesce()). 
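A minimal sketch of that hand-off, under the same simplifications as the model above (the real txg_quiesce() additionally waits for every per-cpu hold count on the outgoing txg to drain to zero before handing it to the sync thread):

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

#define	NCPU	4

static pthread_mutex_t open_locks[NCPU];	/* one tc_open_lock per cpu */
static uint64_t open_txg = 4;			/* plays the role of tx_open_txg */

/*
 * Advance the open txg the way the comment above describes: while every
 * per-cpu open lock is held, no new transaction can enter the old group.
 */
static uint64_t
advance_open_txg(void)
{
	uint64_t quiescing;

	for (int i = 0; i < NCPU; i++)
		pthread_mutex_lock(&open_locks[i]);
	quiescing = open_txg++;		/* the old txg begins quiescing */
	for (int i = 0; i < NCPU; i++)
		pthread_mutex_unlock(&open_locks[i]);
	return (quiescing);
}

int
main(void)
{
	for (int i = 0; i < NCPU; i++)
		pthread_mutex_init(&open_locks[i], NULL);

	uint64_t q = advance_open_txg();
	printf("txg %llu quiescing, txg %llu now open\n",
	    (unsigned long long)q, (unsigned long long)open_txg);
	return (0);
}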
+ */ +typedef struct tx_state { + tx_cpu_t *tx_cpu; /* protects access to tx_open_txg */ + kmutex_t tx_sync_lock; /* protects the rest of this struct */ + + uint64_t tx_open_txg; /* currently open txg id */ + uint64_t tx_quiescing_txg; /* currently quiescing txg id */ + uint64_t tx_quiesced_txg; /* quiesced txg waiting for sync */ + uint64_t tx_syncing_txg; /* currently syncing txg id */ + uint64_t tx_synced_txg; /* last synced txg id */ + + hrtime_t tx_open_time; /* start time of tx_open_txg */ + + uint64_t tx_sync_txg_waiting; /* txg we're waiting to sync */ + uint64_t tx_quiesce_txg_waiting; /* txg we're waiting to open */ + + kcondvar_t tx_sync_more_cv; + kcondvar_t tx_sync_done_cv; + kcondvar_t tx_quiesce_more_cv; + kcondvar_t tx_quiesce_done_cv; + kcondvar_t tx_timeout_cv; + kcondvar_t tx_exit_cv; /* wait for all threads to exit */ + + uint8_t tx_threads; /* number of threads */ + uint8_t tx_exiting; /* set when we're exiting */ + + kthread_t *tx_sync_thread; + kthread_t *tx_quiesce_thread; + + taskq_t *tx_commit_cb_taskq; /* commit callback taskq */ +} tx_state_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_TXG_IMPL_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock.h new file mode 100644 index 000000000000..21e7ae0de7a7 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock.h @@ -0,0 +1,49 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright (c) 2014 by Delphix. All rights reserved. + */ + +#ifndef _SYS_UBERBLOCK_H +#define _SYS_UBERBLOCK_H + +#include <sys/spa.h> +#include <sys/vdev.h> +#include <sys/zio.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct uberblock uberblock_t; + +extern int uberblock_verify(uberblock_t *); +extern boolean_t uberblock_update(uberblock_t *, vdev_t *, uint64_t); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_UBERBLOCK_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h new file mode 100644 index 000000000000..9a3684577dd7 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h @@ -0,0 +1,92 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2016, 2017 by Delphix. All rights reserved. + */ + +#ifndef _SYS_UBERBLOCK_IMPL_H +#define _SYS_UBERBLOCK_IMPL_H + +#include <sys/uberblock.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * The uberblock version is incremented whenever an incompatible on-disk + * format change is made to the SPA, DMU, or ZAP. + * + * Note: the first two fields should never be moved. When a storage pool + * is opened, the uberblock must be read off the disk before the version + * can be checked. If the ub_version field is moved, we may not detect + * version mismatch. If the ub_magic field is moved, applications that + * expect the magic number in the first word won't work. + */ +#define UBERBLOCK_MAGIC 0x00bab10c /* oo-ba-bloc! */ +#define UBERBLOCK_SHIFT 10 /* up to 1K */ + +struct uberblock { + uint64_t ub_magic; /* UBERBLOCK_MAGIC */ + uint64_t ub_version; /* SPA_VERSION */ + uint64_t ub_txg; /* txg of last sync */ + uint64_t ub_guid_sum; /* sum of all vdev guids */ + uint64_t ub_timestamp; /* UTC time of last sync */ + blkptr_t ub_rootbp; /* MOS objset_phys_t */ + + /* highest SPA_VERSION supported by software that wrote this txg */ + uint64_t ub_software_version; + + /* These fields are reserved for features that are under development: */ + uint64_t ub_mmp_magic; + uint64_t ub_mmp_delay; + uint64_t ub_mmp_seq; + + /* + * ub_checkpoint_txg indicates two things about the current uberblock: + * + * 1] If it is not zero then this uberblock is a checkpoint. If it is + * zero, then this uberblock is not a checkpoint. + * + * 2] On checkpointed uberblocks, the value of ub_checkpoint_txg is + * the ub_txg that the uberblock had at the time we moved it to + * the MOS config. + * + * The field is set when we checkpoint the uberblock and continues to + * hold that value even after we've rewound (unlike the ub_txg that + * is reset to a higher value). + * + * Besides checks used to determine whether we are reopening the + * pool from a checkpointed uberblock [see spa_ld_select_uberblock()], + * the value of the field is used to determine which ZIL blocks have + * been allocated according to the ms_sm when we are rewinding to a + * checkpoint. Specifically, if blk_birth > ub_checkpoint_txg, then + * the ZIL block is not allocated [see uses of spa_min_claim_txg()]. + */ + uint64_t ub_checkpoint_txg; +}; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_UBERBLOCK_IMPL_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/unique.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/unique.h new file mode 100644 index 000000000000..d4ba32e5c642 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/unique.h @@ -0,0 +1,57 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_UNIQUE_H +#define _SYS_UNIQUE_H + +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* The number of significant bits in each unique value. */ +#define UNIQUE_BITS 56 + +void unique_init(void); +void unique_fini(void); + +/* + * Return a new unique value (which will not be uniquified against until + * it is unique_insert()-ed). + */ +uint64_t unique_create(void); + +/* Return a unique value, which equals the one passed in if possible. */ +uint64_t unique_insert(uint64_t value); + +/* Indicate that this value no longer needs to be uniquified against. */ +void unique_remove(uint64_t value); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_UNIQUE_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h new file mode 100644 index 000000000000..27e1925f285b --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h @@ -0,0 +1,189 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2017 by Delphix. All rights reserved. 
+ */ + +#ifndef _SYS_VDEV_H +#define _SYS_VDEV_H + +#include <sys/spa.h> +#include <sys/zio.h> +#include <sys/dmu.h> +#include <sys/space_map.h> +#include <sys/fs/zfs.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum vdev_dtl_type { + DTL_MISSING, /* 0% replication: no copies of the data */ + DTL_PARTIAL, /* less than 100% replication: some copies missing */ + DTL_SCRUB, /* unable to fully repair during scrub/resilver */ + DTL_OUTAGE, /* temporarily missing (used to attempt detach) */ + DTL_TYPES +} vdev_dtl_type_t; + +extern boolean_t zfs_nocacheflush; +extern boolean_t zfs_trim_enabled; + +extern void vdev_dbgmsg(vdev_t *vd, const char *fmt, ...); +extern void vdev_dbgmsg_print_tree(vdev_t *, int); +extern int vdev_open(vdev_t *); +extern void vdev_open_children(vdev_t *); +extern boolean_t vdev_uses_zvols(vdev_t *); +extern int vdev_validate(vdev_t *); +extern int vdev_copy_path_strict(vdev_t *, vdev_t *); +extern void vdev_copy_path_relaxed(vdev_t *, vdev_t *); +extern void vdev_close(vdev_t *); +extern int vdev_create(vdev_t *, uint64_t txg, boolean_t isreplace); +extern void vdev_reopen(vdev_t *); +extern int vdev_validate_aux(vdev_t *vd); +extern zio_t *vdev_probe(vdev_t *vd, zio_t *pio); +extern boolean_t vdev_is_concrete(vdev_t *vd); +extern boolean_t vdev_is_bootable(vdev_t *vd); +extern vdev_t *vdev_lookup_top(spa_t *spa, uint64_t vdev); +extern vdev_t *vdev_lookup_by_guid(vdev_t *vd, uint64_t guid); +extern int vdev_count_leaves(spa_t *spa); +extern void vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t d, + uint64_t txg, uint64_t size); +extern boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t d, + uint64_t txg, uint64_t size); +extern boolean_t vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t d); +extern boolean_t vdev_dtl_need_resilver(vdev_t *vd, uint64_t off, size_t size); +extern void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, + int scrub_done); +extern boolean_t vdev_dtl_required(vdev_t *vd); +extern boolean_t vdev_resilver_needed(vdev_t *vd, + uint64_t *minp, uint64_t *maxp); +extern void vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj, + dmu_tx_t *tx); +extern uint64_t vdev_create_link_zap(vdev_t *vd, dmu_tx_t *tx); +extern void vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx); +extern void vdev_destroy_spacemaps(vdev_t *vd, dmu_tx_t *tx); +extern void vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset, + uint64_t size); +extern void spa_vdev_indirect_mark_obsolete(spa_t *spa, uint64_t vdev, + uint64_t offset, uint64_t size, dmu_tx_t *tx); + +extern void vdev_hold(vdev_t *); +extern void vdev_rele(vdev_t *); + +extern int vdev_metaslab_init(vdev_t *vd, uint64_t txg); +extern void vdev_metaslab_fini(vdev_t *vd); +extern void vdev_metaslab_set_size(vdev_t *); +extern void vdev_ashift_optimize(vdev_t *); +extern void vdev_expand(vdev_t *vd, uint64_t txg); +extern void vdev_split(vdev_t *vd); +extern void vdev_deadman(vdev_t *vd); + +extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs); +extern void vdev_clear_stats(vdev_t *vd); +extern void vdev_stat_update(zio_t *zio, uint64_t psize); +extern void vdev_scan_stat_init(vdev_t *vd); +extern void vdev_propagate_state(vdev_t *vd); +extern void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, + vdev_aux_t aux); +extern boolean_t vdev_children_are_offline(vdev_t *vd); + +extern void vdev_space_update(vdev_t *vd, + int64_t alloc_delta, int64_t defer_delta, int64_t space_delta); + +extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize); + +extern int vdev_fault(spa_t 
*spa, uint64_t guid, vdev_aux_t aux);
+extern int vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux);
+extern int vdev_online(spa_t *spa, uint64_t guid, uint64_t flags,
+ vdev_state_t *);
+extern int vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags);
+extern void vdev_clear(spa_t *spa, vdev_t *vd);
+
+extern boolean_t vdev_is_dead(vdev_t *vd);
+extern boolean_t vdev_readable(vdev_t *vd);
+extern boolean_t vdev_writeable(vdev_t *vd);
+extern boolean_t vdev_allocatable(vdev_t *vd);
+extern boolean_t vdev_accessible(vdev_t *vd, zio_t *zio);
+extern boolean_t vdev_is_spacemap_addressable(vdev_t *vd);
+
+extern void vdev_cache_init(vdev_t *vd);
+extern void vdev_cache_fini(vdev_t *vd);
+extern boolean_t vdev_cache_read(zio_t *zio);
+extern void vdev_cache_write(zio_t *zio);
+extern void vdev_cache_purge(vdev_t *vd);
+
+extern void vdev_queue_init(vdev_t *vd);
+extern void vdev_queue_fini(vdev_t *vd);
+extern zio_t *vdev_queue_io(zio_t *zio);
+extern void vdev_queue_io_done(zio_t *zio);
+extern void vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority);
+extern int vdev_queue_length(vdev_t *vd);
+extern uint64_t vdev_queue_lastoffset(vdev_t *vd);
+extern void vdev_queue_register_lastoffset(vdev_t *vd, zio_t *zio);
+
+extern void vdev_config_dirty(vdev_t *vd);
+extern void vdev_config_clean(vdev_t *vd);
+extern int vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg);
+
+extern void vdev_state_dirty(vdev_t *vd);
+extern void vdev_state_clean(vdev_t *vd);
+
+typedef enum vdev_config_flag {
+ VDEV_CONFIG_SPARE = 1 << 0,
+ VDEV_CONFIG_L2CACHE = 1 << 1,
+ VDEV_CONFIG_REMOVING = 1 << 2,
+ VDEV_CONFIG_MOS = 1 << 3,
+ VDEV_CONFIG_MISSING = 1 << 4
+} vdev_config_flag_t;
+
+extern void vdev_top_config_generate(spa_t *spa, nvlist_t *config);
+extern nvlist_t *vdev_config_generate(spa_t *spa, vdev_t *vd,
+ boolean_t getstats, vdev_config_flag_t flags);
+
+/*
+ * Label routines
+ */
+struct uberblock;
+extern uint64_t vdev_label_offset(uint64_t psize, int l, uint64_t offset);
+extern int vdev_label_number(uint64_t psize, uint64_t offset);
+extern nvlist_t *vdev_label_read_config(vdev_t *vd, uint64_t txg);
+extern void vdev_uberblock_load(vdev_t *, struct uberblock *, nvlist_t **);
+
+typedef enum {
+ VDEV_LABEL_CREATE, /* create/add a new device */
+ VDEV_LABEL_REPLACE, /* replace an existing device */
+ VDEV_LABEL_SPARE, /* add a new hot spare */
+ VDEV_LABEL_REMOVE, /* remove an existing device */
+ VDEV_LABEL_L2CACHE, /* add an L2ARC cache device */
+ VDEV_LABEL_SPLIT /* generating new label for split-off dev */
+} vdev_labeltype_t;
+
+extern int vdev_label_init(vdev_t *vd, uint64_t txg, vdev_labeltype_t reason);
+
+extern int vdev_label_write_pad2(vdev_t *vd, const char *buf, size_t size);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_VDEV_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_disk.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_disk.h
new file mode 100644
index 000000000000..61e2f273f0a0
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_disk.h
@@ -0,0 +1,67 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright (c) 2013 Joyent, Inc. All rights reserved. + * Copyright 2012 Nexenta Systems, Inc. All rights reserved. + */ + +#ifndef _SYS_VDEV_DISK_H +#define _SYS_VDEV_DISK_H + +#include <sys/vdev.h> +#ifdef _KERNEL +#include <sys/buf.h> +#include <sys/ddi.h> +#include <sys/sunldi.h> +#include <sys/sunddi.h> +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _KERNEL +typedef struct vdev_disk { + ddi_devid_t vd_devid; + char *vd_minor; + ldi_handle_t vd_lh; + list_t vd_ldi_cbs; + boolean_t vd_ldi_offline; +} vdev_disk_t; +#endif + +extern int vdev_disk_physio(vdev_t *, + caddr_t, size_t, uint64_t, int, boolean_t); + +/* + * Since vdev_disk.c is not compiled into libzpool, this function should only be + * defined in the zfs kernel module. + */ +#ifdef _KERNEL +extern int vdev_disk_ldi_physio(ldi_handle_t, caddr_t, size_t, uint64_t, int); +#endif +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VDEV_DISK_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_file.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_file.h new file mode 100644 index 000000000000..0260b4ab4f79 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_file.h @@ -0,0 +1,49 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _SYS_VDEV_FILE_H +#define _SYS_VDEV_FILE_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/vdev.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct vdev_file { + vnode_t *vf_vnode; +} vdev_file_t; + +extern void vdev_file_init(void); +extern void vdev_file_fini(void); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VDEV_FILE_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h new file mode 100644 index 000000000000..476b2b18b472 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h @@ -0,0 +1,543 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + */ + +#ifndef _SYS_VDEV_IMPL_H +#define _SYS_VDEV_IMPL_H + +#include <sys/avl.h> +#include <sys/bpobj.h> +#include <sys/dmu.h> +#include <sys/metaslab.h> +#include <sys/nvpair.h> +#include <sys/space_map.h> +#include <sys/vdev.h> +#include <sys/dkio.h> +#include <sys/uberblock_impl.h> +#include <sys/vdev_indirect_mapping.h> +#include <sys/vdev_indirect_births.h> +#include <sys/vdev_removal.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Virtual device descriptors. + * + * All storage pool operations go through the virtual device framework, + * which provides data replication and I/O scheduling. + */ + +/* + * Forward declarations that lots of things need. 
+ */
+typedef struct vdev_queue vdev_queue_t;
+typedef struct vdev_cache vdev_cache_t;
+typedef struct vdev_cache_entry vdev_cache_entry_t;
+struct abd;
+
+extern int zfs_vdev_queue_depth_pct;
+extern int zfs_vdev_def_queue_depth;
+extern uint32_t zfs_vdev_async_write_max_active;
+
+/*
+ * Virtual device operations
+ */
+typedef int vdev_open_func_t(vdev_t *vd, uint64_t *size, uint64_t *max_size,
+ uint64_t *logical_ashift, uint64_t *physical_ashift);
+typedef void vdev_close_func_t(vdev_t *vd);
+typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize);
+typedef void vdev_io_start_func_t(zio_t *zio);
+typedef void vdev_io_done_func_t(zio_t *zio);
+typedef void vdev_state_change_func_t(vdev_t *vd, int, int);
+typedef boolean_t vdev_need_resilver_func_t(vdev_t *vd, uint64_t, size_t);
+typedef void vdev_hold_func_t(vdev_t *vd);
+typedef void vdev_rele_func_t(vdev_t *vd);
+
+typedef void vdev_remap_cb_t(uint64_t inner_offset, vdev_t *vd,
+ uint64_t offset, uint64_t size, void *arg);
+typedef void vdev_remap_func_t(vdev_t *vd, uint64_t offset, uint64_t size,
+ vdev_remap_cb_t callback, void *arg);
+/*
+ * Given a target vdev, translates the logical range "in" to the physical
+ * range "res".
+ */
+typedef void vdev_xlation_func_t(vdev_t *cvd, const range_seg_t *in,
+ range_seg_t *res);
+
+typedef struct vdev_ops {
+ vdev_open_func_t *vdev_op_open;
+ vdev_close_func_t *vdev_op_close;
+ vdev_asize_func_t *vdev_op_asize;
+ vdev_io_start_func_t *vdev_op_io_start;
+ vdev_io_done_func_t *vdev_op_io_done;
+ vdev_state_change_func_t *vdev_op_state_change;
+ vdev_need_resilver_func_t *vdev_op_need_resilver;
+ vdev_hold_func_t *vdev_op_hold;
+ vdev_rele_func_t *vdev_op_rele;
+ vdev_remap_func_t *vdev_op_remap;
+ /*
+ * For translating ranges from non-leaf vdevs (e.g. raidz) to leaves.
+ * Used when initializing vdevs. Isn't used by leaf ops.
+ */
+ vdev_xlation_func_t *vdev_op_xlate;
+ char vdev_op_type[16];
+ boolean_t vdev_op_leaf;
+} vdev_ops_t;
+
+/*
+ * Virtual device properties
+ */
+struct vdev_cache_entry {
+ struct abd *ve_abd;
+ uint64_t ve_offset;
+ uint64_t ve_lastused;
+ avl_node_t ve_offset_node;
+ avl_node_t ve_lastused_node;
+ uint32_t ve_hits;
+ uint16_t ve_missed_update;
+ zio_t *ve_fill_io;
+};
+
+struct vdev_cache {
+ avl_tree_t vc_offset_tree;
+ avl_tree_t vc_lastused_tree;
+ kmutex_t vc_lock;
+};
+
+typedef struct vdev_queue_class {
+ uint32_t vqc_active;
+
+ /*
+ * Sorted by offset or timestamp, depending on whether the queue is
+ * LBA-ordered or FIFO.
+ */
+ avl_tree_t vqc_queued_tree;
+} vdev_queue_class_t;
+
+struct vdev_queue {
+ vdev_t *vq_vdev;
+ vdev_queue_class_t vq_class[ZIO_PRIORITY_NUM_QUEUEABLE];
+ avl_tree_t vq_active_tree;
+ avl_tree_t vq_read_offset_tree;
+ avl_tree_t vq_write_offset_tree;
+ uint64_t vq_last_offset;
+ hrtime_t vq_io_complete_ts; /* time last i/o completed */
+ kmutex_t vq_lock;
+ uint64_t vq_lastoffset;
+};
+
+/*
+ * On-disk indirect vdev state.
+ *
+ * An indirect vdev is described exclusively in the MOS config of a pool.
+ * The config for an indirect vdev includes several fields, which are
+ * accessed in memory by a vdev_indirect_config_t.
+ */
+typedef struct vdev_indirect_config {
+ /*
+ * Object (in MOS) which contains the indirect mapping. This object
+ * contains an array of vdev_indirect_mapping_entry_phys_t ordered by
+ * vimep_src. The bonus buffer for this object is a
+ * vdev_indirect_mapping_phys_t. This object is allocated when a vdev
+ * removal is initiated.
+ *
+ * Note that this object can be empty if none of the data on the vdev
+ * has been copied yet.
+ */
+ uint64_t vic_mapping_object;
+
+ /*
+ * Object (in MOS) which contains the birth times for the mapping
+ * entries. This object contains an array of
+ * vdev_indirect_birth_entry_phys_t sorted by vibe_offset. The bonus
+ * buffer for this object is a vdev_indirect_birth_phys_t. This object
+ * is allocated when a vdev removal is initiated.
+ *
+ * Note that this object can be empty if none of the data on the vdev
+ * has been copied yet.
+ */
+ uint64_t vic_births_object;
+
+ /*
+ * This is the vdev ID which was removed prior to this vdev, or
+ * UINT64_MAX if there are no previously removed vdevs.
+ */
+ uint64_t vic_prev_indirect_vdev;
+} vdev_indirect_config_t;
+
+/*
+ * Virtual device descriptor
+ */
+struct vdev {
+ /*
+ * Common to all vdev types.
+ */
+ uint64_t vdev_id; /* child number in vdev parent */
+ uint64_t vdev_guid; /* unique ID for this vdev */
+ uint64_t vdev_guid_sum; /* self guid + all child guids */
+ uint64_t vdev_orig_guid; /* orig. guid prior to remove */
+ uint64_t vdev_asize; /* allocatable device capacity */
+ uint64_t vdev_min_asize; /* min acceptable asize */
+ uint64_t vdev_max_asize; /* max acceptable asize */
+ uint64_t vdev_ashift; /* block alignment shift */
+ /*
+ * Logical block alignment shift
+ *
+ * The smallest sized/aligned I/O supported by the device.
+ */
+ uint64_t vdev_logical_ashift;
+ /*
+ * Physical block alignment shift
+ *
+ * The device supports logical I/Os with vdev_logical_ashift
+ * size/alignment, but optimum performance will be achieved by
+ * aligning/sizing requests to vdev_physical_ashift. Smaller
+ * requests may be inflated or incur device level read-modify-write
+ * operations.
+ *
+ * May be 0 to indicate no preference (i.e. use vdev_logical_ashift).
+ */
+ uint64_t vdev_physical_ashift;
+ uint64_t vdev_state; /* see VDEV_STATE_* #defines */
+ uint64_t vdev_prevstate; /* used when reopening a vdev */
+ vdev_ops_t *vdev_ops; /* vdev operations */
+ spa_t *vdev_spa; /* spa for this vdev */
+ void *vdev_tsd; /* type-specific data */
+ vnode_t *vdev_name_vp; /* vnode for pathname */
+ vnode_t *vdev_devid_vp; /* vnode for devid */
+ vdev_t *vdev_top; /* top-level vdev */
+ vdev_t *vdev_parent; /* parent vdev */
+ vdev_t **vdev_child; /* array of children */
+ uint64_t vdev_children; /* number of children */
+ vdev_stat_t vdev_stat; /* virtual device statistics */
+ boolean_t vdev_expanding; /* expand the vdev? */
+ boolean_t vdev_reopening; /* reopen in progress? */
+ boolean_t vdev_nonrot; /* true if solid state */
+ int vdev_open_error; /* error on last open */
+ kthread_t *vdev_open_thread; /* thread opening children */
+ uint64_t vdev_crtxg; /* txg when top-level was added */
+
+ /*
+ * Top-level vdev state.
+ */
+ uint64_t vdev_ms_array; /* metaslab array object */
+ uint64_t vdev_ms_shift; /* metaslab size shift */
+ uint64_t vdev_ms_count; /* number of metaslabs */
+ metaslab_group_t *vdev_mg; /* metaslab group */
+ metaslab_t **vdev_ms; /* metaslab array */
+ txg_list_t vdev_ms_list; /* per-txg dirty metaslab lists */
+ txg_list_t vdev_dtl_list; /* per-txg dirty DTL lists */
+ txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */
+ boolean_t vdev_remove_wanted; /* async remove wanted? */
+ boolean_t vdev_probe_wanted; /* async probe wanted? */
+ list_node_t vdev_config_dirty_node; /* config dirty list */
+ list_node_t vdev_state_dirty_node; /* state dirty list */
+ uint64_t vdev_deflate_ratio; /* deflation ratio (x512) */
+ uint64_t vdev_islog; /* is an intent log device */
+ uint64_t vdev_removing; /* device is being removed? */
+ boolean_t vdev_ishole; /* is a hole in the namespace */
+ kmutex_t vdev_queue_lock; /* protects vdev_queue_depth */
+ uint64_t vdev_top_zap;
+
+ /* pool checkpoint related */
+ space_map_t *vdev_checkpoint_sm; /* contains reserved blocks */
+
+ boolean_t vdev_initialize_exit_wanted;
+ vdev_initializing_state_t vdev_initialize_state;
+ kthread_t *vdev_initialize_thread;
+ /* Protects vdev_initialize_thread and vdev_initialize_state. */
+ kmutex_t vdev_initialize_lock;
+ kcondvar_t vdev_initialize_cv;
+ uint64_t vdev_initialize_offset[TXG_SIZE];
+ uint64_t vdev_initialize_last_offset;
+ range_tree_t *vdev_initialize_tree; /* valid while initializing */
+ uint64_t vdev_initialize_bytes_est;
+ uint64_t vdev_initialize_bytes_done;
+ time_t vdev_initialize_action_time; /* start and end time */
+
+ /* for limiting outstanding I/Os */
+ kmutex_t vdev_initialize_io_lock;
+ kcondvar_t vdev_initialize_io_cv;
+ uint64_t vdev_initialize_inflight;
+
+ /*
+ * Values stored in the config for an indirect or removing vdev.
+ */
+ vdev_indirect_config_t vdev_indirect_config;
+
+ /*
+ * The vdev_indirect_rwlock protects the vdev_indirect_mapping
+ * pointer from changing on indirect vdevs (when it is condensed).
+ * Note that removing (not yet indirect) vdevs have different
+ * access patterns (the mapping is not accessed from open context,
+ * e.g. from zio_read) and locking strategy (e.g. svr_lock).
+ */
+ krwlock_t vdev_indirect_rwlock;
+ vdev_indirect_mapping_t *vdev_indirect_mapping;
+ vdev_indirect_births_t *vdev_indirect_births;
+
+ /*
+ * In-memory data structures used to manage the obsolete sm, for
+ * indirect or removing vdevs.
+ *
+ * The vdev_obsolete_segments is the in-core record of the segments
+ * that are no longer referenced anywhere in the pool (due to
+ * being freed or remapped and not referenced by any snapshots).
+ * During a sync, segments are added to vdev_obsolete_segments
+ * via vdev_indirect_mark_obsolete(); at the end of each sync
+ * pass, this is appended to vdev_obsolete_sm via
+ * vdev_indirect_sync_obsolete(). The vdev_obsolete_lock
+ * protects against concurrent modifications of vdev_obsolete_segments
+ * from multiple zio threads.
+ */
+ kmutex_t vdev_obsolete_lock;
+ range_tree_t *vdev_obsolete_segments;
+ space_map_t *vdev_obsolete_sm;
+
+ /*
+ * The queue depth parameters determine how many async writes are
+ * still pending (i.e. allocated but not yet issued to disk) per
+ * top-level (vdev_async_write_queue_depth) and the maximum allowed
+ * (vdev_max_async_write_queue_depth). These values only apply to
+ * top-level vdevs.
+ */
+ uint64_t vdev_async_write_queue_depth;
+ uint64_t vdev_max_async_write_queue_depth;
+
+ /*
+ * Protects the vdev_scan_io_queue field itself as well as the
+ * structure's contents (when present).
+ */
+ kmutex_t vdev_scan_io_queue_lock;
+ struct dsl_scan_io_queue *vdev_scan_io_queue;
+
+ /*
+ * Leaf vdev state.
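+ *
+ * The vdev_dtl trees below back the vdev_dtl_*() interfaces declared
+ * in sys/vdev.h. Editor's illustration of a typical query (a sketch,
+ * not code from the implementation; 'bp_birth_txg' is hypothetical):
+ *
+ *	boolean_t needs_repair =
+ *	    vdev_dtl_contains(vd, DTL_MISSING, bp_birth_txg, 1);
+ *
+ * i.e. the block needs repair if this leaf was missing at the txg in
+ * which the block was written.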
+ */
+ range_tree_t *vdev_dtl[DTL_TYPES]; /* dirty time logs */
+ space_map_t *vdev_dtl_sm; /* dirty time log space map */
+ txg_node_t vdev_dtl_node; /* per-txg dirty DTL linkage */
+ uint64_t vdev_dtl_object; /* DTL object */
+ uint64_t vdev_psize; /* physical device capacity */
+ uint64_t vdev_wholedisk; /* true if this is a whole disk */
+ uint64_t vdev_offline; /* persistent offline state */
+ uint64_t vdev_faulted; /* persistent faulted state */
+ uint64_t vdev_degraded; /* persistent degraded state */
+ uint64_t vdev_removed; /* persistent removed state */
+ uint64_t vdev_resilver_txg; /* persistent resilvering state */
+ uint64_t vdev_nparity; /* number of parity devices for raidz */
+ char *vdev_path; /* vdev path (if any) */
+ char *vdev_devid; /* vdev devid (if any) */
+ char *vdev_physpath; /* vdev device path (if any) */
+ char *vdev_fru; /* physical FRU location */
+ uint64_t vdev_not_present; /* not present during import */
+ uint64_t vdev_unspare; /* unspare when resilvering done */
+ boolean_t vdev_nowritecache; /* true if flushwritecache failed */
+ boolean_t vdev_notrim; /* true if trim failed */
+ boolean_t vdev_checkremove; /* temporary online test */
+ boolean_t vdev_forcefault; /* force online fault */
+ boolean_t vdev_splitting; /* split or repair in progress */
+ boolean_t vdev_delayed_close; /* delayed device close? */
+ boolean_t vdev_tmpoffline; /* device taken offline temporarily? */
+ boolean_t vdev_detached; /* device detached? */
+ boolean_t vdev_cant_read; /* vdev is failing all reads */
+ boolean_t vdev_cant_write; /* vdev is failing all writes */
+ boolean_t vdev_isspare; /* was a hot spare */
+ boolean_t vdev_isl2cache; /* was an l2cache device */
+ vdev_queue_t vdev_queue; /* I/O deadline schedule queue */
+ vdev_cache_t vdev_cache; /* physical block cache */
+ spa_aux_vdev_t *vdev_aux; /* for l2cache and spares vdevs */
+ zio_t *vdev_probe_zio; /* root of current probe */
+ vdev_aux_t vdev_label_aux; /* on-disk aux state */
+ struct trim_map *vdev_trimmap; /* map of outstanding trims */
+ uint64_t vdev_leaf_zap;
+
+ /*
+ * For DTrace to work in userland (libzpool) context, these fields must
+ * remain at the end of the structure. DTrace will use the kernel's
+ * CTF definition for 'struct vdev', and since the size of a kmutex_t is
+ * larger in userland, the offsets for the rest of the fields would be
+ * incorrect.
+ */
+ kmutex_t vdev_dtl_lock; /* vdev_dtl_{map,resilver} */
+ kmutex_t vdev_stat_lock; /* vdev_stat */
+ kmutex_t vdev_probe_lock; /* protects vdev_probe_zio */
+};
+
+#define VDEV_RAIDZ_MAXPARITY 3
+
+#define VDEV_PAD_SIZE (8 << 10)
+/* 2 padding areas (vl_pad1 and vl_pad2) to skip */
+#define VDEV_SKIP_SIZE (VDEV_PAD_SIZE * 2)
+#define VDEV_PHYS_SIZE (112 << 10)
+#define VDEV_UBERBLOCK_RING (128 << 10)
+
+/* The largest uberblock we support is 8k.
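+ *
+ * Worked example (editor's illustration of the macros below): a
+ * top-level vdev with ashift 12 gets VDEV_UBERBLOCK_SHIFT =
+ * MIN(MAX(12, UBERBLOCK_SHIFT), MAX_UBERBLOCK_SHIFT) = MIN(12, 13) = 12,
+ * so each slot in the 128K uberblock ring is 4K and the ring holds
+ * VDEV_UBERBLOCK_COUNT = (128 << 10) >> 12 = 32 uberblocks.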
*/ +#define MAX_UBERBLOCK_SHIFT (13) +#define VDEV_UBERBLOCK_SHIFT(vd) \ + MIN(MAX((vd)->vdev_top->vdev_ashift, UBERBLOCK_SHIFT), \ + MAX_UBERBLOCK_SHIFT) +#define VDEV_UBERBLOCK_COUNT(vd) \ + (VDEV_UBERBLOCK_RING >> VDEV_UBERBLOCK_SHIFT(vd)) +#define VDEV_UBERBLOCK_OFFSET(vd, n) \ + offsetof(vdev_label_t, vl_uberblock[(n) << VDEV_UBERBLOCK_SHIFT(vd)]) +#define VDEV_UBERBLOCK_SIZE(vd) (1ULL << VDEV_UBERBLOCK_SHIFT(vd)) + +typedef struct vdev_phys { + char vp_nvlist[VDEV_PHYS_SIZE - sizeof (zio_eck_t)]; + zio_eck_t vp_zbt; +} vdev_phys_t; + +typedef struct vdev_label { + char vl_pad1[VDEV_PAD_SIZE]; /* 8K */ + char vl_pad2[VDEV_PAD_SIZE]; /* 8K */ + vdev_phys_t vl_vdev_phys; /* 112K */ + char vl_uberblock[VDEV_UBERBLOCK_RING]; /* 128K */ +} vdev_label_t; /* 256K total */ + +/* + * vdev_dirty() flags + */ +#define VDD_METASLAB 0x01 +#define VDD_DTL 0x02 + +/* Offset of embedded boot loader region on each label */ +#define VDEV_BOOT_OFFSET (2 * sizeof (vdev_label_t)) +/* + * Size of embedded boot loader region on each label. + * The total size of the first two labels plus the boot area is 4MB. + */ +#define VDEV_BOOT_SIZE (7ULL << 19) /* 3.5M */ + +/* + * Size of label regions at the start and end of each leaf device. + */ +#define VDEV_LABEL_START_SIZE (2 * sizeof (vdev_label_t) + VDEV_BOOT_SIZE) +#define VDEV_LABEL_END_SIZE (2 * sizeof (vdev_label_t)) +#define VDEV_LABELS 4 +#define VDEV_BEST_LABEL VDEV_LABELS + +#define VDEV_ALLOC_LOAD 0 +#define VDEV_ALLOC_ADD 1 +#define VDEV_ALLOC_SPARE 2 +#define VDEV_ALLOC_L2CACHE 3 +#define VDEV_ALLOC_ROOTPOOL 4 +#define VDEV_ALLOC_SPLIT 5 +#define VDEV_ALLOC_ATTACH 6 + +/* + * Allocate or free a vdev + */ +extern vdev_t *vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, + vdev_ops_t *ops); +extern int vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *config, + vdev_t *parent, uint_t id, int alloctype); +extern void vdev_free(vdev_t *vd); + +/* + * Add or remove children and parents + */ +extern void vdev_add_child(vdev_t *pvd, vdev_t *cvd); +extern void vdev_remove_child(vdev_t *pvd, vdev_t *cvd); +extern void vdev_compact_children(vdev_t *pvd); +extern vdev_t *vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops); +extern void vdev_remove_parent(vdev_t *cvd); + +/* + * vdev sync load and sync + */ +extern boolean_t vdev_log_state_valid(vdev_t *vd); +extern int vdev_load(vdev_t *vd); +extern int vdev_dtl_load(vdev_t *vd); +extern void vdev_sync(vdev_t *vd, uint64_t txg); +extern void vdev_sync_done(vdev_t *vd, uint64_t txg); +extern void vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg); +extern void vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg); + +/* + * Available vdev types. + */ +extern vdev_ops_t vdev_root_ops; +extern vdev_ops_t vdev_mirror_ops; +extern vdev_ops_t vdev_replacing_ops; +extern vdev_ops_t vdev_raidz_ops; +#ifdef _KERNEL +extern vdev_ops_t vdev_geom_ops; +#else +extern vdev_ops_t vdev_disk_ops; +#endif +extern vdev_ops_t vdev_file_ops; +extern vdev_ops_t vdev_missing_ops; +extern vdev_ops_t vdev_hole_ops; +extern vdev_ops_t vdev_spare_ops; +extern vdev_ops_t vdev_indirect_ops; + +/* + * Common size functions + */ +extern void vdev_default_xlate(vdev_t *vd, const range_seg_t *in, + range_seg_t *out); +extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize); +extern uint64_t vdev_get_min_asize(vdev_t *vd); +extern void vdev_set_min_asize(vdev_t *vd); + +/* + * Global variables + */ +extern int vdev_standard_sm_blksz; +/* zdb uses this tunable, so it must be declared here to make lint happy. 
*/ +extern int zfs_vdev_cache_size; +extern uint_t zfs_geom_probe_vdev_key; + +/* + * Functions from vdev_indirect.c + */ +extern void vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx); +extern boolean_t vdev_indirect_should_condense(vdev_t *vd); +extern void spa_condense_indirect_start_sync(vdev_t *vd, dmu_tx_t *tx); +extern int vdev_obsolete_sm_object(vdev_t *vd); +extern boolean_t vdev_obsolete_counts_are_precise(vdev_t *vd); + +#ifdef illumos +/* + * Other miscellaneous functions + */ +int vdev_checkpoint_sm_object(vdev_t *vd); + +/* + * The vdev_buf_t is used to translate between zio_t and buf_t, and back again. + */ +typedef struct vdev_buf { + buf_t vb_buf; /* buffer that describes the io */ + zio_t *vb_io; /* pointer back to the original zio_t */ +} vdev_buf_t; +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VDEV_IMPL_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_indirect_births.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_indirect_births.h new file mode 100644 index 000000000000..987b14485d2b --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_indirect_births.h @@ -0,0 +1,80 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2015 by Delphix. All rights reserved. + */ + +#ifndef _SYS_VDEV_INDIRECT_BIRTHS_H +#define _SYS_VDEV_INDIRECT_BIRTHS_H + +#include <sys/dmu.h> +#include <sys/spa.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct vdev_indirect_birth_entry_phys { + uint64_t vibe_offset; + uint64_t vibe_phys_birth_txg; +} vdev_indirect_birth_entry_phys_t; + +typedef struct vdev_indirect_birth_phys { + uint64_t vib_count; /* count of v_i_b_entry_phys_t's */ +} vdev_indirect_birth_phys_t; + +typedef struct vdev_indirect_births { + uint64_t vib_object; + + /* + * Each entry indicates that everything up to but not including + * vibe_offset was copied in vibe_phys_birth_txg. Entries are sorted + * by increasing phys_birth, and also by increasing offset. See + * vdev_indirect_births_physbirth for usage. 
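+ *
+ * Editor's sketch of the lookup those semantics imply (an
+ * approximation for illustration; the real implementation is
+ * vdev_indirect_births_physbirth()):
+ *
+ *	for (uint64_t i = 0; i < vib->vib_phys->vib_count; i++)
+ *		if (offset < vib->vib_entries[i].vibe_offset)
+ *			return (vib->vib_entries[i].vibe_phys_birth_txg);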
+ */ + vdev_indirect_birth_entry_phys_t *vib_entries; + + objset_t *vib_objset; + + dmu_buf_t *vib_dbuf; + vdev_indirect_birth_phys_t *vib_phys; +} vdev_indirect_births_t; + +extern vdev_indirect_births_t *vdev_indirect_births_open(objset_t *os, + uint64_t object); +extern void vdev_indirect_births_close(vdev_indirect_births_t *vib); +extern boolean_t vdev_indirect_births_is_open(vdev_indirect_births_t *vib); +extern uint64_t vdev_indirect_births_alloc(objset_t *os, dmu_tx_t *tx); +extern void vdev_indirect_births_free(objset_t *os, uint64_t object, + dmu_tx_t *tx); + +extern uint64_t vdev_indirect_births_count(vdev_indirect_births_t *vib); +extern uint64_t vdev_indirect_births_object(vdev_indirect_births_t *vib); + +extern void vdev_indirect_births_add_entry(vdev_indirect_births_t *vib, + uint64_t offset, uint64_t txg, dmu_tx_t *tx); + +extern uint64_t vdev_indirect_births_physbirth(vdev_indirect_births_t *vib, + uint64_t offset, uint64_t asize); + +extern uint64_t vdev_indirect_births_last_entry_txg( + vdev_indirect_births_t *vib); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VDEV_INDIRECT_BIRTHS_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_indirect_mapping.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_indirect_mapping.h new file mode 100644 index 000000000000..7e42c1019504 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_indirect_mapping.h @@ -0,0 +1,141 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2015 by Delphix. All rights reserved. + */ + +#ifndef _SYS_VDEV_INDIRECT_MAPPING_H +#define _SYS_VDEV_INDIRECT_MAPPING_H + +#include <sys/dmu.h> +#include <sys/list.h> +#include <sys/spa.h> +#include <sys/space_map.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct vdev_indirect_mapping_entry_phys { + /* + * Decode with DVA_MAPPING_* macros. + * Contains: + * the source offset (low 63 bits) + * the one-bit "mark", used for garbage collection (by zdb) + */ + uint64_t vimep_src; + + /* + * Note: the DVA's asize is 24 bits, and can thus store ranges + * up to 8GB. + */ + dva_t vimep_dst; +} vdev_indirect_mapping_entry_phys_t; + +#define DVA_MAPPING_GET_SRC_OFFSET(vimep) \ + BF64_GET_SB((vimep)->vimep_src, 0, 63, SPA_MINBLOCKSHIFT, 0) +#define DVA_MAPPING_SET_SRC_OFFSET(vimep, x) \ + BF64_SET_SB((vimep)->vimep_src, 0, 63, SPA_MINBLOCKSHIFT, 0, x) + +typedef struct vdev_indirect_mapping_entry { + vdev_indirect_mapping_entry_phys_t vime_mapping; + uint32_t vime_obsolete_count; + list_node_t vime_node; +} vdev_indirect_mapping_entry_t; + +/* + * This is stored in the bonus buffer of the mapping object, see comment of + * vdev_indirect_config for more details. + */ +typedef struct vdev_indirect_mapping_phys { + uint64_t vimp_max_offset; + uint64_t vimp_bytes_mapped; + uint64_t vimp_num_entries; /* number of v_i_m_entry_phys_t's */ + + /* + * For each entry in the mapping object, this object contains an + * entry representing the number of bytes of that mapping entry + * that were no longer in use by the pool at the time this indirect + * vdev was last condensed. 
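+ *
+ * For example (editor's illustration): if mapping entry i copied a 16K
+ * segment, and 4K of that segment had already been freed by the time
+ * of the last condense, then the i'th count in this object is 4K.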
+ */ + uint64_t vimp_counts_object; +} vdev_indirect_mapping_phys_t; + +#define VDEV_INDIRECT_MAPPING_SIZE_V0 (3 * sizeof (uint64_t)) + +typedef struct vdev_indirect_mapping { + uint64_t vim_object; + boolean_t vim_havecounts; + + /* + * An ordered array of all mapping entries, sorted by source offset. + * Note that vim_entries is needed during a removal (and contains + * mappings that have been synced to disk so far) to handle frees + * from the removing device. + */ + vdev_indirect_mapping_entry_phys_t *vim_entries; + + objset_t *vim_objset; + + dmu_buf_t *vim_dbuf; + vdev_indirect_mapping_phys_t *vim_phys; +} vdev_indirect_mapping_t; + +extern vdev_indirect_mapping_t *vdev_indirect_mapping_open(objset_t *os, + uint64_t object); +extern void vdev_indirect_mapping_close(vdev_indirect_mapping_t *vim); +extern uint64_t vdev_indirect_mapping_alloc(objset_t *os, dmu_tx_t *tx); +extern void vdev_indirect_mapping_free(objset_t *os, uint64_t obj, + dmu_tx_t *tx); + +extern uint64_t vdev_indirect_mapping_num_entries(vdev_indirect_mapping_t *vim); +extern uint64_t vdev_indirect_mapping_max_offset(vdev_indirect_mapping_t *vim); +extern uint64_t vdev_indirect_mapping_object(vdev_indirect_mapping_t *vim); +extern uint64_t vdev_indirect_mapping_bytes_mapped( + vdev_indirect_mapping_t *vim); +extern uint64_t vdev_indirect_mapping_size(vdev_indirect_mapping_t *vim); + +/* + * Writes the given list of vdev_indirect_mapping_entry_t to the mapping + * then updates internal state. + */ +extern void vdev_indirect_mapping_add_entries(vdev_indirect_mapping_t *vim, + list_t *vime_list, dmu_tx_t *tx); + +extern vdev_indirect_mapping_entry_phys_t * + vdev_indirect_mapping_entry_for_offset(vdev_indirect_mapping_t *vim, + uint64_t offset); + +extern vdev_indirect_mapping_entry_phys_t * + vdev_indirect_mapping_entry_for_offset_or_next(vdev_indirect_mapping_t *vim, + uint64_t offset); + +extern uint32_t *vdev_indirect_mapping_load_obsolete_counts( + vdev_indirect_mapping_t *vim); +extern void vdev_indirect_mapping_load_obsolete_spacemap( + vdev_indirect_mapping_t *vim, + uint32_t *counts, space_map_t *obsolete_space_sm); +extern void vdev_indirect_mapping_increment_obsolete_count( + vdev_indirect_mapping_t *vim, + uint64_t offset, uint64_t asize, uint32_t *counts); +extern void vdev_indirect_mapping_free_obsolete_counts( + vdev_indirect_mapping_t *vim, uint32_t *counts); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VDEV_INDIRECT_MAPPING_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_initialize.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_initialize.h new file mode 100644 index 000000000000..db4b0572cd60 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_initialize.h @@ -0,0 +1,46 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2016 by Delphix. All rights reserved. + */ + +#ifndef _SYS_VDEV_INITIALIZE_H +#define _SYS_VDEV_INITIALIZE_H + +#ifdef __cplusplus +extern "C" { +#endif + +extern void vdev_initialize(vdev_t *vd); +extern void vdev_initialize_stop(vdev_t *vd, + vdev_initializing_state_t tgt_state); +extern void vdev_initialize_stop_all(vdev_t *vd, + vdev_initializing_state_t tgt_state); +extern void vdev_initialize_restart(vdev_t *vd); +extern void vdev_xlate(vdev_t *vd, const range_seg_t *logical_rs, + range_seg_t *physical_rs); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VDEV_INITIALIZE_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_raidz.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_raidz.h new file mode 100644 index 000000000000..e771e668fda6 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_raidz.h @@ -0,0 +1,50 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2013, Joyent, Inc. All rights reserved. + */ + +#ifndef _SYS_VDEV_RAIDZ_H +#define _SYS_VDEV_RAIDZ_H + +#include <sys/vdev.h> +#ifdef illumos +#include <sys/semaphore.h> +#ifdef _KERNEL +#include <sys/ddi.h> +#include <sys/sunldi.h> +#include <sys/sunddi.h> +#endif +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _KERNEL +extern int vdev_raidz_physio(vdev_t *, + caddr_t, size_t, uint64_t, uint64_t, boolean_t, boolean_t); +#endif +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VDEV_RAIDZ_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_removal.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_removal.h new file mode 100644 index 000000000000..3962237afdab --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_removal.h @@ -0,0 +1,96 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2014, 2017 by Delphix. All rights reserved. 
+ */
+
+#ifndef _SYS_VDEV_REMOVAL_H
+#define _SYS_VDEV_REMOVAL_H
+
+#include <sys/spa.h>
+#include <sys/bpobj.h>
+#include <sys/vdev_indirect_mapping.h>
+#include <sys/vdev_indirect_births.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct spa_vdev_removal {
+ uint64_t svr_vdev_id;
+ uint64_t svr_max_offset_to_sync[TXG_SIZE];
+ /* Thread performing a vdev removal. */
+ kthread_t *svr_thread;
+ /* Segments left to copy from the current metaslab. */
+ range_tree_t *svr_allocd_segs;
+ kmutex_t svr_lock;
+ kcondvar_t svr_cv;
+ boolean_t svr_thread_exit;
+
+ /*
+ * New mappings to write out each txg.
+ */
+ list_t svr_new_segments[TXG_SIZE];
+
+ /*
+ * Ranges that were freed while a mapping was in flight. This is
+ * a subset of the ranges covered by svr_new_segments.
+ */
+ range_tree_t *svr_frees[TXG_SIZE];
+
+ /*
+ * Number of bytes for which we have finished our work in each
+ * txg. This could be data copied (which will be part of the
+ * mappings in svr_new_segments), or data freed before we got
+ * around to copying it.
+ */
+ uint64_t svr_bytes_done[TXG_SIZE];
+
+ /* List of leaf zap objects to be unlinked */
+ nvlist_t *svr_zaplist;
+} spa_vdev_removal_t;
+
+typedef struct spa_condensing_indirect {
+ /*
+ * New mappings to write out each txg.
+ */
+ list_t sci_new_mapping_entries[TXG_SIZE];
+
+ vdev_indirect_mapping_t *sci_new_mapping;
+} spa_condensing_indirect_t;
+
+extern int spa_remove_init(spa_t *);
+extern void spa_restart_removal(spa_t *);
+extern int spa_condense_init(spa_t *);
+extern void spa_condense_fini(spa_t *);
+extern void spa_start_indirect_condensing_thread(spa_t *);
+extern void spa_vdev_condense_suspend(spa_t *);
+extern int spa_vdev_remove(spa_t *, uint64_t, boolean_t);
+extern void free_from_removing_vdev(vdev_t *, uint64_t, uint64_t);
+extern int spa_removal_get_stats(spa_t *, pool_removal_stat_t *);
+extern void svr_sync(spa_t *spa, dmu_tx_t *tx);
+extern void spa_vdev_remove_suspend(spa_t *);
+extern int spa_vdev_remove_cancel(spa_t *);
+extern void spa_vdev_removal_destroy(spa_vdev_removal_t *svr);
+
+extern int vdev_removal_max_span;
+extern int zfs_remove_max_segment;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_VDEV_REMOVAL_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h
new file mode 100644
index 000000000000..ea372f02e99a
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h
@@ -0,0 +1,509 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+ * Copyright 2017 Nexenta Systems, Inc.
+ */
+
+#ifndef _SYS_ZAP_H
+#define _SYS_ZAP_H
+
+/*
+ * ZAP - ZFS Attribute Processor
+ *
+ * The ZAP is a module which sits on top of the DMU (Data Management
+ * Unit) and implements a higher-level storage primitive using DMU
+ * objects. Its primary consumer is the ZPL (ZFS Posix Layer).
+ *
+ * A "zapobj" is a DMU object which the ZAP uses to store attributes.
+ * Users should use only zap routines to access a zapobj - they should
+ * not access the DMU object directly using DMU routines.
+ *
+ * The attributes stored in a zapobj are name-value pairs. The name is
+ * a zero-terminated string of up to ZAP_MAXNAMELEN bytes (including
+ * terminating NULL). The value is an array of integers, which may be
+ * 1, 2, 4, or 8 bytes long. The total space used by the array (number
+ * of integers * integer length) can be up to ZAP_MAXVALUELEN bytes.
+ * Note that an 8-byte integer value can be used to store the location
+ * (object number) of another dmu object (which may be itself a zapobj).
+ * Note that you can use a zero-length attribute to store a single bit
+ * of information - the attribute is present or not.
+ *
+ * The ZAP routines are thread-safe. However, you must observe the
+ * DMU's restriction that a transaction may not be operated on
+ * concurrently.
+ *
+ * Any of the routines that return an int may return an I/O error (EIO
+ * or ECHECKSUM).
+ *
+ *
+ * Implementation / Performance Notes:
+ *
+ * The ZAP is intended to operate most efficiently on attributes with
+ * short (49 bytes or less) names and single 8-byte values, for which
+ * the microzap will be used. The ZAP should be efficient enough so
+ * that the user does not need to cache these attributes.
+ *
+ * The ZAP's locking scheme makes its routines thread-safe. Operations
+ * on different zapobjs will be processed concurrently. Operations on
+ * the same zapobj which only read data will be processed concurrently.
+ * Operations on the same zapobj which modify data will be processed
+ * concurrently when there are many attributes in the zapobj (because
+ * the ZAP uses per-block locking - more than 128 * (number of cpus)
+ * small attributes will suffice).
+ */
+
+/*
+ * We're using zero-terminated byte strings (ie. ASCII or UTF-8 C
+ * strings) for the names of attributes, rather than a byte string
+ * bounded by an explicit length. If some day we want to support names
+ * in character sets which have embedded zeros (eg. UTF-16, UTF-32),
+ * we'll have to add routines for using length-bounded strings.
+ */
+
+#include <sys/dmu.h>
+#include <sys/refcount.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Specifies matching criteria for ZAP lookups.
+ * MT_NORMALIZE Use ZAP normalization flags, which can include both
+ * unicode normalization and case-insensitivity.
+ * MT_MATCH_CASE Do case-sensitive lookups even if MT_NORMALIZE is
+ * specified and ZAP normalization flags include
+ * U8_TEXTPREP_TOUPPER.
+ */
+typedef enum matchtype {
+ MT_NORMALIZE = 1 << 0,
+ MT_MATCH_CASE = 1 << 1,
+} matchtype_t;
+
+typedef enum zap_flags {
+ /* Use 64-bit hash value (serialized cursors will always use 64-bits) */
+ ZAP_FLAG_HASH64 = 1 << 0,
+ /* Key is binary, not string (zap_add_uint64() can be used) */
+ ZAP_FLAG_UINT64_KEY = 1 << 1,
+ /*
+ * First word of key (which must be an array of uint64) is
+ * already randomly distributed.
+ */
+ ZAP_FLAG_PRE_HASHED_KEY = 1 << 2,
+} zap_flags_t;
+
+/*
+ * Create a new zapobj with no attributes and return its object number.
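+ *
+ * (Editor's sketch of typical use inside an assigned transaction; the
+ * object types shown are illustrative, not required:
+ *
+ *	uint64_t zapobj = zap_create(os, DMU_OT_DIRECTORY_CONTENTS,
+ *	    DMU_OT_NONE, 0, tx);
+ * )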
+ * + * dnodesize specifies the on-disk size of the dnode for the new zapobj. + * Valid values are multiples of 512 up to DNODE_MAX_SIZE. + */ +uint64_t zap_create(objset_t *ds, dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); +uint64_t zap_create_dnsize(objset_t *ds, dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx); +uint64_t zap_create_norm(objset_t *ds, int normflags, dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); +uint64_t zap_create_norm_dnsize(objset_t *ds, int normflags, + dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, + int dnodesize, dmu_tx_t *tx); +uint64_t zap_create_flags(objset_t *os, int normflags, zap_flags_t flags, + dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); +uint64_t zap_create_flags_dnsize(objset_t *os, int normflags, + zap_flags_t flags, dmu_object_type_t ot, int leaf_blockshift, + int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, + int dnodesize, dmu_tx_t *tx); +uint64_t zap_create_link(objset_t *os, dmu_object_type_t ot, + uint64_t parent_obj, const char *name, dmu_tx_t *tx); +uint64_t zap_create_link_dnsize(objset_t *os, dmu_object_type_t ot, + uint64_t parent_obj, const char *name, int dnodesize, dmu_tx_t *tx); + +/* + * Initialize an already-allocated object. + */ +void mzap_create_impl(objset_t *os, uint64_t obj, int normflags, + zap_flags_t flags, dmu_tx_t *tx); + +/* + * Create a new zapobj with no attributes from the given (unallocated) + * object number. + */ +int zap_create_claim(objset_t *ds, uint64_t obj, dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); +int zap_create_claim_dnsize(objset_t *ds, uint64_t obj, dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx); +int zap_create_claim_norm(objset_t *ds, uint64_t obj, + int normflags, dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); +int zap_create_claim_norm_dnsize(objset_t *ds, uint64_t obj, + int normflags, dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx); + +/* + * The zapobj passed in must be a valid ZAP object for all of the + * following routines. + */ + +/* + * Destroy this zapobj and all its attributes. + * + * Frees the object number using dmu_object_free. + */ +int zap_destroy(objset_t *ds, uint64_t zapobj, dmu_tx_t *tx); + +/* + * Manipulate attributes. + * + * 'integer_size' is in bytes, and must be 1, 2, 4, or 8. + */ + +/* + * Retrieve the contents of the attribute with the given name. + * + * If the requested attribute does not exist, the call will fail and + * return ENOENT. + * + * If 'integer_size' is smaller than the attribute's integer size, the + * call will fail and return EINVAL. + * + * If 'integer_size' is equal to or larger than the attribute's integer + * size, the call will succeed and return 0. + * + * When converting to a larger integer size, the integers will be treated as + * unsigned (ie. no sign-extension will be performed). + * + * 'num_integers' is the length (in integers) of 'buf'. + * + * If the attribute is longer than the buffer, as many integers as will + * fit will be transferred to 'buf'. If the entire attribute was not + * transferred, the call will return EOVERFLOW. 
+ */ +int zap_lookup(objset_t *ds, uint64_t zapobj, const char *name, + uint64_t integer_size, uint64_t num_integers, void *buf); + +/* + * If rn_len is nonzero, realname will be set to the name of the found + * entry (which may be different from the requested name if matchtype is + * not MT_EXACT). + * + * If normalization_conflictp is not NULL, it will be set if there is + * another name with the same case/unicode normalized form. + */ +int zap_lookup_norm(objset_t *ds, uint64_t zapobj, const char *name, + uint64_t integer_size, uint64_t num_integers, void *buf, + matchtype_t mt, char *realname, int rn_len, + boolean_t *normalization_conflictp); +int zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf); +int zap_contains(objset_t *ds, uint64_t zapobj, const char *name); +int zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints); +int zap_lookup_by_dnode(dnode_t *dn, const char *name, + uint64_t integer_size, uint64_t num_integers, void *buf); +int zap_lookup_norm_by_dnode(dnode_t *dn, const char *name, + uint64_t integer_size, uint64_t num_integers, void *buf, + matchtype_t mt, char *realname, int rn_len, + boolean_t *ncp); + +int zap_count_write_by_dnode(dnode_t *dn, const char *name, + int add, refcount_t *towrite, refcount_t *tooverwrite); + +/* + * Create an attribute with the given name and value. + * + * If an attribute with the given name already exists, the call will + * fail and return EEXIST. + */ +int zap_add(objset_t *ds, uint64_t zapobj, const char *key, + int integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx); +int zap_add_by_dnode(dnode_t *dn, const char *key, + int integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx); +int zap_add_uint64(objset_t *ds, uint64_t zapobj, const uint64_t *key, + int key_numints, int integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx); + +/* + * Set the attribute with the given name to the given value. If an + * attribute with the given name does not exist, it will be created. If + * an attribute with the given name already exists, the previous value + * will be overwritten. The integer_size may be different from the + * existing attribute's integer size, in which case the attribute's + * integer size will be updated to the new value. + */ +int zap_update(objset_t *ds, uint64_t zapobj, const char *name, + int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); +int zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints, + int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); + +/* + * Get the length (in integers) and the integer size of the specified + * attribute. + * + * If the requested attribute does not exist, the call will fail and + * return ENOENT. + */ +int zap_length(objset_t *ds, uint64_t zapobj, const char *name, + uint64_t *integer_size, uint64_t *num_integers); +int zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints, uint64_t *integer_size, uint64_t *num_integers); + +/* + * Remove the specified attribute. + * + * If the specified attribute does not exist, the call will fail and + * return ENOENT. 
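As a sketch of how the normalization arguments declared above fit together (illustrative only; the helper name is invented), a normalization-aware lookup passes MT_NORMALIZE plus a buffer that receives the stored spelling of the matching entry:

static int
example_ci_lookup(objset_t *os, uint64_t zapobj, const char *name,
    uint64_t *valp)
{
	char realname[ZAP_MAXNAMELEN];
	boolean_t conflict;

	/*
	 * MT_NORMALIZE applies the zapobj's normalization flags (which
	 * may include case folding); realname receives the spelling
	 * actually stored, and conflict reports a normalization clash.
	 */
	return (zap_lookup_norm(os, zapobj, name, sizeof (uint64_t), 1,
	    valp, MT_NORMALIZE, realname, sizeof (realname), &conflict));
}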
+ */
+int zap_remove(objset_t *ds, uint64_t zapobj, const char *name, dmu_tx_t *tx);
+int zap_remove_norm(objset_t *ds, uint64_t zapobj, const char *name,
+    matchtype_t mt, dmu_tx_t *tx);
+int zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx);
+int zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+    int key_numints, dmu_tx_t *tx);
+
+/*
+ * Returns (in *count) the number of attributes in the specified zap
+ * object.
+ */
+int zap_count(objset_t *ds, uint64_t zapobj, uint64_t *count);
+
+/*
+ * Returns (in name) the name of the entry whose (value & mask)
+ * (za_first_integer) is value, or ENOENT if not found. The string
+ * pointed to by name must be at least 256 bytes long. If mask==0, the
+ * match must be exact (ie, same as mask=-1ULL).
+ */
+int zap_value_search(objset_t *os, uint64_t zapobj,
+    uint64_t value, uint64_t mask, char *name);
+
+/*
+ * Transfer all the entries from fromobj into intoobj. Only works on
+ * int_size=8 num_integers=1 values. Fails if there are any duplicated
+ * entries.
+ */
+int zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx);
+
+/* Same as zap_join, but set the values to 'value'. */
+int zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj,
+    uint64_t value, dmu_tx_t *tx);
+
+/* Same as zap_join, but add together any duplicated entries. */
+int zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj,
+    dmu_tx_t *tx);
+
+/*
+ * Manipulate entries where the name + value are the "same" (the name is
+ * a stringified version of the value).
+ */
+int zap_add_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx);
+int zap_remove_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx);
+int zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value);
+int zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta,
+    dmu_tx_t *tx);
+
+/* Here the key is an int and the value is a different int. */
+int zap_add_int_key(objset_t *os, uint64_t obj,
+    uint64_t key, uint64_t value, dmu_tx_t *tx);
+int zap_update_int_key(objset_t *os, uint64_t obj,
+    uint64_t key, uint64_t value, dmu_tx_t *tx);
+int zap_lookup_int_key(objset_t *os, uint64_t obj,
+    uint64_t key, uint64_t *valuep);
+
+int zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta,
+    dmu_tx_t *tx);
+
+struct zap;
+struct zap_leaf;
+typedef struct zap_cursor {
+	/* This structure is opaque! */
+	objset_t *zc_objset;
+	struct zap *zc_zap;
+	struct zap_leaf *zc_leaf;
+	uint64_t zc_zapobj;
+	uint64_t zc_serialized;
+	uint64_t zc_hash;
+	uint32_t zc_cd;
+} zap_cursor_t;
+
+typedef struct {
+	int za_integer_length;
+	/*
+	 * za_normalization_conflict will be set if there are additional
+	 * entries with this normalized form (eg, "foo" and "Foo").
+	 */
+	boolean_t za_normalization_conflict;
+	uint64_t za_num_integers;
+	uint64_t za_first_integer;	/* no sign extension for <8byte ints */
+	char za_name[ZAP_MAXNAMELEN];
+} zap_attribute_t;
+
+/*
+ * The interface for listing all the attributes of a zapobj can be
+ * thought of as a cursor moving down a list of the attributes one by
+ * one. The cookie returned by the zap_cursor_serialize routine is
+ * persistent across system calls (and across reboot, even).
+ */
+
+/*
+ * Initialize a zap cursor, pointing to the "first" attribute of the
+ * zapobj. You must _fini the cursor when you are done with it.
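The cursor interface described above is normally driven with a single for-loop; a minimal sketch (helper name invented) of the idiom follows. To resume a listing later, capture zap_cursor_serialize() before the _fini and hand the cookie to zap_cursor_init_serialized().

static void
example_zap_walk(objset_t *os, uint64_t zapobj)
{
	zap_cursor_t zc;
	zap_attribute_t za;	/* sized by ZAP_MAXNAMELEN; callers often heap-allocate */

	for (zap_cursor_init(&zc, os, zapobj);
	    zap_cursor_retrieve(&zc, &za) == 0;
	    zap_cursor_advance(&zc)) {
		/* za.za_name / za.za_first_integer describe this entry. */
	}
	zap_cursor_fini(&zc);
}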
+ */
+void zap_cursor_init(zap_cursor_t *zc, objset_t *ds, uint64_t zapobj);
+void zap_cursor_fini(zap_cursor_t *zc);
+
+/*
+ * Get the attribute currently pointed to by the cursor. Returns
+ * ENOENT if at the end of the attributes.
+ */
+int zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za);
+
+/*
+ * Advance the cursor to the next attribute.
+ */
+void zap_cursor_advance(zap_cursor_t *zc);
+
+/*
+ * Get a persistent cookie pointing to the current position of the zap
+ * cursor. The low 4 bits in the cookie are always zero, and thus can
+ * be used to differentiate a serialized cookie from a different type
+ * of value. The cookie will be less than 2^32 as long as there are
+ * fewer than 2^22 (4.2 million) entries in the zap object.
+ */
+uint64_t zap_cursor_serialize(zap_cursor_t *zc);
+
+/*
+ * Advance the cursor to the attribute having the given key.
+ */
+int zap_cursor_move_to_key(zap_cursor_t *zc, const char *name, matchtype_t mt);
+
+/*
+ * Initialize a zap cursor pointing to the position recorded by
+ * zap_cursor_serialize (in the "serialized" argument). You can also
+ * use a "serialized" argument of 0 to start at the beginning of the
+ * zapobj (ie. zap_cursor_init_serialized(..., 0) is equivalent to
+ * zap_cursor_init(...).)
+ */
+void zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *ds,
+    uint64_t zapobj, uint64_t serialized);
+
+
+#define ZAP_HISTOGRAM_SIZE 10
+
+typedef struct zap_stats {
+	/*
+	 * Size of the pointer table (in number of entries).
+	 * This is always a power of 2, or zero if it's a microzap.
+	 * In general, it should be considerably greater than zs_num_leafs.
+	 */
+	uint64_t zs_ptrtbl_len;
+
+	uint64_t zs_blocksize;		/* size of zap blocks */
+
+	/*
+	 * The number of blocks used. Note that some blocks may be
+	 * wasted because old ptrtbl's and large name/value blocks are
+	 * not reused. (Although their space is reclaimed, we don't
+	 * reuse those offsets in the object.)
+	 */
+	uint64_t zs_num_blocks;
+
+	/*
+	 * Pointer table values from zap_ptrtbl in the zap_phys_t
+	 */
+	uint64_t zs_ptrtbl_nextblk;	/* next (larger) copy start block */
+	uint64_t zs_ptrtbl_blks_copied;	/* number source blocks copied */
+	uint64_t zs_ptrtbl_zt_blk;	/* starting block number */
+	uint64_t zs_ptrtbl_zt_numblks;	/* number of blocks */
+	uint64_t zs_ptrtbl_zt_shift;	/* bits to index it */
+
+	/*
+	 * Values of the other members of the zap_phys_t
+	 */
+	uint64_t zs_block_type;		/* ZBT_HEADER */
+	uint64_t zs_magic;		/* ZAP_MAGIC */
+	uint64_t zs_num_leafs;		/* The number of leaf blocks */
+	uint64_t zs_num_entries;	/* The number of zap entries */
+	uint64_t zs_salt;		/* salt to stir into hash function */
+
+	/*
+	 * Histograms. For all histograms, the last index
+	 * (ZAP_HISTOGRAM_SIZE-1) includes any values which are greater
+	 * than what can be represented. For example
+	 * zs_blocks_with_n5_entries[ZAP_HISTOGRAM_SIZE-1] is the number
+	 * of leafs with more than 45 entries.
+	 */
+
+	/*
+	 * zs_leafs_with_2n_pointers[n] is the number of leafs with
+	 * 2^n pointers to it.
+	 */
+	uint64_t zs_leafs_with_2n_pointers[ZAP_HISTOGRAM_SIZE];
+
+	/*
+	 * zs_blocks_with_n5_entries[n] is the number of leafs with
+	 * [n*5, (n+1)*5) entries. In the current implementation, there
+	 * can be at most 55 entries in any block, but there may be
+	 * fewer if the name or value is large, or the block is not
+	 * completely full.
+ */
+	uint64_t zs_blocks_with_n5_entries[ZAP_HISTOGRAM_SIZE];
+
+	/*
+	 * zs_blocks_n_tenths_full[n] is the number of leafs whose
+	 * fullness is in the range [n/10, (n+1)/10).
+	 */
+	uint64_t zs_blocks_n_tenths_full[ZAP_HISTOGRAM_SIZE];
+
+	/*
+	 * zs_entries_using_n_chunks[n] is the number of entries which
+	 * consume n 24-byte chunks. (Note: large names/values only use
+	 * one chunk, but contribute to zs_num_blocks_large.)
+	 */
+	uint64_t zs_entries_using_n_chunks[ZAP_HISTOGRAM_SIZE];
+
+	/*
+	 * zs_buckets_with_n_entries[n] is the number of buckets (each
+	 * leaf has 64 buckets) with n entries.
+	 * zs_buckets_with_n_entries[1] should be very close to
+	 * zs_num_entries.
+	 */
+	uint64_t zs_buckets_with_n_entries[ZAP_HISTOGRAM_SIZE];
+} zap_stats_t;
+
+/*
+ * Get statistics about a ZAP object. Note: you need to be aware of the
+ * internal implementation of the ZAP to correctly interpret some of the
+ * statistics. This interface shouldn't be relied on unless you really
+ * know what you're doing.
+ */
+int zap_get_stats(objset_t *ds, uint64_t zapobj, zap_stats_t *zs);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZAP_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h new file mode 100644 index 000000000000..912b8b219c4c --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h @@ -0,0 +1,242 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ * Copyright 2017 Nexenta Systems, Inc.
+ */ + +#ifndef _SYS_ZAP_IMPL_H +#define _SYS_ZAP_IMPL_H + +#include <sys/zap.h> +#include <sys/zfs_context.h> +#include <sys/avl.h> + +#ifdef __cplusplus +extern "C" { +#endif + +extern int fzap_default_block_shift; + +#define ZAP_MAGIC 0x2F52AB2ABULL + +#define FZAP_BLOCK_SHIFT(zap) ((zap)->zap_f.zap_block_shift) + +#define MZAP_ENT_LEN 64 +#define MZAP_NAME_LEN (MZAP_ENT_LEN - 8 - 4 - 2) +#define MZAP_MAX_BLKSZ SPA_OLD_MAXBLOCKSIZE + +#define ZAP_NEED_CD (-1U) + +typedef struct mzap_ent_phys { + uint64_t mze_value; + uint32_t mze_cd; + uint16_t mze_pad; /* in case we want to chain them someday */ + char mze_name[MZAP_NAME_LEN]; +} mzap_ent_phys_t; + +typedef struct mzap_phys { + uint64_t mz_block_type; /* ZBT_MICRO */ + uint64_t mz_salt; + uint64_t mz_normflags; + uint64_t mz_pad[5]; + mzap_ent_phys_t mz_chunk[1]; + /* actually variable size depending on block size */ +} mzap_phys_t; + +typedef struct mzap_ent { + avl_node_t mze_node; + int mze_chunkid; + uint64_t mze_hash; + uint32_t mze_cd; /* copy from mze_phys->mze_cd */ +} mzap_ent_t; + +#define MZE_PHYS(zap, mze) \ + (&zap_m_phys(zap)->mz_chunk[(mze)->mze_chunkid]) + +/* + * The (fat) zap is stored in one object. It is an array of + * 1<<FZAP_BLOCK_SHIFT byte blocks. The layout looks like one of: + * + * ptrtbl fits in first block: + * [zap_phys_t zap_ptrtbl_shift < 6] [zap_leaf_t] ... + * + * ptrtbl too big for first block: + * [zap_phys_t zap_ptrtbl_shift >= 6] [zap_leaf_t] [ptrtbl] ... + * + */ + +struct dmu_buf; +struct zap_leaf; + +#define ZBT_LEAF ((1ULL << 63) + 0) +#define ZBT_HEADER ((1ULL << 63) + 1) +#define ZBT_MICRO ((1ULL << 63) + 3) +/* any other values are ptrtbl blocks */ + +/* + * the embedded pointer table takes up half a block: + * block size / entry size (2^3) / 2 + */ +#define ZAP_EMBEDDED_PTRTBL_SHIFT(zap) (FZAP_BLOCK_SHIFT(zap) - 3 - 1) + +/* + * The embedded pointer table starts half-way through the block. Since + * the pointer table itself is half the block, it starts at (64-bit) + * word number (1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)). + */ +#define ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) \ + ((uint64_t *)zap_f_phys(zap)) \ + [(idx) + (1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap))] + +/* + * TAKE NOTE: + * If zap_phys_t is modified, zap_byteswap() must be modified. + */ +typedef struct zap_phys { + uint64_t zap_block_type; /* ZBT_HEADER */ + uint64_t zap_magic; /* ZAP_MAGIC */ + + struct zap_table_phys { + uint64_t zt_blk; /* starting block number */ + uint64_t zt_numblks; /* number of blocks */ + uint64_t zt_shift; /* bits to index it */ + uint64_t zt_nextblk; /* next (larger) copy start block */ + uint64_t zt_blks_copied; /* number source blocks copied */ + } zap_ptrtbl; + + uint64_t zap_freeblk; /* the next free block */ + uint64_t zap_num_leafs; /* number of leafs */ + uint64_t zap_num_entries; /* number of entries */ + uint64_t zap_salt; /* salt to stir into hash function */ + uint64_t zap_normflags; /* flags for u8_textprep_str() */ + uint64_t zap_flags; /* zap_flags_t */ + /* + * This structure is followed by padding, and then the embedded + * pointer table. The embedded pointer table takes up second + * half of the block. It is accessed using the + * ZAP_EMBEDDED_PTRTBL_ENT() macro. 
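One consequence of the mzap_phys_t layout above, shown as an illustrative calculation (not part of the header; helper name invented): the fixed header fields (mz_block_type, mz_salt, mz_normflags, mz_pad[5]) add up to 64 bytes, exactly one MZAP_ENT_LEN chunk, so a microzap block loses one chunk's worth of space to the header.

static int
example_mzap_capacity(int blksz)
{
	/*
	 * The 64-byte header consumes the space of one chunk; the rest
	 * of the block is an array of mzap_ent_phys_t entries.
	 */
	return (blksz / MZAP_ENT_LEN - 1);
}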
+ */ +} zap_phys_t; + +typedef struct zap_table_phys zap_table_phys_t; + +typedef struct zap { + dmu_buf_user_t zap_dbu; + objset_t *zap_objset; + uint64_t zap_object; + struct dmu_buf *zap_dbuf; + krwlock_t zap_rwlock; + boolean_t zap_ismicro; + int zap_normflags; + uint64_t zap_salt; + union { + struct { + /* + * zap_num_entries_mtx protects + * zap_num_entries + */ + kmutex_t zap_num_entries_mtx; + int zap_block_shift; + } zap_fat; + struct { + int16_t zap_num_entries; + int16_t zap_num_chunks; + int16_t zap_alloc_next; + avl_tree_t zap_avl; + } zap_micro; + } zap_u; +} zap_t; + +inline zap_phys_t * +zap_f_phys(zap_t *zap) +{ + return (zap->zap_dbuf->db_data); +} + +inline mzap_phys_t * +zap_m_phys(zap_t *zap) +{ + return (zap->zap_dbuf->db_data); +} + +typedef struct zap_name { + zap_t *zn_zap; + int zn_key_intlen; + const void *zn_key_orig; + int zn_key_orig_numints; + const void *zn_key_norm; + int zn_key_norm_numints; + uint64_t zn_hash; + matchtype_t zn_matchtype; + int zn_normflags; + char zn_normbuf[ZAP_MAXNAMELEN]; +} zap_name_t; + +#define zap_f zap_u.zap_fat +#define zap_m zap_u.zap_micro + +boolean_t zap_match(zap_name_t *zn, const char *matchname); +int zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, + krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp); +void zap_unlockdir(zap_t *zap, void *tag); +void zap_evict_sync(void *dbu); +zap_name_t *zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt); +void zap_name_free(zap_name_t *zn); +int zap_hashbits(zap_t *zap); +uint32_t zap_maxcd(zap_t *zap); +uint64_t zap_getflags(zap_t *zap); + +#define ZAP_HASH_IDX(hash, n) (((n) == 0) ? 0 : ((hash) >> (64 - (n)))) + +void fzap_byteswap(void *buf, size_t size); +int fzap_count(zap_t *zap, uint64_t *count); +int fzap_lookup(zap_name_t *zn, + uint64_t integer_size, uint64_t num_integers, void *buf, + char *realname, int rn_len, boolean_t *normalization_conflictp); +void fzap_prefetch(zap_name_t *zn); +int fzap_add(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, + const void *val, void *tag, dmu_tx_t *tx); +int fzap_update(zap_name_t *zn, + int integer_size, uint64_t num_integers, const void *val, + void *tag, dmu_tx_t *tx); +int fzap_length(zap_name_t *zn, + uint64_t *integer_size, uint64_t *num_integers); +int fzap_remove(zap_name_t *zn, dmu_tx_t *tx); +int fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za); +void fzap_get_stats(zap_t *zap, zap_stats_t *zs); +void zap_put_leaf(struct zap_leaf *l); + +int fzap_add_cd(zap_name_t *zn, + uint64_t integer_size, uint64_t num_integers, + const void *val, uint32_t cd, void *tag, dmu_tx_t *tx); +void fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags); +int fzap_cursor_move_to_key(zap_cursor_t *zc, zap_name_t *zn); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZAP_IMPL_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h new file mode 100644 index 000000000000..76b3ecc72557 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h @@ -0,0 +1,248 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + */ + +#ifndef _SYS_ZAP_LEAF_H +#define _SYS_ZAP_LEAF_H + +#include <sys/zap.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct zap; +struct zap_name; +struct zap_stats; + +#define ZAP_LEAF_MAGIC 0x2AB1EAF + +/* chunk size = 24 bytes */ +#define ZAP_LEAF_CHUNKSIZE 24 + +/* + * The amount of space available for chunks is: + * block size (1<<l->l_bs) - hash entry size (2) * number of hash + * entries - header space (2*chunksize) + */ +#define ZAP_LEAF_NUMCHUNKS(l) \ + (((1<<(l)->l_bs) - 2*ZAP_LEAF_HASH_NUMENTRIES(l)) / \ + ZAP_LEAF_CHUNKSIZE - 2) + +/* + * The amount of space within the chunk available for the array is: + * chunk size - space for type (1) - space for next pointer (2) + */ +#define ZAP_LEAF_ARRAY_BYTES (ZAP_LEAF_CHUNKSIZE - 3) + +#define ZAP_LEAF_ARRAY_NCHUNKS(bytes) \ + (((bytes)+ZAP_LEAF_ARRAY_BYTES-1)/ZAP_LEAF_ARRAY_BYTES) + +/* + * Low water mark: when there are only this many chunks free, start + * growing the ptrtbl. Ideally, this should be larger than a + * "reasonably-sized" entry. 20 chunks is more than enough for the + * largest directory entry (MAXNAMELEN (256) byte name, 8-byte value), + * while still being only around 3% for 16k blocks. + */ +#define ZAP_LEAF_LOW_WATER (20) + +/* + * The leaf hash table has block size / 2^5 (32) number of entries, + * which should be more than enough for the maximum number of entries, + * which is less than block size / CHUNKSIZE (24) / minimum number of + * chunks per entry (3). + */ +#define ZAP_LEAF_HASH_SHIFT(l) ((l)->l_bs - 5) +#define ZAP_LEAF_HASH_NUMENTRIES(l) (1 << ZAP_LEAF_HASH_SHIFT(l)) + +/* + * The chunks start immediately after the hash table. The end of the + * hash table is at l_hash + HASH_NUMENTRIES, which we simply cast to a + * chunk_t. + */ +#define ZAP_LEAF_CHUNK(l, idx) \ + ((zap_leaf_chunk_t *) \ + (zap_leaf_phys(l)->l_hash + ZAP_LEAF_HASH_NUMENTRIES(l)))[idx] +#define ZAP_LEAF_ENTRY(l, idx) (&ZAP_LEAF_CHUNK(l, idx).l_entry) + +typedef enum zap_chunk_type { + ZAP_CHUNK_FREE = 253, + ZAP_CHUNK_ENTRY = 252, + ZAP_CHUNK_ARRAY = 251, + ZAP_CHUNK_TYPE_MAX = 250 +} zap_chunk_type_t; + +#define ZLF_ENTRIES_CDSORTED (1<<0) + +/* + * TAKE NOTE: + * If zap_leaf_phys_t is modified, zap_leaf_byteswap() must be modified. + */ +typedef struct zap_leaf_phys { + struct zap_leaf_header { + /* Public to ZAP */ + uint64_t lh_block_type; /* ZBT_LEAF */ + uint64_t lh_pad1; + uint64_t lh_prefix; /* hash prefix of this leaf */ + uint32_t lh_magic; /* ZAP_LEAF_MAGIC */ + uint16_t lh_nfree; /* number free chunks */ + uint16_t lh_nentries; /* number of entries */ + uint16_t lh_prefix_len; /* num bits used to id this */ + + /* Private to zap_leaf */ + uint16_t lh_freelist; /* chunk head of free list */ + uint8_t lh_flags; /* ZLF_* flags */ + uint8_t lh_pad2[11]; + } l_hdr; /* 2 24-byte chunks */ + + /* + * The header is followed by a hash table with + * ZAP_LEAF_HASH_NUMENTRIES(zap) entries. 
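A feel for the chunk accounting these macros imply, as an illustrative sketch (helper name invented): an entry consumes one zap_leaf_entry chunk plus array chunks for its name and its value. For a 12-byte name and a single 8-byte value this gives 1 + 1 + 1 = 3 chunks, matching the "minimum number of chunks per entry (3)" figure cited above.

static int
example_entry_chunks(int name_bytes, int value_bytes)
{
	/* One entry chunk, plus array chunks for the name and value. */
	return (1 + ZAP_LEAF_ARRAY_NCHUNKS(name_bytes) +
	    ZAP_LEAF_ARRAY_NCHUNKS(value_bytes));
}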
The hash table is + * followed by an array of ZAP_LEAF_NUMCHUNKS(zap) + * zap_leaf_chunk structures. These structures are accessed + * with the ZAP_LEAF_CHUNK() macro. + */ + + uint16_t l_hash[1]; +} zap_leaf_phys_t; + +typedef union zap_leaf_chunk { + struct zap_leaf_entry { + uint8_t le_type; /* always ZAP_CHUNK_ENTRY */ + uint8_t le_value_intlen; /* size of value's ints */ + uint16_t le_next; /* next entry in hash chain */ + uint16_t le_name_chunk; /* first chunk of the name */ + uint16_t le_name_numints; /* ints in name (incl null) */ + uint16_t le_value_chunk; /* first chunk of the value */ + uint16_t le_value_numints; /* value length in ints */ + uint32_t le_cd; /* collision differentiator */ + uint64_t le_hash; /* hash value of the name */ + } l_entry; + struct zap_leaf_array { + uint8_t la_type; /* always ZAP_CHUNK_ARRAY */ + uint8_t la_array[ZAP_LEAF_ARRAY_BYTES]; + uint16_t la_next; /* next blk or CHAIN_END */ + } l_array; + struct zap_leaf_free { + uint8_t lf_type; /* always ZAP_CHUNK_FREE */ + uint8_t lf_pad[ZAP_LEAF_ARRAY_BYTES]; + uint16_t lf_next; /* next in free list, or CHAIN_END */ + } l_free; +} zap_leaf_chunk_t; + +typedef struct zap_leaf { + dmu_buf_user_t l_dbu; + krwlock_t l_rwlock; + uint64_t l_blkid; /* 1<<ZAP_BLOCK_SHIFT byte block off */ + int l_bs; /* block size shift */ + dmu_buf_t *l_dbuf; +} zap_leaf_t; + +inline zap_leaf_phys_t * +zap_leaf_phys(zap_leaf_t *l) +{ + return (l->l_dbuf->db_data); +} + +typedef struct zap_entry_handle { + /* Set by zap_leaf and public to ZAP */ + uint64_t zeh_num_integers; + uint64_t zeh_hash; + uint32_t zeh_cd; + uint8_t zeh_integer_size; + + /* Private to zap_leaf */ + uint16_t zeh_fakechunk; + uint16_t *zeh_chunkp; + zap_leaf_t *zeh_leaf; +} zap_entry_handle_t; + +/* + * Return a handle to the named entry, or ENOENT if not found. The hash + * value must equal zap_hash(name). + */ +extern int zap_leaf_lookup(zap_leaf_t *l, + struct zap_name *zn, zap_entry_handle_t *zeh); + +/* + * Return a handle to the entry with this hash+cd, or the entry with the + * next closest hash+cd. + */ +extern int zap_leaf_lookup_closest(zap_leaf_t *l, + uint64_t hash, uint32_t cd, zap_entry_handle_t *zeh); + +/* + * Read the first num_integers in the attribute. Integer size + * conversion will be done without sign extension. Return EINVAL if + * integer_size is too small. Return EOVERFLOW if there are more than + * num_integers in the attribute. + */ +extern int zap_entry_read(const zap_entry_handle_t *zeh, + uint8_t integer_size, uint64_t num_integers, void *buf); + +extern int zap_entry_read_name(struct zap *zap, const zap_entry_handle_t *zeh, + uint16_t buflen, char *buf); + +/* + * Replace the value of an existing entry. + * + * May fail if it runs out of space (ENOSPC). + */ +extern int zap_entry_update(zap_entry_handle_t *zeh, + uint8_t integer_size, uint64_t num_integers, const void *buf); + +/* + * Remove an entry. + */ +extern void zap_entry_remove(zap_entry_handle_t *zeh); + +/* + * Create an entry. An equal entry must not exist, and this entry must + * belong in this leaf (according to its hash value). Fills in the + * entry handle on success. Returns 0 on success or ENOSPC on failure. + */ +extern int zap_entry_create(zap_leaf_t *l, struct zap_name *zn, uint32_t cd, + uint8_t integer_size, uint64_t num_integers, const void *buf, + zap_entry_handle_t *zeh); + +/* Determine whether there is another entry with the same normalized form. 
*/ +extern boolean_t zap_entry_normalization_conflict(zap_entry_handle_t *zeh, + struct zap_name *zn, const char *name, struct zap *zap); + +/* + * Other stuff. + */ + +extern void zap_leaf_init(zap_leaf_t *l, boolean_t sort); +extern void zap_leaf_byteswap(zap_leaf_phys_t *buf, int len); +extern void zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort); +extern void zap_leaf_stats(struct zap *zap, zap_leaf_t *l, + struct zap_stats *zs); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZAP_LEAF_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zcp.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zcp.h new file mode 100644 index 000000000000..5713a748f7a0 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zcp.h @@ -0,0 +1,154 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2016, 2017 by Delphix. All rights reserved. + */ + +#ifndef _SYS_ZCP_H +#define _SYS_ZCP_H + +#include <sys/dmu_tx.h> +#include <sys/dsl_pool.h> + +#include "lua.h" +#include "lualib.h" +#include "lauxlib.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define ZCP_RUN_INFO_KEY "runinfo" + +extern uint64_t zfs_lua_max_instrlimit; +extern uint64_t zfs_lua_max_memlimit; + +int zcp_argerror(lua_State *, int, const char *, ...); + +int zcp_eval(const char *, const char *, boolean_t, uint64_t, uint64_t, + nvpair_t *, nvlist_t *); + +int zcp_load_list_lib(lua_State *); + +int zcp_load_synctask_lib(lua_State *, boolean_t); + +typedef void (zcp_cleanup_t)(void *); +typedef struct zcp_cleanup_handler { + zcp_cleanup_t *zch_cleanup_func; + void *zch_cleanup_arg; + list_node_t zch_node; +} zcp_cleanup_handler_t; + +typedef struct zcp_run_info { + dsl_pool_t *zri_pool; + + /* + * An estimate of the total amount of space consumed by all + * synctasks we have successfully performed so far in this + * channel program. Used to generate ENOSPC errors for syncfuncs. + */ + int zri_space_used; + + /* + * The credentials of the thread which originally invoked the channel + * program. Since channel programs are always invoked from the synctask + * thread they should always do permissions checks against this cred + * rather than the 'current' thread's. + */ + cred_t *zri_cred; + + /* + * The tx in which this channel program is running. + */ + dmu_tx_t *zri_tx; + + /* + * The maximum number of Lua instructions the channel program is allowed + * to execute. If it takes longer than this it will time out. A value + * of 0 indicates no instruction limit. + */ + uint64_t zri_maxinstrs; + + /* + * The number of Lua instructions the channel program has executed. + */ + uint64_t zri_curinstrs; + + /* + * Boolean indicating whether or not the channel program exited + * because it timed out. + */ + boolean_t zri_timed_out; + + /* + * Boolean indicating whether or not we are running in syncing + * context. + */ + boolean_t zri_sync; + + /* + * List of currently registered cleanup handlers, which will be + * triggered in the event of a fatal error. 
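The cleanup-handler list mentioned above is meant to be used as a register/deregister bracket around work that can fail with a fatal Lua error; a hypothetical sketch (helper names and the scratch allocation are invented for illustration):

static void
example_cleanup_cb(void *arg)
{
	kmem_free(arg, sizeof (uint64_t));
}

static void
example_bracketed_work(lua_State *state)
{
	uint64_t *scratch = kmem_alloc(sizeof (uint64_t), KM_SLEEP);
	zcp_cleanup_handler_t *zch;

	zch = zcp_register_cleanup(state, example_cleanup_cb, scratch);
	/* ... work that may exit via a fatal Lua error ... */
	zcp_deregister_cleanup(state, zch);
	kmem_free(scratch, sizeof (uint64_t));
}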
+ */
+	list_t	zri_cleanup_handlers;
+} zcp_run_info_t;
+
+zcp_run_info_t *zcp_run_info(lua_State *);
+zcp_cleanup_handler_t *zcp_register_cleanup(lua_State *, zcp_cleanup_t, void *);
+void zcp_deregister_cleanup(lua_State *, zcp_cleanup_handler_t *);
+void zcp_cleanup(lua_State *);
+
+/*
+ * Argument parsing routines for channel program callback functions.
+ */
+typedef struct zcp_arg {
+	/*
+	 * The name of this argument. For keyword arguments this is the name
+	 * functions will use to set the argument. For positional arguments
+	 * the name has no programmatic meaning, but will appear in error
+	 * messages and help output.
+	 */
+	const char *za_name;
+
+	/*
+	 * The Lua type this argument should have (e.g. LUA_TSTRING,
+	 * LUA_TBOOLEAN); see the lua_type() function documentation for a
+	 * complete list. Calling a function with an argument that does
+	 * not match the expected type will result in the program terminating.
+	 */
+	const int za_lua_type;
+} zcp_arg_t;
+
+void zcp_parse_args(lua_State *, const char *, const zcp_arg_t *,
+    const zcp_arg_t *);
+int zcp_nvlist_to_lua(lua_State *, nvlist_t *, char *, int);
+int zcp_dataset_hold_error(lua_State *, dsl_pool_t *, const char *, int);
+struct dsl_dataset *zcp_dataset_hold(lua_State *, dsl_pool_t *,
+    const char *, void *);
+
+typedef int (zcp_lib_func_t)(lua_State *);
+typedef struct zcp_lib_info {
+	const char *name;
+	zcp_lib_func_t *func;
+	const zcp_arg_t pargs[4];
+	const zcp_arg_t kwargs[2];
+} zcp_lib_info_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZCP_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zcp_global.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zcp_global.h new file mode 100644 index 000000000000..e227f2f4b7f5 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zcp_global.h @@ -0,0 +1,35 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2016 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_ZCP_GLOBALS_H
+#define _SYS_ZCP_GLOBALS_H
+
+#include "lua.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void zcp_load_globals(lua_State *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZCP_GLOBALS_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zcp_iter.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zcp_iter.h new file mode 100644 index 000000000000..a021e1ce8917 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zcp_iter.h @@ -0,0 +1,41 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2016 by Delphix. All rights reserved. + */ + +#ifndef _SYS_ZCP_LIST_H +#define _SYS_ZCP_LIST_H + +#include "lua.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void zcp_load_list_funcs(lua_State *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZCP_LIST_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zcp_prop.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zcp_prop.h new file mode 100644 index 000000000000..97b17619565c --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zcp_prop.h @@ -0,0 +1,34 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2016 by Delphix. All rights reserved. + */ + +#ifndef _SYS_ZCP_PROP_H +#define _SYS_ZCP_PROP_H + +#ifdef __cplusplus +extern "C" { +#endif + +int zcp_load_get_lib(lua_State *state); +boolean_t prop_valid_for_ds(dsl_dataset_t *ds, zfs_prop_t zfs_prop); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZCP_PROP_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfeature.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfeature.h new file mode 100644 index 000000000000..5abde149a615 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfeature.h @@ -0,0 +1,73 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2013 by Delphix. All rights reserved. 
+ */ + +#ifndef _SYS_ZFEATURE_H +#define _SYS_ZFEATURE_H + +#include <sys/nvpair.h> +#include <sys/txg.h> +#include "zfeature_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define VALID_FEATURE_FID(fid) ((fid) >= 0 && (fid) < SPA_FEATURES) +#define VALID_FEATURE_OR_NONE(fid) ((fid) == SPA_FEATURE_NONE || \ + VALID_FEATURE_FID(fid)) + +struct spa; +struct dmu_tx; +struct objset; + +extern void spa_feature_create_zap_objects(struct spa *, struct dmu_tx *); +extern void spa_feature_enable(struct spa *, spa_feature_t, + struct dmu_tx *); +extern void spa_feature_incr(struct spa *, spa_feature_t, struct dmu_tx *); +extern void spa_feature_decr(struct spa *, spa_feature_t, struct dmu_tx *); +extern boolean_t spa_feature_is_enabled(struct spa *, spa_feature_t); +extern boolean_t spa_feature_is_active(struct spa *, spa_feature_t); +extern boolean_t spa_feature_enabled_txg(spa_t *spa, spa_feature_t fid, + uint64_t *txg); +extern uint64_t spa_feature_refcount(spa_t *, spa_feature_t, uint64_t); +extern boolean_t spa_features_check(spa_t *, boolean_t, nvlist_t *, nvlist_t *); + +/* + * These functions are only exported for zhack and zdb; normal callers should + * use the above interfaces. + */ +extern int feature_get_refcount(struct spa *, zfeature_info_t *, uint64_t *); +extern int feature_get_refcount_from_disk(spa_t *spa, zfeature_info_t *feature, + uint64_t *res); +extern void feature_enable_sync(struct spa *, zfeature_info_t *, + struct dmu_tx *); +extern void feature_sync(struct spa *, zfeature_info_t *, uint64_t, + struct dmu_tx *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZFEATURE_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h new file mode 100644 index 000000000000..48f55a03f19f --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h @@ -0,0 +1,245 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#ifndef _SYS_FS_ZFS_ACL_H +#define _SYS_FS_ZFS_ACL_H + +#ifdef _KERNEL +#include <sys/cred.h> +#endif +#include <sys/acl.h> +#include <sys/dmu.h> +#include <sys/zfs_fuid.h> +#include <sys/sa.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct znode_phys; + +#define ACE_SLOT_CNT 6 +#define ZFS_ACL_VERSION_INITIAL 0ULL +#define ZFS_ACL_VERSION_FUID 1ULL +#define ZFS_ACL_VERSION ZFS_ACL_VERSION_FUID + +/* + * ZFS ACLs (Access Control Lists) are stored in various forms. + * + * Files created with ACL version ZFS_ACL_VERSION_INITIAL + * will all be created with fixed length ACEs of type + * zfs_oldace_t. 
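Stepping back to the feature-flag interfaces declared in zfeature.h above, the intended discipline is: check that a feature is enabled on the pool, then bump its on-disk reference count for each use and drop it with a matching decr. A sketch follows (SPA_FEATURE_SOME_FEATURE is a placeholder, not a real feature id):

static void
example_use_feature(spa_t *spa, dmu_tx_t *tx)
{
	if (!spa_feature_is_enabled(spa, SPA_FEATURE_SOME_FEATURE))
		return;

	/*
	 * The feature reads as "active" while its refcount is nonzero;
	 * a matching spa_feature_decr() drops it when the last user
	 * goes away.
	 */
	spa_feature_incr(spa, SPA_FEATURE_SOME_FEATURE, tx);
}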
+ *
+ * Files with ACL version ZFS_ACL_VERSION_FUID will be created
+ * with various sized ACEs. The abstraction entries will utilize
+ * zfs_ace_hdr_t, normal user/group entries will use zfs_ace_t
+ * and some specialized CIFS ACEs will use zfs_object_ace_t.
+ */
+
+/*
+ * All ACEs have a common hdr. For
+ * owner@, group@, and everyone@ this is all
+ * that's needed.
+ */
+typedef struct zfs_ace_hdr {
+	uint16_t z_type;
+	uint16_t z_flags;
+	uint32_t z_access_mask;
+} zfs_ace_hdr_t;
+
+typedef zfs_ace_hdr_t zfs_ace_abstract_t;
+
+/*
+ * Standard ACE
+ */
+typedef struct zfs_ace {
+	zfs_ace_hdr_t	z_hdr;
+	uint64_t	z_fuid;
+} zfs_ace_t;
+
+/*
+ * The following type only applies to ACE_ACCESS_ALLOWED|DENIED_OBJECT_ACE_TYPE
+ * and will only be set/retrieved in a CIFS context.
+ */
+
+typedef struct zfs_object_ace {
+	zfs_ace_t	z_ace;
+	uint8_t		z_object_type[16];	/* object type */
+	uint8_t		z_inherit_type[16];	/* inherited object type */
+} zfs_object_ace_t;
+
+typedef struct zfs_oldace {
+	uint32_t	z_fuid;		/* "who" */
+	uint32_t	z_access_mask;	/* access mask */
+	uint16_t	z_flags;	/* flags, i.e. inheritance */
+	uint16_t	z_type;		/* type of entry allow/deny */
+} zfs_oldace_t;
+
+typedef struct zfs_acl_phys_v0 {
+	uint64_t	z_acl_extern_obj;	/* ext acl pieces */
+	uint32_t	z_acl_count;		/* Number of ACEs */
+	uint16_t	z_acl_version;		/* acl version */
+	uint16_t	z_acl_pad;		/* pad */
+	zfs_oldace_t	z_ace_data[ACE_SLOT_CNT];	/* 6 standard ACEs */
+} zfs_acl_phys_v0_t;
+
+#define	ZFS_ACE_SPACE	(sizeof (zfs_oldace_t) * ACE_SLOT_CNT)
+
+/*
+ * Size of ACL count is always 2 bytes.
+ * Necessary for dealing with both V0 ACL and V1 ACL layout
+ */
+#define	ZFS_ACL_COUNT_SIZE	(sizeof (uint16_t))
+
+typedef struct zfs_acl_phys {
+	uint64_t	z_acl_extern_obj;	/* ext acl pieces */
+	uint32_t	z_acl_size;		/* Number of bytes in ACL */
+	uint16_t	z_acl_version;		/* acl version */
+	uint16_t	z_acl_count;		/* ace count */
+	uint8_t		z_ace_data[ZFS_ACE_SPACE];	/* space for embedded ACEs */
+} zfs_acl_phys_t;
+
+typedef struct acl_ops {
+	uint32_t	(*ace_mask_get) (void *acep);	/* get access mask */
+	void		(*ace_mask_set) (void *acep,
+			    uint32_t mask);	/* set access mask */
+	uint16_t	(*ace_flags_get) (void *acep);	/* get flags */
+	void		(*ace_flags_set) (void *acep,
+			    uint16_t flags);	/* set flags */
+	uint16_t	(*ace_type_get)(void *acep);	/* get type */
+	void		(*ace_type_set)(void *acep,
+			    uint16_t type);	/* set type */
+	uint64_t	(*ace_who_get)(void *acep);	/* get who/fuid */
+	void		(*ace_who_set)(void *acep,
+			    uint64_t who);	/* set who/fuid */
+	size_t		(*ace_size)(void *acep);	/* how big is this ace */
+	size_t		(*ace_abstract_size)(void);	/* sizeof abstract entry */
+	int		(*ace_mask_off)(void);	/* off of access mask in ace */
+	/* ptr to data if any */
+	int		(*ace_data)(void *acep, void **datap);
+} acl_ops_t;
+
+/*
+ * A zfs_acl_t structure is composed of a list of zfs_acl_node_t's.
+ * Each node will have one or more ACEs associated with it. You will
+ * only have multiple nodes during a chmod operation. Normally only
+ * one node is required.
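The acl_ops_t function table above lets callers read an ACE without knowing its on-disk version; a minimal sketch (helper name invented; assumes an initialized zfs_acl_t, whose z_ops member appears just below):

static uint32_t
example_ace_mask(zfs_acl_t *aclp, void *acep)
{
	/*
	 * Works for both zfs_oldace_t (version 0) and zfs_ace_t (FUID
	 * version) entries; the vtable hides the layout difference.
	 */
	return (aclp->z_ops.ace_mask_get(acep));
}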
+ */
+typedef struct zfs_acl_node {
+	list_node_t	z_next;		/* Next chunk of ACEs */
+	void		*z_acldata;	/* pointer into actual ACE(s) */
+	void		*z_allocdata;	/* pointer to kmem allocated memory */
+	size_t		z_allocsize;	/* Size of blob in bytes */
+	size_t		z_size;		/* length of ACL data */
+	uint64_t	z_ace_count;	/* number of ACEs in this acl node */
+	int		z_ace_idx;	/* ace iterator positioned on */
+} zfs_acl_node_t;
+
+typedef struct zfs_acl {
+	uint64_t	z_acl_count;	/* Number of ACEs */
+	size_t		z_acl_bytes;	/* Number of bytes in ACL */
+	uint_t		z_version;	/* version of ACL */
+	void		*z_next_ace;	/* pointer to next ACE */
+	uint64_t	z_hints;	/* ACL hints (ZFS_INHERIT_ACE ...) */
+	zfs_acl_node_t	*z_curr_node;	/* current node iterator is handling */
+	list_t		z_acl;		/* chunks of ACE data */
+	acl_ops_t	z_ops;		/* ACL operations */
+} zfs_acl_t;
+
+typedef struct acl_locator_cb {
+	zfs_acl_t	*cb_aclp;
+	zfs_acl_node_t	*cb_acl_node;
+} zfs_acl_locator_cb_t;
+
+#define	ACL_DATA_ALLOCED	0x1
+#define	ZFS_ACL_SIZE(aclcnt)	(sizeof (ace_t) * (aclcnt))
+
+struct zfs_fuid_info;
+
+typedef struct zfs_acl_ids {
+	uint64_t	z_fuid;		/* file owner fuid */
+	uint64_t	z_fgid;		/* file group owner fuid */
+	uint64_t	z_mode;		/* mode to set on create */
+	zfs_acl_t	*z_aclp;	/* ACL to create with file */
+	struct zfs_fuid_info	*z_fuidp;	/* for tracking fuids for log */
+} zfs_acl_ids_t;
+
+/*
+ * Property values for acl_mode and acl_inherit.
+ *
+ * acl_mode can take discard, noallow, groupmask, and passthrough,
+ * whereas acl_inherit has restricted instead of groupmask.
+ */
+
+#define	ZFS_ACL_DISCARD		0
+#define	ZFS_ACL_NOALLOW		1
+#define	ZFS_ACL_GROUPMASK	2
+#define	ZFS_ACL_PASSTHROUGH	3
+#define	ZFS_ACL_RESTRICTED	4
+#define	ZFS_ACL_PASSTHROUGH_X	5
+
+struct znode;
+struct zfsvfs;
+
+#ifdef _KERNEL
+int zfs_acl_ids_create(struct znode *, int, vattr_t *,
+    cred_t *, vsecattr_t *, zfs_acl_ids_t *);
+void zfs_acl_ids_free(zfs_acl_ids_t *);
+boolean_t zfs_acl_ids_overquota(struct zfsvfs *, zfs_acl_ids_t *);
+int zfs_getacl(struct znode *, vsecattr_t *, boolean_t, cred_t *);
+int zfs_setacl(struct znode *, vsecattr_t *, boolean_t, cred_t *);
+void zfs_acl_rele(void *);
+void zfs_oldace_byteswap(ace_t *, int);
+void zfs_ace_byteswap(void *, size_t, boolean_t);
+extern boolean_t zfs_has_access(struct znode *zp, cred_t *cr);
+extern int zfs_zaccess(struct znode *, int, int, boolean_t, cred_t *);
+int zfs_fastaccesschk_execute(struct znode *, cred_t *);
+extern int zfs_zaccess_rwx(struct znode *, mode_t, int, cred_t *);
+extern int zfs_zaccess_unix(struct znode *, mode_t, cred_t *);
+extern int zfs_acl_access(struct znode *, int, cred_t *);
+int zfs_acl_chmod_setattr(struct znode *, zfs_acl_t **, uint64_t);
+int zfs_zaccess_delete(struct znode *, struct znode *, cred_t *);
+int zfs_zaccess_rename(struct znode *, struct znode *,
+    struct znode *, struct znode *, cred_t *cr);
+void zfs_acl_free(zfs_acl_t *);
+int zfs_vsec_2_aclp(struct zfsvfs *, vtype_t, vsecattr_t *, cred_t *,
+    struct zfs_fuid_info **, zfs_acl_t **);
+int zfs_aclset_common(struct znode *, zfs_acl_t *, cred_t *, dmu_tx_t *);
+uint64_t zfs_external_acl(struct znode *);
+int zfs_znode_acl_version(struct znode *);
+int zfs_acl_size(struct znode *, int *);
+zfs_acl_t *zfs_acl_alloc(int);
+zfs_acl_node_t *zfs_acl_node_alloc(size_t);
+void zfs_acl_xform(struct znode *, zfs_acl_t *, cred_t *);
+void zfs_acl_data_locator(void **, uint32_t *, uint32_t, boolean_t, void *);
+uint64_t zfs_mode_compute(uint64_t, zfs_acl_t *,
+    uint64_t *, uint64_t, uint64_t);
+int
zfs_acl_chown_setattr(struct znode *); + +#endif + +#ifdef __cplusplus +} +#endif +#endif /* _SYS_FS_ZFS_ACL_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h new file mode 100644 index 000000000000..a3c0e4c31d0d --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h @@ -0,0 +1,145 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2013 by Delphix. All rights reserved. + */ + +#ifndef _SYS_ZFS_CONTEXT_H +#define _SYS_ZFS_CONTEXT_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/param.h> +#include <sys/stdint.h> +#include <sys/note.h> +#include <sys/kernel.h> +#include <sys/debug.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <sys/sysmacros.h> +#include <sys/bitmap.h> +#include <sys/cmn_err.h> +#include <sys/kmem.h> +#include <sys/taskq.h> +#include <sys/taskqueue.h> +#include <sys/systm.h> +#include <sys/conf.h> +#include <sys/mutex.h> +#include <sys/rwlock.h> +#include <sys/kcondvar.h> +#include <sys/random.h> +#include <sys/byteorder.h> +#include <sys/systm.h> +#include <sys/list.h> +#include <sys/zfs_debug.h> +#include <sys/sysevent.h> +#include <sys/uio.h> +#include <sys/dirent.h> +#include <sys/time.h> +#include <sys/uio.h> +#include <sys/fcntl.h> +#include <sys/limits.h> +#include <sys/string.h> +#include <sys/bio.h> +#include <sys/buf.h> +#include <sys/cred.h> +#include <sys/sdt.h> +#include <sys/file.h> +#include <sys/vfs.h> +#include <sys/sysctl.h> +#include <sys/sbuf.h> +#include <sys/priv.h> +#include <sys/kdb.h> +#include <sys/ktr.h> +#include <sys/stack.h> +#include <sys/lockf.h> +#include <sys/pathname.h> +#include <sys/policy.h> +#include <sys/refstr.h> +#include <sys/zone.h> +#include <sys/eventhandler.h> +#include <sys/extattr.h> +#include <sys/misc.h> +#include <sys/sig.h> +#include <sys/osd.h> +#include <sys/sysevent/dev.h> +#include <sys/sysevent/eventdefs.h> +#include <sys/u8_textprep.h> +#include <sys/fm/util.h> +#include <sys/sunddi.h> +#ifdef illumos +#include <sys/cyclic.h> +#endif +#include <sys/callo.h> +#include <sys/disp.h> +#include <machine/stdarg.h> + +#include <vm/vm.h> +#include <vm/vm_page.h> +#include <vm/vm_object.h> +#include <vm/vm_kern.h> +#include <vm/vm_map.h> +#include <vm/vm_extern.h> +#include <vm/vnode_pager.h> + +#define boot_ncpus (mp_ncpus) + +#define CPU_SEQID (curcpu) + +#define tsd_create(keyp, destructor) do { \ + *(keyp) = osd_thread_register((destructor)); \ + KASSERT(*(keyp) > 0, ("cannot register OSD")); \ +} while 
(0) +#define tsd_destroy(keyp) osd_thread_deregister(*(keyp)) +#define tsd_get(key) osd_thread_get(curthread, (key)) +#define tsd_set(key, value) osd_thread_set(curthread, (key), (value)) + +#ifdef __cplusplus +} +#endif + +extern int zfs_debug_level; +extern struct mtx zfs_debug_mtx; +#define ZFS_LOG(lvl, ...) do { \ + if (((lvl) & 0xff) <= zfs_debug_level) { \ + mtx_lock(&zfs_debug_mtx); \ + printf("%s:%u[%d]: ", __func__, __LINE__, (lvl)); \ + printf(__VA_ARGS__); \ + printf("\n"); \ + if ((lvl) & 0x100) \ + kdb_backtrace(); \ + mtx_unlock(&zfs_debug_mtx); \ + } \ +} while (0) + +#define sys_shutdown rebooting + +#define noinline __attribute__((noinline)) +#define likely(x) __builtin_expect((x), 1) + +#endif /* _SYS_ZFS_CONTEXT_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ctldir.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ctldir.h new file mode 100644 index 000000000000..de770c52add0 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ctldir.h @@ -0,0 +1,65 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#ifndef _ZFS_CTLDIR_H +#define _ZFS_CTLDIR_H + +#include <sys/vnode.h> +#include <sys/zfs_vfsops.h> +#include <sys/zfs_znode.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define ZFS_CTLDIR_NAME ".zfs" + +#define zfs_has_ctldir(zdp) \ + ((zdp)->z_id == (zdp)->z_zfsvfs->z_root && \ + ((zdp)->z_zfsvfs->z_ctldir != NULL)) +#define zfs_show_ctldir(zdp) \ + (zfs_has_ctldir(zdp) && \ + ((zdp)->z_zfsvfs->z_show_ctldir)) + +void zfsctl_create(zfsvfs_t *); +void zfsctl_destroy(zfsvfs_t *); +int zfsctl_root(zfsvfs_t *, int, vnode_t **); +void zfsctl_init(void); +void zfsctl_fini(void); +boolean_t zfsctl_is_node(vnode_t *); + +int zfsctl_rename_snapshot(const char *from, const char *to); +int zfsctl_destroy_snapshot(const char *snapname, int force); +int zfsctl_umount_snapshots(vfs_t *, int, cred_t *); + +int zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp); + +#define ZFSCTL_INO_ROOT 0x1 +#define ZFSCTL_INO_SNAPDIR 0x2 + +#ifdef __cplusplus +} +#endif + +#endif /* _ZFS_CTLDIR_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h new file mode 100644 index 000000000000..9cbfc26b64e2 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h @@ -0,0 +1,99 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
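The ZFS_LOG macro defined in zfs_context.h above compares (lvl & 0xff) against the zfs_debug_level tunable and serializes its printf under zfs_debug_mtx; a short usage sketch (message text and helper name invented):

static void
example_log(const char *poolname, uint64_t txg)
{
	/* Emitted only when (lvl & 0xff) <= zfs_debug_level. */
	ZFS_LOG(1, "pool %s: txg %ju synced", poolname, (uintmax_t)txg);

	/* OR-ing in 0x100 additionally prints a kernel backtrace. */
	ZFS_LOG(0x100 | 1, "pool %s: unexpected state", poolname);
}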
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + */ + +#ifndef _SYS_ZFS_DEBUG_H +#define _SYS_ZFS_DEBUG_H + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef TRUE +#define TRUE 1 +#endif + +#ifndef FALSE +#define FALSE 0 +#endif + +/* + * ZFS debugging + */ + +#if defined(DEBUG) || !defined(_KERNEL) +#if !defined(ZFS_DEBUG) +#define ZFS_DEBUG +#endif +#endif + +extern int zfs_flags; +extern boolean_t zfs_recover; +extern boolean_t zfs_free_leak_on_eio; + +#define ZFS_DEBUG_DPRINTF (1 << 0) +#define ZFS_DEBUG_DBUF_VERIFY (1 << 1) +#define ZFS_DEBUG_DNODE_VERIFY (1 << 2) +#define ZFS_DEBUG_SNAPNAMES (1 << 3) +#define ZFS_DEBUG_MODIFY (1 << 4) +/* 1<<5 was previously used, try not to reuse */ +#define ZFS_DEBUG_ZIO_FREE (1 << 6) +#define ZFS_DEBUG_HISTOGRAM_VERIFY (1 << 7) +#define ZFS_DEBUG_METASLAB_VERIFY (1 << 8) +#define ZFS_DEBUG_INDIRECT_REMAP (1 << 9) + +#ifdef ZFS_DEBUG +extern void __dprintf(const char *file, const char *func, + int line, const char *fmt, ...); +#define dprintf(...) \ + if (zfs_flags & ZFS_DEBUG_DPRINTF) \ + __dprintf(__FILE__, __func__, __LINE__, __VA_ARGS__) +#else +#define dprintf(...) ((void)0) +#endif /* ZFS_DEBUG */ + +extern void zfs_panic_recover(const char *fmt, ...); + +typedef struct zfs_dbgmsg { + list_node_t zdm_node; + time_t zdm_timestamp; + char zdm_msg[1]; /* variable length allocation */ +} zfs_dbgmsg_t; + +extern void zfs_dbgmsg_init(void); +extern void zfs_dbgmsg_fini(void); +extern void zfs_dbgmsg(const char *fmt, ...); +extern void zfs_dbgmsg_print(const char *tag); + +#ifdef illumos +#ifndef _KERNEL +extern int dprintf_find_string(const char *string); +#endif +#endif /* illumos */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZFS_DEBUG_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h new file mode 100644 index 000000000000..22d8e603c433 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h @@ -0,0 +1,74 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_FS_ZFS_DIR_H +#define _SYS_FS_ZFS_DIR_H + +#include <sys/pathname.h> +#include <sys/dmu.h> +#include <sys/zfs_znode.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* zfs_dirent_lock() flags */ +#define ZNEW 0x0001 /* entry should not exist */ +#define ZEXISTS 0x0002 /* entry should exist */ +#define ZSHARED 0x0004 /* shared access (zfs_dirlook()) */ +#define ZXATTR 0x0008 /* we want the xattr dir */ +#define ZRENAMING 0x0010 /* znode is being renamed */ +#define ZCILOOK 0x0020 /* case-insensitive lookup requested */ +#define ZCIEXACT 0x0040 /* c-i requires c-s match (rename) */ +#define ZHAVELOCK 0x0080 /* z_name_lock is already held */ + +/* mknode flags */ +#define IS_ROOT_NODE 0x01 /* create a root node */ +#define IS_XATTR 0x02 /* create an extended attribute node */ + +extern int zfs_dirent_lookup(znode_t *, const char *, znode_t **, int); +extern int zfs_link_create(znode_t *, const char *, znode_t *, dmu_tx_t *, int); +extern int zfs_link_destroy(znode_t *, const char *, znode_t *, dmu_tx_t *, int, + boolean_t *); +#if 0 +extern int zfs_dirlook(vnode_t *, const char *, vnode_t **, int); +#else +extern int zfs_dirlook(znode_t *, const char *name, znode_t **); +#endif +extern void zfs_mknode(znode_t *, vattr_t *, dmu_tx_t *, cred_t *, + uint_t, znode_t **, zfs_acl_ids_t *); +extern void zfs_rmnode(znode_t *); +extern boolean_t zfs_dirempty(znode_t *); +extern void zfs_unlinked_add(znode_t *, dmu_tx_t *); +extern void zfs_unlinked_drain(zfsvfs_t *zfsvfs); +extern int zfs_sticky_remove_access(znode_t *, znode_t *, cred_t *cr); +extern int zfs_get_xattrdir(znode_t *, vnode_t **, cred_t *, int); +extern int zfs_make_xattrdir(znode_t *, vattr_t *, vnode_t **, cred_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FS_ZFS_DIR_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_fuid.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_fuid.h new file mode 100644 index 000000000000..b381bb98e734 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_fuid.h @@ -0,0 +1,132 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */
+
+#ifndef _SYS_FS_ZFS_FUID_H
+#define _SYS_FS_ZFS_FUID_H
+
+#include <sys/types.h>
+#ifdef _KERNEL
+#include <sys/kidmap.h>
+#include <sys/dmu.h>
+#include <sys/zfs_vfsops.h>
+#endif
+#include <sys/avl.h>
+#include <sys/list.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef enum {
+ ZFS_OWNER,
+ ZFS_GROUP,
+ ZFS_ACE_USER,
+ ZFS_ACE_GROUP
+} zfs_fuid_type_t;
+
+/*
+ * Estimate space needed for one more fuid table entry.
+ * For now, assume it is the current size + 1K.
+ */
+#define FUID_SIZE_ESTIMATE(z) ((z)->z_fuid_size + (SPA_MINBLOCKSIZE << 1))
+
+#define FUID_INDEX(x) ((x) >> 32)
+#define FUID_RID(x) ((x) & 0xffffffff)
+#define FUID_ENCODE(idx, rid) (((uint64_t)(idx) << 32) | (rid))
+/*
+ * FUIDs cause problems for the intent log: we need to replay
+ * the creation of the FUID, but we can't count on the idmapper
+ * to be around, and during replay the FUID index may be different
+ * than before. Also, if an ACL has 100 ACEs and 12 different
+ * domains we don't want to log 100 domain strings, but rather
+ * just the unique 12.
+ */
+
+/*
+ * The FUIDs in the log will index into the
+ * domain string table and the bottom half will be the rid.
+ * Used for mapping ephemeral uid/gid during ACL setting to FUIDs.
+ */
+typedef struct zfs_fuid {
+ list_node_t z_next;
+ uint64_t z_id; /* uid/gid being converted to fuid */
+ uint64_t z_domidx; /* index in AVL domain table */
+ uint64_t z_logfuid; /* index for domain in log */
+} zfs_fuid_t;
+
+/* list of unique domains */
+typedef struct zfs_fuid_domain {
+ list_node_t z_next;
+ uint64_t z_domidx; /* AVL tree idx */
+ const char *z_domain; /* domain string */
+} zfs_fuid_domain_t;
+
+/*
+ * FUID information necessary for logging create, setattr, and setacl.
+ */
+typedef struct zfs_fuid_info {
+ list_t z_fuids;
+ list_t z_domains;
+ uint64_t z_fuid_owner;
+ uint64_t z_fuid_group;
+ char **z_domain_table; /* Used during replay */
+ uint32_t z_fuid_cnt; /* How many fuids in z_fuids */
+ uint32_t z_domain_cnt; /* How many domains */
+ size_t z_domain_str_sz; /* len of domain strings in z_domains list */
+} zfs_fuid_info_t;
+
+#ifdef _KERNEL
+struct znode;
+extern uid_t zfs_fuid_map_id(zfsvfs_t *, uint64_t, cred_t *, zfs_fuid_type_t);
+extern void zfs_fuid_node_add(zfs_fuid_info_t **, const char *, uint32_t,
+ uint64_t, uint64_t, zfs_fuid_type_t);
+extern void zfs_fuid_destroy(zfsvfs_t *);
+extern uint64_t zfs_fuid_create_cred(zfsvfs_t *, zfs_fuid_type_t,
+ cred_t *, zfs_fuid_info_t **);
+extern uint64_t zfs_fuid_create(zfsvfs_t *, uint64_t, cred_t *, zfs_fuid_type_t,
+ zfs_fuid_info_t **);
+extern void zfs_fuid_map_ids(struct znode *zp, cred_t *cr,
+ uid_t *uid, uid_t *gid);
+extern zfs_fuid_info_t *zfs_fuid_info_alloc(void);
+extern void zfs_fuid_info_free(zfs_fuid_info_t *);
+extern boolean_t zfs_groupmember(zfsvfs_t *, uint64_t, cred_t *);
+void zfs_fuid_sync(zfsvfs_t *, dmu_tx_t *);
+extern int zfs_fuid_find_by_domain(zfsvfs_t *, const char *domain,
+ char **retdomain, boolean_t addok);
+extern const char *zfs_fuid_find_by_idx(zfsvfs_t *zfsvfs, uint32_t idx);
+extern void zfs_fuid_txhold(zfsvfs_t *zfsvfs, dmu_tx_t *tx);
+#endif
+
+char *zfs_fuid_idx_domain(avl_tree_t *, uint32_t);
+void zfs_fuid_avl_tree_create(avl_tree_t *, avl_tree_t *);
+uint64_t zfs_fuid_table_load(objset_t *, uint64_t, avl_tree_t *, avl_tree_t *);
+void zfs_fuid_table_destroy(avl_tree_t *, avl_tree_t *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_FS_ZFS_FUID_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h
b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h new file mode 100644 index 000000000000..7cd294316984 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h @@ -0,0 +1,466 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011-2012 Pawel Jakub Dawidek. All rights reserved. + * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright 2016 RackTop Systems. + * Copyright (c) 2014 Integros [integros.com] + */ + +#ifndef _SYS_ZFS_IOCTL_H +#define _SYS_ZFS_IOCTL_H + +#include <sys/cred.h> +#include <sys/dmu.h> +#include <sys/zio.h> +#include <sys/dsl_deleg.h> +#include <sys/spa.h> +#include <sys/zfs_stat.h> + +#ifdef _KERNEL +#include <sys/nvpair.h> +#endif /* _KERNEL */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * The structures in this file are passed between userland and the + * kernel. Userland may be running a 32-bit process, while the kernel + * is 64-bit. Therefore, these structures need to compile the same in + * 32-bit and 64-bit. This means not using type "long", and adding + * explicit padding so that the 32-bit structure will not be packed more + * tightly than the 64-bit structure (which requires 64-bit alignment). + */ + +/* + * Property values for snapdir + */ +#define ZFS_SNAPDIR_HIDDEN 0 +#define ZFS_SNAPDIR_VISIBLE 1 + +/* + * Field manipulation macros for the drr_versioninfo field of the + * send stream header. + */ + +/* + * Header types for zfs send streams. 
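+ *
+ * Illustrative sketch (an editorial addition, not from the original
+ * source): given the drr_versioninfo word "vi" from a stream's
+ * drr_begin record, the header type and feature flags are recovered
+ * with the accessors defined just below, e.g.
+ *
+ *	if (DMU_GET_STREAM_HDRTYPE(vi) == DMU_COMPOUNDSTREAM)
+ *		featureflags = DMU_GET_FEATUREFLAGS(vi);
+ *
+ * where "vi" and "featureflags" are hypothetical locals.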
+ */ +typedef enum drr_headertype { + DMU_SUBSTREAM = 0x1, + DMU_COMPOUNDSTREAM = 0x2 +} drr_headertype_t; + +#define DMU_GET_STREAM_HDRTYPE(vi) BF64_GET((vi), 0, 2) +#define DMU_SET_STREAM_HDRTYPE(vi, x) BF64_SET((vi), 0, 2, x) + +#define DMU_GET_FEATUREFLAGS(vi) BF64_GET((vi), 2, 30) +#define DMU_SET_FEATUREFLAGS(vi, x) BF64_SET((vi), 2, 30, x) + +/* + * Feature flags for zfs send streams (flags in drr_versioninfo) + */ + +#define DMU_BACKUP_FEATURE_DEDUP (1 << 0) +#define DMU_BACKUP_FEATURE_DEDUPPROPS (1 << 1) +#define DMU_BACKUP_FEATURE_SA_SPILL (1 << 2) +/* flags #3 - #15 are reserved for incompatible closed-source implementations */ +#define DMU_BACKUP_FEATURE_EMBED_DATA (1 << 16) +#define DMU_BACKUP_FEATURE_LZ4 (1 << 17) +/* flag #18 is reserved for a Delphix feature */ +#define DMU_BACKUP_FEATURE_LARGE_BLOCKS (1 << 19) +#define DMU_BACKUP_FEATURE_RESUMING (1 << 20) +/* flag #21 is reserved for a Delphix feature */ +#define DMU_BACKUP_FEATURE_COMPRESSED (1 << 22) +#define DMU_BACKUP_FEATURE_LARGE_DNODE (1 << 23) +/* flag #24 is reserved for the raw send (encryption) feature */ +/* flag #25 is reserved for the ZSTD compression feature */ + +/* + * Mask of all supported backup features + */ +#define DMU_BACKUP_FEATURE_MASK (DMU_BACKUP_FEATURE_DEDUP | \ + DMU_BACKUP_FEATURE_DEDUPPROPS | DMU_BACKUP_FEATURE_SA_SPILL | \ + DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_LZ4 | \ + DMU_BACKUP_FEATURE_RESUMING | \ + DMU_BACKUP_FEATURE_LARGE_BLOCKS | DMU_BACKUP_FEATURE_LARGE_DNODE | \ + DMU_BACKUP_FEATURE_COMPRESSED) + +/* Are all features in the given flag word currently supported? */ +#define DMU_STREAM_SUPPORTED(x) (!((x) & ~DMU_BACKUP_FEATURE_MASK)) + +typedef enum dmu_send_resume_token_version { + ZFS_SEND_RESUME_TOKEN_VERSION = 1 +} dmu_send_resume_token_version_t; + +/* + * The drr_versioninfo field of the dmu_replay_record has the + * following layout: + * + * 64 56 48 40 32 24 16 8 0 + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * | reserved | feature-flags |C|S| + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * + * The low order two bits indicate the header type: SUBSTREAM (0x1) + * or COMPOUNDSTREAM (0x2). Using two bits for this is historical: + * this field used to be a version number, where the two version types + * were 1 and 2. Using two bits for this allows earlier versions of + * the code to be able to recognize send streams that don't use any + * of the features indicated by feature flags. + */ + +#define DMU_BACKUP_MAGIC 0x2F5bacbacULL + +/* + * Send stream flags. Bits 24-31 are reserved for vendor-specific + * implementations and should not be used. + */ +#define DRR_FLAG_CLONE (1<<0) +#define DRR_FLAG_CI_DATA (1<<1) +/* + * This send stream, if it is a full send, includes the FREE and FREEOBJECT + * records that are created by the sending process. This means that the send + * stream can be received as a clone, even though it is not an incremental. + * This is not implemented as a feature flag, because the receiving side does + * not need to have implemented it to receive this stream; it is fully backwards + * compatible. We need a flag, though, because full send streams without it + * cannot necessarily be received as a clone correctly. 
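+ *
+ * Illustrative sketch (editorial, not from the original source): a
+ * receiver typically rejects streams carrying features it does not
+ * understand before consuming any records, along the lines of
+ *
+ *	if (!DMU_STREAM_SUPPORTED(DMU_GET_FEATUREFLAGS(vi)))
+ *		return (SET_ERROR(ENOTSUP));
+ *
+ * with "vi" a hypothetical local; this check is what
+ * DMU_BACKUP_FEATURE_MASK above exists to support.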
+ */
+#define DRR_FLAG_FREERECORDS (1<<2)
+
+/*
+ * flags in the drr_checksumflags field in the DRR_WRITE and
+ * DRR_WRITE_BYREF blocks
+ */
+#define DRR_CHECKSUM_DEDUP (1<<0)
+
+#define DRR_IS_DEDUP_CAPABLE(flags) ((flags) & DRR_CHECKSUM_DEDUP)
+
+/* deal with compressed drr_write replay records */
+#define DRR_WRITE_COMPRESSED(drrw) ((drrw)->drr_compressiontype != 0)
+#define DRR_WRITE_PAYLOAD_SIZE(drrw) \
+ (DRR_WRITE_COMPRESSED(drrw) ? (drrw)->drr_compressed_size : \
+ (drrw)->drr_logical_size)
+
+typedef struct dmu_replay_record {
+ enum {
+ DRR_BEGIN, DRR_OBJECT, DRR_FREEOBJECTS,
+ DRR_WRITE, DRR_FREE, DRR_END, DRR_WRITE_BYREF,
+ DRR_SPILL, DRR_WRITE_EMBEDDED, DRR_NUMTYPES
+ } drr_type;
+ uint32_t drr_payloadlen;
+ union {
+ struct drr_begin {
+ uint64_t drr_magic;
+ uint64_t drr_versioninfo; /* was drr_version */
+ uint64_t drr_creation_time;
+ dmu_objset_type_t drr_type;
+ uint32_t drr_flags;
+ uint64_t drr_toguid;
+ uint64_t drr_fromguid;
+ char drr_toname[MAXNAMELEN];
+ } drr_begin;
+ struct drr_end {
+ zio_cksum_t drr_checksum;
+ uint64_t drr_toguid;
+ } drr_end;
+ struct drr_object {
+ uint64_t drr_object;
+ dmu_object_type_t drr_type;
+ dmu_object_type_t drr_bonustype;
+ uint32_t drr_blksz;
+ uint32_t drr_bonuslen;
+ uint8_t drr_checksumtype;
+ uint8_t drr_compress;
+ uint8_t drr_dn_slots;
+ uint8_t drr_pad[5];
+ uint64_t drr_toguid;
+ /* bonus content follows */
+ } drr_object;
+ struct drr_freeobjects {
+ uint64_t drr_firstobj;
+ uint64_t drr_numobjs;
+ uint64_t drr_toguid;
+ } drr_freeobjects;
+ struct drr_write {
+ uint64_t drr_object;
+ dmu_object_type_t drr_type;
+ uint32_t drr_pad;
+ uint64_t drr_offset;
+ uint64_t drr_logical_size;
+ uint64_t drr_toguid;
+ uint8_t drr_checksumtype;
+ uint8_t drr_checksumflags;
+ uint8_t drr_compressiontype;
+ uint8_t drr_pad2[5];
+ /* deduplication key */
+ ddt_key_t drr_key;
+ /* only nonzero if drr_compressiontype is not 0 */
+ uint64_t drr_compressed_size;
+ /* content follows */
+ } drr_write;
+ struct drr_free {
+ uint64_t drr_object;
+ uint64_t drr_offset;
+ uint64_t drr_length;
+ uint64_t drr_toguid;
+ } drr_free;
+ struct drr_write_byref {
+ /* where to put the data */
+ uint64_t drr_object;
+ uint64_t drr_offset;
+ uint64_t drr_length;
+ uint64_t drr_toguid;
+ /* where to find the prior copy of the data */
+ uint64_t drr_refguid;
+ uint64_t drr_refobject;
+ uint64_t drr_refoffset;
+ /* properties of the data */
+ uint8_t drr_checksumtype;
+ uint8_t drr_checksumflags;
+ uint8_t drr_pad2[6];
+ ddt_key_t drr_key; /* deduplication key */
+ } drr_write_byref;
+ struct drr_spill {
+ uint64_t drr_object;
+ uint64_t drr_length;
+ uint64_t drr_toguid;
+ uint64_t drr_pad[4]; /* needed for crypto */
+ /* spill data follows */
+ } drr_spill;
+ struct drr_write_embedded {
+ uint64_t drr_object;
+ uint64_t drr_offset;
+ /* logical length, should equal blocksize */
+ uint64_t drr_length;
+ uint64_t drr_toguid;
+ uint8_t drr_compression;
+ uint8_t drr_etype;
+ uint8_t drr_pad[6];
+ uint32_t drr_lsize; /* uncompressed size of payload */
+ uint32_t drr_psize; /* compr. (real) size of payload */
+ /* (possibly compressed) content follows */
+ } drr_write_embedded;
+
+ /*
+ * Note: drr_checksum is overlaid with all record types
+ * except DRR_BEGIN. Therefore its (non-pad) members
+ * must not overlap with members from the other structs.
+ * We accomplish this by putting its members at the very
+ * end of the struct.
+ */
+ struct drr_checksum {
+ uint64_t drr_pad[34];
+ /*
+ * fletcher-4 checksum of everything preceding the
+ * checksum.
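+ *
+ * Illustrative sketch (editorial, not from the original
+ * source): a stream writer can maintain this running
+ * checksum incrementally, roughly
+ *
+ *	fletcher_4_incremental_native(drr,
+ *	    offsetof(dmu_replay_record_t,
+ *	    drr_u.drr_checksum.drr_checksum), &zc);
+ *
+ * before storing "zc" here; the names are hypothetical
+ * and the incremental-fletcher signature is assumed.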
+ */ + zio_cksum_t drr_checksum; + } drr_checksum; + } drr_u; +} dmu_replay_record_t; + +/* diff record range types */ +typedef enum diff_type { + DDR_NONE = 0x1, + DDR_INUSE = 0x2, + DDR_FREE = 0x4 +} diff_type_t; + +/* + * The diff reports back ranges of free or in-use objects. + */ +typedef struct dmu_diff_record { + uint64_t ddr_type; + uint64_t ddr_first; + uint64_t ddr_last; +} dmu_diff_record_t; + +typedef struct zinject_record { + uint64_t zi_objset; + uint64_t zi_object; + uint64_t zi_start; + uint64_t zi_end; + uint64_t zi_guid; + uint32_t zi_level; + uint32_t zi_error; + uint64_t zi_type; + uint32_t zi_freq; + uint32_t zi_failfast; + char zi_func[MAXNAMELEN]; + uint32_t zi_iotype; + int32_t zi_duration; + uint64_t zi_timer; + uint64_t zi_nlanes; + uint32_t zi_cmd; + uint32_t zi_pad; +} zinject_record_t; + +#define ZINJECT_NULL 0x1 +#define ZINJECT_FLUSH_ARC 0x2 +#define ZINJECT_UNLOAD_SPA 0x4 + +typedef enum zinject_type { + ZINJECT_UNINITIALIZED, + ZINJECT_DATA_FAULT, + ZINJECT_DEVICE_FAULT, + ZINJECT_LABEL_FAULT, + ZINJECT_IGNORED_WRITES, + ZINJECT_PANIC, + ZINJECT_DELAY_IO, +} zinject_type_t; + +typedef struct zfs_share { + uint64_t z_exportdata; + uint64_t z_sharedata; + uint64_t z_sharetype; /* 0 = share, 1 = unshare */ + uint64_t z_sharemax; /* max length of share string */ +} zfs_share_t; + +/* + * ZFS file systems may behave the usual, POSIX-compliant way, where + * name lookups are case-sensitive. They may also be set up so that + * all the name lookups are case-insensitive, or so that only some + * lookups, the ones that set an FIGNORECASE flag, are case-insensitive. + */ +typedef enum zfs_case { + ZFS_CASE_SENSITIVE, + ZFS_CASE_INSENSITIVE, + ZFS_CASE_MIXED +} zfs_case_t; + +/* + * Note: this struct must have the same layout in 32-bit and 64-bit, so + * that 32-bit processes (like /sbin/zfs) can pass it to the 64-bit + * kernel. Therefore, we add padding to it so that no "hidden" padding + * is automatically added on 64-bit (but not on 32-bit). + */ +typedef struct zfs_cmd { + char zc_name[MAXPATHLEN]; /* name of pool or dataset */ + uint64_t zc_nvlist_src; /* really (char *) */ + uint64_t zc_nvlist_src_size; + uint64_t zc_nvlist_dst; /* really (char *) */ + uint64_t zc_nvlist_dst_size; + boolean_t zc_nvlist_dst_filled; /* put an nvlist in dst? */ + int zc_pad2; + + /* + * The following members are for legacy ioctls which haven't been + * converted to the new method. 
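+ *
+ * Illustrative sketch (editorial, not from the original source): for
+ * converted ioctls, userland instead packs an nvlist and passes it
+ * through the fixed-size header above, roughly
+ *
+ *	zc.zc_nvlist_src = (uint64_t)(uintptr_t)packed;
+ *	zc.zc_nvlist_src_size = size;
+ *
+ * where "packed"/"size" come from nvlist_pack(); names hypothetical.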
+ */ + uint64_t zc_history; /* really (char *) */ + char zc_value[MAXPATHLEN * 2]; + char zc_string[MAXNAMELEN]; + uint64_t zc_guid; + uint64_t zc_nvlist_conf; /* really (char *) */ + uint64_t zc_nvlist_conf_size; + uint64_t zc_cookie; + uint64_t zc_objset_type; + uint64_t zc_perm_action; + uint64_t zc_history_len; + uint64_t zc_history_offset; + uint64_t zc_obj; + uint64_t zc_iflags; /* internal to zfs(7fs) */ + zfs_share_t zc_share; + uint64_t zc_jailid; + dmu_objset_stats_t zc_objset_stats; + dmu_replay_record_t zc_begin_record; + zinject_record_t zc_inject_record; + uint32_t zc_defer_destroy; + uint32_t zc_flags; + uint64_t zc_action_handle; + int zc_cleanup_fd; + uint8_t zc_simple; + uint8_t zc_pad3[3]; + boolean_t zc_resumable; + uint32_t zc_pad4; + uint64_t zc_sendobj; + uint64_t zc_fromobj; + uint64_t zc_createtxg; + zfs_stat_t zc_stat; +} zfs_cmd_t; + +typedef struct zfs_useracct { + char zu_domain[256]; + uid_t zu_rid; + uint32_t zu_pad; + uint64_t zu_space; +} zfs_useracct_t; + +#define ZFSDEV_MAX_MINOR (1 << 16) +#define ZFS_MIN_MINOR (ZFSDEV_MAX_MINOR + 1) + +#define ZPOOL_EXPORT_AFTER_SPLIT 0x1 + +#ifdef _KERNEL +struct objset; +struct zfsvfs; + +typedef struct zfs_creat { + nvlist_t *zct_zplprops; + nvlist_t *zct_props; +} zfs_creat_t; + +extern int zfs_secpolicy_snapshot_perms(const char *, cred_t *); +extern int zfs_secpolicy_rename_perms(const char *, const char *, cred_t *); +extern int zfs_secpolicy_destroy_perms(const char *, cred_t *); +extern int zfs_busy(void); +extern void zfs_unmount_snap(const char *); +extern void zfs_destroy_unmount_origin(const char *); +#ifdef illumos +extern int getzfsvfs_impl(struct objset *, struct zfsvfs **); +#else +extern int getzfsvfs_impl(struct objset *, vfs_t **); +#endif +extern int getzfsvfs(const char *, struct zfsvfs **); + +/* + * ZFS minor numbers can refer to either a control device instance or + * a zvol. Depending on the value of zss_type, zss_data points to either + * a zvol_state_t or a zfs_onexit_t. + */ +enum zfs_soft_state_type { + ZSST_ZVOL, + ZSST_CTLDEV +}; + +typedef struct zfs_soft_state { + enum zfs_soft_state_type zss_type; + void *zss_data; +} zfs_soft_state_t; + +extern void *zfsdev_get_soft_state(minor_t minor, + enum zfs_soft_state_type which); +extern minor_t zfsdev_minor_alloc(void); + +extern void *zfsdev_state; + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZFS_IOCTL_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_onexit.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_onexit.h new file mode 100644 index 000000000000..4982bd4d0afc --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_onexit.h @@ -0,0 +1,66 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_ZFS_ONEXIT_H
+#define _SYS_ZFS_ONEXIT_H
+
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef _KERNEL
+
+typedef struct zfs_onexit {
+ kmutex_t zo_lock;
+ list_t zo_actions;
+} zfs_onexit_t;
+
+typedef struct zfs_onexit_action_node {
+ list_node_t za_link;
+ void (*za_func)(void *);
+ void *za_data;
+} zfs_onexit_action_node_t;
+
+extern void zfs_onexit_init(zfs_onexit_t **zo);
+extern void zfs_onexit_destroy(zfs_onexit_t *zo);
+
+#endif
+
+extern int zfs_onexit_fd_hold(int fd, minor_t *minorp);
+extern void zfs_onexit_fd_rele(int fd);
+extern int zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data,
+ uint64_t *action_handle);
+extern int zfs_onexit_del_cb(minor_t minor, uint64_t action_handle,
+ boolean_t fire);
+extern int zfs_onexit_cb_data(minor_t minor, uint64_t action_handle,
+ void **data);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZFS_ONEXIT_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_rlock.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_rlock.h new file mode 100644 index 000000000000..93733ba8a2b7 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_rlock.h @@ -0,0 +1,86 @@ +/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_FS_ZFS_RLOCK_H
+#define _SYS_FS_ZFS_RLOCK_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef _KERNEL
+
+#include <sys/zfs_znode.h>
+
+typedef enum {
+ RL_READER,
+ RL_WRITER,
+ RL_APPEND
+} rl_type_t;
+
+typedef struct rl {
+ znode_t *r_zp; /* znode this lock applies to */
+ avl_node_t r_node; /* avl node link */
+ uint64_t r_off; /* file range offset */
+ uint64_t r_len; /* file range length */
+ uint_t r_cnt; /* range reference count in tree */
+ rl_type_t r_type; /* range type */
+ kcondvar_t r_wr_cv; /* cv for waiting writers */
+ kcondvar_t r_rd_cv; /* cv for waiting readers */
+ uint8_t r_proxy; /* acting for original range */
+ uint8_t r_write_wanted; /* writer wants to lock this range */
+ uint8_t r_read_wanted; /* reader wants to lock this range */
+} rl_t;
+
+/*
+ * Lock a range (offset, length) as either shared (RL_READER)
+ * or exclusive (RL_WRITER or RL_APPEND). RL_APPEND is a special type
+ * that is converted to RL_WRITER and locks from the current end of file.
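+ *
+ * Illustrative sketch (editorial, not from the original source): a
+ * typical caller brackets its I/O with
+ *
+ *	rl_t *rl = zfs_range_lock(zp, off, len, RL_READER);
+ *	(read the range)
+ *	zfs_range_unlock(rl);
+ *
+ * where "off" and "len" are hypothetical locals.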
Returns the range lock structure.
+ */
+rl_t *zfs_range_lock(znode_t *zp, uint64_t off, uint64_t len, rl_type_t type);
+
+/* Unlock range and destroy range lock structure. */
+void zfs_range_unlock(rl_t *rl);
+
+/*
+ * Reduce range locked as RL_WRITER from whole file to specified range.
+ * Asserts the whole file was previously locked.
+ */
+void zfs_range_reduce(rl_t *rl, uint64_t off, uint64_t len);
+
+/*
+ * AVL comparison function used to order range locks.
+ * Locks are ordered on the start offset of the range.
+ */
+int zfs_range_compare(const void *arg1, const void *arg2);
+
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_FS_ZFS_RLOCK_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_sa.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_sa.h new file mode 100644 index 000000000000..fc40b0e9517c --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_sa.h @@ -0,0 +1,142 @@ +/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_ZFS_SA_H
+#define _SYS_ZFS_SA_H
+
+#ifdef _KERNEL
+#include <sys/list.h>
+#include <sys/dmu.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_znode.h>
+#include <sys/sa.h>
+#include <sys/zil.h>
+
+
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * This is the list of attributes known to the ZPL.
+ * The values of the actual attributes are not defined
+ * by the order of the enums; they are controlled by the
+ * attribute registration mechanism. Two different file
+ * systems could have different numeric values for the same
+ * attributes. This list is only used for dereferencing
+ * into the table that will hold the actual numeric value.
+ */
+typedef enum zpl_attr {
+ ZPL_ATIME,
+ ZPL_MTIME,
+ ZPL_CTIME,
+ ZPL_CRTIME,
+ ZPL_GEN,
+ ZPL_MODE,
+ ZPL_SIZE,
+ ZPL_PARENT,
+ ZPL_LINKS,
+ ZPL_XATTR,
+ ZPL_RDEV,
+ ZPL_FLAGS,
+ ZPL_UID,
+ ZPL_GID,
+ ZPL_PAD,
+ ZPL_ZNODE_ACL,
+ ZPL_DACL_COUNT,
+ ZPL_SYMLINK,
+ ZPL_SCANSTAMP,
+ ZPL_DACL_ACES,
+ ZPL_END
+} zpl_attr_t;
+
+#define ZFS_OLD_ZNODE_PHYS_SIZE 0x108
+#define ZFS_SA_BASE_ATTR_SIZE (ZFS_OLD_ZNODE_PHYS_SIZE - \
+ sizeof (zfs_acl_phys_t))
+
+#define SA_MODE_OFFSET 0
+#define SA_SIZE_OFFSET 8
+#define SA_GEN_OFFSET 16
+#define SA_UID_OFFSET 24
+#define SA_GID_OFFSET 32
+#define SA_PARENT_OFFSET 40
+
+extern sa_attr_reg_t zfs_attr_table[ZPL_END + 1];
+extern sa_attr_reg_t zfs_legacy_attr_table[ZPL_END + 1];
+
+/*
+ * This is a deprecated data structure that only exists for
+ * dealing with file systems created prior to ZPL version 5.
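+ *
+ * Illustrative sketch (editorial, not from the original source): the
+ * on-disk size of this structure is pinned by ZFS_OLD_ZNODE_PHYS_SIZE
+ * above, so a hypothetical compile-time guard would read
+ *
+ *	CTASSERT(sizeof (znode_phys_t) == ZFS_OLD_ZNODE_PHYS_SIZE);
+ *
+ * matching the 264-byte (0x108) figure in the layout diagram below.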
+ */ +typedef struct znode_phys { + uint64_t zp_atime[2]; /* 0 - last file access time */ + uint64_t zp_mtime[2]; /* 16 - last file modification time */ + uint64_t zp_ctime[2]; /* 32 - last file change time */ + uint64_t zp_crtime[2]; /* 48 - creation time */ + uint64_t zp_gen; /* 64 - generation (txg of creation) */ + uint64_t zp_mode; /* 72 - file mode bits */ + uint64_t zp_size; /* 80 - size of file */ + uint64_t zp_parent; /* 88 - directory parent (`..') */ + uint64_t zp_links; /* 96 - number of links to file */ + uint64_t zp_xattr; /* 104 - DMU object for xattrs */ + uint64_t zp_rdev; /* 112 - dev_t for VBLK & VCHR files */ + uint64_t zp_flags; /* 120 - persistent flags */ + uint64_t zp_uid; /* 128 - file owner */ + uint64_t zp_gid; /* 136 - owning group */ + uint64_t zp_zap; /* 144 - extra attributes */ + uint64_t zp_pad[3]; /* 152 - future */ + zfs_acl_phys_t zp_acl; /* 176 - 263 ACL */ + /* + * Data may pad out any remaining bytes in the znode buffer, eg: + * + * |<---------------------- dnode_phys (512) ------------------------>| + * |<-- dnode (192) --->|<----------- "bonus" buffer (320) ---------->| + * |<---- znode (264) ---->|<---- data (56) ---->| + * + * At present, we use this space for the following: + * - symbolic links + * - 32-byte anti-virus scanstamp (regular files only) + */ +} znode_phys_t; + +#ifdef _KERNEL +int zfs_sa_readlink(struct znode *, uio_t *); +void zfs_sa_symlink(struct znode *, char *link, int len, dmu_tx_t *); +void zfs_sa_upgrade(struct sa_handle *, dmu_tx_t *); +void zfs_sa_get_scanstamp(struct znode *, xvattr_t *); +void zfs_sa_set_scanstamp(struct znode *, xvattr_t *, dmu_tx_t *); +void zfs_sa_uprade_pre(struct sa_handle *, void *, dmu_tx_t *); +void zfs_sa_upgrade_post(struct sa_handle *, void *, dmu_tx_t *); +void zfs_sa_upgrade_txholds(dmu_tx_t *, struct znode *); +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZFS_SA_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_stat.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_stat.h new file mode 100644 index 000000000000..a8af7ec61ba9 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_stat.h @@ -0,0 +1,55 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#ifndef _SYS_FS_ZFS_STAT_H +#define _SYS_FS_ZFS_STAT_H + +#ifdef _KERNEL +#include <sys/isa_defs.h> +#include <sys/dmu.h> +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * A limited number of zpl level stats are retrievable + * with an ioctl. zfs diff is the current consumer. 
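+ *
+ * Illustrative sketch (editorial, not from the original source): a
+ * consumer resolves an object number to a path and its stats in one
+ * call via the function declared below, e.g.
+ *
+ *	zfs_stat_t sb;
+ *	char path[MAXPATHLEN * 2];
+ *	error = zfs_obj_to_stats(osp, obj, &sb, path, sizeof (path));
+ *
+ * where "osp", "obj" and "error" are hypothetical locals.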
+ */ +typedef struct zfs_stat { + uint64_t zs_gen; + uint64_t zs_mode; + uint64_t zs_links; + uint64_t zs_ctime[2]; +} zfs_stat_t; + +extern int zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb, + char *buf, int len); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FS_ZFS_STAT_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h new file mode 100644 index 000000000000..bbfb79ce24d3 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h @@ -0,0 +1,175 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>. + * All rights reserved. + */ + +#ifndef _SYS_FS_ZFS_VFSOPS_H +#define _SYS_FS_ZFS_VFSOPS_H + +#include <sys/list.h> +#include <sys/vfs.h> +#include <sys/zil.h> +#include <sys/sa.h> +#include <sys/rrwlock.h> +#include <sys/zfs_ioctl.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct zfsvfs zfsvfs_t; +struct znode; + +struct zfsvfs { + vfs_t *z_vfs; /* generic fs struct */ + zfsvfs_t *z_parent; /* parent fs */ + objset_t *z_os; /* objset reference */ + uint64_t z_root; /* id of root znode */ + struct vnode *z_rootvnode; /* root vnode */ + struct rmlock z_rootvnodelock;/* protection for root vnode */ + uint64_t z_unlinkedobj; /* id of unlinked zapobj */ + uint64_t z_max_blksz; /* maximum block size for files */ + uint64_t z_fuid_obj; /* fuid table object number */ + uint64_t z_fuid_size; /* fuid table size */ + avl_tree_t z_fuid_idx; /* fuid tree keyed by index */ + avl_tree_t z_fuid_domain; /* fuid tree keyed by domain */ + krwlock_t z_fuid_lock; /* fuid lock */ + boolean_t z_fuid_loaded; /* fuid tables are loaded */ + boolean_t z_fuid_dirty; /* need to sync fuid table ? 
*/ + struct zfs_fuid_info *z_fuid_replay; /* fuid info for replay */ + zilog_t *z_log; /* intent log pointer */ + uint_t z_acl_mode; /* acl chmod/mode behavior */ + uint_t z_acl_inherit; /* acl inheritance behavior */ + zfs_case_t z_case; /* case-sense */ + boolean_t z_utf8; /* utf8-only */ + int z_norm; /* normalization flags */ + boolean_t z_atime; /* enable atimes mount option */ + boolean_t z_unmounted; /* unmounted */ + rrmlock_t z_teardown_lock; + krwlock_t z_teardown_inactive_lock; + list_t z_all_znodes; /* all vnodes in the fs */ + kmutex_t z_znodes_lock; /* lock for z_all_znodes */ + struct zfsctl_root *z_ctldir; /* .zfs directory pointer */ + boolean_t z_show_ctldir; /* expose .zfs in the root dir */ + boolean_t z_issnap; /* true if this is a snapshot */ + boolean_t z_vscan; /* virus scan on/off */ + boolean_t z_use_fuids; /* version allows fuids */ + boolean_t z_replay; /* set during ZIL replay */ + boolean_t z_use_sa; /* version allow system attributes */ + boolean_t z_use_namecache;/* make use of FreeBSD name cache */ + uint64_t z_version; /* ZPL version */ + uint64_t z_shares_dir; /* hidden shares dir */ + kmutex_t z_lock; + uint64_t z_userquota_obj; + uint64_t z_groupquota_obj; + uint64_t z_replay_eof; /* New end of file - replay only */ + sa_attr_type_t *z_attr_table; /* SA attr mapping->id */ +#define ZFS_OBJ_MTX_SZ 64 + kmutex_t z_hold_mtx[ZFS_OBJ_MTX_SZ]; /* znode hold locks */ +#if defined(__FreeBSD__) + struct task z_unlinked_drain_task; +#endif +}; + +/* + * Normal filesystems (those not under .zfs/snapshot) have a total + * file ID size limited to 12 bytes (including the length field) due to + * NFSv2 protocol's limitation of 32 bytes for a filehandle. For historical + * reasons, this same limit is being imposed by the Solaris NFSv3 implementation + * (although the NFSv3 protocol actually permits a maximum of 64 bytes). It + * is not possible to expand beyond 12 bytes without abandoning support + * of NFSv2. + * + * For normal filesystems, we partition up the available space as follows: + * 2 bytes fid length (required) + * 6 bytes object number (48 bits) + * 4 bytes generation number (32 bits) + * + * We reserve only 48 bits for the object number, as this is the limit + * currently defined and imposed by the DMU. + */ +typedef struct zfid_short { + uint16_t zf_len; + uint8_t zf_object[6]; /* obj[i] = obj >> (8 * i) */ + uint8_t zf_gen[4]; /* gen[i] = gen >> (8 * i) */ +} zfid_short_t; + +/* + * Filesystems under .zfs/snapshot have a total file ID size of 22[*] bytes + * (including the length field). This makes files under .zfs/snapshot + * accessible by NFSv3 and NFSv4, but not NFSv2. + * + * For files under .zfs/snapshot, we partition up the available space + * as follows: + * 2 bytes fid length (required) + * 6 bytes object number (48 bits) + * 4 bytes generation number (32 bits) + * 6 bytes objset id (48 bits) + * 4 bytes[**] currently just zero (32 bits) + * + * We reserve only 48 bits for the object number and objset id, as these are + * the limits currently defined and imposed by the DMU. + * + * [*] 20 bytes on FreeBSD to fit into the size of struct fid. + * [**] 2 bytes on FreeBSD for the above reason. 
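+ *
+ * Illustrative sketch (editorial, not from the original source): the
+ * "obj[i] = obj >> (8 * i)" convention noted in zfid_short_t above
+ * amounts to little-endian byte packing, e.g.
+ *
+ *	for (i = 0; i < sizeof (zfid->zf_object); i++)
+ *		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
+ *
+ * with "zfid" and "object" hypothetical locals.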
+ */ +typedef struct zfid_long { + zfid_short_t z_fid; + uint8_t zf_setid[6]; /* obj[i] = obj >> (8 * i) */ + uint8_t zf_setgen[2]; /* gen[i] = gen >> (8 * i) */ +} zfid_long_t; + +#define SHORT_FID_LEN (sizeof (zfid_short_t) - sizeof (uint16_t)) +#define LONG_FID_LEN (sizeof (zfid_long_t) - sizeof (uint16_t)) + +extern uint_t zfs_fsyncer_key; +extern int zfs_super_owner; + +extern int zfs_suspend_fs(zfsvfs_t *zfsvfs); +extern int zfs_resume_fs(zfsvfs_t *zfsvfs, struct dsl_dataset *ds); +extern int zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, + const char *domain, uint64_t rid, uint64_t *valuep); +extern int zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, + uint64_t *cookiep, void *vbuf, uint64_t *bufsizep); +extern int zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, + const char *domain, uint64_t rid, uint64_t quota); +extern boolean_t zfs_owner_overquota(zfsvfs_t *zfsvfs, struct znode *, + boolean_t isgroup); +extern boolean_t zfs_fuid_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, + uint64_t fuid); +extern int zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers); +extern int zfsvfs_create(const char *name, zfsvfs_t **zfvp); +extern int zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os); +extern void zfsvfs_free(zfsvfs_t *zfsvfs); +extern int zfs_check_global_label(const char *dsname, const char *hexsl); + +#ifdef _KERNEL +extern void zfsvfs_update_fromname(const char *oldname, const char *newname); +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FS_ZFS_VFSOPS_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h new file mode 100644 index 000000000000..d15f8945f12c --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h @@ -0,0 +1,372 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + * Copyright 2016 Nexenta Systems, Inc. All rights reserved. 
+ */ + +#ifndef _SYS_FS_ZFS_ZNODE_H +#define _SYS_FS_ZFS_ZNODE_H + +#ifdef _KERNEL +#include <sys/list.h> +#include <sys/dmu.h> +#include <sys/sa.h> +#include <sys/zfs_vfsops.h> +#include <sys/rrwlock.h> +#include <sys/zfs_sa.h> +#include <sys/zfs_stat.h> +#endif +#include <sys/zfs_acl.h> +#include <sys/zil.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Additional file level attributes, that are stored + * in the upper half of zp_flags + */ +#define ZFS_READONLY 0x0000000100000000 +#define ZFS_HIDDEN 0x0000000200000000 +#define ZFS_SYSTEM 0x0000000400000000 +#define ZFS_ARCHIVE 0x0000000800000000 +#define ZFS_IMMUTABLE 0x0000001000000000 +#define ZFS_NOUNLINK 0x0000002000000000 +#define ZFS_APPENDONLY 0x0000004000000000 +#define ZFS_NODUMP 0x0000008000000000 +#define ZFS_OPAQUE 0x0000010000000000 +#define ZFS_AV_QUARANTINED 0x0000020000000000 +#define ZFS_AV_MODIFIED 0x0000040000000000 +#define ZFS_REPARSE 0x0000080000000000 +#define ZFS_OFFLINE 0x0000100000000000 +#define ZFS_SPARSE 0x0000200000000000 + +#define ZFS_ATTR_SET(zp, attr, value, pflags, tx) \ +{ \ + if (value) \ + pflags |= attr; \ + else \ + pflags &= ~attr; \ + VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_FLAGS(zp->z_zfsvfs), \ + &pflags, sizeof (pflags), tx)); \ +} + +/* + * Define special zfs pflags + */ +#define ZFS_XATTR 0x1 /* is an extended attribute */ +#define ZFS_INHERIT_ACE 0x2 /* ace has inheritable ACEs */ +#define ZFS_ACL_TRIVIAL 0x4 /* files ACL is trivial */ +#define ZFS_ACL_OBJ_ACE 0x8 /* ACL has CMPLX Object ACE */ +#define ZFS_ACL_PROTECTED 0x10 /* ACL protected */ +#define ZFS_ACL_DEFAULTED 0x20 /* ACL should be defaulted */ +#define ZFS_ACL_AUTO_INHERIT 0x40 /* ACL should be inherited */ +#define ZFS_BONUS_SCANSTAMP 0x80 /* Scanstamp in bonus area */ +#define ZFS_NO_EXECS_DENIED 0x100 /* exec was given to everyone */ + +#define SA_ZPL_ATIME(z) z->z_attr_table[ZPL_ATIME] +#define SA_ZPL_MTIME(z) z->z_attr_table[ZPL_MTIME] +#define SA_ZPL_CTIME(z) z->z_attr_table[ZPL_CTIME] +#define SA_ZPL_CRTIME(z) z->z_attr_table[ZPL_CRTIME] +#define SA_ZPL_GEN(z) z->z_attr_table[ZPL_GEN] +#define SA_ZPL_DACL_ACES(z) z->z_attr_table[ZPL_DACL_ACES] +#define SA_ZPL_XATTR(z) z->z_attr_table[ZPL_XATTR] +#define SA_ZPL_SYMLINK(z) z->z_attr_table[ZPL_SYMLINK] +#define SA_ZPL_RDEV(z) z->z_attr_table[ZPL_RDEV] +#define SA_ZPL_SCANSTAMP(z) z->z_attr_table[ZPL_SCANSTAMP] +#define SA_ZPL_UID(z) z->z_attr_table[ZPL_UID] +#define SA_ZPL_GID(z) z->z_attr_table[ZPL_GID] +#define SA_ZPL_PARENT(z) z->z_attr_table[ZPL_PARENT] +#define SA_ZPL_LINKS(z) z->z_attr_table[ZPL_LINKS] +#define SA_ZPL_MODE(z) z->z_attr_table[ZPL_MODE] +#define SA_ZPL_DACL_COUNT(z) z->z_attr_table[ZPL_DACL_COUNT] +#define SA_ZPL_FLAGS(z) z->z_attr_table[ZPL_FLAGS] +#define SA_ZPL_SIZE(z) z->z_attr_table[ZPL_SIZE] +#define SA_ZPL_ZNODE_ACL(z) z->z_attr_table[ZPL_ZNODE_ACL] +#define SA_ZPL_PAD(z) z->z_attr_table[ZPL_PAD] + +/* + * Is ID ephemeral? + */ +#define IS_EPHEMERAL(x) (x > MAXUID) + +/* + * Should we use FUIDs? + */ +#define USE_FUIDS(version, os) (version >= ZPL_VERSION_FUID && \ + spa_version(dmu_objset_spa(os)) >= SPA_VERSION_FUID) +#define USE_SA(version, os) (version >= ZPL_VERSION_SA && \ + spa_version(dmu_objset_spa(os)) >= SPA_VERSION_SA) + +#define MASTER_NODE_OBJ 1 + +/* + * Special attributes for master node. + * "userquota@" and "groupquota@" are also valid (from + * zfs_userquota_prop_prefixes[]). 
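+ *
+ * Illustrative sketch (editorial, not from the original source): these
+ * names key ZAP entries under MASTER_NODE_OBJ and are looked up with
+ * the generic ZAP interface, e.g.
+ *
+ *	uint64_t root_obj;
+ *	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ,
+ *	    8, 1, &root_obj);
+ *
+ * where "os" and "error" are hypothetical locals.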
+ */ +#define ZFS_FSID "FSID" +#define ZFS_UNLINKED_SET "DELETE_QUEUE" +#define ZFS_ROOT_OBJ "ROOT" +#define ZPL_VERSION_STR "VERSION" +#define ZFS_FUID_TABLES "FUID" +#define ZFS_SHARES_DIR "SHARES" +#define ZFS_SA_ATTRS "SA_ATTRS" + +/* + * Convert mode bits (zp_mode) to BSD-style DT_* values for storing in + * the directory entries. + */ +#ifndef IFTODT +#define IFTODT(mode) (((mode) & S_IFMT) >> 12) +#endif + +/* + * The directory entry has the type (currently unused on Solaris) in the + * top 4 bits, and the object number in the low 48 bits. The "middle" + * 12 bits are unused. + */ +#define ZFS_DIRENT_TYPE(de) BF64_GET(de, 60, 4) +#define ZFS_DIRENT_OBJ(de) BF64_GET(de, 0, 48) + +/* + * Directory entry locks control access to directory entries. + * They are used to protect creates, deletes, and renames. + * Each directory znode has a mutex and a list of locked names. + */ +#ifdef _KERNEL +typedef struct zfs_dirlock { + char *dl_name; /* directory entry being locked */ + uint32_t dl_sharecnt; /* 0 if exclusive, > 0 if shared */ + uint8_t dl_namelock; /* 1 if z_name_lock is NOT held */ + uint16_t dl_namesize; /* set if dl_name was allocated */ + kcondvar_t dl_cv; /* wait for entry to be unlocked */ + struct znode *dl_dzp; /* directory znode */ + struct zfs_dirlock *dl_next; /* next in z_dirlocks list */ +} zfs_dirlock_t; + +typedef struct znode { + struct zfsvfs *z_zfsvfs; + vnode_t *z_vnode; + uint64_t z_id; /* object ID for this znode */ +#ifdef illumos + kmutex_t z_lock; /* znode modification lock */ + krwlock_t z_parent_lock; /* parent lock for directories */ + krwlock_t z_name_lock; /* "master" lock for dirent locks */ + zfs_dirlock_t *z_dirlocks; /* directory entry lock list */ +#endif + kmutex_t z_range_lock; /* protects changes to z_range_avl */ + avl_tree_t z_range_avl; /* avl tree of file range locks */ + uint8_t z_unlinked; /* file has been unlinked */ + uint8_t z_atime_dirty; /* atime needs to be synced */ + uint8_t z_zn_prefetch; /* Prefetch znodes? */ + uint8_t z_moved; /* Has this znode been moved? */ + uint_t z_blksz; /* block size in bytes */ + uint_t z_seq; /* modification sequence number */ + uint64_t z_mapcnt; /* number of pages mapped to file */ + uint64_t z_dnodesize; /* dnode size */ + uint64_t z_gen; /* generation (cached) */ + uint64_t z_size; /* file size (cached) */ + uint64_t z_atime[2]; /* atime (cached) */ + uint64_t z_links; /* file links (cached) */ + uint64_t z_pflags; /* pflags (cached) */ + uint64_t z_uid; /* uid fuid (cached) */ + uint64_t z_gid; /* gid fuid (cached) */ + mode_t z_mode; /* mode (cached) */ + uint32_t z_sync_cnt; /* synchronous open count */ + kmutex_t z_acl_lock; /* acl data lock */ + zfs_acl_t *z_acl_cached; /* cached acl */ + list_node_t z_link_node; /* all znodes in fs link */ + sa_handle_t *z_sa_hdl; /* handle to sa data */ + boolean_t z_is_sa; /* are we native sa? */ +} znode_t; + +#define ZFS_LINK_MAX UINT64_MAX + +/* + * Range locking rules + * -------------------- + * 1. When truncating a file (zfs_create, zfs_setattr, zfs_space) the whole + * file range needs to be locked as RL_WRITER. Only then can the pages be + * freed etc and zp_size reset. zp_size must be set within range lock. + * 2. For writes and punching holes (zfs_write & zfs_space) just the range + * being written or freed needs to be locked as RL_WRITER. + * Multiple writes at the end of the file must coordinate zp_size updates + * to ensure data isn't lost. A compare and swap loop is currently used + * to ensure the file size is at least the offset last written. 
+ * 3. For reads (zfs_read, zfs_get_data & zfs_putapage) just the range being + * read needs to be locked as RL_READER. A check against zp_size can then + * be made for reading beyond end of file. + */ + +/* + * Convert between znode pointers and vnode pointers + */ +#ifdef DEBUG +static __inline vnode_t * +ZTOV(znode_t *zp) +{ + vnode_t *vp = zp->z_vnode; + + ASSERT(vp != NULL && vp->v_data == zp); + return (vp); +} +static __inline znode_t * +VTOZ(vnode_t *vp) +{ + znode_t *zp = (znode_t *)vp->v_data; + + ASSERT(zp != NULL && zp->z_vnode == vp); + return (zp); +} +#else +#define ZTOV(ZP) ((ZP)->z_vnode) +#define VTOZ(VP) ((znode_t *)(VP)->v_data) +#endif + +/* Called on entry to each ZFS vnode and vfs operation */ +#define ZFS_ENTER(zfsvfs) \ + { \ + rrm_enter_read(&(zfsvfs)->z_teardown_lock, FTAG); \ + if ((zfsvfs)->z_unmounted) { \ + ZFS_EXIT(zfsvfs); \ + return (EIO); \ + } \ + } + +/* Must be called before exiting the vop */ +#define ZFS_EXIT(zfsvfs) rrm_exit(&(zfsvfs)->z_teardown_lock, FTAG) + +/* Verifies the znode is valid */ +#define ZFS_VERIFY_ZP(zp) \ + if ((zp)->z_sa_hdl == NULL) { \ + ZFS_EXIT((zp)->z_zfsvfs); \ + return (EIO); \ + } \ + +/* + * Macros for dealing with dmu_buf_hold + */ +#define ZFS_OBJ_HASH(obj_num) ((obj_num) & (ZFS_OBJ_MTX_SZ - 1)) +#define ZFS_OBJ_MUTEX(zfsvfs, obj_num) \ + (&(zfsvfs)->z_hold_mtx[ZFS_OBJ_HASH(obj_num)]) +#define ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num) \ + mutex_enter(ZFS_OBJ_MUTEX((zfsvfs), (obj_num))) +#define ZFS_OBJ_HOLD_TRYENTER(zfsvfs, obj_num) \ + mutex_tryenter(ZFS_OBJ_MUTEX((zfsvfs), (obj_num))) +#define ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num) \ + mutex_exit(ZFS_OBJ_MUTEX((zfsvfs), (obj_num))) + +/* Encode ZFS stored time values from a struct timespec */ +#define ZFS_TIME_ENCODE(tp, stmp) \ +{ \ + (stmp)[0] = (uint64_t)(tp)->tv_sec; \ + (stmp)[1] = (uint64_t)(tp)->tv_nsec; \ +} + +/* Decode ZFS stored time values to a struct timespec */ +#define ZFS_TIME_DECODE(tp, stmp) \ +{ \ + (tp)->tv_sec = (time_t)(stmp)[0]; \ + (tp)->tv_nsec = (long)(stmp)[1]; \ +} + +/* + * Timestamp defines + */ +#define ACCESSED (AT_ATIME) +#define STATE_CHANGED (AT_CTIME) +#define CONTENT_MODIFIED (AT_MTIME | AT_CTIME) + +#define ZFS_ACCESSTIME_STAMP(zfsvfs, zp) \ + if ((zfsvfs)->z_atime && !((zfsvfs)->z_vfs->vfs_flag & VFS_RDONLY)) \ + zfs_tstamp_update_setup(zp, ACCESSED, NULL, NULL, B_FALSE); + +extern int zfs_init_fs(zfsvfs_t *, znode_t **); +extern void zfs_set_dataprop(objset_t *); +extern void zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *, + dmu_tx_t *tx); +extern void zfs_tstamp_update_setup(znode_t *, uint_t, uint64_t [2], + uint64_t [2], boolean_t); +extern void zfs_grow_blocksize(znode_t *, uint64_t, dmu_tx_t *); +extern int zfs_freesp(znode_t *, uint64_t, uint64_t, int, boolean_t); +extern void zfs_znode_init(void); +extern void zfs_znode_fini(void); +extern int zfs_zget(zfsvfs_t *, uint64_t, znode_t **); +extern int zfs_rezget(znode_t *); +extern void zfs_zinactive(znode_t *); +extern void zfs_znode_delete(znode_t *, dmu_tx_t *); +extern void zfs_znode_free(znode_t *); +extern void zfs_remove_op_tables(); +extern int zfs_create_op_tables(); +extern dev_t zfs_cmpldev(uint64_t); +extern int zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value); +extern int zfs_get_stats(objset_t *os, nvlist_t *nv); +extern boolean_t zfs_get_vfs_flag_unmounted(objset_t *os); +extern void zfs_znode_dmu_fini(znode_t *); + +extern void zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, + znode_t *dzp, znode_t *zp, char *name, vsecattr_t *, 
zfs_fuid_info_t *, + vattr_t *vap); +extern int zfs_log_create_txtype(zil_create_t, vsecattr_t *vsecp, + vattr_t *vap); +extern void zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, + znode_t *dzp, char *name, uint64_t foid); +#define ZFS_NO_OBJECT 0 /* no object id */ +extern void zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, + znode_t *dzp, znode_t *zp, char *name); +extern void zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, + znode_t *dzp, znode_t *zp, char *name, char *link); +extern void zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, + znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp); +extern void zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, + znode_t *zp, offset_t off, ssize_t len, int ioflag); +extern void zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype, + znode_t *zp, uint64_t off, uint64_t len); +extern void zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype, + znode_t *zp, vattr_t *vap, uint_t mask_applied, zfs_fuid_info_t *fuidp); +#ifndef ZFS_NO_ACL +extern void zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp, + vsecattr_t *vsecp, zfs_fuid_info_t *fuidp); +#endif +extern void zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx); +extern void zfs_upgrade(zfsvfs_t *zfsvfs, dmu_tx_t *tx); +extern int zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx); + +extern zil_get_data_t zfs_get_data; +extern zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE]; +extern int zfsfstype; + +extern int zfs_znode_parent_and_name(znode_t *zp, znode_t **dzpp, char *buf); + +#endif /* _KERNEL */ + +extern int zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FS_ZFS_ZNODE_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h new file mode 100644 index 000000000000..3f0b771df48f --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h @@ -0,0 +1,461 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + */ + +/* Portions Copyright 2010 Robert Milkowski */ + +#ifndef _SYS_ZIL_H +#define _SYS_ZIL_H + +#include <sys/types.h> +#include <sys/spa.h> +#include <sys/zio.h> +#include <sys/dmu.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct dsl_pool; +struct dsl_dataset; +struct lwb; + +/* + * Intent log format: + * + * Each objset has its own intent log. 
The log header (zil_header_t)
+ * for objset N's intent log is kept in the Nth object of the SPA's
+ * intent_log objset. The log header points to a chain of log blocks,
+ * each of which contains log records (i.e., transactions) followed by
+ * a log block trailer (zil_trailer_t). The format of a log record
+ * depends on the record (or transaction) type, but all records begin
+ * with a common structure that defines the type, length, and txg.
+ */
+
+/*
+ * Intent log header - this on disk structure holds fields to manage
+ * the log. All fields are 64 bit to easily handle cross architectures.
+ */
+typedef struct zil_header {
+ uint64_t zh_claim_txg; /* txg in which log blocks were claimed */
+ uint64_t zh_replay_seq; /* highest replayed sequence number */
+ blkptr_t zh_log; /* log chain */
+ uint64_t zh_claim_blk_seq; /* highest claimed block sequence number */
+ uint64_t zh_flags; /* header flags */
+ uint64_t zh_claim_lr_seq; /* highest claimed lr sequence number */
+ uint64_t zh_pad[3];
+} zil_header_t;
+
+/*
+ * zh_flags bit settings
+ */
+#define ZIL_REPLAY_NEEDED 0x1 /* replay needed - internal only */
+#define ZIL_CLAIM_LR_SEQ_VALID 0x2 /* zh_claim_lr_seq field is valid */
+
+/*
+ * Log block chaining.
+ *
+ * Log blocks are chained together. Originally they were chained at the
+ * end of the block. For performance reasons the chain was moved to the
+ * beginning of the block, which allows writes for only the data being used.
+ * The older position is supported for backwards compatibility.
+ *
+ * The zio_eck_t contains a zec_cksum which for the intent log is
+ * the sequence number of this log block. A seq of 0 is invalid.
+ * The zec_cksum is checked by the SPA against the sequence
+ * number passed in the blk_cksum field of the blkptr_t.
+ */
+typedef struct zil_chain {
+ uint64_t zc_pad;
+ blkptr_t zc_next_blk; /* next block in chain */
+ uint64_t zc_nused; /* bytes in log block used */
+ zio_eck_t zc_eck; /* block trailer */
+} zil_chain_t;
+
+#define ZIL_MIN_BLKSZ 4096ULL
+
+/*
+ * ziltest is by and large an ugly hack, but very useful in
+ * checking replay without tedious work.
+ * When running ziltest we want to keep all itx's and so maintain
+ * a single list in the zl_itxg[] that uses a high txg: ZILTEST_TXG.
+ * We subtract TXG_CONCURRENT_STATES to allow for common code.
+ */
+#define ZILTEST_TXG (UINT64_MAX - TXG_CONCURRENT_STATES)
+
+/*
+ * The words of a log block checksum.
+ */
+#define ZIL_ZC_GUID_0 0
+#define ZIL_ZC_GUID_1 1
+#define ZIL_ZC_OBJSET 2
+#define ZIL_ZC_SEQ 3
+
+typedef enum zil_create {
+ Z_FILE,
+ Z_DIR,
+ Z_XATTRDIR,
+} zil_create_t;
+
+/*
+ * Size of the xvattr log section.
+ * It is composed of lr_attr_t + xvattr bitmap + 2 64 bit timestamps
+ * for create time and a single 64 bit integer for all of the attributes,
+ * and 4 64 bit integers (32 bytes) for the scanstamp.
+ *
+ */
+
+#define ZIL_XVAT_SIZE(mapsize) \
+ sizeof (lr_attr_t) + (sizeof (uint32_t) * (mapsize - 1)) + \
+ (sizeof (uint64_t) * 7)
+
+/*
+ * Size of ACL in log. The ACE data is padded out to properly align
+ * on an 8 byte boundary.
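+ *
+ * Illustrative arithmetic (editorial, not from the original source):
+ * a 100-byte ACE payload is logged as
+ *
+ *	ZIL_ACE_LENGTH(100) == roundup(100, 8) == 104
+ *
+ * so replay can walk the record on uint64_t boundaries.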
+ */
+
+#define ZIL_ACE_LENGTH(x) (roundup(x, sizeof (uint64_t)))
+
+/*
+ * Intent log transaction types and record structures
+ */
+#define TX_COMMIT 0 /* Commit marker (no on-disk state) */
+#define TX_CREATE 1 /* Create file */
+#define TX_MKDIR 2 /* Make directory */
+#define TX_MKXATTR 3 /* Make XATTR directory */
+#define TX_SYMLINK 4 /* Create symbolic link to a file */
+#define TX_REMOVE 5 /* Remove file */
+#define TX_RMDIR 6 /* Remove directory */
+#define TX_LINK 7 /* Create hard link to a file */
+#define TX_RENAME 8 /* Rename a file */
+#define TX_WRITE 9 /* File write */
+#define TX_TRUNCATE 10 /* Truncate a file */
+#define TX_SETATTR 11 /* Set file attributes */
+#define TX_ACL_V0 12 /* Set old formatted ACL */
+#define TX_ACL 13 /* Set ACL */
+#define TX_CREATE_ACL 14 /* create with ACL */
+#define TX_CREATE_ATTR 15 /* create + attrs */
+#define TX_CREATE_ACL_ATTR 16 /* create with ACL + attrs */
+#define TX_MKDIR_ACL 17 /* mkdir with ACL */
+#define TX_MKDIR_ATTR 18 /* mkdir with attr */
+#define TX_MKDIR_ACL_ATTR 19 /* mkdir with ACL + attrs */
+#define TX_WRITE2 20 /* dmu_sync EALREADY write */
+#define TX_MAX_TYPE 21 /* Max transaction type */
+
+/*
+ * The transactions for mkdir, symlink, remove, rmdir, link, and rename
+ * may have the following bit set, indicating the original request
+ * specified case-insensitive handling of names.
+ */
+#define TX_CI ((uint64_t)0x1 << 63) /* case-insensitive behavior requested */
+
+/*
+ * Transactions for write, truncate, setattr, acl_v0, and acl can be logged
+ * out of order. For convenience in the code, all such records must have
+ * lr_foid at the same offset.
+ */
+#define TX_OOO(txtype) \
+ ((txtype) == TX_WRITE || \
+ (txtype) == TX_TRUNCATE || \
+ (txtype) == TX_SETATTR || \
+ (txtype) == TX_ACL_V0 || \
+ (txtype) == TX_ACL || \
+ (txtype) == TX_WRITE2)
+
+/*
+ * The number of dnode slots consumed by the object is stored in the 8
+ * unused upper bits of the object ID. We subtract 1 from the value
+ * stored on disk for compatibility with implementations that don't
+ * support large dnodes. The slot count for a single-slot dnode will
+ * contain 0 for those bits to preserve the log record format for
+ * "small" dnodes.
+ */
+#define LR_FOID_GET_SLOTS(oid) (BF64_GET((oid), 56, 8) + 1)
+#define LR_FOID_SET_SLOTS(oid, x) BF64_SET((oid), 56, 8, (x) - 1)
+#define LR_FOID_GET_OBJ(oid) BF64_GET((oid), 0, DN_MAX_OBJECT_SHIFT)
+#define LR_FOID_SET_OBJ(oid, x) BF64_SET((oid), 0, DN_MAX_OBJECT_SHIFT, (x))
+
+/*
+ * Format of log records.
+ * The fields are carefully defined to allow them to be aligned
+ * and sized the same on sparc & intel architectures.
+ * Each log record has a common structure at the beginning.
+ *
+ * The log record on disk (lrc_seq) holds the sequence number of all log
+ * records which is used to ensure we don't replay the same record.
+ */
+typedef struct { /* common log record header */
+ uint64_t lrc_txtype; /* intent log transaction type */
+ uint64_t lrc_reclen; /* transaction record length */
+ uint64_t lrc_txg; /* dmu transaction group number */
+ uint64_t lrc_seq; /* see comment above */
+} lr_t;
+
+/*
+ * Common start of all out-of-order record types (TX_OOO() above).
+ */
+typedef struct {
+ lr_t lr_common; /* common portion of log record */
+ uint64_t lr_foid; /* object id */
+} lr_ooo_t;
+
+/*
+ * Handle optional extended vattr attributes.
+ * + * Whenever new attributes are added the version number + * will need to be updated as will code in + * zfs_log.c and zfs_replay.c + */ +typedef struct { + uint32_t lr_attr_masksize; /* number of elements in array */ + uint32_t lr_attr_bitmap; /* First entry of array */ + /* remainder of array and any additional fields */ +} lr_attr_t; + +/* + * log record for creates without optional ACL. + * This log record does support optional xvattr_t attributes. + */ +typedef struct { + lr_t lr_common; /* common portion of log record */ + uint64_t lr_doid; /* object id of directory */ + uint64_t lr_foid; /* object id of created file object */ + uint64_t lr_mode; /* mode of object */ + uint64_t lr_uid; /* uid of object */ + uint64_t lr_gid; /* gid of object */ + uint64_t lr_gen; /* generation (txg of creation) */ + uint64_t lr_crtime[2]; /* creation time */ + uint64_t lr_rdev; /* rdev of object to create */ + /* name of object to create follows this */ + /* for symlinks, link content follows name */ + /* for creates with xvattr data, the name follows the xvattr info */ +} lr_create_t; + +/* + * FUID ACL record will be an array of ACEs from the original ACL. + * If this array includes ephemeral IDs, the record will also include + * an array of log-specific FUIDs to replace the ephemeral IDs. + * Only one copy of each unique domain will be present, so the log-specific + * FUIDs will use an index into a compressed domain table. On replay this + * information will be used to construct real FUIDs (and bypass idmap, + * since it may not be available). + */ + +/* + * Log record for creates with optional ACL + * This log record is also used for recording any FUID + * information needed for replaying the create. If the + * file doesn't have any actual ACEs then the lr_aclcnt + * would be zero. + * + * After lr_acl_flags, there are a lr_acl_bytes number of variable sized ace's. + * If create is also setting xvattr's, then acl data follows xvattr. + * If ACE FUIDs are needed then they will follow the xvattr_t. Following + * the FUIDs will be the domain table information. The FUIDs for the owner + * and group will be in lr_create. Name follows ACL data. 
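
Editorial aside (not part of the imported file): for a plain TX_CREATE record with no xvattr or ACL payload, the comments above state that the name is laid out immediately after the fixed-size lr_create_t, so replay code can reach it with pointer arithmetic. A minimal sketch; the helper name is hypothetical.

static const char *
lr_create_name(const lr_create_t *lrc)
{
	/* the name of the object to create follows the fixed portion */
	return ((const char *)(lrc + 1));
}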
+ */ +typedef struct { + lr_create_t lr_create; /* common create portion */ + uint64_t lr_aclcnt; /* number of ACEs in ACL */ + uint64_t lr_domcnt; /* number of unique domains */ + uint64_t lr_fuidcnt; /* number of real fuids */ + uint64_t lr_acl_bytes; /* number of bytes in ACL */ + uint64_t lr_acl_flags; /* ACL flags */ +} lr_acl_create_t; + +typedef struct { + lr_t lr_common; /* common portion of log record */ + uint64_t lr_doid; /* obj id of directory */ + /* name of object to remove follows this */ +} lr_remove_t; + +typedef struct { + lr_t lr_common; /* common portion of log record */ + uint64_t lr_doid; /* obj id of directory */ + uint64_t lr_link_obj; /* obj id of link */ + /* name of object to link follows this */ +} lr_link_t; + +typedef struct { + lr_t lr_common; /* common portion of log record */ + uint64_t lr_sdoid; /* obj id of source directory */ + uint64_t lr_tdoid; /* obj id of target directory */ + /* 2 strings: names of source and destination follow this */ +} lr_rename_t; + +typedef struct { + lr_t lr_common; /* common portion of log record */ + uint64_t lr_foid; /* file object to write */ + uint64_t lr_offset; /* offset to write to */ + uint64_t lr_length; /* user data length to write */ + uint64_t lr_blkoff; /* no longer used */ + blkptr_t lr_blkptr; /* spa block pointer for replay */ + /* write data will follow for small writes */ +} lr_write_t; + +typedef struct { + lr_t lr_common; /* common portion of log record */ + uint64_t lr_foid; /* object id of file to truncate */ + uint64_t lr_offset; /* offset to truncate from */ + uint64_t lr_length; /* length to truncate */ +} lr_truncate_t; + +typedef struct { + lr_t lr_common; /* common portion of log record */ + uint64_t lr_foid; /* file object to change attributes */ + uint64_t lr_mask; /* mask of attributes to set */ + uint64_t lr_mode; /* mode to set */ + uint64_t lr_uid; /* uid to set */ + uint64_t lr_gid; /* gid to set */ + uint64_t lr_size; /* size to set */ + uint64_t lr_atime[2]; /* access time */ + uint64_t lr_mtime[2]; /* modification time */ + /* optional attribute lr_attr_t may be here */ +} lr_setattr_t; + +typedef struct { + lr_t lr_common; /* common portion of log record */ + uint64_t lr_foid; /* obj id of file */ + uint64_t lr_aclcnt; /* number of acl entries */ + /* lr_aclcnt number of ace_t entries follow this */ +} lr_acl_v0_t; + +typedef struct { + lr_t lr_common; /* common portion of log record */ + uint64_t lr_foid; /* obj id of file */ + uint64_t lr_aclcnt; /* number of ACEs in ACL */ + uint64_t lr_domcnt; /* number of unique domains */ + uint64_t lr_fuidcnt; /* number of real fuids */ + uint64_t lr_acl_bytes; /* number of bytes in ACL */ + uint64_t lr_acl_flags; /* ACL flags */ + /* lr_acl_bytes number of variable sized ace's follows */ +} lr_acl_t; + +/* + * ZIL structure definitions, interface function prototype and globals. + */ + +/* + * Writes are handled in three different ways: + * + * WR_INDIRECT: + * In this mode, if we need to commit the write later, then the block + * is immediately written into the file system (using dmu_sync), + * and a pointer to the block is put into the log record. + * When the txg commits the block is linked in. + * This saves additionally writing the data into the log record. 
+ * There are a few requirements for this to occur:
+ * - write is greater than zfs/zvol_immediate_write_sz
+ * - not using slogs (as slogs are assumed to always be faster
+ * than writing into the main pool)
+ * - the write occupies only one block
+ * WR_COPIED:
+ * If we know we'll immediately be committing the
+ * transaction (FSYNC or FDSYNC), then we allocate a larger
+ * log record here for the data and copy the data in.
+ * WR_NEED_COPY:
+ * Otherwise we don't allocate a buffer, and *if* we need to
+ * flush the write later then a buffer is allocated and
+ * we retrieve the data using the dmu.
+ */
+typedef enum {
+ WR_INDIRECT, /* indirect - a large write (dmu_sync() data */
+ /* and put blkptr in log, rather than actual data) */
+ WR_COPIED, /* immediate - data is copied into lr_write_t */
+ WR_NEED_COPY, /* immediate - data needs to be copied if pushed */
+ WR_NUM_STATES /* number of states */
+} itx_wr_state_t;
+
+typedef struct itx {
+ list_node_t itx_node; /* linkage on zl_itx_list */
+ void *itx_private; /* type-specific opaque data */
+ itx_wr_state_t itx_wr_state; /* write state */
+ uint8_t itx_sync; /* synchronous transaction */
+ uint64_t itx_oid; /* object id */
+ lr_t itx_lr; /* common part of log record */
+ /* followed by type-specific part of lr_xx_t and its immediate data */
+} itx_t;
+
+typedef int zil_parse_blk_func_t(zilog_t *zilog, blkptr_t *bp, void *arg,
+ uint64_t txg);
+typedef int zil_parse_lr_func_t(zilog_t *zilog, lr_t *lr, void *arg,
+ uint64_t txg);
+typedef int zil_replay_func_t(void *arg1, void *arg2, boolean_t byteswap);
+typedef int zil_get_data_t(void *arg, lr_write_t *lr, char *dbuf,
+ struct lwb *lwb, zio_t *zio);
+
+extern int zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
+ zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg);
+
+extern void zil_init(void);
+extern void zil_fini(void);
+
+extern zilog_t *zil_alloc(objset_t *os, zil_header_t *zh_phys);
+extern void zil_free(zilog_t *zilog);
+
+extern zilog_t *zil_open(objset_t *os, zil_get_data_t *get_data);
+extern void zil_close(zilog_t *zilog);
+
+extern void zil_replay(objset_t *os, void *arg,
+ zil_replay_func_t *replay_func[TX_MAX_TYPE]);
+extern boolean_t zil_replaying(zilog_t *zilog, dmu_tx_t *tx);
+extern void zil_destroy(zilog_t *zilog, boolean_t keep_first);
+extern void zil_destroy_sync(zilog_t *zilog, dmu_tx_t *tx);
+extern void zil_rollback_destroy(zilog_t *zilog, dmu_tx_t *tx);
+
+extern itx_t *zil_itx_create(uint64_t txtype, size_t lrsize);
+extern void zil_itx_destroy(itx_t *itx);
+extern void zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx);
+
+extern void zil_async_to_sync(zilog_t *zilog, uint64_t oid);
+extern void zil_commit(zilog_t *zilog, uint64_t oid);
+extern void zil_commit_impl(zilog_t *zilog, uint64_t oid);
+
+extern int zil_reset(const char *osname, void *txarg);
+extern int zil_claim(struct dsl_pool *dp,
+ struct dsl_dataset *ds, void *txarg);
+extern int zil_check_log_chain(struct dsl_pool *dp,
+ struct dsl_dataset *ds, void *tx);
+extern void zil_sync(zilog_t *zilog, dmu_tx_t *tx);
+extern void zil_clean(zilog_t *zilog, uint64_t synced_txg);
+
+extern int zil_suspend(const char *osname, void **cookiep);
+extern void zil_resume(void *cookie);
+
+extern void zil_lwb_add_block(struct lwb *lwb, const blkptr_t *bp);
+extern void zil_lwb_add_txg(struct lwb *lwb, uint64_t txg);
+extern int zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp);
+
+extern void zil_set_sync(zilog_t *zilog, uint64_t syncval);
+
+extern void zil_set_logbias(zilog_t *zilog, uint64_t slogval);
+
+extern int zil_replay_disable;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZIL_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h
new file mode 100644
index 000000000000..1825ced56663
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h
@@ -0,0 +1,242 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ */
+
+/* Portions Copyright 2010 Robert Milkowski */
+
+#ifndef _SYS_ZIL_IMPL_H
+#define _SYS_ZIL_IMPL_H
+
+#include <sys/zil.h>
+#include <sys/dmu_objset.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Possible states for a given lwb structure.
+ *
+ * An lwb will start out in the "closed" state, and then transition to
+ * the "opened" state via a call to zil_lwb_write_open(). When
+ * transitioning from "closed" to "opened" the zilog's "zl_issuer_lock"
+ * must be held.
+ *
+ * After the lwb is "opened", it can transition into the "issued" state
+ * via zil_lwb_write_issue(). Again, the zilog's "zl_issuer_lock" must
+ * be held when making this transition.
+ *
+ * After the lwb's zio completes, and the vdev's are flushed, the lwb
+ * will transition into the "done" state via zil_lwb_write_done(). When
+ * transitioning from "issued" to "done", the zilog's "zl_lock" must be
+ * held, *not* the "zl_issuer_lock".
+ *
+ * The zilog's "zl_issuer_lock" can become heavily contended in certain
+ * workloads, so we specifically avoid acquiring that lock when
+ * transitioning an lwb from "issued" to "done". This allows us to avoid
+ * having to acquire the "zl_issuer_lock" for each lwb ZIO completion,
+ * which would have added more lock contention on an already heavily
+ * contended lock.
+ *
+ * Additionally, correctness when reading an lwb's state is often
+ * achieved by exploiting the fact that these state transitions occur in
+ * this specific order; i.e. "closed" to "opened" to "issued" to "done".
+ *
+ * Thus, if an lwb is in the "closed" or "opened" state, holding the
+ * "zl_issuer_lock" will prevent a concurrent thread from transitioning
+ * that lwb to the "issued" state. Likewise, if an lwb is already in the
+ * "issued" state, holding the "zl_lock" will prevent a concurrent
+ * thread from transitioning that lwb to the "done" state.
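
Editorial aside (not part of the imported file): the comment above fixes the only legal order as closed -> opened -> issued -> done. Using the lwb_state_t enum declared just below, a debug build could encode that order in an assertion helper like this (name and use are hypothetical).

static boolean_t
lwb_state_transition_ok(lwb_state_t cur, lwb_state_t next)
{
	/* only the three forward transitions described above are legal */
	return ((cur == LWB_STATE_CLOSED && next == LWB_STATE_OPENED) ||
	    (cur == LWB_STATE_OPENED && next == LWB_STATE_ISSUED) ||
	    (cur == LWB_STATE_ISSUED && next == LWB_STATE_DONE));
}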
+ */
+typedef enum {
+ LWB_STATE_CLOSED,
+ LWB_STATE_OPENED,
+ LWB_STATE_ISSUED,
+ LWB_STATE_DONE,
+ LWB_NUM_STATES
+} lwb_state_t;
+
+/*
+ * Log write block (lwb)
+ *
+ * Prior to an lwb being issued to disk via zil_lwb_write_issue(), it
+ * will be protected by the zilog's "zl_issuer_lock". Basically, prior
+ * to it being issued, it will only be accessed by the thread that's
+ * holding the "zl_issuer_lock". After the lwb is issued, the zilog's
+ * "zl_lock" is used to protect the lwb against concurrent access.
+ */
+typedef struct lwb {
+ zilog_t *lwb_zilog; /* back pointer to log struct */
+ blkptr_t lwb_blk; /* on disk address of this log blk */
+ boolean_t lwb_slog; /* lwb_blk is on SLOG device */
+ int lwb_nused; /* # used bytes in buffer */
+ int lwb_sz; /* size of block and buffer */
+ lwb_state_t lwb_state; /* the state of this lwb */
+ char *lwb_buf; /* log write buffer */
+ zio_t *lwb_write_zio; /* zio for the lwb buffer */
+ zio_t *lwb_root_zio; /* root zio for lwb write and flushes */
+ dmu_tx_t *lwb_tx; /* tx for log block allocation */
+ uint64_t lwb_max_txg; /* highest txg in this lwb */
+ list_node_t lwb_node; /* zilog->zl_lwb_list linkage */
+ list_t lwb_waiters; /* list of zil_commit_waiter's */
+ avl_tree_t lwb_vdev_tree; /* vdevs to flush after lwb write */
+ kmutex_t lwb_vdev_lock; /* protects lwb_vdev_tree */
+ hrtime_t lwb_issued_timestamp; /* when was the lwb issued? */
+} lwb_t;
+
+/*
+ * ZIL commit waiter.
+ *
+ * This structure is allocated each time zil_commit() is called, and is
+ * used by zil_commit() to communicate with other parts of the ZIL, such
+ * that zil_commit() can know when it is safe for it to return. For more
+ * details, see the comment above zil_commit().
+ *
+ * The "zcw_lock" field is used to protect the commit waiter against
+ * concurrent access. This lock is often acquired while already holding
+ * the zilog's "zl_issuer_lock" or "zl_lock"; see the functions
+ * zil_process_commit_list() and zil_lwb_flush_vdevs_done() as examples
+ * of this. Thus, one must be careful not to acquire the
+ * "zl_issuer_lock" or "zl_lock" when already holding the "zcw_lock";
+ * e.g. see the zil_commit_waiter_timeout() function.
+ */
+typedef struct zil_commit_waiter {
+ kcondvar_t zcw_cv; /* signalled when "done" */
+ kmutex_t zcw_lock; /* protects fields of this struct */
+ list_node_t zcw_node; /* linkage in lwb_t:lwb_waiter list */
+ lwb_t *zcw_lwb; /* back pointer to lwb when linked */
+ boolean_t zcw_done; /* B_TRUE when "done", else B_FALSE */
+ int zcw_zio_error; /* contains the zio io_error value */
+} zil_commit_waiter_t;
+
+/*
+ * Intent log transaction lists
+ */
+typedef struct itxs {
+ list_t i_sync_list; /* list of synchronous itxs */
+ avl_tree_t i_async_tree; /* tree of foids for async itxs */
+} itxs_t;
+
+typedef struct itxg {
+ kmutex_t itxg_lock; /* lock for this structure */
+ uint64_t itxg_txg; /* txg for this chain */
+ itxs_t *itxg_itxs; /* sync and async itxs */
+} itxg_t;
+
+/* for async nodes we build up an AVL tree of lists of async itxs per file */
+typedef struct itx_async_node {
+ uint64_t ia_foid; /* file object id */
+ list_t ia_list; /* list of async itxs for this foid */
+ avl_node_t ia_node; /* AVL tree linkage */
+} itx_async_node_t;
+
+/*
+ * Vdev flushing: during a zil_commit(), we build up an AVL tree of the vdevs
+ * we've touched so we know which ones need a write cache flush at the end.
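
Editorial aside (not part of the imported file): a minimal sketch of how such a flush tree is typically populated, deduplicated by vdev id, using the zil_vdev_node_t defined just below and the standard Solaris avl_find()/avl_insert() primitives. The function name is hypothetical and locking/error handling are elided.

static void
zil_note_vdev_for_flush(avl_tree_t *t, uint64_t vdev)
{
	zil_vdev_node_t *zv, search;
	avl_index_t where;

	/* record each touched vdev once; duplicates are skipped */
	search.zv_vdev = vdev;
	if (avl_find(t, &search, &where) == NULL) {
		zv = kmem_alloc(sizeof (*zv), KM_SLEEP);
		zv->zv_vdev = vdev;
		avl_insert(t, zv, where);
	}
}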
+ */
+typedef struct zil_vdev_node {
+ uint64_t zv_vdev; /* vdev to be flushed */
+ avl_node_t zv_node; /* AVL tree linkage */
+} zil_vdev_node_t;
+
+#define ZIL_PREV_BLKS 16
+
+/*
+ * Stable storage intent log management structure. One per dataset.
+ */
+struct zilog {
+ kmutex_t zl_lock; /* protects most zilog_t fields */
+ struct dsl_pool *zl_dmu_pool; /* DSL pool */
+ spa_t *zl_spa; /* handle for read/write log */
+ const zil_header_t *zl_header; /* log header buffer */
+ objset_t *zl_os; /* object set we're logging */
+ zil_get_data_t *zl_get_data; /* callback to get object content */
+ lwb_t *zl_last_lwb_opened; /* most recent lwb opened */
+ hrtime_t zl_last_lwb_latency; /* zio latency of last lwb done */
+ uint64_t zl_lr_seq; /* on-disk log record sequence number */
+ uint64_t zl_commit_lr_seq; /* last committed on-disk lr seq */
+ uint64_t zl_destroy_txg; /* txg of last zil_destroy() */
+ uint64_t zl_replayed_seq[TXG_SIZE]; /* last replayed rec seq */
+ uint64_t zl_replaying_seq; /* current replay seq number */
+ uint32_t zl_suspend; /* log suspend count */
+ kcondvar_t zl_cv_suspend; /* log suspend completion */
+ uint8_t zl_suspending; /* log is currently suspending */
+ uint8_t zl_keep_first; /* keep first log block in destroy */
+ uint8_t zl_replay; /* replaying records while set */
+ uint8_t zl_stop_sync; /* for debugging */
+ kmutex_t zl_issuer_lock; /* single writer, per ZIL, at a time */
+ uint8_t zl_logbias; /* latency or throughput */
+ uint8_t zl_sync; /* synchronous or asynchronous */
+ int zl_parse_error; /* last zil_parse() error */
+ uint64_t zl_parse_blk_seq; /* highest blk seq on last parse */
+ uint64_t zl_parse_lr_seq; /* highest lr seq on last parse */
+ uint64_t zl_parse_blk_count; /* number of blocks parsed */
+ uint64_t zl_parse_lr_count; /* number of log records parsed */
+ itxg_t zl_itxg[TXG_SIZE]; /* intent log txg chains */
+ list_t zl_itx_commit_list; /* itx list to be committed */
+ uint64_t zl_cur_used; /* current commit log size used */
+ list_t zl_lwb_list; /* in-flight log write list */
+ avl_tree_t zl_bp_tree; /* track bps during log parse */
+ clock_t zl_replay_time; /* lbolt of when replay started */
+ uint64_t zl_replay_blks; /* number of log blocks replayed */
+ zil_header_t zl_old_header; /* debugging aid */
+ uint_t zl_prev_blks[ZIL_PREV_BLKS]; /* size - sector rounded */
+ uint_t zl_prev_rotor; /* rotor for zl_prev[] */
+ txg_node_t zl_dirty_link; /* protected by dp_dirty_zilogs list */
+ uint64_t zl_dirty_max_txg; /* highest txg used to dirty zilog */
+};
+
+typedef struct zil_bp_node {
+ dva_t zn_dva;
+ avl_node_t zn_node;
+} zil_bp_node_t;
+
+/*
+ * Maximum amount of write data that can be put into single log block.
+ */
+#define ZIL_MAX_LOG_DATA (SPA_OLD_MAXBLOCKSIZE - sizeof (zil_chain_t) - \
+ sizeof (lr_write_t))
+
+/*
+ * Maximum amount of log space we agree to waste to reduce number of
+ * WR_NEED_COPY chunks to reduce zl_get_data() overhead (~12%).
+ */
+#define ZIL_MAX_WASTE_SPACE (ZIL_MAX_LOG_DATA / 8)
+
+/*
+ * Maximum amount of write data for WR_COPIED. Fall back to WR_NEED_COPY
+ * as more space efficient if we can't fit at least two log records into
+ * maximum sized log block.
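
Editorial worked example (not part of the imported file), under the usual LP64 assumptions with no unexpected padding: SPA_OLD_MAXBLOCKSIZE = 131072, sizeof (zil_chain_t) = 184 (8 + 128 + 8 + 40) and sizeof (lr_write_t) = 192 (32 + 4*8 + 128). Then the limit defined just below evaluates to:

/*
 *   ZIL_MAX_LOG_DATA    = 131072 - 184 - 192        = 130696 bytes
 *   ZIL_MAX_COPIED_DATA = (131072 - 184) / 2 - 192  =  65252 bytes
 *
 * i.e. a WR_COPIED record may carry at most ~64K of data, so that two
 * maximal records (header included) still fit in one maximum-sized
 * log block.
 */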
+ */
+#define ZIL_MAX_COPIED_DATA ((SPA_OLD_MAXBLOCKSIZE - \
+ sizeof (zil_chain_t)) / 2 - sizeof (lr_write_t))
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZIL_IMPL_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
new file mode 100644
index 000000000000..80a24b436a01
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
@@ -0,0 +1,665 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright 2016 Toomas Soome <tsoome@me.com>
+ */
+
+#ifndef _ZIO_H
+#define _ZIO_H
+
+#include <sys/zio_priority.h>
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/avl.h>
+#include <sys/kstat.h>
+#include <sys/fs/zfs.h>
+#include <sys/zio_impl.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Embedded checksum
+ */
+#define ZEC_MAGIC 0x210da7ab10c7a11ULL
+
+typedef struct zio_eck {
+ uint64_t zec_magic; /* for validation, endianness */
+ zio_cksum_t zec_cksum; /* 256-bit checksum */
+} zio_eck_t;
+
+/*
+ * Gang block headers are self-checksumming and contain an array
+ * of block pointers.
+ */
+#define SPA_GANGBLOCKSIZE SPA_MINBLOCKSIZE
+#define SPA_GBH_NBLKPTRS ((SPA_GANGBLOCKSIZE - \
+ sizeof (zio_eck_t)) / sizeof (blkptr_t))
+#define SPA_GBH_FILLER ((SPA_GANGBLOCKSIZE - \
+ sizeof (zio_eck_t) - \
+ (SPA_GBH_NBLKPTRS * sizeof (blkptr_t))) /\
+ sizeof (uint64_t))
+
+typedef struct zio_gbh {
+ blkptr_t zg_blkptr[SPA_GBH_NBLKPTRS];
+ uint64_t zg_filler[SPA_GBH_FILLER];
+ zio_eck_t zg_tail;
+} zio_gbh_phys_t;
+
+enum zio_checksum {
+ ZIO_CHECKSUM_INHERIT = 0,
+ ZIO_CHECKSUM_ON,
+ ZIO_CHECKSUM_OFF,
+ ZIO_CHECKSUM_LABEL,
+ ZIO_CHECKSUM_GANG_HEADER,
+ ZIO_CHECKSUM_ZILOG,
+ ZIO_CHECKSUM_FLETCHER_2,
+ ZIO_CHECKSUM_FLETCHER_4,
+ ZIO_CHECKSUM_SHA256,
+ ZIO_CHECKSUM_ZILOG2,
+ ZIO_CHECKSUM_NOPARITY,
+ ZIO_CHECKSUM_SHA512,
+ ZIO_CHECKSUM_SKEIN,
+#ifdef illumos
+ ZIO_CHECKSUM_EDONR,
+#endif
+ ZIO_CHECKSUM_FUNCTIONS
+};
+
+/*
+ * The number of "legacy" checksum functions which can be set on individual
+ * objects.
+ */
+#define ZIO_CHECKSUM_LEGACY_FUNCTIONS ZIO_CHECKSUM_ZILOG2
+
+#define ZIO_CHECKSUM_ON_VALUE ZIO_CHECKSUM_FLETCHER_4
+#define ZIO_CHECKSUM_DEFAULT ZIO_CHECKSUM_ON
+
+#define ZIO_CHECKSUM_MASK 0xffULL
+#define ZIO_CHECKSUM_VERIFY (1 << 8)
+
+#define ZIO_DEDUPCHECKSUM ZIO_CHECKSUM_SHA256
+#define ZIO_DEDUPDITTO_MIN 100
+
+/*
+ * The number of "legacy" compression functions which can be set on individual
+ * objects.
+ */
+#define ZIO_COMPRESS_LEGACY_FUNCTIONS ZIO_COMPRESS_LZ4
+
+/*
+ * The meaning of "compress = on" is selected by the compression features
+ * enabled on a given pool.
+ */
+#define ZIO_COMPRESS_LEGACY_ON_VALUE ZIO_COMPRESS_LZJB
+#define ZIO_COMPRESS_LZ4_ON_VALUE ZIO_COMPRESS_LZ4
+
+#define ZIO_COMPRESS_DEFAULT ZIO_COMPRESS_OFF
+
+#define BOOTFS_COMPRESS_VALID(compress) \
+ ((compress) == ZIO_COMPRESS_LZJB || \
+ (compress) == ZIO_COMPRESS_LZ4 || \
+ (compress) == ZIO_COMPRESS_ON || \
+ (compress) == ZIO_COMPRESS_OFF)
+
+#define ZIO_FAILURE_MODE_WAIT 0
+#define ZIO_FAILURE_MODE_CONTINUE 1
+#define ZIO_FAILURE_MODE_PANIC 2
+
+enum zio_flag {
+ /*
+ * Flags inherited by gang, ddt, and vdev children,
+ * and that must be equal for two zios to aggregate
+ */
+ ZIO_FLAG_DONT_AGGREGATE = 1 << 0,
+ ZIO_FLAG_IO_REPAIR = 1 << 1,
+ ZIO_FLAG_SELF_HEAL = 1 << 2,
+ ZIO_FLAG_RESILVER = 1 << 3,
+ ZIO_FLAG_SCRUB = 1 << 4,
+ ZIO_FLAG_SCAN_THREAD = 1 << 5,
+ ZIO_FLAG_PHYSICAL = 1 << 6,
+
+#define ZIO_FLAG_AGG_INHERIT (ZIO_FLAG_CANFAIL - 1)
+
+ /*
+ * Flags inherited by ddt, gang, and vdev children.
+ */
+ ZIO_FLAG_CANFAIL = 1 << 7, /* must be first for INHERIT */
+ ZIO_FLAG_SPECULATIVE = 1 << 8,
+ ZIO_FLAG_CONFIG_WRITER = 1 << 9,
+ ZIO_FLAG_DONT_RETRY = 1 << 10,
+ ZIO_FLAG_DONT_CACHE = 1 << 11,
+ ZIO_FLAG_NODATA = 1 << 12,
+ ZIO_FLAG_INDUCE_DAMAGE = 1 << 13,
+ ZIO_FLAG_IO_ALLOCATING = 1 << 14,
+
+#define ZIO_FLAG_DDT_INHERIT (ZIO_FLAG_IO_RETRY - 1)
+#define ZIO_FLAG_GANG_INHERIT (ZIO_FLAG_IO_RETRY - 1)
+
+ /*
+ * Flags inherited by vdev children.
+ */
+ ZIO_FLAG_IO_RETRY = 1 << 15, /* must be first for INHERIT */
+ ZIO_FLAG_PROBE = 1 << 16,
+ ZIO_FLAG_TRYHARD = 1 << 17,
+ ZIO_FLAG_OPTIONAL = 1 << 18,
+
+#define ZIO_FLAG_VDEV_INHERIT (ZIO_FLAG_DONT_QUEUE - 1)
+
+ /*
+ * Flags not inherited by any children.
+ */
+ ZIO_FLAG_DONT_QUEUE = 1 << 19, /* must be first for INHERIT */
+ ZIO_FLAG_DONT_PROPAGATE = 1 << 20,
+ ZIO_FLAG_IO_BYPASS = 1 << 21,
+ ZIO_FLAG_IO_REWRITE = 1 << 22,
+ ZIO_FLAG_RAW = 1 << 23,
+ ZIO_FLAG_GANG_CHILD = 1 << 24,
+ ZIO_FLAG_DDT_CHILD = 1 << 25,
+ ZIO_FLAG_GODFATHER = 1 << 26,
+ ZIO_FLAG_NOPWRITE = 1 << 27,
+ ZIO_FLAG_REEXECUTED = 1 << 28,
+ ZIO_FLAG_DELEGATED = 1 << 29,
+};
+
+#define ZIO_FLAG_MUSTSUCCEED 0
+
+#define ZIO_DDT_CHILD_FLAGS(zio) \
+ (((zio)->io_flags & ZIO_FLAG_DDT_INHERIT) | \
+ ZIO_FLAG_DDT_CHILD | ZIO_FLAG_CANFAIL)
+
+#define ZIO_GANG_CHILD_FLAGS(zio) \
+ (((zio)->io_flags & ZIO_FLAG_GANG_INHERIT) | \
+ ZIO_FLAG_GANG_CHILD | ZIO_FLAG_CANFAIL)
+
+#define ZIO_VDEV_CHILD_FLAGS(zio) \
+ (((zio)->io_flags & ZIO_FLAG_VDEV_INHERIT) | \
+ ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_CANFAIL)
+
+#define ZIO_CHILD_BIT(x) (1 << (x))
+#define ZIO_CHILD_BIT_IS_SET(val, x) ((val) & (1 << (x)))
+
+enum zio_child {
+ ZIO_CHILD_VDEV = 0,
+ ZIO_CHILD_GANG,
+ ZIO_CHILD_DDT,
+ ZIO_CHILD_LOGICAL,
+ ZIO_CHILD_TYPES
+};
+
+#define ZIO_CHILD_VDEV_BIT ZIO_CHILD_BIT(ZIO_CHILD_VDEV)
+#define ZIO_CHILD_GANG_BIT ZIO_CHILD_BIT(ZIO_CHILD_GANG)
+#define ZIO_CHILD_DDT_BIT ZIO_CHILD_BIT(ZIO_CHILD_DDT)
+#define ZIO_CHILD_LOGICAL_BIT ZIO_CHILD_BIT(ZIO_CHILD_LOGICAL)
+#define ZIO_CHILD_ALL_BITS \
+ (ZIO_CHILD_VDEV_BIT | ZIO_CHILD_GANG_BIT | \
+ ZIO_CHILD_DDT_BIT | ZIO_CHILD_LOGICAL_BIT)
+
+enum zio_wait_type {
+ ZIO_WAIT_READY = 0,
+ ZIO_WAIT_DONE,
+ ZIO_WAIT_TYPES
+};
+
+/*
+ * We'll take the numbers 122 and 123 to indicate checksum errors and
+ * fragmentation. They don't collide with any errno values as they
+ * are greater than ELAST.
+ */
+#define ECKSUM 122
+#define EFRAGS 123
+
+typedef void zio_done_func_t(zio_t *zio);
+
+extern boolean_t zio_dva_throttle_enabled;
+extern const char *zio_type_name[ZIO_TYPES];
+
+/*
+ * A bookmark is a four-tuple <objset, object, level, blkid> that uniquely
+ * identifies any block in the pool. By convention, the meta-objset (MOS)
+ * is objset 0, and the meta-dnode is object 0. This covers all blocks
+ * except root blocks and ZIL blocks, which are defined as follows:
+ *
+ * Root blocks (objset_phys_t) are object 0, level -1: <objset, 0, -1, 0>.
+ * ZIL blocks are bookmarked <objset, 0, -2, blkid == ZIL sequence number>.
+ * dmu_sync()ed ZIL data blocks are bookmarked <objset, object, -2, blkid>.
+ * dnode visit bookmarks are <objset, object id of dnode, -3, 0>.
+ *
+ * Note: this structure is called a bookmark because its original purpose
+ * was to remember where to resume a pool-wide traverse.
+ *
+ * Note: this structure is passed between userland and the kernel, and is
+ * stored on disk (by virtue of being incorporated into other on-disk
+ * structures, e.g. dsl_scan_phys_t).
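
Editorial aside (not part of the imported file): a minimal sketch of constructing the bookmark for a ZIL block, per the convention above -- object 0, level -2, and the ZIL sequence number as the block id -- using the SET_BOOKMARK macro and ZB_ZIL_* constants defined just below. The helper name is hypothetical.

static void
zil_block_bookmark(uint64_t objset, uint64_t seq, zbookmark_phys_t *zb)
{
	/* <objset, 0, -2, seq> identifies a ZIL block */
	SET_BOOKMARK(zb, objset, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, seq);
}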
+ */ +typedef struct zbookmark_phys { + uint64_t zb_objset; + uint64_t zb_object; + int64_t zb_level; + uint64_t zb_blkid; +} zbookmark_phys_t; + +#define SET_BOOKMARK(zb, objset, object, level, blkid) \ +{ \ + (zb)->zb_objset = objset; \ + (zb)->zb_object = object; \ + (zb)->zb_level = level; \ + (zb)->zb_blkid = blkid; \ +} + +#define ZB_DESTROYED_OBJSET (-1ULL) + +#define ZB_ROOT_OBJECT (0ULL) +#define ZB_ROOT_LEVEL (-1LL) +#define ZB_ROOT_BLKID (0ULL) + +#define ZB_ZIL_OBJECT (0ULL) +#define ZB_ZIL_LEVEL (-2LL) + +#define ZB_DNODE_LEVEL (-3LL) +#define ZB_DNODE_BLKID (0ULL) + +#define ZB_IS_ZERO(zb) \ + ((zb)->zb_objset == 0 && (zb)->zb_object == 0 && \ + (zb)->zb_level == 0 && (zb)->zb_blkid == 0) +#define ZB_IS_ROOT(zb) \ + ((zb)->zb_object == ZB_ROOT_OBJECT && \ + (zb)->zb_level == ZB_ROOT_LEVEL && \ + (zb)->zb_blkid == ZB_ROOT_BLKID) + +typedef struct zio_prop { + enum zio_checksum zp_checksum; + enum zio_compress zp_compress; + dmu_object_type_t zp_type; + uint8_t zp_level; + uint8_t zp_copies; + boolean_t zp_dedup; + boolean_t zp_dedup_verify; + boolean_t zp_nopwrite; +} zio_prop_t; + +typedef struct zio_cksum_report zio_cksum_report_t; + +typedef void zio_cksum_finish_f(zio_cksum_report_t *rep, + const void *good_data); +typedef void zio_cksum_free_f(void *cbdata, size_t size); + +struct zio_bad_cksum; /* defined in zio_checksum.h */ +struct dnode_phys; +struct abd; + +struct zio_cksum_report { + struct zio_cksum_report *zcr_next; + nvlist_t *zcr_ereport; + nvlist_t *zcr_detector; + void *zcr_cbdata; + size_t zcr_cbinfo; /* passed to zcr_free() */ + uint64_t zcr_align; + uint64_t zcr_length; + zio_cksum_finish_f *zcr_finish; + zio_cksum_free_f *zcr_free; + + /* internal use only */ + struct zio_bad_cksum *zcr_ckinfo; /* information from failure */ +}; + +typedef void zio_vsd_cksum_report_f(zio_t *zio, zio_cksum_report_t *zcr, + void *arg); + +zio_vsd_cksum_report_f zio_vsd_default_cksum_report; + +typedef struct zio_vsd_ops { + zio_done_func_t *vsd_free; + zio_vsd_cksum_report_f *vsd_cksum_report; +} zio_vsd_ops_t; + +typedef struct zio_gang_node { + zio_gbh_phys_t *gn_gbh; + struct zio_gang_node *gn_child[SPA_GBH_NBLKPTRS]; +} zio_gang_node_t; + +typedef zio_t *zio_gang_issue_func_t(zio_t *zio, blkptr_t *bp, + zio_gang_node_t *gn, struct abd *data, uint64_t offset); + +typedef void zio_transform_func_t(zio_t *zio, struct abd *data, uint64_t size); + +typedef struct zio_transform { + struct abd *zt_orig_abd; + uint64_t zt_orig_size; + uint64_t zt_bufsize; + zio_transform_func_t *zt_transform; + struct zio_transform *zt_next; +} zio_transform_t; + +typedef zio_t *zio_pipe_stage_t(zio_t *zio); + +/* + * The io_reexecute flags are distinct from io_flags because the child must + * be able to propagate them to the parent. The normal io_flags are local + * to the zio, not protected by any lock, and not modifiable by children; + * the reexecute flags are protected by io_lock, modifiable by children, + * and always propagated -- even when ZIO_FLAG_DONT_PROPAGATE is set. + */ +#define ZIO_REEXECUTE_NOW 0x01 +#define ZIO_REEXECUTE_SUSPEND 0x02 + +typedef struct zio_alloc_list { + list_t zal_list; + uint64_t zal_size; +} zio_alloc_list_t; + +typedef struct zio_link { + zio_t *zl_parent; + zio_t *zl_child; + list_node_t zl_parent_node; + list_node_t zl_child_node; +} zio_link_t; + +/* + * Used for TRIM kstat. + */ +typedef struct zio_trim_stats { + /* + * Number of bytes successfully TRIMmed. + */ + kstat_named_t bytes; + + /* + * Number of successful TRIM requests. 
+ */ + kstat_named_t success; + + /* + * Number of TRIM requests that failed because TRIM is not + * supported. + */ + kstat_named_t unsupported; + + /* + * Number of TRIM requests that failed for other reasons. + */ + kstat_named_t failed; +} zio_trim_stats_t; + +extern zio_trim_stats_t zio_trim_stats; + +#define ZIO_TRIM_STAT_INCR(stat, val) \ + atomic_add_64(&zio_trim_stats.stat.value.ui64, (val)); +#define ZIO_TRIM_STAT_BUMP(stat) \ + ZIO_TRIM_STAT_INCR(stat, 1); + +struct zio { + /* Core information about this I/O */ + zbookmark_phys_t io_bookmark; + zio_prop_t io_prop; + zio_type_t io_type; + enum zio_child io_child_type; + int io_cmd; + zio_priority_t io_priority; + uint8_t io_reexecute; + uint8_t io_state[ZIO_WAIT_TYPES]; + uint64_t io_txg; + spa_t *io_spa; + blkptr_t *io_bp; + blkptr_t *io_bp_override; + blkptr_t io_bp_copy; + list_t io_parent_list; + list_t io_child_list; + zio_t *io_logical; + zio_transform_t *io_transform_stack; + + /* Callback info */ + zio_done_func_t *io_ready; + zio_done_func_t *io_children_ready; + zio_done_func_t *io_physdone; + zio_done_func_t *io_done; + void *io_private; + int64_t io_prev_space_delta; /* DMU private */ + blkptr_t io_bp_orig; + + /* Data represented by this I/O */ + struct abd *io_abd; + struct abd *io_orig_abd; + uint64_t io_size; + uint64_t io_orig_size; + /* io_lsize != io_orig_size iff this is a raw write */ + uint64_t io_lsize; + + /* Stuff for the vdev stack */ + vdev_t *io_vd; + void *io_vsd; + const zio_vsd_ops_t *io_vsd_ops; + + uint64_t io_offset; + hrtime_t io_timestamp; + hrtime_t io_queued_timestamp; + hrtime_t io_target_timestamp; + avl_node_t io_queue_node; + avl_node_t io_offset_node; + avl_node_t io_alloc_node; + zio_alloc_list_t io_alloc_list; + +#ifdef __FreeBSD__ + struct bio *io_bio; +#endif + + /* Internal pipeline state */ + enum zio_flag io_flags; + enum zio_stage io_stage; + enum zio_stage io_pipeline; + enum zio_flag io_orig_flags; + enum zio_stage io_orig_stage; + enum zio_stage io_orig_pipeline; + enum zio_stage io_pipeline_trace; + int io_error; + int io_child_error[ZIO_CHILD_TYPES]; + uint64_t io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES]; + uint64_t io_child_count; + uint64_t io_phys_children; + uint64_t io_parent_count; + uint64_t *io_stall; + zio_t *io_gang_leader; + zio_gang_node_t *io_gang_tree; + void *io_executor; + void *io_waiter; + kmutex_t io_lock; + kcondvar_t io_cv; + int io_allocator; + + /* FMA state */ + zio_cksum_report_t *io_cksum_report; + uint64_t io_ena; + + /* Taskq dispatching state */ + taskq_ent_t io_tqent; + + avl_node_t io_trim_node; + list_node_t io_trim_link; +}; + +extern int zio_bookmark_compare(const void *, const void *); + +extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, + zio_done_func_t *done, void *priv, enum zio_flag flags); + +extern zio_t *zio_root(spa_t *spa, + zio_done_func_t *done, void *priv, enum zio_flag flags); + +extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, + struct abd *data, uint64_t lsize, zio_done_func_t *done, void *priv, + zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb); + +extern zio_t *zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, + struct abd *data, uint64_t size, uint64_t psize, const zio_prop_t *zp, + zio_done_func_t *ready, zio_done_func_t *children_ready, + zio_done_func_t *physdone, zio_done_func_t *done, + void *priv, zio_priority_t priority, enum zio_flag flags, + const zbookmark_phys_t *zb); + +extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 
+ struct abd *data, uint64_t size, zio_done_func_t *done, void *priv, + zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb); + +extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies, + boolean_t nopwrite); + +extern void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp); + +extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, + const blkptr_t *bp, + zio_done_func_t *done, void *priv, enum zio_flag flags); + +extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, + uint64_t offset, uint64_t size, zio_done_func_t *done, void *priv, + zio_priority_t priority, enum zio_flag flags); + +extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, + uint64_t size, struct abd *data, int checksum, + zio_done_func_t *done, void *priv, zio_priority_t priority, + enum zio_flag flags, boolean_t labels); + +extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, + uint64_t size, struct abd *data, int checksum, + zio_done_func_t *done, void *priv, zio_priority_t priority, + enum zio_flag flags, boolean_t labels); + +extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, + const blkptr_t *bp, uint64_t size, enum zio_flag flags); + +extern int zio_alloc_zil(spa_t *spa, uint64_t objset, uint64_t txg, + blkptr_t *new_bp, blkptr_t *old_bp, uint64_t size, boolean_t *slog); +extern void zio_flush(zio_t *zio, vdev_t *vd); +extern zio_t *zio_trim(zio_t *zio, spa_t *spa, vdev_t *vd, uint64_t offset, + uint64_t size); +extern void zio_shrink(zio_t *zio, uint64_t size); + +extern int zio_wait(zio_t *zio); +extern void zio_nowait(zio_t *zio); +extern void zio_execute(zio_t *zio); +extern void zio_interrupt(zio_t *zio); +extern void zio_delay_init(zio_t *zio); +extern void zio_delay_interrupt(zio_t *zio); + +extern zio_t *zio_walk_parents(zio_t *cio, zio_link_t **); +extern zio_t *zio_walk_children(zio_t *pio, zio_link_t **); +extern zio_t *zio_unique_parent(zio_t *cio); +extern void zio_add_child(zio_t *pio, zio_t *cio); + +extern void *zio_buf_alloc(size_t size); +extern void zio_buf_free(void *buf, size_t size); +extern void *zio_data_buf_alloc(size_t size); +extern void zio_data_buf_free(void *buf, size_t size); + +extern void zio_push_transform(zio_t *zio, struct abd *abd, uint64_t size, + uint64_t bufsize, zio_transform_func_t *transform); +extern void zio_pop_transforms(zio_t *zio); + +extern void zio_resubmit_stage_async(void *); + +extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, + uint64_t offset, struct abd *data, uint64_t size, int type, + zio_priority_t priority, enum zio_flag flags, + zio_done_func_t *done, void *priv); + +extern zio_t *zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, + struct abd *data, uint64_t size, zio_type_t type, zio_priority_t priority, + enum zio_flag flags, zio_done_func_t *done, void *priv); + +extern void zio_vdev_io_bypass(zio_t *zio); +extern void zio_vdev_io_reissue(zio_t *zio); +extern void zio_vdev_io_redone(zio_t *zio); + +extern void zio_change_priority(zio_t *pio, zio_priority_t priority); + +extern void zio_checksum_verified(zio_t *zio); +extern int zio_worst_error(int e1, int e2); + +extern enum zio_checksum zio_checksum_select(enum zio_checksum child, + enum zio_checksum parent); +extern enum zio_checksum zio_checksum_dedup_select(spa_t *spa, + enum zio_checksum child, enum zio_checksum parent); +extern enum zio_compress zio_compress_select(spa_t *spa, + enum zio_compress child, enum zio_compress parent); + +extern void zio_suspend(spa_t *spa, zio_t 
*zio); +extern int zio_resume(spa_t *spa); +extern void zio_resume_wait(spa_t *spa); + +/* + * Initial setup and teardown. + */ +extern void zio_init(void); +extern void zio_fini(void); + +/* + * Fault injection + */ +struct zinject_record; +extern uint32_t zio_injection_enabled; +extern int zio_inject_fault(char *name, int flags, int *id, + struct zinject_record *record); +extern int zio_inject_list_next(int *id, char *name, size_t buflen, + struct zinject_record *record); +extern int zio_clear_fault(int id); +extern void zio_handle_panic_injection(spa_t *spa, char *tag, uint64_t type); +extern int zio_handle_fault_injection(zio_t *zio, int error); +extern int zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error); +extern int zio_handle_label_injection(zio_t *zio, int error); +extern void zio_handle_ignored_writes(zio_t *zio); +extern hrtime_t zio_handle_io_delay(zio_t *zio); + +/* + * Checksum ereport functions + */ +extern void zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, struct zio *zio, + uint64_t offset, uint64_t length, void *arg, struct zio_bad_cksum *info); +extern void zfs_ereport_finish_checksum(zio_cksum_report_t *report, + const void *good_data, const void *bad_data, boolean_t drop_if_identical); + +extern void zfs_ereport_send_interim_checksum(zio_cksum_report_t *report); +extern void zfs_ereport_free_checksum(zio_cksum_report_t *report); + +/* If we have the good data in hand, this function can be used */ +extern void zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, + struct zio *zio, uint64_t offset, uint64_t length, + const void *good_data, const void *bad_data, struct zio_bad_cksum *info); + +/* Called from spa_sync(), but primarily an injection handler */ +extern void spa_handle_ignored_writes(spa_t *spa); + +/* zbookmark_phys functions */ +boolean_t zbookmark_subtree_completed(const struct dnode_phys *dnp, + const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block); +int zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, + uint8_t ibs2, const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2); + +#ifdef __cplusplus +} +#endif + +#endif /* _ZIO_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h new file mode 100644 index 000000000000..782df534c9a0 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h @@ -0,0 +1,119 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, 2016 by Delphix. All rights reserved. + * Copyright Saso Kiselkov 2013, All rights reserved. 
+ */ + +#ifndef _SYS_ZIO_CHECKSUM_H +#define _SYS_ZIO_CHECKSUM_H + +#include <sys/zio.h> +#include <zfeature_common.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct abd; + +/* + * Signature for checksum functions. + */ +typedef void zio_checksum_t(struct abd *, uint64_t size, + const void *ctx_template, zio_cksum_t *zcp); +typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt); +typedef void zio_checksum_tmpl_free_t(void *ctx_template); + +typedef enum zio_checksum_flags { + /* Strong enough for metadata? */ + ZCHECKSUM_FLAG_METADATA = (1 << 1), + /* ZIO embedded checksum */ + ZCHECKSUM_FLAG_EMBEDDED = (1 << 2), + /* Strong enough for dedup (without verification)? */ + ZCHECKSUM_FLAG_DEDUP = (1 << 3), + /* Uses salt value */ + ZCHECKSUM_FLAG_SALTED = (1 << 4), + /* Strong enough for nopwrite? */ + ZCHECKSUM_FLAG_NOPWRITE = (1 << 5) +} zio_checksum_flags_t; + +/* + * Information about each checksum function. + */ +typedef struct zio_checksum_info { + /* checksum function for each byteorder */ + zio_checksum_t *ci_func[2]; + zio_checksum_tmpl_init_t *ci_tmpl_init; + zio_checksum_tmpl_free_t *ci_tmpl_free; + zio_checksum_flags_t ci_flags; + char *ci_name; /* descriptive name */ +} zio_checksum_info_t; + +typedef struct zio_bad_cksum { + zio_cksum_t zbc_expected; + zio_cksum_t zbc_actual; + const char *zbc_checksum_name; + uint8_t zbc_byteswapped; + uint8_t zbc_injected; + uint8_t zbc_has_cksum; /* expected/actual valid */ +} zio_bad_cksum_t; + +extern zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS]; + +/* + * Checksum routines. + */ +extern zio_checksum_t abd_checksum_SHA256; +extern zio_checksum_t abd_checksum_SHA512_native; +extern zio_checksum_t abd_checksum_SHA512_byteswap; + +/* Skein */ +extern zio_checksum_t abd_checksum_skein_native; +extern zio_checksum_t abd_checksum_skein_byteswap; +extern zio_checksum_tmpl_init_t abd_checksum_skein_tmpl_init; +extern zio_checksum_tmpl_free_t abd_checksum_skein_tmpl_free; + +#ifdef illumos +/* Edon-R */ +extern zio_checksum_t abd_checksum_edonr_native; +extern zio_checksum_t abd_checksum_edonr_byteswap; +extern zio_checksum_tmpl_init_t abd_checksum_edonr_tmpl_init; +extern zio_checksum_tmpl_free_t abd_checksum_edonr_tmpl_free; +#endif + +extern int zio_checksum_equal(spa_t *, blkptr_t *, enum zio_checksum, + void *, uint64_t, uint64_t, zio_bad_cksum_t *); +extern void zio_checksum_compute(zio_t *, enum zio_checksum, + struct abd *, uint64_t); +extern int zio_checksum_error_impl(spa_t *, blkptr_t *, enum zio_checksum, + struct abd *, uint64_t, uint64_t, zio_bad_cksum_t *); +extern int zio_checksum_error(zio_t *zio, zio_bad_cksum_t *out); +extern enum zio_checksum spa_dedup_checksum(spa_t *spa); +extern void zio_checksum_templates_free(spa_t *spa); +extern spa_feature_t zio_checksum_to_feature(enum zio_checksum cksum); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZIO_CHECKSUM_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_compress.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_compress.h new file mode 100644 index 000000000000..aab0282c45be --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_compress.h @@ -0,0 +1,128 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. + * Copyright (c) 2015, 2016 by Delphix. All rights reserved. + */ + +#ifndef _SYS_ZIO_COMPRESS_H +#define _SYS_ZIO_COMPRESS_H + +#include <sys/abd.h> + +#ifdef __cplusplus +extern "C" { +#endif + +enum zio_compress { + ZIO_COMPRESS_INHERIT = 0, + ZIO_COMPRESS_ON, + ZIO_COMPRESS_OFF, + ZIO_COMPRESS_LZJB, + ZIO_COMPRESS_EMPTY, + ZIO_COMPRESS_GZIP_1, + ZIO_COMPRESS_GZIP_2, + ZIO_COMPRESS_GZIP_3, + ZIO_COMPRESS_GZIP_4, + ZIO_COMPRESS_GZIP_5, + ZIO_COMPRESS_GZIP_6, + ZIO_COMPRESS_GZIP_7, + ZIO_COMPRESS_GZIP_8, + ZIO_COMPRESS_GZIP_9, + ZIO_COMPRESS_ZLE, + ZIO_COMPRESS_LZ4, + ZIO_COMPRESS_FUNCTIONS +}; + +/* Common signature for all zio compress functions. */ +typedef size_t zio_compress_func_t(void *src, void *dst, + size_t s_len, size_t d_len, int); +/* Common signature for all zio decompress functions. */ +typedef int zio_decompress_func_t(void *src, void *dst, + size_t s_len, size_t d_len, int); +/* + * Common signature for all zio decompress functions using an ABD as input. + * This is helpful if you have both compressed ARC and scatter ABDs enabled, + * but is not a requirement for all compression algorithms. + */ +typedef int zio_decompress_abd_func_t(abd_t *src, void *dst, + size_t s_len, size_t d_len, int); + +/* + * Information about each compression function. + */ +typedef struct zio_compress_info { + char *ci_name; + int ci_level; + zio_compress_func_t *ci_compress; + zio_decompress_func_t *ci_decompress; +} zio_compress_info_t; + +extern zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS]; + +/* + * Compression routines. + */ +extern size_t lzjb_compress(void *src, void *dst, size_t s_len, size_t d_len, + int level); +extern int lzjb_decompress(void *src, void *dst, size_t s_len, size_t d_len, + int level); +extern size_t gzip_compress(void *src, void *dst, size_t s_len, size_t d_len, + int level); +extern int gzip_decompress(void *src, void *dst, size_t s_len, size_t d_len, + int level); +extern size_t zle_compress(void *src, void *dst, size_t s_len, size_t d_len, + int level); +extern int zle_decompress(void *src, void *dst, size_t s_len, size_t d_len, + int level); +extern void lz4_init(void); +extern void lz4_fini(void); +extern size_t lz4_compress(void *src, void *dst, size_t s_len, size_t d_len, + int level); +extern int lz4_decompress(void *src, void *dst, size_t s_len, size_t d_len, + int level); + +/* + * Compress and decompress data if necessary. 
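
Editorial aside (not part of the imported file): a minimal round-trip sketch through the interfaces declared just below. Our understanding (hedged): the compressor returns the compressed length, or the original length when the data did not compress enough to be worth storing compressed. The helper name and buffers are hypothetical.

static int
compress_roundtrip(enum zio_compress c, abd_t *src, void *cbuf,
    void *out, size_t s_len)
{
	size_t psize;

	psize = zio_compress_data(c, src, cbuf, s_len);
	if (psize >= s_len)
		return (0);	/* stored uncompressed; nothing to decode */

	/* decompress back to the logical size; returns 0 on success */
	return (zio_decompress_data_buf(c, cbuf, out, psize, s_len));
}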
+ */
+extern size_t zio_compress_data(enum zio_compress c, abd_t *src, void *dst,
+ size_t s_len);
+extern int zio_decompress_data(enum zio_compress c, abd_t *src, void *dst,
+ size_t s_len, size_t d_len);
+extern int zio_decompress_data_buf(enum zio_compress c, void *src, void *dst,
+ size_t s_len, size_t d_len);
+
+/*
+ * Module lifetime management.
+ */
+extern void zio_compress_init(void);
+extern void zio_compress_fini(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZIO_COMPRESS_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h
new file mode 100644
index 000000000000..96b3b0135813
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h
@@ -0,0 +1,256 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ */
+
+#ifndef _ZIO_IMPL_H
+#define _ZIO_IMPL_H
+
+#include <sys/zfs_context.h>
+#include <sys/zio.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * XXX -- Describe ZFS I/O pipeline here. Fill in as needed.
+ *
+ * The ZFS I/O pipeline is comprised of various stages which are defined
+ * in the zio_stage enum below. The individual stages are used to construct
+ * these basic I/O operations: Read, Write, Free, Claim, and Ioctl.
+ *
+ * I/O operations: (XXX - provide detail for each of the operations)
+ *
+ * Read:
+ * Write:
+ * Free:
+ * Claim:
+ * Ioctl:
+ *
+ * Although the most common pipelines are used by the basic I/O operations
+ * above, there are some helper pipelines (one could consider them
+ * sub-pipelines) which are used internally by the ZIO module and are
+ * explained below:
+ *
+ * Interlock Pipeline:
+ * The interlock pipeline is the most basic pipeline and is used by all
+ * of the I/O operations. The interlock pipeline does not perform any I/O
+ * and is used to coordinate the dependencies between I/Os that are being
+ * issued (i.e. the parent/child relationship).
+ *
+ * Vdev child Pipeline:
+ * The vdev child pipeline is responsible for performing the physical I/O.
+ * It is in this pipeline where the I/Os are queued and possibly cached.
+ *
+ * In addition to performing I/O, the pipeline is also responsible for
+ * data transformations. The transformations performed are based on the
+ * specific properties that the user may have selected and modify the
+ * behavior of the pipeline. Examples of supported transformations are
+ * compression, dedup, and nop writes. Transformations will either modify
+ * the data or the pipeline. The list below further describes each of
+ * the supported transformations:
+ *
+ * Compression:
+ * ZFS supports several flavors of compression -- gzip, lzjb, zle, and
+ * lz4. Compression occurs as part of the write pipeline and is
+ * performed in the ZIO_STAGE_WRITE_BP_INIT stage.
+ *
+ * Dedup:
+ * Dedup reads are handled by the ZIO_STAGE_DDT_READ_START and
+ * ZIO_STAGE_DDT_READ_DONE stages. These stages are added to an existing
+ * read pipeline if the dedup bit is set on the block pointer.
+ * Writing a dedup block is performed by the ZIO_STAGE_DDT_WRITE stage
+ * and added to a write pipeline if a user has enabled dedup on that
+ * particular dataset.
+ *
+ * NOP Write:
+ * The NOP write feature is performed by the ZIO_STAGE_NOP_WRITE stage
+ * and is added to an existing write pipeline if a cryptographically
+ * secure checksum (i.e. SHA256) is enabled and compression is turned on.
+ * The NOP write stage will compare the checksums of the current data
+ * on-disk (level-0 blocks only) and the data that is currently being written.
+ * If the checksum values are identical then the pipeline is converted to
+ * an interlock pipeline skipping block allocation and bypassing the
+ * physical I/O. The nop write feature can handle writes in either
+ * syncing or open context (i.e. zil writes) and as a result is mutually
+ * exclusive with dedup.
+ */
+
+/*
+ * zio pipeline stage definitions
+ */
+enum zio_stage {
+ ZIO_STAGE_OPEN = 1 << 0, /* RWFCI */
+
+ ZIO_STAGE_READ_BP_INIT = 1 << 1, /* R---- */
+ ZIO_STAGE_WRITE_BP_INIT = 1 << 2, /* -W--- */
+ ZIO_STAGE_FREE_BP_INIT = 1 << 3, /* --F-- */
+ ZIO_STAGE_ISSUE_ASYNC = 1 << 4, /* RWF-- */
+ ZIO_STAGE_WRITE_COMPRESS = 1 << 5, /* -W--- */
+
+ ZIO_STAGE_CHECKSUM_GENERATE = 1 << 6, /* -W--- */
+
+ ZIO_STAGE_NOP_WRITE = 1 << 7, /* -W--- */
+
+ ZIO_STAGE_DDT_READ_START = 1 << 8, /* R---- */
+ ZIO_STAGE_DDT_READ_DONE = 1 << 9, /* R---- */
+ ZIO_STAGE_DDT_WRITE = 1 << 10, /* -W--- */
+ ZIO_STAGE_DDT_FREE = 1 << 11, /* --F-- */
+
+ ZIO_STAGE_GANG_ASSEMBLE = 1 << 12, /* RWFC- */
+ ZIO_STAGE_GANG_ISSUE = 1 << 13, /* RWFC- */
+
+ ZIO_STAGE_DVA_THROTTLE = 1 << 14, /* -W--- */
+ ZIO_STAGE_DVA_ALLOCATE = 1 << 15, /* -W--- */
+ ZIO_STAGE_DVA_FREE = 1 << 16, /* --F-- */
+ ZIO_STAGE_DVA_CLAIM = 1 << 17, /* ---C- */
+
+ ZIO_STAGE_READY = 1 << 18, /* RWFCI */
+
+ ZIO_STAGE_VDEV_IO_START = 1 << 19, /* RWF-I */
+ ZIO_STAGE_VDEV_IO_DONE = 1 << 20, /* RWF-I */
+ ZIO_STAGE_VDEV_IO_ASSESS = 1 << 21, /* RWF-I */
+
+ ZIO_STAGE_CHECKSUM_VERIFY = 1 << 22, /* R---- */
+
+ ZIO_STAGE_DONE = 1 << 23 /* RWFCI */
+};
+
+#define ZIO_INTERLOCK_STAGES \
+ (ZIO_STAGE_READY | \
+ ZIO_STAGE_DONE)
+
+#define ZIO_INTERLOCK_PIPELINE \
+ ZIO_INTERLOCK_STAGES
+
+#define ZIO_VDEV_IO_STAGES \
+ (ZIO_STAGE_VDEV_IO_START | \
+ ZIO_STAGE_VDEV_IO_DONE | \
+ ZIO_STAGE_VDEV_IO_ASSESS)
+
+#define ZIO_VDEV_CHILD_PIPELINE \
+ (ZIO_VDEV_IO_STAGES | \
+ ZIO_STAGE_DONE)
+
+#define ZIO_READ_COMMON_STAGES \
+ (ZIO_INTERLOCK_STAGES | \
+ ZIO_VDEV_IO_STAGES | \
+ ZIO_STAGE_CHECKSUM_VERIFY)
+
+#define ZIO_READ_PHYS_PIPELINE \
+ ZIO_READ_COMMON_STAGES
+
+#define ZIO_READ_PIPELINE \
+ (ZIO_READ_COMMON_STAGES | \
+ ZIO_STAGE_READ_BP_INIT)
+
+#define ZIO_DDT_CHILD_READ_PIPELINE \
+ ZIO_READ_COMMON_STAGES
+
+#define ZIO_DDT_READ_PIPELINE \
+ (ZIO_INTERLOCK_STAGES | \
+ ZIO_STAGE_READ_BP_INIT | \
+ ZIO_STAGE_DDT_READ_START | \
+ ZIO_STAGE_DDT_READ_DONE)
+
+#define ZIO_WRITE_COMMON_STAGES \
+ (ZIO_INTERLOCK_STAGES | \
+ ZIO_VDEV_IO_STAGES | \
+ ZIO_STAGE_ISSUE_ASYNC | \
+ ZIO_STAGE_CHECKSUM_GENERATE)
+
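
Editorial aside (not part of the imported file): pipelines in this header are simply OR-ed masks of zio_stage bits, so asking whether a pipeline contains a stage reduces to a bit test. A minimal sketch with a hypothetical helper:

static boolean_t
zio_pipeline_has_stage(enum zio_stage pipeline, enum zio_stage stage)
{
	/* each stage is a distinct bit, so membership is a simple AND */
	return ((pipeline & stage) != 0);
}

/*
 * e.g. every write pipeline generates a checksum:
 * zio_pipeline_has_stage(ZIO_WRITE_COMMON_STAGES,
 *     ZIO_STAGE_CHECKSUM_GENERATE) == B_TRUE
 */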
+#define ZIO_WRITE_PHYS_PIPELINE \ + ZIO_WRITE_COMMON_STAGES + +#define ZIO_REWRITE_PIPELINE \ + (ZIO_WRITE_COMMON_STAGES | \ + ZIO_STAGE_WRITE_COMPRESS | \ + ZIO_STAGE_WRITE_BP_INIT) + +#define ZIO_WRITE_PIPELINE \ + (ZIO_WRITE_COMMON_STAGES | \ + ZIO_STAGE_WRITE_BP_INIT | \ + ZIO_STAGE_WRITE_COMPRESS | \ + ZIO_STAGE_DVA_THROTTLE | \ + ZIO_STAGE_DVA_ALLOCATE) + +#define ZIO_DDT_CHILD_WRITE_PIPELINE \ + (ZIO_INTERLOCK_STAGES | \ + ZIO_VDEV_IO_STAGES | \ + ZIO_STAGE_DVA_THROTTLE | \ + ZIO_STAGE_DVA_ALLOCATE) + +#define ZIO_DDT_WRITE_PIPELINE \ + (ZIO_INTERLOCK_STAGES | \ + ZIO_STAGE_WRITE_BP_INIT | \ + ZIO_STAGE_ISSUE_ASYNC | \ + ZIO_STAGE_WRITE_COMPRESS | \ + ZIO_STAGE_CHECKSUM_GENERATE | \ + ZIO_STAGE_DDT_WRITE) + +#define ZIO_GANG_STAGES \ + (ZIO_STAGE_GANG_ASSEMBLE | \ + ZIO_STAGE_GANG_ISSUE) + +#define ZIO_FREE_PIPELINE \ + (ZIO_INTERLOCK_STAGES | \ + ZIO_STAGE_FREE_BP_INIT | \ + ZIO_STAGE_DVA_FREE) + +#define ZIO_FREE_PHYS_PIPELINE \ + (ZIO_INTERLOCK_STAGES | \ + ZIO_VDEV_IO_STAGES) + +#define ZIO_DDT_FREE_PIPELINE \ + (ZIO_INTERLOCK_STAGES | \ + ZIO_STAGE_FREE_BP_INIT | \ + ZIO_STAGE_ISSUE_ASYNC | \ + ZIO_STAGE_DDT_FREE) + +#define ZIO_CLAIM_PIPELINE \ + (ZIO_INTERLOCK_STAGES | \ + ZIO_STAGE_DVA_CLAIM) + +#define ZIO_IOCTL_PIPELINE \ + (ZIO_INTERLOCK_STAGES | \ + ZIO_STAGE_VDEV_IO_START | \ + ZIO_STAGE_VDEV_IO_ASSESS) + +#define ZIO_BLOCKING_STAGES \ + (ZIO_STAGE_DVA_ALLOCATE | \ + ZIO_STAGE_DVA_CLAIM | \ + ZIO_STAGE_VDEV_IO_START) + +extern void zio_inject_init(void); +extern void zio_inject_fini(void); + +#ifdef __cplusplus +} +#endif + +#endif /* _ZIO_IMPL_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_priority.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_priority.h new file mode 100644 index 000000000000..ebe05a09dc4e --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_priority.h @@ -0,0 +1,43 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2014, 2016 by Delphix. All rights reserved. + */ +#ifndef _ZIO_PRIORITY_H +#define _ZIO_PRIORITY_H + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum zio_priority { + ZIO_PRIORITY_SYNC_READ, + ZIO_PRIORITY_SYNC_WRITE, /* ZIL */ + ZIO_PRIORITY_ASYNC_READ, /* prefetch */ + ZIO_PRIORITY_ASYNC_WRITE, /* spa_sync() */ + ZIO_PRIORITY_SCRUB, /* asynchronous scrub/resilver reads */ + ZIO_PRIORITY_TRIM, /* free requests used for TRIM */ + ZIO_PRIORITY_REMOVAL, /* reads/writes for vdev removal */ + ZIO_PRIORITY_INITIALIZING, /* initializing I/O */ + ZIO_PRIORITY_NUM_QUEUEABLE, + + ZIO_PRIORITY_NOW /* non-queued i/os (e.g. 
free) */ +} zio_priority_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _ZIO_PRIORITY_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zrlock.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zrlock.h new file mode 100644 index 000000000000..b6eba1a18ff4 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zrlock.h @@ -0,0 +1,63 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015 by Delphix. All rights reserved. + */ + +#ifndef _SYS_ZRLOCK_H +#define _SYS_ZRLOCK_H + +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct zrlock { + kmutex_t zr_mtx; + volatile int32_t zr_refcount; + kcondvar_t zr_cv; + uint16_t zr_pad; +#ifdef ZFS_DEBUG + kthread_t *zr_owner; + const char *zr_caller; +#endif +} zrlock_t; + +extern void zrl_init(zrlock_t *); +extern void zrl_destroy(zrlock_t *); +#define zrl_add(_z) zrl_add_impl((_z), __func__) +extern void zrl_add_impl(zrlock_t *, const char *); +extern void zrl_remove(zrlock_t *); +extern int zrl_tryenter(zrlock_t *); +extern void zrl_exit(zrlock_t *); +extern int zrl_is_zero(zrlock_t *); +extern int zrl_is_locked(zrlock_t *); +#ifdef ZFS_DEBUG +extern kthread_t *zrl_owner(zrlock_t *); +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZRLOCK_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zthr.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zthr.h new file mode 100644 index 000000000000..ce6033ecb6b7 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zthr.h @@ -0,0 +1,55 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2017 by Delphix. All rights reserved. 
+ */ + +#ifndef _SYS_ZTHR_H +#define _SYS_ZTHR_H + +typedef struct zthr zthr_t; +typedef int (zthr_func_t)(void *, zthr_t *); +typedef boolean_t (zthr_checkfunc_t)(void *, zthr_t *); + +struct zthr { + kthread_t *zthr_thread; + kmutex_t zthr_lock; + kcondvar_t zthr_cv; + boolean_t zthr_cancel; + hrtime_t zthr_wait_time; + + zthr_checkfunc_t *zthr_checkfunc; + zthr_func_t *zthr_func; + void *zthr_arg; + int zthr_rc; +}; + +extern zthr_t *zthr_create(zthr_checkfunc_t checkfunc, + zthr_func_t *func, void *arg); +extern zthr_t *zthr_create_timer(zthr_checkfunc_t *checkfunc, + zthr_func_t *func, void *arg, hrtime_t nano_wait); + +extern void zthr_exit(zthr_t *t, int rc); +extern void zthr_destroy(zthr_t *t); + +extern void zthr_wakeup(zthr_t *t); +extern int zthr_cancel(zthr_t *t); +extern void zthr_resume(zthr_t *t); + +extern boolean_t zthr_iscancelled(zthr_t *t); +extern boolean_t zthr_isrunning(zthr_t *t); + +#endif /* _SYS_ZTHR_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h new file mode 100644 index 000000000000..7556eda0fbe9 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h @@ -0,0 +1,85 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. 
+ */ + +#ifndef _SYS_ZVOL_H +#define _SYS_ZVOL_H + +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define ZVOL_OBJ 1ULL +#define ZVOL_ZAP_OBJ 2ULL + +#ifdef _KERNEL +extern int zvol_check_volsize(uint64_t volsize, uint64_t blocksize); +extern int zvol_check_volblocksize(uint64_t volblocksize); +extern int zvol_get_stats(objset_t *os, nvlist_t *nv); +extern void zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); +extern int zvol_create_minor(const char *); +extern int zvol_remove_minor(const char *); +extern void zvol_remove_minors(const char *); +extern int zvol_set_volsize(const char *, uint64_t); + +#ifdef illumos +extern int zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr); +extern int zvol_dump(dev_t dev, caddr_t addr, daddr_t offset, int nblocks); +extern int zvol_close(dev_t dev, int flag, int otyp, cred_t *cr); +extern int zvol_strategy(buf_t *bp); +extern int zvol_read(dev_t dev, uio_t *uiop, cred_t *cr); +extern int zvol_write(dev_t dev, uio_t *uiop, cred_t *cr); +extern int zvol_aread(dev_t dev, struct aio_req *aio, cred_t *cr); +extern int zvol_awrite(dev_t dev, struct aio_req *aio, cred_t *cr); +#endif /* illumos */ +extern int zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, + int *rvalp); +extern int zvol_busy(void); +extern void zvol_init(void); +extern void zvol_fini(void); + +#ifdef illumos +extern int zvol_get_volume_params(minor_t minor, uint64_t *blksize, + uint64_t *max_xfer_len, void **minor_hdl, void **objset_hdl, void **zil_hdl, + void **rl_hdl, void **bonus_hdl); +extern uint64_t zvol_get_volume_size(void *minor_hdl); +extern int zvol_get_volume_wce(void *minor_hdl); +extern void zvol_log_write_minor(void *minor_hdl, dmu_tx_t *tx, offset_t off, + ssize_t resid, boolean_t sync); +#endif /* illumos */ + +#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__) +extern int zvol_create_minors(const char *name); +extern void zvol_rename_minors(const char *oldname, const char *newname); +#endif + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZVOL_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/trim_map.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/trim_map.c new file mode 100644 index 000000000000..847fe19fb0d1 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/trim_map.c @@ -0,0 +1,633 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2012 Pawel Jakub Dawidek <pawel@dawidek.net>. + * All rights reserved. 
+ */ + +#include <sys/zfs_context.h> +#include <sys/spa_impl.h> +#include <sys/vdev_impl.h> +#include <sys/trim_map.h> +#include <sys/time.h> + +/* + * Calculate the zio end, rounding up based on ashift as would be + * done by zio_vdev_io_start. + * + * This makes free range consolidation much more effective + * than it would otherwise be, as well as ensuring that entire + * blocks are invalidated by writes. + */ +#define TRIM_ZIO_END(vd, offset, size) (offset + \ + P2ROUNDUP(size, 1ULL << vd->vdev_top->vdev_ashift)) + +/* Maximum segment size for ATA TRIM. */ +#define TRIM_MAP_SIZE_FACTOR (512 << 16) + +#define TRIM_MAP_SEGS(size) (1 + (size) / TRIM_MAP_SIZE_FACTOR) + +#define TRIM_MAP_ADD(tm, ts) do { \ + list_insert_tail(&(tm)->tm_head, (ts)); \ + (tm)->tm_pending += TRIM_MAP_SEGS((ts)->ts_end - (ts)->ts_start); \ +} while (0) + +#define TRIM_MAP_REM(tm, ts) do { \ + list_remove(&(tm)->tm_head, (ts)); \ + (tm)->tm_pending -= TRIM_MAP_SEGS((ts)->ts_end - (ts)->ts_start); \ +} while (0) + +typedef struct trim_map { + list_t tm_head; /* List of segments sorted by txg. */ + avl_tree_t tm_queued_frees; /* AVL tree of segments waiting for TRIM. */ + avl_tree_t tm_inflight_frees; /* AVL tree of in-flight TRIMs. */ + avl_tree_t tm_inflight_writes; /* AVL tree of in-flight writes. */ + list_t tm_pending_writes; /* Writes blocked on in-flight frees. */ + kmutex_t tm_lock; + uint64_t tm_pending; /* Count of pending TRIMs. */ +} trim_map_t; + +typedef struct trim_seg { + avl_node_t ts_node; /* AVL node. */ + list_node_t ts_next; /* List element. */ + uint64_t ts_start; /* Starting offset of this segment. */ + uint64_t ts_end; /* Ending offset (non-inclusive). */ + uint64_t ts_txg; /* Segment creation txg. */ + hrtime_t ts_time; /* Segment creation time. */ +} trim_seg_t; + +extern boolean_t zfs_trim_enabled; + +static u_int trim_txg_delay = 32; /* Keep deleted data for up to 32 TXGs */ +static u_int trim_timeout = 30; /* Keep deleted data for up to 30s */ +static u_int trim_max_interval = 1; /* 1s delays between TRIMs */ +static u_int trim_vdev_max_pending = 10000; /* Keep up to 10K segments */ + +SYSCTL_DECL(_vfs_zfs); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, trim, CTLFLAG_RD, 0, "ZFS TRIM"); + +SYSCTL_UINT(_vfs_zfs_trim, OID_AUTO, txg_delay, CTLFLAG_RWTUN, &trim_txg_delay, + 0, "Delay TRIMs by up to this many TXGs"); +SYSCTL_UINT(_vfs_zfs_trim, OID_AUTO, timeout, CTLFLAG_RWTUN, &trim_timeout, 0, + "Delay TRIMs by up to this many seconds"); +SYSCTL_UINT(_vfs_zfs_trim, OID_AUTO, max_interval, CTLFLAG_RWTUN, + &trim_max_interval, 0, + "Maximum interval between TRIM queue processing (seconds)"); + +SYSCTL_DECL(_vfs_zfs_vdev); +SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, trim_max_pending, CTLFLAG_RWTUN, + &trim_vdev_max_pending, 0, + "Maximum pending TRIM segments for a vdev"); + +static void trim_map_vdev_commit_done(spa_t *spa, vdev_t *vd); + +static int +trim_map_seg_compare(const void *x1, const void *x2) +{ + const trim_seg_t *s1 = x1; + const trim_seg_t *s2 = x2; + + if (s1->ts_start < s2->ts_start) { + if (s1->ts_end > s2->ts_start) + return (0); + return (-1); + } + if (s1->ts_start > s2->ts_start) { + if (s1->ts_start < s2->ts_end) + return (0); + return (1); + } + return (0); +} + +static int +trim_map_zio_compare(const void *x1, const void *x2) +{ + const zio_t *z1 = x1; + const zio_t *z2 = x2; + + if (z1->io_offset < z2->io_offset) { + if (z1->io_offset + z1->io_size > z2->io_offset) + return (0); + return (-1); + } + if (z1->io_offset > z2->io_offset) { + if (z1->io_offset < z2->io_offset + z2->io_size) + return
(0); + return (1); + } + return (0); +} + +void +trim_map_create(vdev_t *vd) +{ + trim_map_t *tm; + + ASSERT(zfs_trim_enabled && !vd->vdev_notrim && + vd->vdev_ops->vdev_op_leaf); + + tm = kmem_zalloc(sizeof (*tm), KM_SLEEP); + mutex_init(&tm->tm_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&tm->tm_head, sizeof (trim_seg_t), + offsetof(trim_seg_t, ts_next)); + list_create(&tm->tm_pending_writes, sizeof (zio_t), + offsetof(zio_t, io_trim_link)); + avl_create(&tm->tm_queued_frees, trim_map_seg_compare, + sizeof (trim_seg_t), offsetof(trim_seg_t, ts_node)); + avl_create(&tm->tm_inflight_frees, trim_map_seg_compare, + sizeof (trim_seg_t), offsetof(trim_seg_t, ts_node)); + avl_create(&tm->tm_inflight_writes, trim_map_zio_compare, + sizeof (zio_t), offsetof(zio_t, io_trim_node)); + vd->vdev_trimmap = tm; +} + +void +trim_map_destroy(vdev_t *vd) +{ + trim_map_t *tm; + trim_seg_t *ts; + + ASSERT(vd->vdev_ops->vdev_op_leaf); + + if (!zfs_trim_enabled) + return; + + tm = vd->vdev_trimmap; + if (tm == NULL) + return; + + /* + * We may have been called before trim_map_vdev_commit_done() + * had a chance to run, so do it now to prune the remaining + * inflight frees. + */ + trim_map_vdev_commit_done(vd->vdev_spa, vd); + + mutex_enter(&tm->tm_lock); + while ((ts = list_head(&tm->tm_head)) != NULL) { + avl_remove(&tm->tm_queued_frees, ts); + TRIM_MAP_REM(tm, ts); + kmem_free(ts, sizeof (*ts)); + } + mutex_exit(&tm->tm_lock); + + avl_destroy(&tm->tm_queued_frees); + avl_destroy(&tm->tm_inflight_frees); + avl_destroy(&tm->tm_inflight_writes); + list_destroy(&tm->tm_pending_writes); + list_destroy(&tm->tm_head); + mutex_destroy(&tm->tm_lock); + kmem_free(tm, sizeof (*tm)); + vd->vdev_trimmap = NULL; +} + +static void +trim_map_segment_add(trim_map_t *tm, uint64_t start, uint64_t end, uint64_t txg) +{ + avl_index_t where; + trim_seg_t tsearch, *ts_before, *ts_after, *ts; + boolean_t merge_before, merge_after; + hrtime_t time; + + ASSERT(MUTEX_HELD(&tm->tm_lock)); + VERIFY(start < end); + + time = gethrtime(); + tsearch.ts_start = start; + tsearch.ts_end = end; + + ts = avl_find(&tm->tm_queued_frees, &tsearch, &where); + if (ts != NULL) { + if (start < ts->ts_start) + trim_map_segment_add(tm, start, ts->ts_start, txg); + if (end > ts->ts_end) + trim_map_segment_add(tm, ts->ts_end, end, txg); + return; + } + + ts_before = avl_nearest(&tm->tm_queued_frees, where, AVL_BEFORE); + ts_after = avl_nearest(&tm->tm_queued_frees, where, AVL_AFTER); + + merge_before = (ts_before != NULL && ts_before->ts_end == start); + merge_after = (ts_after != NULL && ts_after->ts_start == end); + + if (merge_before && merge_after) { + avl_remove(&tm->tm_queued_frees, ts_before); + TRIM_MAP_REM(tm, ts_before); + TRIM_MAP_REM(tm, ts_after); + ts_after->ts_start = ts_before->ts_start; + ts_after->ts_txg = txg; + ts_after->ts_time = time; + TRIM_MAP_ADD(tm, ts_after); + kmem_free(ts_before, sizeof (*ts_before)); + } else if (merge_before) { + TRIM_MAP_REM(tm, ts_before); + ts_before->ts_end = end; + ts_before->ts_txg = txg; + ts_before->ts_time = time; + TRIM_MAP_ADD(tm, ts_before); + } else if (merge_after) { + TRIM_MAP_REM(tm, ts_after); + ts_after->ts_start = start; + ts_after->ts_txg = txg; + ts_after->ts_time = time; + TRIM_MAP_ADD(tm, ts_after); + } else { + ts = kmem_alloc(sizeof (*ts), KM_SLEEP); + ts->ts_start = start; + ts->ts_end = end; + ts->ts_txg = txg; + ts->ts_time = time; + avl_insert(&tm->tm_queued_frees, ts, where); + TRIM_MAP_ADD(tm, ts); + } +} + +static void +trim_map_segment_remove(trim_map_t *tm, trim_seg_t 
*ts, uint64_t start, + uint64_t end) +{ + trim_seg_t *nts; + boolean_t left_over, right_over; + + ASSERT(MUTEX_HELD(&tm->tm_lock)); + + left_over = (ts->ts_start < start); + right_over = (ts->ts_end > end); + + TRIM_MAP_REM(tm, ts); + if (left_over && right_over) { + nts = kmem_alloc(sizeof (*nts), KM_SLEEP); + nts->ts_start = end; + nts->ts_end = ts->ts_end; + nts->ts_txg = ts->ts_txg; + nts->ts_time = ts->ts_time; + ts->ts_end = start; + avl_insert_here(&tm->tm_queued_frees, nts, ts, AVL_AFTER); + TRIM_MAP_ADD(tm, ts); + TRIM_MAP_ADD(tm, nts); + } else if (left_over) { + ts->ts_end = start; + TRIM_MAP_ADD(tm, ts); + } else if (right_over) { + ts->ts_start = end; + TRIM_MAP_ADD(tm, ts); + } else { + avl_remove(&tm->tm_queued_frees, ts); + kmem_free(ts, sizeof (*ts)); + } +} + +static void +trim_map_free_locked(trim_map_t *tm, uint64_t start, uint64_t end, uint64_t txg) +{ + zio_t zsearch, *zs; + + ASSERT(MUTEX_HELD(&tm->tm_lock)); + + zsearch.io_offset = start; + zsearch.io_size = end - start; + + zs = avl_find(&tm->tm_inflight_writes, &zsearch, NULL); + if (zs == NULL) { + trim_map_segment_add(tm, start, end, txg); + return; + } + if (start < zs->io_offset) + trim_map_free_locked(tm, start, zs->io_offset, txg); + if (zs->io_offset + zs->io_size < end) + trim_map_free_locked(tm, zs->io_offset + zs->io_size, end, txg); +} + +void +trim_map_free(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg) +{ + trim_map_t *tm = vd->vdev_trimmap; + + if (!zfs_trim_enabled || vd->vdev_notrim || tm == NULL) + return; + + mutex_enter(&tm->tm_lock); + trim_map_free_locked(tm, offset, TRIM_ZIO_END(vd, offset, size), txg); + mutex_exit(&tm->tm_lock); +} + +boolean_t +trim_map_write_start(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + trim_map_t *tm = vd->vdev_trimmap; + trim_seg_t tsearch, *ts; + boolean_t left_over, right_over; + uint64_t start, end; + + if (!zfs_trim_enabled || vd->vdev_notrim || tm == NULL) + return (B_TRUE); + + start = zio->io_offset; + end = TRIM_ZIO_END(zio->io_vd, start, zio->io_size); + tsearch.ts_start = start; + tsearch.ts_end = end; + + mutex_enter(&tm->tm_lock); + + /* + * Checking for colliding in-flight frees. + */ + ts = avl_find(&tm->tm_inflight_frees, &tsearch, NULL); + if (ts != NULL) { + list_insert_tail(&tm->tm_pending_writes, zio); + mutex_exit(&tm->tm_lock); + return (B_FALSE); + } + + /* + * Loop until all overlapping segments are removed. + */ + while ((ts = avl_find(&tm->tm_queued_frees, &tsearch, NULL)) != NULL) { + trim_map_segment_remove(tm, ts, start, end); + } + + avl_add(&tm->tm_inflight_writes, zio); + + mutex_exit(&tm->tm_lock); + + return (B_TRUE); +} + +void +trim_map_write_done(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + trim_map_t *tm = vd->vdev_trimmap; + + /* + * Don't check for vdev_notrim, since the write could have + * started before vdev_notrim was set. + */ + if (!zfs_trim_enabled || tm == NULL) + return; + + mutex_enter(&tm->tm_lock); + /* + * Don't fail if the write isn't in the tree, since the write + * could have started after vdev_notrim was set. + */ + if (zio->io_trim_node.avl_child[0] || + zio->io_trim_node.avl_child[1] || + AVL_XPARENT(&zio->io_trim_node) || + tm->tm_inflight_writes.avl_root == &zio->io_trim_node) + avl_remove(&tm->tm_inflight_writes, zio); + mutex_exit(&tm->tm_lock); +} + +/* + * Return the oldest segment (the one with the lowest txg / time) or NULL if: + * 1. The list is empty + * 2. The first element's txg is greater than txgsafe + * 3. 
The first element's txg is greater than the txg argument, the + * first element's time is greater than the time argument, and the + * force flag is not set + */ +static trim_seg_t * +trim_map_first(trim_map_t *tm, uint64_t txg, uint64_t txgsafe, hrtime_t time, + boolean_t force) +{ + trim_seg_t *ts; + + ASSERT(MUTEX_HELD(&tm->tm_lock)); + VERIFY(txgsafe >= txg); + + ts = list_head(&tm->tm_head); + if (ts != NULL && ts->ts_txg <= txgsafe && + (ts->ts_txg <= txg || ts->ts_time <= time || force)) + return (ts); + return (NULL); +} + +static void +trim_map_vdev_commit(spa_t *spa, zio_t *zio, vdev_t *vd) +{ + trim_map_t *tm = vd->vdev_trimmap; + trim_seg_t *ts; + uint64_t size, offset, txgtarget, txgsafe; + int64_t hard, soft; + hrtime_t timelimit; + + ASSERT(vd->vdev_ops->vdev_op_leaf); + + if (tm == NULL) + return; + + timelimit = gethrtime() - (hrtime_t)trim_timeout * NANOSEC; + if (vd->vdev_isl2cache) { + txgsafe = UINT64_MAX; + txgtarget = UINT64_MAX; + } else { + txgsafe = MIN(spa_last_synced_txg(spa), spa_freeze_txg(spa)); + if (txgsafe > trim_txg_delay) + txgtarget = txgsafe - trim_txg_delay; + else + txgtarget = 0; + } + + mutex_enter(&tm->tm_lock); + hard = 0; + if (tm->tm_pending > trim_vdev_max_pending) + hard = (tm->tm_pending - trim_vdev_max_pending) / 4; + soft = P2ROUNDUP(hard + tm->tm_pending / trim_timeout + 1, 64); + /* Loop until we have sent all outstanding frees */ + while (soft > 0 && + (ts = trim_map_first(tm, txgtarget, txgsafe, timelimit, hard > 0)) + != NULL) { + TRIM_MAP_REM(tm, ts); + avl_remove(&tm->tm_queued_frees, ts); + avl_add(&tm->tm_inflight_frees, ts); + size = ts->ts_end - ts->ts_start; + offset = ts->ts_start; + /* + * We drop the lock while we call zio_nowait(), as the I/O + * scheduler may run a different I/O (e.g. a write) whose + * path would take tm_lock again, resulting in a recursive lock.
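+ * The segment has already been moved to tm_inflight_frees above, + * so it cannot be lost while the lock is dropped, and + * trim_map_write_start() will defer any colliding write until + * trim_map_vdev_commit_done() reissues it.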
+ */ + mutex_exit(&tm->tm_lock); + + zio_nowait(zio_trim(zio, spa, vd, offset, size)); + + soft -= TRIM_MAP_SEGS(size); + hard -= TRIM_MAP_SEGS(size); + mutex_enter(&tm->tm_lock); + } + mutex_exit(&tm->tm_lock); +} + +static void +trim_map_vdev_commit_done(spa_t *spa, vdev_t *vd) +{ + trim_map_t *tm = vd->vdev_trimmap; + trim_seg_t *ts; + list_t pending_writes; + zio_t *zio; + uint64_t start, size; + void *cookie; + + ASSERT(vd->vdev_ops->vdev_op_leaf); + + if (tm == NULL) + return; + + mutex_enter(&tm->tm_lock); + if (!avl_is_empty(&tm->tm_inflight_frees)) { + cookie = NULL; + while ((ts = avl_destroy_nodes(&tm->tm_inflight_frees, + &cookie)) != NULL) { + kmem_free(ts, sizeof (*ts)); + } + } + list_create(&pending_writes, sizeof (zio_t), offsetof(zio_t, + io_trim_link)); + list_move_tail(&pending_writes, &tm->tm_pending_writes); + mutex_exit(&tm->tm_lock); + + while ((zio = list_remove_head(&pending_writes)) != NULL) { + zio_vdev_io_reissue(zio); + zio_execute(zio); + } + list_destroy(&pending_writes); +} + +static void +trim_map_commit(spa_t *spa, zio_t *zio, vdev_t *vd) +{ + int c; + + if (vd == NULL) + return; + + if (vd->vdev_ops->vdev_op_leaf) { + trim_map_vdev_commit(spa, zio, vd); + } else { + for (c = 0; c < vd->vdev_children; c++) + trim_map_commit(spa, zio, vd->vdev_child[c]); + } +} + +static void +trim_map_commit_done(spa_t *spa, vdev_t *vd) +{ + int c; + + if (vd == NULL) + return; + + if (vd->vdev_ops->vdev_op_leaf) { + trim_map_vdev_commit_done(spa, vd); + } else { + for (c = 0; c < vd->vdev_children; c++) + trim_map_commit_done(spa, vd->vdev_child[c]); + } +} + +static void +trim_thread(void *arg) +{ + spa_t *spa = arg; + zio_t *zio; + +#ifdef _KERNEL + (void) snprintf(curthread->td_name, sizeof(curthread->td_name), + "trim %s", spa_name(spa)); +#endif + + for (;;) { + mutex_enter(&spa->spa_trim_lock); + if (spa->spa_trim_thread == NULL) { + spa->spa_trim_thread = curthread; + cv_signal(&spa->spa_trim_cv); + mutex_exit(&spa->spa_trim_lock); + thread_exit(); + } + + (void) cv_timedwait(&spa->spa_trim_cv, &spa->spa_trim_lock, + hz * trim_max_interval); + mutex_exit(&spa->spa_trim_lock); + + zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); + + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); + trim_map_commit(spa, zio, spa->spa_root_vdev); + (void) zio_wait(zio); + trim_map_commit_done(spa, spa->spa_root_vdev); + spa_config_exit(spa, SCL_STATE, FTAG); + } +} + +void +trim_thread_create(spa_t *spa) +{ + + if (!zfs_trim_enabled) + return; + + mutex_init(&spa->spa_trim_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&spa->spa_trim_cv, NULL, CV_DEFAULT, NULL); + mutex_enter(&spa->spa_trim_lock); + spa->spa_trim_thread = thread_create(NULL, 0, trim_thread, spa, 0, &p0, + TS_RUN, minclsyspri); + mutex_exit(&spa->spa_trim_lock); +} + +void +trim_thread_destroy(spa_t *spa) +{ + + if (!zfs_trim_enabled) + return; + if (spa->spa_trim_thread == NULL) + return; + + mutex_enter(&spa->spa_trim_lock); + /* Setting spa_trim_thread to NULL tells the thread to stop. */ + spa->spa_trim_thread = NULL; + cv_signal(&spa->spa_trim_cv); + /* The thread will set it back to != NULL on exit. 
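+ * (trim_thread() stores curthread in spa_trim_thread and signals + * spa_trim_cv immediately before calling thread_exit(); see above.)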
*/ + while (spa->spa_trim_thread == NULL) + cv_wait(&spa->spa_trim_cv, &spa->spa_trim_lock); + spa->spa_trim_thread = NULL; + mutex_exit(&spa->spa_trim_lock); + + cv_destroy(&spa->spa_trim_cv); + mutex_destroy(&spa->spa_trim_lock); +} + +void +trim_thread_wakeup(spa_t *spa) +{ + + if (!zfs_trim_enabled) + return; + if (spa->spa_trim_thread == NULL) + return; + + mutex_enter(&spa->spa_trim_lock); + cv_signal(&spa->spa_trim_cv); + mutex_exit(&spa->spa_trim_lock); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c new file mode 100644 index 000000000000..62d215aa4626 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c @@ -0,0 +1,932 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> + * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + */ + +#include <sys/zfs_context.h> +#include <sys/txg_impl.h> +#include <sys/dmu_impl.h> +#include <sys/dmu_tx.h> +#include <sys/dsl_pool.h> +#include <sys/dsl_scan.h> +#include <sys/zil.h> +#include <sys/callb.h> + +/* + * ZFS Transaction Groups + * ---------------------- + * + * ZFS transaction groups are, as the name implies, groups of transactions + * that act on persistent state. ZFS asserts consistency at the granularity of + * these transaction groups. Each successive transaction group (txg) is + * assigned a 64-bit consecutive identifier. There are three active + * transaction group states: open, quiescing, or syncing. At any given time, + * there may be an active txg associated with each state; each active txg may + * either be processing, or blocked waiting to enter the next state. There may + * be up to three active txgs, and there is always a txg in the open state + * (though it may be blocked waiting to enter the quiescing state). In broad + * strokes, transactions -- operations that change in-memory structures -- are + * accepted into the txg in the open state, and are completed while the txg is + * in the open or quiescing states. The accumulated changes are written to + * disk in the syncing state. + * + * Open + * + * When a new txg becomes active, it first enters the open state. New + * transactions -- updates to in-memory structures -- are assigned to the + * currently open txg. There is always a txg in the open state so that ZFS can + * accept new changes (though the txg may refuse new changes if it has hit + * some limit). 
ZFS advances the open txg to the next state for a variety of + * reasons, such as hitting a time or size threshold, or the execution of an + * administrative action that must be completed in the syncing state. + * + * Quiescing + * + * After a txg exits the open state, it enters the quiescing state. The + * quiescing state is intended to provide a buffer between accepting new + * transactions in the open state and writing them out to stable storage in + * the syncing state. While quiescing, transactions can continue their + * operation without delaying either of the other states. Typically, a txg is + * in the quiescing state very briefly since the operations are bounded by + * software latencies rather than, say, slower I/O latencies. After all + * transactions complete, the txg is ready to enter the next state. + * + * Syncing + * + * In the syncing state, the in-memory state built up during the open and (to + * a lesser degree) the quiescing states is written to stable storage. The + * process of writing out modified data can, in turn, modify more data. For + * example, when we write new blocks, we need to allocate space for them; those + * allocations modify metadata (space maps)... which themselves must be + * written to stable storage. During the sync state, ZFS iterates, writing out + * data until it converges and all in-memory changes have been written out. + * The first such pass is the largest as it encompasses all the modified user + * data (as opposed to filesystem metadata). Subsequent passes typically have + * far less data to write as they consist exclusively of filesystem metadata. + * + * To ensure convergence, after a certain number of passes ZFS begins + * overwriting locations on stable storage that had been allocated earlier in + * the syncing state (and subsequently freed). ZFS usually allocates new + * blocks to optimize for large, continuous writes. For the syncing state to + * converge, however, it must complete a pass where no new blocks are allocated, + * since each allocation requires a modification of persistent metadata. + * Further, to hasten convergence, after a prescribed number of passes, ZFS + * also defers frees and stops compressing. + * + * In addition to writing out user data, we must also execute synctasks during + * the syncing context. A synctask is the mechanism by which some + * administrative activities, such as creating and destroying snapshots or + * datasets, are performed. Note that when a synctask is initiated, it enters + * the open txg, and ZFS then pushes that txg as quickly as possible to + * completion of the syncing state in order to reduce the latency of the + * administrative activity. To complete the syncing state, ZFS writes out a + * new uberblock, the root of the tree of blocks that comprise all state + * stored on the ZFS pool. Finally, if there is a quiesced txg waiting, we + * signal that it can now transition to the syncing state. + */ + +static void txg_sync_thread(void *arg); +static void txg_quiesce_thread(void *arg); + +int zfs_txg_timeout = 5; /* max seconds worth of delta per txg */ + +SYSCTL_DECL(_vfs_zfs); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, txg, CTLFLAG_RW, 0, "ZFS TXG"); +SYSCTL_INT(_vfs_zfs_txg, OID_AUTO, timeout, CTLFLAG_RWTUN, &zfs_txg_timeout, 0, + "Maximum seconds worth of delta per txg"); + +/* + * Prepare the txg subsystem.
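+ * This allocates the per-CPU tx_cpu_t array and initializes the locks, + * condition variables, and commit-callback lists used by the quiesce + * and sync threads; the threads themselves are started later by + * txg_sync_start().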
+ */ +void +txg_init(dsl_pool_t *dp, uint64_t txg) +{ + tx_state_t *tx = &dp->dp_tx; + int c; + bzero(tx, sizeof (tx_state_t)); + + tx->tx_cpu = kmem_zalloc(max_ncpus * sizeof (tx_cpu_t), KM_SLEEP); + + for (c = 0; c < max_ncpus; c++) { + int i; + + mutex_init(&tx->tx_cpu[c].tc_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&tx->tx_cpu[c].tc_open_lock, NULL, MUTEX_DEFAULT, + NULL); + for (i = 0; i < TXG_SIZE; i++) { + cv_init(&tx->tx_cpu[c].tc_cv[i], NULL, CV_DEFAULT, + NULL); + list_create(&tx->tx_cpu[c].tc_callbacks[i], + sizeof (dmu_tx_callback_t), + offsetof(dmu_tx_callback_t, dcb_node)); + } + } + + mutex_init(&tx->tx_sync_lock, NULL, MUTEX_DEFAULT, NULL); + + cv_init(&tx->tx_sync_more_cv, NULL, CV_DEFAULT, NULL); + cv_init(&tx->tx_sync_done_cv, NULL, CV_DEFAULT, NULL); + cv_init(&tx->tx_quiesce_more_cv, NULL, CV_DEFAULT, NULL); + cv_init(&tx->tx_quiesce_done_cv, NULL, CV_DEFAULT, NULL); + cv_init(&tx->tx_exit_cv, NULL, CV_DEFAULT, NULL); + + tx->tx_open_txg = txg; +} + +/* + * Close down the txg subsystem. + */ +void +txg_fini(dsl_pool_t *dp) +{ + tx_state_t *tx = &dp->dp_tx; + int c; + + ASSERT0(tx->tx_threads); + + mutex_destroy(&tx->tx_sync_lock); + + cv_destroy(&tx->tx_sync_more_cv); + cv_destroy(&tx->tx_sync_done_cv); + cv_destroy(&tx->tx_quiesce_more_cv); + cv_destroy(&tx->tx_quiesce_done_cv); + cv_destroy(&tx->tx_exit_cv); + + for (c = 0; c < max_ncpus; c++) { + int i; + + mutex_destroy(&tx->tx_cpu[c].tc_open_lock); + mutex_destroy(&tx->tx_cpu[c].tc_lock); + for (i = 0; i < TXG_SIZE; i++) { + cv_destroy(&tx->tx_cpu[c].tc_cv[i]); + list_destroy(&tx->tx_cpu[c].tc_callbacks[i]); + } + } + + if (tx->tx_commit_cb_taskq != NULL) + taskq_destroy(tx->tx_commit_cb_taskq); + + kmem_free(tx->tx_cpu, max_ncpus * sizeof (tx_cpu_t)); + + bzero(tx, sizeof (tx_state_t)); +} + +/* + * Start syncing transaction groups. + */ +void +txg_sync_start(dsl_pool_t *dp) +{ + tx_state_t *tx = &dp->dp_tx; + + mutex_enter(&tx->tx_sync_lock); + + dprintf("pool %p\n", dp); + + ASSERT0(tx->tx_threads); + + tx->tx_threads = 2; + + tx->tx_quiesce_thread = thread_create(NULL, 0, txg_quiesce_thread, + dp, 0, &p0, TS_RUN, minclsyspri); + + /* + * The sync thread can need a larger-than-default stack size on + * 32-bit x86. This is due in part to nested pools and + * scrub_visitbp() recursion. + */ + tx->tx_sync_thread = thread_create(NULL, 32<<10, txg_sync_thread, + dp, 0, &p0, TS_RUN, minclsyspri); + + mutex_exit(&tx->tx_sync_lock); +} + +static void +txg_thread_enter(tx_state_t *tx, callb_cpr_t *cpr) +{ + CALLB_CPR_INIT(cpr, &tx->tx_sync_lock, callb_generic_cpr, FTAG); + mutex_enter(&tx->tx_sync_lock); +} + +static void +txg_thread_exit(tx_state_t *tx, callb_cpr_t *cpr, kthread_t **tpp) +{ + ASSERT(*tpp != NULL); + *tpp = NULL; + tx->tx_threads--; + cv_broadcast(&tx->tx_exit_cv); + CALLB_CPR_EXIT(cpr); /* drops &tx->tx_sync_lock */ + thread_exit(); +} + +static void +txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, clock_t time) +{ + CALLB_CPR_SAFE_BEGIN(cpr); + + if (time) + (void) cv_timedwait(cv, &tx->tx_sync_lock, time); + else + cv_wait(cv, &tx->tx_sync_lock); + + CALLB_CPR_SAFE_END(cpr, &tx->tx_sync_lock); +} + +/* + * Stop syncing transaction groups. + */ +void +txg_sync_stop(dsl_pool_t *dp) +{ + tx_state_t *tx = &dp->dp_tx; + + dprintf("pool %p\n", dp); + /* + * Finish off any work in progress. + */ + ASSERT3U(tx->tx_threads, ==, 2); + + /* + * We need to ensure that we've vacated the deferred space_maps. 
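+ * Waiting for tx_open_txg + TXG_DEFER_SIZE below guarantees that any + * txg whose frees were deferred has itself been synced before the + * threads are torn down.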
+ */ + txg_wait_synced(dp, tx->tx_open_txg + TXG_DEFER_SIZE); + + /* + * Wake all sync threads and wait for them to die. + */ + mutex_enter(&tx->tx_sync_lock); + + ASSERT3U(tx->tx_threads, ==, 2); + + tx->tx_exiting = 1; + + cv_broadcast(&tx->tx_quiesce_more_cv); + cv_broadcast(&tx->tx_quiesce_done_cv); + cv_broadcast(&tx->tx_sync_more_cv); + + while (tx->tx_threads != 0) + cv_wait(&tx->tx_exit_cv, &tx->tx_sync_lock); + + tx->tx_exiting = 0; + + mutex_exit(&tx->tx_sync_lock); +} + +uint64_t +txg_hold_open(dsl_pool_t *dp, txg_handle_t *th) +{ + tx_state_t *tx = &dp->dp_tx; + tx_cpu_t *tc = &tx->tx_cpu[CPU_SEQID]; + uint64_t txg; + + mutex_enter(&tc->tc_open_lock); + txg = tx->tx_open_txg; + + mutex_enter(&tc->tc_lock); + tc->tc_count[txg & TXG_MASK]++; + mutex_exit(&tc->tc_lock); + + th->th_cpu = tc; + th->th_txg = txg; + + return (txg); +} + +void +txg_rele_to_quiesce(txg_handle_t *th) +{ + tx_cpu_t *tc = th->th_cpu; + + ASSERT(!MUTEX_HELD(&tc->tc_lock)); + mutex_exit(&tc->tc_open_lock); +} + +void +txg_register_callbacks(txg_handle_t *th, list_t *tx_callbacks) +{ + tx_cpu_t *tc = th->th_cpu; + int g = th->th_txg & TXG_MASK; + + mutex_enter(&tc->tc_lock); + list_move_tail(&tc->tc_callbacks[g], tx_callbacks); + mutex_exit(&tc->tc_lock); +} + +void +txg_rele_to_sync(txg_handle_t *th) +{ + tx_cpu_t *tc = th->th_cpu; + int g = th->th_txg & TXG_MASK; + + mutex_enter(&tc->tc_lock); + ASSERT(tc->tc_count[g] != 0); + if (--tc->tc_count[g] == 0) + cv_broadcast(&tc->tc_cv[g]); + mutex_exit(&tc->tc_lock); + + th->th_cpu = NULL; /* defensive */ +} + +/* + * Blocks until all transactions in the group are committed. + * + * On return, the transaction group has reached a stable state in which it can + * then be passed off to the syncing context. + */ +static __noinline void +txg_quiesce(dsl_pool_t *dp, uint64_t txg) +{ + tx_state_t *tx = &dp->dp_tx; + int g = txg & TXG_MASK; + int c; + + /* + * Grab all tc_open_locks so nobody else can get into this txg. + */ + for (c = 0; c < max_ncpus; c++) + mutex_enter(&tx->tx_cpu[c].tc_open_lock); + + ASSERT(txg == tx->tx_open_txg); + tx->tx_open_txg++; + tx->tx_open_time = gethrtime(); + + DTRACE_PROBE2(txg__quiescing, dsl_pool_t *, dp, uint64_t, txg); + DTRACE_PROBE2(txg__opened, dsl_pool_t *, dp, uint64_t, tx->tx_open_txg); + + /* + * Now that we've incremented tx_open_txg, we can let threads + * enter the next transaction group. + */ + for (c = 0; c < max_ncpus; c++) + mutex_exit(&tx->tx_cpu[c].tc_open_lock); + + /* + * Quiesce the transaction group by waiting for everyone to txg_exit(). + */ + for (c = 0; c < max_ncpus; c++) { + tx_cpu_t *tc = &tx->tx_cpu[c]; + mutex_enter(&tc->tc_lock); + while (tc->tc_count[g] != 0) + cv_wait(&tc->tc_cv[g], &tc->tc_lock); + mutex_exit(&tc->tc_lock); + } +} + +static void +txg_do_callbacks(void *arg) +{ + list_t *cb_list = arg; + + dmu_tx_do_callbacks(cb_list, 0); + + list_destroy(cb_list); + + kmem_free(cb_list, sizeof (list_t)); +} + +/* + * Dispatch the commit callbacks registered on this txg to worker threads. + * + * If no callbacks are registered for a given TXG, nothing happens. + * This function creates a taskq for the associated pool, if needed. + */ +static void +txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg) +{ + int c; + tx_state_t *tx = &dp->dp_tx; + list_t *cb_list; + + for (c = 0; c < max_ncpus; c++) { + tx_cpu_t *tc = &tx->tx_cpu[c]; + /* + * No need to lock tx_cpu_t at this point, since this can + * only be called once a txg has been synced. 
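+ * Once a txg has synced, no new callbacks can be registered on its + * per-CPU lists, so they can be read without taking tc_lock.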
+ */ + + int g = txg & TXG_MASK; + + if (list_is_empty(&tc->tc_callbacks[g])) + continue; + + if (tx->tx_commit_cb_taskq == NULL) { + /* + * Commit callback taskq hasn't been created yet. + */ + tx->tx_commit_cb_taskq = taskq_create("tx_commit_cb", + max_ncpus, minclsyspri, max_ncpus, max_ncpus * 2, + TASKQ_PREPOPULATE); + } + + cb_list = kmem_alloc(sizeof (list_t), KM_SLEEP); + list_create(cb_list, sizeof (dmu_tx_callback_t), + offsetof(dmu_tx_callback_t, dcb_node)); + + list_move_tail(cb_list, &tc->tc_callbacks[g]); + + (void) taskq_dispatch(tx->tx_commit_cb_taskq, (task_func_t *) + txg_do_callbacks, cb_list, TQ_SLEEP); + } +} + +static boolean_t +txg_is_syncing(dsl_pool_t *dp) +{ + tx_state_t *tx = &dp->dp_tx; + ASSERT(MUTEX_HELD(&tx->tx_sync_lock)); + return (tx->tx_syncing_txg != 0); +} + +static boolean_t +txg_is_quiescing(dsl_pool_t *dp) +{ + tx_state_t *tx = &dp->dp_tx; + ASSERT(MUTEX_HELD(&tx->tx_sync_lock)); + return (tx->tx_quiescing_txg != 0); +} + +static boolean_t +txg_has_quiesced_to_sync(dsl_pool_t *dp) +{ + tx_state_t *tx = &dp->dp_tx; + ASSERT(MUTEX_HELD(&tx->tx_sync_lock)); + return (tx->tx_quiesced_txg != 0); +} + +static void +txg_sync_thread(void *arg) +{ + dsl_pool_t *dp = arg; + spa_t *spa = dp->dp_spa; + tx_state_t *tx = &dp->dp_tx; + callb_cpr_t cpr; + uint64_t start, delta; + + txg_thread_enter(tx, &cpr); + + start = delta = 0; + for (;;) { + uint64_t timeout = zfs_txg_timeout * hz; + uint64_t timer; + uint64_t txg; + + /* + * We sync when we're scanning, there's someone waiting + * on us, or the quiesce thread has handed off a txg to + * us, or we have reached our timeout. + */ + timer = (delta >= timeout ? 0 : timeout - delta); + while (!dsl_scan_active(dp->dp_scan) && + !tx->tx_exiting && timer > 0 && + tx->tx_synced_txg >= tx->tx_sync_txg_waiting && + !txg_has_quiesced_to_sync(dp) && + dp->dp_dirty_total < zfs_dirty_data_sync) { + dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n", + tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp); + txg_thread_wait(tx, &cpr, &tx->tx_sync_more_cv, timer); + delta = ddi_get_lbolt() - start; + timer = (delta > timeout ? 0 : timeout - delta); + } + + /* + * Wait until the quiesce thread hands off a txg to us, + * prompting it to do so if necessary. + */ + while (!tx->tx_exiting && !txg_has_quiesced_to_sync(dp)) { + if (tx->tx_quiesce_txg_waiting < tx->tx_open_txg+1) + tx->tx_quiesce_txg_waiting = tx->tx_open_txg+1; + cv_broadcast(&tx->tx_quiesce_more_cv); + txg_thread_wait(tx, &cpr, &tx->tx_quiesce_done_cv, 0); + } + + if (tx->tx_exiting) + txg_thread_exit(tx, &cpr, &tx->tx_sync_thread); + + /* + * Consume the quiesced txg which has been handed off to + * us. This may cause the quiescing thread to now be + * able to quiesce another txg, so we must signal it. + */ + ASSERT(tx->tx_quiesced_txg != 0); + txg = tx->tx_quiesced_txg; + tx->tx_quiesced_txg = 0; + tx->tx_syncing_txg = txg; + DTRACE_PROBE2(txg__syncing, dsl_pool_t *, dp, uint64_t, txg); + cv_broadcast(&tx->tx_quiesce_more_cv); + + dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", + txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting); + mutex_exit(&tx->tx_sync_lock); + + start = ddi_get_lbolt(); + spa_sync(spa, txg); + delta = ddi_get_lbolt() - start; + + mutex_enter(&tx->tx_sync_lock); + tx->tx_synced_txg = txg; + tx->tx_syncing_txg = 0; + DTRACE_PROBE2(txg__synced, dsl_pool_t *, dp, uint64_t, txg); + cv_broadcast(&tx->tx_sync_done_cv); + + /* + * Dispatch commit callbacks to worker threads. 
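+ * This runs after tx_synced_txg has been updated and the waiters on + * tx_sync_done_cv have been signalled, so callbacks always observe + * their txg as already synced.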
+ */ + txg_dispatch_callbacks(dp, txg); + } +} + +static void +txg_quiesce_thread(void *arg) +{ + dsl_pool_t *dp = arg; + tx_state_t *tx = &dp->dp_tx; + callb_cpr_t cpr; + + txg_thread_enter(tx, &cpr); + + for (;;) { + uint64_t txg; + + /* + * We quiesce when there's someone waiting on us. + * However, we can only have one txg in "quiescing" or + * "quiesced, waiting to sync" state. So we wait until + * the "quiesced, waiting to sync" txg has been consumed + * by the sync thread. + */ + while (!tx->tx_exiting && + (tx->tx_open_txg >= tx->tx_quiesce_txg_waiting || + txg_has_quiesced_to_sync(dp))) + txg_thread_wait(tx, &cpr, &tx->tx_quiesce_more_cv, 0); + + if (tx->tx_exiting) + txg_thread_exit(tx, &cpr, &tx->tx_quiesce_thread); + + txg = tx->tx_open_txg; + dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", + txg, tx->tx_quiesce_txg_waiting, + tx->tx_sync_txg_waiting); + tx->tx_quiescing_txg = txg; + + mutex_exit(&tx->tx_sync_lock); + txg_quiesce(dp, txg); + mutex_enter(&tx->tx_sync_lock); + + /* + * Hand this txg off to the sync thread. + */ + dprintf("quiesce done, handing off txg %llu\n", txg); + tx->tx_quiescing_txg = 0; + tx->tx_quiesced_txg = txg; + DTRACE_PROBE2(txg__quiesced, dsl_pool_t *, dp, uint64_t, txg); + cv_broadcast(&tx->tx_sync_more_cv); + cv_broadcast(&tx->tx_quiesce_done_cv); + } +} + +/* + * Delay this thread by 'delay' nanoseconds if we are still in the open + * transaction group and there is already a waiting txg quiescing or quiesced. + * Abort the delay if this txg stalls or enters the quiescing state. + */ +void +txg_delay(dsl_pool_t *dp, uint64_t txg, hrtime_t delay, hrtime_t resolution) +{ + tx_state_t *tx = &dp->dp_tx; + hrtime_t start = gethrtime(); + + /* don't delay if this txg could transition to quiescing immediately */ + if (tx->tx_open_txg > txg || + tx->tx_syncing_txg == txg-1 || tx->tx_synced_txg == txg-1) + return; + + mutex_enter(&tx->tx_sync_lock); + if (tx->tx_open_txg > txg || tx->tx_synced_txg == txg-1) { + mutex_exit(&tx->tx_sync_lock); + return; + } + + while (gethrtime() - start < delay && + tx->tx_syncing_txg < txg-1 && !txg_stalled(dp)) { + (void) cv_timedwait_hires(&tx->tx_quiesce_more_cv, + &tx->tx_sync_lock, delay, resolution, 0); + } + + mutex_exit(&tx->tx_sync_lock); +} + +void +txg_wait_synced(dsl_pool_t *dp, uint64_t txg) +{ + tx_state_t *tx = &dp->dp_tx; + + ASSERT(!dsl_pool_config_held(dp)); + + mutex_enter(&tx->tx_sync_lock); + ASSERT3U(tx->tx_threads, ==, 2); + if (txg == 0) + txg = tx->tx_open_txg + TXG_DEFER_SIZE; + if (tx->tx_sync_txg_waiting < txg) + tx->tx_sync_txg_waiting = txg; + dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", + txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting); + while (tx->tx_synced_txg < txg) { + dprintf("broadcasting sync more " + "tx_synced=%llu waiting=%llu dp=%p\n", + tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp); + cv_broadcast(&tx->tx_sync_more_cv); + cv_wait(&tx->tx_sync_done_cv, &tx->tx_sync_lock); + } + mutex_exit(&tx->tx_sync_lock); +} + +void +txg_wait_open(dsl_pool_t *dp, uint64_t txg) +{ + tx_state_t *tx = &dp->dp_tx; + + ASSERT(!dsl_pool_config_held(dp)); + + mutex_enter(&tx->tx_sync_lock); + ASSERT3U(tx->tx_threads, ==, 2); + if (txg == 0) + txg = tx->tx_open_txg + 1; + if (tx->tx_quiesce_txg_waiting < txg) + tx->tx_quiesce_txg_waiting = txg; + dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", + txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting); + while (tx->tx_open_txg < txg) { + cv_broadcast(&tx->tx_quiesce_more_cv); + cv_wait(&tx->tx_quiesce_done_cv,
&tx->tx_sync_lock); + } + mutex_exit(&tx->tx_sync_lock); +} + +/* + * If there isn't a txg syncing or in the pipeline, push another txg through + * the pipeline by quiescing the open txg. + */ +void +txg_kick(dsl_pool_t *dp) +{ + tx_state_t *tx = &dp->dp_tx; + + ASSERT(!dsl_pool_config_held(dp)); + + mutex_enter(&tx->tx_sync_lock); + if (!txg_is_syncing(dp) && + !txg_is_quiescing(dp) && + tx->tx_quiesce_txg_waiting <= tx->tx_open_txg && + tx->tx_sync_txg_waiting <= tx->tx_synced_txg && + tx->tx_quiesced_txg <= tx->tx_synced_txg) { + tx->tx_quiesce_txg_waiting = tx->tx_open_txg + 1; + cv_broadcast(&tx->tx_quiesce_more_cv); + } + mutex_exit(&tx->tx_sync_lock); +} + +boolean_t +txg_stalled(dsl_pool_t *dp) +{ + tx_state_t *tx = &dp->dp_tx; + return (tx->tx_quiesce_txg_waiting > tx->tx_open_txg); +} + +boolean_t +txg_sync_waiting(dsl_pool_t *dp) +{ + tx_state_t *tx = &dp->dp_tx; + + return (tx->tx_syncing_txg <= tx->tx_sync_txg_waiting || + tx->tx_quiesced_txg != 0); +} + +/* + * Verify that this txg is active (open, quiescing, syncing). Non-active + * txgs should not be manipulated. + */ +void +txg_verify(spa_t *spa, uint64_t txg) +{ + dsl_pool_t *dp = spa_get_dsl(spa); + if (txg <= TXG_INITIAL || txg == ZILTEST_TXG) + return; + ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg); + ASSERT3U(txg, >=, dp->dp_tx.tx_synced_txg); + ASSERT3U(txg, >=, dp->dp_tx.tx_open_txg - TXG_CONCURRENT_STATES); +} + +/* + * Per-txg object lists. + */ +void +txg_list_create(txg_list_t *tl, spa_t *spa, size_t offset) +{ + int t; + + mutex_init(&tl->tl_lock, NULL, MUTEX_DEFAULT, NULL); + + tl->tl_offset = offset; + tl->tl_spa = spa; + + for (t = 0; t < TXG_SIZE; t++) + tl->tl_head[t] = NULL; +} + +void +txg_list_destroy(txg_list_t *tl) +{ + int t; + + for (t = 0; t < TXG_SIZE; t++) + ASSERT(txg_list_empty(tl, t)); + + mutex_destroy(&tl->tl_lock); +} + +boolean_t +txg_list_empty(txg_list_t *tl, uint64_t txg) +{ + txg_verify(tl->tl_spa, txg); + return (tl->tl_head[txg & TXG_MASK] == NULL); +} + +/* + * Returns true if all txg lists are empty. + * + * Warning: this is inherently racy (an item could be added immediately + * after this function returns). We don't bother with the lock because + * it wouldn't change the semantics. + */ +boolean_t +txg_all_lists_empty(txg_list_t *tl) +{ + for (int i = 0; i < TXG_SIZE; i++) { + if (!txg_list_empty(tl, i)) { + return (B_FALSE); + } + } + return (B_TRUE); +} + +/* + * Add an entry to the list (unless it's already on the list). + * Returns B_TRUE if it was actually added. + */ +boolean_t +txg_list_add(txg_list_t *tl, void *p, uint64_t txg) +{ + int t = txg & TXG_MASK; + txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); + boolean_t add; + + txg_verify(tl->tl_spa, txg); + mutex_enter(&tl->tl_lock); + add = (tn->tn_member[t] == 0); + if (add) { + tn->tn_member[t] = 1; + tn->tn_next[t] = tl->tl_head[t]; + tl->tl_head[t] = tn; + } + mutex_exit(&tl->tl_lock); + + return (add); +} + +/* + * Add an entry to the end of the list, unless it's already on the list. + * (walks list to find end) + * Returns B_TRUE if it was actually added.
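+ * Unlike txg_list_add(), this is O(n) in the list length, since the + * per-txg lists are singly linked with no tail pointer.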
+ */ +boolean_t +txg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg) +{ + int t = txg & TXG_MASK; + txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); + boolean_t add; + + txg_verify(tl->tl_spa, txg); + mutex_enter(&tl->tl_lock); + add = (tn->tn_member[t] == 0); + if (add) { + txg_node_t **tp; + + for (tp = &tl->tl_head[t]; *tp != NULL; tp = &(*tp)->tn_next[t]) + continue; + + tn->tn_member[t] = 1; + tn->tn_next[t] = NULL; + *tp = tn; + } + mutex_exit(&tl->tl_lock); + + return (add); +} + +/* + * Remove the head of the list and return it. + */ +void * +txg_list_remove(txg_list_t *tl, uint64_t txg) +{ + int t = txg & TXG_MASK; + txg_node_t *tn; + void *p = NULL; + + txg_verify(tl->tl_spa, txg); + mutex_enter(&tl->tl_lock); + if ((tn = tl->tl_head[t]) != NULL) { + ASSERT(tn->tn_member[t]); + ASSERT(tn->tn_next[t] == NULL || tn->tn_next[t]->tn_member[t]); + p = (char *)tn - tl->tl_offset; + tl->tl_head[t] = tn->tn_next[t]; + tn->tn_next[t] = NULL; + tn->tn_member[t] = 0; + } + mutex_exit(&tl->tl_lock); + + return (p); +} + +/* + * Remove a specific item from the list and return it. + */ +void * +txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg) +{ + int t = txg & TXG_MASK; + txg_node_t *tn, **tp; + + txg_verify(tl->tl_spa, txg); + mutex_enter(&tl->tl_lock); + + for (tp = &tl->tl_head[t]; (tn = *tp) != NULL; tp = &tn->tn_next[t]) { + if ((char *)tn - tl->tl_offset == p) { + *tp = tn->tn_next[t]; + tn->tn_next[t] = NULL; + tn->tn_member[t] = 0; + mutex_exit(&tl->tl_lock); + return (p); + } + } + + mutex_exit(&tl->tl_lock); + + return (NULL); +} + +boolean_t +txg_list_member(txg_list_t *tl, void *p, uint64_t txg) +{ + int t = txg & TXG_MASK; + txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); + + txg_verify(tl->tl_spa, txg); + return (tn->tn_member[t] != 0); +} + +/* + * Walk a txg list -- only safe if you know it's not changing. + */ +void * +txg_list_head(txg_list_t *tl, uint64_t txg) +{ + int t = txg & TXG_MASK; + txg_node_t *tn = tl->tl_head[t]; + + txg_verify(tl->tl_spa, txg); + return (tn == NULL ? NULL : (char *)tn - tl->tl_offset); +} + +void * +txg_list_next(txg_list_t *tl, void *p, uint64_t txg) +{ + int t = txg & TXG_MASK; + txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); + + txg_verify(tl->tl_spa, txg); + tn = tn->tn_next[t]; + + return (tn == NULL ? NULL : (char *)tn - tl->tl_offset); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/uberblock.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/uberblock.c new file mode 100644 index 000000000000..8b198469e1c2 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/uberblock.c @@ -0,0 +1,63 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013, 2017 by Delphix. All rights reserved. + */ + +#include <sys/zfs_context.h> +#include <sys/uberblock_impl.h> +#include <sys/vdev_impl.h> + +int +uberblock_verify(uberblock_t *ub) +{ + if (ub->ub_magic == BSWAP_64((uint64_t)UBERBLOCK_MAGIC)) + byteswap_uint64_array(ub, sizeof (uberblock_t)); + + if (ub->ub_magic != UBERBLOCK_MAGIC) + return (SET_ERROR(EINVAL)); + + return (0); +} + +/* + * Update the uberblock and return TRUE if anything changed in this + * transaction group. + */ +boolean_t +uberblock_update(uberblock_t *ub, vdev_t *rvd, uint64_t txg) +{ + ASSERT(ub->ub_txg < txg); + + /* + * We explicitly do not set ub_version here, so that older versions + * continue to be written with the previous uberblock version. + */ + ub->ub_magic = UBERBLOCK_MAGIC; + ub->ub_txg = txg; + ub->ub_guid_sum = rvd->vdev_guid_sum; + ub->ub_timestamp = gethrestime_sec(); + ub->ub_software_version = SPA_VERSION; + ub->ub_checkpoint_txg = 0; + + return (ub->ub_rootbp.blk_birth == txg); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/unique.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/unique.c new file mode 100644 index 000000000000..d33f451938b8 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/unique.c @@ -0,0 +1,112 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> +#include <sys/avl.h> +#include <sys/unique.h> + +static avl_tree_t unique_avl; +static kmutex_t unique_mtx; + +typedef struct unique { + avl_node_t un_link; + uint64_t un_value; +} unique_t; + +#define UNIQUE_MASK ((1ULL << UNIQUE_BITS) - 1) + +static int +unique_compare(const void *a, const void *b) +{ + const unique_t *una = (const unique_t *)a; + const unique_t *unb = (const unique_t *)b; + + return (AVL_CMP(una->un_value, unb->un_value)); +} + +void +unique_init(void) +{ + avl_create(&unique_avl, unique_compare, + sizeof (unique_t), offsetof(unique_t, un_link)); + mutex_init(&unique_mtx, NULL, MUTEX_DEFAULT, NULL); +} + +void +unique_fini(void) +{ + avl_destroy(&unique_avl); + mutex_destroy(&unique_mtx); +} + +uint64_t +unique_create(void) +{ + uint64_t value = unique_insert(0); + unique_remove(value); + return (value); +} + +uint64_t +unique_insert(uint64_t value) +{ + avl_index_t idx; + unique_t *un = kmem_alloc(sizeof (unique_t), KM_SLEEP); + + un->un_value = value; + + mutex_enter(&unique_mtx); + while (un->un_value == 0 || un->un_value & ~UNIQUE_MASK || + avl_find(&unique_avl, un, &idx)) { + mutex_exit(&unique_mtx); + (void) random_get_pseudo_bytes((void*)&un->un_value, + sizeof (un->un_value)); + un->un_value &= UNIQUE_MASK; + mutex_enter(&unique_mtx); + } + + avl_insert(&unique_avl, un, idx); + mutex_exit(&unique_mtx); + + return (un->un_value); +} + +void +unique_remove(uint64_t value) +{ + unique_t un_tofind; + unique_t *un; + + un_tofind.un_value = value; + mutex_enter(&unique_mtx); + un = avl_find(&unique_avl, &un_tofind, NULL); + if (un != NULL) { + avl_remove(&unique_avl, un); + kmem_free(un, sizeof (unique_t)); + } + mutex_exit(&unique_mtx); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c new file mode 100644 index 000000000000..2f210cebaedc --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c @@ -0,0 +1,4378 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright 2017 Nexenta Systems, Inc. + * Copyright 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + * Copyright 2016 Toomas Soome <tsoome@me.com> + * Copyright 2017 Joyent, Inc. 
+ */ + +#include <sys/zfs_context.h> +#include <sys/fm/fs/zfs.h> +#include <sys/spa.h> +#include <sys/spa_impl.h> +#include <sys/bpobj.h> +#include <sys/dmu.h> +#include <sys/dmu_tx.h> +#include <sys/dsl_dir.h> +#include <sys/vdev_impl.h> +#include <sys/uberblock_impl.h> +#include <sys/metaslab.h> +#include <sys/metaslab_impl.h> +#include <sys/space_map.h> +#include <sys/space_reftree.h> +#include <sys/zio.h> +#include <sys/zap.h> +#include <sys/fs/zfs.h> +#include <sys/arc.h> +#include <sys/zil.h> +#include <sys/dsl_scan.h> +#include <sys/abd.h> +#include <sys/trim_map.h> +#include <sys/vdev_initialize.h> + +SYSCTL_DECL(_vfs_zfs); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, vdev, CTLFLAG_RW, 0, "ZFS VDEV"); + +/* + * Virtual device management. + */ + +/* + * The limit for ZFS to automatically increase a top-level vdev's ashift + * from logical ashift to physical ashift. + * + * Example: one or more 512B emulation child vdevs + * child->vdev_ashift = 9 (512 bytes) + * child->vdev_physical_ashift = 12 (4096 bytes) + * zfs_max_auto_ashift = 11 (2048 bytes) + * zfs_min_auto_ashift = 9 (512 bytes) + * + * On pool creation or the addition of a new top-level vdev, ZFS will + * increase the ashift of the top-level vdev to 2048 as limited by + * zfs_max_auto_ashift. + * + * Example: one or more 512B emulation child vdevs + * child->vdev_ashift = 9 (512 bytes) + * child->vdev_physical_ashift = 12 (4096 bytes) + * zfs_max_auto_ashift = 13 (8192 bytes) + * zfs_min_auto_ashift = 9 (512 bytes) + * + * On pool creation or the addition of a new top-level vdev, ZFS will + * increase the ashift of the top-level vdev to 4096 to match the + * max vdev_physical_ashift. + * + * Example: one or more 512B emulation child vdevs + * child->vdev_ashift = 9 (512 bytes) + * child->vdev_physical_ashift = 9 (512 bytes) + * zfs_max_auto_ashift = 13 (8192 bytes) + * zfs_min_auto_ashift = 12 (4096 bytes) + * + * On pool creation or the addition of a new top-level vdev, ZFS will + * increase the ashift of the top-level vdev to 4096 to match the + * zfs_min_auto_ashift. 
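Editor's note: all three worked examples in the comment above reduce to a single clamp, applied later in this file by vdev_ashift_optimize() whenever logical ashift <= physical ashift. A userland restatement, valid under that same assumption, that reproduces the three examples:

	#include <stdint.h>
	#include <assert.h>

	#define MAX(a, b)	((a) > (b) ? (a) : (b))
	#define MIN(a, b)	((a) < (b) ? (a) : (b))

	/* The clamp from vdev_ashift_optimize(), logical <= physical. */
	static uint64_t
	auto_ashift(uint64_t logical, uint64_t physical,
	    uint64_t min_auto, uint64_t max_auto)
	{
		return (MIN(MAX(max_auto, logical), MAX(min_auto, physical)));
	}

	int
	main(void)
	{
		/* Example 1: 512e/4Kn child, max_auto = 11 -> ashift 11. */
		assert(auto_ashift(9, 12, 9, 11) == 11);
		/* Example 2: 512e/4Kn child, max_auto = 13 -> ashift 12. */
		assert(auto_ashift(9, 12, 9, 13) == 12);
		/* Example 3: true 512B child, min_auto = 12 -> ashift 12. */
		assert(auto_ashift(9, 9, 12, 13) == 12);
		return (0);
	}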
+ */ +static uint64_t zfs_max_auto_ashift = SPA_MAXASHIFT; +static uint64_t zfs_min_auto_ashift = SPA_MINASHIFT; + +static int +sysctl_vfs_zfs_max_auto_ashift(SYSCTL_HANDLER_ARGS) +{ + uint64_t val; + int err; + + val = zfs_max_auto_ashift; + err = sysctl_handle_64(oidp, &val, 0, req); + if (err != 0 || req->newptr == NULL) + return (err); + + if (val > SPA_MAXASHIFT || val < zfs_min_auto_ashift) + return (EINVAL); + + zfs_max_auto_ashift = val; + + return (0); +} +SYSCTL_PROC(_vfs_zfs, OID_AUTO, max_auto_ashift, + CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t), + sysctl_vfs_zfs_max_auto_ashift, "QU", + "Max ashift used when optimising for logical -> physical sectors size on " + "new top-level vdevs."); + +static int +sysctl_vfs_zfs_min_auto_ashift(SYSCTL_HANDLER_ARGS) +{ + uint64_t val; + int err; + + val = zfs_min_auto_ashift; + err = sysctl_handle_64(oidp, &val, 0, req); + if (err != 0 || req->newptr == NULL) + return (err); + + if (val < SPA_MINASHIFT || val > zfs_max_auto_ashift) + return (EINVAL); + + zfs_min_auto_ashift = val; + + return (0); +} +SYSCTL_PROC(_vfs_zfs, OID_AUTO, min_auto_ashift, + CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t), + sysctl_vfs_zfs_min_auto_ashift, "QU", + "Min ashift used when creating new top-level vdevs."); + +static vdev_ops_t *vdev_ops_table[] = { + &vdev_root_ops, + &vdev_raidz_ops, + &vdev_mirror_ops, + &vdev_replacing_ops, + &vdev_spare_ops, +#ifdef _KERNEL + &vdev_geom_ops, +#else + &vdev_disk_ops, +#endif + &vdev_file_ops, + &vdev_missing_ops, + &vdev_hole_ops, + &vdev_indirect_ops, + NULL +}; + + +/* target number of metaslabs per top-level vdev */ +int vdev_max_ms_count = 200; +SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, max_ms_count, CTLFLAG_RWTUN, + &vdev_max_ms_count, 0, + "Target number of metaslabs per top-level vdev"); + +/* minimum number of metaslabs per top-level vdev */ +int vdev_min_ms_count = 16; +SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, min_ms_count, CTLFLAG_RWTUN, + &vdev_min_ms_count, 0, + "Minimum number of metaslabs per top-level vdev"); + +/* practical upper limit of total metaslabs per top-level vdev */ +int vdev_ms_count_limit = 1ULL << 17; +SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, max_ms_count_limit, CTLFLAG_RWTUN, + &vdev_ms_count_limit, 0, + "Maximum number of metaslabs per top-level vdev"); + +/* lower limit for metaslab size (512M) */ +int vdev_default_ms_shift = 29; +SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, default_ms_shift, CTLFLAG_RWTUN, + &vdev_default_ms_shift, 0, + "Default shift between vdev size and number of metaslabs"); + +/* upper limit for metaslab size (256G) */ +int vdev_max_ms_shift = 38; +SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, max_ms_shift, CTLFLAG_RWTUN, + &vdev_max_ms_shift, 0, + "Maximum shift between vdev size and number of metaslabs"); + +boolean_t vdev_validate_skip = B_FALSE; +SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, validate_skip, CTLFLAG_RWTUN, + &vdev_validate_skip, 0, + "Bypass vdev validation"); + +/* + * Since the DTL space map of a vdev is not expected to have a lot of + * entries, we default its block size to 4K. + */ +int vdev_dtl_sm_blksz = (1 << 12); +SYSCTL_INT(_vfs_zfs, OID_AUTO, dtl_sm_blksz, CTLFLAG_RDTUN, + &vdev_dtl_sm_blksz, 0, + "Block size for DTL space map. Power of 2 and greater than 4096."); + +/* + * vdev-wide space maps that have lots of entries written to them at + * the end of each transaction can benefit from a higher I/O bandwidth + * (e.g. vdev_obsolete_sm), thus we default their block size to 128K. 
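Editor's note: both sysctl handlers above follow a copy-validate-commit shape, and each validates a proposed value against the opposite tunable as well as the absolute SPA bounds, so no accepted update can leave the minimum above the maximum. The same shape reduced to plain functions; the literal 9 and 13 are assumed stand-ins for SPA_MINASHIFT and SPA_MAXASHIFT:

	#include <stdint.h>
	#include <errno.h>

	static uint64_t min_auto = 9, max_auto = 13;

	/* Validate against the absolute bound and the other tunable,
	 * then commit -- the order the handlers above use. */
	static int
	set_max_auto(uint64_t val)
	{
		if (val > 13 || val < min_auto)
			return (EINVAL);
		max_auto = val;
		return (0);
	}

	static int
	set_min_auto(uint64_t val)
	{
		if (val < 9 || val > max_auto)
			return (EINVAL);
		min_auto = val;
		return (0);
	}

	int
	main(void)
	{
		/* Raising min to 12 then capping max at 11 must fail. */
		return (set_min_auto(12) || set_max_auto(11));
	}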
+ */ +int vdev_standard_sm_blksz = (1 << 17); +SYSCTL_INT(_vfs_zfs, OID_AUTO, standard_sm_blksz, CTLFLAG_RDTUN, + &vdev_standard_sm_blksz, 0, + "Block size for standard space map. Power of 2 and greater than 4096."); + +/*PRINTFLIKE2*/ +void +vdev_dbgmsg(vdev_t *vd, const char *fmt, ...) +{ + va_list adx; + char buf[256]; + + va_start(adx, fmt); + (void) vsnprintf(buf, sizeof (buf), fmt, adx); + va_end(adx); + + if (vd->vdev_path != NULL) { + zfs_dbgmsg("%s vdev '%s': %s", vd->vdev_ops->vdev_op_type, + vd->vdev_path, buf); + } else { + zfs_dbgmsg("%s-%llu vdev (guid %llu): %s", + vd->vdev_ops->vdev_op_type, + (u_longlong_t)vd->vdev_id, + (u_longlong_t)vd->vdev_guid, buf); + } +} + +void +vdev_dbgmsg_print_tree(vdev_t *vd, int indent) +{ + char state[20]; + + if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) { + zfs_dbgmsg("%*svdev %u: %s", indent, "", vd->vdev_id, + vd->vdev_ops->vdev_op_type); + return; + } + + switch (vd->vdev_state) { + case VDEV_STATE_UNKNOWN: + (void) snprintf(state, sizeof (state), "unknown"); + break; + case VDEV_STATE_CLOSED: + (void) snprintf(state, sizeof (state), "closed"); + break; + case VDEV_STATE_OFFLINE: + (void) snprintf(state, sizeof (state), "offline"); + break; + case VDEV_STATE_REMOVED: + (void) snprintf(state, sizeof (state), "removed"); + break; + case VDEV_STATE_CANT_OPEN: + (void) snprintf(state, sizeof (state), "can't open"); + break; + case VDEV_STATE_FAULTED: + (void) snprintf(state, sizeof (state), "faulted"); + break; + case VDEV_STATE_DEGRADED: + (void) snprintf(state, sizeof (state), "degraded"); + break; + case VDEV_STATE_HEALTHY: + (void) snprintf(state, sizeof (state), "healthy"); + break; + default: + (void) snprintf(state, sizeof (state), "<state %u>", + (uint_t)vd->vdev_state); + } + + zfs_dbgmsg("%*svdev %u: %s%s, guid: %llu, path: %s, %s", indent, + "", (int)vd->vdev_id, vd->vdev_ops->vdev_op_type, + vd->vdev_islog ? " (log)" : "", + (u_longlong_t)vd->vdev_guid, + vd->vdev_path ? vd->vdev_path : "N/A", state); + + for (uint64_t i = 0; i < vd->vdev_children; i++) + vdev_dbgmsg_print_tree(vd->vdev_child[i], indent + 2); +} + +/* + * Given a vdev type, return the appropriate ops vector. + */ +static vdev_ops_t * +vdev_getops(const char *type) +{ + vdev_ops_t *ops, **opspp; + + for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++) + if (strcmp(ops->vdev_op_type, type) == 0) + break; + + return (ops); +} + +/* ARGSUSED */ +void +vdev_default_xlate(vdev_t *vd, const range_seg_t *in, range_seg_t *res) +{ + res->rs_start = in->rs_start; + res->rs_end = in->rs_end; +} + +/* + * Default asize function: return the MAX of psize with the asize of + * all children. This is what's used by anything other than RAID-Z. + */ +uint64_t +vdev_default_asize(vdev_t *vd, uint64_t psize) +{ + uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift); + uint64_t csize; + + for (int c = 0; c < vd->vdev_children; c++) { + csize = vdev_psize_to_asize(vd->vdev_child[c], psize); + asize = MAX(asize, csize); + } + + return (asize); +} + +/* + * Get the minimum allocatable size. We define the allocatable size as + * the vdev's asize rounded to the nearest metaslab. This allows us to + * replace or attach devices which don't have the same physical size but + * can still satisfy the same number of allocations. + */ +uint64_t +vdev_get_min_asize(vdev_t *vd) +{ + vdev_t *pvd = vd->vdev_parent; + + /* + * If our parent is NULL (inactive spare or cache) or is the root, + * just return our own asize. 
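Editor's note: vdev_default_asize() above rounds each child's psize up to the top-level ashift with P2ROUNDUP(), and vdev_get_min_asize() (continued below) rounds down to a whole metaslab with P2ALIGN(). Both require a power-of-two alignment; a small demonstration using one common formulation of the macros rather than the exact sysmacros.h text:

	#include <stdint.h>
	#include <stdio.h>

	#define P2ALIGN(x, align)	((x) & -(align))
	#define P2ROUNDUP(x, align)	((((x) - 1) | ((align) - 1)) + 1)

	int
	main(void)
	{
		uint64_t ashift = 12;	/* 4K sectors */

		/* Round a 5000-byte psize up to one whole sector: 8192. */
		printf("%llu\n", (unsigned long long)
		    P2ROUNDUP(5000ULL, 1ULL << ashift));

		/* Round an odd asize down to a 512M metaslab boundary:
		 * 10 GiB + 12345 bytes -> exactly 10 GiB. */
		printf("%llu\n", (unsigned long long)
		    P2ALIGN((10ULL << 30) | 12345, 1ULL << 29));
		return (0);
	}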
+ */ + if (pvd == NULL) + return (vd->vdev_asize); + + /* + * The top-level vdev just returns the allocatable size rounded + * to the nearest metaslab. + */ + if (vd == vd->vdev_top) + return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift)); + + /* + * The allocatable space for a raidz vdev is N * sizeof(smallest child), + * so each child must provide at least 1/Nth of its asize. + */ + if (pvd->vdev_ops == &vdev_raidz_ops) + return ((pvd->vdev_min_asize + pvd->vdev_children - 1) / + pvd->vdev_children); + + return (pvd->vdev_min_asize); +} + +void +vdev_set_min_asize(vdev_t *vd) +{ + vd->vdev_min_asize = vdev_get_min_asize(vd); + + for (int c = 0; c < vd->vdev_children; c++) + vdev_set_min_asize(vd->vdev_child[c]); +} + +vdev_t * +vdev_lookup_top(spa_t *spa, uint64_t vdev) +{ + vdev_t *rvd = spa->spa_root_vdev; + + ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); + + if (vdev < rvd->vdev_children) { + ASSERT(rvd->vdev_child[vdev] != NULL); + return (rvd->vdev_child[vdev]); + } + + return (NULL); +} + +vdev_t * +vdev_lookup_by_guid(vdev_t *vd, uint64_t guid) +{ + vdev_t *mvd; + + if (vd->vdev_guid == guid) + return (vd); + + for (int c = 0; c < vd->vdev_children; c++) + if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) != + NULL) + return (mvd); + + return (NULL); +} + +static int +vdev_count_leaves_impl(vdev_t *vd) +{ + int n = 0; + + if (vd->vdev_ops->vdev_op_leaf) + return (1); + + for (int c = 0; c < vd->vdev_children; c++) + n += vdev_count_leaves_impl(vd->vdev_child[c]); + + return (n); +} + +int +vdev_count_leaves(spa_t *spa) +{ + return (vdev_count_leaves_impl(spa->spa_root_vdev)); +} + +void +vdev_add_child(vdev_t *pvd, vdev_t *cvd) +{ + size_t oldsize, newsize; + uint64_t id = cvd->vdev_id; + vdev_t **newchild; + spa_t *spa = cvd->vdev_spa; + + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); + ASSERT(cvd->vdev_parent == NULL); + + cvd->vdev_parent = pvd; + + if (pvd == NULL) + return; + + ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL); + + oldsize = pvd->vdev_children * sizeof (vdev_t *); + pvd->vdev_children = MAX(pvd->vdev_children, id + 1); + newsize = pvd->vdev_children * sizeof (vdev_t *); + + newchild = kmem_zalloc(newsize, KM_SLEEP); + if (pvd->vdev_child != NULL) { + bcopy(pvd->vdev_child, newchild, oldsize); + kmem_free(pvd->vdev_child, oldsize); + } + + pvd->vdev_child = newchild; + pvd->vdev_child[id] = cvd; + + cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd); + ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL); + + /* + * Walk up all ancestors to update guid sum. + */ + for (; pvd != NULL; pvd = pvd->vdev_parent) + pvd->vdev_guid_sum += cvd->vdev_guid_sum; +} + +void +vdev_remove_child(vdev_t *pvd, vdev_t *cvd) +{ + int c; + uint_t id = cvd->vdev_id; + + ASSERT(cvd->vdev_parent == pvd); + + if (pvd == NULL) + return; + + ASSERT(id < pvd->vdev_children); + ASSERT(pvd->vdev_child[id] == cvd); + + pvd->vdev_child[id] = NULL; + cvd->vdev_parent = NULL; + + for (c = 0; c < pvd->vdev_children; c++) + if (pvd->vdev_child[c]) + break; + + if (c == pvd->vdev_children) { + kmem_free(pvd->vdev_child, c * sizeof (vdev_t *)); + pvd->vdev_child = NULL; + pvd->vdev_children = 0; + } + + /* + * Walk up all ancestors to update guid sum. + */ + for (; pvd != NULL; pvd = pvd->vdev_parent) + pvd->vdev_guid_sum -= cvd->vdev_guid_sum; +} + +/* + * Remove any holes in the child array. 
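Editor's note: the raidz branch of vdev_get_min_asize() above uses the classic ceiling-division idiom so that N children, each providing 1/Nth, can never sum to less than the parent's minimum. Checked in isolation:

	#include <stdint.h>
	#include <assert.h>

	/* ceil(parent_min_asize / children) without floating point. */
	static uint64_t
	child_min_asize(uint64_t parent_min_asize, uint64_t children)
	{
		return ((parent_min_asize + children - 1) / children);
	}

	int
	main(void)
	{
		/* 100 units across 3 children: 34 each, and 3*34 >= 100. */
		assert(child_min_asize(100, 3) == 34);
		/* Exact multiples are not inflated: 99/3 = 33. */
		assert(child_min_asize(99, 3) == 33);
		return (0);
	}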
+ */ +void +vdev_compact_children(vdev_t *pvd) +{ + vdev_t **newchild, *cvd; + int oldc = pvd->vdev_children; + int newc; + + ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); + + if (oldc == 0) + return; + + for (int c = newc = 0; c < oldc; c++) + if (pvd->vdev_child[c]) + newc++; + + if (newc > 0) { + newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP); + + for (int c = newc = 0; c < oldc; c++) { + if ((cvd = pvd->vdev_child[c]) != NULL) { + newchild[newc] = cvd; + cvd->vdev_id = newc++; + } + } + } else { + newchild = NULL; + } + + kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *)); + pvd->vdev_child = newchild; + pvd->vdev_children = newc; +} + +/* + * Allocate and minimally initialize a vdev_t. + */ +vdev_t * +vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) +{ + vdev_t *vd; + vdev_indirect_config_t *vic; + + vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP); + vic = &vd->vdev_indirect_config; + + if (spa->spa_root_vdev == NULL) { + ASSERT(ops == &vdev_root_ops); + spa->spa_root_vdev = vd; + spa->spa_load_guid = spa_generate_guid(NULL); + } + + if (guid == 0 && ops != &vdev_hole_ops) { + if (spa->spa_root_vdev == vd) { + /* + * The root vdev's guid will also be the pool guid, + * which must be unique among all pools. + */ + guid = spa_generate_guid(NULL); + } else { + /* + * Any other vdev's guid must be unique within the pool. + */ + guid = spa_generate_guid(spa); + } + ASSERT(!spa_guid_exists(spa_guid(spa), guid)); + } + + vd->vdev_spa = spa; + vd->vdev_id = id; + vd->vdev_guid = guid; + vd->vdev_guid_sum = guid; + vd->vdev_ops = ops; + vd->vdev_state = VDEV_STATE_CLOSED; + vd->vdev_ishole = (ops == &vdev_hole_ops); + vic->vic_prev_indirect_vdev = UINT64_MAX; + + rw_init(&vd->vdev_indirect_rwlock, NULL, RW_DEFAULT, NULL); + mutex_init(&vd->vdev_obsolete_lock, NULL, MUTEX_DEFAULT, NULL); + vd->vdev_obsolete_segments = range_tree_create(NULL, NULL); + + mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&vd->vdev_queue_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&vd->vdev_scan_io_queue_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&vd->vdev_initialize_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&vd->vdev_initialize_cv, NULL, CV_DEFAULT, NULL); + cv_init(&vd->vdev_initialize_io_cv, NULL, CV_DEFAULT, NULL); + + for (int t = 0; t < DTL_TYPES; t++) { + vd->vdev_dtl[t] = range_tree_create(NULL, NULL); + } + txg_list_create(&vd->vdev_ms_list, spa, + offsetof(struct metaslab, ms_txg_node)); + txg_list_create(&vd->vdev_dtl_list, spa, + offsetof(struct vdev, vdev_dtl_node)); + vd->vdev_stat.vs_timestamp = gethrtime(); + vdev_queue_init(vd); + vdev_cache_init(vd); + + return (vd); +} + +/* + * Allocate a new vdev. The 'alloctype' is used to control whether we are + * creating a new vdev or loading an existing one - the behavior is slightly + * different for each case. 
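Editor's note: vdev_compact_children() above is a two-pass compaction: count the survivors, allocate an exact-fit array, then copy while renumbering. The same pattern on a bare pointer array; kernel allocators are replaced by malloc()/free() (failure handling elided, since kmem_alloc(KM_SLEEP) cannot fail), and the vdev_id renumbering is reduced to the copy index:

	#include <stdlib.h>

	int
	compact_ptrs(void ***arrp, int oldc)
	{
		void **old = *arrp, **dst = NULL;
		int newc = 0;

		for (int c = 0; c < oldc; c++)		/* pass 1: count */
			if (old[c] != NULL)
				newc++;

		if (newc > 0) {
			dst = malloc(newc * sizeof (void *));
			for (int c = 0, i = 0; c < oldc; c++)	/* pass 2 */
				if (old[c] != NULL)
					dst[i++] = old[c];
		}

		free(old);
		*arrp = dst;
		return (newc);
	}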
+ */ +int +vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, + int alloctype) +{ + vdev_ops_t *ops; + char *type; + uint64_t guid = 0, islog, nparity; + vdev_t *vd; + vdev_indirect_config_t *vic; + + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); + + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0) + return (SET_ERROR(EINVAL)); + + if ((ops = vdev_getops(type)) == NULL) + return (SET_ERROR(EINVAL)); + + /* + * If this is a load, get the vdev guid from the nvlist. + * Otherwise, vdev_alloc_common() will generate one for us. + */ + if (alloctype == VDEV_ALLOC_LOAD) { + uint64_t label_id; + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) || + label_id != id) + return (SET_ERROR(EINVAL)); + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) + return (SET_ERROR(EINVAL)); + } else if (alloctype == VDEV_ALLOC_SPARE) { + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) + return (SET_ERROR(EINVAL)); + } else if (alloctype == VDEV_ALLOC_L2CACHE) { + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) + return (SET_ERROR(EINVAL)); + } else if (alloctype == VDEV_ALLOC_ROOTPOOL) { + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) + return (SET_ERROR(EINVAL)); + } + + /* + * The first allocated vdev must be of type 'root'. + */ + if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL) + return (SET_ERROR(EINVAL)); + + /* + * Determine whether we're a log vdev. + */ + islog = 0; + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog); + if (islog && spa_version(spa) < SPA_VERSION_SLOGS) + return (SET_ERROR(ENOTSUP)); + + if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES) + return (SET_ERROR(ENOTSUP)); + + /* + * Set the nparity property for RAID-Z vdevs. + */ + nparity = -1ULL; + if (ops == &vdev_raidz_ops) { + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, + &nparity) == 0) { + if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY) + return (SET_ERROR(EINVAL)); + /* + * Previous versions could only support 1 or 2 parity + * device. + */ + if (nparity > 1 && + spa_version(spa) < SPA_VERSION_RAIDZ2) + return (SET_ERROR(ENOTSUP)); + if (nparity > 2 && + spa_version(spa) < SPA_VERSION_RAIDZ3) + return (SET_ERROR(ENOTSUP)); + } else { + /* + * We require the parity to be specified for SPAs that + * support multiple parity levels. + */ + if (spa_version(spa) >= SPA_VERSION_RAIDZ2) + return (SET_ERROR(EINVAL)); + /* + * Otherwise, we default to 1 parity device for RAID-Z. + */ + nparity = 1; + } + } else { + nparity = 0; + } + ASSERT(nparity != -1ULL); + + vd = vdev_alloc_common(spa, id, guid, ops); + vic = &vd->vdev_indirect_config; + + vd->vdev_islog = islog; + vd->vdev_nparity = nparity; + + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0) + vd->vdev_path = spa_strdup(vd->vdev_path); + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0) + vd->vdev_devid = spa_strdup(vd->vdev_devid); + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH, + &vd->vdev_physpath) == 0) + vd->vdev_physpath = spa_strdup(vd->vdev_physpath); + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &vd->vdev_fru) == 0) + vd->vdev_fru = spa_strdup(vd->vdev_fru); + + /* + * Set the whole_disk property. If it's not specified, leave the value + * as -1. 
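Editor's note: the nparity handling in vdev_alloc() above is a small decision table: explicit values must be 1..VDEV_RAIDZ_MAXPARITY and are gated by pool version, while an absent value is only legal (implying raidz1) on pools predating SPA_VERSION_RAIDZ2. Restated as a pure function; the version numbers 10 and 17 are assumed stand-ins for SPA_VERSION_RAIDZ2 and SPA_VERSION_RAIDZ3:

	#include <stdint.h>
	#include <errno.h>

	#define RAIDZ_MAXPARITY	3
	#define VER_RAIDZ2	10	/* assumed SPA_VERSION_RAIDZ2 */
	#define VER_RAIDZ3	17	/* assumed SPA_VERSION_RAIDZ3 */

	/*
	 * 'have' says whether the config supplied ZPOOL_CONFIG_NPARITY.
	 * On success *nparity holds the effective parity count.
	 */
	int
	raidz_nparity(int have, uint64_t val, uint64_t spa_version,
	    uint64_t *nparity)
	{
		if (have) {
			if (val == 0 || val > RAIDZ_MAXPARITY)
				return (EINVAL);
			if (val > 1 && spa_version < VER_RAIDZ2)
				return (ENOTSUP);
			if (val > 2 && spa_version < VER_RAIDZ3)
				return (ENOTSUP);
			*nparity = val;
		} else {
			/* Newer pools must be explicit; old ones imply 1. */
			if (spa_version >= VER_RAIDZ2)
				return (EINVAL);
			*nparity = 1;
		}
		return (0);
	}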
+ */ + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, + &vd->vdev_wholedisk) != 0) + vd->vdev_wholedisk = -1ULL; + + ASSERT0(vic->vic_mapping_object); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT, + &vic->vic_mapping_object); + ASSERT0(vic->vic_births_object); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_BIRTHS, + &vic->vic_births_object); + ASSERT3U(vic->vic_prev_indirect_vdev, ==, UINT64_MAX); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_PREV_INDIRECT_VDEV, + &vic->vic_prev_indirect_vdev); + + /* + * Look for the 'not present' flag. This will only be set if the device + * was not present at the time of import. + */ + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, + &vd->vdev_not_present); + + /* + * Get the alignment requirement. + */ + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift); + + /* + * Retrieve the vdev creation time. + */ + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, + &vd->vdev_crtxg); + + /* + * If we're a top-level vdev, try to load the allocation parameters. + */ + if (parent && !parent->vdev_parent && + (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) { + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, + &vd->vdev_ms_array); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, + &vd->vdev_ms_shift); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE, + &vd->vdev_asize); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING, + &vd->vdev_removing); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP, + &vd->vdev_top_zap); + } else { + ASSERT0(vd->vdev_top_zap); + } + + if (parent && !parent->vdev_parent && alloctype != VDEV_ALLOC_ATTACH) { + ASSERT(alloctype == VDEV_ALLOC_LOAD || + alloctype == VDEV_ALLOC_ADD || + alloctype == VDEV_ALLOC_SPLIT || + alloctype == VDEV_ALLOC_ROOTPOOL); + vd->vdev_mg = metaslab_group_create(islog ? + spa_log_class(spa) : spa_normal_class(spa), vd, + spa->spa_alloc_count); + } + + if (vd->vdev_ops->vdev_op_leaf && + (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) { + (void) nvlist_lookup_uint64(nv, + ZPOOL_CONFIG_VDEV_LEAF_ZAP, &vd->vdev_leaf_zap); + } else { + ASSERT0(vd->vdev_leaf_zap); + } + + /* + * If we're a leaf vdev, try to load the DTL object and other state. + */ + + if (vd->vdev_ops->vdev_op_leaf && + (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE || + alloctype == VDEV_ALLOC_ROOTPOOL)) { + if (alloctype == VDEV_ALLOC_LOAD) { + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL, + &vd->vdev_dtl_object); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE, + &vd->vdev_unspare); + } + + if (alloctype == VDEV_ALLOC_ROOTPOOL) { + uint64_t spare = 0; + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE, + &spare) == 0 && spare) + spa_spare_add(vd); + } + + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, + &vd->vdev_offline); + + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG, + &vd->vdev_resilver_txg); + + /* + * When importing a pool, we want to ignore the persistent fault + * state, as the diagnosis made on another system may not be + * valid in the current context. Local vdevs will + * remain in the faulted state. 
+ */ + if (spa_load_state(spa) == SPA_LOAD_OPEN) { + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, + &vd->vdev_faulted); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED, + &vd->vdev_degraded); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, + &vd->vdev_removed); + + if (vd->vdev_faulted || vd->vdev_degraded) { + char *aux; + + vd->vdev_label_aux = + VDEV_AUX_ERR_EXCEEDED; + if (nvlist_lookup_string(nv, + ZPOOL_CONFIG_AUX_STATE, &aux) == 0 && + strcmp(aux, "external") == 0) + vd->vdev_label_aux = VDEV_AUX_EXTERNAL; + } + } + } + + /* + * Add ourselves to the parent's list of children. + */ + vdev_add_child(parent, vd); + + *vdp = vd; + + return (0); +} + +void +vdev_free(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + ASSERT3P(vd->vdev_initialize_thread, ==, NULL); + + /* + * Scan queues are normally destroyed at the end of a scan. If the + * queue exists here, that implies the vdev is being removed while + * the scan is still running. + */ + if (vd->vdev_scan_io_queue != NULL) { + mutex_enter(&vd->vdev_scan_io_queue_lock); + dsl_scan_io_queue_destroy(vd->vdev_scan_io_queue); + vd->vdev_scan_io_queue = NULL; + mutex_exit(&vd->vdev_scan_io_queue_lock); + } + + /* + * vdev_free() implies closing the vdev first. This is simpler than + * trying to ensure complicated semantics for all callers. + */ + vdev_close(vd); + + ASSERT(!list_link_active(&vd->vdev_config_dirty_node)); + ASSERT(!list_link_active(&vd->vdev_state_dirty_node)); + + /* + * Free all children. + */ + for (int c = 0; c < vd->vdev_children; c++) + vdev_free(vd->vdev_child[c]); + + ASSERT(vd->vdev_child == NULL); + ASSERT(vd->vdev_guid_sum == vd->vdev_guid); + ASSERT(vd->vdev_initialize_thread == NULL); + + /* + * Discard allocation state. + */ + if (vd->vdev_mg != NULL) { + vdev_metaslab_fini(vd); + metaslab_group_destroy(vd->vdev_mg); + } + + ASSERT0(vd->vdev_stat.vs_space); + ASSERT0(vd->vdev_stat.vs_dspace); + ASSERT0(vd->vdev_stat.vs_alloc); + + /* + * Remove this vdev from its parent's child list. + */ + vdev_remove_child(vd->vdev_parent, vd); + + ASSERT(vd->vdev_parent == NULL); + + /* + * Clean up vdev structure. 
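Editor's note: vdev_free() above tears the tree down strictly post-order: children first, then the node's own resources, then the unlink from its parent. The ordering in miniature, with all the lock, metaslab, and queue teardown elided:

	#include <stdlib.h>

	struct tnode {
		int nchildren;
		struct tnode **child;
	};

	/* Post-order teardown, as vdev_free() does for the vdev tree. */
	void
	tree_free(struct tnode *n)
	{
		for (int c = 0; c < n->nchildren; c++)
			tree_free(n->child[c]);
		free(n->child);
		free(n);
	}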
+ */ + vdev_queue_fini(vd); + vdev_cache_fini(vd); + + if (vd->vdev_path) + spa_strfree(vd->vdev_path); + if (vd->vdev_devid) + spa_strfree(vd->vdev_devid); + if (vd->vdev_physpath) + spa_strfree(vd->vdev_physpath); + if (vd->vdev_fru) + spa_strfree(vd->vdev_fru); + + if (vd->vdev_isspare) + spa_spare_remove(vd); + if (vd->vdev_isl2cache) + spa_l2cache_remove(vd); + + txg_list_destroy(&vd->vdev_ms_list); + txg_list_destroy(&vd->vdev_dtl_list); + + mutex_enter(&vd->vdev_dtl_lock); + space_map_close(vd->vdev_dtl_sm); + for (int t = 0; t < DTL_TYPES; t++) { + range_tree_vacate(vd->vdev_dtl[t], NULL, NULL); + range_tree_destroy(vd->vdev_dtl[t]); + } + mutex_exit(&vd->vdev_dtl_lock); + + EQUIV(vd->vdev_indirect_births != NULL, + vd->vdev_indirect_mapping != NULL); + if (vd->vdev_indirect_births != NULL) { + vdev_indirect_mapping_close(vd->vdev_indirect_mapping); + vdev_indirect_births_close(vd->vdev_indirect_births); + } + + if (vd->vdev_obsolete_sm != NULL) { + ASSERT(vd->vdev_removing || + vd->vdev_ops == &vdev_indirect_ops); + space_map_close(vd->vdev_obsolete_sm); + vd->vdev_obsolete_sm = NULL; + } + range_tree_destroy(vd->vdev_obsolete_segments); + rw_destroy(&vd->vdev_indirect_rwlock); + mutex_destroy(&vd->vdev_obsolete_lock); + + mutex_destroy(&vd->vdev_queue_lock); + mutex_destroy(&vd->vdev_dtl_lock); + mutex_destroy(&vd->vdev_stat_lock); + mutex_destroy(&vd->vdev_probe_lock); + mutex_destroy(&vd->vdev_scan_io_queue_lock); + mutex_destroy(&vd->vdev_initialize_lock); + mutex_destroy(&vd->vdev_initialize_io_lock); + cv_destroy(&vd->vdev_initialize_io_cv); + cv_destroy(&vd->vdev_initialize_cv); + + if (vd == spa->spa_root_vdev) + spa->spa_root_vdev = NULL; + + kmem_free(vd, sizeof (vdev_t)); +} + +/* + * Transfer top-level vdev state from svd to tvd. + */ +static void +vdev_top_transfer(vdev_t *svd, vdev_t *tvd) +{ + spa_t *spa = svd->vdev_spa; + metaslab_t *msp; + vdev_t *vd; + int t; + + ASSERT(tvd == tvd->vdev_top); + + tvd->vdev_ms_array = svd->vdev_ms_array; + tvd->vdev_ms_shift = svd->vdev_ms_shift; + tvd->vdev_ms_count = svd->vdev_ms_count; + tvd->vdev_top_zap = svd->vdev_top_zap; + + svd->vdev_ms_array = 0; + svd->vdev_ms_shift = 0; + svd->vdev_ms_count = 0; + svd->vdev_top_zap = 0; + + if (tvd->vdev_mg) + ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg); + tvd->vdev_mg = svd->vdev_mg; + tvd->vdev_ms = svd->vdev_ms; + + svd->vdev_mg = NULL; + svd->vdev_ms = NULL; + + if (tvd->vdev_mg != NULL) + tvd->vdev_mg->mg_vd = tvd; + + tvd->vdev_checkpoint_sm = svd->vdev_checkpoint_sm; + svd->vdev_checkpoint_sm = NULL; + + tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc; + tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space; + tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace; + + svd->vdev_stat.vs_alloc = 0; + svd->vdev_stat.vs_space = 0; + svd->vdev_stat.vs_dspace = 0; + + /* + * State which may be set on a top-level vdev that's in the + * process of being removed. 
+ */ + ASSERT0(tvd->vdev_indirect_config.vic_births_object); + ASSERT0(tvd->vdev_indirect_config.vic_mapping_object); + ASSERT3U(tvd->vdev_indirect_config.vic_prev_indirect_vdev, ==, -1ULL); + ASSERT3P(tvd->vdev_indirect_mapping, ==, NULL); + ASSERT3P(tvd->vdev_indirect_births, ==, NULL); + ASSERT3P(tvd->vdev_obsolete_sm, ==, NULL); + ASSERT0(tvd->vdev_removing); + tvd->vdev_removing = svd->vdev_removing; + tvd->vdev_indirect_config = svd->vdev_indirect_config; + tvd->vdev_indirect_mapping = svd->vdev_indirect_mapping; + tvd->vdev_indirect_births = svd->vdev_indirect_births; + range_tree_swap(&svd->vdev_obsolete_segments, + &tvd->vdev_obsolete_segments); + tvd->vdev_obsolete_sm = svd->vdev_obsolete_sm; + svd->vdev_indirect_config.vic_mapping_object = 0; + svd->vdev_indirect_config.vic_births_object = 0; + svd->vdev_indirect_config.vic_prev_indirect_vdev = -1ULL; + svd->vdev_indirect_mapping = NULL; + svd->vdev_indirect_births = NULL; + svd->vdev_obsolete_sm = NULL; + svd->vdev_removing = 0; + + for (t = 0; t < TXG_SIZE; t++) { + while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL) + (void) txg_list_add(&tvd->vdev_ms_list, msp, t); + while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL) + (void) txg_list_add(&tvd->vdev_dtl_list, vd, t); + if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t)) + (void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t); + } + + if (list_link_active(&svd->vdev_config_dirty_node)) { + vdev_config_clean(svd); + vdev_config_dirty(tvd); + } + + if (list_link_active(&svd->vdev_state_dirty_node)) { + vdev_state_clean(svd); + vdev_state_dirty(tvd); + } + + tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio; + svd->vdev_deflate_ratio = 0; + + tvd->vdev_islog = svd->vdev_islog; + svd->vdev_islog = 0; + + dsl_scan_io_queue_vdev_xfer(svd, tvd); +} + +static void +vdev_top_update(vdev_t *tvd, vdev_t *vd) +{ + if (vd == NULL) + return; + + vd->vdev_top = tvd; + + for (int c = 0; c < vd->vdev_children; c++) + vdev_top_update(tvd, vd->vdev_child[c]); +} + +/* + * Add a mirror/replacing vdev above an existing vdev. + */ +vdev_t * +vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops) +{ + spa_t *spa = cvd->vdev_spa; + vdev_t *pvd = cvd->vdev_parent; + vdev_t *mvd; + + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); + + mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops); + + mvd->vdev_asize = cvd->vdev_asize; + mvd->vdev_min_asize = cvd->vdev_min_asize; + mvd->vdev_max_asize = cvd->vdev_max_asize; + mvd->vdev_psize = cvd->vdev_psize; + mvd->vdev_ashift = cvd->vdev_ashift; + mvd->vdev_logical_ashift = cvd->vdev_logical_ashift; + mvd->vdev_physical_ashift = cvd->vdev_physical_ashift; + mvd->vdev_state = cvd->vdev_state; + mvd->vdev_crtxg = cvd->vdev_crtxg; + + vdev_remove_child(pvd, cvd); + vdev_add_child(pvd, mvd); + cvd->vdev_id = mvd->vdev_children; + vdev_add_child(mvd, cvd); + vdev_top_update(cvd->vdev_top, cvd->vdev_top); + + if (mvd == mvd->vdev_top) + vdev_top_transfer(cvd, mvd); + + return (mvd); +} + +/* + * Remove a 1-way mirror/replacing vdev from the tree. 
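Editor's note: vdev_add_parent() above, like vdev_add_child()/vdev_remove_child() earlier, is careful to preserve the guid-sum invariant: every node's vdev_guid_sum equals its own guid plus its children's guid_sums, which is why those routines walk all ancestors. A bottom-up checker for that invariant, useful for reasoning about the splices in this file:

	#include <stdint.h>

	struct node {
		uint64_t guid;
		uint64_t guid_sum;
		int nchildren;
		struct node **child;
	};

	/* Recompute what guid_sum should be; comparing against the
	 * stored value catches any splice that skipped an ancestor. */
	uint64_t
	guid_sum_recompute(const struct node *n)
	{
		uint64_t sum = n->guid;

		for (int c = 0; c < n->nchildren; c++)
			sum += guid_sum_recompute(n->child[c]);
		return (sum);
	}

The ASSERT(vd->vdev_guid_sum == vd->vdev_guid) in vdev_free() is the childless case of the same invariant.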
+ */ +void +vdev_remove_parent(vdev_t *cvd) +{ + vdev_t *mvd = cvd->vdev_parent; + vdev_t *pvd = mvd->vdev_parent; + + ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); + + ASSERT(mvd->vdev_children == 1); + ASSERT(mvd->vdev_ops == &vdev_mirror_ops || + mvd->vdev_ops == &vdev_replacing_ops || + mvd->vdev_ops == &vdev_spare_ops); + cvd->vdev_ashift = mvd->vdev_ashift; + cvd->vdev_logical_ashift = mvd->vdev_logical_ashift; + cvd->vdev_physical_ashift = mvd->vdev_physical_ashift; + + vdev_remove_child(mvd, cvd); + vdev_remove_child(pvd, mvd); + + /* + * If cvd will replace mvd as a top-level vdev, preserve mvd's guid. + * Otherwise, we could have detached an offline device, and when we + * go to import the pool we'll think we have two top-level vdevs, + * instead of a different version of the same top-level vdev. + */ + if (mvd->vdev_top == mvd) { + uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid; + cvd->vdev_orig_guid = cvd->vdev_guid; + cvd->vdev_guid += guid_delta; + cvd->vdev_guid_sum += guid_delta; + } + cvd->vdev_id = mvd->vdev_id; + vdev_add_child(pvd, cvd); + vdev_top_update(cvd->vdev_top, cvd->vdev_top); + + if (cvd == cvd->vdev_top) + vdev_top_transfer(mvd, cvd); + + ASSERT(mvd->vdev_children == 0); + vdev_free(mvd); +} + +int +vdev_metaslab_init(vdev_t *vd, uint64_t txg) +{ + spa_t *spa = vd->vdev_spa; + objset_t *mos = spa->spa_meta_objset; + uint64_t m; + uint64_t oldc = vd->vdev_ms_count; + uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift; + metaslab_t **mspp; + int error; + + ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER)); + + /* + * This vdev is not being allocated from yet or is a hole. + */ + if (vd->vdev_ms_shift == 0) + return (0); + + ASSERT(!vd->vdev_ishole); + + ASSERT(oldc <= newc); + + mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP); + + if (oldc != 0) { + bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp)); + kmem_free(vd->vdev_ms, oldc * sizeof (*mspp)); + } + + vd->vdev_ms = mspp; + vd->vdev_ms_count = newc; + for (m = oldc; m < newc; m++) { + uint64_t object = 0; + + /* + * vdev_ms_array may be 0 if we are creating the "fake" + * metaslabs for an indirect vdev for zdb's leak detection. + * See zdb_leak_init(). + */ + if (txg == 0 && vd->vdev_ms_array != 0) { + error = dmu_read(mos, vd->vdev_ms_array, + m * sizeof (uint64_t), sizeof (uint64_t), &object, + DMU_READ_PREFETCH); + if (error != 0) { + vdev_dbgmsg(vd, "unable to read the metaslab " + "array [error=%d]", error); + return (error); + } + } + + error = metaslab_init(vd->vdev_mg, m, object, txg, + &(vd->vdev_ms[m])); + if (error != 0) { + vdev_dbgmsg(vd, "metaslab_init failed [error=%d]", + error); + return (error); + } + } + + if (txg == 0) + spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER); + + /* + * If the vdev is being removed we don't activate + * the metaslabs since we want to ensure that no new + * allocations are performed on this device. + */ + if (oldc == 0 && !vd->vdev_removing) + metaslab_group_activate(vd->vdev_mg); + + if (txg == 0) + spa_config_exit(spa, SCL_ALLOC, FTAG); + + return (0); +} + +void +vdev_metaslab_fini(vdev_t *vd) +{ + if (vd->vdev_checkpoint_sm != NULL) { + ASSERT(spa_feature_is_active(vd->vdev_spa, + SPA_FEATURE_POOL_CHECKPOINT)); + space_map_close(vd->vdev_checkpoint_sm); + /* + * Even though we close the space map, we need to set its + * pointer to NULL. The reason is that vdev_metaslab_fini() + * may be called multiple times for certain operations + * (i.e. 
when destroying a pool) so we need to ensure that + * this clause never executes twice. This logic is similar + * to the one used for the vdev_ms clause below. + */ + vd->vdev_checkpoint_sm = NULL; + } + + if (vd->vdev_ms != NULL) { + uint64_t count = vd->vdev_ms_count; + + metaslab_group_passivate(vd->vdev_mg); + for (uint64_t m = 0; m < count; m++) { + metaslab_t *msp = vd->vdev_ms[m]; + + if (msp != NULL) + metaslab_fini(msp); + } + kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *)); + vd->vdev_ms = NULL; + + vd->vdev_ms_count = 0; + } + ASSERT0(vd->vdev_ms_count); +} + +typedef struct vdev_probe_stats { + boolean_t vps_readable; + boolean_t vps_writeable; + int vps_flags; +} vdev_probe_stats_t; + +static void +vdev_probe_done(zio_t *zio) +{ + spa_t *spa = zio->io_spa; + vdev_t *vd = zio->io_vd; + vdev_probe_stats_t *vps = zio->io_private; + + ASSERT(vd->vdev_probe_zio != NULL); + + if (zio->io_type == ZIO_TYPE_READ) { + if (zio->io_error == 0) + vps->vps_readable = 1; + if (zio->io_error == 0 && spa_writeable(spa)) { + zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd, + zio->io_offset, zio->io_size, zio->io_abd, + ZIO_CHECKSUM_OFF, vdev_probe_done, vps, + ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE)); + } else { + abd_free(zio->io_abd); + } + } else if (zio->io_type == ZIO_TYPE_WRITE) { + if (zio->io_error == 0) + vps->vps_writeable = 1; + abd_free(zio->io_abd); + } else if (zio->io_type == ZIO_TYPE_NULL) { + zio_t *pio; + + vd->vdev_cant_read |= !vps->vps_readable; + vd->vdev_cant_write |= !vps->vps_writeable; + + if (vdev_readable(vd) && + (vdev_writeable(vd) || !spa_writeable(spa))) { + zio->io_error = 0; + } else { + ASSERT(zio->io_error != 0); + vdev_dbgmsg(vd, "failed probe"); + zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE, + spa, vd, NULL, 0, 0); + zio->io_error = SET_ERROR(ENXIO); + } + + mutex_enter(&vd->vdev_probe_lock); + ASSERT(vd->vdev_probe_zio == zio); + vd->vdev_probe_zio = NULL; + mutex_exit(&vd->vdev_probe_lock); + + zio_link_t *zl = NULL; + while ((pio = zio_walk_parents(zio, &zl)) != NULL) + if (!vdev_accessible(vd, pio)) + pio->io_error = SET_ERROR(ENXIO); + + kmem_free(vps, sizeof (*vps)); + } +} + +/* + * Determine whether this device is accessible. + * + * Read and write to several known locations: the pad regions of each + * vdev label but the first, which we leave alone in case it contains + * a VTOC. + */ +zio_t * +vdev_probe(vdev_t *vd, zio_t *zio) +{ + spa_t *spa = vd->vdev_spa; + vdev_probe_stats_t *vps = NULL; + zio_t *pio; + + ASSERT(vd->vdev_ops->vdev_op_leaf); + + /* + * Don't probe the probe. + */ + if (zio && (zio->io_flags & ZIO_FLAG_PROBE)) + return (NULL); + + /* + * To prevent 'probe storms' when a device fails, we create + * just one probe i/o at a time. All zios that want to probe + * this vdev will become parents of the probe io. + */ + mutex_enter(&vd->vdev_probe_lock); + + if ((pio = vd->vdev_probe_zio) == NULL) { + vps = kmem_zalloc(sizeof (*vps), KM_SLEEP); + + vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE | + ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE | + ZIO_FLAG_TRYHARD; + + if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) { + /* + * vdev_cant_read and vdev_cant_write can only + * transition from TRUE to FALSE when we have the + * SCL_ZIO lock as writer; otherwise they can only + * transition from FALSE to TRUE. This ensures that + * any zio looking at these values can assume that + * failures persist for the life of the I/O. 
That's + * important because when a device has intermittent + * connectivity problems, we want to ensure that + * they're ascribed to the device (ENXIO) and not + * the zio (EIO). + * + * Since we hold SCL_ZIO as writer here, clear both + * values so the probe can reevaluate from first + * principles. + */ + vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER; + vd->vdev_cant_read = B_FALSE; + vd->vdev_cant_write = B_FALSE; + } + + vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd, + vdev_probe_done, vps, + vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE); + + /* + * We can't change the vdev state in this context, so we + * kick off an async task to do it on our behalf. + */ + if (zio != NULL) { + vd->vdev_probe_wanted = B_TRUE; + spa_async_request(spa, SPA_ASYNC_PROBE); + } + } + + if (zio != NULL) + zio_add_child(zio, pio); + + mutex_exit(&vd->vdev_probe_lock); + + if (vps == NULL) { + ASSERT(zio != NULL); + return (NULL); + } + + for (int l = 1; l < VDEV_LABELS; l++) { + zio_nowait(zio_read_phys(pio, vd, + vdev_label_offset(vd->vdev_psize, l, + offsetof(vdev_label_t, vl_pad2)), VDEV_PAD_SIZE, + abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE), + ZIO_CHECKSUM_OFF, vdev_probe_done, vps, + ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE)); + } + + if (zio == NULL) + return (pio); + + zio_nowait(pio); + return (NULL); +} + +static void +vdev_open_child(void *arg) +{ + vdev_t *vd = arg; + + vd->vdev_open_thread = curthread; + vd->vdev_open_error = vdev_open(vd); + vd->vdev_open_thread = NULL; +} + +boolean_t +vdev_uses_zvols(vdev_t *vd) +{ + if (vd->vdev_path && strncmp(vd->vdev_path, ZVOL_DIR, + strlen(ZVOL_DIR)) == 0) + return (B_TRUE); + for (int c = 0; c < vd->vdev_children; c++) + if (vdev_uses_zvols(vd->vdev_child[c])) + return (B_TRUE); + return (B_FALSE); +} + +void +vdev_open_children(vdev_t *vd) +{ + taskq_t *tq; + int children = vd->vdev_children; + + vd->vdev_nonrot = B_TRUE; + + /* + * in order to handle pools on top of zvols, do the opens + * in a single thread so that the same thread holds the + * spa_namespace_lock + */ + if (B_TRUE || vdev_uses_zvols(vd)) { + for (int c = 0; c < children; c++) { + vd->vdev_child[c]->vdev_open_error = + vdev_open(vd->vdev_child[c]); + vd->vdev_nonrot &= vd->vdev_child[c]->vdev_nonrot; + } + return; + } + tq = taskq_create("vdev_open", children, minclsyspri, + children, children, TASKQ_PREPOPULATE); + + for (int c = 0; c < children; c++) + VERIFY(taskq_dispatch(tq, vdev_open_child, vd->vdev_child[c], + TQ_SLEEP) != 0); + + taskq_destroy(tq); + + for (int c = 0; c < children; c++) + vd->vdev_nonrot &= vd->vdev_child[c]->vdev_nonrot; +} + +/* + * Compute the raidz-deflation ratio. Note, we hard-code + * in 128k (1 << 17) because it is the "typical" blocksize. + * Even though SPA_MAXBLOCKSIZE changed, this algorithm can not change, + * otherwise it would inconsistently account for existing bp's. + */ +static void +vdev_set_deflate_ratio(vdev_t *vd) +{ + if (vd == vd->vdev_top && !vd->vdev_ishole && vd->vdev_ashift != 0) { + vd->vdev_deflate_ratio = (1 << 17) / + (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT); + } +} + +/* + * Prepare a virtual device for access. 
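Editor's note: vdev_set_deflate_ratio() above deliberately hard-codes a 128K reference block so the ratio cannot drift when SPA_MAXBLOCKSIZE changes. A worked instance for a plain (non-raidz) top-level vdev, where this sketch assumes psize-to-asize is plain sector round-up (raidz would add parity and padding):

	#include <stdint.h>
	#include <stdio.h>

	#define SPA_MINBLOCKSHIFT	9	/* 512-byte units */

	/* psize -> asize for a plain vdev: round up to one sector. */
	static uint64_t
	psize_to_asize(uint64_t psize, uint64_t ashift)
	{
		uint64_t align = 1ULL << ashift;

		return ((psize + align - 1) & ~(align - 1));
	}

	int
	main(void)
	{
		/* ashift = 12: 131072 / (131072 >> 9) = 512. */
		uint64_t ratio = (1 << 17) /
		    (psize_to_asize(1 << 17, 12) >> SPA_MINBLOCKSHIFT);

		printf("deflate ratio = %llu\n", (unsigned long long)ratio);
		return (0);
	}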
+ */ +int +vdev_open(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + int error; + uint64_t osize = 0; + uint64_t max_osize = 0; + uint64_t asize, max_asize, psize; + uint64_t logical_ashift = 0; + uint64_t physical_ashift = 0; + + ASSERT(vd->vdev_open_thread == curthread || + spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); + ASSERT(vd->vdev_state == VDEV_STATE_CLOSED || + vd->vdev_state == VDEV_STATE_CANT_OPEN || + vd->vdev_state == VDEV_STATE_OFFLINE); + + vd->vdev_stat.vs_aux = VDEV_AUX_NONE; + vd->vdev_cant_read = B_FALSE; + vd->vdev_cant_write = B_FALSE; + vd->vdev_notrim = B_FALSE; + vd->vdev_min_asize = vdev_get_min_asize(vd); + + /* + * If this vdev is not removed, check its fault status. If it's + * faulted, bail out of the open. + */ + if (!vd->vdev_removed && vd->vdev_faulted) { + ASSERT(vd->vdev_children == 0); + ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || + vd->vdev_label_aux == VDEV_AUX_EXTERNAL); + vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, + vd->vdev_label_aux); + return (SET_ERROR(ENXIO)); + } else if (vd->vdev_offline) { + ASSERT(vd->vdev_children == 0); + vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE); + return (SET_ERROR(ENXIO)); + } + + error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize, + &logical_ashift, &physical_ashift); + + /* + * Reset the vdev_reopening flag so that we actually close + * the vdev on error. + */ + vd->vdev_reopening = B_FALSE; + if (zio_injection_enabled && error == 0) + error = zio_handle_device_injection(vd, NULL, ENXIO); + + if (error) { + if (vd->vdev_removed && + vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED) + vd->vdev_removed = B_FALSE; + + if (vd->vdev_stat.vs_aux == VDEV_AUX_CHILDREN_OFFLINE) { + vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, + vd->vdev_stat.vs_aux); + } else { + vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, + vd->vdev_stat.vs_aux); + } + return (error); + } + + vd->vdev_removed = B_FALSE; + + /* + * Recheck the faulted flag now that we have confirmed that + * the vdev is accessible. If we're faulted, bail. + */ + if (vd->vdev_faulted) { + ASSERT(vd->vdev_children == 0); + ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || + vd->vdev_label_aux == VDEV_AUX_EXTERNAL); + vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, + vd->vdev_label_aux); + return (SET_ERROR(ENXIO)); + } + + if (vd->vdev_degraded) { + ASSERT(vd->vdev_children == 0); + vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, + VDEV_AUX_ERR_EXCEEDED); + } else { + vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0); + } + + /* + * For hole or missing vdevs we just return success. 
+ */ + if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) + return (0); + + if (zfs_trim_enabled && !vd->vdev_notrim && vd->vdev_ops->vdev_op_leaf) + trim_map_create(vd); + + for (int c = 0; c < vd->vdev_children; c++) { + if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) { + vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, + VDEV_AUX_NONE); + break; + } + } + + osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t)); + max_osize = P2ALIGN(max_osize, (uint64_t)sizeof (vdev_label_t)); + + if (vd->vdev_children == 0) { + if (osize < SPA_MINDEVSIZE) { + vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_TOO_SMALL); + return (SET_ERROR(EOVERFLOW)); + } + psize = osize; + asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE); + max_asize = max_osize - (VDEV_LABEL_START_SIZE + + VDEV_LABEL_END_SIZE); + } else { + if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE - + (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) { + vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_TOO_SMALL); + return (SET_ERROR(EOVERFLOW)); + } + psize = 0; + asize = osize; + max_asize = max_osize; + } + + vd->vdev_psize = psize; + + /* + * Make sure the allocatable size hasn't shrunk too much. + */ + if (asize < vd->vdev_min_asize) { + vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_BAD_LABEL); + return (SET_ERROR(EINVAL)); + } + + vd->vdev_physical_ashift = + MAX(physical_ashift, vd->vdev_physical_ashift); + vd->vdev_logical_ashift = MAX(logical_ashift, vd->vdev_logical_ashift); + vd->vdev_ashift = MAX(vd->vdev_logical_ashift, vd->vdev_ashift); + + if (vd->vdev_logical_ashift > SPA_MAXASHIFT) { + vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_ASHIFT_TOO_BIG); + return (EINVAL); + } + + if (vd->vdev_asize == 0) { + /* + * This is the first-ever open, so use the computed values. + * For testing purposes, a higher ashift can be requested. + */ + vd->vdev_asize = asize; + vd->vdev_max_asize = max_asize; + } else { + /* + * Make sure the alignment requirement hasn't increased. + */ + if (vd->vdev_ashift > vd->vdev_top->vdev_ashift && + vd->vdev_ops->vdev_op_leaf) { + vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_BAD_LABEL); + return (EINVAL); + } + vd->vdev_max_asize = max_asize; + } + + /* + * If all children are healthy we update asize if either: + * The asize has increased, due to a device expansion caused by dynamic + * LUN growth or vdev replacement, and automatic expansion is enabled; + * making the additional space available. + * + * The asize has decreased, due to a device shrink usually caused by a + * vdev replace with a smaller device. This ensures that calculations + * based of max_asize and asize e.g. esize are always valid. It's safe + * to do this as we've already validated that asize is greater than + * vdev_min_asize. + */ + if (vd->vdev_state == VDEV_STATE_HEALTHY && + ((asize > vd->vdev_asize && + (vd->vdev_expanding || spa->spa_autoexpand)) || + (asize < vd->vdev_asize))) + vd->vdev_asize = asize; + + vdev_set_min_asize(vd); + + /* + * Ensure we can issue some IO before declaring the + * vdev open for business. + */ + if (vd->vdev_ops->vdev_op_leaf && + (error = zio_wait(vdev_probe(vd, NULL))) != 0) { + vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, + VDEV_AUX_ERR_EXCEEDED); + return (error); + } + + /* + * Track the min and max ashift values for normal data devices. 
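Editor's note: the childless branch of vdev_open() above trims osize to a whole number of labels and then carves the reserved label areas out of the allocatable size. The constants in this sketch are assumptions matching the conventional ZFS layout (four 256K labels, two per end, plus a 3.5M boot region up front) rather than values quoted from this file:

	#include <stdint.h>
	#include <stdio.h>

	#define LABEL_SIZE		(256ULL << 10)
	#define LABEL_START_SIZE	(2 * LABEL_SIZE + (3584ULL << 10))
	#define LABEL_END_SIZE		(2 * LABEL_SIZE)

	int
	main(void)
	{
		uint64_t osize = 100ULL << 30;	/* a 100 GiB leaf */
		uint64_t asize;

		/* P2ALIGN to a whole label, then subtract reservations. */
		osize &= ~(LABEL_SIZE - 1);
		asize = osize - (LABEL_START_SIZE + LABEL_END_SIZE);

		printf("allocatable = %llu (%llu reserved)\n",
		    (unsigned long long)asize, (unsigned long long)
		    (LABEL_START_SIZE + LABEL_END_SIZE));
		return (0);
	}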
+ */ + if (vd->vdev_top == vd && vd->vdev_ashift != 0 && + !vd->vdev_islog && vd->vdev_aux == NULL) { + if (vd->vdev_ashift > spa->spa_max_ashift) + spa->spa_max_ashift = vd->vdev_ashift; + if (vd->vdev_ashift < spa->spa_min_ashift) + spa->spa_min_ashift = vd->vdev_ashift; + } + + /* + * If a leaf vdev has a DTL, and seems healthy, then kick off a + * resilver. But don't do this if we are doing a reopen for a scrub, + * since this would just restart the scrub we are already doing. + */ + if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen && + vdev_resilver_needed(vd, NULL, NULL)) + spa_async_request(spa, SPA_ASYNC_RESILVER); + + return (0); +} + +/* + * Called once the vdevs are all opened, this routine validates the label + * contents. This needs to be done before vdev_load() so that we don't + * inadvertently do repair I/Os to the wrong device. + * + * This function will only return failure if one of the vdevs indicates that it + * has since been destroyed or exported. This is only possible if + * /etc/zfs/zpool.cache was readonly at the time. Otherwise, the vdev state + * will be updated but the function will return 0. + */ +int +vdev_validate(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + nvlist_t *label; + uint64_t guid = 0, aux_guid = 0, top_guid; + uint64_t state; + nvlist_t *nvl; + uint64_t txg; + + if (vdev_validate_skip) + return (0); + + for (uint64_t c = 0; c < vd->vdev_children; c++) + if (vdev_validate(vd->vdev_child[c]) != 0) + return (SET_ERROR(EBADF)); + + /* + * If the device has already failed, or was marked offline, don't do + * any further validation. Otherwise, label I/O will fail and we will + * overwrite the previous state. + */ + if (!vd->vdev_ops->vdev_op_leaf || !vdev_readable(vd)) + return (0); + + /* + * If we are performing an extreme rewind, we allow for a label that + * was modified at a point after the current txg. + * If config lock is not held do not check for the txg. spa_sync could + * be updating the vdev's label before updating spa_last_synced_txg. + */ + if (spa->spa_extreme_rewind || spa_last_synced_txg(spa) == 0 || + spa_config_held(spa, SCL_CONFIG, RW_WRITER) != SCL_CONFIG) + txg = UINT64_MAX; + else + txg = spa_last_synced_txg(spa); + + if ((label = vdev_label_read_config(vd, txg)) == NULL) { + vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_BAD_LABEL); + vdev_dbgmsg(vd, "vdev_validate: failed reading config for " + "txg %llu", (u_longlong_t)txg); + return (0); + } + + /* + * Determine if this vdev has been split off into another + * pool. If so, then refuse to open it. + */ + if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID, + &aux_guid) == 0 && aux_guid == spa_guid(spa)) { + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_SPLIT_POOL); + nvlist_free(label); + vdev_dbgmsg(vd, "vdev_validate: vdev split into other pool"); + return (0); + } + + if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &guid) != 0) { + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + nvlist_free(label); + vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", + ZPOOL_CONFIG_POOL_GUID); + return (0); + } + + /* + * If config is not trusted then ignore the spa guid check. This is + * necessary because if the machine crashed during a re-guid the new + * guid might have been written to all of the vdev labels, but not the + * cached config. The check will be performed again once we have the + * trusted config from the MOS. 
+ */ + if (spa->spa_trust_config && guid != spa_guid(spa)) { + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + nvlist_free(label); + vdev_dbgmsg(vd, "vdev_validate: vdev label pool_guid doesn't " + "match config (%llu != %llu)", (u_longlong_t)guid, + (u_longlong_t)spa_guid(spa)); + return (0); + } + + if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl) + != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID, + &aux_guid) != 0) + aux_guid = 0; + + if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0) { + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + nvlist_free(label); + vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", + ZPOOL_CONFIG_GUID); + return (0); + } + + if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID, &top_guid) + != 0) { + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + nvlist_free(label); + vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", + ZPOOL_CONFIG_TOP_GUID); + return (0); + } + + /* + * If this vdev just became a top-level vdev because its sibling was + * detached, it will have adopted the parent's vdev guid -- but the + * label may or may not be on disk yet. Fortunately, either version + * of the label will have the same top guid, so if we're a top-level + * vdev, we can safely compare to that instead. + * However, if the config comes from a cachefile that failed to update + * after the detach, a top-level vdev will appear as a non top-level + * vdev in the config. Also relax the constraints if we perform an + * extreme rewind. + * + * If we split this vdev off instead, then we also check the + * original pool's guid. We don't want to consider the vdev + * corrupt if it is partway through a split operation. + */ + if (vd->vdev_guid != guid && vd->vdev_guid != aux_guid) { + boolean_t mismatch = B_FALSE; + if (spa->spa_trust_config && !spa->spa_extreme_rewind) { + if (vd != vd->vdev_top || vd->vdev_guid != top_guid) + mismatch = B_TRUE; + } else { + if (vd->vdev_guid != top_guid && + vd->vdev_top->vdev_guid != guid) + mismatch = B_TRUE; + } + + if (mismatch) { + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + nvlist_free(label); + vdev_dbgmsg(vd, "vdev_validate: config guid " + "doesn't match label guid"); + vdev_dbgmsg(vd, "CONFIG: guid %llu, top_guid %llu", + (u_longlong_t)vd->vdev_guid, + (u_longlong_t)vd->vdev_top->vdev_guid); + vdev_dbgmsg(vd, "LABEL: guid %llu, top_guid %llu, " + "aux_guid %llu", (u_longlong_t)guid, + (u_longlong_t)top_guid, (u_longlong_t)aux_guid); + return (0); + } + } + + if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, + &state) != 0) { + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + nvlist_free(label); + vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", + ZPOOL_CONFIG_POOL_STATE); + return (0); + } + + nvlist_free(label); + + /* + * If this is a verbatim import, no need to check the + * state of the pool. + */ + if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) && + spa_load_state(spa) == SPA_LOAD_OPEN && + state != POOL_STATE_ACTIVE) { + vdev_dbgmsg(vd, "vdev_validate: invalid pool state (%llu) " + "for spa %s", (u_longlong_t)state, spa->spa_name); + return (SET_ERROR(EBADF)); + } + + /* + * If we were able to open and validate a vdev that was + * previously marked permanently unavailable, clear that state + * now. 
+ */ + if (vd->vdev_not_present) + vd->vdev_not_present = 0; + + return (0); +} + +static void +vdev_copy_path_impl(vdev_t *svd, vdev_t *dvd) +{ + if (svd->vdev_path != NULL && dvd->vdev_path != NULL) { + if (strcmp(svd->vdev_path, dvd->vdev_path) != 0) { + zfs_dbgmsg("vdev_copy_path: vdev %llu: path changed " + "from '%s' to '%s'", (u_longlong_t)dvd->vdev_guid, + dvd->vdev_path, svd->vdev_path); + spa_strfree(dvd->vdev_path); + dvd->vdev_path = spa_strdup(svd->vdev_path); + } + } else if (svd->vdev_path != NULL) { + dvd->vdev_path = spa_strdup(svd->vdev_path); + zfs_dbgmsg("vdev_copy_path: vdev %llu: path set to '%s'", + (u_longlong_t)dvd->vdev_guid, dvd->vdev_path); + } +} + +/* + * Recursively copy vdev paths from one vdev to another. Source and destination + * vdev trees must have same geometry otherwise return error. Intended to copy + * paths from userland config into MOS config. + */ +int +vdev_copy_path_strict(vdev_t *svd, vdev_t *dvd) +{ + if ((svd->vdev_ops == &vdev_missing_ops) || + (svd->vdev_ishole && dvd->vdev_ishole) || + (dvd->vdev_ops == &vdev_indirect_ops)) + return (0); + + if (svd->vdev_ops != dvd->vdev_ops) { + vdev_dbgmsg(svd, "vdev_copy_path: vdev type mismatch: %s != %s", + svd->vdev_ops->vdev_op_type, dvd->vdev_ops->vdev_op_type); + return (SET_ERROR(EINVAL)); + } + + if (svd->vdev_guid != dvd->vdev_guid) { + vdev_dbgmsg(svd, "vdev_copy_path: guids mismatch (%llu != " + "%llu)", (u_longlong_t)svd->vdev_guid, + (u_longlong_t)dvd->vdev_guid); + return (SET_ERROR(EINVAL)); + } + + if (svd->vdev_children != dvd->vdev_children) { + vdev_dbgmsg(svd, "vdev_copy_path: children count mismatch: " + "%llu != %llu", (u_longlong_t)svd->vdev_children, + (u_longlong_t)dvd->vdev_children); + return (SET_ERROR(EINVAL)); + } + + for (uint64_t i = 0; i < svd->vdev_children; i++) { + int error = vdev_copy_path_strict(svd->vdev_child[i], + dvd->vdev_child[i]); + if (error != 0) + return (error); + } + + if (svd->vdev_ops->vdev_op_leaf) + vdev_copy_path_impl(svd, dvd); + + return (0); +} + +static void +vdev_copy_path_search(vdev_t *stvd, vdev_t *dvd) +{ + ASSERT(stvd->vdev_top == stvd); + ASSERT3U(stvd->vdev_id, ==, dvd->vdev_top->vdev_id); + + for (uint64_t i = 0; i < dvd->vdev_children; i++) { + vdev_copy_path_search(stvd, dvd->vdev_child[i]); + } + + if (!dvd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(dvd)) + return; + + /* + * The idea here is that while a vdev can shift positions within + * a top vdev (when replacing, attaching mirror, etc.) it cannot + * step outside of it. + */ + vdev_t *vd = vdev_lookup_by_guid(stvd, dvd->vdev_guid); + + if (vd == NULL || vd->vdev_ops != dvd->vdev_ops) + return; + + ASSERT(vd->vdev_ops->vdev_op_leaf); + + vdev_copy_path_impl(vd, dvd); +} + +/* + * Recursively copy vdev paths from one root vdev to another. Source and + * destination vdev trees may differ in geometry. For each destination leaf + * vdev, search a vdev with the same guid and top vdev id in the source. + * Intended to copy paths from userland config into MOS config. + */ +void +vdev_copy_path_relaxed(vdev_t *srvd, vdev_t *drvd) +{ + uint64_t children = MIN(srvd->vdev_children, drvd->vdev_children); + ASSERT(srvd->vdev_ops == &vdev_root_ops); + ASSERT(drvd->vdev_ops == &vdev_root_ops); + + for (uint64_t i = 0; i < children; i++) { + vdev_copy_path_search(srvd->vdev_child[i], + drvd->vdev_child[i]); + } +} + +/* + * Close a virtual device. 
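Editor's note: vdev_copy_path_strict() above demands isomorphic trees: a guid or child-count mismatch anywhere aborts the entire copy, which is why the relaxed by-guid search exists as a fallback. The recursion skeleton, with the nvlist-backed vdev reduced to a bare struct and leaf detection simplified to nchildren == 0 (both simplifications of this sketch):

	#include <stdint.h>
	#include <stdlib.h>
	#include <string.h>
	#include <errno.h>

	struct pnode {
		uint64_t guid;
		char *path;
		int nchildren;
		struct pnode **child;
	};

	int
	copy_paths_strict(const struct pnode *src, struct pnode *dst)
	{
		if (src->guid != dst->guid ||
		    src->nchildren != dst->nchildren)
			return (EINVAL);

		for (int i = 0; i < src->nchildren; i++) {
			int err = copy_paths_strict(src->child[i],
			    dst->child[i]);
			if (err != 0)
				return (err);
		}

		if (src->nchildren == 0 && src->path != NULL) {	/* leaf */
			free(dst->path);
			dst->path = strdup(src->path);
		}
		return (0);
	}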
+ */ +void +vdev_close(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + vdev_t *pvd = vd->vdev_parent; + + ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); + + /* + * If our parent is reopening, then we are as well, unless we are + * going offline. + */ + if (pvd != NULL && pvd->vdev_reopening) + vd->vdev_reopening = (pvd->vdev_reopening && !vd->vdev_offline); + + vd->vdev_ops->vdev_op_close(vd); + + vdev_cache_purge(vd); + + if (vd->vdev_ops->vdev_op_leaf) + trim_map_destroy(vd); + + /* + * We record the previous state before we close it, so that if we are + * doing a reopen(), we don't generate FMA ereports if we notice that + * it's still faulted. + */ + vd->vdev_prevstate = vd->vdev_state; + + if (vd->vdev_offline) + vd->vdev_state = VDEV_STATE_OFFLINE; + else + vd->vdev_state = VDEV_STATE_CLOSED; + vd->vdev_stat.vs_aux = VDEV_AUX_NONE; +} + +void +vdev_hold(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + + ASSERT(spa_is_root(spa)); + if (spa->spa_state == POOL_STATE_UNINITIALIZED) + return; + + for (int c = 0; c < vd->vdev_children; c++) + vdev_hold(vd->vdev_child[c]); + + if (vd->vdev_ops->vdev_op_leaf) + vd->vdev_ops->vdev_op_hold(vd); +} + +void +vdev_rele(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + + ASSERT(spa_is_root(spa)); + for (int c = 0; c < vd->vdev_children; c++) + vdev_rele(vd->vdev_child[c]); + + if (vd->vdev_ops->vdev_op_leaf) + vd->vdev_ops->vdev_op_rele(vd); +} + +/* + * Reopen all interior vdevs and any unopened leaves. We don't actually + * reopen leaf vdevs which had previously been opened as they might deadlock + * on the spa_config_lock. Instead we only obtain the leaf's physical size. + * If the leaf has never been opened then open it, as usual. + */ +void +vdev_reopen(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + + ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); + + /* set the reopening flag unless we're taking the vdev offline */ + vd->vdev_reopening = !vd->vdev_offline; + vdev_close(vd); + (void) vdev_open(vd); + + /* + * Call vdev_validate() here to make sure we have the same device. + * Otherwise, a device with an invalid label could be successfully + * opened in response to vdev_reopen(). + */ + if (vd->vdev_aux) { + (void) vdev_validate_aux(vd); + if (vdev_readable(vd) && vdev_writeable(vd) && + vd->vdev_aux == &spa->spa_l2cache && + !l2arc_vdev_present(vd)) + l2arc_add_vdev(spa, vd); + } else { + (void) vdev_validate(vd); + } + + /* + * Reassess parent vdev's health. + */ + vdev_propagate_state(vd); +} + +int +vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing) +{ + int error; + + /* + * Normally, partial opens (e.g. of a mirror) are allowed. + * For a create, however, we want to fail the request if + * there are any components we can't open. + */ + error = vdev_open(vd); + + if (error || vd->vdev_state != VDEV_STATE_HEALTHY) { + vdev_close(vd); + return (error ? error : ENXIO); + } + + /* + * Recursively load DTLs and initialize all labels. + */ + if ((error = vdev_dtl_load(vd)) != 0 || + (error = vdev_label_init(vd, txg, isreplacing ? + VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) { + vdev_close(vd); + return (error); + } + + return (0); +} + +void +vdev_metaslab_set_size(vdev_t *vd) +{ + uint64_t asize = vd->vdev_asize; + uint64_t ms_count = asize >> vdev_default_ms_shift; + uint64_t ms_shift; + + /* + * There are two dimensions to the metaslab sizing calculation: + * the size of the metaslab and the count of metaslabs per vdev. + * In general, we aim for vdev_max_ms_count (200) metaslabs. 
The
+ * ranges of the dimensions are as follows:
+ *
+ * 2^29 <= ms_size <= 2^38
+ * 16 <= ms_count <= 131,072
+ *
+ * On the lower end of vdev sizes, we aim for metaslab sizes of
+ * at least 512MB (2^29) to minimize fragmentation effects when
+ * testing with smaller devices. However, the count constraint
+ * of at least 16 metaslabs will override this minimum size goal.
+ *
+ * On the upper end of vdev sizes, we aim for a maximum metaslab
+ * size of 256GB. However, we will cap the total count to 2^17
+ * metaslabs to keep our memory footprint in check.
+ *
+ * The net effect of applying the above constraints is summarized below.
+ *
+ * vdev size metaslab count
+ * -------------|-----------------
+ * < 8GB ~16
+ * 8GB - 100GB one per 512MB
+ * 100GB - 50TB ~200
+ * 50TB - 32PB one per 256GB
+ * > 32PB ~131,072
+ * -------------------------------
+ */
+
+ if (ms_count < vdev_min_ms_count)
+ ms_shift = highbit64(asize / vdev_min_ms_count);
+ else if (ms_count > vdev_max_ms_count)
+ ms_shift = highbit64(asize / vdev_max_ms_count);
+ else
+ ms_shift = vdev_default_ms_shift;
+
+ if (ms_shift < SPA_MAXBLOCKSHIFT) {
+ ms_shift = SPA_MAXBLOCKSHIFT;
+ } else if (ms_shift > vdev_max_ms_shift) {
+ ms_shift = vdev_max_ms_shift;
+ /* cap the total count to constrain memory footprint */
+ if ((asize >> ms_shift) > vdev_ms_count_limit)
+ ms_shift = highbit64(asize / vdev_ms_count_limit);
+ }
+
+ vd->vdev_ms_shift = ms_shift;
+ ASSERT3U(vd->vdev_ms_shift, >=, SPA_MAXBLOCKSHIFT);
+}
+
+/*
+ * Maximize performance by inflating the configured ashift for top level
+ * vdevs to be as close to the physical ashift as possible while maintaining
+ * administrator defined limits and ensuring it doesn't go below the
+ * logical ashift.
+ */
+void
+vdev_ashift_optimize(vdev_t *vd)
+{
+ if (vd == vd->vdev_top) {
+ if (vd->vdev_ashift < vd->vdev_physical_ashift) {
+ vd->vdev_ashift = MIN(
+ MAX(zfs_max_auto_ashift, vd->vdev_ashift),
+ MAX(zfs_min_auto_ashift, vd->vdev_physical_ashift));
+ } else {
+ /*
+ * Unusual case where logical ashift > physical ashift
+ * so we can't cap the calculated ashift based on max
+ * ashift as that would cause failures.
+ * We still check if we need to increase it to match
+ * the min ashift.
+ */
+ vd->vdev_ashift = MAX(zfs_min_auto_ashift,
+ vd->vdev_ashift);
+ }
+ }
+}
+
+void
+vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
+{
+ ASSERT(vd == vd->vdev_top);
+ /* indirect vdevs don't have metaslabs or dtls */
+ ASSERT(vdev_is_concrete(vd) || flags == 0);
+ ASSERT(ISP2(flags));
+ ASSERT(spa_writeable(vd->vdev_spa));
+
+ if (flags & VDD_METASLAB)
+ (void) txg_list_add(&vd->vdev_ms_list, arg, txg);
+
+ if (flags & VDD_DTL)
+ (void) txg_list_add(&vd->vdev_dtl_list, arg, txg);
+
+ (void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg);
+}
+
+void
+vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg)
+{
+ for (int c = 0; c < vd->vdev_children; c++)
+ vdev_dirty_leaves(vd->vdev_child[c], flags, txg);
+
+ if (vd->vdev_ops->vdev_op_leaf)
+ vdev_dirty(vd->vdev_top, flags, vd, txg);
+}
+
+/*
+ * DTLs.
+ *
+ * A vdev's DTL (dirty time log) is the set of transaction groups for which
+ * the vdev has less than perfect replication. 
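+ * For example, if a child of a mirror is unreachable from txg 100
+ * through txg 150, those txgs land in its DTL, and a resilver must
+ * rewrite onto it everything born in that window.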
There are four kinds of DTL:
+ *
+ * DTL_MISSING: txgs for which the vdev has no valid copies of the data
+ *
+ * DTL_PARTIAL: txgs for which data is available, but not fully replicated
+ *
+ * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon
+ * scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of
+ * txgs that was scrubbed.
+ *
+ * DTL_OUTAGE: txgs which cannot currently be read, whether due to
+ * persistent errors or just some device being offline.
+ * Unlike the other three, the DTL_OUTAGE map is not generally
+ * maintained; it's only computed when needed, typically to
+ * determine whether a device can be detached.
+ *
+ * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device
+ * either has the data or it doesn't.
+ *
+ * For interior vdevs such as mirror and RAID-Z the picture is more complex.
+ * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because
+ * if any child is less than fully replicated, then so is its parent.
+ * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs,
+ * comprising only those txgs which appear in more than 'maxfaults' children;
+ * those are the txgs we don't have enough replication to read. For example,
+ * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2);
+ * thus, its DTL_MISSING consists of the set of txgs that appear in more than
+ * two child DTL_MISSING maps.
+ *
+ * It should be clear from the above that to compute the DTLs and outage maps
+ * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps.
+ * Therefore, that is all we keep on disk. When loading the pool, or after
+ * a configuration change, we generate all other DTLs from first principles.
+ */
+void
+vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
+{
+ range_tree_t *rt = vd->vdev_dtl[t];
+
+ ASSERT(t < DTL_TYPES);
+ ASSERT(vd != vd->vdev_spa->spa_root_vdev);
+ ASSERT(spa_writeable(vd->vdev_spa));
+
+ mutex_enter(&vd->vdev_dtl_lock);
+ if (!range_tree_contains(rt, txg, size))
+ range_tree_add(rt, txg, size);
+ mutex_exit(&vd->vdev_dtl_lock);
+}
+
+boolean_t
+vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
+{
+ range_tree_t *rt = vd->vdev_dtl[t];
+ boolean_t dirty = B_FALSE;
+
+ ASSERT(t < DTL_TYPES);
+ ASSERT(vd != vd->vdev_spa->spa_root_vdev);
+
+ /*
+ * While we are loading the pool, the DTLs have not been loaded yet.
+ * Ignore the DTLs and try all devices. This avoids a recursive
+ * mutex enter on the vdev_dtl_lock, and also makes us try hard
+ * when loading the pool (relying on the checksum to ensure that
+ * we get the right data -- note that while loading, we are
+ * only reading the MOS, which is always checksummed).
+ */
+ if (vd->vdev_spa->spa_load_state != SPA_LOAD_NONE)
+ return (B_FALSE);
+
+ mutex_enter(&vd->vdev_dtl_lock);
+ if (!range_tree_is_empty(rt))
+ dirty = range_tree_contains(rt, txg, size);
+ mutex_exit(&vd->vdev_dtl_lock);
+
+ return (dirty);
+}
+
+boolean_t
+vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
+{
+ range_tree_t *rt = vd->vdev_dtl[t];
+ boolean_t empty;
+
+ mutex_enter(&vd->vdev_dtl_lock);
+ empty = range_tree_is_empty(rt);
+ mutex_exit(&vd->vdev_dtl_lock);
+
+ return (empty);
+}
+
+/*
+ * Returns B_TRUE if vdev determines offset needs to be resilvered. 
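+ * Leaf vdevs, and vdev types that provide no vdev_op_need_resilver
+ * hook, conservatively answer B_TRUE; only a vdev type that
+ * understands its own layout can override this to rule out offsets
+ * that do not map onto the device being resilvered.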
+ */
+boolean_t
+vdev_dtl_need_resilver(vdev_t *vd, uint64_t offset, size_t psize)
+{
+ ASSERT(vd != vd->vdev_spa->spa_root_vdev);
+
+ if (vd->vdev_ops->vdev_op_need_resilver == NULL ||
+ vd->vdev_ops->vdev_op_leaf)
+ return (B_TRUE);
+
+ return (vd->vdev_ops->vdev_op_need_resilver(vd, offset, psize));
+}
+
+/*
+ * Returns the lowest txg in the DTL range.
+ */
+static uint64_t
+vdev_dtl_min(vdev_t *vd)
+{
+ range_seg_t *rs;
+
+ ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
+ ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0);
+ ASSERT0(vd->vdev_children);
+
+ rs = avl_first(&vd->vdev_dtl[DTL_MISSING]->rt_root);
+ return (rs->rs_start - 1);
+}
+
+/*
+ * Returns the highest txg in the DTL.
+ */
+static uint64_t
+vdev_dtl_max(vdev_t *vd)
+{
+ range_seg_t *rs;
+
+ ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
+ ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0);
+ ASSERT0(vd->vdev_children);
+
+ rs = avl_last(&vd->vdev_dtl[DTL_MISSING]->rt_root);
+ return (rs->rs_end);
+}
+
+/*
+ * Determine if a resilvering vdev should remove any DTL entries from
+ * its range. If the vdev was resilvering for the entire duration of the
+ * scan then it should excise that range from its DTLs. Otherwise, this
+ * vdev is considered partially resilvered and should leave its DTL
+ * entries intact. The comment in vdev_dtl_reassess() describes how we
+ * excise the DTLs.
+ */
+static boolean_t
+vdev_dtl_should_excise(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+ dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
+
+ ASSERT0(scn->scn_phys.scn_errors);
+ ASSERT0(vd->vdev_children);
+
+ if (vd->vdev_state < VDEV_STATE_DEGRADED)
+ return (B_FALSE);
+
+ if (vd->vdev_resilver_txg == 0 ||
+ range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]))
+ return (B_TRUE);
+
+ /*
+ * When a resilver is initiated the scan will assign the scn_max_txg
+ * value to the highest txg value that exists in all DTLs. If this
+ * device's max DTL is not part of this scan (i.e. it is not in
+ * the range (scn_min_txg, scn_max_txg]), then it is not eligible
+ * for excision.
+ */
+ if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) {
+ ASSERT3U(scn->scn_phys.scn_min_txg, <=, vdev_dtl_min(vd));
+ ASSERT3U(scn->scn_phys.scn_min_txg, <, vd->vdev_resilver_txg);
+ ASSERT3U(vd->vdev_resilver_txg, <=, scn->scn_phys.scn_max_txg);
+ return (B_TRUE);
+ }
+ return (B_FALSE);
+}
+
+/*
+ * Reassess DTLs after a config change or scrub completion.
+ */
+void
+vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
+{
+ spa_t *spa = vd->vdev_spa;
+ avl_tree_t reftree;
+ int minref;
+
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
+
+ for (int c = 0; c < vd->vdev_children; c++)
+ vdev_dtl_reassess(vd->vdev_child[c], txg,
+ scrub_txg, scrub_done);
+
+ if (vd == spa->spa_root_vdev || !vdev_is_concrete(vd) || vd->vdev_aux)
+ return;
+
+ if (vd->vdev_ops->vdev_op_leaf) {
+ dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
+
+ mutex_enter(&vd->vdev_dtl_lock);
+
+ /*
+ * If we've completed a scan cleanly then determine
+ * if this vdev should remove any DTLs. We only want to
+ * excise regions on vdevs that were available during
+ * the entire duration of this scan.
+ */
+ if (scrub_txg != 0 &&
+ (spa->spa_scrub_started ||
+ (scn != NULL && scn->scn_phys.scn_errors == 0)) &&
+ vdev_dtl_should_excise(vd)) {
+ /*
+ * We completed a scrub up to scrub_txg. If we
+ * did it without rebooting, then the scrub dtl
+ * will be valid, so excise the old region and
+ * fold in the scrub dtl. Otherwise, leave the
+ * dtl as-is if there was an error. 
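+ * The net effect is that txgs the scrub could
+ * repair drop out of DTL_MISSING, while txgs it
+ * could not fix (recorded in DTL_SCRUB) are
+ * folded back in.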
+ *
+ * There's a little trick here: to excise the beginning
+ * of the DTL_MISSING map, we put it into a reference
+ * tree and then add a segment with refcnt -1 that
+ * covers the range [0, scrub_txg). This means
+ * that each txg in that range has refcnt -1 or 0.
+ * We then add DTL_SCRUB with a refcnt of 2, so that
+ * entries in the range [0, scrub_txg) will have a
+ * positive refcnt -- either 1 or 2. We then convert
+ * the reference tree into the new DTL_MISSING map.
+ */
+ space_reftree_create(&reftree);
+ space_reftree_add_map(&reftree,
+ vd->vdev_dtl[DTL_MISSING], 1);
+ space_reftree_add_seg(&reftree, 0, scrub_txg, -1);
+ space_reftree_add_map(&reftree,
+ vd->vdev_dtl[DTL_SCRUB], 2);
+ space_reftree_generate_map(&reftree,
+ vd->vdev_dtl[DTL_MISSING], 1);
+ space_reftree_destroy(&reftree);
+ }
+ range_tree_vacate(vd->vdev_dtl[DTL_PARTIAL], NULL, NULL);
+ range_tree_walk(vd->vdev_dtl[DTL_MISSING],
+ range_tree_add, vd->vdev_dtl[DTL_PARTIAL]);
+ if (scrub_done)
+ range_tree_vacate(vd->vdev_dtl[DTL_SCRUB], NULL, NULL);
+ range_tree_vacate(vd->vdev_dtl[DTL_OUTAGE], NULL, NULL);
+ if (!vdev_readable(vd))
+ range_tree_add(vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL);
+ else
+ range_tree_walk(vd->vdev_dtl[DTL_MISSING],
+ range_tree_add, vd->vdev_dtl[DTL_OUTAGE]);
+
+ /*
+ * If the vdev was resilvering and no longer has any
+ * DTLs then reset its resilvering flag and dirty
+ * the top level so that we persist the change.
+ */
+ if (vd->vdev_resilver_txg != 0 &&
+ range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) &&
+ range_tree_is_empty(vd->vdev_dtl[DTL_OUTAGE])) {
+ vd->vdev_resilver_txg = 0;
+ vdev_config_dirty(vd->vdev_top);
+ }
+
+ mutex_exit(&vd->vdev_dtl_lock);
+
+ if (txg != 0)
+ vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
+ return;
+ }
+
+ mutex_enter(&vd->vdev_dtl_lock);
+ for (int t = 0; t < DTL_TYPES; t++) {
+ /* account for child's outage in parent's missing map */
+ int s = (t == DTL_MISSING) ? DTL_OUTAGE : t;
+ if (t == DTL_SCRUB)
+ continue; /* leaf vdevs only */
+ if (t == DTL_PARTIAL)
+ minref = 1; /* i.e. non-zero */
+ else if (vd->vdev_nparity != 0)
+ minref = vd->vdev_nparity + 1; /* RAID-Z */
+ else
+ minref = vd->vdev_children; /* any kind of mirror */
+ space_reftree_create(&reftree);
+ for (int c = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+ mutex_enter(&cvd->vdev_dtl_lock);
+ space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1);
+ mutex_exit(&cvd->vdev_dtl_lock);
+ }
+ space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref);
+ space_reftree_destroy(&reftree);
+ }
+ mutex_exit(&vd->vdev_dtl_lock);
+}
+
+int
+vdev_dtl_load(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+ objset_t *mos = spa->spa_meta_objset;
+ int error = 0;
+
+ if (vd->vdev_ops->vdev_op_leaf && vd->vdev_dtl_object != 0) {
+ ASSERT(vdev_is_concrete(vd));
+
+ error = space_map_open(&vd->vdev_dtl_sm, mos,
+ vd->vdev_dtl_object, 0, -1ULL, 0);
+ if (error)
+ return (error);
+ ASSERT(vd->vdev_dtl_sm != NULL);
+
+ mutex_enter(&vd->vdev_dtl_lock);
+
+ /*
+ * Now that we've opened the space_map we need to update
+ * the in-core DTL. 
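+ * space_map_update() refreshes the in-core allocation totals
+ * from the map header, and space_map_load() below replays the
+ * on-disk log of ALLOC/FREE records into the DTL_MISSING range
+ * tree -- DTLs reuse the space map machinery, with txgs playing
+ * the role of offsets.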
+ */ + space_map_update(vd->vdev_dtl_sm); + + error = space_map_load(vd->vdev_dtl_sm, + vd->vdev_dtl[DTL_MISSING], SM_ALLOC); + mutex_exit(&vd->vdev_dtl_lock); + + return (error); + } + + for (int c = 0; c < vd->vdev_children; c++) { + error = vdev_dtl_load(vd->vdev_child[c]); + if (error != 0) + break; + } + + return (error); +} + +void +vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj, dmu_tx_t *tx) +{ + spa_t *spa = vd->vdev_spa; + + VERIFY0(zap_destroy(spa->spa_meta_objset, zapobj, tx)); + VERIFY0(zap_remove_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, + zapobj, tx)); +} + +uint64_t +vdev_create_link_zap(vdev_t *vd, dmu_tx_t *tx) +{ + spa_t *spa = vd->vdev_spa; + uint64_t zap = zap_create(spa->spa_meta_objset, DMU_OTN_ZAP_METADATA, + DMU_OT_NONE, 0, tx); + + ASSERT(zap != 0); + VERIFY0(zap_add_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, + zap, tx)); + + return (zap); +} + +void +vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx) +{ + if (vd->vdev_ops != &vdev_hole_ops && + vd->vdev_ops != &vdev_missing_ops && + vd->vdev_ops != &vdev_root_ops && + !vd->vdev_top->vdev_removing) { + if (vd->vdev_ops->vdev_op_leaf && vd->vdev_leaf_zap == 0) { + vd->vdev_leaf_zap = vdev_create_link_zap(vd, tx); + } + if (vd == vd->vdev_top && vd->vdev_top_zap == 0) { + vd->vdev_top_zap = vdev_create_link_zap(vd, tx); + } + } + for (uint64_t i = 0; i < vd->vdev_children; i++) { + vdev_construct_zaps(vd->vdev_child[i], tx); + } +} + +void +vdev_dtl_sync(vdev_t *vd, uint64_t txg) +{ + spa_t *spa = vd->vdev_spa; + range_tree_t *rt = vd->vdev_dtl[DTL_MISSING]; + objset_t *mos = spa->spa_meta_objset; + range_tree_t *rtsync; + dmu_tx_t *tx; + uint64_t object = space_map_object(vd->vdev_dtl_sm); + + ASSERT(vdev_is_concrete(vd)); + ASSERT(vd->vdev_ops->vdev_op_leaf); + + tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); + + if (vd->vdev_detached || vd->vdev_top->vdev_removing) { + mutex_enter(&vd->vdev_dtl_lock); + space_map_free(vd->vdev_dtl_sm, tx); + space_map_close(vd->vdev_dtl_sm); + vd->vdev_dtl_sm = NULL; + mutex_exit(&vd->vdev_dtl_lock); + + /* + * We only destroy the leaf ZAP for detached leaves or for + * removed log devices. Removed data devices handle leaf ZAP + * cleanup later, once cancellation is no longer possible. + */ + if (vd->vdev_leaf_zap != 0 && (vd->vdev_detached || + vd->vdev_top->vdev_islog)) { + vdev_destroy_unlink_zap(vd, vd->vdev_leaf_zap, tx); + vd->vdev_leaf_zap = 0; + } + + dmu_tx_commit(tx); + return; + } + + if (vd->vdev_dtl_sm == NULL) { + uint64_t new_object; + + new_object = space_map_alloc(mos, vdev_dtl_sm_blksz, tx); + VERIFY3U(new_object, !=, 0); + + VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object, + 0, -1ULL, 0)); + ASSERT(vd->vdev_dtl_sm != NULL); + } + + rtsync = range_tree_create(NULL, NULL); + + mutex_enter(&vd->vdev_dtl_lock); + range_tree_walk(rt, range_tree_add, rtsync); + mutex_exit(&vd->vdev_dtl_lock); + + space_map_truncate(vd->vdev_dtl_sm, vdev_dtl_sm_blksz, tx); + space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, SM_NO_VDEVID, tx); + range_tree_vacate(rtsync, NULL, NULL); + + range_tree_destroy(rtsync); + + /* + * If the object for the space map has changed then dirty + * the top level so that we update the config. 
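+ * (The object can change either because the leaf had no DTL
+ * space map until now, or because space_map_truncate() above may
+ * have had to reallocate it; either way the new object number
+ * must reach the on-disk vdev config.)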
+ */ + if (object != space_map_object(vd->vdev_dtl_sm)) { + vdev_dbgmsg(vd, "txg %llu, spa %s, DTL old object %llu, " + "new object %llu", (u_longlong_t)txg, spa_name(spa), + (u_longlong_t)object, + (u_longlong_t)space_map_object(vd->vdev_dtl_sm)); + vdev_config_dirty(vd->vdev_top); + } + + dmu_tx_commit(tx); + + mutex_enter(&vd->vdev_dtl_lock); + space_map_update(vd->vdev_dtl_sm); + mutex_exit(&vd->vdev_dtl_lock); +} + +/* + * Determine whether the specified vdev can be offlined/detached/removed + * without losing data. + */ +boolean_t +vdev_dtl_required(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + vdev_t *tvd = vd->vdev_top; + uint8_t cant_read = vd->vdev_cant_read; + boolean_t required; + + ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); + + if (vd == spa->spa_root_vdev || vd == tvd) + return (B_TRUE); + + /* + * Temporarily mark the device as unreadable, and then determine + * whether this results in any DTL outages in the top-level vdev. + * If not, we can safely offline/detach/remove the device. + */ + vd->vdev_cant_read = B_TRUE; + vdev_dtl_reassess(tvd, 0, 0, B_FALSE); + required = !vdev_dtl_empty(tvd, DTL_OUTAGE); + vd->vdev_cant_read = cant_read; + vdev_dtl_reassess(tvd, 0, 0, B_FALSE); + + if (!required && zio_injection_enabled) + required = !!zio_handle_device_injection(vd, NULL, ECHILD); + + return (required); +} + +/* + * Determine if resilver is needed, and if so the txg range. + */ +boolean_t +vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp) +{ + boolean_t needed = B_FALSE; + uint64_t thismin = UINT64_MAX; + uint64_t thismax = 0; + + if (vd->vdev_children == 0) { + mutex_enter(&vd->vdev_dtl_lock); + if (!range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) && + vdev_writeable(vd)) { + + thismin = vdev_dtl_min(vd); + thismax = vdev_dtl_max(vd); + needed = B_TRUE; + } + mutex_exit(&vd->vdev_dtl_lock); + } else { + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + uint64_t cmin, cmax; + + if (vdev_resilver_needed(cvd, &cmin, &cmax)) { + thismin = MIN(thismin, cmin); + thismax = MAX(thismax, cmax); + needed = B_TRUE; + } + } + } + + if (needed && minp) { + *minp = thismin; + *maxp = thismax; + } + return (needed); +} + +/* + * Gets the checkpoint space map object from the vdev's ZAP. + * Returns the spacemap object, or 0 if it wasn't in the ZAP + * or the ZAP doesn't exist yet. + */ +int +vdev_checkpoint_sm_object(vdev_t *vd) +{ + ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); + if (vd->vdev_top_zap == 0) { + return (0); + } + + uint64_t sm_obj = 0; + int err = zap_lookup(spa_meta_objset(vd->vdev_spa), vd->vdev_top_zap, + VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1, &sm_obj); + + ASSERT(err == 0 || err == ENOENT); + + return (sm_obj); +} + +int +vdev_load(vdev_t *vd) +{ + int error = 0; + /* + * Recursively load all children. + */ + for (int c = 0; c < vd->vdev_children; c++) { + error = vdev_load(vd->vdev_child[c]); + if (error != 0) { + return (error); + } + } + + vdev_set_deflate_ratio(vd); + + /* + * If this is a top-level vdev, initialize its metaslabs. + */ + if (vd == vd->vdev_top && vdev_is_concrete(vd)) { + if (vd->vdev_ashift == 0 || vd->vdev_asize == 0) { + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + vdev_dbgmsg(vd, "vdev_load: invalid size. 
ashift=%llu, "
+ "asize=%llu", (u_longlong_t)vd->vdev_ashift,
+ (u_longlong_t)vd->vdev_asize);
+ return (SET_ERROR(ENXIO));
+ } else if ((error = vdev_metaslab_init(vd, 0)) != 0) {
+ vdev_dbgmsg(vd, "vdev_load: metaslab_init failed "
+ "[error=%d]", error);
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ return (error);
+ }
+
+ uint64_t checkpoint_sm_obj = vdev_checkpoint_sm_object(vd);
+ if (checkpoint_sm_obj != 0) {
+ objset_t *mos = spa_meta_objset(vd->vdev_spa);
+ ASSERT(vd->vdev_asize != 0);
+ ASSERT3P(vd->vdev_checkpoint_sm, ==, NULL);
+
+ if ((error = space_map_open(&vd->vdev_checkpoint_sm,
+ mos, checkpoint_sm_obj, 0, vd->vdev_asize,
+ vd->vdev_ashift))) {
+ vdev_dbgmsg(vd, "vdev_load: space_map_open "
+ "failed for checkpoint spacemap (obj %llu) "
+ "[error=%d]",
+ (u_longlong_t)checkpoint_sm_obj, error);
+ return (error);
+ }
+ ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
+ space_map_update(vd->vdev_checkpoint_sm);
+
+ /*
+ * Since the checkpoint_sm contains free entries
+ * exclusively, we can use sm_alloc to indicate the
+ * cumulative checkpointed space that has been freed.
+ */
+ vd->vdev_stat.vs_checkpoint_space =
+ -vd->vdev_checkpoint_sm->sm_alloc;
+ vd->vdev_spa->spa_checkpoint_info.sci_dspace +=
+ vd->vdev_stat.vs_checkpoint_space;
+ }
+ }
+
+ /*
+ * If this is a leaf vdev, load its DTL.
+ */
+ if (vd->vdev_ops->vdev_op_leaf && (error = vdev_dtl_load(vd)) != 0) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ vdev_dbgmsg(vd, "vdev_load: vdev_dtl_load failed "
+ "[error=%d]", error);
+ return (error);
+ }
+
+ uint64_t obsolete_sm_object = vdev_obsolete_sm_object(vd);
+ if (obsolete_sm_object != 0) {
+ objset_t *mos = vd->vdev_spa->spa_meta_objset;
+ ASSERT(vd->vdev_asize != 0);
+ ASSERT3P(vd->vdev_obsolete_sm, ==, NULL);
+
+ if ((error = space_map_open(&vd->vdev_obsolete_sm, mos,
+ obsolete_sm_object, 0, vd->vdev_asize, 0))) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ vdev_dbgmsg(vd, "vdev_load: space_map_open failed for "
+ "obsolete spacemap (obj %llu) [error=%d]",
+ (u_longlong_t)obsolete_sm_object, error);
+ return (error);
+ }
+ space_map_update(vd->vdev_obsolete_sm);
+ }
+
+ return (0);
+}
+
+/*
+ * The special vdev case is used for hot spares and l2cache devices. Its
+ * sole purpose is to set the vdev state for the associated vdev. To do this,
+ * we make sure that we can open the underlying device, then try to read the
+ * label, and make sure that the label is sane and that it hasn't been
+ * repurposed to another pool.
+ */
+int
+vdev_validate_aux(vdev_t *vd)
+{
+ nvlist_t *label;
+ uint64_t guid, version;
+ uint64_t state;
+
+ if (!vdev_readable(vd))
+ return (0);
+
+ if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ return (-1);
+ }
+
+ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 ||
+ !SPA_VERSION_IS_SUPPORTED(version) ||
+ nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 ||
+ guid != vd->vdev_guid ||
+ nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ nvlist_free(label);
+ return (-1);
+ }
+
+ /*
+ * We don't actually check the pool state here. If it's in fact in
+ * use by another pool, we update this fact on the fly when requested. 
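+ * (A hot spare, for instance, may legitimately be shared among
+ * several pools, so "in use by another pool" is a condition to
+ * report, not a validation failure.)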
+ */ + nvlist_free(label); + return (0); +} + +/* + * Free the objects used to store this vdev's spacemaps, and the array + * that points to them. + */ +void +vdev_destroy_spacemaps(vdev_t *vd, dmu_tx_t *tx) +{ + if (vd->vdev_ms_array == 0) + return; + + objset_t *mos = vd->vdev_spa->spa_meta_objset; + uint64_t array_count = vd->vdev_asize >> vd->vdev_ms_shift; + size_t array_bytes = array_count * sizeof (uint64_t); + uint64_t *smobj_array = kmem_alloc(array_bytes, KM_SLEEP); + VERIFY0(dmu_read(mos, vd->vdev_ms_array, 0, + array_bytes, smobj_array, 0)); + + for (uint64_t i = 0; i < array_count; i++) { + uint64_t smobj = smobj_array[i]; + if (smobj == 0) + continue; + + space_map_free_obj(mos, smobj, tx); + } + + kmem_free(smobj_array, array_bytes); + VERIFY0(dmu_object_free(mos, vd->vdev_ms_array, tx)); + vd->vdev_ms_array = 0; +} + +static void +vdev_remove_empty(vdev_t *vd, uint64_t txg) +{ + spa_t *spa = vd->vdev_spa; + dmu_tx_t *tx; + + ASSERT(vd == vd->vdev_top); + ASSERT3U(txg, ==, spa_syncing_txg(spa)); + + if (vd->vdev_ms != NULL) { + metaslab_group_t *mg = vd->vdev_mg; + + metaslab_group_histogram_verify(mg); + metaslab_class_histogram_verify(mg->mg_class); + + for (int m = 0; m < vd->vdev_ms_count; m++) { + metaslab_t *msp = vd->vdev_ms[m]; + + if (msp == NULL || msp->ms_sm == NULL) + continue; + + mutex_enter(&msp->ms_lock); + /* + * If the metaslab was not loaded when the vdev + * was removed then the histogram accounting may + * not be accurate. Update the histogram information + * here so that we ensure that the metaslab group + * and metaslab class are up-to-date. + */ + metaslab_group_histogram_remove(mg, msp); + + VERIFY0(space_map_allocated(msp->ms_sm)); + space_map_close(msp->ms_sm); + msp->ms_sm = NULL; + mutex_exit(&msp->ms_lock); + } + + if (vd->vdev_checkpoint_sm != NULL) { + ASSERT(spa_has_checkpoint(spa)); + space_map_close(vd->vdev_checkpoint_sm); + vd->vdev_checkpoint_sm = NULL; + } + + metaslab_group_histogram_verify(mg); + metaslab_class_histogram_verify(mg->mg_class); + for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) + ASSERT0(mg->mg_histogram[i]); + } + + tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); + vdev_destroy_spacemaps(vd, tx); + + if (vd->vdev_islog && vd->vdev_top_zap != 0) { + vdev_destroy_unlink_zap(vd, vd->vdev_top_zap, tx); + vd->vdev_top_zap = 0; + } + dmu_tx_commit(tx); +} + +void +vdev_sync_done(vdev_t *vd, uint64_t txg) +{ + metaslab_t *msp; + boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg)); + + ASSERT(vdev_is_concrete(vd)); + + while ((msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg))) + != NULL) + metaslab_sync_done(msp, txg); + + if (reassess) + metaslab_sync_reassess(vd->vdev_mg); +} + +void +vdev_sync(vdev_t *vd, uint64_t txg) +{ + spa_t *spa = vd->vdev_spa; + vdev_t *lvd; + metaslab_t *msp; + dmu_tx_t *tx; + + if (range_tree_space(vd->vdev_obsolete_segments) > 0) { + dmu_tx_t *tx; + + ASSERT(vd->vdev_removing || + vd->vdev_ops == &vdev_indirect_ops); + + tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); + vdev_indirect_sync_obsolete(vd, tx); + dmu_tx_commit(tx); + + /* + * If the vdev is indirect, it can't have dirty + * metaslabs or DTLs. 
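+ * The obsolete-segment sync above is the only per-txg work an
+ * indirect vdev has, so there is nothing further to do here.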
+ */ + if (vd->vdev_ops == &vdev_indirect_ops) { + ASSERT(txg_list_empty(&vd->vdev_ms_list, txg)); + ASSERT(txg_list_empty(&vd->vdev_dtl_list, txg)); + return; + } + } + + ASSERT(vdev_is_concrete(vd)); + + if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0 && + !vd->vdev_removing) { + ASSERT(vd == vd->vdev_top); + ASSERT0(vd->vdev_indirect_config.vic_mapping_object); + tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); + vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset, + DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx); + ASSERT(vd->vdev_ms_array != 0); + vdev_config_dirty(vd); + dmu_tx_commit(tx); + } + + while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) { + metaslab_sync(msp, txg); + (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg)); + } + + while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL) + vdev_dtl_sync(lvd, txg); + + /* + * Remove the metadata associated with this vdev once it's empty. + * Note that this is typically used for log/cache device removal; + * we don't empty toplevel vdevs when removing them. But if + * a toplevel happens to be emptied, this is not harmful. + */ + if (vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing) { + vdev_remove_empty(vd, txg); + } + + (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg)); +} + +uint64_t +vdev_psize_to_asize(vdev_t *vd, uint64_t psize) +{ + return (vd->vdev_ops->vdev_op_asize(vd, psize)); +} + +/* + * Mark the given vdev faulted. A faulted vdev behaves as if the device could + * not be opened, and no I/O is attempted. + */ +int +vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux) +{ + vdev_t *vd, *tvd; + + spa_vdev_state_enter(spa, SCL_NONE); + + if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) + return (spa_vdev_state_exit(spa, NULL, ENODEV)); + + if (!vd->vdev_ops->vdev_op_leaf) + return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); + + tvd = vd->vdev_top; + + /* + * We don't directly use the aux state here, but if we do a + * vdev_reopen(), we need this value to be present to remember why we + * were faulted. + */ + vd->vdev_label_aux = aux; + + /* + * Faulted state takes precedence over degraded. + */ + vd->vdev_delayed_close = B_FALSE; + vd->vdev_faulted = 1ULL; + vd->vdev_degraded = 0ULL; + vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux); + + /* + * If this device has the only valid copy of the data, then + * back off and simply mark the vdev as degraded instead. + */ + if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) { + vd->vdev_degraded = 1ULL; + vd->vdev_faulted = 0ULL; + + /* + * If we reopen the device and it's not dead, only then do we + * mark it degraded. + */ + vdev_reopen(tvd); + + if (vdev_readable(vd)) + vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux); + } + + return (spa_vdev_state_exit(spa, vd, 0)); +} + +/* + * Mark the given vdev degraded. A degraded vdev is purely an indication to the + * user that something is wrong. The vdev continues to operate as normal as far + * as I/O is concerned. + */ +int +vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux) +{ + vdev_t *vd; + + spa_vdev_state_enter(spa, SCL_NONE); + + if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) + return (spa_vdev_state_exit(spa, NULL, ENODEV)); + + if (!vd->vdev_ops->vdev_op_leaf) + return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); + + /* + * If the vdev is already faulted, then don't do anything. 
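+ * (The check below also bails if it is already degraded; this
+ * path never changes an existing faulted or degraded state.)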
+ */ + if (vd->vdev_faulted || vd->vdev_degraded) + return (spa_vdev_state_exit(spa, NULL, 0)); + + vd->vdev_degraded = 1ULL; + if (!vdev_is_dead(vd)) + vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, + aux); + + return (spa_vdev_state_exit(spa, vd, 0)); +} + +/* + * Online the given vdev. + * + * If 'ZFS_ONLINE_UNSPARE' is set, it implies two things. First, any attached + * spare device should be detached when the device finishes resilvering. + * Second, the online should be treated like a 'test' online case, so no FMA + * events are generated if the device fails to open. + */ +int +vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) +{ + vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev; + boolean_t wasoffline; + vdev_state_t oldstate; + + spa_vdev_state_enter(spa, SCL_NONE); + + if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) + return (spa_vdev_state_exit(spa, NULL, ENODEV)); + + if (!vd->vdev_ops->vdev_op_leaf) + return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); + + wasoffline = (vd->vdev_offline || vd->vdev_tmpoffline); + oldstate = vd->vdev_state; + + tvd = vd->vdev_top; + vd->vdev_offline = B_FALSE; + vd->vdev_tmpoffline = B_FALSE; + vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE); + vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT); + + /* XXX - L2ARC 1.0 does not support expansion */ + if (!vd->vdev_aux) { + for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) + pvd->vdev_expanding = !!(flags & ZFS_ONLINE_EXPAND); + } + + vdev_reopen(tvd); + vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE; + + if (!vd->vdev_aux) { + for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) + pvd->vdev_expanding = B_FALSE; + } + + if (newstate) + *newstate = vd->vdev_state; + if ((flags & ZFS_ONLINE_UNSPARE) && + !vdev_is_dead(vd) && vd->vdev_parent && + vd->vdev_parent->vdev_ops == &vdev_spare_ops && + vd->vdev_parent->vdev_child[0] == vd) + vd->vdev_unspare = B_TRUE; + + if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) { + + /* XXX - L2ARC 1.0 does not support expansion */ + if (vd->vdev_aux) + return (spa_vdev_state_exit(spa, vd, ENOTSUP)); + spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); + } + + /* Restart initializing if necessary */ + mutex_enter(&vd->vdev_initialize_lock); + if (vdev_writeable(vd) && + vd->vdev_initialize_thread == NULL && + vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) { + (void) vdev_initialize(vd); + } + mutex_exit(&vd->vdev_initialize_lock); + + if (wasoffline || + (oldstate < VDEV_STATE_DEGRADED && + vd->vdev_state >= VDEV_STATE_DEGRADED)) + spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_ONLINE); + + return (spa_vdev_state_exit(spa, vd, 0)); +} + +static int +vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags) +{ + vdev_t *vd, *tvd; + int error = 0; + uint64_t generation; + metaslab_group_t *mg; + +top: + spa_vdev_state_enter(spa, SCL_ALLOC); + + if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) + return (spa_vdev_state_exit(spa, NULL, ENODEV)); + + if (!vd->vdev_ops->vdev_op_leaf) + return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); + + tvd = vd->vdev_top; + mg = tvd->vdev_mg; + generation = spa->spa_config_generation + 1; + + /* + * If the device isn't already offline, try to offline it. + */ + if (!vd->vdev_offline) { + /* + * If this device has the only valid copy of some data, + * don't allow it to be offlined. Log devices are always + * expendable. 
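+ * (Expendable because their contents can be flushed back into
+ * the main pool first; the spa_reset_logs() call below arranges
+ * exactly that before the offline proceeds.)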
+ */ + if (!tvd->vdev_islog && vd->vdev_aux == NULL && + vdev_dtl_required(vd)) + return (spa_vdev_state_exit(spa, NULL, EBUSY)); + + /* + * If the top-level is a slog and it has had allocations + * then proceed. We check that the vdev's metaslab group + * is not NULL since it's possible that we may have just + * added this vdev but not yet initialized its metaslabs. + */ + if (tvd->vdev_islog && mg != NULL) { + /* + * Prevent any future allocations. + */ + metaslab_group_passivate(mg); + (void) spa_vdev_state_exit(spa, vd, 0); + + error = spa_reset_logs(spa); + + /* + * If the log device was successfully reset but has + * checkpointed data, do not offline it. + */ + if (error == 0 && + tvd->vdev_checkpoint_sm != NULL) { + ASSERT3U(tvd->vdev_checkpoint_sm->sm_alloc, + !=, 0); + error = ZFS_ERR_CHECKPOINT_EXISTS; + } + + spa_vdev_state_enter(spa, SCL_ALLOC); + + /* + * Check to see if the config has changed. + */ + if (error || generation != spa->spa_config_generation) { + metaslab_group_activate(mg); + if (error) + return (spa_vdev_state_exit(spa, + vd, error)); + (void) spa_vdev_state_exit(spa, vd, 0); + goto top; + } + ASSERT0(tvd->vdev_stat.vs_alloc); + } + + /* + * Offline this device and reopen its top-level vdev. + * If the top-level vdev is a log device then just offline + * it. Otherwise, if this action results in the top-level + * vdev becoming unusable, undo it and fail the request. + */ + vd->vdev_offline = B_TRUE; + vdev_reopen(tvd); + + if (!tvd->vdev_islog && vd->vdev_aux == NULL && + vdev_is_dead(tvd)) { + vd->vdev_offline = B_FALSE; + vdev_reopen(tvd); + return (spa_vdev_state_exit(spa, NULL, EBUSY)); + } + + /* + * Add the device back into the metaslab rotor so that + * once we online the device it's open for business. + */ + if (tvd->vdev_islog && mg != NULL) + metaslab_group_activate(mg); + } + + vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY); + + return (spa_vdev_state_exit(spa, vd, 0)); +} + +int +vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) +{ + int error; + + mutex_enter(&spa->spa_vdev_top_lock); + error = vdev_offline_locked(spa, guid, flags); + mutex_exit(&spa->spa_vdev_top_lock); + + return (error); +} + +/* + * Clear the error counts associated with this vdev. Unlike vdev_online() and + * vdev_offline(), we assume the spa config is locked. We also clear all + * children. If 'vd' is NULL, then the user wants to clear all vdevs. + */ +void +vdev_clear(spa_t *spa, vdev_t *vd) +{ + vdev_t *rvd = spa->spa_root_vdev; + + ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); + + if (vd == NULL) + vd = rvd; + + vd->vdev_stat.vs_read_errors = 0; + vd->vdev_stat.vs_write_errors = 0; + vd->vdev_stat.vs_checksum_errors = 0; + + for (int c = 0; c < vd->vdev_children; c++) + vdev_clear(spa, vd->vdev_child[c]); + + if (vd == rvd) { + for (int c = 0; c < spa->spa_l2cache.sav_count; c++) + vdev_clear(spa, spa->spa_l2cache.sav_vdevs[c]); + + for (int c = 0; c < spa->spa_spares.sav_count; c++) + vdev_clear(spa, spa->spa_spares.sav_vdevs[c]); + } + + /* + * It makes no sense to "clear" an indirect vdev. + */ + if (!vdev_is_concrete(vd)) + return; + + /* + * If we're in the FAULTED state or have experienced failed I/O, then + * clear the persistent state and attempt to reopen the device. We + * also mark the vdev config dirty, so that the new faulted state is + * written out to disk. 
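+ * (The dirtying happens through vdev_state_dirty() below, which
+ * queues the top-level vdev so the next spa_sync() records the
+ * cleared state.)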
+ */
+ if (vd->vdev_faulted || vd->vdev_degraded ||
+ !vdev_readable(vd) || !vdev_writeable(vd)) {
+
+ /*
+ * When reopening in response to a clear event, it may be due to
+ * a fmadm repair request. In this case, if the device is
+ * still broken, we still want to post the ereport again.
+ */
+ vd->vdev_forcefault = B_TRUE;
+
+ vd->vdev_faulted = vd->vdev_degraded = 0ULL;
+ vd->vdev_cant_read = B_FALSE;
+ vd->vdev_cant_write = B_FALSE;
+
+ vdev_reopen(vd == rvd ? rvd : vd->vdev_top);
+
+ vd->vdev_forcefault = B_FALSE;
+
+ if (vd != rvd && vdev_writeable(vd->vdev_top))
+ vdev_state_dirty(vd->vdev_top);
+
+ if (vd->vdev_aux == NULL && !vdev_is_dead(vd))
+ spa_async_request(spa, SPA_ASYNC_RESILVER);
+
+ spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_CLEAR);
+ }
+
+ /*
+ * When clearing a FMA-diagnosed fault, we always want to
+ * unspare the device, as we assume that the original spare was
+ * done in response to the FMA fault.
+ */
+ if (!vdev_is_dead(vd) && vd->vdev_parent != NULL &&
+ vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
+ vd->vdev_parent->vdev_child[0] == vd)
+ vd->vdev_unspare = B_TRUE;
+}
+
+boolean_t
+vdev_is_dead(vdev_t *vd)
+{
+ /*
+ * Holes and missing devices are always considered "dead".
+ * This simplifies the code since we don't have to check for
+ * these types of devices in the various code paths.
+ * Instead we rely on the fact that we skip over dead devices
+ * before issuing I/O to them.
+ */
+ return (vd->vdev_state < VDEV_STATE_DEGRADED ||
+ vd->vdev_ops == &vdev_hole_ops ||
+ vd->vdev_ops == &vdev_missing_ops);
+}
+
+boolean_t
+vdev_readable(vdev_t *vd)
+{
+ return (!vdev_is_dead(vd) && !vd->vdev_cant_read);
+}
+
+boolean_t
+vdev_writeable(vdev_t *vd)
+{
+ return (!vdev_is_dead(vd) && !vd->vdev_cant_write &&
+ vdev_is_concrete(vd));
+}
+
+boolean_t
+vdev_allocatable(vdev_t *vd)
+{
+ uint64_t state = vd->vdev_state;
+
+ /*
+ * We currently allow allocations from vdevs which may be in the
+ * process of reopening (i.e. VDEV_STATE_CLOSED). If the device
+ * fails to reopen then we'll catch it later when we're holding
+ * the proper locks. Note that we have to get the vdev state
+ * in a local variable because although it changes atomically,
+ * we're asking two separate questions about it.
+ */
+ return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) &&
+ !vd->vdev_cant_write && vdev_is_concrete(vd) &&
+ vd->vdev_mg->mg_initialized);
+}
+
+boolean_t
+vdev_accessible(vdev_t *vd, zio_t *zio)
+{
+ ASSERT(zio->io_vd == vd);
+
+ if (vdev_is_dead(vd) || vd->vdev_remove_wanted)
+ return (B_FALSE);
+
+ if (zio->io_type == ZIO_TYPE_READ)
+ return (!vd->vdev_cant_read);
+
+ if (zio->io_type == ZIO_TYPE_WRITE)
+ return (!vd->vdev_cant_write);
+
+ return (B_TRUE);
+}
+
+boolean_t
+vdev_is_spacemap_addressable(vdev_t *vd)
+{
+ /*
+ * Assuming 47 bits of the space map entry dedicated to the entry's
+ * offset (see description in space_map.h), we calculate the maximum
+ * address that can be described by a space map entry for the given
+ * device.
+ */
+ uint64_t shift = vd->vdev_ashift + 47;
+
+ if (shift >= 63) /* detect potential overflow */
+ return (B_TRUE);
+
+ return (vd->vdev_asize < (1ULL << shift));
+}
+
+/*
+ * Get statistics for the given vdev. 
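+ * The snapshot is taken under vdev_stat_lock. For the root vdev,
+ * the per-type I/O counters are summed over all top-level children
+ * on the fly rather than being maintained at the root itself.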
+ */
+void
+vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
+{
+ spa_t *spa = vd->vdev_spa;
+ vdev_t *rvd = spa->spa_root_vdev;
+ vdev_t *tvd = vd->vdev_top;
+
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
+
+ mutex_enter(&vd->vdev_stat_lock);
+ bcopy(&vd->vdev_stat, vs, sizeof (*vs));
+ vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
+ vs->vs_state = vd->vdev_state;
+ vs->vs_rsize = vdev_get_min_asize(vd);
+ if (vd->vdev_ops->vdev_op_leaf) {
+ vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
+ /*
+ * Report initializing progress. Since we don't have the
+ * initializing locks held, this is only an estimate (although a
+ * fairly accurate one).
+ */
+ vs->vs_initialize_bytes_done = vd->vdev_initialize_bytes_done;
+ vs->vs_initialize_bytes_est = vd->vdev_initialize_bytes_est;
+ vs->vs_initialize_state = vd->vdev_initialize_state;
+ vs->vs_initialize_action_time = vd->vdev_initialize_action_time;
+ }
+ /*
+ * Report expandable space on top-level, non-auxiliary devices only.
+ * The expandable space is reported in terms of metaslab sized units
+ * since that determines how much space the pool can expand.
+ */
+ if (vd->vdev_aux == NULL && tvd != NULL && vd->vdev_max_asize != 0) {
+ vs->vs_esize = P2ALIGN(vd->vdev_max_asize - vd->vdev_asize -
+ spa->spa_bootsize, 1ULL << tvd->vdev_ms_shift);
+ }
+ vs->vs_configured_ashift = vd->vdev_top != NULL
+ ? vd->vdev_top->vdev_ashift : vd->vdev_ashift;
+ vs->vs_logical_ashift = vd->vdev_logical_ashift;
+ vs->vs_physical_ashift = vd->vdev_physical_ashift;
+ if (vd->vdev_aux == NULL && vd == vd->vdev_top &&
+ vdev_is_concrete(vd)) {
+ vs->vs_fragmentation = vd->vdev_mg->mg_fragmentation;
+ }
+
+ /*
+ * If we're getting stats on the root vdev, aggregate the I/O counts
+ * over all top-level vdevs (i.e. the direct children of the root).
+ */
+ if (vd == rvd) {
+ for (int c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *cvd = rvd->vdev_child[c];
+ vdev_stat_t *cvs = &cvd->vdev_stat;
+
+ for (int t = 0; t < ZIO_TYPES; t++) {
+ vs->vs_ops[t] += cvs->vs_ops[t];
+ vs->vs_bytes[t] += cvs->vs_bytes[t];
+ }
+ cvs->vs_scan_removing = cvd->vdev_removing;
+ }
+ }
+ mutex_exit(&vd->vdev_stat_lock);
+}
+
+void
+vdev_clear_stats(vdev_t *vd)
+{
+ mutex_enter(&vd->vdev_stat_lock);
+ vd->vdev_stat.vs_space = 0;
+ vd->vdev_stat.vs_dspace = 0;
+ vd->vdev_stat.vs_alloc = 0;
+ mutex_exit(&vd->vdev_stat_lock);
+}
+
+void
+vdev_scan_stat_init(vdev_t *vd)
+{
+ vdev_stat_t *vs = &vd->vdev_stat;
+
+ for (int c = 0; c < vd->vdev_children; c++)
+ vdev_scan_stat_init(vd->vdev_child[c]);
+
+ mutex_enter(&vd->vdev_stat_lock);
+ vs->vs_scan_processed = 0;
+ mutex_exit(&vd->vdev_stat_lock);
+}
+
+void
+vdev_stat_update(zio_t *zio, uint64_t psize)
+{
+ spa_t *spa = zio->io_spa;
+ vdev_t *rvd = spa->spa_root_vdev;
+ vdev_t *vd = zio->io_vd ? zio->io_vd : rvd;
+ vdev_t *pvd;
+ uint64_t txg = zio->io_txg;
+ vdev_stat_t *vs = &vd->vdev_stat;
+ zio_type_t type = zio->io_type;
+ int flags = zio->io_flags;
+
+ /*
+ * If this i/o is a gang leader, it didn't do any actual work.
+ */
+ if (zio->io_gang_tree)
+ return;
+
+ if (zio->io_error == 0) {
+ /*
+ * If this is a root i/o, don't count it -- we've already
+ * counted the top-level vdevs, and vdev_get_stats() will
+ * aggregate them when asked. This reduces contention on
+ * the root vdev_stat_lock and implicitly handles blocks
+ * that compress away to holes, for which there is no i/o.
+ * (Holes never create vdev children, so all the counters
+ * remain zero, which is what we want.) 
+ *
+ * Note: this only applies to successful i/o (io_error == 0)
+ * because unlike i/o counts, errors are not additive.
+ * When reading a ditto block, for example, failure of
+ * one top-level vdev does not imply a root-level error.
+ */
+ if (vd == rvd)
+ return;
+
+ ASSERT(vd == zio->io_vd);
+
+ if (flags & ZIO_FLAG_IO_BYPASS)
+ return;
+
+ mutex_enter(&vd->vdev_stat_lock);
+
+ if (flags & ZIO_FLAG_IO_REPAIR) {
+ if (flags & ZIO_FLAG_SCAN_THREAD) {
+ dsl_scan_phys_t *scn_phys =
+ &spa->spa_dsl_pool->dp_scan->scn_phys;
+ uint64_t *processed = &scn_phys->scn_processed;
+
+ /* XXX cleanup? */
+ if (vd->vdev_ops->vdev_op_leaf)
+ atomic_add_64(processed, psize);
+ vs->vs_scan_processed += psize;
+ }
+
+ if (flags & ZIO_FLAG_SELF_HEAL)
+ vs->vs_self_healed += psize;
+ }
+
+ vs->vs_ops[type]++;
+ vs->vs_bytes[type] += psize;
+
+ mutex_exit(&vd->vdev_stat_lock);
+ return;
+ }
+
+ if (flags & ZIO_FLAG_SPECULATIVE)
+ return;
+
+ /*
+ * If this is an I/O error that is going to be retried, then ignore the
+ * error. Otherwise, the user may interpret B_FAILFAST I/O errors as
+ * hard errors, when in reality they can happen for any number of
+ * innocuous reasons (bus resets, MPxIO link failure, etc).
+ */
+ if (zio->io_error == EIO &&
+ !(zio->io_flags & ZIO_FLAG_IO_RETRY))
+ return;
+
+ /*
+ * Intent log writes won't propagate their error to the root
+ * I/O so don't mark these types of failures as pool-level
+ * errors.
+ */
+ if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
+ return;
+
+ mutex_enter(&vd->vdev_stat_lock);
+ if (type == ZIO_TYPE_READ && !vdev_is_dead(vd)) {
+ if (zio->io_error == ECKSUM)
+ vs->vs_checksum_errors++;
+ else
+ vs->vs_read_errors++;
+ }
+ if (type == ZIO_TYPE_WRITE && !vdev_is_dead(vd))
+ vs->vs_write_errors++;
+ mutex_exit(&vd->vdev_stat_lock);
+
+ if (spa->spa_load_state == SPA_LOAD_NONE &&
+ type == ZIO_TYPE_WRITE && txg != 0 &&
+ (!(flags & ZIO_FLAG_IO_REPAIR) ||
+ (flags & ZIO_FLAG_SCAN_THREAD) ||
+ spa->spa_claiming)) {
+ /*
+ * This is either a normal write (not a repair), or it's
+ * a repair induced by the scrub thread, or it's a repair
+ * made by zil_claim() during spa_load() in the first txg.
+ * In the normal case, we commit the DTL change in the same
+ * txg as the block was born. In the scrub-induced repair
+ * case, we know that scrubs run in first-pass syncing context,
+ * so we commit the DTL change in spa_syncing_txg(spa).
+ * In the zil_claim() case, we commit in spa_first_txg(spa).
+ *
+ * We currently do not make DTL entries for failed spontaneous
+ * self-healing writes triggered by normal (non-scrubbing)
+ * reads, because we have no transactional context in which to
+ * do so -- and it's not clear that it'd be desirable anyway.
+ */
+ if (vd->vdev_ops->vdev_op_leaf) {
+ uint64_t commit_txg = txg;
+ if (flags & ZIO_FLAG_SCAN_THREAD) {
+ ASSERT(flags & ZIO_FLAG_IO_REPAIR);
+ ASSERT(spa_sync_pass(spa) == 1);
+ vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1);
+ commit_txg = spa_syncing_txg(spa);
+ } else if (spa->spa_claiming) {
+ ASSERT(flags & ZIO_FLAG_IO_REPAIR);
+ commit_txg = spa_first_txg(spa);
+ }
+ ASSERT(commit_txg >= spa_syncing_txg(spa));
+ if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1))
+ return;
+ for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
+ vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1);
+ vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg);
+ }
+ if (vd != rvd)
+ vdev_dtl_dirty(vd, DTL_MISSING, txg, 1);
+ }
+}
+
+/*
+ * Update the in-core space usage stats for this vdev, its metaslab class,
+ * and the root vdev. 
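+ * The "dspace" figure is the deflated view: the asize delta is
+ * scaled by the vdev's deflate ratio so that, e.g., RAID-Z parity
+ * overhead is not reported as usable space.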
+ */
+void
+vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta,
+ int64_t space_delta)
+{
+ int64_t dspace_delta = space_delta;
+ spa_t *spa = vd->vdev_spa;
+ vdev_t *rvd = spa->spa_root_vdev;
+ metaslab_group_t *mg = vd->vdev_mg;
+ metaslab_class_t *mc = mg ? mg->mg_class : NULL;
+
+ ASSERT(vd == vd->vdev_top);
+
+ /*
+ * Apply the inverse of the psize-to-asize (i.e. RAID-Z) space-expansion
+ * factor. We must calculate this here and not at the root vdev
+ * because the root vdev's psize-to-asize is simply the max of its
+ * children's, thus not accurate enough for us.
+ */
+ ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0);
+ ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache);
+ dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) *
+ vd->vdev_deflate_ratio;
+
+ mutex_enter(&vd->vdev_stat_lock);
+ vd->vdev_stat.vs_alloc += alloc_delta;
+ vd->vdev_stat.vs_space += space_delta;
+ vd->vdev_stat.vs_dspace += dspace_delta;
+ mutex_exit(&vd->vdev_stat_lock);
+
+ if (mc == spa_normal_class(spa)) {
+ mutex_enter(&rvd->vdev_stat_lock);
+ rvd->vdev_stat.vs_alloc += alloc_delta;
+ rvd->vdev_stat.vs_space += space_delta;
+ rvd->vdev_stat.vs_dspace += dspace_delta;
+ mutex_exit(&rvd->vdev_stat_lock);
+ }
+
+ if (mc != NULL) {
+ ASSERT(rvd == vd->vdev_parent);
+ ASSERT(vd->vdev_ms_count != 0);
+
+ metaslab_class_space_update(mc,
+ alloc_delta, defer_delta, space_delta, dspace_delta);
+ }
+}
+
+/*
+ * Mark a top-level vdev's config as dirty, placing it on the dirty list
+ * so that it will be written out next time the vdev configuration is synced.
+ * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs.
+ */
+void
+vdev_config_dirty(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+ vdev_t *rvd = spa->spa_root_vdev;
+ int c;
+
+ ASSERT(spa_writeable(spa));
+
+ /*
+ * If this is an aux vdev (as with l2cache and spare devices), then we
+ * update the vdev config manually and set the sync flag.
+ */
+ if (vd->vdev_aux != NULL) {
+ spa_aux_vdev_t *sav = vd->vdev_aux;
+ nvlist_t **aux;
+ uint_t naux;
+
+ for (c = 0; c < sav->sav_count; c++) {
+ if (sav->sav_vdevs[c] == vd)
+ break;
+ }
+
+ if (c == sav->sav_count) {
+ /*
+ * We're being removed. There's nothing more to do.
+ */
+ ASSERT(sav->sav_sync == B_TRUE);
+ return;
+ }
+
+ sav->sav_sync = B_TRUE;
+
+ if (nvlist_lookup_nvlist_array(sav->sav_config,
+ ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) {
+ VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
+ ZPOOL_CONFIG_SPARES, &aux, &naux) == 0);
+ }
+
+ ASSERT(c < naux);
+
+ /*
+ * Setting the nvlist in the middle of the array is a little
+ * sketchy, but it will work.
+ */
+ nvlist_free(aux[c]);
+ aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0);
+
+ return;
+ }
+
+ /*
+ * The dirty list is protected by the SCL_CONFIG lock. The caller
+ * must either hold SCL_CONFIG as writer, or must be the sync thread
+ * (which holds SCL_CONFIG as reader). There's only one sync thread,
+ * so this is sufficient to ensure mutual exclusion. 
+ */ + ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || + (dsl_pool_sync_context(spa_get_dsl(spa)) && + spa_config_held(spa, SCL_CONFIG, RW_READER))); + + if (vd == rvd) { + for (c = 0; c < rvd->vdev_children; c++) + vdev_config_dirty(rvd->vdev_child[c]); + } else { + ASSERT(vd == vd->vdev_top); + + if (!list_link_active(&vd->vdev_config_dirty_node) && + vdev_is_concrete(vd)) { + list_insert_head(&spa->spa_config_dirty_list, vd); + } + } +} + +void +vdev_config_clean(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + + ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || + (dsl_pool_sync_context(spa_get_dsl(spa)) && + spa_config_held(spa, SCL_CONFIG, RW_READER))); + + ASSERT(list_link_active(&vd->vdev_config_dirty_node)); + list_remove(&spa->spa_config_dirty_list, vd); +} + +/* + * Mark a top-level vdev's state as dirty, so that the next pass of + * spa_sync() can convert this into vdev_config_dirty(). We distinguish + * the state changes from larger config changes because they require + * much less locking, and are often needed for administrative actions. + */ +void +vdev_state_dirty(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + + ASSERT(spa_writeable(spa)); + ASSERT(vd == vd->vdev_top); + + /* + * The state list is protected by the SCL_STATE lock. The caller + * must either hold SCL_STATE as writer, or must be the sync thread + * (which holds SCL_STATE as reader). There's only one sync thread, + * so this is sufficient to ensure mutual exclusion. + */ + ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || + (dsl_pool_sync_context(spa_get_dsl(spa)) && + spa_config_held(spa, SCL_STATE, RW_READER))); + + if (!list_link_active(&vd->vdev_state_dirty_node) && + vdev_is_concrete(vd)) + list_insert_head(&spa->spa_state_dirty_list, vd); +} + +void +vdev_state_clean(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + + ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || + (dsl_pool_sync_context(spa_get_dsl(spa)) && + spa_config_held(spa, SCL_STATE, RW_READER))); + + ASSERT(list_link_active(&vd->vdev_state_dirty_node)); + list_remove(&spa->spa_state_dirty_list, vd); +} + +/* + * Propagate vdev state up from children to parent. + */ +void +vdev_propagate_state(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + vdev_t *rvd = spa->spa_root_vdev; + int degraded = 0, faulted = 0; + int corrupted = 0; + vdev_t *child; + + if (vd->vdev_children > 0) { + for (int c = 0; c < vd->vdev_children; c++) { + child = vd->vdev_child[c]; + + /* + * Don't factor holes or indirect vdevs into the + * decision. + */ + if (!vdev_is_concrete(child)) + continue; + + if (!vdev_readable(child) || + (!vdev_writeable(child) && spa_writeable(spa))) { + /* + * Root special: if there is a top-level log + * device, treat the root vdev as if it were + * degraded. + */ + if (child->vdev_islog && vd == rvd) + degraded++; + else + faulted++; + } else if (child->vdev_state <= VDEV_STATE_DEGRADED) { + degraded++; + } + + if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA) + corrupted++; + } + + vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded); + + /* + * Root special: if there is a top-level vdev that cannot be + * opened due to corrupted metadata, then propagate the root + * vdev's aux state as 'corrupt' rather than 'insufficient + * replicas'. + */ + if (corrupted && vd == rvd && + rvd->vdev_state == VDEV_STATE_CANT_OPEN) + vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + } + + if (vd->vdev_parent) + vdev_propagate_state(vd->vdev_parent); +} + +/* + * Set a vdev's state. 
If this is during an open, we don't update the parent + * state, because we're in the process of opening children depth-first. + * Otherwise, we propagate the change to the parent. + * + * If this routine places a device in a faulted state, an appropriate ereport is + * generated. + */ +void +vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) +{ + uint64_t save_state; + spa_t *spa = vd->vdev_spa; + + if (state == vd->vdev_state) { + vd->vdev_stat.vs_aux = aux; + return; + } + + save_state = vd->vdev_state; + + vd->vdev_state = state; + vd->vdev_stat.vs_aux = aux; + + /* + * If we are setting the vdev state to anything but an open state, then + * always close the underlying device unless the device has requested + * a delayed close (i.e. we're about to remove or fault the device). + * Otherwise, we keep accessible but invalid devices open forever. + * We don't call vdev_close() itself, because that implies some extra + * checks (offline, etc) that we don't want here. This is limited to + * leaf devices, because otherwise closing the device will affect other + * children. + */ + if (!vd->vdev_delayed_close && vdev_is_dead(vd) && + vd->vdev_ops->vdev_op_leaf) + vd->vdev_ops->vdev_op_close(vd); + + if (vd->vdev_removed && + state == VDEV_STATE_CANT_OPEN && + (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) { + /* + * If the previous state is set to VDEV_STATE_REMOVED, then this + * device was previously marked removed and someone attempted to + * reopen it. If this failed due to a nonexistent device, then + * keep the device in the REMOVED state. We also let this be if + * it is one of our special test online cases, which is only + * attempting to online the device and shouldn't generate an FMA + * fault. + */ + vd->vdev_state = VDEV_STATE_REMOVED; + vd->vdev_stat.vs_aux = VDEV_AUX_NONE; + } else if (state == VDEV_STATE_REMOVED) { + vd->vdev_removed = B_TRUE; + } else if (state == VDEV_STATE_CANT_OPEN) { + /* + * If we fail to open a vdev during an import or recovery, we + * mark it as "not available", which signifies that it was + * never there to begin with. Failure to open such a device + * is not considered an error. + */ + if ((spa_load_state(spa) == SPA_LOAD_IMPORT || + spa_load_state(spa) == SPA_LOAD_RECOVER) && + vd->vdev_ops->vdev_op_leaf) + vd->vdev_not_present = 1; + + /* + * Post the appropriate ereport. If the 'prevstate' field is + * set to something other than VDEV_STATE_UNKNOWN, it indicates + * that this is part of a vdev_reopen(). In this case, we don't + * want to post the ereport if the device was already in the + * CANT_OPEN state beforehand. + * + * If the 'checkremove' flag is set, then this is an attempt to + * online the device in response to an insertion event. If we + * hit this case, then we have detected an insertion event for a + * faulted or offline device that wasn't in the removed state. + * In this scenario, we don't post an ereport because we are + * about to replace the device, or attempt an online with + * vdev_forcefault, which will generate the fault for us. 
+ */
+ if ((vd->vdev_prevstate != state || vd->vdev_forcefault) &&
+ !vd->vdev_not_present && !vd->vdev_checkremove &&
+ vd != spa->spa_root_vdev) {
+ const char *class;
+
+ switch (aux) {
+ case VDEV_AUX_OPEN_FAILED:
+ class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED;
+ break;
+ case VDEV_AUX_CORRUPT_DATA:
+ class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA;
+ break;
+ case VDEV_AUX_NO_REPLICAS:
+ class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS;
+ break;
+ case VDEV_AUX_BAD_GUID_SUM:
+ class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM;
+ break;
+ case VDEV_AUX_TOO_SMALL:
+ class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL;
+ break;
+ case VDEV_AUX_BAD_LABEL:
+ class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL;
+ break;
+ default:
+ class = FM_EREPORT_ZFS_DEVICE_UNKNOWN;
+ }
+
+ zfs_ereport_post(class, spa, vd, NULL, save_state, 0);
+ }
+
+ /* Erase any notion of persistent removed state */
+ vd->vdev_removed = B_FALSE;
+ } else {
+ vd->vdev_removed = B_FALSE;
+ }
+
+ /*
+ * Notify the fmd of the state change. Be verbose and post
+ * notifications even for stuff that's not important; the fmd agent can
+ * sort it out. Don't emit state change events for non-leaf vdevs since
+ * they can't change state on their own. The FMD can check their state
+ * if it wants to when it sees that a leaf vdev had a state change.
+ */
+ if (vd->vdev_ops->vdev_op_leaf)
+ zfs_post_state_change(spa, vd);
+
+ if (!isopen && vd->vdev_parent)
+ vdev_propagate_state(vd->vdev_parent);
+}
+
+boolean_t
+vdev_children_are_offline(vdev_t *vd)
+{
+ ASSERT(!vd->vdev_ops->vdev_op_leaf);
+
+ for (uint64_t i = 0; i < vd->vdev_children; i++) {
+ if (vd->vdev_child[i]->vdev_state != VDEV_STATE_OFFLINE)
+ return (B_FALSE);
+ }
+
+ return (B_TRUE);
+}
+
+/*
+ * Check the vdev configuration to ensure that it's capable of supporting
+ * a root pool. We do not support partial configuration.
+ * In addition, only a single top-level vdev is allowed.
+ *
+ * FreeBSD does not have the above limitations.
+ */
+boolean_t
+vdev_is_bootable(vdev_t *vd)
+{
+#ifdef illumos
+ if (!vd->vdev_ops->vdev_op_leaf) {
+ char *vdev_type = vd->vdev_ops->vdev_op_type;
+
+ if (strcmp(vdev_type, VDEV_TYPE_ROOT) == 0 &&
+ vd->vdev_children > 1) {
+ return (B_FALSE);
+ } else if (strcmp(vdev_type, VDEV_TYPE_MISSING) == 0 ||
+ strcmp(vdev_type, VDEV_TYPE_INDIRECT) == 0) {
+ return (B_FALSE);
+ }
+ }
+
+ for (int c = 0; c < vd->vdev_children; c++) {
+ if (!vdev_is_bootable(vd->vdev_child[c]))
+ return (B_FALSE);
+ }
+#endif /* illumos */
+ return (B_TRUE);
+}
+
+boolean_t
+vdev_is_concrete(vdev_t *vd)
+{
+ vdev_ops_t *ops = vd->vdev_ops;
+ if (ops == &vdev_indirect_ops || ops == &vdev_hole_ops ||
+ ops == &vdev_missing_ops || ops == &vdev_root_ops) {
+ return (B_FALSE);
+ } else {
+ return (B_TRUE);
+ }
+}
+
+/*
+ * Determine if a log device has valid content. If the vdev was
+ * removed or faulted in the MOS config then we know that
+ * the content on the log device has already been written to the pool.
+ */
+boolean_t
+vdev_log_state_valid(vdev_t *vd)
+{
+ if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted &&
+ !vd->vdev_removed)
+ return (B_TRUE);
+
+ for (int c = 0; c < vd->vdev_children; c++)
+ if (vdev_log_state_valid(vd->vdev_child[c]))
+ return (B_TRUE);
+
+ return (B_FALSE);
+}
+
+/*
+ * Expand a vdev if possible. 
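+ * "Possible" means the top-level vdev's asize now covers more
+ * metaslabs than have been initialized; the new range is then
+ * instantiated and the config dirtied.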
+ */ +void +vdev_expand(vdev_t *vd, uint64_t txg) +{ + ASSERT(vd->vdev_top == vd); + ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); + ASSERT(vdev_is_concrete(vd)); + + vdev_set_deflate_ratio(vd); + + if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count) { + VERIFY(vdev_metaslab_init(vd, txg) == 0); + vdev_config_dirty(vd); + } +} + +/* + * Split a vdev. + */ +void +vdev_split(vdev_t *vd) +{ + vdev_t *cvd, *pvd = vd->vdev_parent; + + vdev_remove_child(pvd, vd); + vdev_compact_children(pvd); + + cvd = pvd->vdev_child[0]; + if (pvd->vdev_children == 1) { + vdev_remove_parent(cvd); + cvd->vdev_splitting = B_TRUE; + } + vdev_propagate_state(cvd); +} + +void +vdev_deadman(vdev_t *vd) +{ + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + vdev_deadman(cvd); + } + + if (vd->vdev_ops->vdev_op_leaf) { + vdev_queue_t *vq = &vd->vdev_queue; + + mutex_enter(&vq->vq_lock); + if (avl_numnodes(&vq->vq_active_tree) > 0) { + spa_t *spa = vd->vdev_spa; + zio_t *fio; + uint64_t delta; + + /* + * Look at the head of all the pending queues, + * if any I/O has been outstanding for longer than + * the spa_deadman_synctime we panic the system. + */ + fio = avl_first(&vq->vq_active_tree); + delta = gethrtime() - fio->io_timestamp; + if (delta > spa_deadman_synctime(spa)) { + vdev_dbgmsg(vd, "SLOW IO: zio timestamp " + "%lluns, delta %lluns, last io %lluns", + fio->io_timestamp, (u_longlong_t)delta, + vq->vq_io_complete_ts); + fm_panic("I/O to pool '%s' appears to be " + "hung on vdev guid %llu at '%s'.", + spa_name(spa), + (long long unsigned int) vd->vdev_guid, + vd->vdev_path); + } + } + mutex_exit(&vq->vq_lock); + } +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c new file mode 100644 index 000000000000..ef2574b915f5 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c @@ -0,0 +1,438 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright (c) 2013, 2017 by Delphix. All rights reserved. + */ + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/vdev_impl.h> +#include <sys/zio.h> +#include <sys/kstat.h> +#include <sys/abd.h> + +/* + * Virtual device read-ahead caching. + * + * This file implements a simple LRU read-ahead cache. When the DMU reads + * a given block, it will often want other, nearby blocks soon thereafter. + * We take advantage of this by reading a larger disk region and caching + * the result. 
In the best case, this can turn 128 back-to-back 512-byte
+ * reads into a single 64k read followed by 127 cache hits; this reduces
+ * latency dramatically.  In the worst case, it can turn an isolated 512-byte
+ * read into a 64k read, which doesn't affect latency all that much but is
+ * terribly wasteful of bandwidth.  A more intelligent version of the cache
+ * could keep track of access patterns and not do read-ahead unless it sees
+ * at least two temporally close I/Os to the same region.  Currently, only
+ * metadata I/O is inflated.  A further enhancement could take advantage of
+ * more semantic information about the I/O.  And it could use something
+ * faster than an AVL tree; that was chosen solely for convenience.
+ *
+ * There are five cache operations:  allocate, fill, read, write, evict.
+ *
+ * (1) Allocate.  This reserves a cache entry for the specified region.
+ *     We separate the allocate and fill operations so that multiple threads
+ *     don't generate I/O for the same cache miss.
+ *
+ * (2) Fill.  When the I/O for a cache miss completes, the fill routine
+ *     places the data in the previously allocated cache entry.
+ *
+ * (3) Read.  Read data from the cache.
+ *
+ * (4) Write.  Update cache contents after write completion.
+ *
+ * (5) Evict.  When allocating a new entry, we evict the oldest (LRU) entry
+ *     if the total cache size exceeds zfs_vdev_cache_size.
+ */
+
+/*
+ * These tunables are for performance analysis.
+ */
+/*
+ * All i/os smaller than zfs_vdev_cache_max will be turned into
+ * 1<<zfs_vdev_cache_bshift byte reads by the vdev_cache (aka software
+ * track buffer).  At most zfs_vdev_cache_size bytes will be kept in each
+ * vdev's vdev_cache.
+ *
+ * TODO: Note that with the current ZFS code, it turns out that the
+ * vdev cache is not helpful, and in some cases actually harmful.  It
+ * is better if we disable this.  Once some time has passed, we should
+ * actually remove this to simplify the code.  For now we just disable
+ * it by setting the zfs_vdev_cache_size to zero.  Note that Solaris 11
+ * has made these same changes.
+ */
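A minimal userland sketch of the alignment arithmetic behind this scheme, with P2ALIGN/P2PHASE written out as masks and the 64KB line size taken from the comment above (the macro name VCBS_EXAMPLE is local to the sketch):

#include <stdint.h>
#include <stdio.h>

#define	VCBS_EXAMPLE	(1ULL << 16)	/* 64KB cache line, per the comment above */

int
main(void)
{
	uint64_t offset = 1234567;	/* an arbitrary small read */
	uint64_t base = offset & ~(VCBS_EXAMPLE - 1);	/* P2ALIGN(offset, VCBS) */
	uint64_t phase = offset & (VCBS_EXAMPLE - 1);	/* P2PHASE(offset, VCBS) */

	/*
	 * Every 512-byte read whose offset falls in [base, base + 64KB)
	 * maps to the same cache line, so one 64KB device read can
	 * satisfy up to 128 of them.
	 */
	printf("line base %llu, phase %llu\n",
	    (unsigned long long)base, (unsigned long long)phase);
	return (0);
}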
+int zfs_vdev_cache_max = 1<<14;			/* 16KB */
+int zfs_vdev_cache_size = 0;
+int zfs_vdev_cache_bshift = 16;
+
+#define	VCBS (1 << zfs_vdev_cache_bshift)	/* 64KB */
+
+SYSCTL_DECL(_vfs_zfs_vdev);
+SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, cache, CTLFLAG_RW, 0, "ZFS VDEV Cache");
+SYSCTL_INT(_vfs_zfs_vdev_cache, OID_AUTO, max, CTLFLAG_RDTUN,
+    &zfs_vdev_cache_max, 0, "Maximum I/O request size that increases read size");
+SYSCTL_INT(_vfs_zfs_vdev_cache, OID_AUTO, size, CTLFLAG_RDTUN,
+    &zfs_vdev_cache_size, 0, "Size of VDEV cache");
+SYSCTL_INT(_vfs_zfs_vdev_cache, OID_AUTO, bshift, CTLFLAG_RDTUN,
+    &zfs_vdev_cache_bshift, 0, "Turn too-small requests into 1 << this value");
+
+kstat_t	*vdc_ksp = NULL;
+
+typedef struct vdc_stats {
+	kstat_named_t vdc_stat_delegations;
+	kstat_named_t vdc_stat_hits;
+	kstat_named_t vdc_stat_misses;
+} vdc_stats_t;
+
+static vdc_stats_t vdc_stats = {
+	{ "delegations",	KSTAT_DATA_UINT64 },
+	{ "hits",		KSTAT_DATA_UINT64 },
+	{ "misses",		KSTAT_DATA_UINT64 }
+};
+
+#define	VDCSTAT_BUMP(stat)	atomic_inc_64(&vdc_stats.stat.value.ui64);
+
+static int
+vdev_cache_offset_compare(const void *a1, const void *a2)
+{
+	const vdev_cache_entry_t *ve1 = a1;
+	const vdev_cache_entry_t *ve2 = a2;
+
+	if (ve1->ve_offset < ve2->ve_offset)
+		return (-1);
+	if (ve1->ve_offset > ve2->ve_offset)
+		return (1);
+	return (0);
+}
+
+static int
+vdev_cache_lastused_compare(const void *a1, const void *a2)
+{
+	const vdev_cache_entry_t *ve1 = a1;
+	const vdev_cache_entry_t *ve2 = a2;
+
+	if (ve1->ve_lastused < ve2->ve_lastused)
+		return (-1);
+	if (ve1->ve_lastused > ve2->ve_lastused)
+		return (1);
+
+	/*
+	 * Among equally old entries, sort by offset to ensure uniqueness.
+	 */
+	return (vdev_cache_offset_compare(a1, a2));
+}
+
+/*
+ * Evict the specified entry from the cache.
+ */
+static void
+vdev_cache_evict(vdev_cache_t *vc, vdev_cache_entry_t *ve)
+{
+	ASSERT(MUTEX_HELD(&vc->vc_lock));
+	ASSERT3P(ve->ve_fill_io, ==, NULL);
+	ASSERT3P(ve->ve_abd, !=, NULL);
+
+	avl_remove(&vc->vc_lastused_tree, ve);
+	avl_remove(&vc->vc_offset_tree, ve);
+	abd_free(ve->ve_abd);
+	kmem_free(ve, sizeof (vdev_cache_entry_t));
+}
+
+/*
+ * Allocate an entry in the cache.  At this point we don't have the data,
+ * we're just creating a placeholder so that multiple threads don't all
+ * go off and read the same blocks.
+ */
+static vdev_cache_entry_t *
+vdev_cache_allocate(zio_t *zio)
+{
+	vdev_cache_t *vc = &zio->io_vd->vdev_cache;
+	uint64_t offset = P2ALIGN(zio->io_offset, VCBS);
+	vdev_cache_entry_t *ve;
+
+	ASSERT(MUTEX_HELD(&vc->vc_lock));
+
+	if (zfs_vdev_cache_size == 0)
+		return (NULL);
+
+	/*
+	 * If adding a new entry would exceed the cache size,
+	 * evict the oldest entry (LRU).
+	 */
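The size test that follows treats the entry count as a byte footprint by shifting it by zfs_vdev_cache_bshift. A small standalone sketch of that accounting, with assumed sample values:

#include <stdint.h>
#include <stdio.h>

/* Sketch of the cache-size test used below; values are made up. */
static int
cache_is_full(uint64_t numnodes, int bshift, uint64_t cache_size)
{
	return ((numnodes << bshift) > cache_size);
}

int
main(void)
{
	/* Ten resident entries at bshift 16 account for 640KB. */
	printf("%d\n", cache_is_full(10, 16, 10ULL << 20));	/* 0: under 10MB */
	printf("%d\n", cache_is_full(10, 16, 512ULL << 10));	/* 1: over 512KB */
	return (0);
}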
+	if ((avl_numnodes(&vc->vc_lastused_tree) << zfs_vdev_cache_bshift) >
+	    zfs_vdev_cache_size) {
+		ve = avl_first(&vc->vc_lastused_tree);
+		if (ve->ve_fill_io != NULL)
+			return (NULL);
+		ASSERT3U(ve->ve_hits, !=, 0);
+		vdev_cache_evict(vc, ve);
+	}
+
+	ve = kmem_zalloc(sizeof (vdev_cache_entry_t), KM_SLEEP);
+	ve->ve_offset = offset;
+	ve->ve_lastused = ddi_get_lbolt();
+	ve->ve_abd = abd_alloc_for_io(VCBS, B_TRUE);
+
+	avl_add(&vc->vc_offset_tree, ve);
+	avl_add(&vc->vc_lastused_tree, ve);
+
+	return (ve);
+}
+
+static void
+vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio)
+{
+	uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS);
+
+	ASSERT(MUTEX_HELD(&vc->vc_lock));
+	ASSERT3P(ve->ve_fill_io, ==, NULL);
+
+	if (ve->ve_lastused != ddi_get_lbolt()) {
+		avl_remove(&vc->vc_lastused_tree, ve);
+		ve->ve_lastused = ddi_get_lbolt();
+		avl_add(&vc->vc_lastused_tree, ve);
+	}
+
+	ve->ve_hits++;
+	abd_copy_off(zio->io_abd, ve->ve_abd, 0, cache_phase, zio->io_size);
+}
+
+/*
+ * Fill a previously allocated cache entry with data.
+ */
+static void
+vdev_cache_fill(zio_t *fio)
+{
+	vdev_t *vd = fio->io_vd;
+	vdev_cache_t *vc = &vd->vdev_cache;
+	vdev_cache_entry_t *ve = fio->io_private;
+	zio_t *pio;
+
+	ASSERT3U(fio->io_size, ==, VCBS);
+
+	/*
+	 * Add data to the cache.
+	 */
+	mutex_enter(&vc->vc_lock);
+
+	ASSERT3P(ve->ve_fill_io, ==, fio);
+	ASSERT3U(ve->ve_offset, ==, fio->io_offset);
+	ASSERT3P(ve->ve_abd, ==, fio->io_abd);
+
+	ve->ve_fill_io = NULL;
+
+	/*
+	 * Even if this cache line was invalidated by a missed write update,
+	 * any reads that were queued up before the missed update are still
+	 * valid, so we can satisfy them from this line before we evict it.
+	 */
+	zio_link_t *zl = NULL;
+	while ((pio = zio_walk_parents(fio, &zl)) != NULL)
+		vdev_cache_hit(vc, ve, pio);
+
+	if (fio->io_error || ve->ve_missed_update)
+		vdev_cache_evict(vc, ve);
+
+	mutex_exit(&vc->vc_lock);
+}
+
+/*
+ * Read data from the cache.  Returns B_TRUE on a cache hit, B_FALSE on a miss.
+ */
+boolean_t
+vdev_cache_read(zio_t *zio)
+{
+	vdev_cache_t *vc = &zio->io_vd->vdev_cache;
+	vdev_cache_entry_t *ve, ve_search;
+	uint64_t cache_offset = P2ALIGN(zio->io_offset, VCBS);
+	uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS);
+	zio_t *fio;
+
+	ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
+
+	if (zio->io_flags & ZIO_FLAG_DONT_CACHE)
+		return (B_FALSE);
+
+	if (zio->io_size > zfs_vdev_cache_max)
+		return (B_FALSE);
+
+	/*
+	 * If the I/O straddles two or more cache blocks, don't cache it.
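A standalone sketch of the straddle test, assuming P2BOUNDARY has its usual sysmacros.h form: the I/O crosses a line boundary exactly when the first and last byte addresses differ above the low log2(align) bits.

#include <stdint.h>
#include <assert.h>

/* Straddle test written out (assumed sysmacros.h P2BOUNDARY form). */
static int
straddles(uint64_t off, uint64_t len, uint64_t align)
{
	return ((off ^ (off + len - 1)) > align - 1);
}

int
main(void)
{
	assert(!straddles(0, 65536, 65536));	/* exactly one 64KB line */
	assert(straddles(65535, 2, 65536));	/* crosses at the 64KB mark */
	assert(!straddles(4096, 512, 65536));	/* well inside one line */
	return (0);
}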
+ */ + if (P2BOUNDARY(zio->io_offset, zio->io_size, VCBS)) + return (B_FALSE); + + ASSERT3U(cache_phase + zio->io_size, <=, VCBS); + + mutex_enter(&vc->vc_lock); + + ve_search.ve_offset = cache_offset; + ve = avl_find(&vc->vc_offset_tree, &ve_search, NULL); + + if (ve != NULL) { + if (ve->ve_missed_update) { + mutex_exit(&vc->vc_lock); + return (B_FALSE); + } + + if ((fio = ve->ve_fill_io) != NULL) { + zio_vdev_io_bypass(zio); + zio_add_child(zio, fio); + mutex_exit(&vc->vc_lock); + VDCSTAT_BUMP(vdc_stat_delegations); + return (B_TRUE); + } + + vdev_cache_hit(vc, ve, zio); + zio_vdev_io_bypass(zio); + + mutex_exit(&vc->vc_lock); + VDCSTAT_BUMP(vdc_stat_hits); + return (B_TRUE); + } + + ve = vdev_cache_allocate(zio); + + if (ve == NULL) { + mutex_exit(&vc->vc_lock); + return (B_FALSE); + } + + fio = zio_vdev_delegated_io(zio->io_vd, cache_offset, + ve->ve_abd, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_NOW, + ZIO_FLAG_DONT_CACHE, vdev_cache_fill, ve); + + ve->ve_fill_io = fio; + zio_vdev_io_bypass(zio); + zio_add_child(zio, fio); + + mutex_exit(&vc->vc_lock); + zio_nowait(fio); + VDCSTAT_BUMP(vdc_stat_misses); + + return (B_TRUE); +} + +/* + * Update cache contents upon write completion. + */ +void +vdev_cache_write(zio_t *zio) +{ + vdev_cache_t *vc = &zio->io_vd->vdev_cache; + vdev_cache_entry_t *ve, ve_search; + uint64_t io_start = zio->io_offset; + uint64_t io_end = io_start + zio->io_size; + uint64_t min_offset = P2ALIGN(io_start, VCBS); + uint64_t max_offset = P2ROUNDUP(io_end, VCBS); + avl_index_t where; + + ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); + + mutex_enter(&vc->vc_lock); + + ve_search.ve_offset = min_offset; + ve = avl_find(&vc->vc_offset_tree, &ve_search, &where); + + if (ve == NULL) + ve = avl_nearest(&vc->vc_offset_tree, where, AVL_AFTER); + + while (ve != NULL && ve->ve_offset < max_offset) { + uint64_t start = MAX(ve->ve_offset, io_start); + uint64_t end = MIN(ve->ve_offset + VCBS, io_end); + + if (ve->ve_fill_io != NULL) { + ve->ve_missed_update = 1; + } else { + abd_copy_off(ve->ve_abd, zio->io_abd, + start - ve->ve_offset, start - io_start, + end - start); + } + ve = AVL_NEXT(&vc->vc_offset_tree, ve); + } + mutex_exit(&vc->vc_lock); +} + +void +vdev_cache_purge(vdev_t *vd) +{ + vdev_cache_t *vc = &vd->vdev_cache; + vdev_cache_entry_t *ve; + + mutex_enter(&vc->vc_lock); + while ((ve = avl_first(&vc->vc_offset_tree)) != NULL) + vdev_cache_evict(vc, ve); + mutex_exit(&vc->vc_lock); +} + +void +vdev_cache_init(vdev_t *vd) +{ + vdev_cache_t *vc = &vd->vdev_cache; + + mutex_init(&vc->vc_lock, NULL, MUTEX_DEFAULT, NULL); + + avl_create(&vc->vc_offset_tree, vdev_cache_offset_compare, + sizeof (vdev_cache_entry_t), + offsetof(struct vdev_cache_entry, ve_offset_node)); + + avl_create(&vc->vc_lastused_tree, vdev_cache_lastused_compare, + sizeof (vdev_cache_entry_t), + offsetof(struct vdev_cache_entry, ve_lastused_node)); +} + +void +vdev_cache_fini(vdev_t *vd) +{ + vdev_cache_t *vc = &vd->vdev_cache; + + vdev_cache_purge(vd); + + avl_destroy(&vc->vc_offset_tree); + avl_destroy(&vc->vc_lastused_tree); + + mutex_destroy(&vc->vc_lock); +} + +void +vdev_cache_stat_init(void) +{ + vdc_ksp = kstat_create("zfs", 0, "vdev_cache_stats", "misc", + KSTAT_TYPE_NAMED, sizeof (vdc_stats) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + if (vdc_ksp != NULL) { + vdc_ksp->ks_data = &vdc_stats; + kstat_install(vdc_ksp); + } +} + +void +vdev_cache_stat_fini(void) +{ + if (vdc_ksp != NULL) { + kstat_delete(vdc_ksp); + vdc_ksp = NULL; + } +} diff --git 
a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c new file mode 100644 index 000000000000..be24cde54b9b --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c @@ -0,0 +1,926 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2016 by Delphix. All rights reserved. + * Copyright 2016 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2013 Joyent, Inc. All rights reserved. + */ + +#include <sys/zfs_context.h> +#include <sys/spa_impl.h> +#include <sys/refcount.h> +#include <sys/vdev_disk.h> +#include <sys/vdev_impl.h> +#include <sys/abd.h> +#include <sys/fs/zfs.h> +#include <sys/zio.h> +#include <sys/sunldi.h> +#include <sys/efi_partition.h> +#include <sys/fm/fs/zfs.h> + +/* + * Virtual device vector for disks. + */ + +extern ldi_ident_t zfs_li; + +static void vdev_disk_close(vdev_t *); + +typedef struct vdev_disk_ldi_cb { + list_node_t lcb_next; + ldi_callback_id_t lcb_id; +} vdev_disk_ldi_cb_t; + +static void +vdev_disk_alloc(vdev_t *vd) +{ + vdev_disk_t *dvd; + + dvd = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP); + /* + * Create the LDI event callback list. + */ + list_create(&dvd->vd_ldi_cbs, sizeof (vdev_disk_ldi_cb_t), + offsetof(vdev_disk_ldi_cb_t, lcb_next)); +} + +static void +vdev_disk_free(vdev_t *vd) +{ + vdev_disk_t *dvd = vd->vdev_tsd; + vdev_disk_ldi_cb_t *lcb; + + if (dvd == NULL) + return; + + /* + * We have already closed the LDI handle. Clean up the LDI event + * callbacks and free vd->vdev_tsd. + */ + while ((lcb = list_head(&dvd->vd_ldi_cbs)) != NULL) { + list_remove(&dvd->vd_ldi_cbs, lcb); + (void) ldi_ev_remove_callbacks(lcb->lcb_id); + kmem_free(lcb, sizeof (vdev_disk_ldi_cb_t)); + } + list_destroy(&dvd->vd_ldi_cbs); + kmem_free(dvd, sizeof (vdev_disk_t)); + vd->vdev_tsd = NULL; +} + +/* ARGSUSED */ +static int +vdev_disk_off_notify(ldi_handle_t lh, ldi_ev_cookie_t ecookie, void *arg, + void *ev_data) +{ + vdev_t *vd = (vdev_t *)arg; + vdev_disk_t *dvd = vd->vdev_tsd; + + /* + * Ignore events other than offline. + */ + if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_OFFLINE) != 0) + return (LDI_EV_SUCCESS); + + /* + * All LDI handles must be closed for the state change to succeed, so + * call on vdev_disk_close() to do this. + * + * We inform vdev_disk_close that it is being called from offline + * notify context so it will defer cleanup of LDI event callbacks and + * freeing of vd->vdev_tsd to the offline finalize or a reopen. 
+ */ + dvd->vd_ldi_offline = B_TRUE; + vdev_disk_close(vd); + + /* + * Now that the device is closed, request that the spa_async_thread + * mark the device as REMOVED and notify FMA of the removal. + */ + zfs_post_remove(vd->vdev_spa, vd); + vd->vdev_remove_wanted = B_TRUE; + spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE); + + return (LDI_EV_SUCCESS); +} + +/* ARGSUSED */ +static void +vdev_disk_off_finalize(ldi_handle_t lh, ldi_ev_cookie_t ecookie, + int ldi_result, void *arg, void *ev_data) +{ + vdev_t *vd = (vdev_t *)arg; + + /* + * Ignore events other than offline. + */ + if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_OFFLINE) != 0) + return; + + /* + * We have already closed the LDI handle in notify. + * Clean up the LDI event callbacks and free vd->vdev_tsd. + */ + vdev_disk_free(vd); + + /* + * Request that the vdev be reopened if the offline state change was + * unsuccessful. + */ + if (ldi_result != LDI_EV_SUCCESS) { + vd->vdev_probe_wanted = B_TRUE; + spa_async_request(vd->vdev_spa, SPA_ASYNC_PROBE); + } +} + +static ldi_ev_callback_t vdev_disk_off_callb = { + .cb_vers = LDI_EV_CB_VERS, + .cb_notify = vdev_disk_off_notify, + .cb_finalize = vdev_disk_off_finalize +}; + +/* ARGSUSED */ +static void +vdev_disk_dgrd_finalize(ldi_handle_t lh, ldi_ev_cookie_t ecookie, + int ldi_result, void *arg, void *ev_data) +{ + vdev_t *vd = (vdev_t *)arg; + + /* + * Ignore events other than degrade. + */ + if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_DEGRADE) != 0) + return; + + /* + * Degrade events always succeed. Mark the vdev as degraded. + * This status is purely informative for the user. + */ + (void) vdev_degrade(vd->vdev_spa, vd->vdev_guid, 0); +} + +static ldi_ev_callback_t vdev_disk_dgrd_callb = { + .cb_vers = LDI_EV_CB_VERS, + .cb_notify = NULL, + .cb_finalize = vdev_disk_dgrd_finalize +}; + +static void +vdev_disk_hold(vdev_t *vd) +{ + ddi_devid_t devid; + char *minor; + + ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER)); + + /* + * We must have a pathname, and it must be absolute. + */ + if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') + return; + + /* + * Only prefetch path and devid info if the device has + * never been opened. + */ + if (vd->vdev_tsd != NULL) + return; + + if (vd->vdev_wholedisk == -1ULL) { + size_t len = strlen(vd->vdev_path) + 3; + char *buf = kmem_alloc(len, KM_SLEEP); + + (void) snprintf(buf, len, "%ss0", vd->vdev_path); + + (void) ldi_vp_from_name(buf, &vd->vdev_name_vp); + kmem_free(buf, len); + } + + if (vd->vdev_name_vp == NULL) + (void) ldi_vp_from_name(vd->vdev_path, &vd->vdev_name_vp); + + if (vd->vdev_devid != NULL && + ddi_devid_str_decode(vd->vdev_devid, &devid, &minor) == 0) { + (void) ldi_vp_from_devid(devid, minor, &vd->vdev_devid_vp); + ddi_devid_str_free(minor); + ddi_devid_free(devid); + } +} + +static void +vdev_disk_rele(vdev_t *vd) +{ + ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER)); + + if (vd->vdev_name_vp) { + VN_RELE_ASYNC(vd->vdev_name_vp, + dsl_pool_vnrele_taskq(vd->vdev_spa->spa_dsl_pool)); + vd->vdev_name_vp = NULL; + } + if (vd->vdev_devid_vp) { + VN_RELE_ASYNC(vd->vdev_devid_vp, + dsl_pool_vnrele_taskq(vd->vdev_spa->spa_dsl_pool)); + vd->vdev_devid_vp = NULL; + } +} + +/* + * We want to be loud in DEBUG kernels when DKIOCGMEDIAINFOEXT fails, or when + * even a fallback to DKIOCGMEDIAINFO fails. + */ +#ifdef DEBUG +#define VDEV_DEBUG(...) cmn_err(CE_NOTE, __VA_ARGS__) +#else +#define VDEV_DEBUG(...) /* Nothing... 
 */
+#endif
+
+static int
+vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
+    uint64_t *ashift)
+{
+	spa_t *spa = vd->vdev_spa;
+	vdev_disk_t *dvd = vd->vdev_tsd;
+	ldi_ev_cookie_t ecookie;
+	vdev_disk_ldi_cb_t *lcb;
+	union {
+		struct dk_minfo_ext ude;
+		struct dk_minfo ud;
+	} dks;
+	struct dk_minfo_ext *dkmext = &dks.ude;
+	struct dk_minfo *dkm = &dks.ud;
+	int error;
+	dev_t dev;
+	int otyp;
+	boolean_t validate_devid = B_FALSE;
+	ddi_devid_t devid;
+	uint64_t capacity = 0, blksz = 0, pbsize;
+
+	/*
+	 * We must have a pathname, and it must be absolute.
+	 */
+	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
+		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+		return (SET_ERROR(EINVAL));
+	}
+
+	/*
+	 * Reopen the device if it's not currently open.  Otherwise,
+	 * just update the physical size of the device.
+	 */
+	if (dvd != NULL) {
+		if (dvd->vd_ldi_offline && dvd->vd_lh == NULL) {
+			/*
+			 * If we are opening a device in its offline notify
+			 * context, the LDI handle was just closed.  Clean
+			 * up the LDI event callbacks and free vd->vdev_tsd.
+			 */
+			vdev_disk_free(vd);
+		} else {
+			ASSERT(vd->vdev_reopening);
+			goto skip_open;
+		}
+	}
+
+	/*
+	 * Create vd->vdev_tsd.
+	 */
+	vdev_disk_alloc(vd);
+	dvd = vd->vdev_tsd;
+
+	/*
+	 * When opening a disk device, we want to preserve the user's original
+	 * intent.  We always want to open the device by the path the user gave
+	 * us, even if it is one of multiple paths to the same device.  But we
+	 * also want to be able to survive disks being removed/recabled.
+	 * Therefore the sequence of opening devices is:
+	 *
+	 * 1. Try opening the device by path.  For legacy pools without the
+	 *    'whole_disk' property, attempt to fix the path by appending 's0'.
+	 *
+	 * 2. If the devid of the device matches the stored value, return
+	 *    success.
+	 *
+	 * 3. Otherwise, the device may have moved.  Try opening the device
+	 *    by the devid instead.
+	 */
+	if (vd->vdev_devid != NULL) {
+		if (ddi_devid_str_decode(vd->vdev_devid, &dvd->vd_devid,
+		    &dvd->vd_minor) != 0) {
+			vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+			vdev_dbgmsg(vd, "vdev_disk_open: invalid "
+			    "vdev_devid '%s'", vd->vdev_devid);
+			return (SET_ERROR(EINVAL));
+		}
+	}
+
+	error = EINVAL;		/* presume failure */
+
+	if (vd->vdev_path != NULL) {
+
+		if (vd->vdev_wholedisk == -1ULL) {
+			size_t len = strlen(vd->vdev_path) + 3;
+			char *buf = kmem_alloc(len, KM_SLEEP);
+
+			(void) snprintf(buf, len, "%ss0", vd->vdev_path);
+
+			error = ldi_open_by_name(buf, spa_mode(spa), kcred,
+			    &dvd->vd_lh, zfs_li);
+			if (error == 0) {
+				spa_strfree(vd->vdev_path);
+				vd->vdev_path = buf;
+				vd->vdev_wholedisk = 1ULL;
+			} else {
+				kmem_free(buf, len);
+			}
+		}
+
+		/*
+		 * If we have not yet opened the device, try to open it by the
+		 * specified path.
+		 */
+		if (error != 0) {
+			error = ldi_open_by_name(vd->vdev_path, spa_mode(spa),
+			    kcred, &dvd->vd_lh, zfs_li);
+		}
+
+		/*
+		 * Compare the devid to the stored value.
+		 */
+		if (error == 0 && vd->vdev_devid != NULL &&
+		    ldi_get_devid(dvd->vd_lh, &devid) == 0) {
+			if (ddi_devid_compare(devid, dvd->vd_devid) != 0) {
+				error = SET_ERROR(EINVAL);
+				(void) ldi_close(dvd->vd_lh, spa_mode(spa),
+				    kcred);
+				dvd->vd_lh = NULL;
+			}
+			ddi_devid_free(devid);
+		}
+
+		/*
+		 * If we succeeded in opening the device, but 'vdev_wholedisk'
+		 * is not yet set, then this must be a slice.
+		 */
+		if (error == 0 && vd->vdev_wholedisk == -1ULL)
+			vd->vdev_wholedisk = 0;
+	}
+
+	/*
+	 * If we were unable to open by path, or the devid check fails, open by
+	 * devid instead.
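The three-step strategy above, condensed into a toy outline. All names here (struct toy_disk, open_disk_sketch, and its fields) are hypothetical stand-ins for the LDI calls; only the ordering is the point.

#include <errno.h>
#include <stdio.h>

/* Hypothetical stand-ins; this does not call the real LDI interfaces. */
struct toy_disk {
	int	open_by_path_errno;	/* result of the path-based open */
	int	devid_matches;		/* does the opened disk carry our devid? */
	int	open_by_devid_errno;	/* result of the devid-based open */
};

static int
open_disk_sketch(const struct toy_disk *d)
{
	int error;

	/* 1. Open by the user-supplied path (with the legacy "s0" fixup). */
	error = d->open_by_path_errno;

	/* 2. A successful open must still carry the devid we remembered. */
	if (error == 0 && !d->devid_matches)
		error = EINVAL;		/* some other disk answers to this path */

	/* 3. The disk may have moved: fall back to opening it by devid. */
	if (error != 0)
		error = d->open_by_devid_errno;

	return (error);
}

int
main(void)
{
	struct toy_disk moved = { 0, 0, 0 };	/* path reused by another disk */

	printf("moved disk -> %d (0 == recovered via devid)\n",
	    open_disk_sketch(&moved));
	return (0);
}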
+ */ + if (error != 0 && vd->vdev_devid != NULL) { + error = ldi_open_by_devid(dvd->vd_devid, dvd->vd_minor, + spa_mode(spa), kcred, &dvd->vd_lh, zfs_li); + } + + /* + * If all else fails, then try opening by physical path (if available) + * or the logical path (if we failed due to the devid check). While not + * as reliable as the devid, this will give us something, and the higher + * level vdev validation will prevent us from opening the wrong device. + */ + if (error) { + if (vd->vdev_devid != NULL) + validate_devid = B_TRUE; + + if (vd->vdev_physpath != NULL && + (dev = ddi_pathname_to_dev_t(vd->vdev_physpath)) != NODEV) + error = ldi_open_by_dev(&dev, OTYP_BLK, spa_mode(spa), + kcred, &dvd->vd_lh, zfs_li); + + /* + * Note that we don't support the legacy auto-wholedisk support + * as above. This hasn't been used in a very long time and we + * don't need to propagate its oddities to this edge condition. + */ + if (error && vd->vdev_path != NULL) + error = ldi_open_by_name(vd->vdev_path, spa_mode(spa), + kcred, &dvd->vd_lh, zfs_li); + } + + if (error) { + vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; + vdev_dbgmsg(vd, "vdev_disk_open: failed to open [error=%d]", + error); + return (error); + } + + /* + * Now that the device has been successfully opened, update the devid + * if necessary. + */ + if (validate_devid && spa_writeable(spa) && + ldi_get_devid(dvd->vd_lh, &devid) == 0) { + if (ddi_devid_compare(devid, dvd->vd_devid) != 0) { + char *vd_devid; + + vd_devid = ddi_devid_str_encode(devid, dvd->vd_minor); + vdev_dbgmsg(vd, "vdev_disk_open: update devid from " + "'%s' to '%s'", vd->vdev_devid, vd_devid); + spa_strfree(vd->vdev_devid); + vd->vdev_devid = spa_strdup(vd_devid); + ddi_devid_str_free(vd_devid); + } + ddi_devid_free(devid); + } + + /* + * Once a device is opened, verify that the physical device path (if + * available) is up to date. + */ + if (ldi_get_dev(dvd->vd_lh, &dev) == 0 && + ldi_get_otyp(dvd->vd_lh, &otyp) == 0) { + char *physpath, *minorname; + + physpath = kmem_alloc(MAXPATHLEN, KM_SLEEP); + minorname = NULL; + if (ddi_dev_pathname(dev, otyp, physpath) == 0 && + ldi_get_minor_name(dvd->vd_lh, &minorname) == 0 && + (vd->vdev_physpath == NULL || + strcmp(vd->vdev_physpath, physpath) != 0)) { + if (vd->vdev_physpath) + spa_strfree(vd->vdev_physpath); + (void) strlcat(physpath, ":", MAXPATHLEN); + (void) strlcat(physpath, minorname, MAXPATHLEN); + vd->vdev_physpath = spa_strdup(physpath); + } + if (minorname) + kmem_free(minorname, strlen(minorname) + 1); + kmem_free(physpath, MAXPATHLEN); + } + + /* + * Register callbacks for the LDI offline event. + */ + if (ldi_ev_get_cookie(dvd->vd_lh, LDI_EV_OFFLINE, &ecookie) == + LDI_EV_SUCCESS) { + lcb = kmem_zalloc(sizeof (vdev_disk_ldi_cb_t), KM_SLEEP); + list_insert_tail(&dvd->vd_ldi_cbs, lcb); + (void) ldi_ev_register_callbacks(dvd->vd_lh, ecookie, + &vdev_disk_off_callb, (void *) vd, &lcb->lcb_id); + } + + /* + * Register callbacks for the LDI degrade event. + */ + if (ldi_ev_get_cookie(dvd->vd_lh, LDI_EV_DEGRADE, &ecookie) == + LDI_EV_SUCCESS) { + lcb = kmem_zalloc(sizeof (vdev_disk_ldi_cb_t), KM_SLEEP); + list_insert_tail(&dvd->vd_ldi_cbs, lcb); + (void) ldi_ev_register_callbacks(dvd->vd_lh, ecookie, + &vdev_disk_dgrd_callb, (void *) vd, &lcb->lcb_id); + } +skip_open: + /* + * Determine the actual size of the device. 
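The code below derives ashift as the log2 of the larger of the reported physical block size and SPA_MINBLOCKSIZE (512 bytes). A standalone sketch of that derivation, with highbit64() reimplemented locally so the example compiles on its own:

#include <stdint.h>
#include <assert.h>

/* Local stand-in for highbit64(): 1-based index of the highest set bit. */
static int
highbit64_sketch(uint64_t v)
{
	int b = 0;

	while (v != 0) {
		b++;
		v >>= 1;
	}
	return (b);
}

int
main(void)
{
	/* 512-byte sectors -> ashift 9; 4KB physical sectors -> ashift 12 */
	assert(highbit64_sketch(512) - 1 == 9);
	assert(highbit64_sketch(4096) - 1 == 12);
	return (0);
}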
+ */ + if (ldi_get_size(dvd->vd_lh, psize) != 0) { + vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; + vdev_dbgmsg(vd, "vdev_disk_open: failed to get size"); + return (SET_ERROR(EINVAL)); + } + + *max_psize = *psize; + + /* + * Determine the device's minimum transfer size. + * If the ioctl isn't supported, assume DEV_BSIZE. + */ + if ((error = ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFOEXT, + (intptr_t)dkmext, FKIOCTL, kcred, NULL)) == 0) { + capacity = dkmext->dki_capacity - 1; + blksz = dkmext->dki_lbsize; + pbsize = dkmext->dki_pbsize; + } else if ((error = ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFO, + (intptr_t)dkm, FKIOCTL, kcred, NULL)) == 0) { + VDEV_DEBUG( + "vdev_disk_open(\"%s\"): fallback to DKIOCGMEDIAINFO\n", + vd->vdev_path); + capacity = dkm->dki_capacity - 1; + blksz = dkm->dki_lbsize; + pbsize = blksz; + } else { + VDEV_DEBUG("vdev_disk_open(\"%s\"): " + "both DKIOCGMEDIAINFO{,EXT} calls failed, %d\n", + vd->vdev_path, error); + pbsize = DEV_BSIZE; + } + + *ashift = highbit64(MAX(pbsize, SPA_MINBLOCKSIZE)) - 1; + + if (vd->vdev_wholedisk == 1) { + int wce = 1; + + if (error == 0) { + /* + * If we have the capability to expand, we'd have + * found out via success from DKIOCGMEDIAINFO{,EXT}. + * Adjust max_psize upward accordingly since we know + * we own the whole disk now. + */ + *max_psize = capacity * blksz; + } + + /* + * Since we own the whole disk, try to enable disk write + * caching. We ignore errors because it's OK if we can't do it. + */ + (void) ldi_ioctl(dvd->vd_lh, DKIOCSETWCE, (intptr_t)&wce, + FKIOCTL, kcred, NULL); + } + + /* + * Clear the nowritecache bit, so that on a vdev_reopen() we will + * try again. + */ + vd->vdev_nowritecache = B_FALSE; + + return (0); +} + +static void +vdev_disk_close(vdev_t *vd) +{ + vdev_disk_t *dvd = vd->vdev_tsd; + + if (vd->vdev_reopening || dvd == NULL) + return; + + if (dvd->vd_minor != NULL) { + ddi_devid_str_free(dvd->vd_minor); + dvd->vd_minor = NULL; + } + + if (dvd->vd_devid != NULL) { + ddi_devid_free(dvd->vd_devid); + dvd->vd_devid = NULL; + } + + if (dvd->vd_lh != NULL) { + (void) ldi_close(dvd->vd_lh, spa_mode(vd->vdev_spa), kcred); + dvd->vd_lh = NULL; + } + + vd->vdev_delayed_close = B_FALSE; + /* + * If we closed the LDI handle due to an offline notify from LDI, + * don't free vd->vdev_tsd or unregister the callbacks here; + * the offline finalize callback or a reopen will take care of it. + */ + if (dvd->vd_ldi_offline) + return; + + vdev_disk_free(vd); +} + +int +vdev_disk_physio(vdev_t *vd, caddr_t data, + size_t size, uint64_t offset, int flags, boolean_t isdump) +{ + vdev_disk_t *dvd = vd->vdev_tsd; + + /* + * If the vdev is closed, it's likely in the REMOVED or FAULTED state. + * Nothing to be done here but return failure. + */ + if (dvd == NULL || (dvd->vd_ldi_offline && dvd->vd_lh == NULL)) + return (EIO); + + ASSERT(vd->vdev_ops == &vdev_disk_ops); + + /* + * If in the context of an active crash dump, use the ldi_dump(9F) + * call instead of ldi_strategy(9F) as usual. 
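Earlier in this hunk, owning the whole disk lets max_psize grow to the reported capacity (in logical blocks) times the logical block size. A worked example with made-up values:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	/* Hypothetical 512-byte-sector disk reporting ~976M blocks. */
	uint64_t capacity = 976773167;
	uint64_t blksz = 512;
	uint64_t max_psize = capacity * blksz;

	printf("max_psize = %llu bytes (~%llu GiB)\n",
	    (unsigned long long)max_psize,
	    (unsigned long long)(max_psize >> 30));	/* ~465 GiB */
	return (0);
}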
+	 */
+	if (isdump) {
+		ASSERT3P(dvd, !=, NULL);
+		return (ldi_dump(dvd->vd_lh, data, lbtodb(offset),
+		    lbtodb(size)));
+	}
+
+	return (vdev_disk_ldi_physio(dvd->vd_lh, data, size, offset, flags));
+}
+
+int
+vdev_disk_ldi_physio(ldi_handle_t vd_lh, caddr_t data,
+    size_t size, uint64_t offset, int flags)
+{
+	buf_t *bp;
+	int error = 0;
+
+	if (vd_lh == NULL)
+		return (SET_ERROR(EINVAL));
+
+	ASSERT(flags & B_READ || flags & B_WRITE);
+
+	bp = getrbuf(KM_SLEEP);
+	bp->b_flags = flags | B_BUSY | B_NOCACHE | B_FAILFAST;
+	bp->b_bcount = size;
+	bp->b_un.b_addr = (void *)data;
+	bp->b_lblkno = lbtodb(offset);
+	bp->b_bufsize = size;
+
+	error = ldi_strategy(vd_lh, bp);
+	ASSERT(error == 0);
+	if ((error = biowait(bp)) == 0 && bp->b_resid != 0)
+		error = SET_ERROR(EIO);
+	freerbuf(bp);
+
+	return (error);
+}
+
+static void
+vdev_disk_io_intr(buf_t *bp)
+{
+	vdev_buf_t *vb = (vdev_buf_t *)bp;
+	zio_t *zio = vb->vb_io;
+
+	/*
+	 * The rest of the zio stack only deals with EIO, ECKSUM, and ENXIO.
+	 * Rather than teach the rest of the stack about other error
+	 * possibilities (EFAULT, etc), we normalize the error value here.
+	 */
+	zio->io_error = (geterror(bp) != 0 ? SET_ERROR(EIO) : 0);
+
+	if (zio->io_error == 0 && bp->b_resid != 0)
+		zio->io_error = SET_ERROR(EIO);
+
+	if (zio->io_type == ZIO_TYPE_READ) {
+		abd_return_buf_copy(zio->io_abd, bp->b_un.b_addr, zio->io_size);
+	} else {
+		abd_return_buf(zio->io_abd, bp->b_un.b_addr, zio->io_size);
+	}
+
+	kmem_free(vb, sizeof (vdev_buf_t));
+
+	zio_delay_interrupt(zio);
+}
+
+static void
+vdev_disk_ioctl_free(zio_t *zio)
+{
+	kmem_free(zio->io_vsd, sizeof (struct dk_callback));
+}
+
+static const zio_vsd_ops_t vdev_disk_vsd_ops = {
+	vdev_disk_ioctl_free,
+	zio_vsd_default_cksum_report
+};
+
+static void
+vdev_disk_ioctl_done(void *zio_arg, int error)
+{
+	zio_t *zio = zio_arg;
+
+	zio->io_error = error;
+
+	zio_interrupt(zio);
+}
+
+static void
+vdev_disk_io_start(zio_t *zio)
+{
+	vdev_t *vd = zio->io_vd;
+	vdev_disk_t *dvd = vd->vdev_tsd;
+	vdev_buf_t *vb;
+	struct dk_callback *dkc;
+	buf_t *bp;
+	int error;
+
+	/*
+	 * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
+	 * Nothing to be done here but return failure.
+	 */
+	if (dvd == NULL || (dvd->vd_ldi_offline && dvd->vd_lh == NULL)) {
+		zio->io_error = SET_ERROR(ENXIO);
+		zio_interrupt(zio);
+		return;
+	}
+
+	if (zio->io_type == ZIO_TYPE_IOCTL) {
+		/* XXPOLICY */
+		if (!vdev_readable(vd)) {
+			zio->io_error = SET_ERROR(ENXIO);
+			zio_interrupt(zio);
+			return;
+		}
+
+		switch (zio->io_cmd) {
+
+		case DKIOCFLUSHWRITECACHE:
+
+			if (zfs_nocacheflush)
+				break;
+
+			if (vd->vdev_nowritecache) {
+				zio->io_error = SET_ERROR(ENOTSUP);
+				break;
+			}
+
+			zio->io_vsd = dkc = kmem_alloc(sizeof (*dkc), KM_SLEEP);
+			zio->io_vsd_ops = &vdev_disk_vsd_ops;
+
+			dkc->dkc_callback = vdev_disk_ioctl_done;
+			dkc->dkc_flag = FLUSH_VOLATILE;
+			dkc->dkc_cookie = zio;
+
+			error = ldi_ioctl(dvd->vd_lh, zio->io_cmd,
+			    (uintptr_t)dkc, FKIOCTL, kcred, NULL);
+
+			if (error == 0) {
+				/*
+				 * The ioctl will be done asynchronously,
+				 * and will call vdev_disk_ioctl_done()
+				 * upon completion.
+ */ + return; + } + + zio->io_error = error; + + break; + + default: + zio->io_error = SET_ERROR(ENOTSUP); + } + + zio_execute(zio); + return; + } + + ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); + zio->io_target_timestamp = zio_handle_io_delay(zio); + + vb = kmem_alloc(sizeof (vdev_buf_t), KM_SLEEP); + + vb->vb_io = zio; + bp = &vb->vb_buf; + + bioinit(bp); + bp->b_flags = B_BUSY | B_NOCACHE | + (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE); + if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD))) + bp->b_flags |= B_FAILFAST; + bp->b_bcount = zio->io_size; + + if (zio->io_type == ZIO_TYPE_READ) { + bp->b_un.b_addr = + abd_borrow_buf(zio->io_abd, zio->io_size); + } else { + bp->b_un.b_addr = + abd_borrow_buf_copy(zio->io_abd, zio->io_size); + } + + bp->b_lblkno = lbtodb(zio->io_offset); + bp->b_bufsize = zio->io_size; + bp->b_iodone = (int (*)())vdev_disk_io_intr; + + /* ldi_strategy() will return non-zero only on programming errors */ + VERIFY(ldi_strategy(dvd->vd_lh, bp) == 0); +} + +static void +vdev_disk_io_done(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + + /* + * If the device returned EIO, then attempt a DKIOCSTATE ioctl to see if + * the device has been removed. If this is the case, then we trigger an + * asynchronous removal of the device. Otherwise, probe the device and + * make sure it's still accessible. + */ + if (zio->io_error == EIO && !vd->vdev_remove_wanted) { + vdev_disk_t *dvd = vd->vdev_tsd; + int state = DKIO_NONE; + + if (ldi_ioctl(dvd->vd_lh, DKIOCSTATE, (intptr_t)&state, + FKIOCTL, kcred, NULL) == 0 && state != DKIO_INSERTED) { + /* + * We post the resource as soon as possible, instead of + * when the async removal actually happens, because the + * DE is using this information to discard previous I/O + * errors. + */ + zfs_post_remove(zio->io_spa, vd); + vd->vdev_remove_wanted = B_TRUE; + spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE); + } else if (!vd->vdev_delayed_close) { + vd->vdev_delayed_close = B_TRUE; + } + } +} + +vdev_ops_t vdev_disk_ops = { + vdev_disk_open, + vdev_disk_close, + vdev_default_asize, + vdev_disk_io_start, + vdev_disk_io_done, + NULL, + NULL, + vdev_disk_hold, + vdev_disk_rele, + NULL, + vdev_default_xlate, + VDEV_TYPE_DISK, /* name of this vdev type */ + B_TRUE /* leaf vdev */ +}; + +/* + * Given the root disk device devid or pathname, read the label from + * the device, and construct a configuration nvlist. + */ +int +vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config) +{ + ldi_handle_t vd_lh; + vdev_label_t *label; + uint64_t s, size; + int l; + ddi_devid_t tmpdevid; + int error = -1; + char *minor_name; + + /* + * Read the device label and build the nvlist. 
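The loop below probes each of the four on-disk labels. A sketch of the placement it walks, assuming the standard layout of four 256KB labels, two at the front and two at the back of the label-aligned device size:

#include <stdint.h>
#include <stdio.h>

#define	TOY_VDEV_LABELS	4
#define	TOY_LABEL_SIZE	(256ULL * 1024)	/* assumed sizeof (vdev_label_t) */

/* Assumed equivalent of vdev_label_offset(psize, l, 0). */
static uint64_t
label_offset(uint64_t psize, int l)
{
	return (l * TOY_LABEL_SIZE + (l < TOY_VDEV_LABELS / 2 ?
	    0 : psize - TOY_VDEV_LABELS * TOY_LABEL_SIZE));
}

int
main(void)
{
	uint64_t psize = 10ULL << 30;	/* hypothetical 10GB, label-aligned */

	/* Prints L0/L1 at the front of the device, L2/L3 at the back. */
	for (int l = 0; l < TOY_VDEV_LABELS; l++)
		printf("L%d at %llu\n", l,
		    (unsigned long long)label_offset(psize, l));
	return (0);
}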
+ */ + if (devid != NULL && ddi_devid_str_decode(devid, &tmpdevid, + &minor_name) == 0) { + error = ldi_open_by_devid(tmpdevid, minor_name, + FREAD, kcred, &vd_lh, zfs_li); + ddi_devid_free(tmpdevid); + ddi_devid_str_free(minor_name); + } + + if (error && (error = ldi_open_by_name(devpath, FREAD, kcred, &vd_lh, + zfs_li))) + return (error); + + if (ldi_get_size(vd_lh, &s)) { + (void) ldi_close(vd_lh, FREAD, kcred); + return (SET_ERROR(EIO)); + } + + size = P2ALIGN_TYPED(s, sizeof (vdev_label_t), uint64_t); + label = kmem_alloc(sizeof (vdev_label_t), KM_SLEEP); + + *config = NULL; + for (l = 0; l < VDEV_LABELS; l++) { + uint64_t offset, state, txg = 0; + + /* read vdev label */ + offset = vdev_label_offset(size, l, 0); + if (vdev_disk_ldi_physio(vd_lh, (caddr_t)label, + VDEV_SKIP_SIZE + VDEV_PHYS_SIZE, offset, B_READ) != 0) + continue; + + if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist, + sizeof (label->vl_vdev_phys.vp_nvlist), config, 0) != 0) { + *config = NULL; + continue; + } + + if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE, + &state) != 0 || state >= POOL_STATE_DESTROYED) { + nvlist_free(*config); + *config = NULL; + continue; + } + + if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG, + &txg) != 0 || txg == 0) { + nvlist_free(*config); + *config = NULL; + continue; + } + + break; + } + + kmem_free(label, sizeof (vdev_label_t)); + (void) ldi_close(vd_lh, FREAD, kcred); + if (*config == NULL) + error = SET_ERROR(EIDRM); + + return (error); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c new file mode 100644 index 000000000000..6cc5343b5e69 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c @@ -0,0 +1,303 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2016 by Delphix. All rights reserved. + */ + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/vdev_file.h> +#include <sys/vdev_impl.h> +#include <sys/zio.h> +#include <sys/fs/zfs.h> +#include <sys/fm/fs/zfs.h> +#include <sys/abd.h> + +/* + * Virtual device vector for files. 
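These files all follow the same "virtual device vector" pattern: per-type behavior is selected through a table of function pointers (vdev_ops_t) rather than type switches. A toy illustration with invented names:

#include <stdio.h>

/* Toy version of the vdev_ops_t pattern; all names are invented. */
typedef struct toy_ops {
	int	(*op_open)(const char *path);
	void	(*op_close)(void);
	const char *op_type;
} toy_ops_t;

static int  toy_file_open(const char *path) { printf("open %s\n", path); return (0); }
static void toy_file_close(void) { printf("close\n"); }

static const toy_ops_t toy_file_ops = {
	.op_open = toy_file_open,
	.op_close = toy_file_close,
	.op_type = "file",
};

int
main(void)
{
	const toy_ops_t *ops = &toy_file_ops;	/* chosen by vdev type */

	(void) ops->op_open("/tank/vdev0");	/* hypothetical path */
	ops->op_close();
	return (0);
}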
+ */ + +static taskq_t *vdev_file_taskq; + +void +vdev_file_init(void) +{ + vdev_file_taskq = taskq_create("z_vdev_file", MAX(max_ncpus, 16), + minclsyspri, max_ncpus, INT_MAX, 0); +} + +void +vdev_file_fini(void) +{ + taskq_destroy(vdev_file_taskq); +} + +static void +vdev_file_hold(vdev_t *vd) +{ + ASSERT(vd->vdev_path != NULL); +} + +static void +vdev_file_rele(vdev_t *vd) +{ + ASSERT(vd->vdev_path != NULL); +} + +static int +vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, + uint64_t *logical_ashift, uint64_t *physical_ashift) +{ + vdev_file_t *vf; + vnode_t *vp; + vattr_t vattr; + int error; + + /* Rotational optimizations only make sense on block devices */ + vd->vdev_nonrot = B_TRUE; + + /* + * We must have a pathname, and it must be absolute. + */ + if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') { + vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; + return (SET_ERROR(EINVAL)); + } + + /* + * Reopen the device if it's not currently open. Otherwise, + * just update the physical size of the device. + */ + if (vd->vdev_tsd != NULL) { + ASSERT(vd->vdev_reopening); + vf = vd->vdev_tsd; + vp = vf->vf_vnode; + goto skip_open; + } + + vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP); + + /* + * We always open the files from the root of the global zone, even if + * we're in a local zone. If the user has gotten to this point, the + * administrator has already decided that the pool should be available + * to local zone users, so the underlying devices should be as well. + */ + ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/'); + error = vn_openat(vd->vdev_path + 1, UIO_SYSSPACE, + spa_mode(vd->vdev_spa) | FOFFMAX, 0, &vp, 0, 0, rootdir, -1); + + if (error) { + vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; + kmem_free(vd->vdev_tsd, sizeof (vdev_file_t)); + vd->vdev_tsd = NULL; + return (error); + } + + vf->vf_vnode = vp; + +#ifdef _KERNEL + /* + * Make sure it's a regular file. + */ + if (vp->v_type != VREG) { +#ifdef __FreeBSD__ + (void) VOP_CLOSE(vp, spa_mode(vd->vdev_spa), 1, 0, kcred, NULL); +#endif + vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; +#ifdef __FreeBSD__ + kmem_free(vd->vdev_tsd, sizeof (vdev_file_t)); + vd->vdev_tsd = NULL; +#endif + return (SET_ERROR(ENODEV)); + } +#endif /* _KERNEL */ + +skip_open: + /* + * Determine the physical size of the file. + */ + vattr.va_mask = AT_SIZE; + vn_lock(vp, LK_SHARED | LK_RETRY); + error = VOP_GETATTR(vp, &vattr, kcred); + VOP_UNLOCK(vp, 0); + if (error) { + (void) VOP_CLOSE(vp, spa_mode(vd->vdev_spa), 1, 0, kcred, NULL); + vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; + kmem_free(vd->vdev_tsd, sizeof (vdev_file_t)); + vd->vdev_tsd = NULL; + return (error); + } + + vd->vdev_notrim = B_TRUE; + + *max_psize = *psize = vattr.va_size; + *logical_ashift = SPA_MINBLOCKSHIFT; + *physical_ashift = SPA_MINBLOCKSHIFT; + + return (0); +} + +static void +vdev_file_close(vdev_t *vd) +{ + vdev_file_t *vf = vd->vdev_tsd; + + if (vd->vdev_reopening || vf == NULL) + return; + + if (vf->vf_vnode != NULL) { + (void) VOP_CLOSE(vf->vf_vnode, spa_mode(vd->vdev_spa), 1, 0, + kcred, NULL); + } + + vd->vdev_delayed_close = B_FALSE; + kmem_free(vf, sizeof (vdev_file_t)); + vd->vdev_tsd = NULL; +} + +/* + * Implements the interrupt side for file vdev types. This routine will be + * called when the I/O completes allowing us to transfer the I/O to the + * interrupt taskqs. For consistency, the code structure mimics disk vdev + * types. 
+ */ +static void +vdev_file_io_intr(zio_t *zio) +{ + zio_delay_interrupt(zio); +} + +static void +vdev_file_io_strategy(void *arg) +{ + zio_t *zio = arg; + vdev_t *vd = zio->io_vd; + vdev_file_t *vf; + vnode_t *vp; + void *addr; + ssize_t resid; + + vf = vd->vdev_tsd; + vp = vf->vf_vnode; + + ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); + if (zio->io_type == ZIO_TYPE_READ) { + addr = abd_borrow_buf(zio->io_abd, zio->io_size); + } else { + addr = abd_borrow_buf_copy(zio->io_abd, zio->io_size); + } + + zio->io_error = vn_rdwr(zio->io_type == ZIO_TYPE_READ ? + UIO_READ : UIO_WRITE, vp, addr, zio->io_size, + zio->io_offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid); + + if (zio->io_type == ZIO_TYPE_READ) { + abd_return_buf_copy(zio->io_abd, addr, zio->io_size); + } else { + abd_return_buf(zio->io_abd, addr, zio->io_size); + } + + if (resid != 0 && zio->io_error == 0) + zio->io_error = ENOSPC; + + vdev_file_io_intr(zio); +} + +static void +vdev_file_io_start(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + vdev_file_t *vf = vd->vdev_tsd; + + if (zio->io_type == ZIO_TYPE_IOCTL) { + /* XXPOLICY */ + if (!vdev_readable(vd)) { + zio->io_error = SET_ERROR(ENXIO); + zio_interrupt(zio); + return; + } + + switch (zio->io_cmd) { + case DKIOCFLUSHWRITECACHE: + zio->io_error = VOP_FSYNC(vf->vf_vnode, FSYNC | FDSYNC, + kcred, NULL); + break; + default: + zio->io_error = SET_ERROR(ENOTSUP); + } + + zio_execute(zio); + return; + } + + ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); + zio->io_target_timestamp = zio_handle_io_delay(zio); + + VERIFY3U(taskq_dispatch(vdev_file_taskq, vdev_file_io_strategy, zio, + TQ_SLEEP), !=, 0); +} + +/* ARGSUSED */ +static void +vdev_file_io_done(zio_t *zio) +{ +} + +vdev_ops_t vdev_file_ops = { + vdev_file_open, + vdev_file_close, + vdev_default_asize, + vdev_file_io_start, + vdev_file_io_done, + NULL, + NULL, + vdev_file_hold, + vdev_file_rele, + NULL, + vdev_default_xlate, + VDEV_TYPE_FILE, /* name of this vdev type */ + B_TRUE /* leaf vdev */ +}; + +/* + * From userland we access disks just like files. + */ +#ifndef _KERNEL + +vdev_ops_t vdev_disk_ops = { + vdev_file_open, + vdev_file_close, + vdev_default_asize, + vdev_file_io_start, + vdev_file_io_done, + NULL, + NULL, + vdev_file_hold, + vdev_file_rele, + NULL, + vdev_default_xlate, + VDEV_TYPE_DISK, /* name of this vdev type */ + B_TRUE /* leaf vdev */ +}; + +#endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c new file mode 100644 index 000000000000..0631007b4a4e --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c @@ -0,0 +1,1189 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2006 Pawel Jakub Dawidek <pjd@FreeBSD.org> + * All rights reserved. + * + * Portions Copyright (c) 2012 Martin Matuska <mm@FreeBSD.org> + */ + +#include <sys/zfs_context.h> +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/bio.h> +#include <sys/disk.h> +#include <sys/spa.h> +#include <sys/spa_impl.h> +#include <sys/vdev_impl.h> +#include <sys/fs/zfs.h> +#include <sys/zio.h> +#include <geom/geom.h> +#include <geom/geom_int.h> + +/* + * Virtual device vector for GEOM. + */ + +static g_attrchanged_t vdev_geom_attrchanged; +struct g_class zfs_vdev_class = { + .name = "ZFS::VDEV", + .version = G_VERSION, + .attrchanged = vdev_geom_attrchanged, +}; + +struct consumer_vdev_elem { + SLIST_ENTRY(consumer_vdev_elem) elems; + vdev_t *vd; +}; + +SLIST_HEAD(consumer_priv_t, consumer_vdev_elem); +_Static_assert(sizeof(((struct g_consumer*)NULL)->private) + == sizeof(struct consumer_priv_t*), + "consumer_priv_t* can't be stored in g_consumer.private"); + +DECLARE_GEOM_CLASS(zfs_vdev_class, zfs_vdev); + +SYSCTL_DECL(_vfs_zfs_vdev); +/* Don't send BIO_FLUSH. */ +static int vdev_geom_bio_flush_disable; +SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_flush_disable, CTLFLAG_RWTUN, + &vdev_geom_bio_flush_disable, 0, "Disable BIO_FLUSH"); +/* Don't send BIO_DELETE. */ +static int vdev_geom_bio_delete_disable; +SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_delete_disable, CTLFLAG_RWTUN, + &vdev_geom_bio_delete_disable, 0, "Disable BIO_DELETE"); + +/* Declare local functions */ +static void vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read); + +/* + * Thread local storage used to indicate when a thread is probing geoms + * for their guids. If NULL, this thread is not tasting geoms. If non NULL, + * it is looking for a replacement for the vdev_t* that is its value. + */ +uint_t zfs_geom_probe_vdev_key; + +static void +vdev_geom_set_rotation_rate(vdev_t *vd, struct g_consumer *cp) +{ + int error; + uint16_t rate; + + error = g_getattr("GEOM::rotation_rate", cp, &rate); + if (error == 0 && rate == 1) + vd->vdev_nonrot = B_TRUE; + else + vd->vdev_nonrot = B_FALSE; +} + +static void +vdev_geom_set_physpath(vdev_t *vd, struct g_consumer *cp, + boolean_t do_null_update) +{ + boolean_t needs_update = B_FALSE; + char *physpath; + int error, physpath_len; + + physpath_len = MAXPATHLEN; + physpath = g_malloc(physpath_len, M_WAITOK|M_ZERO); + error = g_io_getattr("GEOM::physpath", cp, &physpath_len, physpath); + if (error == 0) { + char *old_physpath; + + /* g_topology lock ensures that vdev has not been closed */ + g_topology_assert(); + old_physpath = vd->vdev_physpath; + vd->vdev_physpath = spa_strdup(physpath); + + if (old_physpath != NULL) { + needs_update = (strcmp(old_physpath, + vd->vdev_physpath) != 0); + spa_strfree(old_physpath); + } else + needs_update = do_null_update; + } + g_free(physpath); + + /* + * If the physical path changed, update the config. + * Only request an update for previously unset physpaths if + * requested by the caller. 
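The update rule just described, restated as a standalone predicate with a small demonstration (the paths are made up):

#include <stdio.h>
#include <string.h>

/*
 * A config update is requested when an existing physpath changed, or
 * when a physpath was set for the first time and the caller asked for
 * null updates too.
 */
static int
physpath_needs_update(const char *oldp, const char *newp, int do_null_update)
{
	if (oldp != NULL)
		return (strcmp(oldp, newp) != 0);
	return (do_null_update);
}

int
main(void)
{
	printf("%d\n", physpath_needs_update("id1/phys@a", "id1/phys@b", 0)); /* 1 */
	printf("%d\n", physpath_needs_update(NULL, "id1/phys@a", 0));	      /* 0 */
	printf("%d\n", physpath_needs_update(NULL, "id1/phys@a", 1));	      /* 1 */
	return (0);
}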
+ */ + if (needs_update) + spa_async_request(vd->vdev_spa, SPA_ASYNC_CONFIG_UPDATE); + +} + +static void +vdev_geom_attrchanged(struct g_consumer *cp, const char *attr) +{ + char *old_physpath; + struct consumer_priv_t *priv; + struct consumer_vdev_elem *elem; + int error; + + priv = (struct consumer_priv_t*)&cp->private; + if (SLIST_EMPTY(priv)) + return; + + SLIST_FOREACH(elem, priv, elems) { + vdev_t *vd = elem->vd; + if (strcmp(attr, "GEOM::rotation_rate") == 0) { + vdev_geom_set_rotation_rate(vd, cp); + return; + } + if (strcmp(attr, "GEOM::physpath") == 0) { + vdev_geom_set_physpath(vd, cp, /*null_update*/B_TRUE); + return; + } + } +} + +static void +vdev_geom_resize(struct g_consumer *cp) +{ + struct consumer_priv_t *priv; + struct consumer_vdev_elem *elem; + spa_t *spa; + vdev_t *vd; + + priv = (struct consumer_priv_t *)&cp->private; + if (SLIST_EMPTY(priv)) + return; + + SLIST_FOREACH(elem, priv, elems) { + vd = elem->vd; + if (vd->vdev_state != VDEV_STATE_HEALTHY) + continue; + spa = vd->vdev_spa; + if (!spa->spa_autoexpand) + continue; + vdev_online(spa, vd->vdev_guid, ZFS_ONLINE_EXPAND, NULL); + } +} + +static void +vdev_geom_orphan(struct g_consumer *cp) +{ + struct consumer_priv_t *priv; + struct consumer_vdev_elem *elem; + + g_topology_assert(); + + priv = (struct consumer_priv_t*)&cp->private; + if (SLIST_EMPTY(priv)) + /* Vdev close in progress. Ignore the event. */ + return; + + /* + * Orphan callbacks occur from the GEOM event thread. + * Concurrent with this call, new I/O requests may be + * working their way through GEOM about to find out + * (only once executed by the g_down thread) that we've + * been orphaned from our disk provider. These I/Os + * must be retired before we can detach our consumer. + * This is most easily achieved by acquiring the + * SPA ZIO configuration lock as a writer, but doing + * so with the GEOM topology lock held would cause + * a lock order reversal. Instead, rely on the SPA's + * async removal support to invoke a close on this + * vdev once it is safe to do so. + */ + SLIST_FOREACH(elem, priv, elems) { + vdev_t *vd = elem->vd; + + vd->vdev_remove_wanted = B_TRUE; + spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE); + } +} + +static struct g_consumer * +vdev_geom_attach(struct g_provider *pp, vdev_t *vd, boolean_t sanity) +{ + struct g_geom *gp; + struct g_consumer *cp; + int error; + + g_topology_assert(); + + ZFS_LOG(1, "Attaching to %s.", pp->name); + + if (sanity) { + if (pp->sectorsize > VDEV_PAD_SIZE || !ISP2(pp->sectorsize)) { + ZFS_LOG(1, "Failing attach of %s. " + "Incompatible sectorsize %d\n", + pp->name, pp->sectorsize); + return (NULL); + } else if (pp->mediasize < SPA_MINDEVSIZE) { + ZFS_LOG(1, "Failing attach of %s. " + "Incompatible mediasize %ju\n", + pp->name, pp->mediasize); + return (NULL); + } + } + + /* Do we have geom already? No? Create one. 
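vdev_geom_attach()'s sanity check rejects sector sizes that are not powers of two via ISP2(). The test written out, using the usual bit trick (x & (x - 1) clears the lowest set bit, so the result is zero exactly for powers of two):

#include <assert.h>
#include <stdint.h>

/* Assumed equivalent of the ISP2() check used in the sanity test above. */
static int
is_p2(uint64_t x)
{
	return (x != 0 && (x & (x - 1)) == 0);
}

int
main(void)
{
	assert(is_p2(512) && is_p2(4096));
	assert(!is_p2(520));	/* e.g. 520-byte DIF sectors are rejected */
	return (0);
}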
*/ + LIST_FOREACH(gp, &zfs_vdev_class.geom, geom) { + if (gp->flags & G_GEOM_WITHER) + continue; + if (strcmp(gp->name, "zfs::vdev") != 0) + continue; + break; + } + if (gp == NULL) { + gp = g_new_geomf(&zfs_vdev_class, "zfs::vdev"); + gp->orphan = vdev_geom_orphan; + gp->attrchanged = vdev_geom_attrchanged; + gp->resize = vdev_geom_resize; + cp = g_new_consumer(gp); + error = g_attach(cp, pp); + if (error != 0) { + ZFS_LOG(1, "%s(%d): g_attach failed: %d\n", __func__, + __LINE__, error); + vdev_geom_detach(cp, B_FALSE); + return (NULL); + } + error = g_access(cp, 1, 0, 1); + if (error != 0) { + ZFS_LOG(1, "%s(%d): g_access failed: %d\n", __func__, + __LINE__, error); + vdev_geom_detach(cp, B_FALSE); + return (NULL); + } + ZFS_LOG(1, "Created geom and consumer for %s.", pp->name); + } else { + /* Check if we are already connected to this provider. */ + LIST_FOREACH(cp, &gp->consumer, consumer) { + if (cp->provider == pp) { + ZFS_LOG(1, "Found consumer for %s.", pp->name); + break; + } + } + if (cp == NULL) { + cp = g_new_consumer(gp); + error = g_attach(cp, pp); + if (error != 0) { + ZFS_LOG(1, "%s(%d): g_attach failed: %d\n", + __func__, __LINE__, error); + vdev_geom_detach(cp, B_FALSE); + return (NULL); + } + error = g_access(cp, 1, 0, 1); + if (error != 0) { + ZFS_LOG(1, "%s(%d): g_access failed: %d\n", + __func__, __LINE__, error); + vdev_geom_detach(cp, B_FALSE); + return (NULL); + } + ZFS_LOG(1, "Created consumer for %s.", pp->name); + } else { + error = g_access(cp, 1, 0, 1); + if (error != 0) { + ZFS_LOG(1, "%s(%d): g_access failed: %d\n", + __func__, __LINE__, error); + return (NULL); + } + ZFS_LOG(1, "Used existing consumer for %s.", pp->name); + } + } + + if (vd != NULL) + vd->vdev_tsd = cp; + + cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; + return (cp); +} + +static void +vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read) +{ + struct g_geom *gp; + + g_topology_assert(); + + ZFS_LOG(1, "Detaching from %s.", + cp->provider && cp->provider->name ? cp->provider->name : "NULL"); + + gp = cp->geom; + if (open_for_read) + g_access(cp, -1, 0, -1); + /* Destroy consumer on last close. */ + if (cp->acr == 0 && cp->ace == 0) { + if (cp->acw > 0) + g_access(cp, 0, -cp->acw, 0); + if (cp->provider != NULL) { + ZFS_LOG(1, "Destroying consumer for %s.", + cp->provider->name ? cp->provider->name : "NULL"); + g_detach(cp); + } + g_destroy_consumer(cp); + } + /* Destroy geom if there are no consumers left. */ + if (LIST_EMPTY(&gp->consumer)) { + ZFS_LOG(1, "Destroyed geom %s.", gp->name); + g_wither_geom(gp, ENXIO); + } +} + +static void +vdev_geom_close_locked(vdev_t *vd) +{ + struct g_consumer *cp; + struct consumer_priv_t *priv; + struct consumer_vdev_elem *elem, *elem_temp; + + g_topology_assert(); + + cp = vd->vdev_tsd; + vd->vdev_delayed_close = B_FALSE; + if (cp == NULL) + return; + + ZFS_LOG(1, "Closing access to %s.", cp->provider->name); + KASSERT(cp->private != NULL, ("%s: cp->private is NULL", __func__)); + priv = (struct consumer_priv_t*)&cp->private; + vd->vdev_tsd = NULL; + SLIST_FOREACH_SAFE(elem, priv, elems, elem_temp) { + if (elem->vd == vd) { + SLIST_REMOVE(priv, elem, consumer_vdev_elem, elems); + g_free(elem); + } + } + + vdev_geom_detach(cp, B_TRUE); +} + +/* + * Issue one or more bios to the vdev in parallel + * cmds, datas, offsets, errors, and sizes are arrays of length ncmds. Each IO + * operation is described by parallel entries from each array. 
There may be + * more bios actually issued than entries in the array + */ +static void +vdev_geom_io(struct g_consumer *cp, int *cmds, void **datas, off_t *offsets, + off_t *sizes, int *errors, int ncmds) +{ + struct bio **bios; + u_char *p; + off_t off, maxio, s, end; + int i, n_bios, j; + size_t bios_size; + + maxio = MAXPHYS - (MAXPHYS % cp->provider->sectorsize); + n_bios = 0; + + /* How many bios are required for all commands ? */ + for (i = 0; i < ncmds; i++) + n_bios += (sizes[i] + maxio - 1) / maxio; + + /* Allocate memory for the bios */ + bios_size = n_bios * sizeof(struct bio*); + bios = kmem_zalloc(bios_size, KM_SLEEP); + + /* Prepare and issue all of the bios */ + for (i = j = 0; i < ncmds; i++) { + off = offsets[i]; + p = datas[i]; + s = sizes[i]; + end = off + s; + ASSERT((off % cp->provider->sectorsize) == 0); + ASSERT((s % cp->provider->sectorsize) == 0); + + for (; off < end; off += maxio, p += maxio, s -= maxio, j++) { + bios[j] = g_alloc_bio(); + bios[j]->bio_cmd = cmds[i]; + bios[j]->bio_done = NULL; + bios[j]->bio_offset = off; + bios[j]->bio_length = MIN(s, maxio); + bios[j]->bio_data = p; + g_io_request(bios[j], cp); + } + } + ASSERT(j == n_bios); + + /* Wait for all of the bios to complete, and clean them up */ + for (i = j = 0; i < ncmds; i++) { + off = offsets[i]; + s = sizes[i]; + end = off + s; + + for (; off < end; off += maxio, s -= maxio, j++) { + errors[i] = biowait(bios[j], "vdev_geom_io") || errors[i]; + g_destroy_bio(bios[j]); + } + } + kmem_free(bios, bios_size); +} + +/* + * Read the vdev config from a device. Return the number of valid labels that + * were found. The vdev config will be returned in config if and only if at + * least one valid label was found. + */ +static int +vdev_geom_read_config(struct g_consumer *cp, nvlist_t **configp) +{ + struct g_provider *pp; + nvlist_t *config; + vdev_phys_t *vdev_lists[VDEV_LABELS]; + char *buf; + size_t buflen; + uint64_t psize, state, txg; + off_t offsets[VDEV_LABELS]; + off_t size; + off_t sizes[VDEV_LABELS]; + int cmds[VDEV_LABELS]; + int errors[VDEV_LABELS]; + int l, nlabels; + + g_topology_assert_not(); + + pp = cp->provider; + ZFS_LOG(1, "Reading config from %s...", pp->name); + + psize = pp->mediasize; + psize = P2ALIGN(psize, (uint64_t)sizeof(vdev_label_t)); + + size = sizeof(*vdev_lists[0]) + pp->sectorsize - + ((sizeof(*vdev_lists[0]) - 1) % pp->sectorsize) - 1; + + buflen = sizeof(vdev_lists[0]->vp_nvlist); + + /* Create all of the IO requests */ + for (l = 0; l < VDEV_LABELS; l++) { + cmds[l] = BIO_READ; + vdev_lists[l] = kmem_alloc(size, KM_SLEEP); + offsets[l] = vdev_label_offset(psize, l, 0) + VDEV_SKIP_SIZE; + sizes[l] = size; + errors[l] = 0; + ASSERT(offsets[l] % pp->sectorsize == 0); + } + + /* Issue the IO requests */ + vdev_geom_io(cp, cmds, (void**)vdev_lists, offsets, sizes, errors, + VDEV_LABELS); + + /* Parse the labels */ + config = *configp = NULL; + nlabels = 0; + for (l = 0; l < VDEV_LABELS; l++) { + if (errors[l] != 0) + continue; + + buf = vdev_lists[l]->vp_nvlist; + + if (nvlist_unpack(buf, buflen, &config, 0) != 0) + continue; + + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, + &state) != 0 || state > POOL_STATE_L2CACHE) { + nvlist_free(config); + continue; + } + + if (state != POOL_STATE_SPARE && + state != POOL_STATE_L2CACHE && + (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, + &txg) != 0 || txg == 0)) { + nvlist_free(config); + continue; + } + + if (*configp != NULL) + nvlist_free(*configp); + *configp = config; + + nlabels++; + } + + /* Free the label 
storage */ + for (l = 0; l < VDEV_LABELS; l++) + kmem_free(vdev_lists[l], size); + + return (nlabels); +} + +static void +resize_configs(nvlist_t ***configs, uint64_t *count, uint64_t id) +{ + nvlist_t **new_configs; + uint64_t i; + + if (id < *count) + return; + new_configs = kmem_zalloc((id + 1) * sizeof(nvlist_t *), + KM_SLEEP); + for (i = 0; i < *count; i++) + new_configs[i] = (*configs)[i]; + if (*configs != NULL) + kmem_free(*configs, *count * sizeof(void *)); + *configs = new_configs; + *count = id + 1; +} + +static void +process_vdev_config(nvlist_t ***configs, uint64_t *count, nvlist_t *cfg, + const char *name, uint64_t* known_pool_guid) +{ + nvlist_t *vdev_tree; + uint64_t pool_guid; + uint64_t vdev_guid, known_guid; + uint64_t id, txg, known_txg; + char *pname; + int i; + + if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &pname) != 0 || + strcmp(pname, name) != 0) + goto ignore; + + if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0) + goto ignore; + + if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_TOP_GUID, &vdev_guid) != 0) + goto ignore; + + if (nvlist_lookup_nvlist(cfg, ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0) + goto ignore; + + if (nvlist_lookup_uint64(vdev_tree, ZPOOL_CONFIG_ID, &id) != 0) + goto ignore; + + VERIFY(nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); + + if (*known_pool_guid != 0) { + if (pool_guid != *known_pool_guid) + goto ignore; + } else + *known_pool_guid = pool_guid; + + resize_configs(configs, count, id); + + if ((*configs)[id] != NULL) { + VERIFY(nvlist_lookup_uint64((*configs)[id], + ZPOOL_CONFIG_POOL_TXG, &known_txg) == 0); + if (txg <= known_txg) + goto ignore; + nvlist_free((*configs)[id]); + } + + (*configs)[id] = cfg; + return; + +ignore: + nvlist_free(cfg); +} + +int +vdev_geom_read_pool_label(const char *name, + nvlist_t ***configs, uint64_t *count) +{ + struct g_class *mp; + struct g_geom *gp; + struct g_provider *pp; + struct g_consumer *zcp; + nvlist_t *vdev_cfg; + uint64_t pool_guid; + int error, nlabels; + + DROP_GIANT(); + g_topology_lock(); + + *configs = NULL; + *count = 0; + pool_guid = 0; + LIST_FOREACH(mp, &g_classes, class) { + if (mp == &zfs_vdev_class) + continue; + LIST_FOREACH(gp, &mp->geom, geom) { + if (gp->flags & G_GEOM_WITHER) + continue; + LIST_FOREACH(pp, &gp->provider, provider) { + if (pp->flags & G_PF_WITHER) + continue; + zcp = vdev_geom_attach(pp, NULL, B_TRUE); + if (zcp == NULL) + continue; + g_topology_unlock(); + nlabels = vdev_geom_read_config(zcp, &vdev_cfg); + g_topology_lock(); + vdev_geom_detach(zcp, B_TRUE); + if (nlabels == 0) + continue; + ZFS_LOG(1, "successfully read vdev config"); + + process_vdev_config(configs, count, + vdev_cfg, name, &pool_guid); + } + } + } + g_topology_unlock(); + PICKUP_GIANT(); + + return (*count > 0 ? 
0 : ENOENT);
+}
+
+enum match {
+ NO_MATCH = 0, /* No matching labels found */
+ TOPGUID_MATCH = 1, /* Labels match top guid, not vdev guid */
+ ZERO_MATCH = 1, /* Should never be returned */
+ ONE_MATCH = 2, /* 1 label matching the vdev_guid */
+ TWO_MATCH = 3, /* 2 labels matching the vdev_guid */
+ THREE_MATCH = 4, /* 3 labels matching the vdev_guid */
+ FULL_MATCH = 5 /* all labels match the vdev_guid */
+};
+
+static enum match
+vdev_attach_ok(vdev_t *vd, struct g_provider *pp)
+{
+ nvlist_t *config;
+ uint64_t pool_guid, top_guid, vdev_guid;
+ struct g_consumer *cp;
+ int nlabels;
+
+ cp = vdev_geom_attach(pp, NULL, B_TRUE);
+ if (cp == NULL) {
+ ZFS_LOG(1, "Unable to attach tasting instance to %s.",
+ pp->name);
+ return (NO_MATCH);
+ }
+ g_topology_unlock();
+ nlabels = vdev_geom_read_config(cp, &config);
+ g_topology_lock();
+ vdev_geom_detach(cp, B_TRUE);
+ if (nlabels == 0) {
+ ZFS_LOG(1, "Unable to read config from %s.", pp->name);
+ return (NO_MATCH);
+ }
+
+ pool_guid = 0;
+ (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid);
+ top_guid = 0;
+ (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TOP_GUID, &top_guid);
+ vdev_guid = 0;
+ (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid);
+ nvlist_free(config);
+
+ /*
+ * Check that the label's pool guid matches the desired guid.
+ * Inactive spares and L2ARCs do not have any pool guid in the label.
+ */
+ if (pool_guid != 0 && pool_guid != spa_guid(vd->vdev_spa)) {
+ ZFS_LOG(1, "pool guid mismatch for provider %s: %ju != %ju.",
+ pp->name,
+ (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)pool_guid);
+ return (NO_MATCH);
+ }
+
+ /*
+ * Check that the label's vdev guid matches the desired guid.
+ * The second condition handles a possible race on vdev detach, when
+ * the remaining vdev receives the GUID of the destroyed top-level
+ * mirror vdev.
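+ *
+ * For illustration (hypothetical label counts): the ranking below
+ * folds the number of valid labels into the return value, so a
+ * provider whose four labels all carry the expected vdev guid
+ * returns ZERO_MATCH + 4 == FULL_MATCH, while one with a single
+ * readable matching label returns ONE_MATCH; this is what lets
+ * vdev_geom_attach_by_guids() prefer the healthier provider.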
+ */ + if (vdev_guid == vd->vdev_guid) { + ZFS_LOG(1, "guids match for provider %s.", pp->name); + return (ZERO_MATCH + nlabels); + } else if (top_guid == vd->vdev_guid && vd == vd->vdev_top) { + ZFS_LOG(1, "top vdev guid match for provider %s.", pp->name); + return (TOPGUID_MATCH); + } + ZFS_LOG(1, "vdev guid mismatch for provider %s: %ju != %ju.", + pp->name, (uintmax_t)vd->vdev_guid, (uintmax_t)vdev_guid); + return (NO_MATCH); +} + +static struct g_consumer * +vdev_geom_attach_by_guids(vdev_t *vd) +{ + struct g_class *mp; + struct g_geom *gp; + struct g_provider *pp, *best_pp; + struct g_consumer *cp; + const char *vdpath; + enum match match, best_match; + + g_topology_assert(); + + vdpath = vd->vdev_path + sizeof("/dev/") - 1; + cp = NULL; + best_pp = NULL; + best_match = NO_MATCH; + LIST_FOREACH(mp, &g_classes, class) { + if (mp == &zfs_vdev_class) + continue; + LIST_FOREACH(gp, &mp->geom, geom) { + if (gp->flags & G_GEOM_WITHER) + continue; + LIST_FOREACH(pp, &gp->provider, provider) { + match = vdev_attach_ok(vd, pp); + if (match > best_match) { + best_match = match; + best_pp = pp; + } else if (match == best_match) { + if (strcmp(pp->name, vdpath) == 0) { + best_pp = pp; + } + } + if (match == FULL_MATCH) + goto out; + } + } + } + +out: + if (best_pp) { + cp = vdev_geom_attach(best_pp, vd, B_TRUE); + if (cp == NULL) { + printf("ZFS WARNING: Unable to attach to %s.\n", + best_pp->name); + } + } + return (cp); +} + +static struct g_consumer * +vdev_geom_open_by_guids(vdev_t *vd) +{ + struct g_consumer *cp; + char *buf; + size_t len; + + g_topology_assert(); + + ZFS_LOG(1, "Searching by guids [%ju:%ju].", + (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)vd->vdev_guid); + cp = vdev_geom_attach_by_guids(vd); + if (cp != NULL) { + len = strlen(cp->provider->name) + strlen("/dev/") + 1; + buf = kmem_alloc(len, KM_SLEEP); + + snprintf(buf, len, "/dev/%s", cp->provider->name); + spa_strfree(vd->vdev_path); + vd->vdev_path = buf; + + ZFS_LOG(1, "Attach by guid [%ju:%ju] succeeded, provider %s.", + (uintmax_t)spa_guid(vd->vdev_spa), + (uintmax_t)vd->vdev_guid, cp->provider->name); + } else { + ZFS_LOG(1, "Search by guid [%ju:%ju] failed.", + (uintmax_t)spa_guid(vd->vdev_spa), + (uintmax_t)vd->vdev_guid); + } + + return (cp); +} + +static struct g_consumer * +vdev_geom_open_by_path(vdev_t *vd, int check_guid) +{ + struct g_provider *pp; + struct g_consumer *cp; + + g_topology_assert(); + + cp = NULL; + pp = g_provider_by_name(vd->vdev_path + sizeof("/dev/") - 1); + if (pp != NULL) { + ZFS_LOG(1, "Found provider by name %s.", vd->vdev_path); + if (!check_guid || vdev_attach_ok(vd, pp) == FULL_MATCH) + cp = vdev_geom_attach(pp, vd, B_FALSE); + } + + return (cp); +} + +static int +vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, + uint64_t *logical_ashift, uint64_t *physical_ashift) +{ + struct g_provider *pp; + struct g_consumer *cp; + size_t bufsize; + int error; + + /* Set the TLS to indicate downstack that we should not access zvols*/ + VERIFY(tsd_set(zfs_geom_probe_vdev_key, vd) == 0); + + /* + * We must have a pathname, and it must be absolute. + */ + if (vd->vdev_path == NULL || strncmp(vd->vdev_path, "/dev/", 5) != 0) { + vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; + return (EINVAL); + } + + /* + * Reopen the device if it's not currently open. Otherwise, + * just update the physical size of the device. 
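+ *
+ * (Aside, for illustration: the helpers above find providers by
+ * stripping the "/dev/" prefix. For a hypothetical vd->vdev_path of
+ * "/dev/ada0p3", vd->vdev_path + sizeof("/dev/") - 1 skips five
+ * characters and points at "ada0p3", since sizeof also counts the
+ * terminating NUL.)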
+ */ + if ((cp = vd->vdev_tsd) != NULL) { + ASSERT(vd->vdev_reopening); + goto skip_open; + } + + DROP_GIANT(); + g_topology_lock(); + error = 0; + + if (vd->vdev_spa->spa_splitting_newspa || + (vd->vdev_prevstate == VDEV_STATE_UNKNOWN && + vd->vdev_spa->spa_load_state == SPA_LOAD_NONE || + vd->vdev_spa->spa_load_state == SPA_LOAD_CREATE)) { + /* + * We are dealing with a vdev that hasn't been previously + * opened (since boot), and we are not loading an + * existing pool configuration. This looks like a + * vdev add operation to a new or existing pool. + * Assume the user knows what he/she is doing and find + * GEOM provider by its name, ignoring GUID mismatches. + * + * XXPOLICY: It would be safer to only allow a device + * that is unlabeled or labeled but missing + * GUID information to be opened in this fashion, + * unless we are doing a split, in which case we + * should allow any guid. + */ + cp = vdev_geom_open_by_path(vd, 0); + } else { + /* + * Try using the recorded path for this device, but only + * accept it if its label data contains the expected GUIDs. + */ + cp = vdev_geom_open_by_path(vd, 1); + if (cp == NULL) { + /* + * The device at vd->vdev_path doesn't have the + * expected GUIDs. The disks might have merely + * moved around so try all other GEOM providers + * to find one with the right GUIDs. + */ + cp = vdev_geom_open_by_guids(vd); + } + } + + /* Clear the TLS now that tasting is done */ + VERIFY(tsd_set(zfs_geom_probe_vdev_key, NULL) == 0); + + if (cp == NULL) { + ZFS_LOG(1, "Vdev %s not found.", vd->vdev_path); + error = ENOENT; + } else { + struct consumer_priv_t *priv; + struct consumer_vdev_elem *elem; + int spamode; + + priv = (struct consumer_priv_t*)&cp->private; + if (cp->private == NULL) + SLIST_INIT(priv); + elem = g_malloc(sizeof(*elem), M_WAITOK|M_ZERO); + elem->vd = vd; + SLIST_INSERT_HEAD(priv, elem, elems); + + spamode = spa_mode(vd->vdev_spa); + if (cp->provider->sectorsize > VDEV_PAD_SIZE || + !ISP2(cp->provider->sectorsize)) { + ZFS_LOG(1, "Provider %s has unsupported sectorsize.", + cp->provider->name); + + vdev_geom_close_locked(vd); + error = EINVAL; + cp = NULL; + } else if (cp->acw == 0 && (spamode & FWRITE) != 0) { + int i; + + for (i = 0; i < 5; i++) { + error = g_access(cp, 0, 1, 0); + if (error == 0) + break; + g_topology_unlock(); + tsleep(vd, 0, "vdev", hz / 2); + g_topology_lock(); + } + if (error != 0) { + printf("ZFS WARNING: Unable to open %s for writing (error=%d).\n", + cp->provider->name, error); + vdev_geom_close_locked(vd); + cp = NULL; + } + } + } + + /* Fetch initial physical path information for this device. */ + if (cp != NULL) { + vdev_geom_attrchanged(cp, "GEOM::physpath"); + + /* Set other GEOM characteristics */ + vdev_geom_set_physpath(vd, cp, /*do_null_update*/B_FALSE); + vdev_geom_set_rotation_rate(vd, cp); + } + + g_topology_unlock(); + PICKUP_GIANT(); + if (cp == NULL) { + vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; + vdev_dbgmsg(vd, "vdev_geom_open: failed to open [error=%d]", + error); + return (error); + } +skip_open: + pp = cp->provider; + + /* + * Determine the actual size of the device. + */ + *max_psize = *psize = pp->mediasize; + + /* + * Determine the device's minimum transfer size and preferred + * transfer size. 
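+ *
+ * For example (hypothetical device): with pp->sectorsize == 512 and
+ * pp->stripesize == 4096, i.e. a 512e disk reporting a 4K physical
+ * sector, the code below computes
+ *
+ *	*logical_ashift = highbit(MAX(512, SPA_MINBLOCKSIZE)) - 1 = 9
+ *	*physical_ashift = highbit(4096) - 1 = 12
+ *
+ * because 4096 is a power of two greater than 1 << 9, no larger than
+ * 1 << SPA_MAXASHIFT, and pp->stripeoffset is 0.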
+ */ + *logical_ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1; + *physical_ashift = 0; + if (pp->stripesize > (1 << *logical_ashift) && ISP2(pp->stripesize) && + pp->stripesize <= (1 << SPA_MAXASHIFT) && pp->stripeoffset == 0) + *physical_ashift = highbit(pp->stripesize) - 1; + + /* + * Clear the nowritecache settings, so that on a vdev_reopen() + * we will try again. + */ + vd->vdev_nowritecache = B_FALSE; + + return (0); +} + +static void +vdev_geom_close(vdev_t *vd) +{ + struct g_consumer *cp; + + cp = vd->vdev_tsd; + + DROP_GIANT(); + g_topology_lock(); + + if (!vd->vdev_reopening || + (cp != NULL && ((cp->flags & G_CF_ORPHAN) != 0 || + (cp->provider != NULL && cp->provider->error != 0)))) + vdev_geom_close_locked(vd); + + g_topology_unlock(); + PICKUP_GIANT(); +} + +static void +vdev_geom_io_intr(struct bio *bp) +{ + vdev_t *vd; + zio_t *zio; + + zio = bp->bio_caller1; + vd = zio->io_vd; + zio->io_error = bp->bio_error; + if (zio->io_error == 0 && bp->bio_resid != 0) + zio->io_error = SET_ERROR(EIO); + + switch(zio->io_error) { + case ENOTSUP: + /* + * If we get ENOTSUP for BIO_FLUSH or BIO_DELETE we know + * that future attempts will never succeed. In this case + * we set a persistent flag so that we don't bother with + * requests in the future. + */ + switch(bp->bio_cmd) { + case BIO_FLUSH: + vd->vdev_nowritecache = B_TRUE; + break; + case BIO_DELETE: + vd->vdev_notrim = B_TRUE; + break; + } + break; + case ENXIO: + if (!vd->vdev_remove_wanted) { + /* + * If provider's error is set we assume it is being + * removed. + */ + if (bp->bio_to->error != 0) { + vd->vdev_remove_wanted = B_TRUE; + spa_async_request(zio->io_spa, + SPA_ASYNC_REMOVE); + } else if (!vd->vdev_delayed_close) { + vd->vdev_delayed_close = B_TRUE; + } + } + break; + } + + /* + * We have to split bio freeing into two parts, because the ABD code + * cannot be called in this context and vdev_op_io_done is not called + * for ZIO_TYPE_IOCTL zio-s. 
+ */ + if (zio->io_type != ZIO_TYPE_READ && zio->io_type != ZIO_TYPE_WRITE) { + g_destroy_bio(bp); + zio->io_bio = NULL; + } + zio_delay_interrupt(zio); +} + +static void +vdev_geom_io_start(zio_t *zio) +{ + vdev_t *vd; + struct g_consumer *cp; + struct bio *bp; + int error; + + vd = zio->io_vd; + + switch (zio->io_type) { + case ZIO_TYPE_IOCTL: + /* XXPOLICY */ + if (!vdev_readable(vd)) { + zio->io_error = SET_ERROR(ENXIO); + zio_interrupt(zio); + return; + } else { + switch (zio->io_cmd) { + case DKIOCFLUSHWRITECACHE: + if (zfs_nocacheflush || vdev_geom_bio_flush_disable) + break; + if (vd->vdev_nowritecache) { + zio->io_error = SET_ERROR(ENOTSUP); + break; + } + goto sendreq; + default: + zio->io_error = SET_ERROR(ENOTSUP); + } + } + + zio_execute(zio); + return; + case ZIO_TYPE_FREE: + if (vd->vdev_notrim) { + zio->io_error = SET_ERROR(ENOTSUP); + } else if (!vdev_geom_bio_delete_disable) { + goto sendreq; + } + zio_execute(zio); + return; + } +sendreq: + ASSERT(zio->io_type == ZIO_TYPE_READ || + zio->io_type == ZIO_TYPE_WRITE || + zio->io_type == ZIO_TYPE_FREE || + zio->io_type == ZIO_TYPE_IOCTL); + + cp = vd->vdev_tsd; + if (cp == NULL) { + zio->io_error = SET_ERROR(ENXIO); + zio_interrupt(zio); + return; + } + bp = g_alloc_bio(); + bp->bio_caller1 = zio; + switch (zio->io_type) { + case ZIO_TYPE_READ: + case ZIO_TYPE_WRITE: + zio->io_target_timestamp = zio_handle_io_delay(zio); + bp->bio_offset = zio->io_offset; + bp->bio_length = zio->io_size; + if (zio->io_type == ZIO_TYPE_READ) { + bp->bio_cmd = BIO_READ; + bp->bio_data = + abd_borrow_buf(zio->io_abd, zio->io_size); + } else { + bp->bio_cmd = BIO_WRITE; + bp->bio_data = + abd_borrow_buf_copy(zio->io_abd, zio->io_size); + } + break; + case ZIO_TYPE_FREE: + bp->bio_cmd = BIO_DELETE; + bp->bio_data = NULL; + bp->bio_offset = zio->io_offset; + bp->bio_length = zio->io_size; + break; + case ZIO_TYPE_IOCTL: + bp->bio_cmd = BIO_FLUSH; + bp->bio_data = NULL; + bp->bio_offset = cp->provider->mediasize; + bp->bio_length = 0; + break; + } + bp->bio_done = vdev_geom_io_intr; + zio->io_bio = bp; + + g_io_request(bp, cp); +} + +static void +vdev_geom_io_done(zio_t *zio) +{ + struct bio *bp = zio->io_bio; + + if (zio->io_type != ZIO_TYPE_READ && zio->io_type != ZIO_TYPE_WRITE) { + ASSERT(bp == NULL); + return; + } + + if (bp == NULL) { + ASSERT3S(zio->io_error, ==, ENXIO); + return; + } + + if (zio->io_type == ZIO_TYPE_READ) + abd_return_buf_copy(zio->io_abd, bp->bio_data, zio->io_size); + else + abd_return_buf(zio->io_abd, bp->bio_data, zio->io_size); + + g_destroy_bio(bp); + zio->io_bio = NULL; +} + +static void +vdev_geom_hold(vdev_t *vd) +{ +} + +static void +vdev_geom_rele(vdev_t *vd) +{ +} + +vdev_ops_t vdev_geom_ops = { + vdev_geom_open, + vdev_geom_close, + vdev_default_asize, + vdev_geom_io_start, + vdev_geom_io_done, + NULL, + NULL, + vdev_geom_hold, + vdev_geom_rele, + NULL, + vdev_default_xlate, + VDEV_TYPE_DISK, /* name of this vdev type */ + B_TRUE /* leaf vdev */ +}; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c new file mode 100644 index 000000000000..5fb44936b127 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c @@ -0,0 +1,1649 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. 
+ * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2014, 2017 by Delphix. All rights reserved. + */ + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/spa_impl.h> +#include <sys/vdev_impl.h> +#include <sys/fs/zfs.h> +#include <sys/zio.h> +#include <sys/zio_checksum.h> +#include <sys/metaslab.h> +#include <sys/refcount.h> +#include <sys/dmu.h> +#include <sys/vdev_indirect_mapping.h> +#include <sys/dmu_tx.h> +#include <sys/dsl_synctask.h> +#include <sys/zap.h> +#include <sys/abd.h> +#include <sys/zthr.h> + +/* + * An indirect vdev corresponds to a vdev that has been removed. Since + * we cannot rewrite block pointers of snapshots, etc., we keep a + * mapping from old location on the removed device to the new location + * on another device in the pool and use this mapping whenever we need + * to access the DVA. Unfortunately, this mapping did not respect + * logical block boundaries when it was first created, and so a DVA on + * this indirect vdev may be "split" into multiple sections that each + * map to a different location. As a consequence, not all DVAs can be + * translated to an equivalent new DVA. Instead we must provide a + * "vdev_remap" operation that executes a callback on each contiguous + * segment of the new location. This function is used in multiple ways: + * + * - i/os to this vdev use the callback to determine where the + * data is now located, and issue child i/os for each segment's new + * location. + * + * - frees and claims to this vdev use the callback to free or claim + * each mapped segment. (Note that we don't actually need to claim + * log blocks on indirect vdevs, because we don't allocate to + * removing vdevs. However, zdb uses zio_claim() for its leak + * detection.) + */ + +/* + * "Big theory statement" for how we mark blocks obsolete. + * + * When a block on an indirect vdev is freed or remapped, a section of + * that vdev's mapping may no longer be referenced (aka "obsolete"). We + * keep track of how much of each mapping entry is obsolete. When + * an entry becomes completely obsolete, we can remove it, thus reducing + * the memory used by the mapping. The complete picture of obsolescence + * is given by the following data structures, described below: + * - the entry-specific obsolete count + * - the vdev-specific obsolete spacemap + * - the pool-specific obsolete bpobj + * + * == On disk data structures used == + * + * We track the obsolete space for the pool using several objects. Each + * of these objects is created on demand and freed when no longer + * needed, and is assumed to be empty if it does not exist. + * SPA_FEATURE_OBSOLETE_COUNTS includes the count of these objects. + * + * - Each vic_mapping_object (associated with an indirect vdev) can + * have a vimp_counts_object. This is an array of uint32_t's + * with the same number of entries as the vic_mapping_object. When + * the mapping is condensed, entries from the vic_obsolete_sm_object + * (see below) are folded into the counts. Therefore, each + * obsolete_counts entry tells us the number of bytes in the + * corresponding mapping entry that were not referenced when the + * mapping was last condensed. + * + * - Each indirect or removing vdev can have a vic_obsolete_sm_object. 
+ * This is a space map containing an alloc entry for every DVA that + * has been obsoleted since the last time this indirect vdev was + * condensed. We use this object in order to improve performance + * when marking a DVA as obsolete. Instead of modifying an arbitrary + * offset of the vimp_counts_object, we only need to append an entry + * to the end of this object. When a DVA becomes obsolete, it is + * added to the obsolete space map. This happens when the DVA is + * freed, remapped and not referenced by a snapshot, or the last + * snapshot referencing it is destroyed. + * + * - Each dataset can have a ds_remap_deadlist object. This is a + * deadlist object containing all blocks that were remapped in this + * dataset but referenced in a previous snapshot. Blocks can *only* + * appear on this list if they were remapped (dsl_dataset_block_remapped); + * blocks that were killed in a head dataset are put on the normal + * ds_deadlist and marked obsolete when they are freed. + * + * - The pool can have a dp_obsolete_bpobj. This is a list of blocks + * in the pool that need to be marked obsolete. When a snapshot is + * destroyed, we move some of the ds_remap_deadlist to the obsolete + * bpobj (see dsl_destroy_snapshot_handle_remaps()). We then + * asynchronously process the obsolete bpobj, moving its entries to + * the specific vdevs' obsolete space maps. + * + * == Summary of how we mark blocks as obsolete == + * + * - When freeing a block: if any DVA is on an indirect vdev, append to + * vic_obsolete_sm_object. + * - When remapping a block, add dva to ds_remap_deadlist (if prev snap + * references; otherwise append to vic_obsolete_sm_object). + * - When freeing a snapshot: move parts of ds_remap_deadlist to + * dp_obsolete_bpobj (same algorithm as ds_deadlist). + * - When syncing the spa: process dp_obsolete_bpobj, moving ranges to + * individual vdev's vic_obsolete_sm_object. + */ + +/* + * "Big theory statement" for how we condense indirect vdevs. + * + * Condensing an indirect vdev's mapping is the process of determining + * the precise counts of obsolete space for each mapping entry (by + * integrating the obsolete spacemap into the obsolete counts) and + * writing out a new mapping that contains only referenced entries. + * + * We condense a vdev when we expect the mapping to shrink (see + * vdev_indirect_should_condense()), but only perform one condense at a + * time to limit the memory usage. In addition, we use a separate + * open-context thread (spa_condense_indirect_thread) to incrementally + * create the new mapping object in a way that minimizes the impact on + * the rest of the system. + * + * == Generating a new mapping == + * + * To generate a new mapping, we follow these steps: + * + * 1. Save the old obsolete space map and create a new mapping object + * (see spa_condense_indirect_start_sync()). This initializes the + * spa_condensing_indirect_phys with the "previous obsolete space map", + * which is now read only. Newly obsolete DVAs will be added to a + * new (initially empty) obsolete space map, and will not be + * considered as part of this condense operation. + * + * 2. Construct in memory the precise counts of obsolete space for each + * mapping entry, by incorporating the obsolete space map into the + * counts. (See vdev_indirect_mapping_load_obsolete_{counts,spacemap}().) + * + * 3. Iterate through each mapping entry, writing to the new mapping any + * entries that are not completely obsolete (i.e. which don't have + * obsolete count == mapping length). 
(See + * spa_condense_indirect_generate_new_mapping().) + * + * 4. Destroy the old mapping object and switch over to the new one + * (spa_condense_indirect_complete_sync). + * + * == Restarting from failure == + * + * To restart the condense when we import/open the pool, we must start + * at the 2nd step above: reconstruct the precise counts in memory, + * based on the space map + counts. Then in the 3rd step, we start + * iterating where we left off: at vimp_max_offset of the new mapping + * object. + */ + +boolean_t zfs_condense_indirect_vdevs_enable = B_TRUE; + +/* + * Condense if at least this percent of the bytes in the mapping is + * obsolete. With the default of 25%, the amount of space mapped + * will be reduced to 1% of its original size after at most 16 + * condenses. Higher values will condense less often (causing less + * i/o); lower values will reduce the mapping size more quickly. + */ +int zfs_indirect_condense_obsolete_pct = 25; + +/* + * Condense if the obsolete space map takes up more than this amount of + * space on disk (logically). This limits the amount of disk space + * consumed by the obsolete space map; the default of 1GB is small enough + * that we typically don't mind "wasting" it. + */ +uint64_t zfs_condense_max_obsolete_bytes = 1024 * 1024 * 1024; + +/* + * Don't bother condensing if the mapping uses less than this amount of + * memory. The default of 128KB is considered a "trivial" amount of + * memory and not worth reducing. + */ +uint64_t zfs_condense_min_mapping_bytes = 128 * 1024; + +/* + * This is used by the test suite so that it can ensure that certain + * actions happen while in the middle of a condense (which might otherwise + * complete too quickly). If used to reduce the performance impact of + * condensing in production, a maximum value of 1 should be sufficient. + */ +int zfs_condense_indirect_commit_entry_delay_ticks = 0; + +/* + * If a split block contains more than this many segments, consider it too + * computationally expensive to check all (2^num_segments) possible + * combinations. Instead, try at most 2^_segments_max randomly-selected + * combinations. + * + * This is reasonable if only a few segment copies are damaged and the + * majority of segment copies are good. This allows all the segment copies to + * participate fairly in the reconstruction and prevents the repeated use of + * one bad copy. + */ +int zfs_reconstruct_indirect_segments_max = 10; + +/* + * The indirect_child_t represents the vdev that we will read from, when we + * need to read all copies of the data (e.g. for scrub or reconstruction). + * For plain (non-mirror) top-level vdevs (i.e. is_vdev is not a mirror), + * ic_vdev is the same as is_vdev. However, for mirror top-level vdevs, + * ic_vdev is a child of the mirror. + */ +typedef struct indirect_child { + abd_t *ic_data; + vdev_t *ic_vdev; +} indirect_child_t; + +/* + * The indirect_split_t represents one mapped segment of an i/o to the + * indirect vdev. For non-split (contiguously-mapped) blocks, there will be + * only one indirect_split_t, with is_split_offset==0 and is_size==io_size. + * For split blocks, there will be several of these. + */ +typedef struct indirect_split { + list_node_t is_node; /* link on iv_splits */ + + /* + * is_split_offset is the offset into the i/o. + * This is the sum of the previous splits' is_size's. 
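+ *
+ * For example (hypothetical sizes): a 128K i/o whose mapping was
+ * split into segments of 64K, 16K and 48K is described by three
+ * indirect_split_t's with (is_split_offset, is_size) of (0, 64K),
+ * (64K, 16K) and (80K, 48K).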
+ */ + uint64_t is_split_offset; + + vdev_t *is_vdev; /* top-level vdev */ + uint64_t is_target_offset; /* offset on is_vdev */ + uint64_t is_size; + int is_children; /* number of entries in is_child[] */ + + /* + * is_good_child is the child that we are currently using to + * attempt reconstruction. + */ + int is_good_child; + + indirect_child_t is_child[1]; /* variable-length */ +} indirect_split_t; + +/* + * The indirect_vsd_t is associated with each i/o to the indirect vdev. + * It is the "Vdev-Specific Data" in the zio_t's io_vsd. + */ +typedef struct indirect_vsd { + boolean_t iv_split_block; + boolean_t iv_reconstruct; + + list_t iv_splits; /* list of indirect_split_t's */ +} indirect_vsd_t; + +static void +vdev_indirect_map_free(zio_t *zio) +{ + indirect_vsd_t *iv = zio->io_vsd; + + indirect_split_t *is; + while ((is = list_head(&iv->iv_splits)) != NULL) { + for (int c = 0; c < is->is_children; c++) { + indirect_child_t *ic = &is->is_child[c]; + if (ic->ic_data != NULL) + abd_free(ic->ic_data); + } + list_remove(&iv->iv_splits, is); + kmem_free(is, + offsetof(indirect_split_t, is_child[is->is_children])); + } + kmem_free(iv, sizeof (*iv)); +} + +static const zio_vsd_ops_t vdev_indirect_vsd_ops = { + vdev_indirect_map_free, + zio_vsd_default_cksum_report +}; +/* + * Mark the given offset and size as being obsolete. + */ +void +vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset, uint64_t size) +{ + spa_t *spa = vd->vdev_spa; + + ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, !=, 0); + ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops); + ASSERT(size > 0); + VERIFY(vdev_indirect_mapping_entry_for_offset( + vd->vdev_indirect_mapping, offset) != NULL); + + if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) { + mutex_enter(&vd->vdev_obsolete_lock); + range_tree_add(vd->vdev_obsolete_segments, offset, size); + mutex_exit(&vd->vdev_obsolete_lock); + vdev_dirty(vd, 0, NULL, spa_syncing_txg(spa)); + } +} + +/* + * Mark the DVA vdev_id:offset:size as being obsolete in the given tx. This + * wrapper is provided because the DMU does not know about vdev_t's and + * cannot directly call vdev_indirect_mark_obsolete. + */ +void +spa_vdev_indirect_mark_obsolete(spa_t *spa, uint64_t vdev_id, uint64_t offset, + uint64_t size, dmu_tx_t *tx) +{ + vdev_t *vd = vdev_lookup_top(spa, vdev_id); + ASSERT(dmu_tx_is_syncing(tx)); + + /* The DMU can only remap indirect vdevs. 
*/ + ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); + vdev_indirect_mark_obsolete(vd, offset, size); +} + +static spa_condensing_indirect_t * +spa_condensing_indirect_create(spa_t *spa) +{ + spa_condensing_indirect_phys_t *scip = + &spa->spa_condensing_indirect_phys; + spa_condensing_indirect_t *sci = kmem_zalloc(sizeof (*sci), KM_SLEEP); + objset_t *mos = spa->spa_meta_objset; + + for (int i = 0; i < TXG_SIZE; i++) { + list_create(&sci->sci_new_mapping_entries[i], + sizeof (vdev_indirect_mapping_entry_t), + offsetof(vdev_indirect_mapping_entry_t, vime_node)); + } + + sci->sci_new_mapping = + vdev_indirect_mapping_open(mos, scip->scip_next_mapping_object); + + return (sci); +} + +static void +spa_condensing_indirect_destroy(spa_condensing_indirect_t *sci) +{ + for (int i = 0; i < TXG_SIZE; i++) + list_destroy(&sci->sci_new_mapping_entries[i]); + + if (sci->sci_new_mapping != NULL) + vdev_indirect_mapping_close(sci->sci_new_mapping); + + kmem_free(sci, sizeof (*sci)); +} + +boolean_t +vdev_indirect_should_condense(vdev_t *vd) +{ + vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; + spa_t *spa = vd->vdev_spa; + + ASSERT(dsl_pool_sync_context(spa->spa_dsl_pool)); + + if (!zfs_condense_indirect_vdevs_enable) + return (B_FALSE); + + /* + * We can only condense one indirect vdev at a time. + */ + if (spa->spa_condensing_indirect != NULL) + return (B_FALSE); + + if (spa_shutting_down(spa)) + return (B_FALSE); + + /* + * The mapping object size must not change while we are + * condensing, so we can only condense indirect vdevs + * (not vdevs that are still in the middle of being removed). + */ + if (vd->vdev_ops != &vdev_indirect_ops) + return (B_FALSE); + + /* + * If nothing new has been marked obsolete, there is no + * point in condensing. + */ + if (vd->vdev_obsolete_sm == NULL) { + ASSERT0(vdev_obsolete_sm_object(vd)); + return (B_FALSE); + } + + ASSERT(vd->vdev_obsolete_sm != NULL); + + ASSERT3U(vdev_obsolete_sm_object(vd), ==, + space_map_object(vd->vdev_obsolete_sm)); + + uint64_t bytes_mapped = vdev_indirect_mapping_bytes_mapped(vim); + uint64_t bytes_obsolete = space_map_allocated(vd->vdev_obsolete_sm); + uint64_t mapping_size = vdev_indirect_mapping_size(vim); + uint64_t obsolete_sm_size = space_map_length(vd->vdev_obsolete_sm); + + ASSERT3U(bytes_obsolete, <=, bytes_mapped); + + /* + * If a high percentage of the bytes that are mapped have become + * obsolete, condense (unless the mapping is already small enough). + * This has a good chance of reducing the amount of memory used + * by the mapping. + */ + if (bytes_obsolete * 100 / bytes_mapped >= + zfs_indirect_condense_obsolete_pct && + mapping_size > zfs_condense_min_mapping_bytes) { + zfs_dbgmsg("should condense vdev %llu because obsolete " + "spacemap covers %d%% of %lluMB mapping", + (u_longlong_t)vd->vdev_id, + (int)(bytes_obsolete * 100 / bytes_mapped), + (u_longlong_t)bytes_mapped / 1024 / 1024); + return (B_TRUE); + } + + /* + * If the obsolete space map takes up too much space on disk, + * condense in order to free up this disk space. + */ + if (obsolete_sm_size >= zfs_condense_max_obsolete_bytes) { + zfs_dbgmsg("should condense vdev %llu because obsolete sm " + "length %lluMB >= max size %lluMB", + (u_longlong_t)vd->vdev_id, + (u_longlong_t)obsolete_sm_size / 1024 / 1024, + (u_longlong_t)zfs_condense_max_obsolete_bytes / + 1024 / 1024); + return (B_TRUE); + } + + return (B_FALSE); +} + +/* + * This sync task completes (finishes) a condense, deleting the old + * mapping and replacing it with the new one. 
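+ *
+ * In outline, the body below: swaps vd->vdev_indirect_mapping to the
+ * new object under vdev_indirect_rwlock, frees the old mapping object
+ * and the previous obsolete space map, and deletes the
+ * DMU_POOL_CONDENSING_INDIRECT ZAP entry, all in the same syncing tx
+ * so that a crash sees either the whole condense or none of it.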
+ */ +static void +spa_condense_indirect_complete_sync(void *arg, dmu_tx_t *tx) +{ + spa_condensing_indirect_t *sci = arg; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + spa_condensing_indirect_phys_t *scip = + &spa->spa_condensing_indirect_phys; + vdev_t *vd = vdev_lookup_top(spa, scip->scip_vdev); + vdev_indirect_config_t *vic = &vd->vdev_indirect_config; + objset_t *mos = spa->spa_meta_objset; + vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping; + uint64_t old_count = vdev_indirect_mapping_num_entries(old_mapping); + uint64_t new_count = + vdev_indirect_mapping_num_entries(sci->sci_new_mapping); + + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); + ASSERT3P(sci, ==, spa->spa_condensing_indirect); + for (int i = 0; i < TXG_SIZE; i++) { + ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i])); + } + ASSERT(vic->vic_mapping_object != 0); + ASSERT3U(vd->vdev_id, ==, scip->scip_vdev); + ASSERT(scip->scip_next_mapping_object != 0); + ASSERT(scip->scip_prev_obsolete_sm_object != 0); + + /* + * Reset vdev_indirect_mapping to refer to the new object. + */ + rw_enter(&vd->vdev_indirect_rwlock, RW_WRITER); + vdev_indirect_mapping_close(vd->vdev_indirect_mapping); + vd->vdev_indirect_mapping = sci->sci_new_mapping; + rw_exit(&vd->vdev_indirect_rwlock); + + sci->sci_new_mapping = NULL; + vdev_indirect_mapping_free(mos, vic->vic_mapping_object, tx); + vic->vic_mapping_object = scip->scip_next_mapping_object; + scip->scip_next_mapping_object = 0; + + space_map_free_obj(mos, scip->scip_prev_obsolete_sm_object, tx); + spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); + scip->scip_prev_obsolete_sm_object = 0; + + scip->scip_vdev = 0; + + VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_CONDENSING_INDIRECT, tx)); + spa_condensing_indirect_destroy(spa->spa_condensing_indirect); + spa->spa_condensing_indirect = NULL; + + zfs_dbgmsg("finished condense of vdev %llu in txg %llu: " + "new mapping object %llu has %llu entries " + "(was %llu entries)", + vd->vdev_id, dmu_tx_get_txg(tx), vic->vic_mapping_object, + new_count, old_count); + + vdev_config_dirty(spa->spa_root_vdev); +} + +/* + * This sync task appends entries to the new mapping object. + */ +static void +spa_condense_indirect_commit_sync(void *arg, dmu_tx_t *tx) +{ + spa_condensing_indirect_t *sci = arg; + uint64_t txg = dmu_tx_get_txg(tx); + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT3P(sci, ==, spa->spa_condensing_indirect); + + vdev_indirect_mapping_add_entries(sci->sci_new_mapping, + &sci->sci_new_mapping_entries[txg & TXG_MASK], tx); + ASSERT(list_is_empty(&sci->sci_new_mapping_entries[txg & TXG_MASK])); +} + +/* + * Open-context function to add one entry to the new mapping. The new + * entry will be remembered and written from syncing context. + */ +static void +spa_condense_indirect_commit_entry(spa_t *spa, + vdev_indirect_mapping_entry_phys_t *vimep, uint32_t count) +{ + spa_condensing_indirect_t *sci = spa->spa_condensing_indirect; + + ASSERT3U(count, <, DVA_GET_ASIZE(&vimep->vimep_dst)); + + dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); + dmu_tx_hold_space(tx, sizeof (*vimep) + sizeof (count)); + VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; + + /* + * If we are the first entry committed this txg, kick off the sync + * task to write to the MOS on our behalf. 
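+ *
+ * (Illustrative: txgoff = txg & TXG_MASK selects one of the TXG_SIZE
+ * per-txg lists, e.g. txg 1234 maps to list 1234 % 4 == 2 when
+ * TXG_SIZE is 4, so entries queued here for different txgs never mix
+ * and _commit_sync() drains exactly the list of the txg it syncs.)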
+ */ + if (list_is_empty(&sci->sci_new_mapping_entries[txgoff])) { + dsl_sync_task_nowait(dmu_tx_pool(tx), + spa_condense_indirect_commit_sync, sci, + 0, ZFS_SPACE_CHECK_NONE, tx); + } + + vdev_indirect_mapping_entry_t *vime = + kmem_alloc(sizeof (*vime), KM_SLEEP); + vime->vime_mapping = *vimep; + vime->vime_obsolete_count = count; + list_insert_tail(&sci->sci_new_mapping_entries[txgoff], vime); + + dmu_tx_commit(tx); +} + +static void +spa_condense_indirect_generate_new_mapping(vdev_t *vd, + uint32_t *obsolete_counts, uint64_t start_index, zthr_t *zthr) +{ + spa_t *spa = vd->vdev_spa; + uint64_t mapi = start_index; + vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping; + uint64_t old_num_entries = + vdev_indirect_mapping_num_entries(old_mapping); + + ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); + ASSERT3U(vd->vdev_id, ==, spa->spa_condensing_indirect_phys.scip_vdev); + + zfs_dbgmsg("starting condense of vdev %llu from index %llu", + (u_longlong_t)vd->vdev_id, + (u_longlong_t)mapi); + + while (mapi < old_num_entries) { + + if (zthr_iscancelled(zthr)) { + zfs_dbgmsg("pausing condense of vdev %llu " + "at index %llu", (u_longlong_t)vd->vdev_id, + (u_longlong_t)mapi); + break; + } + + vdev_indirect_mapping_entry_phys_t *entry = + &old_mapping->vim_entries[mapi]; + uint64_t entry_size = DVA_GET_ASIZE(&entry->vimep_dst); + ASSERT3U(obsolete_counts[mapi], <=, entry_size); + if (obsolete_counts[mapi] < entry_size) { + spa_condense_indirect_commit_entry(spa, entry, + obsolete_counts[mapi]); + + /* + * This delay may be requested for testing, debugging, + * or performance reasons. + */ + delay(zfs_condense_indirect_commit_entry_delay_ticks); + } + + mapi++; + } +} + +/* ARGSUSED */ +static boolean_t +spa_condense_indirect_thread_check(void *arg, zthr_t *zthr) +{ + spa_t *spa = arg; + + return (spa->spa_condensing_indirect != NULL); +} + +/* ARGSUSED */ +static int +spa_condense_indirect_thread(void *arg, zthr_t *zthr) +{ + spa_t *spa = arg; + vdev_t *vd; + + ASSERT3P(spa->spa_condensing_indirect, !=, NULL); + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + vd = vdev_lookup_top(spa, spa->spa_condensing_indirect_phys.scip_vdev); + ASSERT3P(vd, !=, NULL); + spa_config_exit(spa, SCL_VDEV, FTAG); + + spa_condensing_indirect_t *sci = spa->spa_condensing_indirect; + spa_condensing_indirect_phys_t *scip = + &spa->spa_condensing_indirect_phys; + uint32_t *counts; + uint64_t start_index; + vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping; + space_map_t *prev_obsolete_sm = NULL; + + ASSERT3U(vd->vdev_id, ==, scip->scip_vdev); + ASSERT(scip->scip_next_mapping_object != 0); + ASSERT(scip->scip_prev_obsolete_sm_object != 0); + ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); + + for (int i = 0; i < TXG_SIZE; i++) { + /* + * The list must start out empty in order for the + * _commit_sync() sync task to be properly registered + * on the first call to _commit_entry(); so it's wise + * to double check and ensure we actually are starting + * with empty lists. + */ + ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i])); + } + + VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset, + scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0)); + space_map_update(prev_obsolete_sm); + counts = vdev_indirect_mapping_load_obsolete_counts(old_mapping); + if (prev_obsolete_sm != NULL) { + vdev_indirect_mapping_load_obsolete_spacemap(old_mapping, + counts, prev_obsolete_sm); + } + space_map_close(prev_obsolete_sm); + + /* + * Generate new mapping. 
Determine what index to continue from + * based on the max offset that we've already written in the + * new mapping. + */ + uint64_t max_offset = + vdev_indirect_mapping_max_offset(sci->sci_new_mapping); + if (max_offset == 0) { + /* We haven't written anything to the new mapping yet. */ + start_index = 0; + } else { + /* + * Pick up from where we left off. _entry_for_offset() + * returns a pointer into the vim_entries array. If + * max_offset is greater than any of the mappings + * contained in the table NULL will be returned and + * that indicates we've exhausted our iteration of the + * old_mapping. + */ + + vdev_indirect_mapping_entry_phys_t *entry = + vdev_indirect_mapping_entry_for_offset_or_next(old_mapping, + max_offset); + + if (entry == NULL) { + /* + * We've already written the whole new mapping. + * This special value will cause us to skip the + * generate_new_mapping step and just do the sync + * task to complete the condense. + */ + start_index = UINT64_MAX; + } else { + start_index = entry - old_mapping->vim_entries; + ASSERT3U(start_index, <, + vdev_indirect_mapping_num_entries(old_mapping)); + } + } + + spa_condense_indirect_generate_new_mapping(vd, counts, + start_index, zthr); + + vdev_indirect_mapping_free_obsolete_counts(old_mapping, counts); + + /* + * If the zthr has received a cancellation signal while running + * in generate_new_mapping() or at any point after that, then bail + * early. We don't want to complete the condense if the spa is + * shutting down. + */ + if (zthr_iscancelled(zthr)) + return (0); + + VERIFY0(dsl_sync_task(spa_name(spa), NULL, + spa_condense_indirect_complete_sync, sci, 0, + ZFS_SPACE_CHECK_EXTRA_RESERVED)); + + return (0); +} + +/* + * Sync task to begin the condensing process. + */ +void +spa_condense_indirect_start_sync(vdev_t *vd, dmu_tx_t *tx) +{ + spa_t *spa = vd->vdev_spa; + spa_condensing_indirect_phys_t *scip = + &spa->spa_condensing_indirect_phys; + + ASSERT0(scip->scip_next_mapping_object); + ASSERT0(scip->scip_prev_obsolete_sm_object); + ASSERT0(scip->scip_vdev); + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); + ASSERT(spa_feature_is_active(spa, SPA_FEATURE_OBSOLETE_COUNTS)); + ASSERT(vdev_indirect_mapping_num_entries(vd->vdev_indirect_mapping)); + + uint64_t obsolete_sm_obj = vdev_obsolete_sm_object(vd); + ASSERT(obsolete_sm_obj != 0); + + scip->scip_vdev = vd->vdev_id; + scip->scip_next_mapping_object = + vdev_indirect_mapping_alloc(spa->spa_meta_objset, tx); + + scip->scip_prev_obsolete_sm_object = obsolete_sm_obj; + + /* + * We don't need to allocate a new space map object, since + * vdev_indirect_sync_obsolete will allocate one when needed. 
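+ *
+ * (The current obsolete space map was recorded above as
+ * scip_prev_obsolete_sm_object, where it serves as read-only input to
+ * the condense; closing and unlinking it here means the next
+ * vdev_indirect_sync_obsolete() call lazily allocates a fresh, empty
+ * one for DVAs obsoleted from now on.)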
+ */ + space_map_close(vd->vdev_obsolete_sm); + vd->vdev_obsolete_sm = NULL; + VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap, + VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, tx)); + + VERIFY0(zap_add(spa->spa_dsl_pool->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_CONDENSING_INDIRECT, sizeof (uint64_t), + sizeof (*scip) / sizeof (uint64_t), scip, tx)); + + ASSERT3P(spa->spa_condensing_indirect, ==, NULL); + spa->spa_condensing_indirect = spa_condensing_indirect_create(spa); + + zfs_dbgmsg("starting condense of vdev %llu in txg %llu: " + "posm=%llu nm=%llu", + vd->vdev_id, dmu_tx_get_txg(tx), + (u_longlong_t)scip->scip_prev_obsolete_sm_object, + (u_longlong_t)scip->scip_next_mapping_object); + + zthr_wakeup(spa->spa_condense_zthr); +} + +/* + * Sync to the given vdev's obsolete space map any segments that are no longer + * referenced as of the given txg. + * + * If the obsolete space map doesn't exist yet, create and open it. + */ +void +vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx) +{ + spa_t *spa = vd->vdev_spa; + vdev_indirect_config_t *vic = &vd->vdev_indirect_config; + + ASSERT3U(vic->vic_mapping_object, !=, 0); + ASSERT(range_tree_space(vd->vdev_obsolete_segments) > 0); + ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops); + ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)); + + if (vdev_obsolete_sm_object(vd) == 0) { + uint64_t obsolete_sm_object = + space_map_alloc(spa->spa_meta_objset, + vdev_standard_sm_blksz, tx); + + ASSERT(vd->vdev_top_zap != 0); + VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, + VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, + sizeof (obsolete_sm_object), 1, &obsolete_sm_object, tx)); + ASSERT3U(vdev_obsolete_sm_object(vd), !=, 0); + + spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); + VERIFY0(space_map_open(&vd->vdev_obsolete_sm, + spa->spa_meta_objset, obsolete_sm_object, + 0, vd->vdev_asize, 0)); + space_map_update(vd->vdev_obsolete_sm); + } + + ASSERT(vd->vdev_obsolete_sm != NULL); + ASSERT3U(vdev_obsolete_sm_object(vd), ==, + space_map_object(vd->vdev_obsolete_sm)); + + space_map_write(vd->vdev_obsolete_sm, + vd->vdev_obsolete_segments, SM_ALLOC, SM_NO_VDEVID, tx); + space_map_update(vd->vdev_obsolete_sm); + range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL); +} + +int +spa_condense_init(spa_t *spa) +{ + int error = zap_lookup(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_CONDENSING_INDIRECT, sizeof (uint64_t), + sizeof (spa->spa_condensing_indirect_phys) / sizeof (uint64_t), + &spa->spa_condensing_indirect_phys); + if (error == 0) { + if (spa_writeable(spa)) { + spa->spa_condensing_indirect = + spa_condensing_indirect_create(spa); + } + return (0); + } else if (error == ENOENT) { + return (0); + } else { + return (error); + } +} + +void +spa_condense_fini(spa_t *spa) +{ + if (spa->spa_condensing_indirect != NULL) { + spa_condensing_indirect_destroy(spa->spa_condensing_indirect); + spa->spa_condensing_indirect = NULL; + } +} + +void +spa_start_indirect_condensing_thread(spa_t *spa) +{ + ASSERT3P(spa->spa_condense_zthr, ==, NULL); + spa->spa_condense_zthr = zthr_create(spa_condense_indirect_thread_check, + spa_condense_indirect_thread, spa); +} + +/* + * Gets the obsolete spacemap object from the vdev's ZAP. + * Returns the spacemap object, or 0 if it wasn't in the ZAP or the ZAP doesn't + * exist yet. 
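+ *
+ * (Usage sketch: callers treat a return of 0 as "no obsolete space
+ * map yet", e.g.
+ *
+ *	if (vdev_obsolete_sm_object(vd) == 0)
+ *		... allocate and register one, as
+ *		vdev_indirect_sync_obsolete() does ...
+ *
+ * which works because an allocated MOS object number is never 0.)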
+ */ +int +vdev_obsolete_sm_object(vdev_t *vd) +{ + ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); + if (vd->vdev_top_zap == 0) { + return (0); + } + + uint64_t sm_obj = 0; + int err = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, + VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, sizeof (sm_obj), 1, &sm_obj); + + ASSERT(err == 0 || err == ENOENT); + + return (sm_obj); +} + +boolean_t +vdev_obsolete_counts_are_precise(vdev_t *vd) +{ + ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); + if (vd->vdev_top_zap == 0) { + return (B_FALSE); + } + + uint64_t val = 0; + int err = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, + VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, sizeof (val), 1, &val); + + ASSERT(err == 0 || err == ENOENT); + + return (val != 0); +} + +/* ARGSUSED */ +static void +vdev_indirect_close(vdev_t *vd) +{ +} + +/* ARGSUSED */ +static int +vdev_indirect_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, + uint64_t *logical_ashift, uint64_t *physical_ashift) +{ + *psize = *max_psize = vd->vdev_asize + + VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; + *logical_ashift = vd->vdev_ashift; + *physical_ashift = vd->vdev_physical_ashift; + return (0); +} + +typedef struct remap_segment { + vdev_t *rs_vd; + uint64_t rs_offset; + uint64_t rs_asize; + uint64_t rs_split_offset; + list_node_t rs_node; +} remap_segment_t; + +remap_segment_t * +rs_alloc(vdev_t *vd, uint64_t offset, uint64_t asize, uint64_t split_offset) +{ + remap_segment_t *rs = kmem_alloc(sizeof (remap_segment_t), KM_SLEEP); + rs->rs_vd = vd; + rs->rs_offset = offset; + rs->rs_asize = asize; + rs->rs_split_offset = split_offset; + return (rs); +} + +/* + * Given an indirect vdev and an extent on that vdev, it duplicates the + * physical entries of the indirect mapping that correspond to the extent + * to a new array and returns a pointer to it. In addition, copied_entries + * is populated with the number of mapping entries that were duplicated. + * + * Note that the function assumes that the caller holds vdev_indirect_rwlock. + * This ensures that the mapping won't change due to condensing as we + * copy over its contents. + * + * Finally, since we are doing an allocation, it is up to the caller to + * free the array allocated in this function. 
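+ *
+ * A minimal caller sketch (mirroring vdev_indirect_remap() below;
+ * "off" and "asize" are hypothetical):
+ *
+ *	uint64_t n = 0;
+ *	vdev_indirect_mapping_entry_phys_t *m;
+ *
+ *	rw_enter(&vd->vdev_indirect_rwlock, RW_READER);
+ *	m = vdev_indirect_mapping_duplicate_adjacent_entries(vd,
+ *	    off, asize, &n);
+ *	rw_exit(&vd->vdev_indirect_rwlock);
+ *	... use m[0 .. n - 1] ...
+ *	kmem_free(m, n * sizeof (*m));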
+ */ +vdev_indirect_mapping_entry_phys_t * +vdev_indirect_mapping_duplicate_adjacent_entries(vdev_t *vd, uint64_t offset, + uint64_t asize, uint64_t *copied_entries) +{ + vdev_indirect_mapping_entry_phys_t *duplicate_mappings = NULL; + vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; + uint64_t entries = 0; + + ASSERT(RW_READ_HELD(&vd->vdev_indirect_rwlock)); + + vdev_indirect_mapping_entry_phys_t *first_mapping = + vdev_indirect_mapping_entry_for_offset(vim, offset); + ASSERT3P(first_mapping, !=, NULL); + + vdev_indirect_mapping_entry_phys_t *m = first_mapping; + while (asize > 0) { + uint64_t size = DVA_GET_ASIZE(&m->vimep_dst); + + ASSERT3U(offset, >=, DVA_MAPPING_GET_SRC_OFFSET(m)); + ASSERT3U(offset, <, DVA_MAPPING_GET_SRC_OFFSET(m) + size); + + uint64_t inner_offset = offset - DVA_MAPPING_GET_SRC_OFFSET(m); + uint64_t inner_size = MIN(asize, size - inner_offset); + + offset += inner_size; + asize -= inner_size; + entries++; + m++; + } + + size_t copy_length = entries * sizeof (*first_mapping); + duplicate_mappings = kmem_alloc(copy_length, KM_SLEEP); + bcopy(first_mapping, duplicate_mappings, copy_length); + *copied_entries = entries; + + return (duplicate_mappings); +} + +/* + * Goes through the relevant indirect mappings until it hits a concrete vdev + * and issues the callback. On the way to the concrete vdev, if any other + * indirect vdevs are encountered, then the callback will also be called on + * each of those indirect vdevs. For example, if the segment is mapped to + * segment A on indirect vdev 1, and then segment A on indirect vdev 1 is + * mapped to segment B on concrete vdev 2, then the callback will be called on + * both vdev 1 and vdev 2. + * + * While the callback passed to vdev_indirect_remap() is called on every vdev + * the function encounters, certain callbacks only care about concrete vdevs. + * These types of callbacks should return immediately and explicitly when they + * are called on an indirect vdev. + * + * Because there is a possibility that a DVA section in the indirect device + * has been split into multiple sections in our mapping, we keep track + * of the relevant contiguous segments of the new location (remap_segment_t) + * in a stack. This way we can call the callback for each of the new sections + * created by a single section of the indirect device. Note though, that in + * this scenario the callbacks in each split block won't occur in-order in + * terms of offset, so callers should not make any assumptions about that. + * + * For callbacks that don't handle split blocks and immediately return when + * they encounter them (as is the case for remap_blkptr_cb), the caller can + * assume that its callback will be applied from the first indirect vdev + * encountered to the last one and then the concrete vdev, in that order. + */ +static void +vdev_indirect_remap(vdev_t *vd, uint64_t offset, uint64_t asize, + void (*func)(uint64_t, vdev_t *, uint64_t, uint64_t, void *), void *arg) +{ + list_t stack; + spa_t *spa = vd->vdev_spa; + + list_create(&stack, sizeof (remap_segment_t), + offsetof(remap_segment_t, rs_node)); + + for (remap_segment_t *rs = rs_alloc(vd, offset, asize, 0); + rs != NULL; rs = list_remove_head(&stack)) { + vdev_t *v = rs->rs_vd; + uint64_t num_entries = 0; + + ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); + ASSERT(rs->rs_asize > 0); + + /* + * Note: As this function can be called from open context + * (e.g. zio_read()), we need the following rwlock to + * prevent the mapping from being changed by condensing. 
+ *
+ * So we grab the lock and we make a copy of the entries
+ * that are relevant to the extent that we are working on.
+ * Once that is done, we drop the lock and iterate over
+ * our copy of the mapping. Once we are done with the
+ * remap segment and we free it, we also free our copy
+ * of the indirect mapping entries that are relevant to it.
+ *
+ * This way we don't need to wait until the function is
+ * finished with a segment, to condense it. In addition, we
+ * don't need a recursive rwlock for the case that a call to
+ * vdev_indirect_remap() needs to call itself (through the
+ * codepath of its callback) for the same vdev in the middle
+ * of its execution.
+ */
+ rw_enter(&v->vdev_indirect_rwlock, RW_READER);
+ vdev_indirect_mapping_t *vim = v->vdev_indirect_mapping;
+ ASSERT3P(vim, !=, NULL);
+
+ vdev_indirect_mapping_entry_phys_t *mapping =
+ vdev_indirect_mapping_duplicate_adjacent_entries(v,
+ rs->rs_offset, rs->rs_asize, &num_entries);
+ ASSERT3P(mapping, !=, NULL);
+ ASSERT3U(num_entries, >, 0);
+ rw_exit(&v->vdev_indirect_rwlock);
+
+ for (uint64_t i = 0; i < num_entries; i++) {
+ /*
+ * Note: the vdev_indirect_mapping cannot change
+ * while we are running. It only changes while the
+ * removal is in progress, and then only from syncing
+ * context. While a removal is in progress, this
+ * function is only called for frees, which also only
+ * happen from syncing context.
+ */
+ vdev_indirect_mapping_entry_phys_t *m = &mapping[i];
+
+ ASSERT3P(m, !=, NULL);
+ ASSERT3U(rs->rs_asize, >, 0);
+
+ uint64_t size = DVA_GET_ASIZE(&m->vimep_dst);
+ uint64_t dst_offset = DVA_GET_OFFSET(&m->vimep_dst);
+ uint64_t dst_vdev = DVA_GET_VDEV(&m->vimep_dst);
+
+ ASSERT3U(rs->rs_offset, >=,
+ DVA_MAPPING_GET_SRC_OFFSET(m));
+ ASSERT3U(rs->rs_offset, <,
+ DVA_MAPPING_GET_SRC_OFFSET(m) + size);
+ ASSERT3U(dst_vdev, !=, v->vdev_id);
+
+ uint64_t inner_offset = rs->rs_offset -
+ DVA_MAPPING_GET_SRC_OFFSET(m);
+ uint64_t inner_size =
+ MIN(rs->rs_asize, size - inner_offset);
+
+ vdev_t *dst_v = vdev_lookup_top(spa, dst_vdev);
+ ASSERT3P(dst_v, !=, NULL);
+
+ if (dst_v->vdev_ops == &vdev_indirect_ops) {
+ list_insert_head(&stack,
+ rs_alloc(dst_v, dst_offset + inner_offset,
+ inner_size, rs->rs_split_offset));
+ }
+
+ if ((zfs_flags & ZFS_DEBUG_INDIRECT_REMAP) &&
+ IS_P2ALIGNED(inner_size, 2 * SPA_MINBLOCKSIZE)) {
+ /*
+ * Note: This clause exists solely for
+ * testing purposes. We use it to ensure that
+ * split blocks work and that the callbacks
+ * using them yield the same result if issued
+ * in reverse order.
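+ *
+ * (The IS_P2ALIGNED(inner_size, 2 * SPA_MINBLOCKSIZE) guard
+ * above guarantees that inner_size / 2 is still a multiple
+ * of SPA_MINBLOCKSIZE, so both artificial halves stay
+ * block-aligned; e.g. a hypothetical 4K segment becomes two
+ * aligned 2K callbacks.)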
+ */ + uint64_t inner_half = inner_size / 2; + + func(rs->rs_split_offset + inner_half, dst_v, + dst_offset + inner_offset + inner_half, + inner_half, arg); + + func(rs->rs_split_offset, dst_v, + dst_offset + inner_offset, + inner_half, arg); + } else { + func(rs->rs_split_offset, dst_v, + dst_offset + inner_offset, + inner_size, arg); + } + + rs->rs_offset += inner_size; + rs->rs_asize -= inner_size; + rs->rs_split_offset += inner_size; + } + VERIFY0(rs->rs_asize); + + kmem_free(mapping, num_entries * sizeof (*mapping)); + kmem_free(rs, sizeof (remap_segment_t)); + } + list_destroy(&stack); +} + +static void +vdev_indirect_child_io_done(zio_t *zio) +{ + zio_t *pio = zio->io_private; + + mutex_enter(&pio->io_lock); + pio->io_error = zio_worst_error(pio->io_error, zio->io_error); + mutex_exit(&pio->io_lock); + +#ifdef __FreeBSD__ + if (zio->io_abd != NULL) +#endif + abd_put(zio->io_abd); +} + +/* + * This is a callback for vdev_indirect_remap() which allocates an + * indirect_split_t for each split segment and adds it to iv_splits. + */ +static void +vdev_indirect_gather_splits(uint64_t split_offset, vdev_t *vd, uint64_t offset, + uint64_t size, void *arg) +{ + zio_t *zio = arg; + indirect_vsd_t *iv = zio->io_vsd; + + ASSERT3P(vd, !=, NULL); + + if (vd->vdev_ops == &vdev_indirect_ops) + return; + + int n = 1; + if (vd->vdev_ops == &vdev_mirror_ops) + n = vd->vdev_children; + + indirect_split_t *is = + kmem_zalloc(offsetof(indirect_split_t, is_child[n]), KM_SLEEP); + + is->is_children = n; + is->is_size = size; + is->is_split_offset = split_offset; + is->is_target_offset = offset; + is->is_vdev = vd; + + /* + * Note that we only consider multiple copies of the data for + * *mirror* vdevs. We don't for "replacing" or "spare" vdevs, even + * though they use the same ops as mirror, because there's only one + * "good" copy under the replacing/spare. + */ + if (vd->vdev_ops == &vdev_mirror_ops) { + for (int i = 0; i < n; i++) { + is->is_child[i].ic_vdev = vd->vdev_child[i]; + } + } else { + is->is_child[0].ic_vdev = vd; + } + + list_insert_tail(&iv->iv_splits, is); +} + +static void +vdev_indirect_read_split_done(zio_t *zio) +{ + indirect_child_t *ic = zio->io_private; + + if (zio->io_error != 0) { + /* + * Clear ic_data to indicate that we do not have data for this + * child. + */ + abd_free(ic->ic_data); + ic->ic_data = NULL; + } +} + +/* + * Issue reads for all copies (mirror children) of all splits. + */ +static void +vdev_indirect_read_all(zio_t *zio) +{ + indirect_vsd_t *iv = zio->io_vsd; + + for (indirect_split_t *is = list_head(&iv->iv_splits); + is != NULL; is = list_next(&iv->iv_splits, is)) { + for (int i = 0; i < is->is_children; i++) { + indirect_child_t *ic = &is->is_child[i]; + + if (!vdev_readable(ic->ic_vdev)) + continue; + + /* + * Note, we may read from a child whose DTL + * indicates that the data may not be present here. + * While this might result in a few i/os that will + * likely return incorrect data, it simplifies the + * code since we can treat scrub and resilver + * identically. (The incorrect data will be + * detected and ignored when we verify the + * checksum.) 
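+ *
+ * (Reading every copy up front is what later allows
+ * vdev_indirect_io_done() to search combinations of
+ * copies for one that checksums correctly, bounded by
+ * zfs_reconstruct_indirect_segments_max for heavily
+ * split blocks.)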
+ */ + + ic->ic_data = abd_alloc_sametype(zio->io_abd, + is->is_size); + + zio_nowait(zio_vdev_child_io(zio, NULL, + ic->ic_vdev, is->is_target_offset, ic->ic_data, + is->is_size, zio->io_type, zio->io_priority, 0, + vdev_indirect_read_split_done, ic)); + } + } + iv->iv_reconstruct = B_TRUE; +} + +static void +vdev_indirect_io_start(zio_t *zio) +{ + spa_t *spa = zio->io_spa; + indirect_vsd_t *iv = kmem_zalloc(sizeof (*iv), KM_SLEEP); + list_create(&iv->iv_splits, + sizeof (indirect_split_t), offsetof(indirect_split_t, is_node)); + + zio->io_vsd = iv; + zio->io_vsd_ops = &vdev_indirect_vsd_ops; + + ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); +#ifdef __FreeBSD__ + if (zio->io_type == ZIO_TYPE_WRITE) { +#else + if (zio->io_type != ZIO_TYPE_READ) { + ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); +#endif + /* + * Note: this code can handle other kinds of writes, + * but we don't expect them. + */ + ASSERT((zio->io_flags & (ZIO_FLAG_SELF_HEAL | + ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE)) != 0); + } + + vdev_indirect_remap(zio->io_vd, zio->io_offset, zio->io_size, + vdev_indirect_gather_splits, zio); + + indirect_split_t *first = list_head(&iv->iv_splits); + if (first->is_size == zio->io_size) { + /* + * This is not a split block; we are pointing to the entire + * data, which will checksum the same as the original data. + * Pass the BP down so that the child i/o can verify the + * checksum, and try a different location if available + * (e.g. on a mirror). + * + * While this special case could be handled the same as the + * general (split block) case, doing it this way ensures + * that the vast majority of blocks on indirect vdevs + * (which are not split) are handled identically to blocks + * on non-indirect vdevs. This allows us to be less strict + * about performance in the general (but rare) case. + */ + ASSERT0(first->is_split_offset); + ASSERT3P(list_next(&iv->iv_splits, first), ==, NULL); + zio_nowait(zio_vdev_child_io(zio, zio->io_bp, + first->is_vdev, first->is_target_offset, +#ifdef __FreeBSD__ + zio->io_abd == NULL ? NULL : +#endif + abd_get_offset(zio->io_abd, 0), + zio->io_size, zio->io_type, zio->io_priority, 0, + vdev_indirect_child_io_done, zio)); + } else { + iv->iv_split_block = B_TRUE; + if (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) { + /* + * Read all copies. Note that for simplicity, + * we don't bother consulting the DTL in the + * resilver case. + */ + vdev_indirect_read_all(zio); + } else { + /* + * Read one copy of each split segment, from the + * top-level vdev. Since we don't know the + * checksum of each split individually, the child + * zio can't ensure that we get the right data. + * E.g. if it's a mirror, it will just read from a + * random (healthy) leaf vdev. We have to verify + * the checksum in vdev_indirect_io_done(). + */ + for (indirect_split_t *is = list_head(&iv->iv_splits); + is != NULL; is = list_next(&iv->iv_splits, is)) { + zio_nowait(zio_vdev_child_io(zio, NULL, + is->is_vdev, is->is_target_offset, +#ifdef __FreeBSD__ + zio->io_abd == NULL ? NULL : +#endif + abd_get_offset(zio->io_abd, + is->is_split_offset), + is->is_size, zio->io_type, + zio->io_priority, 0, + vdev_indirect_child_io_done, zio)); + } + } + } + + zio_execute(zio); +} + +/* + * Report a checksum error for a child. 
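+ * The believed-good data (is_good_child's copy) is posted to the ereport
+ * along with the bad child's data.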
+ */
+static void
+vdev_indirect_checksum_error(zio_t *zio,
+    indirect_split_t *is, indirect_child_t *ic)
+{
+	vdev_t *vd = ic->ic_vdev;
+
+	if (zio->io_flags & ZIO_FLAG_SPECULATIVE)
+		return;
+
+	mutex_enter(&vd->vdev_stat_lock);
+	vd->vdev_stat.vs_checksum_errors++;
+	mutex_exit(&vd->vdev_stat_lock);
+
+	zio_bad_cksum_t zbc = { 0 };
+	void *bad_buf = abd_borrow_buf_copy(ic->ic_data, is->is_size);
+	abd_t *good_abd = is->is_child[is->is_good_child].ic_data;
+	void *good_buf = abd_borrow_buf_copy(good_abd, is->is_size);
+	zfs_ereport_post_checksum(zio->io_spa, vd, zio,
+	    is->is_target_offset, is->is_size, good_buf, bad_buf, &zbc);
+	abd_return_buf(ic->ic_data, bad_buf, is->is_size);
+	abd_return_buf(good_abd, good_buf, is->is_size);
+}
+
+/*
+ * Issue repair i/os for any incorrect copies. We do this by comparing
+ * each split segment's correct data (is_good_child's ic_data) with each
+ * other copy of the data. If they differ, then we overwrite the bad data
+ * with the good copy. Note that we do this without regard for the DTLs,
+ * which simplifies this code and also issues the optimal number of writes
+ * (based on which copies actually read bad data, as opposed to which we
+ * think might be wrong). For the same reason, we always use
+ * ZIO_FLAG_SELF_HEAL, to bypass the DTL check in zio_vdev_io_start().
+ */
+static void
+vdev_indirect_repair(zio_t *zio)
+{
+	indirect_vsd_t *iv = zio->io_vsd;
+
+	if (!spa_writeable(zio->io_spa))
+		return;
+
+	for (indirect_split_t *is = list_head(&iv->iv_splits);
+	    is != NULL; is = list_next(&iv->iv_splits, is)) {
+		indirect_child_t *good_child = &is->is_child[is->is_good_child];
+
+		for (int c = 0; c < is->is_children; c++) {
+			indirect_child_t *ic = &is->is_child[c];
+			if (ic == good_child)
+				continue;
+			if (ic->ic_data == NULL)
+				continue;
+			if (abd_cmp(good_child->ic_data, ic->ic_data,
+			    is->is_size) == 0)
+				continue;
+
+			zio_nowait(zio_vdev_child_io(zio, NULL,
+			    ic->ic_vdev, is->is_target_offset,
+			    good_child->ic_data, is->is_size,
+			    ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
+			    ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL,
+			    NULL, NULL));
+
+			vdev_indirect_checksum_error(zio, is, ic);
+		}
+	}
+}
+
+/*
+ * Report checksum errors on all children that we read from.
+ */
+static void
+vdev_indirect_all_checksum_errors(zio_t *zio)
+{
+	indirect_vsd_t *iv = zio->io_vsd;
+
+	if (zio->io_flags & ZIO_FLAG_SPECULATIVE)
+		return;
+
+	for (indirect_split_t *is = list_head(&iv->iv_splits);
+	    is != NULL; is = list_next(&iv->iv_splits, is)) {
+		for (int c = 0; c < is->is_children; c++) {
+			indirect_child_t *ic = &is->is_child[c];
+
+			if (ic->ic_data == NULL)
+				continue;
+
+			vdev_t *vd = ic->ic_vdev;
+
+			mutex_enter(&vd->vdev_stat_lock);
+			vd->vdev_stat.vs_checksum_errors++;
+			mutex_exit(&vd->vdev_stat_lock);
+
+			zfs_ereport_post_checksum(zio->io_spa, vd, zio,
+			    is->is_target_offset, is->is_size,
+			    NULL, NULL, NULL);
+		}
+	}
+}
+
+/*
+ * This function is called when we have read all copies of the data and need
+ * to try to find a combination of copies that gives us the right checksum.
+ *
+ * If we pointed to any mirror vdevs, this effectively does the job of the
+ * mirror. The mirror vdev code can't do its own job because we don't know
+ * the checksum of each split segment individually. We have to try every
+ * combination of copies of split segments, until we find one that checksums
+ * correctly. (Or until we have tried all combinations, or have tried
+ * 2^zfs_reconstruct_indirect_segments_max combinations. In these cases we
+ * set io_error to ECKSUM to propagate the error up to the user.)
+ *
+ * For example, if we have 3 segments in the split,
+ * and each points to a 2-way mirror, we will have the following pieces of
+ * data:
+ *
+ *       |     mirror child
+ * split |     [0]        [1]
+ * ======|=====================
+ *   A   |  data_A_0   data_A_1
+ *   B   |  data_B_0   data_B_1
+ *   C   |  data_C_0   data_C_1
+ *
+ * We will try the following (mirror children)^(number of splits) (2^3=8)
+ * combinations, which is similar to bitwise-little-endian counting in
+ * binary. In general each "digit" corresponds to a split segment, and the
+ * base of each digit is is_children, which can be different for each
+ * digit.
+ *
+ *     "low bit"        "high bit"
+ *         v                 v
+ * data_A_0 data_B_0 data_C_0
+ * data_A_1 data_B_0 data_C_0
+ * data_A_0 data_B_1 data_C_0
+ * data_A_1 data_B_1 data_C_0
+ * data_A_0 data_B_0 data_C_1
+ * data_A_1 data_B_0 data_C_1
+ * data_A_0 data_B_1 data_C_1
+ * data_A_1 data_B_1 data_C_1
+ *
+ * Note that the split segments may be on the same or different top-level
+ * vdevs. In either case, we try lots of combinations (see
+ * zfs_reconstruct_indirect_segments_max). This ensures that if a mirror has
+ * small silent errors on all of its children, we can still reconstruct the
+ * correct data, as long as those errors are at sufficiently-separated
+ * offsets (specifically, separated by the largest block size - default of
+ * 128KB, but up to 16MB).
+ */
+static void
+vdev_indirect_reconstruct_io_done(zio_t *zio)
+{
+	indirect_vsd_t *iv = zio->io_vsd;
+	uint64_t attempts = 0;
+	uint64_t attempts_max = 1ULL << zfs_reconstruct_indirect_segments_max;
+	int segments = 0;
+
+	for (indirect_split_t *is = list_head(&iv->iv_splits);
+	    is != NULL; is = list_next(&iv->iv_splits, is))
+		segments++;
+
+	for (;;) {
+		/* copy data from splits to main zio */
+		int ret;
+		for (indirect_split_t *is = list_head(&iv->iv_splits);
+		    is != NULL; is = list_next(&iv->iv_splits, is)) {
+
+			/*
+			 * If this child failed, its ic_data will be NULL.
+			 * Skip this combination.
+			 */
+			if (is->is_child[is->is_good_child].ic_data == NULL) {
+				ret = EIO;
+				goto next;
+			}
+
+			abd_copy_off(zio->io_abd,
+			    is->is_child[is->is_good_child].ic_data,
+			    is->is_split_offset, 0, is->is_size);
+		}
+
+		/* See if this checksum matches. */
+		zio_bad_cksum_t zbc;
+		ret = zio_checksum_error(zio, &zbc);
+		if (ret == 0) {
+			/* Found a matching checksum. Issue repair i/os. */
+			vdev_indirect_repair(zio);
+			zio_checksum_verified(zio);
+			return;
+		}
+
+		/*
+		 * Checksum failed; try a different combination of split
+		 * children.
+		 */
+		boolean_t more;
+next:
+		more = B_FALSE;
+		if (segments <= zfs_reconstruct_indirect_segments_max) {
+			/*
+			 * There are relatively few segments, so
+			 * deterministically check all combinations. We do
+			 * this by adding one to the first split's
+			 * good_child. If it overflows, then "carry over" to
+			 * the next split (like counting in base is_children,
+			 * but each digit can have a different base).
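+			 * For example, with two splits whose is_children
+			 * are 2 and 3, the successive good_child pairs are
+			 * (0,0), (1,0), (0,1), (1,1), (0,2), (1,2).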
+ */ + for (indirect_split_t *is = list_head(&iv->iv_splits); + is != NULL; is = list_next(&iv->iv_splits, is)) { + is->is_good_child++; + if (is->is_good_child < is->is_children) { + more = B_TRUE; + break; + } + is->is_good_child = 0; + } + } else if (++attempts < attempts_max) { + /* + * There are too many combinations to try all of them + * in a reasonable amount of time, so try a fixed + * number of random combinations, after which we'll + * consider the block unrecoverable. + */ + for (indirect_split_t *is = list_head(&iv->iv_splits); + is != NULL; is = list_next(&iv->iv_splits, is)) { + is->is_good_child = + spa_get_random(is->is_children); + } + more = B_TRUE; + } + if (!more) { + /* All combinations failed. */ + zio->io_error = ret; + vdev_indirect_all_checksum_errors(zio); + zio_checksum_verified(zio); + return; + } + } +} + +static void +vdev_indirect_io_done(zio_t *zio) +{ + indirect_vsd_t *iv = zio->io_vsd; + + if (iv->iv_reconstruct) { + /* + * We have read all copies of the data (e.g. from mirrors), + * either because this was a scrub/resilver, or because the + * one-copy read didn't checksum correctly. + */ + vdev_indirect_reconstruct_io_done(zio); + return; + } + + if (!iv->iv_split_block) { + /* + * This was not a split block, so we passed the BP down, + * and the checksum was handled by the (one) child zio. + */ + return; + } + + zio_bad_cksum_t zbc; + int ret = zio_checksum_error(zio, &zbc); + if (ret == 0) { + zio_checksum_verified(zio); + return; + } + + /* + * The checksum didn't match. Read all copies of all splits, and + * then we will try to reconstruct. The next time + * vdev_indirect_io_done() is called, iv_reconstruct will be set. + */ + vdev_indirect_read_all(zio); + + zio_vdev_io_redone(zio); +} + +vdev_ops_t vdev_indirect_ops = { + vdev_indirect_open, + vdev_indirect_close, + vdev_default_asize, + vdev_indirect_io_start, + vdev_indirect_io_done, + NULL, + NULL, + NULL, + NULL, + vdev_indirect_remap, + NULL, + VDEV_TYPE_INDIRECT, /* name of this vdev type */ + B_FALSE /* leaf vdev */ +}; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_births.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_births.c new file mode 100644 index 000000000000..fbecbe830929 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_births.c @@ -0,0 +1,212 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2015 by Delphix. All rights reserved. 
+ */ + +#include <sys/dmu_tx.h> +#include <sys/spa.h> +#include <sys/dmu.h> +#include <sys/dsl_pool.h> +#include <sys/vdev_indirect_births.h> + +static boolean_t +vdev_indirect_births_verify(vdev_indirect_births_t *vib) +{ + ASSERT(vib != NULL); + + ASSERT(vib->vib_object != 0); + ASSERT(vib->vib_objset != NULL); + ASSERT(vib->vib_phys != NULL); + ASSERT(vib->vib_dbuf != NULL); + + EQUIV(vib->vib_phys->vib_count > 0, vib->vib_entries != NULL); + + return (B_TRUE); +} + +uint64_t +vdev_indirect_births_count(vdev_indirect_births_t *vib) +{ + ASSERT(vdev_indirect_births_verify(vib)); + + return (vib->vib_phys->vib_count); +} + +uint64_t +vdev_indirect_births_object(vdev_indirect_births_t *vib) +{ + ASSERT(vdev_indirect_births_verify(vib)); + + return (vib->vib_object); +} + +static uint64_t +vdev_indirect_births_size_impl(vdev_indirect_births_t *vib) +{ + return (vib->vib_phys->vib_count * sizeof (*vib->vib_entries)); +} + +void +vdev_indirect_births_close(vdev_indirect_births_t *vib) +{ + ASSERT(vdev_indirect_births_verify(vib)); + + if (vib->vib_phys->vib_count > 0) { + uint64_t births_size = vdev_indirect_births_size_impl(vib); + + kmem_free(vib->vib_entries, births_size); + vib->vib_entries = NULL; + } + + dmu_buf_rele(vib->vib_dbuf, vib); + + vib->vib_objset = NULL; + vib->vib_object = 0; + vib->vib_dbuf = NULL; + vib->vib_phys = NULL; + + kmem_free(vib, sizeof (*vib)); +} + +uint64_t +vdev_indirect_births_alloc(objset_t *os, dmu_tx_t *tx) +{ + ASSERT(dmu_tx_is_syncing(tx)); + + return (dmu_object_alloc(os, + DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE, + DMU_OTN_UINT64_METADATA, sizeof (vdev_indirect_birth_phys_t), + tx)); +} + +vdev_indirect_births_t * +vdev_indirect_births_open(objset_t *os, uint64_t births_object) +{ + vdev_indirect_births_t *vib = kmem_zalloc(sizeof (*vib), KM_SLEEP); + + vib->vib_objset = os; + vib->vib_object = births_object; + + VERIFY0(dmu_bonus_hold(os, vib->vib_object, vib, &vib->vib_dbuf)); + vib->vib_phys = vib->vib_dbuf->db_data; + + if (vib->vib_phys->vib_count > 0) { + uint64_t births_size = vdev_indirect_births_size_impl(vib); + vib->vib_entries = kmem_alloc(births_size, KM_SLEEP); + VERIFY0(dmu_read(vib->vib_objset, vib->vib_object, 0, + births_size, vib->vib_entries, DMU_READ_PREFETCH)); + } + + ASSERT(vdev_indirect_births_verify(vib)); + + return (vib); +} + +void +vdev_indirect_births_free(objset_t *os, uint64_t object, dmu_tx_t *tx) +{ + VERIFY0(dmu_object_free(os, object, tx)); +} + +void +vdev_indirect_births_add_entry(vdev_indirect_births_t *vib, + uint64_t max_offset, uint64_t txg, dmu_tx_t *tx) +{ + vdev_indirect_birth_entry_phys_t vibe; + uint64_t old_size; + uint64_t new_size; + vdev_indirect_birth_entry_phys_t *new_entries; + + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(dsl_pool_sync_context(dmu_tx_pool(tx))); + ASSERT(vdev_indirect_births_verify(vib)); + + dmu_buf_will_dirty(vib->vib_dbuf, tx); + + vibe.vibe_offset = max_offset; + vibe.vibe_phys_birth_txg = txg; + + old_size = vdev_indirect_births_size_impl(vib); + dmu_write(vib->vib_objset, vib->vib_object, old_size, sizeof (vibe), + &vibe, tx); + vib->vib_phys->vib_count++; + new_size = vdev_indirect_births_size_impl(vib); + + new_entries = kmem_alloc(new_size, KM_SLEEP); + if (old_size > 0) { + bcopy(vib->vib_entries, new_entries, old_size); + kmem_free(vib->vib_entries, old_size); + } + new_entries[vib->vib_phys->vib_count - 1] = vibe; + vib->vib_entries = new_entries; +} + +uint64_t +vdev_indirect_births_last_entry_txg(vdev_indirect_births_t *vib) +{ + 
ASSERT(vdev_indirect_births_verify(vib)); + ASSERT(vib->vib_phys->vib_count > 0); + + vdev_indirect_birth_entry_phys_t *last = + &vib->vib_entries[vib->vib_phys->vib_count - 1]; + return (last->vibe_phys_birth_txg); +} + +/* + * Return the txg in which the given range was copied (i.e. its physical + * birth txg). The specified offset+asize must be contiguously mapped + * (i.e. not a split block). + * + * The entries are sorted by increasing phys_birth, and also by increasing + * offset. We find the specified offset by binary search. Note that we + * can not use bsearch() because looking at each entry independently is + * insufficient to find the correct entry. Each entry implicitly relies + * on the previous entry: an entry indicates that the offsets from the + * end of the previous entry to the end of this entry were written in the + * specified txg. + */ +uint64_t +vdev_indirect_births_physbirth(vdev_indirect_births_t *vib, uint64_t offset, + uint64_t asize) +{ + vdev_indirect_birth_entry_phys_t *base; + vdev_indirect_birth_entry_phys_t *last; + + ASSERT(vdev_indirect_births_verify(vib)); + ASSERT(vib->vib_phys->vib_count > 0); + + base = vib->vib_entries; + last = base + vib->vib_phys->vib_count - 1; + + ASSERT3U(offset, <, last->vibe_offset); + + while (last >= base) { + vdev_indirect_birth_entry_phys_t *p = + base + ((last - base) / 2); + if (offset >= p->vibe_offset) { + base = p + 1; + } else if (p == vib->vib_entries || + offset >= (p - 1)->vibe_offset) { + ASSERT3U(offset + asize, <=, p->vibe_offset); + return (p->vibe_phys_birth_txg); + } else { + last = p - 1; + } + } + ASSERT(!"offset not found"); + return (-1); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_mapping.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_mapping.c new file mode 100644 index 000000000000..02999aae7274 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_mapping.c @@ -0,0 +1,593 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2015, 2017 by Delphix. All rights reserved. 
+ */ + +#include <sys/dmu_tx.h> +#include <sys/dsl_pool.h> +#include <sys/spa.h> +#include <sys/vdev_impl.h> +#include <sys/vdev_indirect_mapping.h> +#include <sys/zfeature.h> +#include <sys/dmu_objset.h> + +static boolean_t +vdev_indirect_mapping_verify(vdev_indirect_mapping_t *vim) +{ + ASSERT(vim != NULL); + + ASSERT(vim->vim_object != 0); + ASSERT(vim->vim_objset != NULL); + ASSERT(vim->vim_phys != NULL); + ASSERT(vim->vim_dbuf != NULL); + + EQUIV(vim->vim_phys->vimp_num_entries > 0, + vim->vim_entries != NULL); + if (vim->vim_phys->vimp_num_entries > 0) { + vdev_indirect_mapping_entry_phys_t *last_entry = + &vim->vim_entries[vim->vim_phys->vimp_num_entries - 1]; + uint64_t offset = DVA_MAPPING_GET_SRC_OFFSET(last_entry); + uint64_t size = DVA_GET_ASIZE(&last_entry->vimep_dst); + + ASSERT3U(vim->vim_phys->vimp_max_offset, >=, offset + size); + } + if (vim->vim_havecounts) { + ASSERT(vim->vim_phys->vimp_counts_object != 0); + } + + return (B_TRUE); +} + +uint64_t +vdev_indirect_mapping_num_entries(vdev_indirect_mapping_t *vim) +{ + ASSERT(vdev_indirect_mapping_verify(vim)); + + return (vim->vim_phys->vimp_num_entries); +} + +uint64_t +vdev_indirect_mapping_max_offset(vdev_indirect_mapping_t *vim) +{ + ASSERT(vdev_indirect_mapping_verify(vim)); + + return (vim->vim_phys->vimp_max_offset); +} + +uint64_t +vdev_indirect_mapping_object(vdev_indirect_mapping_t *vim) +{ + ASSERT(vdev_indirect_mapping_verify(vim)); + + return (vim->vim_object); +} + +uint64_t +vdev_indirect_mapping_bytes_mapped(vdev_indirect_mapping_t *vim) +{ + ASSERT(vdev_indirect_mapping_verify(vim)); + + return (vim->vim_phys->vimp_bytes_mapped); +} + +/* + * The length (in bytes) of the mapping object array in memory and + * (logically) on disk. + * + * Note that unlike most of our accessor functions, + * we don't assert that the struct is consistent; therefore it can be + * called while there may be concurrent changes, if we don't care about + * the value being immediately stale (e.g. from spa_removal_get_stats()). + */ +uint64_t +vdev_indirect_mapping_size(vdev_indirect_mapping_t *vim) +{ + return (vim->vim_phys->vimp_num_entries * sizeof (*vim->vim_entries)); +} + +/* + * Compare an offset with an indirect mapping entry; there are three + * possible scenarios: + * + * 1. The offset is "less than" the mapping entry; meaning the + * offset is less than the source offset of the mapping entry. In + * this case, there is no overlap between the offset and the + * mapping entry and -1 will be returned. + * + * 2. The offset is "greater than" the mapping entry; meaning the + * offset is greater than the mapping entry's source offset plus + * the entry's size. In this case, there is no overlap between + * the offset and the mapping entry and 1 will be returned. + * + * NOTE: If the offset is actually equal to the entry's offset + * plus size, this is considered to be "greater" than the entry, + * and this case applies (i.e. 1 will be returned). Thus, the + * entry's "range" can be considered to be inclusive at its + * start, but exclusive at its end: e.g. [src, src + size). + * + * 3. The last case to consider is if the offset actually falls + * within the mapping entry's range. If this is the case, the + * offset is considered to be "equal to" the mapping entry and + * 0 will be returned. + * + * NOTE: If the offset is equal to the entry's source offset, + * this case applies and 0 will be returned. 
If the offset is
+ * equal to the entry's source plus its size, this case does
+ * *not* apply (see "NOTE" above for scenario 2), and 1 will be
+ * returned.
+ */
+static int
+dva_mapping_overlap_compare(const void *v_key, const void *v_array_elem)
+{
+	const uint64_t *key = v_key;
+	const vdev_indirect_mapping_entry_phys_t *array_elem =
+	    v_array_elem;
+	uint64_t src_offset = DVA_MAPPING_GET_SRC_OFFSET(array_elem);
+
+	if (*key < src_offset) {
+		return (-1);
+	} else if (*key < src_offset + DVA_GET_ASIZE(&array_elem->vimep_dst)) {
+		return (0);
+	} else {
+		return (1);
+	}
+}
+
+/*
+ * Returns the mapping entry for the given offset.
+ *
+ * It's possible that the given offset will not be in the mapping table
+ * (i.e. no mapping entries contain this offset), in which case, the
+ * return value depends on the "next_if_missing" parameter.
+ *
+ * If the offset is not found in the table and "next_if_missing" is
+ * B_FALSE, then NULL will always be returned. The behavior is intended
+ * to allow consumers to get the entry corresponding to the offset
+ * parameter, iff the offset overlaps with an entry in the table.
+ *
+ * If the offset is not found in the table and "next_if_missing" is
+ * B_TRUE, then the entry nearest to the given offset will be returned,
+ * such that the entry's source offset is greater than the offset
+ * passed in (i.e. the "next" mapping entry in the table is returned, if
+ * the offset is missing from the table). If there are no entries whose
+ * source offset is greater than the passed in offset, NULL is returned.
+ */
+static vdev_indirect_mapping_entry_phys_t *
+vdev_indirect_mapping_entry_for_offset_impl(vdev_indirect_mapping_t *vim,
+    uint64_t offset, boolean_t next_if_missing)
+{
+	ASSERT(vdev_indirect_mapping_verify(vim));
+	ASSERT(vim->vim_phys->vimp_num_entries > 0);
+
+	vdev_indirect_mapping_entry_phys_t *entry = NULL;
+
+	uint64_t last = vim->vim_phys->vimp_num_entries - 1;
+	uint64_t base = 0;
+
+	/*
+	 * We don't define these inside of the while loop because we use
+	 * their value in the case that offset isn't in the mapping.
+	 */
+	uint64_t mid;
+	int result;
+
+	while (last >= base) {
+		mid = base + ((last - base) >> 1);
+
+		result = dva_mapping_overlap_compare(&offset,
+		    &vim->vim_entries[mid]);
+
+		if (result == 0) {
+			entry = &vim->vim_entries[mid];
+			break;
+		} else if (result < 0) {
+			last = mid - 1;
+		} else {
+			base = mid + 1;
+		}
+	}
+
+	if (entry == NULL && next_if_missing) {
+		ASSERT3U(base, ==, last + 1);
+		ASSERT(mid == base || mid == last);
+		ASSERT3S(result, !=, 0);
+
+		/*
+		 * The offset we're looking for isn't actually contained
+		 * in the mapping table, thus we need to return the
+		 * closest mapping entry that is greater than the
+		 * offset. We reuse the result of the last comparison,
+		 * comparing the mapping entry at index "mid" and the
+		 * offset. The offset is guaranteed to lie between
+		 * indices one less than "mid", and one greater than
+		 * "mid"; we just need to determine if offset is greater
+		 * than, or less than the mapping entry contained at
+		 * index "mid".
+		 */
+
+		uint64_t index;
+		if (result < 0)
+			index = mid;
+		else
+			index = mid + 1;
+
+		ASSERT3U(index, <=, vim->vim_phys->vimp_num_entries);
+
+		if (index == vim->vim_phys->vimp_num_entries) {
+			/*
+			 * If "index" is past the end of the entries
+			 * array, then not only is the offset not in the
+			 * mapping table, but it's actually greater than
+			 * all entries in the table.
In this case, we + * can't return a mapping entry greater than the + * offset (since none exist), so we return NULL. + */ + + ASSERT3S(dva_mapping_overlap_compare(&offset, + &vim->vim_entries[index - 1]), >, 0); + + return (NULL); + } else { + /* + * Just to be safe, we verify the offset falls + * in between the mapping entries at index and + * one less than index. Since we know the offset + * doesn't overlap an entry, and we're supposed + * to return the entry just greater than the + * offset, both of the following tests must be + * true. + */ + ASSERT3S(dva_mapping_overlap_compare(&offset, + &vim->vim_entries[index]), <, 0); + IMPLY(index >= 1, dva_mapping_overlap_compare(&offset, + &vim->vim_entries[index - 1]) > 0); + + return (&vim->vim_entries[index]); + } + } else { + return (entry); + } +} + +vdev_indirect_mapping_entry_phys_t * +vdev_indirect_mapping_entry_for_offset(vdev_indirect_mapping_t *vim, + uint64_t offset) +{ + return (vdev_indirect_mapping_entry_for_offset_impl(vim, offset, + B_FALSE)); +} + +vdev_indirect_mapping_entry_phys_t * +vdev_indirect_mapping_entry_for_offset_or_next(vdev_indirect_mapping_t *vim, + uint64_t offset) +{ + return (vdev_indirect_mapping_entry_for_offset_impl(vim, offset, + B_TRUE)); +} + + +void +vdev_indirect_mapping_close(vdev_indirect_mapping_t *vim) +{ + ASSERT(vdev_indirect_mapping_verify(vim)); + + if (vim->vim_phys->vimp_num_entries > 0) { + uint64_t map_size = vdev_indirect_mapping_size(vim); + kmem_free(vim->vim_entries, map_size); + vim->vim_entries = NULL; + } + + dmu_buf_rele(vim->vim_dbuf, vim); + + vim->vim_objset = NULL; + vim->vim_object = 0; + vim->vim_dbuf = NULL; + vim->vim_phys = NULL; + + kmem_free(vim, sizeof (*vim)); +} + +uint64_t +vdev_indirect_mapping_alloc(objset_t *os, dmu_tx_t *tx) +{ + uint64_t object; + ASSERT(dmu_tx_is_syncing(tx)); + uint64_t bonus_size = VDEV_INDIRECT_MAPPING_SIZE_V0; + + if (spa_feature_is_enabled(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS)) { + bonus_size = sizeof (vdev_indirect_mapping_phys_t); + } + + object = dmu_object_alloc(os, + DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE, + DMU_OTN_UINT64_METADATA, bonus_size, + tx); + + if (spa_feature_is_enabled(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS)) { + dmu_buf_t *dbuf; + vdev_indirect_mapping_phys_t *vimp; + + VERIFY0(dmu_bonus_hold(os, object, FTAG, &dbuf)); + dmu_buf_will_dirty(dbuf, tx); + vimp = dbuf->db_data; + vimp->vimp_counts_object = dmu_object_alloc(os, + DMU_OTN_UINT32_METADATA, SPA_OLD_MAXBLOCKSIZE, + DMU_OT_NONE, 0, tx); + spa_feature_incr(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); + dmu_buf_rele(dbuf, FTAG); + } + + return (object); +} + + +vdev_indirect_mapping_t * +vdev_indirect_mapping_open(objset_t *os, uint64_t mapping_object) +{ + vdev_indirect_mapping_t *vim = kmem_zalloc(sizeof (*vim), KM_SLEEP); + dmu_object_info_t doi; + VERIFY0(dmu_object_info(os, mapping_object, &doi)); + + vim->vim_objset = os; + vim->vim_object = mapping_object; + + VERIFY0(dmu_bonus_hold(os, vim->vim_object, vim, + &vim->vim_dbuf)); + vim->vim_phys = vim->vim_dbuf->db_data; + + vim->vim_havecounts = + (doi.doi_bonus_size > VDEV_INDIRECT_MAPPING_SIZE_V0); + + if (vim->vim_phys->vimp_num_entries > 0) { + uint64_t map_size = vdev_indirect_mapping_size(vim); + vim->vim_entries = kmem_alloc(map_size, KM_SLEEP); + VERIFY0(dmu_read(os, vim->vim_object, 0, map_size, + vim->vim_entries, DMU_READ_PREFETCH)); + } + + ASSERT(vdev_indirect_mapping_verify(vim)); + + return (vim); +} + +void +vdev_indirect_mapping_free(objset_t *os, uint64_t object, dmu_tx_t *tx) +{ + 
vdev_indirect_mapping_t *vim = vdev_indirect_mapping_open(os, object); + if (vim->vim_havecounts) { + VERIFY0(dmu_object_free(os, vim->vim_phys->vimp_counts_object, + tx)); + spa_feature_decr(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); + } + vdev_indirect_mapping_close(vim); + + VERIFY0(dmu_object_free(os, object, tx)); +} + +/* + * Append the list of vdev_indirect_mapping_entry_t's to the on-disk + * mapping object. Also remove the entries from the list and free them. + * This also implicitly extends the max_offset of the mapping (to the end + * of the last entry). + */ +void +vdev_indirect_mapping_add_entries(vdev_indirect_mapping_t *vim, + list_t *list, dmu_tx_t *tx) +{ + vdev_indirect_mapping_entry_phys_t *mapbuf; + uint64_t old_size; + uint32_t *countbuf = NULL; + vdev_indirect_mapping_entry_phys_t *old_entries; + uint64_t old_count; + uint64_t entries_written = 0; + + ASSERT(vdev_indirect_mapping_verify(vim)); + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(dsl_pool_sync_context(dmu_tx_pool(tx))); + ASSERT(!list_is_empty(list)); + + old_size = vdev_indirect_mapping_size(vim); + old_entries = vim->vim_entries; + old_count = vim->vim_phys->vimp_num_entries; + + dmu_buf_will_dirty(vim->vim_dbuf, tx); + + mapbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE); + if (vim->vim_havecounts) { + countbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE); + ASSERT(spa_feature_is_active(vim->vim_objset->os_spa, + SPA_FEATURE_OBSOLETE_COUNTS)); + } + while (!list_is_empty(list)) { + uint64_t i; + /* + * Write entries from the list to the + * vdev_im_object in batches of size SPA_OLD_MAXBLOCKSIZE. + */ + for (i = 0; i < SPA_OLD_MAXBLOCKSIZE / sizeof (*mapbuf); i++) { + vdev_indirect_mapping_entry_t *entry = + list_remove_head(list); + if (entry == NULL) + break; + + uint64_t size = + DVA_GET_ASIZE(&entry->vime_mapping.vimep_dst); + uint64_t src_offset = + DVA_MAPPING_GET_SRC_OFFSET(&entry->vime_mapping); + + /* + * We shouldn't be adding an entry which is fully + * obsolete. + */ + ASSERT3U(entry->vime_obsolete_count, <, size); + IMPLY(entry->vime_obsolete_count != 0, + vim->vim_havecounts); + + mapbuf[i] = entry->vime_mapping; + if (vim->vim_havecounts) + countbuf[i] = entry->vime_obsolete_count; + + vim->vim_phys->vimp_bytes_mapped += size; + ASSERT3U(src_offset, >=, + vim->vim_phys->vimp_max_offset); + vim->vim_phys->vimp_max_offset = src_offset + size; + + entries_written++; + + kmem_free(entry, sizeof (*entry)); + } + dmu_write(vim->vim_objset, vim->vim_object, + vim->vim_phys->vimp_num_entries * sizeof (*mapbuf), + i * sizeof (*mapbuf), + mapbuf, tx); + if (vim->vim_havecounts) { + dmu_write(vim->vim_objset, + vim->vim_phys->vimp_counts_object, + vim->vim_phys->vimp_num_entries * + sizeof (*countbuf), + i * sizeof (*countbuf), countbuf, tx); + } + vim->vim_phys->vimp_num_entries += i; + } + zio_buf_free(mapbuf, SPA_OLD_MAXBLOCKSIZE); + if (vim->vim_havecounts) + zio_buf_free(countbuf, SPA_OLD_MAXBLOCKSIZE); + + /* + * Update the entry array to reflect the new entries. First, copy + * over any old entries then read back the new entries we just wrote. 
+ */ + uint64_t new_size = vdev_indirect_mapping_size(vim); + ASSERT3U(new_size, >, old_size); + ASSERT3U(new_size - old_size, ==, + entries_written * sizeof (vdev_indirect_mapping_entry_phys_t)); + vim->vim_entries = kmem_alloc(new_size, KM_SLEEP); + if (old_size > 0) { + bcopy(old_entries, vim->vim_entries, old_size); + kmem_free(old_entries, old_size); + } + VERIFY0(dmu_read(vim->vim_objset, vim->vim_object, old_size, + new_size - old_size, &vim->vim_entries[old_count], + DMU_READ_PREFETCH)); + + zfs_dbgmsg("txg %llu: wrote %llu entries to " + "indirect mapping obj %llu; max offset=0x%llx", + (u_longlong_t)dmu_tx_get_txg(tx), + (u_longlong_t)entries_written, + (u_longlong_t)vim->vim_object, + (u_longlong_t)vim->vim_phys->vimp_max_offset); +} + +/* + * Increment the relevant counts for the specified offset and length. + * The counts array must be obtained from + * vdev_indirect_mapping_load_obsolete_counts(). + */ +void +vdev_indirect_mapping_increment_obsolete_count(vdev_indirect_mapping_t *vim, + uint64_t offset, uint64_t length, uint32_t *counts) +{ + vdev_indirect_mapping_entry_phys_t *mapping; + uint64_t index; + + mapping = vdev_indirect_mapping_entry_for_offset(vim, offset); + + ASSERT(length > 0); + ASSERT3P(mapping, !=, NULL); + + index = mapping - vim->vim_entries; + + while (length > 0) { + ASSERT3U(index, <, vdev_indirect_mapping_num_entries(vim)); + + uint64_t size = DVA_GET_ASIZE(&mapping->vimep_dst); + uint64_t inner_offset = offset - + DVA_MAPPING_GET_SRC_OFFSET(mapping); + VERIFY3U(inner_offset, <, size); + uint64_t inner_size = MIN(length, size - inner_offset); + + VERIFY3U(counts[index] + inner_size, <=, size); + counts[index] += inner_size; + + offset += inner_size; + length -= inner_size; + mapping++; + index++; + } +} + +typedef struct load_obsolete_space_map_arg { + vdev_indirect_mapping_t *losma_vim; + uint32_t *losma_counts; +} load_obsolete_space_map_arg_t; + +static int +load_obsolete_sm_callback(space_map_entry_t *sme, void *arg) +{ + load_obsolete_space_map_arg_t *losma = arg; + ASSERT3S(sme->sme_type, ==, SM_ALLOC); + + vdev_indirect_mapping_increment_obsolete_count(losma->losma_vim, + sme->sme_offset, sme->sme_run, losma->losma_counts); + + return (0); +} + +/* + * Modify the counts (increment them) based on the spacemap. + */ +void +vdev_indirect_mapping_load_obsolete_spacemap(vdev_indirect_mapping_t *vim, + uint32_t *counts, space_map_t *obsolete_space_sm) +{ + load_obsolete_space_map_arg_t losma; + losma.losma_counts = counts; + losma.losma_vim = vim; + VERIFY0(space_map_iterate(obsolete_space_sm, + load_obsolete_sm_callback, &losma)); +} + +/* + * Read the obsolete counts from disk, returning them in an array. 
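+ * If the mapping has no counts object, an array of zeroes is returned
+ * instead.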
+ */
+uint32_t *
+vdev_indirect_mapping_load_obsolete_counts(vdev_indirect_mapping_t *vim)
+{
+	ASSERT(vdev_indirect_mapping_verify(vim));
+
+	uint64_t counts_size =
+	    vim->vim_phys->vimp_num_entries * sizeof (uint32_t);
+	uint32_t *counts = kmem_alloc(counts_size, KM_SLEEP);
+	if (vim->vim_havecounts) {
+		VERIFY0(dmu_read(vim->vim_objset,
+		    vim->vim_phys->vimp_counts_object,
+		    0, counts_size,
+		    counts, DMU_READ_PREFETCH));
+	} else {
+		bzero(counts, counts_size);
+	}
+	return (counts);
+}
+
+extern void
+vdev_indirect_mapping_free_obsolete_counts(vdev_indirect_mapping_t *vim,
+    uint32_t *counts)
+{
+	ASSERT(vdev_indirect_mapping_verify(vim));
+
+	kmem_free(counts, vim->vim_phys->vimp_num_entries * sizeof (uint32_t));
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_initialize.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_initialize.c
new file mode 100644
index 000000000000..a2c39c2868e5
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_initialize.c
@@ -0,0 +1,792 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2016 by Delphix. All rights reserved.
+ */
+
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/txg.h>
+#include <sys/vdev_impl.h>
+#include <sys/refcount.h>
+#include <sys/metaslab_impl.h>
+#include <sys/dsl_synctask.h>
+#include <sys/zap.h>
+#include <sys/dmu_tx.h>
+
+/*
+ * Maximum number of metaslabs per group that can be initialized
+ * simultaneously.
+ */
+int max_initialize_ms = 3;
+
+/*
+ * Value that is written to disk during initialization.
+ */
+uint64_t zfs_initialize_value = 0xdeadbeefdeadbeefULL;
+
+/* maximum number of I/Os outstanding per leaf vdev */
+int zfs_initialize_limit = 1;
+
+/* size of initializing writes; default 1MiB, see zfs_remove_max_segment */
+uint64_t zfs_initialize_chunk_size = 1024 * 1024;
+
+static boolean_t
+vdev_initialize_should_stop(vdev_t *vd)
+{
+	return (vd->vdev_initialize_exit_wanted || !vdev_writeable(vd) ||
+	    vd->vdev_detached || vd->vdev_top->vdev_removing);
+}
+
+static void
+vdev_initialize_zap_update_sync(void *arg, dmu_tx_t *tx)
+{
+	/*
+	 * We pass in the guid instead of the vdev_t since the vdev may
+	 * have been freed prior to the sync task being processed. This
+	 * happens when a vdev is detached as we call spa_config_vdev_exit(),
+	 * stop the initializing thread, schedule the sync task, and free
+	 * the vdev. Later when the scheduled sync task is invoked, it would
+	 * find that the vdev has been freed.
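+	 * Looking the vdev up by guid below, and returning early when it
+	 * no longer exists, makes this sync task safe against that race.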
+ */
+	uint64_t guid = *(uint64_t *)arg;
+	uint64_t txg = dmu_tx_get_txg(tx);
+	kmem_free(arg, sizeof (uint64_t));
+
+	vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE);
+	if (vd == NULL || vd->vdev_top->vdev_removing || !vdev_is_concrete(vd))
+		return;
+
+	uint64_t last_offset = vd->vdev_initialize_offset[txg & TXG_MASK];
+	vd->vdev_initialize_offset[txg & TXG_MASK] = 0;
+
+	VERIFY(vd->vdev_leaf_zap != 0);
+
+	objset_t *mos = vd->vdev_spa->spa_meta_objset;
+
+	if (last_offset > 0) {
+		vd->vdev_initialize_last_offset = last_offset;
+		VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
+		    VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET,
+		    sizeof (last_offset), 1, &last_offset, tx));
+	}
+	if (vd->vdev_initialize_action_time > 0) {
+		uint64_t val = (uint64_t)vd->vdev_initialize_action_time;
+		VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
+		    VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME, sizeof (val),
+		    1, &val, tx));
+	}
+
+	uint64_t initialize_state = vd->vdev_initialize_state;
+	VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
+	    VDEV_LEAF_ZAP_INITIALIZE_STATE, sizeof (initialize_state), 1,
+	    &initialize_state, tx));
+}
+
+static void
+vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state)
+{
+	ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
+	spa_t *spa = vd->vdev_spa;
+
+	if (new_state == vd->vdev_initialize_state)
+		return;
+
+	/*
+	 * Copy the vd's guid; this will be freed by the sync task.
+	 */
+	uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
+	*guid = vd->vdev_guid;
+
+	/*
+	 * If we're suspending, then preserve the original start time.
+	 */
+	if (vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED) {
+		vd->vdev_initialize_action_time = gethrestime_sec();
+	}
+	vd->vdev_initialize_state = new_state;
+
+	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
+	dsl_sync_task_nowait(spa_get_dsl(spa), vdev_initialize_zap_update_sync,
+	    guid, 2, ZFS_SPACE_CHECK_RESERVED, tx);
+
+	switch (new_state) {
+	case VDEV_INITIALIZE_ACTIVE:
+		spa_history_log_internal(spa, "initialize", tx,
+		    "vdev=%s activated", vd->vdev_path);
+		break;
+	case VDEV_INITIALIZE_SUSPENDED:
+		spa_history_log_internal(spa, "initialize", tx,
+		    "vdev=%s suspended", vd->vdev_path);
+		break;
+	case VDEV_INITIALIZE_CANCELED:
+		spa_history_log_internal(spa, "initialize", tx,
+		    "vdev=%s canceled", vd->vdev_path);
+		break;
+	case VDEV_INITIALIZE_COMPLETE:
+		spa_history_log_internal(spa, "initialize", tx,
+		    "vdev=%s complete", vd->vdev_path);
+		break;
+	default:
+		panic("invalid state %llu", (unsigned long long)new_state);
+	}
+
+	dmu_tx_commit(tx);
+}
+
+static void
+vdev_initialize_cb(zio_t *zio)
+{
+	vdev_t *vd = zio->io_vd;
+	mutex_enter(&vd->vdev_initialize_io_lock);
+	if (zio->io_error == ENXIO && !vdev_writeable(vd)) {
+		/*
+		 * The I/O failed because the vdev was unavailable; roll the
+		 * last offset back. (This works because spa_sync waits on
+		 * spa_txg_zio before it runs sync tasks.)
+		 */
+		uint64_t *off =
+		    &vd->vdev_initialize_offset[zio->io_txg & TXG_MASK];
+		*off = MIN(*off, zio->io_offset);
+	} else {
+		/*
+		 * Since initializing is best-effort, we ignore I/O errors and
+		 * rely on vdev_probe to determine if the errors are more
+		 * critical.
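+		 * The failure is still counted in vs_initialize_errors
+		 * below, so it shows up in the initialization stats.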
+ */ + if (zio->io_error != 0) + vd->vdev_stat.vs_initialize_errors++; + + vd->vdev_initialize_bytes_done += zio->io_orig_size; + } + ASSERT3U(vd->vdev_initialize_inflight, >, 0); + vd->vdev_initialize_inflight--; + cv_broadcast(&vd->vdev_initialize_io_cv); + mutex_exit(&vd->vdev_initialize_io_lock); + + spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd); +} + +/* Takes care of physical writing and limiting # of concurrent ZIOs. */ +static int +vdev_initialize_write(vdev_t *vd, uint64_t start, uint64_t size, abd_t *data) +{ + spa_t *spa = vd->vdev_spa; + + /* Limit inflight initializing I/Os */ + mutex_enter(&vd->vdev_initialize_io_lock); + while (vd->vdev_initialize_inflight >= zfs_initialize_limit) { + cv_wait(&vd->vdev_initialize_io_cv, + &vd->vdev_initialize_io_lock); + } + vd->vdev_initialize_inflight++; + mutex_exit(&vd->vdev_initialize_io_lock); + + dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); + VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + uint64_t txg = dmu_tx_get_txg(tx); + + spa_config_enter(spa, SCL_STATE_ALL, vd, RW_READER); + mutex_enter(&vd->vdev_initialize_lock); + + if (vd->vdev_initialize_offset[txg & TXG_MASK] == 0) { + uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP); + *guid = vd->vdev_guid; + + /* This is the first write of this txg. */ + dsl_sync_task_nowait(spa_get_dsl(spa), + vdev_initialize_zap_update_sync, guid, 2, + ZFS_SPACE_CHECK_RESERVED, tx); + } + + /* + * We know the vdev struct will still be around since all + * consumers of vdev_free must stop the initialization first. + */ + if (vdev_initialize_should_stop(vd)) { + mutex_enter(&vd->vdev_initialize_io_lock); + ASSERT3U(vd->vdev_initialize_inflight, >, 0); + vd->vdev_initialize_inflight--; + mutex_exit(&vd->vdev_initialize_io_lock); + spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd); + mutex_exit(&vd->vdev_initialize_lock); + dmu_tx_commit(tx); + return (SET_ERROR(EINTR)); + } + mutex_exit(&vd->vdev_initialize_lock); + + vd->vdev_initialize_offset[txg & TXG_MASK] = start + size; + zio_nowait(zio_write_phys(spa->spa_txg_zio[txg & TXG_MASK], vd, start, + size, data, ZIO_CHECKSUM_OFF, vdev_initialize_cb, NULL, + ZIO_PRIORITY_INITIALIZING, ZIO_FLAG_CANFAIL, B_FALSE)); + /* vdev_initialize_cb releases SCL_STATE_ALL */ + + dmu_tx_commit(tx); + + return (0); +} + +/* + * Translate a logical range to the physical range for the specified vdev_t. + * This function is initially called with a leaf vdev and will walk each + * parent vdev until it reaches a top-level vdev. Once the top-level is + * reached the physical range is initialized and the recursive function + * begins to unwind. As it unwinds it calls the parent's vdev specific + * translation function to do the real conversion. + */ +void +vdev_xlate(vdev_t *vd, const range_seg_t *logical_rs, range_seg_t *physical_rs) +{ + /* + * Walk up the vdev tree + */ + if (vd != vd->vdev_top) { + vdev_xlate(vd->vdev_parent, logical_rs, physical_rs); + } else { + /* + * We've reached the top-level vdev, initialize the + * physical range to the logical range and start to + * unwind. + */ + physical_rs->rs_start = logical_rs->rs_start; + physical_rs->rs_end = logical_rs->rs_end; + return; + } + + vdev_t *pvd = vd->vdev_parent; + ASSERT3P(pvd, !=, NULL); + ASSERT3P(pvd->vdev_ops->vdev_op_xlate, !=, NULL); + + /* + * As this recursive function unwinds, translate the logical + * range into its physical components by calling the + * vdev specific translate function. 
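+	 * (For example, a mirror translates a range one-to-one, while
+	 * raidz narrows it to the portion stored on this child.)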
+ */
+	range_seg_t intermediate = { 0 };
+	pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate);
+
+	physical_rs->rs_start = intermediate.rs_start;
+	physical_rs->rs_end = intermediate.rs_end;
+}
+
+/*
+ * Callback to fill each ABD chunk with zfs_initialize_value. len must be
+ * divisible by sizeof (uint64_t), and buf must be 8-byte aligned. The ABD
+ * allocation will guarantee these for us.
+ */
+/* ARGSUSED */
+static int
+vdev_initialize_block_fill(void *buf, size_t len, void *unused)
+{
+	ASSERT0(len % sizeof (uint64_t));
+	for (uint64_t i = 0; i < len; i += sizeof (uint64_t)) {
+		*(uint64_t *)((char *)(buf) + i) = zfs_initialize_value;
+	}
+	return (0);
+}
+
+static abd_t *
+vdev_initialize_block_alloc(void)
+{
+	/* Allocate ABD for filler data */
+	abd_t *data = abd_alloc_for_io(zfs_initialize_chunk_size, B_FALSE);
+
+	ASSERT0(zfs_initialize_chunk_size % sizeof (uint64_t));
+	(void) abd_iterate_func(data, 0, zfs_initialize_chunk_size,
+	    vdev_initialize_block_fill, NULL);
+
+	return (data);
+}
+
+static void
+vdev_initialize_block_free(abd_t *data)
+{
+	abd_free(data);
+}
+
+static int
+vdev_initialize_ranges(vdev_t *vd, abd_t *data)
+{
+	avl_tree_t *rt = &vd->vdev_initialize_tree->rt_root;
+
+	for (range_seg_t *rs = avl_first(rt); rs != NULL;
+	    rs = AVL_NEXT(rt, rs)) {
+		uint64_t size = rs->rs_end - rs->rs_start;
+
+		/* Split range into legally-sized physical chunks */
+		uint64_t writes_required =
+		    ((size - 1) / zfs_initialize_chunk_size) + 1;
+
+		for (uint64_t w = 0; w < writes_required; w++) {
+			int error;
+
+			error = vdev_initialize_write(vd,
+			    VDEV_LABEL_START_SIZE + rs->rs_start +
+			    (w * zfs_initialize_chunk_size),
+			    MIN(size - (w * zfs_initialize_chunk_size),
+			    zfs_initialize_chunk_size), data);
+			if (error != 0)
+				return (error);
+		}
+	}
+	return (0);
+}
+
+static void
+vdev_initialize_ms_load(metaslab_t *msp)
+{
+	ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+	metaslab_load_wait(msp);
+	if (!msp->ms_loaded)
+		VERIFY0(metaslab_load(msp));
+}
+
+static void
+vdev_initialize_mg_wait(metaslab_group_t *mg)
+{
+	ASSERT(MUTEX_HELD(&mg->mg_ms_initialize_lock));
+	while (mg->mg_initialize_updating) {
+		cv_wait(&mg->mg_ms_initialize_cv, &mg->mg_ms_initialize_lock);
+	}
+}
+
+static void
+vdev_initialize_mg_mark(metaslab_group_t *mg)
+{
+	ASSERT(MUTEX_HELD(&mg->mg_ms_initialize_lock));
+	ASSERT(mg->mg_initialize_updating);
+
+	while (mg->mg_ms_initializing >= max_initialize_ms) {
+		cv_wait(&mg->mg_ms_initialize_cv, &mg->mg_ms_initialize_lock);
+	}
+	mg->mg_ms_initializing++;
+	ASSERT3U(mg->mg_ms_initializing, <=, max_initialize_ms);
+}
+
+/*
+ * Mark the metaslab as being initialized to prevent any allocations
+ * on this metaslab. We must also track how many metaslabs are currently
+ * being initialized within a metaslab group and limit them to prevent
+ * allocation failures from occurring because all metaslabs are being
+ * initialized.
+ */
+static void
+vdev_initialize_ms_mark(metaslab_t *msp)
+{
+	ASSERT(!MUTEX_HELD(&msp->ms_lock));
+	metaslab_group_t *mg = msp->ms_group;
+
+	mutex_enter(&mg->mg_ms_initialize_lock);
+
+	/*
+	 * To keep an accurate count of how many threads are initializing
+	 * a specific metaslab group, we only allow one thread to mark
+	 * the metaslab group at a time. This ensures that the value of
+	 * ms_initializing will be accurate when we decide to mark a metaslab
+	 * group as being initialized. To do this we force all other threads
+	 * to wait till the metaslab group's mg_initialize_updating flag is
+	 * no longer set.
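+	 * vdev_initialize_mg_wait() below performs that wait.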
+ */ + vdev_initialize_mg_wait(mg); + mg->mg_initialize_updating = B_TRUE; + if (msp->ms_initializing == 0) { + vdev_initialize_mg_mark(mg); + } + mutex_enter(&msp->ms_lock); + msp->ms_initializing++; + mutex_exit(&msp->ms_lock); + + mg->mg_initialize_updating = B_FALSE; + cv_broadcast(&mg->mg_ms_initialize_cv); + mutex_exit(&mg->mg_ms_initialize_lock); +} + +static void +vdev_initialize_ms_unmark(metaslab_t *msp) +{ + ASSERT(!MUTEX_HELD(&msp->ms_lock)); + metaslab_group_t *mg = msp->ms_group; + mutex_enter(&mg->mg_ms_initialize_lock); + mutex_enter(&msp->ms_lock); + if (--msp->ms_initializing == 0) { + mg->mg_ms_initializing--; + cv_broadcast(&mg->mg_ms_initialize_cv); + } + mutex_exit(&msp->ms_lock); + mutex_exit(&mg->mg_ms_initialize_lock); +} + +static void +vdev_initialize_calculate_progress(vdev_t *vd) +{ + ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) || + spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER)); + ASSERT(vd->vdev_leaf_zap != 0); + + vd->vdev_initialize_bytes_est = 0; + vd->vdev_initialize_bytes_done = 0; + + for (uint64_t i = 0; i < vd->vdev_top->vdev_ms_count; i++) { + metaslab_t *msp = vd->vdev_top->vdev_ms[i]; + mutex_enter(&msp->ms_lock); + + uint64_t ms_free = msp->ms_size - + space_map_allocated(msp->ms_sm); + + if (vd->vdev_top->vdev_ops == &vdev_raidz_ops) + ms_free /= vd->vdev_top->vdev_children; + + /* + * Convert the metaslab range to a physical range + * on our vdev. We use this to determine if we are + * in the middle of this metaslab range. + */ + range_seg_t logical_rs, physical_rs; + logical_rs.rs_start = msp->ms_start; + logical_rs.rs_end = msp->ms_start + msp->ms_size; + vdev_xlate(vd, &logical_rs, &physical_rs); + + if (vd->vdev_initialize_last_offset <= physical_rs.rs_start) { + vd->vdev_initialize_bytes_est += ms_free; + mutex_exit(&msp->ms_lock); + continue; + } else if (vd->vdev_initialize_last_offset > + physical_rs.rs_end) { + vd->vdev_initialize_bytes_done += ms_free; + vd->vdev_initialize_bytes_est += ms_free; + mutex_exit(&msp->ms_lock); + continue; + } + + /* + * If we get here, we're in the middle of initializing this + * metaslab. Load it and walk the free tree for more accurate + * progress estimation. 
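+		 * Free segments entirely below the last offset count as
+		 * done; a segment straddling it counts partially.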
+ */
+		vdev_initialize_ms_load(msp);
+
+		for (range_seg_t *rs =
+		    avl_first(&msp->ms_allocatable->rt_root); rs != NULL;
+		    rs = AVL_NEXT(&msp->ms_allocatable->rt_root, rs)) {
+			logical_rs.rs_start = rs->rs_start;
+			logical_rs.rs_end = rs->rs_end;
+			vdev_xlate(vd, &logical_rs, &physical_rs);
+
+			uint64_t size = physical_rs.rs_end -
+			    physical_rs.rs_start;
+			vd->vdev_initialize_bytes_est += size;
+			if (vd->vdev_initialize_last_offset >
+			    physical_rs.rs_end) {
+				vd->vdev_initialize_bytes_done += size;
+			} else if (vd->vdev_initialize_last_offset >
+			    physical_rs.rs_start &&
+			    vd->vdev_initialize_last_offset <
+			    physical_rs.rs_end) {
+				vd->vdev_initialize_bytes_done +=
+				    vd->vdev_initialize_last_offset -
+				    physical_rs.rs_start;
+			}
+		}
+		mutex_exit(&msp->ms_lock);
+	}
+}
+
+static void
+vdev_initialize_load(vdev_t *vd)
+{
+	ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) ||
+	    spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER));
+	ASSERT(vd->vdev_leaf_zap != 0);
+
+	if (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE ||
+	    vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED) {
+		int err = zap_lookup(vd->vdev_spa->spa_meta_objset,
+		    vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET,
+		    sizeof (vd->vdev_initialize_last_offset), 1,
+		    &vd->vdev_initialize_last_offset);
+		ASSERT(err == 0 || err == ENOENT);
+	}
+
+	vdev_initialize_calculate_progress(vd);
+}
+
+
+/*
+ * Convert the logical range into a physical range and add it to our
+ * avl tree.
+ */
+void
+vdev_initialize_range_add(void *arg, uint64_t start, uint64_t size)
+{
+	vdev_t *vd = arg;
+	range_seg_t logical_rs, physical_rs;
+	logical_rs.rs_start = start;
+	logical_rs.rs_end = start + size;
+
+	ASSERT(vd->vdev_ops->vdev_op_leaf);
+	vdev_xlate(vd, &logical_rs, &physical_rs);
+
+	IMPLY(vd->vdev_top == vd,
+	    logical_rs.rs_start == physical_rs.rs_start);
+	IMPLY(vd->vdev_top == vd,
+	    logical_rs.rs_end == physical_rs.rs_end);
+
+	/* Only add segments that we have not visited yet */
+	if (physical_rs.rs_end <= vd->vdev_initialize_last_offset)
+		return;
+
+	/* Pick up where we left off mid-range. */
+	if (vd->vdev_initialize_last_offset > physical_rs.rs_start) {
+		zfs_dbgmsg("range write: vd %s changed (%llu, %llu) to "
+		    "(%llu, %llu)", vd->vdev_path,
+		    (u_longlong_t)physical_rs.rs_start,
+		    (u_longlong_t)physical_rs.rs_end,
+		    (u_longlong_t)vd->vdev_initialize_last_offset,
+		    (u_longlong_t)physical_rs.rs_end);
+		ASSERT3U(physical_rs.rs_end, >,
+		    vd->vdev_initialize_last_offset);
+		physical_rs.rs_start = vd->vdev_initialize_last_offset;
+	}
+	ASSERT3U(physical_rs.rs_end, >=, physical_rs.rs_start);
+
+	/*
+	 * With raidz, it's possible that the logical range does not live on
+	 * this leaf vdev. We only add the physical range to this vdev's tree
+	 * if it has a length greater than 0.
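+	 * (vdev_xlate() yields rs_start == rs_end in that case, which the
+	 * else branch below asserts.)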
+ */
+	if (physical_rs.rs_end > physical_rs.rs_start) {
+		range_tree_add(vd->vdev_initialize_tree, physical_rs.rs_start,
+		    physical_rs.rs_end - physical_rs.rs_start);
+	} else {
+		ASSERT3U(physical_rs.rs_end, ==, physical_rs.rs_start);
+	}
+}
+
+static void
+vdev_initialize_thread(void *arg)
+{
+	vdev_t *vd = arg;
+	spa_t *spa = vd->vdev_spa;
+	int error = 0;
+	uint64_t ms_count = 0;
+
+	ASSERT(vdev_is_concrete(vd));
+	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+
+	vd->vdev_initialize_last_offset = 0;
+	vdev_initialize_load(vd);
+
+	abd_t *deadbeef = vdev_initialize_block_alloc();
+
+	vd->vdev_initialize_tree = range_tree_create(NULL, NULL);
+
+	for (uint64_t i = 0; !vd->vdev_detached &&
+	    i < vd->vdev_top->vdev_ms_count; i++) {
+		metaslab_t *msp = vd->vdev_top->vdev_ms[i];
+
+		/*
+		 * If we've expanded the top-level vdev or it's our
+		 * first pass, calculate our progress.
+		 */
+		if (vd->vdev_top->vdev_ms_count != ms_count) {
+			vdev_initialize_calculate_progress(vd);
+			ms_count = vd->vdev_top->vdev_ms_count;
+		}
+
+		vdev_initialize_ms_mark(msp);
+		mutex_enter(&msp->ms_lock);
+		vdev_initialize_ms_load(msp);
+
+		range_tree_walk(msp->ms_allocatable, vdev_initialize_range_add,
+		    vd);
+		mutex_exit(&msp->ms_lock);
+
+		spa_config_exit(spa, SCL_CONFIG, FTAG);
+		error = vdev_initialize_ranges(vd, deadbeef);
+		vdev_initialize_ms_unmark(msp);
+		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+
+		range_tree_vacate(vd->vdev_initialize_tree, NULL, NULL);
+		if (error != 0)
+			break;
+	}
+
+	spa_config_exit(spa, SCL_CONFIG, FTAG);
+	mutex_enter(&vd->vdev_initialize_io_lock);
+	while (vd->vdev_initialize_inflight > 0) {
+		cv_wait(&vd->vdev_initialize_io_cv,
+		    &vd->vdev_initialize_io_lock);
+	}
+	mutex_exit(&vd->vdev_initialize_io_lock);
+
+	range_tree_destroy(vd->vdev_initialize_tree);
+	vdev_initialize_block_free(deadbeef);
+	vd->vdev_initialize_tree = NULL;
+
+	mutex_enter(&vd->vdev_initialize_lock);
+	if (!vd->vdev_initialize_exit_wanted && vdev_writeable(vd)) {
+		vdev_initialize_change_state(vd, VDEV_INITIALIZE_COMPLETE);
+	}
+	ASSERT(vd->vdev_initialize_thread != NULL ||
+	    vd->vdev_initialize_inflight == 0);
+
+	/*
+	 * Drop the vdev_initialize_lock while we sync out the
+	 * txg since it's possible that a device might be trying to
+	 * come online and must check to see if it needs to restart an
+	 * initialization. That thread will be holding the spa_config_lock
+	 * which would prevent the txg_wait_synced from completing.
+	 */
+	mutex_exit(&vd->vdev_initialize_lock);
+	txg_wait_synced(spa_get_dsl(spa), 0);
+	mutex_enter(&vd->vdev_initialize_lock);
+
+	vd->vdev_initialize_thread = NULL;
+	cv_broadcast(&vd->vdev_initialize_cv);
+	mutex_exit(&vd->vdev_initialize_lock);
+	thread_exit();
+}
+
+/*
+ * Initiates a device. Caller must hold vdev_initialize_lock.
+ * Device must be a leaf and not already be initializing.
+ */
+void
+vdev_initialize(vdev_t *vd)
+{
+	ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
+	ASSERT(vd->vdev_ops->vdev_op_leaf);
+	ASSERT(vdev_is_concrete(vd));
+	ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
+	ASSERT(!vd->vdev_detached);
+	ASSERT(!vd->vdev_initialize_exit_wanted);
+	ASSERT(!vd->vdev_top->vdev_removing);
+
+	vdev_initialize_change_state(vd, VDEV_INITIALIZE_ACTIVE);
+	vd->vdev_initialize_thread = thread_create(NULL, 0,
+	    vdev_initialize_thread, vd, 0, &p0, TS_RUN, maxclsyspri);
+}
+
+/*
+ * Stop initializing a device, with the resultant initializing state being
+ * tgt_state. Blocks until the initializing thread has exited.
+ * Caller must hold vdev_initialize_lock and must not be writing to the spa
+ * config, as the initializing thread may try to enter the config as a reader
+ * before exiting.
+ */
+void
+vdev_initialize_stop(vdev_t *vd, vdev_initializing_state_t tgt_state)
+{
+ spa_t *spa = vd->vdev_spa;
+ ASSERT(!spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_WRITER));
+
+ ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+ ASSERT(vdev_is_concrete(vd));
+
+ /*
+ * Allow cancel requests to proceed even if the initialize thread
+ * has stopped.
+ */
+ if (vd->vdev_initialize_thread == NULL &&
+ tgt_state != VDEV_INITIALIZE_CANCELED) {
+ return;
+ }
+
+ vdev_initialize_change_state(vd, tgt_state);
+ vd->vdev_initialize_exit_wanted = B_TRUE;
+ while (vd->vdev_initialize_thread != NULL)
+ cv_wait(&vd->vdev_initialize_cv, &vd->vdev_initialize_lock);
+
+ ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
+ vd->vdev_initialize_exit_wanted = B_FALSE;
+}
+
+static void
+vdev_initialize_stop_all_impl(vdev_t *vd, vdev_initializing_state_t tgt_state)
+{
+ if (vd->vdev_ops->vdev_op_leaf && vdev_is_concrete(vd)) {
+ mutex_enter(&vd->vdev_initialize_lock);
+ vdev_initialize_stop(vd, tgt_state);
+ mutex_exit(&vd->vdev_initialize_lock);
+ return;
+ }
+
+ for (uint64_t i = 0; i < vd->vdev_children; i++) {
+ vdev_initialize_stop_all_impl(vd->vdev_child[i], tgt_state);
+ }
+}
+
+/*
+ * Convenience function to stop initialization of a vdev tree and set all
+ * initialize thread pointers to NULL.
+ */
+void
+vdev_initialize_stop_all(vdev_t *vd, vdev_initializing_state_t tgt_state)
+{
+ vdev_initialize_stop_all_impl(vd, tgt_state);
+
+ if (vd->vdev_spa->spa_sync_on) {
+ /* Make sure that our state has been synced to disk */
+ txg_wait_synced(spa_get_dsl(vd->vdev_spa), 0);
+ }
+}
+
+void
+vdev_initialize_restart(vdev_t *vd)
+{
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(!spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
+
+ if (vd->vdev_leaf_zap != 0) {
+ mutex_enter(&vd->vdev_initialize_lock);
+ uint64_t initialize_state = VDEV_INITIALIZE_NONE;
+ int err = zap_lookup(vd->vdev_spa->spa_meta_objset,
+ vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_STATE,
+ sizeof (initialize_state), 1, &initialize_state);
+ ASSERT(err == 0 || err == ENOENT);
+ vd->vdev_initialize_state = initialize_state;
+
+ uint64_t timestamp = 0;
+ err = zap_lookup(vd->vdev_spa->spa_meta_objset,
+ vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME,
+ sizeof (timestamp), 1, &timestamp);
+ ASSERT(err == 0 || err == ENOENT);
+ vd->vdev_initialize_action_time = (time_t)timestamp;
+
+ if (vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED ||
+ vd->vdev_offline) {
+ /* load progress for reporting, but don't resume */
+ vdev_initialize_load(vd);
+ } else if (vd->vdev_initialize_state ==
+ VDEV_INITIALIZE_ACTIVE && vdev_writeable(vd)) {
+ vdev_initialize(vd);
+ }
+
+ mutex_exit(&vd->vdev_initialize_lock);
+ }
+
+ for (uint64_t i = 0; i < vd->vdev_children; i++) {
+ vdev_initialize_restart(vd->vdev_child[i]);
+ }
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c
new file mode 100644
index 000000000000..0e009124b2a1
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c
@@ -0,0 +1,1503 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ */
+
+/*
+ * Virtual Device Labels
+ * ---------------------
+ *
+ * The vdev label serves several distinct purposes:
+ *
+ * 1. Uniquely identify this device as part of a ZFS pool and confirm its
+ *    identity within the pool.
+ *
+ * 2. Verify that all the devices given in a configuration are present
+ *    within the pool.
+ *
+ * 3. Determine the uberblock for the pool.
+ *
+ * 4. In case of an import operation, determine the configuration of the
+ *    toplevel vdev of which it is a part.
+ *
+ * 5. If an import operation cannot find all the devices in the pool,
+ *    provide enough information to the administrator to determine which
+ *    devices are missing.
+ *
+ * It is important to note that while the kernel is responsible for writing the
+ * label, it only consumes the information in the first three cases. The
+ * latter information is only consumed in userland when determining the
+ * configuration to import a pool.
+ *
+ *
+ * Label Organization
+ * ------------------
+ *
+ * Before describing the contents of the label, it's important to understand
+ * how the labels are written and updated with respect to the uberblock.
+ *
+ * When the pool configuration is altered, either because it was newly created
+ * or a device was added, we want to update all the labels such that we can
+ * deal with fatal failure at any point. To this end, each disk has two labels
+ * which are updated before and after the uberblock is synced. Assuming we
+ * have labels and an uberblock with the following transaction groups:
+ *
+ *       L1          UB          L2
+ *    +------+    +------+    +------+
+ *    |      |    |      |    |      |
+ *    | t10  |    | t10  |    | t10  |
+ *    |      |    |      |    |      |
+ *    +------+    +------+    +------+
+ *
+ * In this stable state, the labels and the uberblock were all updated within
+ * the same transaction group (10). Each label is mirrored and checksummed, so
+ * that we can detect when we fail partway through writing the label.
+ *
+ * In order to identify which labels are valid, the labels are written in the
+ * following manner:
+ *
+ * 1. For each vdev, update 'L1' to the new label
+ * 2. Update the uberblock
+ * 3. For each vdev, update 'L2' to the new label
+ *
+ * Given arbitrary failure, we can determine the correct label to use based on
+ * the transaction group. If we fail after updating L1 but before updating the
+ * UB, we will notice that L1's transaction group is greater than the
+ * uberblock's, so L2 must be valid. If we fail after writing the uberblock
+ * but before writing L2, we will notice that L2's transaction group is less
+ * than L1's, and therefore L1 is valid.
+ *
+ * Another added complexity is that not every label is updated when the config
+ * is synced. If we add a single device, we do not want to have to re-write
+ * every label for every device in the pool.
This means that both L1 and L2 may + * be older than the pool uberblock, because the necessary information is stored + * on another vdev. + * + * + * On-disk Format + * -------------- + * + * The vdev label consists of two distinct parts, and is wrapped within the + * vdev_label_t structure. The label includes 8k of padding to permit legacy + * VTOC disk labels, but is otherwise ignored. + * + * The first half of the label is a packed nvlist which contains pool wide + * properties, per-vdev properties, and configuration information. It is + * described in more detail below. + * + * The latter half of the label consists of a redundant array of uberblocks. + * These uberblocks are updated whenever a transaction group is committed, + * or when the configuration is updated. When a pool is loaded, we scan each + * vdev for the 'best' uberblock. + * + * + * Configuration Information + * ------------------------- + * + * The nvlist describing the pool and vdev contains the following elements: + * + * version ZFS on-disk version + * name Pool name + * state Pool state + * txg Transaction group in which this label was written + * pool_guid Unique identifier for this pool + * vdev_tree An nvlist describing vdev tree. + * features_for_read + * An nvlist of the features necessary for reading the MOS. + * + * Each leaf device label also contains the following: + * + * top_guid Unique ID for top-level vdev in which this is contained + * guid Unique ID for the leaf vdev + * + * The 'vs' configuration follows the format described in 'spa_config.c'. + */ + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/spa_impl.h> +#include <sys/dmu.h> +#include <sys/zap.h> +#include <sys/vdev.h> +#include <sys/vdev_impl.h> +#include <sys/uberblock_impl.h> +#include <sys/metaslab.h> +#include <sys/metaslab_impl.h> +#include <sys/zio.h> +#include <sys/dsl_scan.h> +#include <sys/abd.h> +#include <sys/fs/zfs.h> +#include <sys/trim_map.h> + +static boolean_t vdev_trim_on_init = B_TRUE; +SYSCTL_DECL(_vfs_zfs_vdev); +SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, trim_on_init, CTLFLAG_RWTUN, + &vdev_trim_on_init, 0, "Enable/disable full vdev trim on initialisation"); + +/* + * Basic routines to read and write from a vdev label. + * Used throughout the rest of this file. + */ +uint64_t +vdev_label_offset(uint64_t psize, int l, uint64_t offset) +{ + ASSERT(offset < sizeof (vdev_label_t)); + ASSERT(P2PHASE_TYPED(psize, sizeof (vdev_label_t), uint64_t) == 0); + + return (offset + l * sizeof (vdev_label_t) + (l < VDEV_LABELS / 2 ? + 0 : psize - VDEV_LABELS * sizeof (vdev_label_t))); +} + +/* + * Returns back the vdev label associated with the passed in offset. + */ +int +vdev_label_number(uint64_t psize, uint64_t offset) +{ + int l; + + if (offset >= psize - VDEV_LABEL_END_SIZE) { + offset -= psize - VDEV_LABEL_END_SIZE; + offset += (VDEV_LABELS / 2) * sizeof (vdev_label_t); + } + l = offset / sizeof (vdev_label_t); + return (l < VDEV_LABELS ? 
l : -1); +} + +static void +vdev_label_read(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset, + uint64_t size, zio_done_func_t *done, void *private, int flags) +{ + ASSERT(spa_config_held(zio->io_spa, SCL_STATE_ALL, RW_WRITER) == + SCL_STATE_ALL); + ASSERT(flags & ZIO_FLAG_CONFIG_WRITER); + + zio_nowait(zio_read_phys(zio, vd, + vdev_label_offset(vd->vdev_psize, l, offset), + size, buf, ZIO_CHECKSUM_LABEL, done, private, + ZIO_PRIORITY_SYNC_READ, flags, B_TRUE)); +} + +static void +vdev_label_write(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset, + uint64_t size, zio_done_func_t *done, void *private, int flags) +{ + ASSERT(spa_config_held(zio->io_spa, SCL_ALL, RW_WRITER) == SCL_ALL || + (spa_config_held(zio->io_spa, SCL_CONFIG | SCL_STATE, RW_READER) == + (SCL_CONFIG | SCL_STATE) && + dsl_pool_sync_context(spa_get_dsl(zio->io_spa)))); + ASSERT(flags & ZIO_FLAG_CONFIG_WRITER); + + zio_nowait(zio_write_phys(zio, vd, + vdev_label_offset(vd->vdev_psize, l, offset), + size, buf, ZIO_CHECKSUM_LABEL, done, private, + ZIO_PRIORITY_SYNC_WRITE, flags, B_TRUE)); +} + +static void +root_vdev_actions_getprogress(vdev_t *vd, nvlist_t *nvl) +{ + spa_t *spa = vd->vdev_spa; + + if (vd != spa->spa_root_vdev) + return; + + /* provide either current or previous scan information */ + pool_scan_stat_t ps; + if (spa_scan_get_stats(spa, &ps) == 0) { + fnvlist_add_uint64_array(nvl, + ZPOOL_CONFIG_SCAN_STATS, (uint64_t *)&ps, + sizeof (pool_scan_stat_t) / sizeof (uint64_t)); + } + + pool_removal_stat_t prs; + if (spa_removal_get_stats(spa, &prs) == 0) { + fnvlist_add_uint64_array(nvl, + ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t *)&prs, + sizeof (prs) / sizeof (uint64_t)); + } + + pool_checkpoint_stat_t pcs; + if (spa_checkpoint_get_stats(spa, &pcs) == 0) { + fnvlist_add_uint64_array(nvl, + ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t *)&pcs, + sizeof (pcs) / sizeof (uint64_t)); + } +} + +/* + * Generate the nvlist representing this vdev's config. + */ +nvlist_t * +vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, + vdev_config_flag_t flags) +{ + nvlist_t *nv = NULL; + vdev_indirect_config_t *vic = &vd->vdev_indirect_config; + + nv = fnvlist_alloc(); + + fnvlist_add_string(nv, ZPOOL_CONFIG_TYPE, vd->vdev_ops->vdev_op_type); + if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE))) + fnvlist_add_uint64(nv, ZPOOL_CONFIG_ID, vd->vdev_id); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, vd->vdev_guid); + + if (vd->vdev_path != NULL) + fnvlist_add_string(nv, ZPOOL_CONFIG_PATH, vd->vdev_path); + + if (vd->vdev_devid != NULL) + fnvlist_add_string(nv, ZPOOL_CONFIG_DEVID, vd->vdev_devid); + + if (vd->vdev_physpath != NULL) + fnvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH, + vd->vdev_physpath); + + if (vd->vdev_fru != NULL) + fnvlist_add_string(nv, ZPOOL_CONFIG_FRU, vd->vdev_fru); + + if (vd->vdev_nparity != 0) { + ASSERT(strcmp(vd->vdev_ops->vdev_op_type, + VDEV_TYPE_RAIDZ) == 0); + + /* + * Make sure someone hasn't managed to sneak a fancy new vdev + * into a crufty old storage pool. + */ + ASSERT(vd->vdev_nparity == 1 || + (vd->vdev_nparity <= 2 && + spa_version(spa) >= SPA_VERSION_RAIDZ2) || + (vd->vdev_nparity <= 3 && + spa_version(spa) >= SPA_VERSION_RAIDZ3)); + + /* + * Note that we'll add the nparity tag even on storage pools + * that only support a single parity device -- older software + * will just ignore it. 
+ */ + fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vd->vdev_nparity); + } + + if (vd->vdev_wholedisk != -1ULL) + fnvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, + vd->vdev_wholedisk); + + if (vd->vdev_not_present && !(flags & VDEV_CONFIG_MISSING)) + fnvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1); + + if (vd->vdev_isspare) + fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1); + + if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)) && + vd == vd->vdev_top) { + fnvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, + vd->vdev_ms_array); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, + vd->vdev_ms_shift); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT, vd->vdev_ashift); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE, + vd->vdev_asize); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG, vd->vdev_islog); + if (vd->vdev_removing) { + fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVING, + vd->vdev_removing); + } + } + + if (vd->vdev_dtl_sm != NULL) { + fnvlist_add_uint64(nv, ZPOOL_CONFIG_DTL, + space_map_object(vd->vdev_dtl_sm)); + } + + if (vic->vic_mapping_object != 0) { + fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT, + vic->vic_mapping_object); + } + + if (vic->vic_births_object != 0) { + fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_BIRTHS, + vic->vic_births_object); + } + + if (vic->vic_prev_indirect_vdev != UINT64_MAX) { + fnvlist_add_uint64(nv, ZPOOL_CONFIG_PREV_INDIRECT_VDEV, + vic->vic_prev_indirect_vdev); + } + + if (vd->vdev_crtxg) + fnvlist_add_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, vd->vdev_crtxg); + + if (flags & VDEV_CONFIG_MOS) { + if (vd->vdev_leaf_zap != 0) { + ASSERT(vd->vdev_ops->vdev_op_leaf); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_VDEV_LEAF_ZAP, + vd->vdev_leaf_zap); + } + + if (vd->vdev_top_zap != 0) { + ASSERT(vd == vd->vdev_top); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP, + vd->vdev_top_zap); + } + } + + if (getstats) { + vdev_stat_t vs; + + vdev_get_stats(vd, &vs); + fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, + (uint64_t *)&vs, sizeof (vs) / sizeof (uint64_t)); + + root_vdev_actions_getprogress(vd, nv); + + /* + * Note: this can be called from open context + * (spa_get_stats()), so we need the rwlock to prevent + * the mapping from being changed by condensing. + */ + rw_enter(&vd->vdev_indirect_rwlock, RW_READER); + if (vd->vdev_indirect_mapping != NULL) { + ASSERT(vd->vdev_indirect_births != NULL); + vdev_indirect_mapping_t *vim = + vd->vdev_indirect_mapping; + fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_SIZE, + vdev_indirect_mapping_size(vim)); + } + rw_exit(&vd->vdev_indirect_rwlock); + if (vd->vdev_mg != NULL && + vd->vdev_mg->mg_fragmentation != ZFS_FRAG_INVALID) { + /* + * Compute approximately how much memory would be used + * for the indirect mapping if this device were to + * be removed. + * + * Note: If the frag metric is invalid, then not + * enough metaslabs have been converted to have + * histograms. + */ + uint64_t seg_count = 0; + uint64_t to_alloc = vd->vdev_stat.vs_alloc; + + /* + * There are the same number of allocated segments + * as free segments, so we will have at least one + * entry per free segment. However, small free + * segments (smaller than vdev_removal_max_span) + * will be combined with adjacent allocated segments + * as a single mapping. 
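+ *
+ * A worked example with made-up numbers: with vdev_removal_max_span at
+ * 32K, 1000 free segments in the 8K bucket (i == 12) contribute
+ * 1000 << 13 bytes to to_alloc, while 1000 free segments in the 64K
+ * bucket (i == 15) are counted directly in seg_count, since
+ * 1ULL << 16 is not less than the max span.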
+ */ + for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) { + if (1ULL << (i + 1) < vdev_removal_max_span) { + to_alloc += + vd->vdev_mg->mg_histogram[i] << + i + 1; + } else { + seg_count += + vd->vdev_mg->mg_histogram[i]; + } + } + + /* + * The maximum length of a mapping is + * zfs_remove_max_segment, so we need at least one entry + * per zfs_remove_max_segment of allocated data. + */ + seg_count += to_alloc / zfs_remove_max_segment; + + fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_SIZE, + seg_count * + sizeof (vdev_indirect_mapping_entry_phys_t)); + } + } + + if (!vd->vdev_ops->vdev_op_leaf) { + nvlist_t **child; + int c, idx; + + ASSERT(!vd->vdev_ishole); + + child = kmem_alloc(vd->vdev_children * sizeof (nvlist_t *), + KM_SLEEP); + + for (c = 0, idx = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + /* + * If we're generating an nvlist of removing + * vdevs then skip over any device which is + * not being removed. + */ + if ((flags & VDEV_CONFIG_REMOVING) && + !cvd->vdev_removing) + continue; + + child[idx++] = vdev_config_generate(spa, cvd, + getstats, flags); + } + + if (idx) { + fnvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + child, idx); + } + + for (c = 0; c < idx; c++) + nvlist_free(child[c]); + + kmem_free(child, vd->vdev_children * sizeof (nvlist_t *)); + + } else { + const char *aux = NULL; + + if (vd->vdev_offline && !vd->vdev_tmpoffline) + fnvlist_add_uint64(nv, ZPOOL_CONFIG_OFFLINE, B_TRUE); + if (vd->vdev_resilver_txg != 0) + fnvlist_add_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG, + vd->vdev_resilver_txg); + if (vd->vdev_faulted) + fnvlist_add_uint64(nv, ZPOOL_CONFIG_FAULTED, B_TRUE); + if (vd->vdev_degraded) + fnvlist_add_uint64(nv, ZPOOL_CONFIG_DEGRADED, B_TRUE); + if (vd->vdev_removed) + fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVED, B_TRUE); + if (vd->vdev_unspare) + fnvlist_add_uint64(nv, ZPOOL_CONFIG_UNSPARE, B_TRUE); + if (vd->vdev_ishole) + fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_HOLE, B_TRUE); + + switch (vd->vdev_stat.vs_aux) { + case VDEV_AUX_ERR_EXCEEDED: + aux = "err_exceeded"; + break; + + case VDEV_AUX_EXTERNAL: + aux = "external"; + break; + } + + if (aux != NULL) + fnvlist_add_string(nv, ZPOOL_CONFIG_AUX_STATE, aux); + + if (vd->vdev_splitting && vd->vdev_orig_guid != 0LL) { + fnvlist_add_uint64(nv, ZPOOL_CONFIG_ORIG_GUID, + vd->vdev_orig_guid); + } + } + + return (nv); +} + +/* + * Generate a view of the top-level vdevs. If we currently have holes + * in the namespace, then generate an array which contains a list of holey + * vdevs. Additionally, add the number of top-level children that currently + * exist. + */ +void +vdev_top_config_generate(spa_t *spa, nvlist_t *config) +{ + vdev_t *rvd = spa->spa_root_vdev; + uint64_t *array; + uint_t c, idx; + + array = kmem_alloc(rvd->vdev_children * sizeof (uint64_t), KM_SLEEP); + + for (c = 0, idx = 0; c < rvd->vdev_children; c++) { + vdev_t *tvd = rvd->vdev_child[c]; + + if (tvd->vdev_ishole) { + array[idx++] = c; + } + } + + if (idx) { + VERIFY(nvlist_add_uint64_array(config, ZPOOL_CONFIG_HOLE_ARRAY, + array, idx) == 0); + } + + VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN, + rvd->vdev_children) == 0); + + kmem_free(array, rvd->vdev_children * sizeof (uint64_t)); +} + +/* + * Returns the configuration from the label of the given vdev. For vdevs + * which don't have a txg value stored on their label (i.e. spares/cache) + * or have not been completely initialized (txg = 0) just return + * the configuration from the first valid label we find. 
Otherwise, + * find the most up-to-date label that does not exceed the specified + * 'txg' value. + */ +nvlist_t * +vdev_label_read_config(vdev_t *vd, uint64_t txg) +{ + spa_t *spa = vd->vdev_spa; + nvlist_t *config = NULL; + vdev_phys_t *vp; + abd_t *vp_abd; + zio_t *zio; + uint64_t best_txg = 0; + uint64_t label_txg = 0; + int error = 0; + int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | + ZIO_FLAG_SPECULATIVE; + + ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); + + if (!vdev_readable(vd)) + return (NULL); + + vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE); + vp = abd_to_buf(vp_abd); + +retry: + for (int l = 0; l < VDEV_LABELS; l++) { + nvlist_t *label = NULL; + + zio = zio_root(spa, NULL, NULL, flags); + + vdev_label_read(zio, vd, l, vp_abd, + offsetof(vdev_label_t, vl_vdev_phys), + sizeof (vdev_phys_t), NULL, NULL, flags); + + if (zio_wait(zio) == 0 && + nvlist_unpack(vp->vp_nvlist, sizeof (vp->vp_nvlist), + &label, 0) == 0) { + /* + * Auxiliary vdevs won't have txg values in their + * labels and newly added vdevs may not have been + * completely initialized so just return the + * configuration from the first valid label we + * encounter. + */ + error = nvlist_lookup_uint64(label, + ZPOOL_CONFIG_POOL_TXG, &label_txg); + if ((error || label_txg == 0) && !config) { + config = label; + break; + } else if (label_txg <= txg && label_txg > best_txg) { + best_txg = label_txg; + nvlist_free(config); + config = fnvlist_dup(label); + } + } + + if (label != NULL) { + nvlist_free(label); + label = NULL; + } + } + + if (config == NULL && !(flags & ZIO_FLAG_TRYHARD)) { + flags |= ZIO_FLAG_TRYHARD; + goto retry; + } + + /* + * We found a valid label but it didn't pass txg restrictions. + */ + if (config == NULL && label_txg != 0) { + vdev_dbgmsg(vd, "label discarded as txg is too large " + "(%llu > %llu)", (u_longlong_t)label_txg, + (u_longlong_t)txg); + } + + abd_free(vp_abd); + + return (config); +} + +/* + * Determine if a device is in use. The 'spare_guid' parameter will be filled + * in with the device guid if this spare is active elsewhere on the system. + */ +static boolean_t +vdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason, + uint64_t *spare_guid, uint64_t *l2cache_guid) +{ + spa_t *spa = vd->vdev_spa; + uint64_t state, pool_guid, device_guid, txg, spare_pool; + uint64_t vdtxg = 0; + nvlist_t *label; + + if (spare_guid) + *spare_guid = 0ULL; + if (l2cache_guid) + *l2cache_guid = 0ULL; + + /* + * Read the label, if any, and perform some basic sanity checks. + */ + if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) + return (B_FALSE); + + (void) nvlist_lookup_uint64(label, ZPOOL_CONFIG_CREATE_TXG, + &vdtxg); + + if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, + &state) != 0 || + nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, + &device_guid) != 0) { + nvlist_free(label); + return (B_FALSE); + } + + if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE && + (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, + &pool_guid) != 0 || + nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, + &txg) != 0)) { + nvlist_free(label); + return (B_FALSE); + } + + nvlist_free(label); + + /* + * Check to see if this device indeed belongs to the pool it claims to + * be a part of. The only way this is allowed is if the device is a hot + * spare (which we check for later on). 
+ */
+ if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
+ !spa_guid_exists(pool_guid, device_guid) &&
+ !spa_spare_exists(device_guid, NULL, NULL) &&
+ !spa_l2cache_exists(device_guid, NULL))
+ return (B_FALSE);
+
+ /*
+ * If the transaction group is zero, then this is an initialized (but
+ * unused) label. This is only an error if the create transaction
+ * on-disk is the same as the one we're using now, in which case the
+ * user has attempted to add the same vdev multiple times in the same
+ * transaction.
+ */
+ if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
+ txg == 0 && vdtxg == crtxg)
+ return (B_TRUE);
+
+ /*
+ * Check to see if this is a spare device. We do an explicit check for
+ * spa_has_spare() here because it may be on our pending list of spares
+ * to add. We also check if it is an l2cache device.
+ */
+ if (spa_spare_exists(device_guid, &spare_pool, NULL) ||
+ spa_has_spare(spa, device_guid)) {
+ if (spare_guid)
+ *spare_guid = device_guid;
+
+ switch (reason) {
+ case VDEV_LABEL_CREATE:
+ case VDEV_LABEL_L2CACHE:
+ return (B_TRUE);
+
+ case VDEV_LABEL_REPLACE:
+ return (!spa_has_spare(spa, device_guid) ||
+ spare_pool != 0ULL);
+
+ case VDEV_LABEL_SPARE:
+ return (spa_has_spare(spa, device_guid));
+ }
+ }
+
+ /*
+ * Check to see if this is an l2cache device.
+ */
+ if (spa_l2cache_exists(device_guid, NULL))
+ return (B_TRUE);
+
+ /*
+ * We can't rely on a pool's state if it's been imported
+ * read-only. Instead we look to see if the pool is marked
+ * read-only in the namespace and set the state to active.
+ */
+ if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
+ (spa = spa_by_guid(pool_guid, device_guid)) != NULL &&
+ spa_mode(spa) == FREAD)
+ state = POOL_STATE_ACTIVE;
+
+ /*
+ * If the device is marked ACTIVE, then this device is in use by another
+ * pool on the system.
+ */
+ return (state == POOL_STATE_ACTIVE);
+}
+
+/*
+ * Initialize a vdev label. We check to make sure each leaf device is not in
+ * use, and writable. We put down an initial label which we will later
+ * overwrite with a complete label. Note that it's important to do this
+ * sequentially, not in parallel, so that we catch cases of multiple use of the
+ * same leaf vdev in the vdev we're creating -- e.g. mirroring a disk with
+ * itself.
+ */
+int
+vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
+{
+ spa_t *spa = vd->vdev_spa;
+ nvlist_t *label;
+ vdev_phys_t *vp;
+ abd_t *vp_abd;
+ abd_t *pad2;
+ uberblock_t *ub;
+ abd_t *ub_abd;
+ zio_t *zio;
+ char *buf;
+ size_t buflen;
+ int error;
+ uint64_t spare_guid, l2cache_guid;
+ int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
+
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+
+ for (int c = 0; c < vd->vdev_children; c++)
+ if ((error = vdev_label_init(vd->vdev_child[c],
+ crtxg, reason)) != 0)
+ return (error);
+
+ /* Track the creation time for this vdev */
+ vd->vdev_crtxg = crtxg;
+
+ if (!vd->vdev_ops->vdev_op_leaf || !spa_writeable(spa))
+ return (0);
+
+ /*
+ * Dead vdevs cannot be initialized.
+ */
+ if (vdev_is_dead(vd))
+ return (SET_ERROR(EIO));
+
+ /*
+ * Determine if the vdev is in use.
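+ * (For example, a disk that still carries an ACTIVE label from another
+ * imported pool is rejected here, and the EBUSY below is what the
+ * caller ultimately sees.)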
+ */ + if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_SPLIT && + vdev_inuse(vd, crtxg, reason, &spare_guid, &l2cache_guid)) + return (SET_ERROR(EBUSY)); + + /* + * If this is a request to add or replace a spare or l2cache device + * that is in use elsewhere on the system, then we must update the + * guid (which was initialized to a random value) to reflect the + * actual GUID (which is shared between multiple pools). + */ + if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_L2CACHE && + spare_guid != 0ULL) { + uint64_t guid_delta = spare_guid - vd->vdev_guid; + + vd->vdev_guid += guid_delta; + + for (vdev_t *pvd = vd; pvd != NULL; pvd = pvd->vdev_parent) + pvd->vdev_guid_sum += guid_delta; + + /* + * If this is a replacement, then we want to fallthrough to the + * rest of the code. If we're adding a spare, then it's already + * labeled appropriately and we can just return. + */ + if (reason == VDEV_LABEL_SPARE) + return (0); + ASSERT(reason == VDEV_LABEL_REPLACE || + reason == VDEV_LABEL_SPLIT); + } + + if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_SPARE && + l2cache_guid != 0ULL) { + uint64_t guid_delta = l2cache_guid - vd->vdev_guid; + + vd->vdev_guid += guid_delta; + + for (vdev_t *pvd = vd; pvd != NULL; pvd = pvd->vdev_parent) + pvd->vdev_guid_sum += guid_delta; + + /* + * If this is a replacement, then we want to fallthrough to the + * rest of the code. If we're adding an l2cache, then it's + * already labeled appropriately and we can just return. + */ + if (reason == VDEV_LABEL_L2CACHE) + return (0); + ASSERT(reason == VDEV_LABEL_REPLACE); + } + + /* + * TRIM the whole thing, excluding the blank space and boot header + * as specified by ZFS On-Disk Specification (section 1.3), so that + * we start with a clean slate. + * It's just an optimization, so we don't care if it fails. + * Don't TRIM if removing so that we don't interfere with zpool + * disaster recovery. + */ + if (zfs_trim_enabled && vdev_trim_on_init && !vd->vdev_notrim && + (reason == VDEV_LABEL_CREATE || reason == VDEV_LABEL_SPARE || + reason == VDEV_LABEL_L2CACHE)) + zio_wait(zio_trim(NULL, spa, vd, VDEV_SKIP_SIZE, + vd->vdev_psize - VDEV_SKIP_SIZE)); + + /* + * Initialize its label. + */ + vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE); + abd_zero(vp_abd, sizeof (vdev_phys_t)); + vp = abd_to_buf(vp_abd); + + /* + * Generate a label describing the pool and our top-level vdev. + * We mark it as being from txg 0 to indicate that it's not + * really part of an active pool just yet. The labels will + * be written again with a meaningful txg by spa_sync(). + */ + if (reason == VDEV_LABEL_SPARE || + (reason == VDEV_LABEL_REMOVE && vd->vdev_isspare)) { + /* + * For inactive hot spares, we generate a special label that + * identifies as a mutually shared hot spare. We write the + * label if we are adding a hot spare, or if we are removing an + * active hot spare (in which case we want to revert the + * labels). + */ + VERIFY(nvlist_alloc(&label, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_VERSION, + spa_version(spa)) == 0); + VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_POOL_STATE, + POOL_STATE_SPARE) == 0); + VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID, + vd->vdev_guid) == 0); + } else if (reason == VDEV_LABEL_L2CACHE || + (reason == VDEV_LABEL_REMOVE && vd->vdev_isl2cache)) { + /* + * For level 2 ARC devices, add a special label. 
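+ * The resulting nvlist is deliberately minimal -- just (sketch):
+ *
+ *	version		<spa_version>
+ *	state		POOL_STATE_L2CACHE
+ *	guid		<vdev guid>
+ *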
+ */ + VERIFY(nvlist_alloc(&label, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_VERSION, + spa_version(spa)) == 0); + VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_POOL_STATE, + POOL_STATE_L2CACHE) == 0); + VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID, + vd->vdev_guid) == 0); + } else { + uint64_t txg = 0ULL; + + if (reason == VDEV_LABEL_SPLIT) + txg = spa->spa_uberblock.ub_txg; + label = spa_config_generate(spa, vd, txg, B_FALSE); + + /* + * Add our creation time. This allows us to detect multiple + * vdev uses as described above, and automatically expires if we + * fail. + */ + VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_CREATE_TXG, + crtxg) == 0); + } + + buf = vp->vp_nvlist; + buflen = sizeof (vp->vp_nvlist); + + error = nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP); + if (error != 0) { + nvlist_free(label); + abd_free(vp_abd); + /* EFAULT means nvlist_pack ran out of room */ + return (error == EFAULT ? ENAMETOOLONG : EINVAL); + } + + /* + * Initialize uberblock template. + */ + ub_abd = abd_alloc_linear(VDEV_UBERBLOCK_RING, B_TRUE); + abd_zero(ub_abd, VDEV_UBERBLOCK_RING); + abd_copy_from_buf(ub_abd, &spa->spa_uberblock, sizeof (uberblock_t)); + ub = abd_to_buf(ub_abd); + ub->ub_txg = 0; + + /* Initialize the 2nd padding area. */ + pad2 = abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE); + abd_zero(pad2, VDEV_PAD_SIZE); + + /* + * Write everything in parallel. + */ +retry: + zio = zio_root(spa, NULL, NULL, flags); + + for (int l = 0; l < VDEV_LABELS; l++) { + + vdev_label_write(zio, vd, l, vp_abd, + offsetof(vdev_label_t, vl_vdev_phys), + sizeof (vdev_phys_t), NULL, NULL, flags); + + /* + * Skip the 1st padding area. + * Zero out the 2nd padding area where it might have + * left over data from previous filesystem format. + */ + vdev_label_write(zio, vd, l, pad2, + offsetof(vdev_label_t, vl_pad2), + VDEV_PAD_SIZE, NULL, NULL, flags); + + vdev_label_write(zio, vd, l, ub_abd, + offsetof(vdev_label_t, vl_uberblock), + VDEV_UBERBLOCK_RING, NULL, NULL, flags); + } + + error = zio_wait(zio); + + if (error != 0 && !(flags & ZIO_FLAG_TRYHARD)) { + flags |= ZIO_FLAG_TRYHARD; + goto retry; + } + + nvlist_free(label); + abd_free(pad2); + abd_free(ub_abd); + abd_free(vp_abd); + + /* + * If this vdev hasn't been previously identified as a spare, then we + * mark it as such only if a) we are labeling it as a spare, or b) it + * exists as a spare elsewhere in the system. Do the same for + * level 2 ARC devices. 
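+ * (E.g. a disk that is still registered as a spare elsewhere on the
+ * system is added to this pool's spare tracking via spa_spare_add()
+ * below, even when this label did not mark it as a spare.)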
+ */ + if (error == 0 && !vd->vdev_isspare && + (reason == VDEV_LABEL_SPARE || + spa_spare_exists(vd->vdev_guid, NULL, NULL))) + spa_spare_add(vd); + + if (error == 0 && !vd->vdev_isl2cache && + (reason == VDEV_LABEL_L2CACHE || + spa_l2cache_exists(vd->vdev_guid, NULL))) + spa_l2cache_add(vd); + + return (error); +} + +int +vdev_label_write_pad2(vdev_t *vd, const char *buf, size_t size) +{ + spa_t *spa = vd->vdev_spa; + zio_t *zio; + abd_t *pad2; + int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL; + int error; + + if (size > VDEV_PAD_SIZE) + return (EINVAL); + + if (!vd->vdev_ops->vdev_op_leaf) + return (ENODEV); + if (vdev_is_dead(vd)) + return (ENXIO); + + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); + + pad2 = abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE); + abd_zero(pad2, VDEV_PAD_SIZE); + abd_copy_from_buf(pad2, buf, size); + +retry: + zio = zio_root(spa, NULL, NULL, flags); + vdev_label_write(zio, vd, 0, pad2, + offsetof(vdev_label_t, vl_pad2), + VDEV_PAD_SIZE, NULL, NULL, flags); + error = zio_wait(zio); + if (error != 0 && !(flags & ZIO_FLAG_TRYHARD)) { + flags |= ZIO_FLAG_TRYHARD; + goto retry; + } + + abd_free(pad2); + return (error); +} + +/* + * ========================================================================== + * uberblock load/sync + * ========================================================================== + */ + +/* + * Consider the following situation: txg is safely synced to disk. We've + * written the first uberblock for txg + 1, and then we lose power. When we + * come back up, we fail to see the uberblock for txg + 1 because, say, + * it was on a mirrored device and the replica to which we wrote txg + 1 + * is now offline. If we then make some changes and sync txg + 1, and then + * the missing replica comes back, then for a few seconds we'll have two + * conflicting uberblocks on disk with the same txg. The solution is simple: + * among uberblocks with equal txg, choose the one with the latest timestamp. + */ +static int +vdev_uberblock_compare(const uberblock_t *ub1, const uberblock_t *ub2) +{ + int cmp = AVL_CMP(ub1->ub_txg, ub2->ub_txg); + if (likely(cmp)) + return (cmp); + + return (AVL_CMP(ub1->ub_timestamp, ub2->ub_timestamp)); +} + +struct ubl_cbdata { + uberblock_t *ubl_ubbest; /* Best uberblock */ + vdev_t *ubl_vd; /* vdev associated with the above */ +}; + +static void +vdev_uberblock_load_done(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + spa_t *spa = zio->io_spa; + zio_t *rio = zio->io_private; + uberblock_t *ub = abd_to_buf(zio->io_abd); + struct ubl_cbdata *cbp = rio->io_private; + + ASSERT3U(zio->io_size, ==, VDEV_UBERBLOCK_SIZE(vd)); + + if (zio->io_error == 0 && uberblock_verify(ub) == 0) { + mutex_enter(&rio->io_lock); + if (ub->ub_txg <= spa->spa_load_max_txg && + vdev_uberblock_compare(ub, cbp->ubl_ubbest) > 0) { + /* + * Keep track of the vdev in which this uberblock + * was found. We will use this information later + * to obtain the config nvlist associated with + * this uberblock. 
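+ * (Ties are possible here: two replicas can hold different uberblocks
+ * with the same txg, in which case vdev_uberblock_compare() above
+ * falls back to the later ub_timestamp.)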
+ */ + *cbp->ubl_ubbest = *ub; + cbp->ubl_vd = vd; + } + mutex_exit(&rio->io_lock); + } + + abd_free(zio->io_abd); +} + +static void +vdev_uberblock_load_impl(zio_t *zio, vdev_t *vd, int flags, + struct ubl_cbdata *cbp) +{ + for (int c = 0; c < vd->vdev_children; c++) + vdev_uberblock_load_impl(zio, vd->vdev_child[c], flags, cbp); + + if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) { + for (int l = 0; l < VDEV_LABELS; l++) { + for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) { + vdev_label_read(zio, vd, l, + abd_alloc_linear(VDEV_UBERBLOCK_SIZE(vd), + B_TRUE), VDEV_UBERBLOCK_OFFSET(vd, n), + VDEV_UBERBLOCK_SIZE(vd), + vdev_uberblock_load_done, zio, flags); + } + } + } +} + +/* + * Reads the 'best' uberblock from disk along with its associated + * configuration. First, we read the uberblock array of each label of each + * vdev, keeping track of the uberblock with the highest txg in each array. + * Then, we read the configuration from the same vdev as the best uberblock. + */ +void +vdev_uberblock_load(vdev_t *rvd, uberblock_t *ub, nvlist_t **config) +{ + zio_t *zio; + spa_t *spa = rvd->vdev_spa; + struct ubl_cbdata cb; + int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | + ZIO_FLAG_SPECULATIVE | ZIO_FLAG_TRYHARD; + + ASSERT(ub); + ASSERT(config); + + bzero(ub, sizeof (uberblock_t)); + *config = NULL; + + cb.ubl_ubbest = ub; + cb.ubl_vd = NULL; + + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + zio = zio_root(spa, NULL, &cb, flags); + vdev_uberblock_load_impl(zio, rvd, flags, &cb); + (void) zio_wait(zio); + + /* + * It's possible that the best uberblock was discovered on a label + * that has a configuration which was written in a future txg. + * Search all labels on this vdev to find the configuration that + * matches the txg for our uberblock. + */ + if (cb.ubl_vd != NULL) { + vdev_dbgmsg(cb.ubl_vd, "best uberblock found for spa %s. " + "txg %llu", spa->spa_name, (u_longlong_t)ub->ub_txg); + + *config = vdev_label_read_config(cb.ubl_vd, ub->ub_txg); + if (*config == NULL && spa->spa_extreme_rewind) { + vdev_dbgmsg(cb.ubl_vd, "failed to read label config. " + "Trying again without txg restrictions."); + *config = vdev_label_read_config(cb.ubl_vd, UINT64_MAX); + } + if (*config == NULL) { + vdev_dbgmsg(cb.ubl_vd, "failed to read label config"); + } + } + spa_config_exit(spa, SCL_ALL, FTAG); +} + +/* + * On success, increment root zio's count of good writes. + * We only get credit for writes to known-visible vdevs; see spa_vdev_add(). + */ +static void +vdev_uberblock_sync_done(zio_t *zio) +{ + uint64_t *good_writes = zio->io_private; + + if (zio->io_error == 0 && zio->io_vd->vdev_top->vdev_ms_array != 0) + atomic_inc_64(good_writes); +} + +/* + * Write the uberblock to all labels of all leaves of the specified vdev. 
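+ * Each txg overwrites a single slot of the per-label uberblock ring,
+ * chosen as ub_txg & (VDEV_UBERBLOCK_COUNT(vd) - 1). For example
+ * (illustrative -- the count depends on the vdev's ashift), with a
+ * 128-entry ring txg 1000 lands in slot 1000 & 127 == 104.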
+ */ +static void +vdev_uberblock_sync(zio_t *zio, uint64_t *good_writes, + uberblock_t *ub, vdev_t *vd, int flags) +{ + for (uint64_t c = 0; c < vd->vdev_children; c++) { + vdev_uberblock_sync(zio, good_writes, + ub, vd->vdev_child[c], flags); + } + + if (!vd->vdev_ops->vdev_op_leaf) + return; + + if (!vdev_writeable(vd)) + return; + + int n = ub->ub_txg & (VDEV_UBERBLOCK_COUNT(vd) - 1); + + /* Copy the uberblock_t into the ABD */ + abd_t *ub_abd = abd_alloc_for_io(VDEV_UBERBLOCK_SIZE(vd), B_TRUE); + abd_zero(ub_abd, VDEV_UBERBLOCK_SIZE(vd)); + abd_copy_from_buf(ub_abd, ub, sizeof (uberblock_t)); + + for (int l = 0; l < VDEV_LABELS; l++) + vdev_label_write(zio, vd, l, ub_abd, + VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd), + vdev_uberblock_sync_done, good_writes, + flags | ZIO_FLAG_DONT_PROPAGATE); + + abd_free(ub_abd); +} + +/* Sync the uberblocks to all vdevs in svd[] */ +int +vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags) +{ + spa_t *spa = svd[0]->vdev_spa; + zio_t *zio; + uint64_t good_writes = 0; + + zio = zio_root(spa, NULL, NULL, flags); + + for (int v = 0; v < svdcount; v++) + vdev_uberblock_sync(zio, &good_writes, ub, svd[v], flags); + + (void) zio_wait(zio); + + /* + * Flush the uberblocks to disk. This ensures that the odd labels + * are no longer needed (because the new uberblocks and the even + * labels are safely on disk), so it is safe to overwrite them. + */ + zio = zio_root(spa, NULL, NULL, flags); + + for (int v = 0; v < svdcount; v++) { + if (vdev_writeable(svd[v])) { + zio_flush(zio, svd[v]); + } + } + + (void) zio_wait(zio); + + return (good_writes >= 1 ? 0 : EIO); +} + +/* + * On success, increment the count of good writes for our top-level vdev. + */ +static void +vdev_label_sync_done(zio_t *zio) +{ + uint64_t *good_writes = zio->io_private; + + if (zio->io_error == 0) + atomic_inc_64(good_writes); +} + +/* + * If there weren't enough good writes, indicate failure to the parent. + */ +static void +vdev_label_sync_top_done(zio_t *zio) +{ + uint64_t *good_writes = zio->io_private; + + if (*good_writes == 0) + zio->io_error = SET_ERROR(EIO); + + kmem_free(good_writes, sizeof (uint64_t)); +} + +/* + * We ignore errors for log and cache devices, simply free the private data. + */ +static void +vdev_label_sync_ignore_done(zio_t *zio) +{ + kmem_free(zio->io_private, sizeof (uint64_t)); +} + +/* + * Write all even or odd labels to all leaves of the specified vdev. + */ +static void +vdev_label_sync(zio_t *zio, uint64_t *good_writes, + vdev_t *vd, int l, uint64_t txg, int flags) +{ + nvlist_t *label; + vdev_phys_t *vp; + abd_t *vp_abd; + char *buf; + size_t buflen; + + for (int c = 0; c < vd->vdev_children; c++) { + vdev_label_sync(zio, good_writes, + vd->vdev_child[c], l, txg, flags); + } + + if (!vd->vdev_ops->vdev_op_leaf) + return; + + if (!vdev_writeable(vd)) + return; + + /* + * Generate a label describing the top-level config to which we belong. 
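+ * (The caller passes l == 0 for the even labels L0 and L2, or l == 1
+ * for the odd labels L1 and L3; the l += 2 loop below writes the pair.)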
+ */ + label = spa_config_generate(vd->vdev_spa, vd, txg, B_FALSE); + + vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE); + abd_zero(vp_abd, sizeof (vdev_phys_t)); + vp = abd_to_buf(vp_abd); + + buf = vp->vp_nvlist; + buflen = sizeof (vp->vp_nvlist); + + if (nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP) == 0) { + for (; l < VDEV_LABELS; l += 2) { + vdev_label_write(zio, vd, l, vp_abd, + offsetof(vdev_label_t, vl_vdev_phys), + sizeof (vdev_phys_t), + vdev_label_sync_done, good_writes, + flags | ZIO_FLAG_DONT_PROPAGATE); + } + } + + abd_free(vp_abd); + nvlist_free(label); +} + +int +vdev_label_sync_list(spa_t *spa, int l, uint64_t txg, int flags) +{ + list_t *dl = &spa->spa_config_dirty_list; + vdev_t *vd; + zio_t *zio; + int error; + + /* + * Write the new labels to disk. + */ + zio = zio_root(spa, NULL, NULL, flags); + + for (vd = list_head(dl); vd != NULL; vd = list_next(dl, vd)) { + uint64_t *good_writes = kmem_zalloc(sizeof (uint64_t), + KM_SLEEP); + + ASSERT(!vd->vdev_ishole); + + zio_t *vio = zio_null(zio, spa, NULL, + (vd->vdev_islog || vd->vdev_aux != NULL) ? + vdev_label_sync_ignore_done : vdev_label_sync_top_done, + good_writes, flags); + vdev_label_sync(vio, good_writes, vd, l, txg, flags); + zio_nowait(vio); + } + + error = zio_wait(zio); + + /* + * Flush the new labels to disk. + */ + zio = zio_root(spa, NULL, NULL, flags); + + for (vd = list_head(dl); vd != NULL; vd = list_next(dl, vd)) + zio_flush(zio, vd); + + (void) zio_wait(zio); + + return (error); +} + +/* + * Sync the uberblock and any changes to the vdev configuration. + * + * The order of operations is carefully crafted to ensure that + * if the system panics or loses power at any time, the state on disk + * is still transactionally consistent. The in-line comments below + * describe the failure semantics at each stage. + * + * Moreover, vdev_config_sync() is designed to be idempotent: if it fails + * at any time, you can just call it again, and it will resume its work. + */ +int +vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg) +{ + spa_t *spa = svd[0]->vdev_spa; + uberblock_t *ub = &spa->spa_uberblock; + int error = 0; + int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL; + + ASSERT(svdcount != 0); +retry: + /* + * Normally, we don't want to try too hard to write every label and + * uberblock. If there is a flaky disk, we don't want the rest of the + * sync process to block while we retry. But if we can't write a + * single label out, we should retry with ZIO_FLAG_TRYHARD before + * bailing out and declaring the pool faulted. + */ + if (error != 0) { + if ((flags & ZIO_FLAG_TRYHARD) != 0) + return (error); + flags |= ZIO_FLAG_TRYHARD; + } + + ASSERT(ub->ub_txg <= txg); + + /* + * If this isn't a resync due to I/O errors, + * and nothing changed in this transaction group, + * and the vdev configuration hasn't changed, + * then there's nothing to do. + */ + if (ub->ub_txg < txg && + uberblock_update(ub, spa->spa_root_vdev, txg) == B_FALSE && + list_is_empty(&spa->spa_config_dirty_list)) + return (0); + + if (txg > spa_freeze_txg(spa)) + return (0); + + ASSERT(txg <= spa->spa_final_txg); + + /* + * Flush the write cache of every disk that's been written to + * in this transaction group. This ensures that all blocks + * written in this txg will be committed to stable storage + * before any uberblock that references them. 
+ */
+ zio_t *zio = zio_root(spa, NULL, NULL, flags);
+
+ for (vdev_t *vd =
+ txg_list_head(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)); vd != NULL;
+ vd = txg_list_next(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg)))
+ zio_flush(zio, vd);
+
+ (void) zio_wait(zio);
+
+ /*
+ * Sync out the even labels (L0, L2) for every dirty vdev. If the
+ * system dies in the middle of this process, that's OK: all of the
+ * even labels that made it to disk will be newer than any uberblock,
+ * and will therefore be considered invalid. The odd labels (L1, L3),
+ * which have not yet been touched, will still be valid. We flush
+ * the new labels to disk to ensure that all even-label updates
+ * are committed to stable storage before the uberblock update.
+ */
+ if ((error = vdev_label_sync_list(spa, 0, txg, flags)) != 0) {
+ if ((flags & ZIO_FLAG_TRYHARD) != 0) {
+ zfs_dbgmsg("vdev_label_sync_list() returned error %d "
+ "for pool '%s' when syncing out the even labels "
+ "of dirty vdevs", error, spa_name(spa));
+ }
+ goto retry;
+ }
+
+ /*
+ * Sync the uberblocks to all vdevs in svd[].
+ * If the system dies in the middle of this step, there are two cases
+ * to consider, and the on-disk state is consistent either way:
+ *
+ * (1) If none of the new uberblocks made it to disk, then the
+ * previous uberblock will be the newest, and the odd labels
+ * (which had not yet been touched) will be valid with respect
+ * to that uberblock.
+ *
+ * (2) If one or more new uberblocks made it to disk, then they
+ * will be the newest, and the even labels (which had all
+ * been successfully committed) will be valid with respect
+ * to the new uberblocks.
+ */
+ if ((error = vdev_uberblock_sync_list(svd, svdcount, ub, flags)) != 0) {
+ if ((flags & ZIO_FLAG_TRYHARD) != 0) {
+ zfs_dbgmsg("vdev_uberblock_sync_list() returned error "
+ "%d for pool '%s'", error, spa_name(spa));
+ }
+ goto retry;
+ }
+
+ /*
+ * Sync out odd labels for every dirty vdev. If the system dies
+ * in the middle of this process, the even labels and the new
+ * uberblocks will suffice to open the pool. The next time
+ * the pool is opened, the first thing we'll do -- before any
+ * user data is modified -- is mark every vdev dirty so that
+ * all labels will be brought up to date. We flush the new labels
+ * to disk to ensure that all odd-label updates are committed to
+ * stable storage before the next transaction group begins.
+ */
+ if ((error = vdev_label_sync_list(spa, 1, txg, flags)) != 0) {
+ if ((flags & ZIO_FLAG_TRYHARD) != 0) {
+ zfs_dbgmsg("vdev_label_sync_list() returned error %d "
+ "for pool '%s' when syncing out the odd labels of "
+ "dirty vdevs", error, spa_name(spa));
+ }
+ goto retry;
+ }
+
+ trim_thread_wakeup(spa);
+
+ return (0);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c
new file mode 100644
index 000000000000..4691da726ec2
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c
@@ -0,0 +1,778 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + */ + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/spa_impl.h> +#include <sys/dsl_pool.h> +#include <sys/dsl_scan.h> +#include <sys/vdev_impl.h> +#include <sys/zio.h> +#include <sys/abd.h> +#include <sys/fs/zfs.h> + +/* + * Virtual device vector for mirroring. + */ + +typedef struct mirror_child { + vdev_t *mc_vd; + uint64_t mc_offset; + int mc_error; + int mc_load; + uint8_t mc_tried; + uint8_t mc_skipped; + uint8_t mc_speculative; +} mirror_child_t; + +typedef struct mirror_map { + int *mm_preferred; + int mm_preferred_cnt; + int mm_children; + boolean_t mm_resilvering; + boolean_t mm_root; + mirror_child_t mm_child[]; +} mirror_map_t; + +static int vdev_mirror_shift = 21; + +#ifdef _KERNEL +SYSCTL_DECL(_vfs_zfs_vdev); +static SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, mirror, CTLFLAG_RD, 0, + "ZFS VDEV Mirror"); +#endif + +/* + * The load configuration settings below are tuned by default for + * the case where all devices are of the same rotational type. + * + * If there is a mixture of rotating and non-rotating media, setting + * non_rotating_seek_inc to 0 may well provide better results as it + * will direct more reads to the non-rotating vdevs which are more + * likely to have a higher performance. + */ + +/* Rotating media load calculation configuration. */ +static int rotating_inc = 0; +#ifdef _KERNEL +SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, rotating_inc, CTLFLAG_RWTUN, + &rotating_inc, 0, "Rotating media load increment for non-seeking I/O's"); +#endif + +static int rotating_seek_inc = 5; +#ifdef _KERNEL +SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, rotating_seek_inc, CTLFLAG_RWTUN, + &rotating_seek_inc, 0, "Rotating media load increment for seeking I/O's"); +#endif + +static int rotating_seek_offset = 1 * 1024 * 1024; +#ifdef _KERNEL +SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, rotating_seek_offset, CTLFLAG_RWTUN, + &rotating_seek_offset, 0, "Offset in bytes from the last I/O which " + "triggers a reduced rotating media seek increment"); +#endif + +/* Non-rotating media load calculation configuration. 
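+ *
+ * For example, with the illustrative defaults above, a non-rotating
+ * child whose last queued I/O ends exactly where a new read begins
+ * adds only its queue length to the load (non_rotating_inc == 0),
+ * while any other read on it adds queue length + 1.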
+ */
+static int non_rotating_inc = 0;
+#ifdef _KERNEL
+SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, non_rotating_inc, CTLFLAG_RWTUN,
+ &non_rotating_inc, 0,
+ "Non-rotating media load increment for non-seeking I/O's");
+#endif
+
+static int non_rotating_seek_inc = 1;
+#ifdef _KERNEL
+SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, non_rotating_seek_inc, CTLFLAG_RWTUN,
+ &non_rotating_seek_inc, 0,
+ "Non-rotating media load increment for seeking I/O's");
+#endif
+
+
+static inline size_t
+vdev_mirror_map_size(int children)
+{
+ return (offsetof(mirror_map_t, mm_child[children]) +
+ sizeof(int) * children);
+}
+
+static inline mirror_map_t *
+vdev_mirror_map_alloc(int children, boolean_t resilvering, boolean_t root)
+{
+ mirror_map_t *mm;
+
+ mm = kmem_zalloc(vdev_mirror_map_size(children), KM_SLEEP);
+ mm->mm_children = children;
+ mm->mm_resilvering = resilvering;
+ mm->mm_root = root;
+ mm->mm_preferred = (int *)((uintptr_t)mm +
+ offsetof(mirror_map_t, mm_child[children]));
+
+ return mm;
+}
+
+static void
+vdev_mirror_map_free(zio_t *zio)
+{
+ mirror_map_t *mm = zio->io_vsd;
+
+ kmem_free(mm, vdev_mirror_map_size(mm->mm_children));
+}
+
+static const zio_vsd_ops_t vdev_mirror_vsd_ops = {
+ vdev_mirror_map_free,
+ zio_vsd_default_cksum_report
+};
+
+static int
+vdev_mirror_load(mirror_map_t *mm, vdev_t *vd, uint64_t zio_offset)
+{
+ uint64_t lastoffset;
+ int load;
+
+ /* All DVAs have equal weight at the root. */
+ if (mm->mm_root)
+ return (INT_MAX);
+
+ /*
+ * We don't return INT_MAX if the device is resilvering (i.e.
+ * vdev_resilver_txg != 0): when tested, overall performance was
+ * slightly worse with that exclusion during resilver than
+ * without it.
+ */
+
+ /* Standard load based on pending queue length. */
+ load = vdev_queue_length(vd);
+ lastoffset = vdev_queue_lastoffset(vd);
+
+ if (vd->vdev_nonrot) {
+ /* Non-rotating media. */
+ if (lastoffset == zio_offset)
+ return (load + non_rotating_inc);
+
+ /*
+ * Apply a seek penalty even for non-rotating devices as
+ * sequential I/O's can be aggregated into fewer operations
+ * on the device, thus avoiding unnecessary per-command
+ * overhead and boosting performance.
+ */
+ return (load + non_rotating_seek_inc);
+ }
+
+ /* Rotating media I/O's which directly follow the last I/O. */
+ if (lastoffset == zio_offset)
+ return (load + rotating_inc);
+
+ /*
+ * Apply half the seek increment to I/O's within seek offset
+ * of the last I/O queued to this vdev as they should incur less
+ * of a seek increment.
+ */
+ if (ABS(lastoffset - zio_offset) < rotating_seek_offset)
+ return (load + (rotating_seek_inc / 2));
+
+ /* Apply the full seek increment to all other I/O's. */
+ return (load + rotating_seek_inc);
+}
+
+
+static mirror_map_t *
+vdev_mirror_map_init(zio_t *zio)
+{
+ mirror_map_t *mm = NULL;
+ mirror_child_t *mc;
+ vdev_t *vd = zio->io_vd;
+ int c;
+
+ if (vd == NULL) {
+ dva_t *dva = zio->io_bp->blk_dva;
+ spa_t *spa = zio->io_spa;
+ dva_t dva_copy[SPA_DVAS_PER_BP];
+
+ c = BP_GET_NDVAS(zio->io_bp);
+
+ /*
+ * If we do not trust the pool config, some DVAs might be
+ * invalid or point to vdevs that do not exist. We skip them.
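+ * (E.g. a block with three DVAs whose second DVA references a vdev id
+ * that is not in the trusted config is read using only the first and
+ * third copies; if no DVA survives the filter, the zio fails with
+ * ENXIO below.)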
+ */ + if (!spa_trust_config(spa)) { + ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); + int j = 0; + for (int i = 0; i < c; i++) { + if (zfs_dva_valid(spa, &dva[i], zio->io_bp)) + dva_copy[j++] = dva[i]; + } + if (j == 0) { + zio->io_vsd = NULL; + zio->io_error = ENXIO; + return (NULL); + } + if (j < c) { + dva = dva_copy; + c = j; + } + } + + mm = vdev_mirror_map_alloc(c, B_FALSE, B_TRUE); + + for (c = 0; c < mm->mm_children; c++) { + mc = &mm->mm_child[c]; + mc->mc_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[c])); + mc->mc_offset = DVA_GET_OFFSET(&dva[c]); + } + } else { + /* + * If we are resilvering, then we should handle scrub reads + * differently; we shouldn't issue them to the resilvering + * device because it might not have those blocks. + * + * We are resilvering iff: + * 1) We are a replacing vdev (ie our name is "replacing-1" or + * "spare-1" or something like that), and + * 2) The pool is currently being resilvered. + * + * We cannot simply check vd->vdev_resilver_txg, because it's + * not set in this path. + * + * Nor can we just check our vdev_ops; there are cases (such as + * when a user types "zpool replace pool odev spare_dev" and + * spare_dev is in the spare list, or when a spare device is + * automatically used to replace a DEGRADED device) when + * resilvering is complete but both the original vdev and the + * spare vdev remain in the pool. That behavior is intentional. + * It helps implement the policy that a spare should be + * automatically removed from the pool after the user replaces + * the device that originally failed. + * + * If a spa load is in progress, then spa_dsl_pool may be + * uninitialized. But we shouldn't be resilvering during a spa + * load anyway. + */ + boolean_t replacing = (vd->vdev_ops == &vdev_replacing_ops || + vd->vdev_ops == &vdev_spare_ops) && + spa_load_state(vd->vdev_spa) == SPA_LOAD_NONE && + dsl_scan_resilvering(vd->vdev_spa->spa_dsl_pool); + mm = vdev_mirror_map_alloc(vd->vdev_children, replacing, + B_FALSE); + for (c = 0; c < mm->mm_children; c++) { + mc = &mm->mm_child[c]; + mc->mc_vd = vd->vdev_child[c]; + mc->mc_offset = zio->io_offset; + } + } + + zio->io_vsd = mm; + zio->io_vsd_ops = &vdev_mirror_vsd_ops; + return (mm); +} + +static int +vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, + uint64_t *logical_ashift, uint64_t *physical_ashift) +{ + int numerrors = 0; + int lasterror = 0; + + if (vd->vdev_children == 0) { + vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; + return (SET_ERROR(EINVAL)); + } + + vdev_open_children(vd); + + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + if (cvd->vdev_open_error) { + lasterror = cvd->vdev_open_error; + numerrors++; + continue; + } + + *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; + *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; + *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift); + *physical_ashift = MAX(*physical_ashift, + cvd->vdev_physical_ashift); + } + + if (numerrors == vd->vdev_children) { + if (vdev_children_are_offline(vd)) + vd->vdev_stat.vs_aux = VDEV_AUX_CHILDREN_OFFLINE; + else + vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; + return (lasterror); + } + + return (0); +} + +static void +vdev_mirror_close(vdev_t *vd) +{ + for (int c = 0; c < vd->vdev_children; c++) + vdev_close(vd->vdev_child[c]); +} + +static void +vdev_mirror_child_done(zio_t *zio) +{ + mirror_child_t *mc = zio->io_private; + + mc->mc_error = zio->io_error; + mc->mc_tried = 1; + mc->mc_skipped = 0; +} + +static void 
+vdev_mirror_scrub_done(zio_t *zio)
+{
+	mirror_child_t *mc = zio->io_private;
+
+	if (zio->io_error == 0) {
+		zio_t *pio;
+		zio_link_t *zl = NULL;
+
+		mutex_enter(&zio->io_lock);
+		while ((pio = zio_walk_parents(zio, &zl)) != NULL) {
+			mutex_enter(&pio->io_lock);
+			ASSERT3U(zio->io_size, >=, pio->io_size);
+			abd_copy(pio->io_abd, zio->io_abd, pio->io_size);
+			mutex_exit(&pio->io_lock);
+		}
+		mutex_exit(&zio->io_lock);
+	}
+	abd_free(zio->io_abd);
+
+	mc->mc_error = zio->io_error;
+	mc->mc_tried = 1;
+	mc->mc_skipped = 0;
+}
+
+/*
+ * Check the other, lower-index DVAs to see if they're on the same
+ * vdev as the child we picked.  If they are, use them since they
+ * are likely to have been allocated from the primary metaslab in
+ * use at the time, and hence are more likely to have locality with
+ * single-copy data.
+ */
+static int
+vdev_mirror_dva_select(zio_t *zio, int p)
+{
+	dva_t *dva = zio->io_bp->blk_dva;
+	mirror_map_t *mm = zio->io_vsd;
+	int preferred;
+	int c;
+
+	preferred = mm->mm_preferred[p];
+	for (p-- ; p >= 0; p--) {
+		c = mm->mm_preferred[p];
+		if (DVA_GET_VDEV(&dva[c]) == DVA_GET_VDEV(&dva[preferred]))
+			preferred = c;
+	}
+	return (preferred);
+}
+
+static int
+vdev_mirror_preferred_child_randomize(zio_t *zio)
+{
+	mirror_map_t *mm = zio->io_vsd;
+	int p;
+
+	if (mm->mm_root) {
+		p = spa_get_random(mm->mm_preferred_cnt);
+		return (vdev_mirror_dva_select(zio, p));
+	}
+
+	/*
+	 * To ensure we don't always favour the first matching vdev,
+	 * which could lead to wear leveling issues on SSD's, we
+	 * use the I/O offset as a pseudo-random seed into the vdevs
+	 * which have the lowest load.
+	 */
+	p = (zio->io_offset >> vdev_mirror_shift) % mm->mm_preferred_cnt;
+	return (mm->mm_preferred[p]);
+}
+
+/*
+ * Try to find a vdev whose DTL doesn't contain the block we want to read,
+ * preferring vdevs based on determined load.
+ *
+ * If we can't, try the read on any vdev we haven't already tried.
+ */
+static int
+vdev_mirror_child_select(zio_t *zio)
+{
+	mirror_map_t *mm = zio->io_vsd;
+	uint64_t txg = zio->io_txg;
+	int c, lowest_load;
+
+	ASSERT(zio->io_bp == NULL || BP_PHYSICAL_BIRTH(zio->io_bp) == txg);
+
+	lowest_load = INT_MAX;
+	mm->mm_preferred_cnt = 0;
+	for (c = 0; c < mm->mm_children; c++) {
+		mirror_child_t *mc;
+
+		mc = &mm->mm_child[c];
+		if (mc->mc_tried || mc->mc_skipped)
+			continue;
+
+		if (!vdev_readable(mc->mc_vd)) {
+			mc->mc_error = SET_ERROR(ENXIO);
+			mc->mc_tried = 1;	/* don't even try */
+			mc->mc_skipped = 1;
+			continue;
+		}
+
+		if (vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1)) {
+			mc->mc_error = SET_ERROR(ESTALE);
+			mc->mc_skipped = 1;
+			mc->mc_speculative = 1;
+			continue;
+		}
+
+		mc->mc_load = vdev_mirror_load(mm, mc->mc_vd, mc->mc_offset);
+		if (mc->mc_load > lowest_load)
+			continue;
+
+		if (mc->mc_load < lowest_load) {
+			lowest_load = mc->mc_load;
+			mm->mm_preferred_cnt = 0;
+		}
+		mm->mm_preferred[mm->mm_preferred_cnt] = c;
+		mm->mm_preferred_cnt++;
+	}
+
+	if (mm->mm_preferred_cnt == 1) {
+		vdev_queue_register_lastoffset(
+		    mm->mm_child[mm->mm_preferred[0]].mc_vd, zio);
+		return (mm->mm_preferred[0]);
+	}
+
+	if (mm->mm_preferred_cnt > 1) {
+		int c = vdev_mirror_preferred_child_randomize(zio);
+
+		vdev_queue_register_lastoffset(mm->mm_child[c].mc_vd, zio);
+		return (c);
+	}
+
+	/*
+	 * Every device is either missing or has this txg in its DTL.
+	 * Look for any child we haven't already tried before giving up.
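
When several children tie for the lowest load, the offset-based tie-break above picks among them deterministically. A tiny illustrative model (example_pick is not part of this file; vdev_mirror_shift is the file's existing tunable, taken as given):

#include <stdint.h>

/*
 * Round-robin among the equally loaded children: strip the low bits of
 * the offset, then index into the preferred array modulo its size.
 */
static int
example_pick(uint64_t io_offset, int shift, const int *preferred, int cnt)
{
	return (preferred[(io_offset >> shift) % cnt]);
}

For a two-way SSD mirror and a shift of, say, 21, consecutive 2 MB regions of the device alternate between the two children, spreading reads evenly without calling a random number generator on every I/O.
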
+	 */
+	for (c = 0; c < mm->mm_children; c++) {
+		if (!mm->mm_child[c].mc_tried) {
+			vdev_queue_register_lastoffset(mm->mm_child[c].mc_vd,
+			    zio);
+			return (c);
+		}
+	}
+
+	/*
+	 * Every child failed.  There's no place left to look.
+	 */
+	return (-1);
+}
+
+static void
+vdev_mirror_io_start(zio_t *zio)
+{
+	mirror_map_t *mm;
+	mirror_child_t *mc;
+	int c, children;
+
+	mm = vdev_mirror_map_init(zio);
+
+	if (mm == NULL) {
+		ASSERT(!spa_trust_config(zio->io_spa));
+		ASSERT(zio->io_type == ZIO_TYPE_READ);
+		zio_execute(zio);
+		return;
+	}
+
+	if (zio->io_type == ZIO_TYPE_READ) {
+		if (zio->io_bp != NULL &&
+		    (zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_resilvering &&
+		    mm->mm_children > 1) {
+			/*
+			 * For scrubbing reads (if we can verify the
+			 * checksum here, as indicated by io_bp being
+			 * non-NULL) we need to allocate a read buffer for
+			 * each child and issue reads to all children.  If
+			 * any child succeeds, it will copy its data into
+			 * zio->io_abd in vdev_mirror_scrub_done.
+			 */
+			for (c = 0; c < mm->mm_children; c++) {
+				mc = &mm->mm_child[c];
+				zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
+				    mc->mc_vd, mc->mc_offset,
+				    abd_alloc_sametype(zio->io_abd,
+				    zio->io_size), zio->io_size,
+				    zio->io_type, zio->io_priority, 0,
+				    vdev_mirror_scrub_done, mc));
+			}
+			zio_execute(zio);
+			return;
+		}
+		/*
+		 * For normal reads just pick one child.
+		 */
+		c = vdev_mirror_child_select(zio);
+		children = (c >= 0);
+	} else {
+		ASSERT(zio->io_type == ZIO_TYPE_WRITE ||
+		    zio->io_type == ZIO_TYPE_FREE);
+
+		/*
+		 * Writes and frees go to all children.
+		 */
+		c = 0;
+		children = mm->mm_children;
+	}
+
+	while (children--) {
+		mc = &mm->mm_child[c];
+		zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
+		    mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size,
+		    zio->io_type, zio->io_priority, 0,
+		    vdev_mirror_child_done, mc));
+		c++;
+	}
+
+	zio_execute(zio);
+}
+
+static int
+vdev_mirror_worst_error(mirror_map_t *mm)
+{
+	int error[2] = { 0, 0 };
+
+	for (int c = 0; c < mm->mm_children; c++) {
+		mirror_child_t *mc = &mm->mm_child[c];
+		int s = mc->mc_speculative;
+		error[s] = zio_worst_error(error[s], mc->mc_error);
+	}
+
+	return (error[0] ? error[0] : error[1]);
+}
+
+static void
+vdev_mirror_io_done(zio_t *zio)
+{
+	mirror_map_t *mm = zio->io_vsd;
+	mirror_child_t *mc;
+	int c;
+	int good_copies = 0;
+	int unexpected_errors = 0;
+
+	if (mm == NULL)
+		return;
+
+	for (c = 0; c < mm->mm_children; c++) {
+		mc = &mm->mm_child[c];
+
+		if (mc->mc_error) {
+			if (!mc->mc_skipped)
+				unexpected_errors++;
+		} else if (mc->mc_tried) {
+			good_copies++;
+		}
+	}
+
+	if (zio->io_type == ZIO_TYPE_WRITE) {
+		/*
+		 * XXX -- for now, treat partial writes as success.
+		 *
+		 * Now that we support write reallocation, it would be better
+		 * to treat partial failure as real failure unless there are
+		 * no non-degraded top-level vdevs left, and not update DTLs
+		 * if we intend to reallocate.
+		 */
+		/* XXPOLICY */
+		if (good_copies != mm->mm_children) {
+			/*
+			 * Always require at least one good copy.
+			 *
+			 * For ditto blocks (io_vd == NULL), require
+			 * all copies to be good.
+			 *
+			 * XXX -- for replacing vdevs, there's no great answer.
+			 * If the old device is really dead, we may not even
+			 * be able to access it -- so we only want to
+			 * require good writes to the new device.  But if
+			 * the new device turns out to be flaky, we want
+			 * to be able to detach it -- which requires all
+			 * writes to the old device to have succeeded.
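
vdev_mirror_worst_error() above keeps two error buckets so a real failure always outranks a speculative one (a child skipped only because its DTL says the data might be missing). A worked illustration:

/*
 * child 0: mc_error = EIO,    mc_speculative = 0  ->  error[0] = EIO
 * child 1: mc_error = ESTALE, mc_speculative = 1  ->  error[1] = ESTALE
 *
 * Result: EIO.  The non-speculative bucket wins whenever it is
 * non-zero; only if every failure was speculative is error[1],
 * here ESTALE, reported instead.
 */
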
+ */ + if (good_copies == 0 || zio->io_vd == NULL) + zio->io_error = vdev_mirror_worst_error(mm); + } + return; + } else if (zio->io_type == ZIO_TYPE_FREE) { + return; + } + + ASSERT(zio->io_type == ZIO_TYPE_READ); + + /* + * If we don't have a good copy yet, keep trying other children. + */ + /* XXPOLICY */ + if (good_copies == 0 && (c = vdev_mirror_child_select(zio)) != -1) { + ASSERT(c >= 0 && c < mm->mm_children); + mc = &mm->mm_child[c]; + zio_vdev_io_redone(zio); + zio_nowait(zio_vdev_child_io(zio, zio->io_bp, + mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size, + ZIO_TYPE_READ, zio->io_priority, 0, + vdev_mirror_child_done, mc)); + return; + } + + /* XXPOLICY */ + if (good_copies == 0) { + zio->io_error = vdev_mirror_worst_error(mm); + ASSERT(zio->io_error != 0); + } + + if (good_copies && spa_writeable(zio->io_spa) && + (unexpected_errors || + (zio->io_flags & ZIO_FLAG_RESILVER) || + ((zio->io_flags & ZIO_FLAG_SCRUB) && mm->mm_resilvering))) { + /* + * Use the good data we have in hand to repair damaged children. + */ + for (c = 0; c < mm->mm_children; c++) { + /* + * Don't rewrite known good children. + * Not only is it unnecessary, it could + * actually be harmful: if the system lost + * power while rewriting the only good copy, + * there would be no good copies left! + */ + mc = &mm->mm_child[c]; + + if (mc->mc_error == 0) { + if (mc->mc_tried) + continue; + /* + * We didn't try this child. We need to + * repair it if: + * 1. it's a scrub (in which case we have + * tried everything that was healthy) + * - or - + * 2. it's an indirect vdev (in which case + * it could point to any other vdev, which + * might have a bad DTL) + * - or - + * 3. the DTL indicates that this data is + * missing from this vdev + */ + if (!(zio->io_flags & ZIO_FLAG_SCRUB) && + mc->mc_vd->vdev_ops != &vdev_indirect_ops && + !vdev_dtl_contains(mc->mc_vd, DTL_PARTIAL, + zio->io_txg, 1)) + continue; + mc->mc_error = SET_ERROR(ESTALE); + } + + zio_nowait(zio_vdev_child_io(zio, zio->io_bp, + mc->mc_vd, mc->mc_offset, + zio->io_abd, zio->io_size, + ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, + ZIO_FLAG_IO_REPAIR | (unexpected_errors ? 
+			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
+		}
+	}
+}
+
+static void
+vdev_mirror_state_change(vdev_t *vd, int faulted, int degraded)
+{
+	if (faulted == vd->vdev_children) {
+		if (vdev_children_are_offline(vd)) {
+			vdev_set_state(vd, B_FALSE, VDEV_STATE_OFFLINE,
+			    VDEV_AUX_CHILDREN_OFFLINE);
+		} else {
+			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+			    VDEV_AUX_NO_REPLICAS);
+		}
+	} else if (degraded + faulted != 0) {
+		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
+	} else {
+		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
+	}
+}
+
+vdev_ops_t vdev_mirror_ops = {
+	vdev_mirror_open,
+	vdev_mirror_close,
+	vdev_default_asize,
+	vdev_mirror_io_start,
+	vdev_mirror_io_done,
+	vdev_mirror_state_change,
+	NULL,
+	NULL,
+	NULL,
+	NULL,
+	vdev_default_xlate,
+	VDEV_TYPE_MIRROR,	/* name of this vdev type */
+	B_FALSE			/* not a leaf vdev */
+};
+
+vdev_ops_t vdev_replacing_ops = {
+	vdev_mirror_open,
+	vdev_mirror_close,
+	vdev_default_asize,
+	vdev_mirror_io_start,
+	vdev_mirror_io_done,
+	vdev_mirror_state_change,
+	NULL,
+	NULL,
+	NULL,
+	NULL,
+	vdev_default_xlate,
+	VDEV_TYPE_REPLACING,	/* name of this vdev type */
+	B_FALSE			/* not a leaf vdev */
+};
+
+vdev_ops_t vdev_spare_ops = {
+	vdev_mirror_open,
+	vdev_mirror_close,
+	vdev_default_asize,
+	vdev_mirror_io_start,
+	vdev_mirror_io_done,
+	vdev_mirror_state_change,
+	NULL,
+	NULL,
+	NULL,
+	NULL,
+	vdev_default_xlate,
+	VDEV_TYPE_SPARE,	/* name of this vdev type */
+	B_FALSE			/* not a leaf vdev */
+};
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c
new file mode 100644
index 000000000000..6852de445049
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c
@@ -0,0 +1,113 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+ */
+
+/*
+ * The 'missing' vdev is a special vdev type used only during import.  It
+ * signifies a placeholder in the root vdev for some vdev that we know is
+ * missing.  We pass it down to the kernel to allow the rest of the
+ * configuration to be parsed and an attempt made to open all available
+ * devices.  Because its GUID is always 0, we know that the guid sum will
+ * mismatch and we won't be able to open the pool anyway.
+ */ + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/vdev_impl.h> +#include <sys/fs/zfs.h> +#include <sys/zio.h> + +/* ARGSUSED */ +static int +vdev_missing_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, + uint64_t *logical_ashift, uint64_t *physical_ashift) +{ + /* + * Really this should just fail. But then the root vdev will be in the + * faulted state with VDEV_AUX_NO_REPLICAS, when what we really want is + * VDEV_AUX_BAD_GUID_SUM. So we pretend to succeed, knowing that we + * will fail the GUID sum check before ever trying to open the pool. + */ + *psize = 0; + *max_psize = 0; + *logical_ashift = 0; + *physical_ashift = 0; + return (0); +} + +/* ARGSUSED */ +static void +vdev_missing_close(vdev_t *vd) +{ +} + +/* ARGSUSED */ +static void +vdev_missing_io_start(zio_t *zio) +{ + zio->io_error = SET_ERROR(ENOTSUP); + zio_execute(zio); +} + +/* ARGSUSED */ +static void +vdev_missing_io_done(zio_t *zio) +{ +} + +vdev_ops_t vdev_missing_ops = { + vdev_missing_open, + vdev_missing_close, + vdev_default_asize, + vdev_missing_io_start, + vdev_missing_io_done, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + VDEV_TYPE_MISSING, /* name of this vdev type */ + B_TRUE /* leaf vdev */ +}; + +vdev_ops_t vdev_hole_ops = { + vdev_missing_open, + vdev_missing_close, + vdev_default_asize, + vdev_missing_io_start, + vdev_missing_io_done, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + VDEV_TYPE_HOLE, /* name of this vdev type */ + B_TRUE /* leaf vdev */ +}; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c new file mode 100644 index 000000000000..b53ec7ffafc8 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c @@ -0,0 +1,1055 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + */ + +#include <sys/zfs_context.h> +#include <sys/vdev_impl.h> +#include <sys/spa_impl.h> +#include <sys/zio.h> +#include <sys/avl.h> +#include <sys/dsl_pool.h> +#include <sys/metaslab_impl.h> +#include <sys/abd.h> + +/* + * ZFS I/O Scheduler + * --------------- + * + * ZFS issues I/O operations to leaf vdevs to satisfy and complete zios. The + * I/O scheduler determines when and in what order those operations are + * issued. The I/O scheduler divides operations into six I/O classes + * prioritized in the following order: sync read, sync write, async read, + * async write, scrub/resilver and trim. 
Each queue defines the minimum and + * maximum number of concurrent operations that may be issued to the device. + * In addition, the device has an aggregate maximum. Note that the sum of the + * per-queue minimums must not exceed the aggregate maximum, and if the + * aggregate maximum is equal to or greater than the sum of the per-queue + * maximums, the per-queue minimum has no effect. + * + * For many physical devices, throughput increases with the number of + * concurrent operations, but latency typically suffers. Further, physical + * devices typically have a limit at which more concurrent operations have no + * effect on throughput or can actually cause it to decrease. + * + * The scheduler selects the next operation to issue by first looking for an + * I/O class whose minimum has not been satisfied. Once all are satisfied and + * the aggregate maximum has not been hit, the scheduler looks for classes + * whose maximum has not been satisfied. Iteration through the I/O classes is + * done in the order specified above. No further operations are issued if the + * aggregate maximum number of concurrent operations has been hit or if there + * are no operations queued for an I/O class that has not hit its maximum. + * Every time an I/O is queued or an operation completes, the I/O scheduler + * looks for new operations to issue. + * + * All I/O classes have a fixed maximum number of outstanding operations + * except for the async write class. Asynchronous writes represent the data + * that is committed to stable storage during the syncing stage for + * transaction groups (see txg.c). Transaction groups enter the syncing state + * periodically so the number of queued async writes will quickly burst up and + * then bleed down to zero. Rather than servicing them as quickly as possible, + * the I/O scheduler changes the maximum number of active async write I/Os + * according to the amount of dirty data in the pool (see dsl_pool.c). Since + * both throughput and latency typically increase with the number of + * concurrent operations issued to physical devices, reducing the burstiness + * in the number of concurrent operations also stabilizes the response time of + * operations from other -- and in particular synchronous -- queues. In broad + * strokes, the I/O scheduler will issue more concurrent operations from the + * async write queue as there's more dirty data in the pool. + * + * Async Writes + * + * The number of concurrent operations issued for the async write I/O class + * follows a piece-wise linear function defined by a few adjustable points. + * + * | o---------| <-- zfs_vdev_async_write_max_active + * ^ | /^ | + * | | / | | + * active | / | | + * I/O | / | | + * count | / | | + * | / | | + * |------------o | | <-- zfs_vdev_async_write_min_active + * 0|____________^______|_________| + * 0% | | 100% of zfs_dirty_data_max + * | | + * | `-- zfs_vdev_async_write_active_max_dirty_percent + * `--------- zfs_vdev_async_write_active_min_dirty_percent + * + * Until the amount of dirty data exceeds a minimum percentage of the dirty + * data allowed in the pool, the I/O scheduler will limit the number of + * concurrent operations to the minimum. As that threshold is crossed, the + * number of concurrent operations issued increases linearly to the maximum at + * the specified maximum percentage of the dirty data allowed in the pool. 
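
Plugging in the defaults defined below (min_active = 1, max_active = 10, ramp between 30% and 60% of zfs_dirty_data_max), the ramp evaluates as follows; the same integer arithmetic appears in vdev_queue_max_async_writes() later in this file:

/*
 * With the pool 45% dirty (measured against zfs_dirty_data_max):
 *
 *   writes = (45 - 30) * (10 - 1) / (60 - 30) + 1
 *          = 135 / 30 + 1
 *          = 4 + 1 = 5		(integer division truncates)
 *
 * i.e. halfway up the ramp the scheduler allows roughly half the
 * maximum number of concurrent async writes per device.
 */
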
+ * + * Ideally, the amount of dirty data on a busy pool will stay in the sloped + * part of the function between zfs_vdev_async_write_active_min_dirty_percent + * and zfs_vdev_async_write_active_max_dirty_percent. If it exceeds the + * maximum percentage, this indicates that the rate of incoming data is + * greater than the rate that the backend storage can handle. In this case, we + * must further throttle incoming writes (see dmu_tx_delay() for details). + */ + +/* + * The maximum number of I/Os active to each device. Ideally, this will be >= + * the sum of each queue's max_active. It must be at least the sum of each + * queue's min_active. + */ +uint32_t zfs_vdev_max_active = 1000; + +/* + * Per-queue limits on the number of I/Os active to each device. If the + * sum of the queue's max_active is < zfs_vdev_max_active, then the + * min_active comes into play. We will send min_active from each queue, + * and then select from queues in the order defined by zio_priority_t. + * + * In general, smaller max_active's will lead to lower latency of synchronous + * operations. Larger max_active's may lead to higher overall throughput, + * depending on underlying storage. + * + * The ratio of the queues' max_actives determines the balance of performance + * between reads, writes, and scrubs. E.g., increasing + * zfs_vdev_scrub_max_active will cause the scrub or resilver to complete + * more quickly, but reads and writes to have higher latency and lower + * throughput. + */ +uint32_t zfs_vdev_sync_read_min_active = 10; +uint32_t zfs_vdev_sync_read_max_active = 10; +uint32_t zfs_vdev_sync_write_min_active = 10; +uint32_t zfs_vdev_sync_write_max_active = 10; +uint32_t zfs_vdev_async_read_min_active = 1; +uint32_t zfs_vdev_async_read_max_active = 3; +uint32_t zfs_vdev_async_write_min_active = 1; +uint32_t zfs_vdev_async_write_max_active = 10; +uint32_t zfs_vdev_scrub_min_active = 1; +uint32_t zfs_vdev_scrub_max_active = 2; +uint32_t zfs_vdev_trim_min_active = 1; +/* + * TRIM max active is large in comparison to the other values due to the fact + * that TRIM IOs are coalesced at the device layer. This value is set such + * that a typical SSD can process the queued IOs in a single request. + */ +uint32_t zfs_vdev_trim_max_active = 64; +uint32_t zfs_vdev_removal_min_active = 1; +uint32_t zfs_vdev_removal_max_active = 2; +uint32_t zfs_vdev_initializing_min_active = 1; +uint32_t zfs_vdev_initializing_max_active = 1; + + +/* + * When the pool has less than zfs_vdev_async_write_active_min_dirty_percent + * dirty data, use zfs_vdev_async_write_min_active. When it has more than + * zfs_vdev_async_write_active_max_dirty_percent, use + * zfs_vdev_async_write_max_active. The value is linearly interpolated + * between min and max. + */ +int zfs_vdev_async_write_active_min_dirty_percent = 30; +int zfs_vdev_async_write_active_max_dirty_percent = 60; + +/* + * To reduce IOPs, we aggregate small adjacent I/Os into one large I/O. + * For read I/Os, we also aggregate across small adjacency gaps; for writes + * we include spans of optional I/Os to aid aggregation at the disk even when + * they aren't able to help us aggregate at this level. + */ +int zfs_vdev_aggregation_limit = 1 << 20; +int zfs_vdev_aggregation_limit_non_rotating = SPA_OLD_MAXBLOCKSIZE; +int zfs_vdev_read_gap_limit = 32 << 10; +int zfs_vdev_write_gap_limit = 4 << 10; + +/* + * Define the queue depth percentage for each top-level. 
This percentage is
+ * used in conjunction with zfs_vdev_async_write_max_active to determine
+ * how many allocations a specific top-level vdev should handle.  Once the
+ * queue depth reaches
+ * zfs_vdev_queue_depth_pct * zfs_vdev_async_write_max_active / 100,
+ * the allocator will stop allocating blocks on that top-level device.
+ * The default kernel setting is 1000%, which will yield 100 allocations per
+ * device.  For userland testing, the default setting is 300%, which equates
+ * to 30 allocations per device.
+ */
+#ifdef _KERNEL
+int zfs_vdev_queue_depth_pct = 1000;
+#else
+int zfs_vdev_queue_depth_pct = 300;
+#endif
+
+/*
+ * When performing allocations for a given metaslab, we want to make sure that
+ * there are enough IOs to aggregate together to improve throughput. We want to
+ * ensure that there are at least 128k worth of IOs that can be aggregated, and
+ * we assume that the average allocation size is 4k, so we need the queue depth
+ * to be 32 per allocator to get good aggregation of sequential writes.
+ */
+int zfs_vdev_def_queue_depth = 32;
+
+#ifdef __FreeBSD__
+#ifdef _KERNEL
+SYSCTL_DECL(_vfs_zfs_vdev);
+
+static int sysctl_zfs_async_write_active_min_dirty_percent(SYSCTL_HANDLER_ARGS);
+SYSCTL_PROC(_vfs_zfs_vdev, OID_AUTO, async_write_active_min_dirty_percent,
+    CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, 0, sizeof(int),
+    sysctl_zfs_async_write_active_min_dirty_percent, "I",
+    "Percentage of async write dirty data below which "
+    "async_write_min_active is used.");
+
+static int sysctl_zfs_async_write_active_max_dirty_percent(SYSCTL_HANDLER_ARGS);
+SYSCTL_PROC(_vfs_zfs_vdev, OID_AUTO, async_write_active_max_dirty_percent,
+    CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, 0, sizeof(int),
+    sysctl_zfs_async_write_active_max_dirty_percent, "I",
+    "Percentage of async write dirty data above which "
+    "async_write_max_active is used.");
+
+SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, max_active, CTLFLAG_RWTUN,
+    &zfs_vdev_max_active, 0,
+    "The maximum number of I/Os of all types active for each device.");
+
+#define ZFS_VDEV_QUEUE_KNOB_MIN(name)					\
+SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, name ## _min_active, CTLFLAG_RWTUN,\
+    &zfs_vdev_ ## name ## _min_active, 0,				\
+    "Initial number of I/O requests of type " #name			\
+    " active for each device");
+
+#define ZFS_VDEV_QUEUE_KNOB_MAX(name)					\
+SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, name ## _max_active, CTLFLAG_RWTUN,\
+    &zfs_vdev_ ## name ## _max_active, 0,				\
+    "Maximum number of I/O requests of type " #name			\
+    " active for each device");
+
+ZFS_VDEV_QUEUE_KNOB_MIN(sync_read);
+ZFS_VDEV_QUEUE_KNOB_MAX(sync_read);
+ZFS_VDEV_QUEUE_KNOB_MIN(sync_write);
+ZFS_VDEV_QUEUE_KNOB_MAX(sync_write);
+ZFS_VDEV_QUEUE_KNOB_MIN(async_read);
+ZFS_VDEV_QUEUE_KNOB_MAX(async_read);
+ZFS_VDEV_QUEUE_KNOB_MIN(async_write);
+ZFS_VDEV_QUEUE_KNOB_MAX(async_write);
+ZFS_VDEV_QUEUE_KNOB_MIN(scrub);
+ZFS_VDEV_QUEUE_KNOB_MAX(scrub);
+ZFS_VDEV_QUEUE_KNOB_MIN(trim);
+ZFS_VDEV_QUEUE_KNOB_MAX(trim);
+ZFS_VDEV_QUEUE_KNOB_MIN(removal);
+ZFS_VDEV_QUEUE_KNOB_MAX(removal);
+ZFS_VDEV_QUEUE_KNOB_MIN(initializing);
+ZFS_VDEV_QUEUE_KNOB_MAX(initializing);
+
+#undef ZFS_VDEV_QUEUE_KNOB_MIN
+#undef ZFS_VDEV_QUEUE_KNOB_MAX
+
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, aggregation_limit, CTLFLAG_RWTUN,
+    &zfs_vdev_aggregation_limit, 0,
+    "I/O requests are aggregated up to this size");
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, aggregation_limit_non_rotating, CTLFLAG_RWTUN,
+    &zfs_vdev_aggregation_limit_non_rotating, 0,
+    "I/O requests are aggregated up to this size for non-rotating media");
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO,
read_gap_limit, CTLFLAG_RWTUN, + &zfs_vdev_read_gap_limit, 0, + "Acceptable gap between two reads being aggregated"); +SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, write_gap_limit, CTLFLAG_RWTUN, + &zfs_vdev_write_gap_limit, 0, + "Acceptable gap between two writes being aggregated"); +SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, queue_depth_pct, CTLFLAG_RWTUN, + &zfs_vdev_queue_depth_pct, 0, + "Queue depth percentage for each top-level"); +SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, def_queue_depth, CTLFLAG_RWTUN, + &zfs_vdev_def_queue_depth, 0, + "Default queue depth for each allocator"); + +static int +sysctl_zfs_async_write_active_min_dirty_percent(SYSCTL_HANDLER_ARGS) +{ + int val, err; + + val = zfs_vdev_async_write_active_min_dirty_percent; + err = sysctl_handle_int(oidp, &val, 0, req); + if (err != 0 || req->newptr == NULL) + return (err); + + if (val < 0 || val > 100 || + val >= zfs_vdev_async_write_active_max_dirty_percent) + return (EINVAL); + + zfs_vdev_async_write_active_min_dirty_percent = val; + + return (0); +} + +static int +sysctl_zfs_async_write_active_max_dirty_percent(SYSCTL_HANDLER_ARGS) +{ + int val, err; + + val = zfs_vdev_async_write_active_max_dirty_percent; + err = sysctl_handle_int(oidp, &val, 0, req); + if (err != 0 || req->newptr == NULL) + return (err); + + if (val < 0 || val > 100 || + val <= zfs_vdev_async_write_active_min_dirty_percent) + return (EINVAL); + + zfs_vdev_async_write_active_max_dirty_percent = val; + + return (0); +} +#endif +#endif + +int +vdev_queue_offset_compare(const void *x1, const void *x2) +{ + const zio_t *z1 = (const zio_t *)x1; + const zio_t *z2 = (const zio_t *)x2; + + int cmp = AVL_CMP(z1->io_offset, z2->io_offset); + + if (likely(cmp)) + return (cmp); + + return (AVL_PCMP(z1, z2)); +} + +static inline avl_tree_t * +vdev_queue_class_tree(vdev_queue_t *vq, zio_priority_t p) +{ + return (&vq->vq_class[p].vqc_queued_tree); +} + +static inline avl_tree_t * +vdev_queue_type_tree(vdev_queue_t *vq, zio_type_t t) +{ + if (t == ZIO_TYPE_READ) + return (&vq->vq_read_offset_tree); + else if (t == ZIO_TYPE_WRITE) + return (&vq->vq_write_offset_tree); + else + return (NULL); +} + +int +vdev_queue_timestamp_compare(const void *x1, const void *x2) +{ + const zio_t *z1 = x1; + const zio_t *z2 = x2; + + if (z1->io_timestamp < z2->io_timestamp) + return (-1); + if (z1->io_timestamp > z2->io_timestamp) + return (1); + + if (z1->io_offset < z2->io_offset) + return (-1); + if (z1->io_offset > z2->io_offset) + return (1); + + if (z1 < z2) + return (-1); + if (z1 > z2) + return (1); + + return (0); +} + +void +vdev_queue_init(vdev_t *vd) +{ + vdev_queue_t *vq = &vd->vdev_queue; + + mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL); + vq->vq_vdev = vd; + + avl_create(&vq->vq_active_tree, vdev_queue_offset_compare, + sizeof (zio_t), offsetof(struct zio, io_queue_node)); + avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_READ), + vdev_queue_offset_compare, sizeof (zio_t), + offsetof(struct zio, io_offset_node)); + avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE), + vdev_queue_offset_compare, sizeof (zio_t), + offsetof(struct zio, io_offset_node)); + + for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { + int (*compfn) (const void *, const void *); + + /* + * The synchronous i/o queues are dispatched in FIFO rather + * than LBA order. This provides more consistent latency for + * these i/os. 
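
Every comparator used with these AVL trees must impose a strict total order: a return of 0 means "same node", so ties on the primary key are broken by offset and finally by the zio pointers themselves, as in vdev_queue_timestamp_compare() above. A generic sketch of the pattern (the ex_* names are illustrative):

#include <stdint.h>

struct ex_node {
	uint64_t	ex_key;		/* primary sort key */
};

/*
 * Total-order comparator: ties on the key are broken by address so
 * that two distinct nodes never compare equal, as the AVL code requires.
 */
static int
ex_compare(const void *x1, const void *x2)
{
	const struct ex_node *a = x1, *b = x2;

	if (a->ex_key < b->ex_key)
		return (-1);
	if (a->ex_key > b->ex_key)
		return (1);
	if (a < b)
		return (-1);
	if (a > b)
		return (1);
	return (0);
}
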
+ */ + if (p == ZIO_PRIORITY_SYNC_READ || p == ZIO_PRIORITY_SYNC_WRITE) + compfn = vdev_queue_timestamp_compare; + else + compfn = vdev_queue_offset_compare; + + avl_create(vdev_queue_class_tree(vq, p), compfn, + sizeof (zio_t), offsetof(struct zio, io_queue_node)); + } + + vq->vq_lastoffset = 0; +} + +void +vdev_queue_fini(vdev_t *vd) +{ + vdev_queue_t *vq = &vd->vdev_queue; + + for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) + avl_destroy(vdev_queue_class_tree(vq, p)); + avl_destroy(&vq->vq_active_tree); + avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_READ)); + avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE)); + + mutex_destroy(&vq->vq_lock); +} + +static void +vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio) +{ + spa_t *spa = zio->io_spa; + avl_tree_t *qtt; + + ASSERT(MUTEX_HELD(&vq->vq_lock)); + ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); + avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio); + qtt = vdev_queue_type_tree(vq, zio->io_type); + if (qtt) + avl_add(qtt, zio); + +#ifdef illumos + mutex_enter(&spa->spa_iokstat_lock); + spa->spa_queue_stats[zio->io_priority].spa_queued++; + if (spa->spa_iokstat != NULL) + kstat_waitq_enter(spa->spa_iokstat->ks_data); + mutex_exit(&spa->spa_iokstat_lock); +#endif +} + +static void +vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio) +{ + spa_t *spa = zio->io_spa; + avl_tree_t *qtt; + + ASSERT(MUTEX_HELD(&vq->vq_lock)); + ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); + avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio); + qtt = vdev_queue_type_tree(vq, zio->io_type); + if (qtt) + avl_remove(qtt, zio); + +#ifdef illumos + mutex_enter(&spa->spa_iokstat_lock); + ASSERT3U(spa->spa_queue_stats[zio->io_priority].spa_queued, >, 0); + spa->spa_queue_stats[zio->io_priority].spa_queued--; + if (spa->spa_iokstat != NULL) + kstat_waitq_exit(spa->spa_iokstat->ks_data); + mutex_exit(&spa->spa_iokstat_lock); +#endif +} + +static void +vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio) +{ + spa_t *spa = zio->io_spa; + ASSERT(MUTEX_HELD(&vq->vq_lock)); + ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); + vq->vq_class[zio->io_priority].vqc_active++; + avl_add(&vq->vq_active_tree, zio); + +#ifdef illumos + mutex_enter(&spa->spa_iokstat_lock); + spa->spa_queue_stats[zio->io_priority].spa_active++; + if (spa->spa_iokstat != NULL) + kstat_runq_enter(spa->spa_iokstat->ks_data); + mutex_exit(&spa->spa_iokstat_lock); +#endif +} + +static void +vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio) +{ + spa_t *spa = zio->io_spa; + ASSERT(MUTEX_HELD(&vq->vq_lock)); + ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); + vq->vq_class[zio->io_priority].vqc_active--; + avl_remove(&vq->vq_active_tree, zio); + +#ifdef illumos + mutex_enter(&spa->spa_iokstat_lock); + ASSERT3U(spa->spa_queue_stats[zio->io_priority].spa_active, >, 0); + spa->spa_queue_stats[zio->io_priority].spa_active--; + if (spa->spa_iokstat != NULL) { + kstat_io_t *ksio = spa->spa_iokstat->ks_data; + + kstat_runq_exit(spa->spa_iokstat->ks_data); + if (zio->io_type == ZIO_TYPE_READ) { + ksio->reads++; + ksio->nread += zio->io_size; + } else if (zio->io_type == ZIO_TYPE_WRITE) { + ksio->writes++; + ksio->nwritten += zio->io_size; + } + } + mutex_exit(&spa->spa_iokstat_lock); +#endif +} + +static void +vdev_queue_agg_io_done(zio_t *aio) +{ + if (aio->io_type == ZIO_TYPE_READ) { + zio_t *pio; + zio_link_t *zl = NULL; + while ((pio = zio_walk_parents(aio, &zl)) != NULL) { + abd_copy_off(pio->io_abd, aio->io_abd, + 0, pio->io_offset 
- aio->io_offset, pio->io_size); + } + } + + abd_free(aio->io_abd); +} + +static int +vdev_queue_class_min_active(zio_priority_t p) +{ + switch (p) { + case ZIO_PRIORITY_SYNC_READ: + return (zfs_vdev_sync_read_min_active); + case ZIO_PRIORITY_SYNC_WRITE: + return (zfs_vdev_sync_write_min_active); + case ZIO_PRIORITY_ASYNC_READ: + return (zfs_vdev_async_read_min_active); + case ZIO_PRIORITY_ASYNC_WRITE: + return (zfs_vdev_async_write_min_active); + case ZIO_PRIORITY_SCRUB: + return (zfs_vdev_scrub_min_active); + case ZIO_PRIORITY_TRIM: + return (zfs_vdev_trim_min_active); + case ZIO_PRIORITY_REMOVAL: + return (zfs_vdev_removal_min_active); + case ZIO_PRIORITY_INITIALIZING: + return (zfs_vdev_initializing_min_active); + default: + panic("invalid priority %u", p); + return (0); + } +} + +static __noinline int +vdev_queue_max_async_writes(spa_t *spa) +{ + int writes; + uint64_t dirty = spa->spa_dsl_pool->dp_dirty_total; + uint64_t min_bytes = zfs_dirty_data_max * + zfs_vdev_async_write_active_min_dirty_percent / 100; + uint64_t max_bytes = zfs_dirty_data_max * + zfs_vdev_async_write_active_max_dirty_percent / 100; + + /* + * Sync tasks correspond to interactive user actions. To reduce the + * execution time of those actions we push data out as fast as possible. + */ + if (spa_has_pending_synctask(spa)) { + return (zfs_vdev_async_write_max_active); + } + + if (dirty < min_bytes) + return (zfs_vdev_async_write_min_active); + if (dirty > max_bytes) + return (zfs_vdev_async_write_max_active); + + /* + * linear interpolation: + * slope = (max_writes - min_writes) / (max_bytes - min_bytes) + * move right by min_bytes + * move up by min_writes + */ + writes = (dirty - min_bytes) * + (zfs_vdev_async_write_max_active - + zfs_vdev_async_write_min_active) / + (max_bytes - min_bytes) + + zfs_vdev_async_write_min_active; + ASSERT3U(writes, >=, zfs_vdev_async_write_min_active); + ASSERT3U(writes, <=, zfs_vdev_async_write_max_active); + return (writes); +} + +static int +vdev_queue_class_max_active(spa_t *spa, zio_priority_t p) +{ + switch (p) { + case ZIO_PRIORITY_SYNC_READ: + return (zfs_vdev_sync_read_max_active); + case ZIO_PRIORITY_SYNC_WRITE: + return (zfs_vdev_sync_write_max_active); + case ZIO_PRIORITY_ASYNC_READ: + return (zfs_vdev_async_read_max_active); + case ZIO_PRIORITY_ASYNC_WRITE: + return (vdev_queue_max_async_writes(spa)); + case ZIO_PRIORITY_SCRUB: + return (zfs_vdev_scrub_max_active); + case ZIO_PRIORITY_TRIM: + return (zfs_vdev_trim_max_active); + case ZIO_PRIORITY_REMOVAL: + return (zfs_vdev_removal_max_active); + case ZIO_PRIORITY_INITIALIZING: + return (zfs_vdev_initializing_max_active); + default: + panic("invalid priority %u", p); + return (0); + } +} + +/* + * Return the i/o class to issue from, or ZIO_PRIORITY_MAX_QUEUEABLE if + * there is no eligible class. + */ +static zio_priority_t +vdev_queue_class_to_issue(vdev_queue_t *vq) +{ + spa_t *spa = vq->vq_vdev->vdev_spa; + zio_priority_t p; + + ASSERT(MUTEX_HELD(&vq->vq_lock)); + + if (avl_numnodes(&vq->vq_active_tree) >= zfs_vdev_max_active) + return (ZIO_PRIORITY_NUM_QUEUEABLE); + + /* find a queue that has not reached its minimum # outstanding i/os */ + for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { + if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 && + vq->vq_class[p].vqc_active < + vdev_queue_class_min_active(p)) + return (p); + } + + /* + * If we haven't found a queue, look for one that hasn't reached its + * maximum # outstanding i/os. 
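
Before the second scan below, it may help to see the two-pass policy on concrete (hypothetical) numbers:

/*
 *   class         queued  active  min  max
 *   sync read          3      10   10   10
 *   async write       50       2    1    5
 *
 * Pass 1 (minimums): sync read is already at its minimum (10 >= 10)
 * and async write is past its own (2 >= 1), so no class is chosen.
 * Pass 2 (maximums): sync read has hit its maximum (10 >= 10), but
 * async write still has headroom (2 < 5), so an async write issues.
 */
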
+ */ + for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { + if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 && + vq->vq_class[p].vqc_active < + vdev_queue_class_max_active(spa, p)) + return (p); + } + + /* No eligible queued i/os */ + return (ZIO_PRIORITY_NUM_QUEUEABLE); +} + +/* + * Compute the range spanned by two i/os, which is the endpoint of the last + * (lio->io_offset + lio->io_size) minus start of the first (fio->io_offset). + * Conveniently, the gap between fio and lio is given by -IO_SPAN(lio, fio); + * thus fio and lio are adjacent if and only if IO_SPAN(lio, fio) == 0. + */ +#define IO_SPAN(fio, lio) ((lio)->io_offset + (lio)->io_size - (fio)->io_offset) +#define IO_GAP(fio, lio) (-IO_SPAN(lio, fio)) + +static zio_t * +vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) +{ + zio_t *first, *last, *aio, *dio, *mandatory, *nio; + zio_link_t *zl = NULL; + uint64_t maxgap = 0; + uint64_t size; + uint64_t limit; + int maxblocksize; + boolean_t stretch; + avl_tree_t *t; + enum zio_flag flags; + + ASSERT(MUTEX_HELD(&vq->vq_lock)); + + maxblocksize = spa_maxblocksize(vq->vq_vdev->vdev_spa); + if (vq->vq_vdev->vdev_nonrot) + limit = zfs_vdev_aggregation_limit_non_rotating; + else + limit = zfs_vdev_aggregation_limit; + limit = MAX(MIN(limit, maxblocksize), 0); + + if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE || limit == 0) + return (NULL); + + first = last = zio; + + if (zio->io_type == ZIO_TYPE_READ) + maxgap = zfs_vdev_read_gap_limit; + + /* + * We can aggregate I/Os that are sufficiently adjacent and of + * the same flavor, as expressed by the AGG_INHERIT flags. + * The latter requirement is necessary so that certain + * attributes of the I/O, such as whether it's a normal I/O + * or a scrub/resilver, can be preserved in the aggregate. + * We can include optional I/Os, but don't allow them + * to begin a range as they add no benefit in that situation. + */ + + /* + * We keep track of the last non-optional I/O. + */ + mandatory = (first->io_flags & ZIO_FLAG_OPTIONAL) ? NULL : first; + + /* + * Walk backwards through sufficiently contiguous I/Os + * recording the last non-optional I/O. + */ + flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT; + t = vdev_queue_type_tree(vq, zio->io_type); + while (t != NULL && (dio = AVL_PREV(t, first)) != NULL && + (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && + IO_SPAN(dio, last) <= limit && + IO_GAP(dio, first) <= maxgap && + dio->io_type == zio->io_type) { + first = dio; + if (mandatory == NULL && !(first->io_flags & ZIO_FLAG_OPTIONAL)) + mandatory = first; + } + + /* + * Skip any initial optional I/Os. + */ + while ((first->io_flags & ZIO_FLAG_OPTIONAL) && first != last) { + first = AVL_NEXT(t, first); + ASSERT(first != NULL); + } + + /* + * Walk forward through sufficiently contiguous I/Os. + * The aggregation limit does not apply to optional i/os, so that + * we can issue contiguous writes even if they are larger than the + * aggregation limit. + */ + while ((dio = AVL_NEXT(t, last)) != NULL && + (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && + (IO_SPAN(first, dio) <= limit || + (dio->io_flags & ZIO_FLAG_OPTIONAL)) && + IO_SPAN(first, dio) <= maxblocksize && + IO_GAP(last, dio) <= maxgap && + dio->io_type == zio->io_type) { + last = dio; + if (!(last->io_flags & ZIO_FLAG_OPTIONAL)) + mandatory = last; + } + + /* + * Now that we've established the range of the I/O aggregation + * we must decide what to do with trailing optional I/Os. + * For reads, there's nothing to do. 
While we are unable to + * aggregate further, it's possible that a trailing optional + * I/O would allow the underlying device to aggregate with + * subsequent I/Os. We must therefore determine if the next + * non-optional I/O is close enough to make aggregation + * worthwhile. + */ + stretch = B_FALSE; + if (zio->io_type == ZIO_TYPE_WRITE && mandatory != NULL) { + zio_t *nio = last; + while ((dio = AVL_NEXT(t, nio)) != NULL && + IO_GAP(nio, dio) == 0 && + IO_GAP(mandatory, dio) <= zfs_vdev_write_gap_limit) { + nio = dio; + if (!(nio->io_flags & ZIO_FLAG_OPTIONAL)) { + stretch = B_TRUE; + break; + } + } + } + + if (stretch) { + /* + * We are going to include an optional io in our aggregated + * span, thus closing the write gap. Only mandatory i/os can + * start aggregated spans, so make sure that the next i/o + * after our span is mandatory. + */ + dio = AVL_NEXT(t, last); + dio->io_flags &= ~ZIO_FLAG_OPTIONAL; + } else { + /* do not include the optional i/o */ + while (last != mandatory && last != first) { + ASSERT(last->io_flags & ZIO_FLAG_OPTIONAL); + last = AVL_PREV(t, last); + ASSERT(last != NULL); + } + } + + if (first == last) + return (NULL); + + size = IO_SPAN(first, last); + ASSERT3U(size, <=, maxblocksize); + + aio = zio_vdev_delegated_io(first->io_vd, first->io_offset, + abd_alloc_for_io(size, B_TRUE), size, first->io_type, + zio->io_priority, flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE, + vdev_queue_agg_io_done, NULL); + aio->io_timestamp = first->io_timestamp; + + nio = first; + do { + dio = nio; + nio = AVL_NEXT(t, dio); + ASSERT3U(dio->io_type, ==, aio->io_type); + + if (dio->io_flags & ZIO_FLAG_NODATA) { + ASSERT3U(dio->io_type, ==, ZIO_TYPE_WRITE); + abd_zero_off(aio->io_abd, + dio->io_offset - aio->io_offset, dio->io_size); + } else if (dio->io_type == ZIO_TYPE_WRITE) { + abd_copy_off(aio->io_abd, dio->io_abd, + dio->io_offset - aio->io_offset, 0, dio->io_size); + } + + zio_add_child(dio, aio); + vdev_queue_io_remove(vq, dio); + } while (dio != last); + + /* + * We need to drop the vdev queue's lock to avoid a deadlock that we + * could encounter since this I/O will complete immediately. + */ + mutex_exit(&vq->vq_lock); + while ((dio = zio_walk_parents(aio, &zl)) != NULL) { + zio_vdev_io_bypass(dio); + zio_execute(dio); + } + mutex_enter(&vq->vq_lock); + + return (aio); +} + +static zio_t * +vdev_queue_io_to_issue(vdev_queue_t *vq) +{ + zio_t *zio, *aio; + zio_priority_t p; + avl_index_t idx; + avl_tree_t *tree; + zio_t search; + +again: + ASSERT(MUTEX_HELD(&vq->vq_lock)); + + p = vdev_queue_class_to_issue(vq); + + if (p == ZIO_PRIORITY_NUM_QUEUEABLE) { + /* No eligible queued i/os */ + return (NULL); + } + + /* + * For LBA-ordered queues (async / scrub / initializing), issue the + * i/o which follows the most recently issued i/o in LBA (offset) order. + * + * For FIFO queues (sync), issue the i/o with the lowest timestamp. + */ + tree = vdev_queue_class_tree(vq, p); + search.io_timestamp = 0; + search.io_offset = vq->vq_last_offset + 1; + VERIFY3P(avl_find(tree, &search, &idx), ==, NULL); + zio = avl_nearest(tree, idx, AVL_AFTER); + if (zio == NULL) + zio = avl_first(tree); + ASSERT3U(zio->io_priority, ==, p); + + aio = vdev_queue_aggregate(vq, zio); + if (aio != NULL) + zio = aio; + else + vdev_queue_io_remove(vq, zio); + + /* + * If the I/O is or was optional and therefore has no data, we need to + * simply discard it. We need to drop the vdev queue's lock to avoid a + * deadlock that we could encounter since this I/O will complete + * immediately. 
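
The IO_SPAN()/IO_GAP() macros that drive the aggregation walk above are plain offset arithmetic; a short worked example with two reads:

/*
 * fio: offset 0, size 4K;  lio: offset 8K, size 4K.
 *
 *   IO_SPAN(fio, lio) = 8K + 4K - 0 = 12K	(range the pair covers)
 *   IO_GAP(fio, lio)  = -IO_SPAN(lio, fio)
 *                     = -(0 + 4K - 8K) = 4K	(hole between them)
 *
 * The 4K hole is below the default zfs_vdev_read_gap_limit of 32K,
 * so the two reads may be folded into a single 12K aggregate.
 */
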
+	 */
+	if (zio->io_flags & ZIO_FLAG_NODATA) {
+		mutex_exit(&vq->vq_lock);
+		zio_vdev_io_bypass(zio);
+		zio_execute(zio);
+		mutex_enter(&vq->vq_lock);
+		goto again;
+	}
+
+	vdev_queue_pending_add(vq, zio);
+	vq->vq_last_offset = zio->io_offset;
+
+	return (zio);
+}
+
+zio_t *
+vdev_queue_io(zio_t *zio)
+{
+	vdev_queue_t *vq = &zio->io_vd->vdev_queue;
+	zio_t *nio;
+
+	if (zio->io_flags & ZIO_FLAG_DONT_QUEUE)
+		return (zio);
+
+	/*
+	 * Children i/os inherit their parent's priority, which might
+	 * not match the child's i/o type.  Fix it up here.
+	 */
+	if (zio->io_type == ZIO_TYPE_READ) {
+		if (zio->io_priority != ZIO_PRIORITY_SYNC_READ &&
+		    zio->io_priority != ZIO_PRIORITY_ASYNC_READ &&
+		    zio->io_priority != ZIO_PRIORITY_SCRUB &&
+		    zio->io_priority != ZIO_PRIORITY_REMOVAL &&
+		    zio->io_priority != ZIO_PRIORITY_INITIALIZING)
+			zio->io_priority = ZIO_PRIORITY_ASYNC_READ;
+	} else if (zio->io_type == ZIO_TYPE_WRITE) {
+		if (zio->io_priority != ZIO_PRIORITY_SYNC_WRITE &&
+		    zio->io_priority != ZIO_PRIORITY_ASYNC_WRITE &&
+		    zio->io_priority != ZIO_PRIORITY_REMOVAL &&
+		    zio->io_priority != ZIO_PRIORITY_INITIALIZING)
+			zio->io_priority = ZIO_PRIORITY_ASYNC_WRITE;
+	} else {
+		ASSERT(zio->io_type == ZIO_TYPE_FREE);
+		zio->io_priority = ZIO_PRIORITY_TRIM;
+	}
+
+	zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE;
+
+	mutex_enter(&vq->vq_lock);
+	zio->io_timestamp = gethrtime();
+	vdev_queue_io_add(vq, zio);
+	nio = vdev_queue_io_to_issue(vq);
+	mutex_exit(&vq->vq_lock);
+
+	if (nio == NULL)
+		return (NULL);
+
+	if (nio->io_done == vdev_queue_agg_io_done) {
+		zio_nowait(nio);
+		return (NULL);
+	}
+
+	return (nio);
+}
+
+void
+vdev_queue_io_done(zio_t *zio)
+{
+	vdev_queue_t *vq = &zio->io_vd->vdev_queue;
+	zio_t *nio;
+
+	mutex_enter(&vq->vq_lock);
+
+	vdev_queue_pending_remove(vq, zio);
+
+	vq->vq_io_complete_ts = gethrtime();
+
+	while ((nio = vdev_queue_io_to_issue(vq)) != NULL) {
+		mutex_exit(&vq->vq_lock);
+		if (nio->io_done == vdev_queue_agg_io_done) {
+			zio_nowait(nio);
+		} else {
+			zio_vdev_io_reissue(nio);
+			zio_execute(nio);
+		}
+		mutex_enter(&vq->vq_lock);
+	}
+
+	mutex_exit(&vq->vq_lock);
+}
+
+void
+vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority)
+{
+	vdev_queue_t *vq = &zio->io_vd->vdev_queue;
+	avl_tree_t *tree;
+
+	/*
+	 * ZIO_PRIORITY_NOW is used by the vdev cache code and the aggregate zio
+	 * code to issue IOs without adding them to the vdev queue. In this
+	 * case, the zio is already going to be issued as quickly as possible
+	 * and so it doesn't need any reprioritization to help.
+	 */
+	if (zio->io_priority == ZIO_PRIORITY_NOW)
+		return;
+
+	ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
+	ASSERT3U(priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
+
+	if (zio->io_type == ZIO_TYPE_READ) {
+		if (priority != ZIO_PRIORITY_SYNC_READ &&
+		    priority != ZIO_PRIORITY_ASYNC_READ &&
+		    priority != ZIO_PRIORITY_SCRUB)
+			priority = ZIO_PRIORITY_ASYNC_READ;
+	} else {
+		ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+		if (priority != ZIO_PRIORITY_SYNC_WRITE &&
+		    priority != ZIO_PRIORITY_ASYNC_WRITE)
+			priority = ZIO_PRIORITY_ASYNC_WRITE;
+	}
+
+	mutex_enter(&vq->vq_lock);
+
+	/*
+	 * If the zio is in none of the queues we can simply change
+	 * the priority.  If the zio is waiting to be submitted we must
+	 * remove it from the queue and re-insert it with the new priority.
+	 * Otherwise, the zio is currently active and we cannot change its
+	 * priority.
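
The priority fix-up in vdev_queue_io() above is easiest to read as a mapping; two representative cases:

/*
 * ZIO_TYPE_READ, inherited ZIO_PRIORITY_ASYNC_WRITE
 *     -> invalid for a read, rewritten to ZIO_PRIORITY_ASYNC_READ
 * ZIO_TYPE_READ, inherited ZIO_PRIORITY_SCRUB
 *     -> already a valid read priority, left unchanged
 */
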
+	 */
+	tree = vdev_queue_class_tree(vq, zio->io_priority);
+	if (avl_find(tree, zio, NULL) == zio) {
+		avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio);
+		zio->io_priority = priority;
+		avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio);
+	} else if (avl_find(&vq->vq_active_tree, zio, NULL) != zio) {
+		zio->io_priority = priority;
+	}
+
+	mutex_exit(&vq->vq_lock);
+}
+
+/*
+ * As these three methods are only used for load calculations, we're not
+ * concerned if we get an incorrect value on 32bit platforms due to the lack
+ * of vq_lock mutex use here; instead we prefer to keep it lock free for
+ * performance.
+ */
+int
+vdev_queue_length(vdev_t *vd)
+{
+	return (avl_numnodes(&vd->vdev_queue.vq_active_tree));
+}
+
+uint64_t
+vdev_queue_lastoffset(vdev_t *vd)
+{
+	return (vd->vdev_queue.vq_lastoffset);
+}
+
+void
+vdev_queue_register_lastoffset(vdev_t *vd, zio_t *zio)
+{
+	vd->vdev_queue.vq_lastoffset = zio->io_offset + zio->io_size;
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c
new file mode 100644
index 000000000000..29878ea6eaf6
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c
@@ -0,0 +1,2707 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/vdev_impl.h>
+#ifdef illumos
+#include <sys/vdev_disk.h>
+#endif
+#include <sys/vdev_file.h>
+#include <sys/vdev_raidz.h>
+#include <sys/zio.h>
+#include <sys/zio_checksum.h>
+#include <sys/abd.h>
+#include <sys/fs/zfs.h>
+#include <sys/fm/fs/zfs.h>
+#include <sys/bio.h>
+
+#ifdef ZFS_DEBUG
+#include <sys/vdev_initialize.h>	/* vdev_xlate testing */
+#endif
+
+/*
+ * Virtual device vector for RAID-Z.
+ *
+ * This vdev supports single, double, and triple parity. For single parity,
+ * we use a simple XOR of all the data columns. For double or triple parity,
+ * we use a special case of Reed-Solomon coding. This extends the
+ * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
+ * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
+ * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
+ * former is also based. The latter is designed to provide higher performance
+ * for writes.
+ *
+ * Note that the Plank paper claimed to support arbitrary N+M, but was then
+ * amended six years later identifying a critical flaw that invalidates its
+ * claims.
Nevertheless, the technique can be adapted to work for up to
+ * triple parity. For additional parity, the amendment "Note: Correction to
+ * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
+ * is viable, but the additional complexity means that write performance will
+ * suffer.
+ *
+ * All of the methods above operate on a Galois field. In our case we choose
+ * GF(2^8) so that all elements can be expressed with a single byte. Briefly,
+ * the operations on the field are defined as follows:
+ *
+ * o addition (+) is represented by a bitwise XOR
+ * o subtraction (-) is therefore identical to addition: A + B = A - B
+ * o multiplication of A by 2 is defined by the following bitwise expression:
+ *
+ *	(A * 2)_7 = A_6
+ *	(A * 2)_6 = A_5
+ *	(A * 2)_5 = A_4
+ *	(A * 2)_4 = A_3 + A_7
+ *	(A * 2)_3 = A_2 + A_7
+ *	(A * 2)_2 = A_1 + A_7
+ *	(A * 2)_1 = A_0
+ *	(A * 2)_0 = A_7
+ *
+ * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
+ * As an aside, this multiplication is derived from the error correcting
+ * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
+ *
+ * Observe that any number in the field (except for 0) can be expressed as a
+ * power of 2 -- a generator for the field. We store a table of the powers of
+ * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
+ * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
+ * than field addition). The inverse of a field element A (A^-1) is therefore
+ * A ^ (255 - 1) = A^254.
+ *
+ * The up-to-three parity columns, P, Q, R over several data columns,
+ * D_0, ... D_n-1, can be expressed by field operations:
+ *
+ *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
+ *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
+ *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
+ *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
+ *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
+ *
+ * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
+ * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
+ * independent coefficients. (There are no additional coefficients that have
+ * this property which is why the uncorrected Plank method breaks down.)
+ *
+ * See the reconstruction code below for how P, Q and R can be used
+ * individually or in concert to recover missing data columns.
+ */
+
+typedef struct raidz_col {
+	uint64_t rc_devidx;		/* child device index for I/O */
+	uint64_t rc_offset;		/* device offset */
+	uint64_t rc_size;		/* I/O size */
+	abd_t *rc_abd;			/* I/O data */
+	void *rc_gdata;			/* used to store the "good" version */
+	int rc_error;			/* I/O error for this device */
+	uint8_t rc_tried;		/* Did we attempt this I/O column? */
+	uint8_t rc_skipped;		/* Did we skip this I/O column?
*/ +} raidz_col_t; + +typedef struct raidz_map { + uint64_t rm_cols; /* Regular column count */ + uint64_t rm_scols; /* Count including skipped columns */ + uint64_t rm_bigcols; /* Number of oversized columns */ + uint64_t rm_asize; /* Actual total I/O size */ + uint64_t rm_missingdata; /* Count of missing data devices */ + uint64_t rm_missingparity; /* Count of missing parity devices */ + uint64_t rm_firstdatacol; /* First data column/parity count */ + uint64_t rm_nskip; /* Skipped sectors for padding */ + uint64_t rm_skipstart; /* Column index of padding start */ + abd_t *rm_abd_copy; /* rm_asize-buffer of copied data */ + uintptr_t rm_reports; /* # of referencing checksum reports */ + uint8_t rm_freed; /* map no longer has referencing ZIO */ + uint8_t rm_ecksuminjected; /* checksum error was injected */ + raidz_col_t rm_col[1]; /* Flexible array of I/O columns */ +} raidz_map_t; + +#define VDEV_RAIDZ_P 0 +#define VDEV_RAIDZ_Q 1 +#define VDEV_RAIDZ_R 2 + +#define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0)) +#define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x))) + +/* + * We provide a mechanism to perform the field multiplication operation on a + * 64-bit value all at once rather than a byte at a time. This works by + * creating a mask from the top bit in each byte and using that to + * conditionally apply the XOR of 0x1d. + */ +#define VDEV_RAIDZ_64MUL_2(x, mask) \ +{ \ + (mask) = (x) & 0x8080808080808080ULL; \ + (mask) = ((mask) << 1) - ((mask) >> 7); \ + (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \ + ((mask) & 0x1d1d1d1d1d1d1d1d); \ +} + +#define VDEV_RAIDZ_64MUL_4(x, mask) \ +{ \ + VDEV_RAIDZ_64MUL_2((x), mask); \ + VDEV_RAIDZ_64MUL_2((x), mask); \ +} + +#define VDEV_LABEL_OFFSET(x) (x + VDEV_LABEL_START_SIZE) + +/* + * Force reconstruction to use the general purpose method. + */ +int vdev_raidz_default_to_general; + +/* Powers of 2 in the Galois field defined above. 
*/ +static const uint8_t vdev_raidz_pow2[256] = { + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26, + 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9, + 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0, + 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35, + 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23, + 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0, + 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1, + 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc, + 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0, + 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f, + 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2, + 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88, + 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce, + 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93, + 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc, + 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9, + 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54, + 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa, + 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73, + 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e, + 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff, + 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4, + 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41, + 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e, + 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6, + 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef, + 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09, + 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5, + 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16, + 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83, + 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01 +}; +/* Logs of 2 in the Galois field defined above. */ +static const uint8_t vdev_raidz_log2[256] = { + 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6, + 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b, + 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81, + 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71, + 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21, + 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45, + 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9, + 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6, + 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd, + 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88, + 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd, + 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40, + 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e, + 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d, + 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b, + 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57, + 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d, + 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18, + 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c, + 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e, + 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd, + 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61, + 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e, + 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2, + 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76, + 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6, + 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa, + 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a, + 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51, + 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7, + 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8, + 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf, +}; + +static void vdev_raidz_generate_parity(raidz_map_t *rm); + +/* + * Multiply a given number by 2 raised to the given power. 
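
With the two tables above in scope, general multiplication is a pair of table lookups, and the P/Q formulas from the header comment reduce to a short Horner loop. A self-contained sketch (gf_mul2, gf_mul, and gf_parity_pq are illustrative names, not functions in this file):

#include <stdint.h>
#include <stddef.h>

/* Multiply by 2 in GF(2^8), as in the VDEV_RAIDZ_MUL_2() macro. */
static uint8_t
gf_mul2(uint8_t a)
{
	return ((a << 1) ^ ((a & 0x80) ? 0x1d : 0));
}

/* General multiply via the log/pow tables (vdev_raidz_pow2/log2 above). */
static uint8_t
gf_mul(uint8_t a, uint8_t b)
{
	if (a == 0 || b == 0)
		return (0);
	return (vdev_raidz_pow2[(vdev_raidz_log2[a] +
	    vdev_raidz_log2[b]) % 255]);
}

/* P/Q parity over one byte per data column, per the header formulas. */
static void
gf_parity_pq(const uint8_t *d, size_t n, uint8_t *p, uint8_t *q)
{
	size_t i;

	*p = *q = 0;
	for (i = 0; i < n; i++) {
		*p ^= d[i];			/* P: plain XOR */
		*q = gf_mul2(*q) ^ d[i];	/* Q: Horner with generator 2 */
	}
}

The Horner loop makes the Q formula concrete: after the last iteration q holds ((D_0 * 2 + D_1) * 2 + ...) * 2 + D_n-1, exactly the expansion given in the header comment.
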
+ */ +static uint8_t +vdev_raidz_exp2(uint_t a, int exp) +{ + if (a == 0) + return (0); + + ASSERT(exp >= 0); + ASSERT(vdev_raidz_log2[a] > 0 || a == 1); + + exp += vdev_raidz_log2[a]; + if (exp > 255) + exp -= 255; + + return (vdev_raidz_pow2[exp]); +} + +static void +vdev_raidz_map_free(raidz_map_t *rm) +{ + int c; + + for (c = 0; c < rm->rm_firstdatacol; c++) { + if (rm->rm_col[c].rc_abd != NULL) + abd_free(rm->rm_col[c].rc_abd); + + if (rm->rm_col[c].rc_gdata != NULL) + zio_buf_free(rm->rm_col[c].rc_gdata, + rm->rm_col[c].rc_size); + } + + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + if (rm->rm_col[c].rc_abd != NULL) + abd_put(rm->rm_col[c].rc_abd); + } + + if (rm->rm_abd_copy != NULL) + abd_free(rm->rm_abd_copy); + + kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols])); +} + +static void +vdev_raidz_map_free_vsd(zio_t *zio) +{ + raidz_map_t *rm = zio->io_vsd; + + ASSERT0(rm->rm_freed); + rm->rm_freed = 1; + + if (rm->rm_reports == 0) + vdev_raidz_map_free(rm); +} + +/*ARGSUSED*/ +static void +vdev_raidz_cksum_free(void *arg, size_t ignored) +{ + raidz_map_t *rm = arg; + + ASSERT3U(rm->rm_reports, >, 0); + + if (--rm->rm_reports == 0 && rm->rm_freed != 0) + vdev_raidz_map_free(rm); +} + +static void +vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data) +{ + raidz_map_t *rm = zcr->zcr_cbdata; + size_t c = zcr->zcr_cbinfo; + size_t x; + + const char *good = NULL; + char *bad; + + if (good_data == NULL) { + zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE); + return; + } + + if (c < rm->rm_firstdatacol) { + /* + * The first time through, calculate the parity blocks for + * the good data (this relies on the fact that the good + * data never changes for a given logical ZIO) + */ + if (rm->rm_col[0].rc_gdata == NULL) { + abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY]; + char *buf; + int offset; + + /* + * Set up the rm_col[]s to generate the parity for + * good_data, first saving the parity bufs and + * replacing them with buffers to hold the result. + */ + for (x = 0; x < rm->rm_firstdatacol; x++) { + bad_parity[x] = rm->rm_col[x].rc_abd; + rm->rm_col[x].rc_gdata = + zio_buf_alloc(rm->rm_col[x].rc_size); + rm->rm_col[x].rc_abd = + abd_get_from_buf(rm->rm_col[x].rc_gdata, + rm->rm_col[x].rc_size); + } + + /* fill in the data columns from good_data */ + buf = (char *)good_data; + for (; x < rm->rm_cols; x++) { + abd_put(rm->rm_col[x].rc_abd); + rm->rm_col[x].rc_abd = abd_get_from_buf(buf, + rm->rm_col[x].rc_size); + buf += rm->rm_col[x].rc_size; + } + + /* + * Construct the parity from the good data. 
+ */ + vdev_raidz_generate_parity(rm); + + /* restore everything back to its original state */ + for (x = 0; x < rm->rm_firstdatacol; x++) { + abd_put(rm->rm_col[x].rc_abd); + rm->rm_col[x].rc_abd = bad_parity[x]; + } + + offset = 0; + for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) { + abd_put(rm->rm_col[x].rc_abd); + rm->rm_col[x].rc_abd = abd_get_offset( + rm->rm_abd_copy, offset); + offset += rm->rm_col[x].rc_size; + } + } + + ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL); + good = rm->rm_col[c].rc_gdata; + } else { + /* adjust good_data to point at the start of our column */ + good = good_data; + + for (x = rm->rm_firstdatacol; x < c; x++) + good += rm->rm_col[x].rc_size; + } + + bad = abd_borrow_buf_copy(rm->rm_col[c].rc_abd, rm->rm_col[c].rc_size); + /* we drop the ereport if it ends up that the data was good */ + zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE); + abd_return_buf(rm->rm_col[c].rc_abd, bad, rm->rm_col[c].rc_size); +} + +/* + * Invoked indirectly by zfs_ereport_start_checksum(), called + * below when our read operation fails completely. The main point + * is to keep a copy of everything we read from disk, so that at + * vdev_raidz_cksum_finish() time we can compare it with the good data. + */ +static void +vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) +{ + size_t c = (size_t)(uintptr_t)arg; + size_t offset; + + raidz_map_t *rm = zio->io_vsd; + size_t size; + + /* set up the report and bump the refcount */ + zcr->zcr_cbdata = rm; + zcr->zcr_cbinfo = c; + zcr->zcr_finish = vdev_raidz_cksum_finish; + zcr->zcr_free = vdev_raidz_cksum_free; + + rm->rm_reports++; + ASSERT3U(rm->rm_reports, >, 0); + + if (rm->rm_abd_copy != NULL) + return; + + /* + * It's the first time we're called for this raidz_map_t, so we need + * to copy the data aside; there's no guarantee that our zio's buffer + * won't be re-used for something else. + * + * Our parity data is already in separate buffers, so there's no need + * to copy them. + */ + + size = 0; + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) + size += rm->rm_col[c].rc_size; + + rm->rm_abd_copy = + abd_alloc_sametype(rm->rm_col[rm->rm_firstdatacol].rc_abd, size); + + for (offset = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + raidz_col_t *col = &rm->rm_col[c]; + abd_t *tmp = abd_get_offset(rm->rm_abd_copy, offset); + + abd_copy(tmp, col->rc_abd, col->rc_size); + abd_put(col->rc_abd); + col->rc_abd = tmp; + + offset += col->rc_size; + } + ASSERT3U(offset, ==, size); +} + +static const zio_vsd_ops_t vdev_raidz_vsd_ops = { + vdev_raidz_map_free_vsd, + vdev_raidz_cksum_report +}; + +/* + * Divides the IO evenly across all child vdevs; usually, dcols is + * the number of children in the target vdev. + */ +static raidz_map_t * +vdev_raidz_map_alloc(abd_t *abd, uint64_t size, uint64_t offset, boolean_t dofree, + uint64_t unit_shift, uint64_t dcols, uint64_t nparity) +{ + raidz_map_t *rm; + /* The starting RAIDZ (parent) vdev sector of the block. */ + uint64_t b = offset >> unit_shift; + /* The zio's size in units of the vdev's minimum sector size. */ + uint64_t s = size >> unit_shift; + /* The first column for this stripe. */ + uint64_t f = b % dcols; + /* The starting byte offset on each child vdev. */ + uint64_t o = (b / dcols) << unit_shift; + uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; + uint64_t off = 0; + + /* + * "Quotient": The number of data sectors for this stripe on all but + * the "big column" child vdevs that also contain "remainder" data. 
+ */ + q = s / (dcols - nparity); + + /* + * "Remainder": The number of partial stripe data sectors in this I/O. + * This will add a sector to some, but not all, child vdevs. + */ + r = s - q * (dcols - nparity); + + /* The number of "big columns" - those which contain remainder data. */ + bc = (r == 0 ? 0 : r + nparity); + + /* + * The total number of data and parity sectors associated with + * this I/O. + */ + tot = s + nparity * (q + (r == 0 ? 0 : 1)); + + /* acols: The columns that will be accessed. */ + /* scols: The columns that will be accessed or skipped. */ + if (q == 0) { + /* Our I/O request doesn't span all child vdevs. */ + acols = bc; + scols = MIN(dcols, roundup(bc, nparity + 1)); + } else { + acols = dcols; + scols = dcols; + } + + ASSERT3U(acols, <=, scols); + + rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP); + + rm->rm_cols = acols; + rm->rm_scols = scols; + rm->rm_bigcols = bc; + rm->rm_skipstart = bc; + rm->rm_missingdata = 0; + rm->rm_missingparity = 0; + rm->rm_firstdatacol = nparity; + rm->rm_abd_copy = NULL; + rm->rm_reports = 0; + rm->rm_freed = 0; + rm->rm_ecksuminjected = 0; + + asize = 0; + + for (c = 0; c < scols; c++) { + col = f + c; + coff = o; + if (col >= dcols) { + col -= dcols; + coff += 1ULL << unit_shift; + } + rm->rm_col[c].rc_devidx = col; + rm->rm_col[c].rc_offset = coff; + rm->rm_col[c].rc_abd = NULL; + rm->rm_col[c].rc_gdata = NULL; + rm->rm_col[c].rc_error = 0; + rm->rm_col[c].rc_tried = 0; + rm->rm_col[c].rc_skipped = 0; + + if (c >= acols) + rm->rm_col[c].rc_size = 0; + else if (c < bc) + rm->rm_col[c].rc_size = (q + 1) << unit_shift; + else + rm->rm_col[c].rc_size = q << unit_shift; + + asize += rm->rm_col[c].rc_size; + } + + ASSERT3U(asize, ==, tot << unit_shift); + rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift); + rm->rm_nskip = roundup(tot, nparity + 1) - tot; + ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift); + ASSERT3U(rm->rm_nskip, <=, nparity); + + if (!dofree) { + for (c = 0; c < rm->rm_firstdatacol; c++) { + rm->rm_col[c].rc_abd = + abd_alloc_linear(rm->rm_col[c].rc_size, B_TRUE); + } + + for (off = 0, c = rm->rm_firstdatacol; c < acols; c++) { + rm->rm_col[c].rc_abd = abd_get_offset(abd, off); + off += rm->rm_col[c].rc_size; + } + } + + /* + * If all data stored spans all columns, there's a danger that parity + * will always be on the same device and, since parity isn't read + * during normal operation, that that device's I/O bandwidth won't be + * used effectively. We therefore switch the parity every 1MB. + * + * ... at least that was, ostensibly, the theory. As a practical + * matter unless we juggle the parity between all devices evenly, we + * won't see any benefit. Further, occasional writes that aren't a + * multiple of the LCM of the number of children and the minimum + * stripe width are sufficient to avoid pessimal behavior. + * Unfortunately, this decision created an implicit on-disk format + * requirement that we need to support for all eternity, but only + * for single-parity RAID-Z. + * + * If we intend to skip a sector in the zeroth column for padding + * we must make sure to note this swap. We will never intend to + * skip the first column since at least one data and one parity + * column must appear in each row. 
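To make the quotient/remainder bookkeeping above concrete, here is a hypothetical stand-alone rework of the same arithmetic for a two-sector write on a 5-wide raidz1 with 512-byte sectors; the variable names follow vdev_raidz_map_alloc(), but the program itself is illustrative only:

#include <stdio.h>

#define	ROUNDUP(x, y)	((((x) + (y) - 1) / (y)) * (y))
#define	MIN(a, b)	((a) < (b) ? (a) : (b))

int
main(void)
{
	unsigned long long dcols = 5, nparity = 1, unit_shift = 9;
	unsigned long long size = 2ULL << unit_shift;	/* two data sectors */
	unsigned long long s = size >> unit_shift;
	unsigned long long q = s / (dcols - nparity);
	unsigned long long r = s - q * (dcols - nparity);
	unsigned long long bc = (r == 0 ? 0 : r + nparity);
	unsigned long long tot = s + nparity * (q + (r == 0 ? 0 : 1));
	unsigned long long acols = (q == 0 ? bc : dcols);
	unsigned long long scols = (q == 0 ?
	    MIN(dcols, ROUNDUP(bc, nparity + 1)) : dcols);
	unsigned long long nskip = ROUNDUP(tot, nparity + 1) - tot;

	/* Prints: s=2 q=0 r=2 bc=3 tot=3 acols=3 scols=4 nskip=1 */
	printf("s=%llu q=%llu r=%llu bc=%llu tot=%llu "
	    "acols=%llu scols=%llu nskip=%llu\n",
	    s, q, r, bc, tot, acols, scols, nskip);
	return (0);
}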
+ */ + ASSERT(rm->rm_cols >= 2); + ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); + + if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) { + devidx = rm->rm_col[0].rc_devidx; + o = rm->rm_col[0].rc_offset; + rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx; + rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset; + rm->rm_col[1].rc_devidx = devidx; + rm->rm_col[1].rc_offset = o; + + if (rm->rm_skipstart == 0) + rm->rm_skipstart = 1; + } + + return (rm); +} + +struct pqr_struct { + uint64_t *p; + uint64_t *q; + uint64_t *r; +}; + +static int +vdev_raidz_p_func(void *buf, size_t size, void *private) +{ + struct pqr_struct *pqr = private; + const uint64_t *src = buf; + int i, cnt = size / sizeof (src[0]); + + ASSERT(pqr->p && !pqr->q && !pqr->r); + + for (i = 0; i < cnt; i++, src++, pqr->p++) + *pqr->p ^= *src; + + return (0); +} + +static int +vdev_raidz_pq_func(void *buf, size_t size, void *private) +{ + struct pqr_struct *pqr = private; + const uint64_t *src = buf; + uint64_t mask; + int i, cnt = size / sizeof (src[0]); + + ASSERT(pqr->p && pqr->q && !pqr->r); + + for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) { + *pqr->p ^= *src; + VDEV_RAIDZ_64MUL_2(*pqr->q, mask); + *pqr->q ^= *src; + } + + return (0); +} + +static int +vdev_raidz_pqr_func(void *buf, size_t size, void *private) +{ + struct pqr_struct *pqr = private; + const uint64_t *src = buf; + uint64_t mask; + int i, cnt = size / sizeof (src[0]); + + ASSERT(pqr->p && pqr->q && pqr->r); + + for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) { + *pqr->p ^= *src; + VDEV_RAIDZ_64MUL_2(*pqr->q, mask); + *pqr->q ^= *src; + VDEV_RAIDZ_64MUL_4(*pqr->r, mask); + *pqr->r ^= *src; + } + + return (0); +} + +static void +vdev_raidz_generate_parity_p(raidz_map_t *rm) +{ + uint64_t *p; + int c; + abd_t *src; + + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + src = rm->rm_col[c].rc_abd; + p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); + + if (c == rm->rm_firstdatacol) { + abd_copy_to_buf(p, src, rm->rm_col[c].rc_size); + } else { + struct pqr_struct pqr = { p, NULL, NULL }; + (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size, + vdev_raidz_p_func, &pqr); + } + } +} + +static void +vdev_raidz_generate_parity_pq(raidz_map_t *rm) +{ + uint64_t *p, *q, pcnt, ccnt, mask, i; + int c; + abd_t *src; + + pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); + ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == + rm->rm_col[VDEV_RAIDZ_Q].rc_size); + + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + src = rm->rm_col[c].rc_abd; + p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); + q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); + + ccnt = rm->rm_col[c].rc_size / sizeof (p[0]); + + if (c == rm->rm_firstdatacol) { + abd_copy_to_buf(p, src, rm->rm_col[c].rc_size); + (void) memcpy(q, p, rm->rm_col[c].rc_size); + } else { + struct pqr_struct pqr = { p, q, NULL }; + (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size, + vdev_raidz_pq_func, &pqr); + } + + if (c == rm->rm_firstdatacol) { + for (i = ccnt; i < pcnt; i++) { + p[i] = 0; + q[i] = 0; + } + } else { + /* + * Treat short columns as though they are full of 0s. + * Note that there's therefore nothing needed for P. 
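The generate_parity_pq/pqr loops that follow are a Horner evaluation: Q is doubled before each new column is folded in, which leaves data column i weighted by 2^(ndata - 1 - i) without ever materializing the coefficients. A byte-wide stand-alone sketch of the same recurrence (illustrative only; names are hypothetical):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define	NDATA	4
#define	COLSZ	8

static uint8_t
gf_mul2(uint8_t x)
{
	return ((uint8_t)((x << 1) ^ ((x & 0x80) ? 0x1d : 0)));
}

/*
 * P is plain XOR; Q doubles before folding in each column, so data
 * column i ends up weighted by 2^(NDATA - 1 - i).
 */
static void
gen_pq(uint8_t data[NDATA][COLSZ], uint8_t *p, uint8_t *q)
{
	memcpy(p, data[0], COLSZ);
	memcpy(q, data[0], COLSZ);

	for (int c = 1; c < NDATA; c++) {
		for (int i = 0; i < COLSZ; i++) {
			p[i] ^= data[c][i];
			q[i] = gf_mul2(q[i]) ^ data[c][i];
		}
	}
}

int
main(void)
{
	uint8_t data[NDATA][COLSZ], p[COLSZ], q[COLSZ];

	for (int c = 0; c < NDATA; c++)
		for (int i = 0; i < COLSZ; i++)
			data[c][i] = (uint8_t)(17 * c + i + 1);

	gen_pq(data, p, q);
	printf("P[0]=%02x Q[0]=%02x\n", p[0], q[0]);
	return (0);
}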
+ */ + for (i = ccnt; i < pcnt; i++) { + VDEV_RAIDZ_64MUL_2(q[i], mask); + } + } + } +} + +static void +vdev_raidz_generate_parity_pqr(raidz_map_t *rm) +{ + uint64_t *p, *q, *r, pcnt, ccnt, mask, i; + int c; + abd_t *src; + + pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); + ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == + rm->rm_col[VDEV_RAIDZ_Q].rc_size); + ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == + rm->rm_col[VDEV_RAIDZ_R].rc_size); + + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + src = rm->rm_col[c].rc_abd; + p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); + q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); + r = abd_to_buf(rm->rm_col[VDEV_RAIDZ_R].rc_abd); + + ccnt = rm->rm_col[c].rc_size / sizeof (p[0]); + + if (c == rm->rm_firstdatacol) { + abd_copy_to_buf(p, src, rm->rm_col[c].rc_size); + (void) memcpy(q, p, rm->rm_col[c].rc_size); + (void) memcpy(r, p, rm->rm_col[c].rc_size); + } else { + struct pqr_struct pqr = { p, q, r }; + (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size, + vdev_raidz_pqr_func, &pqr); + } + + if (c == rm->rm_firstdatacol) { + for (i = ccnt; i < pcnt; i++) { + p[i] = 0; + q[i] = 0; + r[i] = 0; + } + } else { + /* + * Treat short columns as though they are full of 0s. + * Note that there's therefore nothing needed for P. + */ + for (i = ccnt; i < pcnt; i++) { + VDEV_RAIDZ_64MUL_2(q[i], mask); + VDEV_RAIDZ_64MUL_4(r[i], mask); + } + } + } +} + +/* + * Generate RAID parity in the first virtual columns according to the number of + * parity columns available. + */ +static void +vdev_raidz_generate_parity(raidz_map_t *rm) +{ + switch (rm->rm_firstdatacol) { + case 1: + vdev_raidz_generate_parity_p(rm); + break; + case 2: + vdev_raidz_generate_parity_pq(rm); + break; + case 3: + vdev_raidz_generate_parity_pqr(rm); + break; + default: + cmn_err(CE_PANIC, "invalid RAID-Z configuration"); + } +} + +/* ARGSUSED */ +static int +vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private) +{ + uint64_t *dst = dbuf; + uint64_t *src = sbuf; + int cnt = size / sizeof (src[0]); + + for (int i = 0; i < cnt; i++) { + dst[i] ^= src[i]; + } + + return (0); +} + +/* ARGSUSED */ +static int +vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size, + void *private) +{ + uint64_t *dst = dbuf; + uint64_t *src = sbuf; + uint64_t mask; + int cnt = size / sizeof (dst[0]); + + for (int i = 0; i < cnt; i++, dst++, src++) { + VDEV_RAIDZ_64MUL_2(*dst, mask); + *dst ^= *src; + } + + return (0); +} + +/* ARGSUSED */ +static int +vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private) +{ + uint64_t *dst = buf; + uint64_t mask; + int cnt = size / sizeof (dst[0]); + + for (int i = 0; i < cnt; i++, dst++) { + /* same operation as vdev_raidz_reconst_q_pre_func() on dst */ + VDEV_RAIDZ_64MUL_2(*dst, mask); + } + + return (0); +} + +struct reconst_q_struct { + uint64_t *q; + int exp; +}; + +static int +vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private) +{ + struct reconst_q_struct *rq = private; + uint64_t *dst = buf; + int cnt = size / sizeof (dst[0]); + + for (int i = 0; i < cnt; i++, dst++, rq->q++) { + *dst ^= *rq->q; + + int j; + uint8_t *b; + for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { + *b = vdev_raidz_exp2(*b, rq->exp); + } + } + + return (0); +} + +struct reconst_pq_struct { + uint8_t *p; + uint8_t *q; + uint8_t *pxy; + uint8_t *qxy; + int aexp; + int bexp; +}; + +static int +vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private) +{ + struct reconst_pq_struct *rpq = private; + 
uint8_t *xd = xbuf; + uint8_t *yd = ybuf; + + for (int i = 0; i < size; + i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) { + *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^ + vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp); + *yd = *rpq->p ^ *rpq->pxy ^ *xd; + } + + return (0); +} + +static int +vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private) +{ + struct reconst_pq_struct *rpq = private; + uint8_t *xd = xbuf; + + for (int i = 0; i < size; + i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) { + /* same operation as vdev_raidz_reconst_pq_func() on xd */ + *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^ + vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp); + } + + return (0); +} + +static int +vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts) +{ + int x = tgts[0]; + int c; + abd_t *dst, *src; + + ASSERT(ntgts == 1); + ASSERT(x >= rm->rm_firstdatacol); + ASSERT(x < rm->rm_cols); + + ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_P].rc_size); + ASSERT(rm->rm_col[x].rc_size > 0); + + src = rm->rm_col[VDEV_RAIDZ_P].rc_abd; + dst = rm->rm_col[x].rc_abd; + + abd_copy(dst, src, rm->rm_col[x].rc_size); + + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + uint64_t size = MIN(rm->rm_col[x].rc_size, + rm->rm_col[c].rc_size); + + src = rm->rm_col[c].rc_abd; + dst = rm->rm_col[x].rc_abd; + + if (c == x) + continue; + + (void) abd_iterate_func2(dst, src, 0, 0, size, + vdev_raidz_reconst_p_func, NULL); + } + + return (1 << VDEV_RAIDZ_P); +} + +static int +vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts) +{ + int x = tgts[0]; + int c, exp; + abd_t *dst, *src; + + ASSERT(ntgts == 1); + + ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_Q].rc_size); + + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + uint64_t size = (c == x) ? 0 : MIN(rm->rm_col[x].rc_size, + rm->rm_col[c].rc_size); + + src = rm->rm_col[c].rc_abd; + dst = rm->rm_col[x].rc_abd; + + if (c == rm->rm_firstdatacol) { + abd_copy(dst, src, size); + if (rm->rm_col[x].rc_size > size) + abd_zero_off(dst, size, + rm->rm_col[x].rc_size - size); + } else { + ASSERT3U(size, <=, rm->rm_col[x].rc_size); + (void) abd_iterate_func2(dst, src, 0, 0, size, + vdev_raidz_reconst_q_pre_func, NULL); + (void) abd_iterate_func(dst, + size, rm->rm_col[x].rc_size - size, + vdev_raidz_reconst_q_pre_tail_func, NULL); + } + } + + src = rm->rm_col[VDEV_RAIDZ_Q].rc_abd; + dst = rm->rm_col[x].rc_abd; + exp = 255 - (rm->rm_cols - 1 - x); + + struct reconst_q_struct rq = { abd_to_buf(src), exp }; + (void) abd_iterate_func(dst, 0, rm->rm_col[x].rc_size, + vdev_raidz_reconst_q_post_func, &rq); + + return (1 << VDEV_RAIDZ_Q); +} + +static int +vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) +{ + uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp; + abd_t *pdata, *qdata; + uint64_t xsize, ysize; + int x = tgts[0]; + int y = tgts[1]; + abd_t *xd, *yd; + + ASSERT(ntgts == 2); + ASSERT(x < y); + ASSERT(x >= rm->rm_firstdatacol); + ASSERT(y < rm->rm_cols); + + ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size); + + /* + * Move the parity data aside -- we're going to compute parity as + * though columns x and y were full of zeros -- Pxy and Qxy. We want to + * reuse the parity generation mechanism without trashing the actual + * parity so we make those columns appear to be full of zeros by + * setting their lengths to zero. 
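vdev_raidz_reconstruct_p() above is pure XOR algebra: since P is the XOR of all data columns, a missing column is recovered as P XORed with every surviving column. A stand-alone round-trip of that identity (a hypothetical sketch, not part of this file):

#include <assert.h>
#include <stdint.h>
#include <string.h>

#define	NDATA	4
#define	COLSZ	8

int
main(void)
{
	uint8_t data[NDATA][COLSZ], p[COLSZ], rec[COLSZ];
	int x = 2;			/* the column we will "lose" */

	for (int c = 0; c < NDATA; c++)
		for (int i = 0; i < COLSZ; i++)
			data[c][i] = (uint8_t)(31 * c + i + 7);

	/* P parity: XOR of all data columns. */
	memset(p, 0, COLSZ);
	for (int c = 0; c < NDATA; c++)
		for (int i = 0; i < COLSZ; i++)
			p[i] ^= data[c][i];

	/* Recover column x: start from P, fold in every survivor. */
	memcpy(rec, p, COLSZ);
	for (int c = 0; c < NDATA; c++) {
		if (c == x)
			continue;
		for (int i = 0; i < COLSZ; i++)
			rec[i] ^= data[c][i];
	}
	assert(memcmp(rec, data[x], COLSZ) == 0);
	return (0);
}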
+ */ + pdata = rm->rm_col[VDEV_RAIDZ_P].rc_abd; + qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_abd; + xsize = rm->rm_col[x].rc_size; + ysize = rm->rm_col[y].rc_size; + + rm->rm_col[VDEV_RAIDZ_P].rc_abd = + abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_P].rc_size, B_TRUE); + rm->rm_col[VDEV_RAIDZ_Q].rc_abd = + abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_Q].rc_size, B_TRUE); + rm->rm_col[x].rc_size = 0; + rm->rm_col[y].rc_size = 0; + + vdev_raidz_generate_parity_pq(rm); + + rm->rm_col[x].rc_size = xsize; + rm->rm_col[y].rc_size = ysize; + + p = abd_to_buf(pdata); + q = abd_to_buf(qdata); + pxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); + qxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); + xd = rm->rm_col[x].rc_abd; + yd = rm->rm_col[y].rc_abd; + + /* + * We now have: + * Pxy = P + D_x + D_y + * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y + * + * We can then solve for D_x: + * D_x = A * (P + Pxy) + B * (Q + Qxy) + * where + * A = 2^(x - y) * (2^(x - y) + 1)^-1 + * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1 + * + * With D_x in hand, we can easily solve for D_y: + * D_y = P + Pxy + D_x + */ + + a = vdev_raidz_pow2[255 + x - y]; + b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)]; + tmp = 255 - vdev_raidz_log2[a ^ 1]; + + aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; + bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)]; + + ASSERT3U(xsize, >=, ysize); + struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp }; + (void) abd_iterate_func2(xd, yd, 0, 0, ysize, + vdev_raidz_reconst_pq_func, &rpq); + (void) abd_iterate_func(xd, ysize, xsize - ysize, + vdev_raidz_reconst_pq_tail_func, &rpq); + + abd_free(rm->rm_col[VDEV_RAIDZ_P].rc_abd); + abd_free(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); + + /* + * Restore the saved parity data. + */ + rm->rm_col[VDEV_RAIDZ_P].rc_abd = pdata; + rm->rm_col[VDEV_RAIDZ_Q].rc_abd = qdata; + + return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q)); +} + +/* BEGIN CSTYLED */ +/* + * In the general case of reconstruction, we must solve the system of linear + * equations defined by the coefficients used to generate parity as well as + * the contents of the data and parity disks. This can be expressed with + * vectors for the original data (D) and the actual data (d) and parity (p) + * and a matrix composed of the identity matrix (I) and a dispersal matrix (V): + * + * __ __ __ __ + * | | __ __ | p_0 | + * | V | | D_0 | | p_m-1 | + * | | x | : | = | d_0 | + * | I | | D_n-1 | | : | + * | | ~~ ~~ | d_n-1 | + * ~~ ~~ ~~ ~~ + * + * I is simply a square identity matrix of size n, and V is a Vandermonde + * matrix defined by the coefficients we chose for the various parity columns + * (1, 2, 4). Note that these values were chosen for simplicity and speed of + * computation, as well as for linear separability. + * + * __ __ __ __ + * | 1 .. 1 1 1 | | p_0 | + * | 2^n-1 .. 4 2 1 | __ __ | : | + * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 | + * | 1 .. 0 0 0 | | D_1 | | d_0 | + * | 0 .. 0 0 0 | x | D_2 | = | d_1 | + * | : : : : | | : | | d_2 | + * | 0 .. 1 0 0 | | D_n-1 | | : | + * | 0 .. 0 1 0 | ~~ ~~ | : | + * | 0 .. 0 0 1 | | d_n-1 | + * ~~ ~~ ~~ ~~ + * + * Note that I, V, d, and p are known. To compute D, we must invert the + * matrix and use the known data and parity values to reconstruct the unknown + * data values. We begin by removing the rows in V|I and d|p that correspond + * to failed or missing columns; we then make V|I square (n x n) and d|p + * sized n by removing rows corresponding to unused parity from the bottom up + * to generate (V|I)' and (d|p)'.
We can then generate the inverse of (V|I)' + * using Gauss-Jordan elimination. In the example below we use m=3 parity + * columns, n=8 data columns, with errors in d_1, d_2, and p_1: + * __ __ + * | 1 1 1 1 1 1 1 1 | + * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks + * | 19 205 116 29 64 16 4 1 | / / + * | 1 0 0 0 0 0 0 0 | / / + * | 0 1 0 0 0 0 0 0 | <--' / + * (V|I) = | 0 0 1 0 0 0 0 0 | <---' + * | 0 0 0 1 0 0 0 0 | + * | 0 0 0 0 1 0 0 0 | + * | 0 0 0 0 0 1 0 0 | + * | 0 0 0 0 0 0 1 0 | + * | 0 0 0 0 0 0 0 1 | + * ~~ ~~ + * __ __ + * | 1 1 1 1 1 1 1 1 | + * | 19 205 116 29 64 16 4 1 | + * | 1 0 0 0 0 0 0 0 | + * (V|I)' = | 0 0 0 1 0 0 0 0 | + * | 0 0 0 0 1 0 0 0 | + * | 0 0 0 0 0 1 0 0 | + * | 0 0 0 0 0 0 1 0 | + * | 0 0 0 0 0 0 0 1 | + * ~~ ~~ + * + * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We + * have carefully chosen the seed values 1, 2, and 4 to ensure that this + * matrix is not singular. + * __ __ + * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | + * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | + * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | + * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | + * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | + * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | + * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | + * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | + * ~~ ~~ + * __ __ + * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | + * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | + * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | + * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | + * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | + * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | + * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | + * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | + * ~~ ~~ + * __ __ + * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | + * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | + * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 | + * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | + * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | + * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | + * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | + * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | + * ~~ ~~ + * __ __ + * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | + * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | + * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 | + * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | + * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | + * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | + * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | + * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | + * ~~ ~~ + * __ __ + * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | + * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | + * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | + * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | + * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | + * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | + * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | + * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | + * ~~ ~~ + * __ __ + * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | + * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 | + * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | + * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | + * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | + * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | + * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | + * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | + * ~~ ~~ + * __ __ + * | 0 0 1 0 0 0 0 0 | + * | 167 100 5 41 159 169 217 208 | + * | 166 100 4 40 158 168 216 209 | + * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 | + * | 0 0 0 0 1 0 0 0 | + * | 0 0 0 0 0 1 0 0 | + * | 0 0 0 0 0 0 1 0 | + * | 0 0 0 0 0 0 0 1 | + * ~~ ~~ + * + * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values + * of the missing data. 
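The elimination shown above can be reproduced in miniature. Below is a hypothetical stand-alone sketch of Gauss-Jordan inversion over GF(2^8) for the simplest interesting case, a two-data-column array with both data columns missing: the rows are the P row (all ones) and the Q row (powers of two), as vdev_raidz_matrix_init() would build them, augmented with the identity. Addition and subtraction in this field are both XOR, and an inverse is a^254, since a^255 = 1:

#include <assert.h>
#include <stdint.h>

#define	N	2			/* two missing data columns */

/* Shift-and-add multiply with the 0x1d reduction (no tables needed). */
static uint8_t
gf_mul(uint8_t a, uint8_t b)
{
	uint8_t r = 0;

	while (b != 0) {
		if (b & 1)
			r ^= a;
		a = (uint8_t)((a << 1) ^ ((a & 0x80) ? 0x1d : 0));
		b >>= 1;
	}
	return (r);
}

static uint8_t
gf_inv(uint8_t a)
{
	uint8_t r = 1;

	for (int i = 0; i < 254; i++)	/* a^254 == a^-1 */
		r = gf_mul(r, a);
	return (r);
}

int
main(void)
{
	/* [ V | I ]: P row (all ones) and Q row (powers of two). */
	uint8_t m[N][2 * N] = {
		{ 1, 1, 1, 0 },
		{ 2, 1, 0, 1 },
	};

	for (int i = 0; i < N; i++) {
		uint8_t piv = gf_inv(m[i][i]);

		for (int j = 0; j < 2 * N; j++)
			m[i][j] = gf_mul(m[i][j], piv);
		for (int ii = 0; ii < N; ii++) {
			uint8_t f = m[ii][i];

			if (ii == i || f == 0)
				continue;
			for (int j = 0; j < 2 * N; j++)
				m[ii][j] ^= gf_mul(f, m[i][j]);
		}
	}

	/* Left half is now the identity; right half is (V|I)'^-1. */
	assert(m[0][0] == 1 && m[0][1] == 0);
	assert(m[1][0] == 0 && m[1][1] == 1);
	return (0);
}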
+ * + * As is apparent from the example above, the only non-trivial rows in the + * inverse matrix correspond to the data disks that we're trying to + * reconstruct. Indeed, those are the only rows we need as the others would + * only be useful for reconstructing data known or assumed to be valid. For + * that reason, we only build the coefficients in the rows that correspond to + * targeted columns. + */ +/* END CSTYLED */ + +static void +vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map, + uint8_t **rows) +{ + int i, j; + int pow; + + ASSERT(n == rm->rm_cols - rm->rm_firstdatacol); + + /* + * Fill in the missing rows of interest. + */ + for (i = 0; i < nmap; i++) { + ASSERT3S(0, <=, map[i]); + ASSERT3S(map[i], <=, 2); + + pow = map[i] * n; + if (pow > 255) + pow -= 255; + ASSERT(pow <= 255); + + for (j = 0; j < n; j++) { + pow -= map[i]; + if (pow < 0) + pow += 255; + rows[i][j] = vdev_raidz_pow2[pow]; + } + } +} + +static void +vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing, + uint8_t **rows, uint8_t **invrows, const uint8_t *used) +{ + int i, j, ii, jj; + uint8_t log; + + /* + * Assert that the first nmissing entries from the array of used + * columns correspond to parity columns and that subsequent entries + * correspond to data columns. + */ + for (i = 0; i < nmissing; i++) { + ASSERT3S(used[i], <, rm->rm_firstdatacol); + } + for (; i < n; i++) { + ASSERT3S(used[i], >=, rm->rm_firstdatacol); + } + + /* + * First initialize the storage where we'll compute the inverse rows. + */ + for (i = 0; i < nmissing; i++) { + for (j = 0; j < n; j++) { + invrows[i][j] = (i == j) ? 1 : 0; + } + } + + /* + * Subtract all trivial rows from the rows of consequence. + */ + for (i = 0; i < nmissing; i++) { + for (j = nmissing; j < n; j++) { + ASSERT3U(used[j], >=, rm->rm_firstdatacol); + jj = used[j] - rm->rm_firstdatacol; + ASSERT3S(jj, <, n); + invrows[i][j] = rows[i][jj]; + rows[i][jj] = 0; + } + } + + /* + * For each of the rows of interest, we must normalize it and subtract + * a multiple of it from the other rows. + */ + for (i = 0; i < nmissing; i++) { + for (j = 0; j < missing[i]; j++) { + ASSERT0(rows[i][j]); + } + ASSERT3U(rows[i][missing[i]], !=, 0); + + /* + * Compute the inverse of the first element and multiply each + * element in the row by that value. + */ + log = 255 - vdev_raidz_log2[rows[i][missing[i]]]; + + for (j = 0; j < n; j++) { + rows[i][j] = vdev_raidz_exp2(rows[i][j], log); + invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log); + } + + for (ii = 0; ii < nmissing; ii++) { + if (i == ii) + continue; + + ASSERT3U(rows[ii][missing[i]], !=, 0); + + log = vdev_raidz_log2[rows[ii][missing[i]]]; + + for (j = 0; j < n; j++) { + rows[ii][j] ^= + vdev_raidz_exp2(rows[i][j], log); + invrows[ii][j] ^= + vdev_raidz_exp2(invrows[i][j], log); + } + } + } + + /* + * Verify that the data left in the rows forms part of + * an identity matrix.
+ */ + for (i = 0; i < nmissing; i++) { + for (j = 0; j < n; j++) { + if (j == missing[i]) { + ASSERT3U(rows[i][j], ==, 1); + } else { + ASSERT0(rows[i][j]); + } + } + } +} + +static void +vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, + int *missing, uint8_t **invrows, const uint8_t *used) +{ + int i, j, x, cc, c; + uint8_t *src; + uint64_t ccount; + uint8_t *dst[VDEV_RAIDZ_MAXPARITY]; + uint64_t dcount[VDEV_RAIDZ_MAXPARITY]; + uint8_t log = 0; + uint8_t val; + int ll; + uint8_t *invlog[VDEV_RAIDZ_MAXPARITY]; + uint8_t *p, *pp; + size_t psize; + + psize = sizeof (invlog[0][0]) * n * nmissing; + p = kmem_alloc(psize, KM_SLEEP); + + for (pp = p, i = 0; i < nmissing; i++) { + invlog[i] = pp; + pp += n; + } + + for (i = 0; i < nmissing; i++) { + for (j = 0; j < n; j++) { + ASSERT3U(invrows[i][j], !=, 0); + invlog[i][j] = vdev_raidz_log2[invrows[i][j]]; + } + } + + for (i = 0; i < n; i++) { + c = used[i]; + ASSERT3U(c, <, rm->rm_cols); + + src = abd_to_buf(rm->rm_col[c].rc_abd); + ccount = rm->rm_col[c].rc_size; + for (j = 0; j < nmissing; j++) { + cc = missing[j] + rm->rm_firstdatacol; + ASSERT3U(cc, >=, rm->rm_firstdatacol); + ASSERT3U(cc, <, rm->rm_cols); + ASSERT3U(cc, !=, c); + + dst[j] = abd_to_buf(rm->rm_col[cc].rc_abd); + dcount[j] = rm->rm_col[cc].rc_size; + } + + ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0); + + for (x = 0; x < ccount; x++, src++) { + if (*src != 0) + log = vdev_raidz_log2[*src]; + + for (cc = 0; cc < nmissing; cc++) { + if (x >= dcount[cc]) + continue; + + if (*src == 0) { + val = 0; + } else { + if ((ll = log + invlog[cc][i]) >= 255) + ll -= 255; + val = vdev_raidz_pow2[ll]; + } + + if (i == 0) + dst[cc][x] = val; + else + dst[cc][x] ^= val; + } + } + } + + kmem_free(p, psize); +} + +static int +vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) +{ + int n, i, c, t, tt; + int nmissing_rows; + int missing_rows[VDEV_RAIDZ_MAXPARITY]; + int parity_map[VDEV_RAIDZ_MAXPARITY]; + + uint8_t *p, *pp; + size_t psize; + + uint8_t *rows[VDEV_RAIDZ_MAXPARITY]; + uint8_t *invrows[VDEV_RAIDZ_MAXPARITY]; + uint8_t *used; + + abd_t **bufs = NULL; + + int code = 0; + + /* + * Matrix reconstruction can't use scatter ABDs yet, so we allocate + * temporary linear ABDs. + */ + if (!abd_is_linear(rm->rm_col[rm->rm_firstdatacol].rc_abd)) { + bufs = kmem_alloc(rm->rm_cols * sizeof (abd_t *), KM_PUSHPAGE); + + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + raidz_col_t *col = &rm->rm_col[c]; + + bufs[c] = col->rc_abd; + col->rc_abd = abd_alloc_linear(col->rc_size, B_TRUE); + abd_copy(col->rc_abd, bufs[c], col->rc_size); + } + } + + n = rm->rm_cols - rm->rm_firstdatacol; + + /* + * Figure out which data columns are missing. + */ + nmissing_rows = 0; + for (t = 0; t < ntgts; t++) { + if (tgts[t] >= rm->rm_firstdatacol) { + missing_rows[nmissing_rows++] = + tgts[t] - rm->rm_firstdatacol; + } + } + + /* + * Figure out which parity columns to use to help generate the missing + * data columns. + */ + for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) { + ASSERT(tt < ntgts); + ASSERT(c < rm->rm_firstdatacol); + + /* + * Skip any targeted parity columns. 
+ */ + if (c == tgts[tt]) { + tt++; + continue; + } + + code |= 1 << c; + + parity_map[i] = c; + i++; + } + + ASSERT(code != 0); + ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY); + + psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) * + nmissing_rows * n + sizeof (used[0]) * n; + p = kmem_alloc(psize, KM_SLEEP); + + for (pp = p, i = 0; i < nmissing_rows; i++) { + rows[i] = pp; + pp += n; + invrows[i] = pp; + pp += n; + } + used = pp; + + for (i = 0; i < nmissing_rows; i++) { + used[i] = parity_map[i]; + } + + for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + if (tt < nmissing_rows && + c == missing_rows[tt] + rm->rm_firstdatacol) { + tt++; + continue; + } + + ASSERT3S(i, <, n); + used[i] = c; + i++; + } + + /* + * Initialize the interesting rows of the matrix. + */ + vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows); + + /* + * Invert the matrix. + */ + vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows, + invrows, used); + + /* + * Reconstruct the missing data using the generated matrix. + */ + vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows, + invrows, used); + + kmem_free(p, psize); + + /* + * copy back from temporary linear abds and free them + */ + if (bufs) { + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + raidz_col_t *col = &rm->rm_col[c]; + + abd_copy(bufs[c], col->rc_abd, col->rc_size); + abd_free(col->rc_abd); + col->rc_abd = bufs[c]; + } + kmem_free(bufs, rm->rm_cols * sizeof (abd_t *)); + } + + return (code); +} + +static int +vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt) +{ + int tgts[VDEV_RAIDZ_MAXPARITY], *dt; + int ntgts; + int i, c; + int code; + int nbadparity, nbaddata; + int parity_valid[VDEV_RAIDZ_MAXPARITY]; + + /* + * The tgts list must already be sorted. + */ + for (i = 1; i < nt; i++) { + ASSERT(t[i] > t[i - 1]); + } + + nbadparity = rm->rm_firstdatacol; + nbaddata = rm->rm_cols - nbadparity; + ntgts = 0; + for (i = 0, c = 0; c < rm->rm_cols; c++) { + if (c < rm->rm_firstdatacol) + parity_valid[c] = B_FALSE; + + if (i < nt && c == t[i]) { + tgts[ntgts++] = c; + i++; + } else if (rm->rm_col[c].rc_error != 0) { + tgts[ntgts++] = c; + } else if (c >= rm->rm_firstdatacol) { + nbaddata--; + } else { + parity_valid[c] = B_TRUE; + nbadparity--; + } + } + + ASSERT(ntgts >= nt); + ASSERT(nbaddata >= 0); + ASSERT(nbaddata + nbadparity == ntgts); + + dt = &tgts[nbadparity]; + + /* + * See if we can use any of our optimized reconstruction routines. 
+ */ + if (!vdev_raidz_default_to_general) { + switch (nbaddata) { + case 1: + if (parity_valid[VDEV_RAIDZ_P]) + return (vdev_raidz_reconstruct_p(rm, dt, 1)); + + ASSERT(rm->rm_firstdatacol > 1); + + if (parity_valid[VDEV_RAIDZ_Q]) + return (vdev_raidz_reconstruct_q(rm, dt, 1)); + + ASSERT(rm->rm_firstdatacol > 2); + break; + + case 2: + ASSERT(rm->rm_firstdatacol > 1); + + if (parity_valid[VDEV_RAIDZ_P] && + parity_valid[VDEV_RAIDZ_Q]) + return (vdev_raidz_reconstruct_pq(rm, dt, 2)); + + ASSERT(rm->rm_firstdatacol > 2); + + break; + } + } + + code = vdev_raidz_reconstruct_general(rm, tgts, ntgts); + ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY)); + ASSERT(code > 0); + return (code); +} + +static int +vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, + uint64_t *logical_ashift, uint64_t *physical_ashift) +{ + vdev_t *cvd; + uint64_t nparity = vd->vdev_nparity; + int c; + int lasterror = 0; + int numerrors = 0; + + ASSERT(nparity > 0); + + if (nparity > VDEV_RAIDZ_MAXPARITY || + vd->vdev_children < nparity + 1) { + vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; + return (SET_ERROR(EINVAL)); + } + + vdev_open_children(vd); + + for (c = 0; c < vd->vdev_children; c++) { + cvd = vd->vdev_child[c]; + + if (cvd->vdev_open_error != 0) { + lasterror = cvd->vdev_open_error; + numerrors++; + continue; + } + + *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; + *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; + *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift); + *physical_ashift = MAX(*physical_ashift, + cvd->vdev_physical_ashift); + } + + *asize *= vd->vdev_children; + *max_asize *= vd->vdev_children; + + if (numerrors > nparity) { + vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; + return (lasterror); + } + + return (0); +} + +static void +vdev_raidz_close(vdev_t *vd) +{ + int c; + + for (c = 0; c < vd->vdev_children; c++) + vdev_close(vd->vdev_child[c]); +} + +#ifdef illumos +/* + * Handle a read or write I/O to a RAID-Z dump device. + * + * The dump device is in a unique situation compared to other ZFS datasets: + * writing to this device should be as simple and fast as possible. In + * addition, durability matters much less since the dump will be extracted + * once the machine reboots. For that reason, this function eschews parity for + * performance and simplicity. The dump device uses the checksum setting + * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this + * dataset. + * + * Blocks of size 128 KB have been preallocated for this volume. I/Os less than + * 128 KB will not fill an entire block; in addition, they may not be properly + * aligned. In that case, this function uses the preallocated 128 KB block and + * omits reading or writing any "empty" portions of that block, as opposed to + * allocating a fresh appropriately-sized block. + * + * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs: + * + * vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB) + * + * If this were a standard RAID-Z dataset, a block of at least 40 KB would be + * allocated which spans all five child vdevs. 8 KB of data would be written to + * each of four vdevs, with the fifth containing the parity bits. 
+ * + * parity data data data data + * | PP | XX | XX | XX | XX | + * ^ ^ ^ ^ ^ + * | | | | | + * 8 KB parity ------8 KB data blocks------ + * + * However, when writing to the dump device, the behavior is different: + * + * vdev_raidz_physio(data, size: 32 KB, offset: 64 KB) + * + * Unlike the normal RAID-Z case in which the block is allocated based on the + * I/O size, reads and writes here always use a 128 KB logical I/O size. If the + * I/O size is less than 128 KB, only the actual portions of data are written. + * In this example the data is written to the third data vdev since that vdev + * contains the offset [64 KB, 96 KB). + * + * parity data data data data + * | | | | XX | | + * ^ + * | + * 32 KB data block + * + * As a result, an individual I/O may not span all child vdevs; moreover, a + * small I/O may only operate on a single child vdev. + * + * Note that since there are no parity bits calculated or written, this format + * remains the same no matter how many parity bits are used in a normal RAID-Z + * stripe. On a RAID-Z3 configuration with seven child vdevs, the example above + * would look like: + * + * parity parity parity data data data data + * | | | | | | XX | | + * ^ + * | + * 32 KB data block + */ +int +vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size, + uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump) +{ + vdev_t *tvd = vd->vdev_top; + vdev_t *cvd; + raidz_map_t *rm; + raidz_col_t *rc; + int c, err = 0; + + uint64_t start, end, colstart, colend; + uint64_t coloffset, colsize, colskip; + + int flags = doread ? BIO_READ : BIO_WRITE; + +#ifdef _KERNEL + + /* + * Don't write past the end of the block + */ + VERIFY3U(offset + size, <=, origoffset + SPA_OLD_MAXBLOCKSIZE); + + start = offset; + end = start + size; + + /* + * Allocate a RAID-Z map for this block. Note that this block starts + * from the "original" offset, that is, the offset of the extent which + * contains the requisite offset of the data being read or written. + * + * Even if this I/O operation doesn't span the full block size, let's + * treat the on-disk format as if the only blocks are complete + * 128 KB blocks. + */ + abd_t *abd = abd_get_from_buf(data - (offset - origoffset), + SPA_OLD_MAXBLOCKSIZE); + rm = vdev_raidz_map_alloc(abd, + SPA_OLD_MAXBLOCKSIZE, origoffset, B_FALSE, tvd->vdev_ashift, + vd->vdev_children, vd->vdev_nparity); + + coloffset = origoffset; + + for (c = rm->rm_firstdatacol; c < rm->rm_cols; + c++, coloffset += rc->rc_size) { + rc = &rm->rm_col[c]; + cvd = vd->vdev_child[rc->rc_devidx]; + + /* + * Find the start and end of this column in the RAID-Z map, + * keeping in mind that the stated size and offset of the + * operation may not fill the entire column for this vdev. + * + * If any portion of the data spans this column, issue the + * appropriate operation to the vdev. + */ + if (coloffset + rc->rc_size <= start) + continue; + if (coloffset >= end) + continue; + + colstart = MAX(coloffset, start); + colend = MIN(end, coloffset + rc->rc_size); + colsize = colend - colstart; + colskip = colstart - coloffset; + + VERIFY3U(colsize, <=, rc->rc_size); + VERIFY3U(colskip, <=, rc->rc_size); + + /* + * Note that the child vdev will have a vdev label at the start + * of its range of offsets, hence the need for + * VDEV_LABEL_OFFSET(). See zio_vdev_child_io() for another + * example of why this calculation is needed.
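The colstart/colend/colskip clipping above reduces to interval intersection. A hypothetical stand-alone sketch of the arithmetic for the 32 KB-at-64 KB example from the comment (5-child raidz1, so four 32 KB data columns; parity columns are ignored here, as the dump path skips them):

#include <stdio.h>

#define	MIN(a, b)	((a) < (b) ? (a) : (b))
#define	MAX(a, b)	((a) > (b) ? (a) : (b))

int
main(void)
{
	unsigned long long start = 64 * 1024, end = start + 32 * 1024;
	unsigned long long colsize_each = 32 * 1024;
	unsigned long long coloffset = 0;

	for (int c = 0; c < 4; c++, coloffset += colsize_each) {
		if (coloffset + colsize_each <= start || coloffset >= end)
			continue;	/* no overlap with this column */

		unsigned long long colstart = MAX(coloffset, start);
		unsigned long long colend = MIN(end, coloffset + colsize_each);

		/* Prints only: data col 2: skip 0, write 32768 bytes */
		printf("data col %d: skip %llu, write %llu bytes\n",
		    c, colstart - coloffset, colend - colstart);
	}
	return (0);
}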
+ */ + if ((err = vdev_disk_physio(cvd, + ((char *)abd_to_buf(rc->rc_abd)) + colskip, colsize, + VDEV_LABEL_OFFSET(rc->rc_offset) + colskip, + flags, isdump)) != 0) + break; + } + + vdev_raidz_map_free(rm); + abd_put(abd); +#endif /* KERNEL */ + + return (err); +} +#endif + +static uint64_t +vdev_raidz_asize(vdev_t *vd, uint64_t psize) +{ + uint64_t asize; + uint64_t ashift = vd->vdev_top->vdev_ashift; + uint64_t cols = vd->vdev_children; + uint64_t nparity = vd->vdev_nparity; + + asize = ((psize - 1) >> ashift) + 1; + asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); + asize = roundup(asize, nparity + 1) << ashift; + + return (asize); +} + +static void +vdev_raidz_child_done(zio_t *zio) +{ + raidz_col_t *rc = zio->io_private; + + rc->rc_error = zio->io_error; + rc->rc_tried = 1; + rc->rc_skipped = 0; +} + +static void +vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, int col) +{ +#ifdef ZFS_DEBUG + vdev_t *vd = zio->io_vd; + vdev_t *tvd = vd->vdev_top; + + range_seg_t logical_rs, physical_rs; + logical_rs.rs_start = zio->io_offset; + logical_rs.rs_end = logical_rs.rs_start + + vdev_raidz_asize(zio->io_vd, zio->io_size); + + raidz_col_t *rc = &rm->rm_col[col]; + vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; + + vdev_xlate(cvd, &logical_rs, &physical_rs); + ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start); + ASSERT3U(rc->rc_offset, <, physical_rs.rs_end); + /* + * It would be nice to assert that rs_end is equal + * to rc_offset + rc_size but there might be an + * optional I/O at the end that is not accounted in + * rc_size. + */ + if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) { + ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + + rc->rc_size + (1 << tvd->vdev_ashift)); + } else { + ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size); + } +#endif +} + +/* + * Start an IO operation on a RAIDZ VDev + * + * Outline: + * - For write operations: + * 1. Generate the parity data + * 2. Create child zio write operations to each column's vdev, for both + * data and parity. + * 3. If the column skips any sectors for padding, create optional dummy + * write zio children for those areas to improve aggregation continuity. + * - For read operations: + * 1. Create child zio read operations to each data column's vdev to read + * the range of data required for zio. + * 2. If this is a scrub or resilver operation, or if any of the data + * vdevs have had errors, then create zio read operations to the parity + * columns' VDevs as well. + */ +static void +vdev_raidz_io_start(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + vdev_t *tvd = vd->vdev_top; + vdev_t *cvd; + raidz_map_t *rm; + raidz_col_t *rc; + int c, i; + + rm = vdev_raidz_map_alloc(zio->io_abd, zio->io_size, zio->io_offset, + zio->io_type == ZIO_TYPE_FREE, + tvd->vdev_ashift, vd->vdev_children, + vd->vdev_nparity); + + zio->io_vsd = rm; + zio->io_vsd_ops = &vdev_raidz_vsd_ops; + + ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size)); + + if (zio->io_type == ZIO_TYPE_FREE) { + for (c = 0; c < rm->rm_cols; c++) { + rc = &rm->rm_col[c]; + cvd = vd->vdev_child[rc->rc_devidx]; + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + rc->rc_offset, rc->rc_abd, rc->rc_size, + zio->io_type, zio->io_priority, 0, + vdev_raidz_child_done, rc)); + } + + zio_execute(zio); + return; + } + + if (zio->io_type == ZIO_TYPE_WRITE) { + vdev_raidz_generate_parity(rm); + + for (c = 0; c < rm->rm_cols; c++) { + rc = &rm->rm_col[c]; + cvd = vd->vdev_child[rc->rc_devidx]; + + /* + * Verify physical to logical translation. 
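vdev_raidz_asize() above converts a payload size into an allocated size: the data sectors, plus one parity sector per (cols - nparity) data sectors rounded up, padded to a multiple of nparity + 1. A stand-alone copy of the arithmetic (illustrative only, hypothetical names):

#include <stdio.h>

static unsigned long long
raidz_asize(unsigned long long psize, unsigned long long ashift,
    unsigned long long cols, unsigned long long nparity)
{
	unsigned long long asize = ((psize - 1) >> ashift) + 1;

	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
	asize = ((asize + nparity) / (nparity + 1)) * (nparity + 1);

	return (asize << ashift);
}

int
main(void)
{
	/* 16 KB payload, 512-byte sectors, 5-wide raidz1: 40 sectors. */
	printf("%llu\n", raidz_asize(16384, 9, 5, 1));	/* prints 20480 */
	return (0);
}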
+ */ + vdev_raidz_io_verify(zio, rm, c); + + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + rc->rc_offset, rc->rc_abd, rc->rc_size, + zio->io_type, zio->io_priority, 0, + vdev_raidz_child_done, rc)); + } + + /* + * Generate optional I/Os for any skipped sectors to improve + * aggregation contiguity. + */ + for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) { + ASSERT(c <= rm->rm_scols); + if (c == rm->rm_scols) + c = 0; + rc = &rm->rm_col[c]; + cvd = vd->vdev_child[rc->rc_devidx]; + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + rc->rc_offset + rc->rc_size, NULL, + 1 << tvd->vdev_ashift, + zio->io_type, zio->io_priority, + ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL)); + } + + zio_execute(zio); + return; + } + + ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); + + /* + * Iterate over the columns in reverse order so that we hit the parity + * last -- any errors along the way will force us to read the parity. + */ + for (c = rm->rm_cols - 1; c >= 0; c--) { + rc = &rm->rm_col[c]; + cvd = vd->vdev_child[rc->rc_devidx]; + if (!vdev_readable(cvd)) { + if (c >= rm->rm_firstdatacol) + rm->rm_missingdata++; + else + rm->rm_missingparity++; + rc->rc_error = SET_ERROR(ENXIO); + rc->rc_tried = 1; /* don't even try */ + rc->rc_skipped = 1; + continue; + } + if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { + if (c >= rm->rm_firstdatacol) + rm->rm_missingdata++; + else + rm->rm_missingparity++; + rc->rc_error = SET_ERROR(ESTALE); + rc->rc_skipped = 1; + continue; + } + if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 || + (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + rc->rc_offset, rc->rc_abd, rc->rc_size, + zio->io_type, zio->io_priority, 0, + vdev_raidz_child_done, rc)); + } + } + + zio_execute(zio); +} + + +/* + * Report a checksum error for a child of a RAID-Z device. + */ +static void +raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data) +{ + void *buf; + vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; + + if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { + zio_bad_cksum_t zbc; + raidz_map_t *rm = zio->io_vsd; + + mutex_enter(&vd->vdev_stat_lock); + vd->vdev_stat.vs_checksum_errors++; + mutex_exit(&vd->vdev_stat_lock); + + zbc.zbc_has_cksum = 0; + zbc.zbc_injected = rm->rm_ecksuminjected; + + buf = abd_borrow_buf_copy(rc->rc_abd, rc->rc_size); + zfs_ereport_post_checksum(zio->io_spa, vd, zio, + rc->rc_offset, rc->rc_size, buf, bad_data, + &zbc); + abd_return_buf(rc->rc_abd, buf, rc->rc_size); + } +} + +/* + * We keep track of whether or not there were any injected errors, so that + * any ereports we generate can note it. + */ +static int +raidz_checksum_verify(zio_t *zio) +{ + zio_bad_cksum_t zbc; + raidz_map_t *rm = zio->io_vsd; + + int ret = zio_checksum_error(zio, &zbc); + if (ret != 0 && zbc.zbc_injected != 0) + rm->rm_ecksuminjected = 1; + + return (ret); +} + +/* + * Generate the parity from the data columns. If we tried and were able to + * read the parity without error, verify that the generated parity matches the + * data we read. If it doesn't, we fire off a checksum error. Return the + * number of such failures. + */ +static int +raidz_parity_verify(zio_t *zio, raidz_map_t *rm) +{ + void *orig[VDEV_RAIDZ_MAXPARITY]; + int c, ret = 0; + raidz_col_t *rc; + + blkptr_t *bp = zio->io_bp; + enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum : + (BP_IS_GANG(bp) ?
ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp))); + + if (checksum == ZIO_CHECKSUM_NOPARITY) + return (ret); + + for (c = 0; c < rm->rm_firstdatacol; c++) { + rc = &rm->rm_col[c]; + if (!rc->rc_tried || rc->rc_error != 0) + continue; + orig[c] = zio_buf_alloc(rc->rc_size); + abd_copy_to_buf(orig[c], rc->rc_abd, rc->rc_size); + } + + vdev_raidz_generate_parity(rm); + + for (c = 0; c < rm->rm_firstdatacol; c++) { + rc = &rm->rm_col[c]; + if (!rc->rc_tried || rc->rc_error != 0) + continue; + if (abd_cmp_buf(rc->rc_abd, orig[c], rc->rc_size) != 0) { + raidz_checksum_error(zio, rc, orig[c]); + rc->rc_error = SET_ERROR(ECKSUM); + ret++; + } + zio_buf_free(orig[c], rc->rc_size); + } + + return (ret); +} + +/* + * Keep statistics on all the ways that we used parity to correct data. + */ +static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY]; + +static int +vdev_raidz_worst_error(raidz_map_t *rm) +{ + int error = 0; + + for (int c = 0; c < rm->rm_cols; c++) + error = zio_worst_error(error, rm->rm_col[c].rc_error); + + return (error); +} + +/* + * Iterate over all combinations of bad data and attempt a reconstruction. + * Note that the algorithm below is non-optimal because it doesn't take into + * account how reconstruction is actually performed. For example, with + * triple-parity RAID-Z the reconstruction procedure is the same if column 4 + * is targeted as invalid as if columns 1 and 4 are targeted since in both + * cases we'd only use parity information in column 0. + */ +static int +vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors) +{ + raidz_map_t *rm = zio->io_vsd; + raidz_col_t *rc; + void *orig[VDEV_RAIDZ_MAXPARITY]; + int tstore[VDEV_RAIDZ_MAXPARITY + 2]; + int *tgts = &tstore[1]; + int current, next, i, c, n; + int code, ret = 0; + + ASSERT(total_errors < rm->rm_firstdatacol); + + /* + * This simplifies one edge condition. + */ + tgts[-1] = -1; + + for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) { + /* + * Initialize the targets array by finding the first n columns + * that contain no error. + * + * If there were no data errors, we need to ensure that we're + * always explicitly attempting to reconstruct at least one + * data column. To do this, we simply push the highest target + * up into the data columns. + */ + for (c = 0, i = 0; i < n; i++) { + if (i == n - 1 && data_errors == 0 && + c < rm->rm_firstdatacol) { + c = rm->rm_firstdatacol; + } + + while (rm->rm_col[c].rc_error != 0) { + c++; + ASSERT3S(c, <, rm->rm_cols); + } + + tgts[i] = c++; + } + + /* + * Setting tgts[n] simplifies the other edge condition. + */ + tgts[n] = rm->rm_cols; + + /* + * These buffers were allocated in previous iterations. + */ + for (i = 0; i < n - 1; i++) { + ASSERT(orig[i] != NULL); + } + + orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size); + + current = 0; + next = tgts[current]; + + while (current != n) { + tgts[current] = next; + current = 0; + + /* + * Save off the original data that we're going to + * attempt to reconstruct. + */ + for (i = 0; i < n; i++) { + ASSERT(orig[i] != NULL); + c = tgts[i]; + ASSERT3S(c, >=, 0); + ASSERT3S(c, <, rm->rm_cols); + rc = &rm->rm_col[c]; + abd_copy_to_buf(orig[i], rc->rc_abd, + rc->rc_size); + } + + /* + * Attempt a reconstruction and exit the outer loop on + * success. 
+ */ + code = vdev_raidz_reconstruct(rm, tgts, n); + if (raidz_checksum_verify(zio) == 0) { + atomic_inc_64(&raidz_corrected[code]); + + for (i = 0; i < n; i++) { + c = tgts[i]; + rc = &rm->rm_col[c]; + ASSERT(rc->rc_error == 0); + if (rc->rc_tried) + raidz_checksum_error(zio, rc, + orig[i]); + rc->rc_error = SET_ERROR(ECKSUM); + } + + ret = code; + goto done; + } + + /* + * Restore the original data. + */ + for (i = 0; i < n; i++) { + c = tgts[i]; + rc = &rm->rm_col[c]; + abd_copy_from_buf(rc->rc_abd, orig[i], + rc->rc_size); + } + + do { + /* + * Find the next valid column after the current + * position. + */ + for (next = tgts[current] + 1; + next < rm->rm_cols && + rm->rm_col[next].rc_error != 0; next++) + continue; + + ASSERT(next <= tgts[current + 1]); + + /* + * If that spot is available, we're done here. + */ + if (next != tgts[current + 1]) + break; + + /* + * Otherwise, find the next valid column after + * the previous position. + */ + for (c = tgts[current - 1] + 1; + rm->rm_col[c].rc_error != 0; c++) + continue; + + tgts[current] = c; + current++; + + } while (current != n); + } + } + n--; +done: + for (i = 0; i < n; i++) { + zio_buf_free(orig[i], rm->rm_col[0].rc_size); + } + + return (ret); +} + +/* + * Complete an IO operation on a RAIDZ VDev + * + * Outline: + * - For write operations: + * 1. Check for errors on the child IOs. + * 2. Return, setting an error code if too few child VDevs were written + * to reconstruct the data later. Note that partial writes are + * considered successful if they can be reconstructed at all. + * - For read operations: + * 1. Check for errors on the child IOs. + * 2. If data errors occurred: + * a. Try to reassemble the data from the parity available. + * b. If we haven't yet read the parity drives, read them now. + * c. If all parity drives have been read but the data still doesn't + * reassemble with a correct checksum, then try combinatorial + * reconstruction. + * d. If that doesn't work, return an error. + * 3. If there were unexpected errors or this is a resilver operation, + * rewrite the vdevs that had errors. + */ +static void +vdev_raidz_io_done(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + vdev_t *cvd; + raidz_map_t *rm = zio->io_vsd; + raidz_col_t *rc; + int unexpected_errors = 0; + int parity_errors = 0; + int parity_untried = 0; + int data_errors = 0; + int total_errors = 0; + int n, c; + int tgts[VDEV_RAIDZ_MAXPARITY]; + int code; + + ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */ + + ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol); + ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol); + + for (c = 0; c < rm->rm_cols; c++) { + rc = &rm->rm_col[c]; + + if (rc->rc_error) { + ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ + + if (c < rm->rm_firstdatacol) + parity_errors++; + else + data_errors++; + + if (!rc->rc_skipped) + unexpected_errors++; + + total_errors++; + } else if (c < rm->rm_firstdatacol && !rc->rc_tried) { + parity_untried++; + } + } + + if (zio->io_type == ZIO_TYPE_WRITE) { + /* + * XXX -- for now, treat partial writes as a success. + * (If we couldn't write enough columns to reconstruct + * the data, the I/O failed. Otherwise, good enough.) + * + * Now that we support write reallocation, it would be better + * to treat partial failure as real failure unless there are + * no non-degraded top-level vdevs left, and not update DTLs + * if we intend to reallocate.
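The next/current bookkeeping in vdev_raidz_combrec() above is an odometer over all size-n subsets of columns. A compact stand-alone sketch that enumerates the same combinations lexicographically (hypothetical; the kernel loop additionally skips columns that already have known errors):

#include <stdio.h>

#define	NCOLS	5	/* columns to choose from */
#define	K	2	/* simultaneous "assume bad" guesses */

int
main(void)
{
	int tgts[K];

	for (int i = 0; i < K; i++)
		tgts[i] = i;

	for (;;) {
		printf("try reconstructing columns {%d, %d}\n",
		    tgts[0], tgts[1]);

		/* Find the rightmost slot that can still advance. */
		int i = K - 1;
		while (i >= 0 && tgts[i] == NCOLS - K + i)
			i--;
		if (i < 0)
			break;		/* all C(NCOLS, K) subsets visited */

		/* Advance it and reset every slot to its right. */
		tgts[i]++;
		for (int j = i + 1; j < K; j++)
			tgts[j] = tgts[j - 1] + 1;
	}
	return (0);
}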
+ */ + /* XXPOLICY */ + if (total_errors > rm->rm_firstdatacol) + zio->io_error = vdev_raidz_worst_error(rm); + + return; + } else if (zio->io_type == ZIO_TYPE_FREE) { + return; + } + + ASSERT(zio->io_type == ZIO_TYPE_READ); + /* + * There are three potential phases for a read: + * 1. produce valid data from the columns read + * 2. read all disks and try again + * 3. perform combinatorial reconstruction + * + * Each phase is progressively both more expensive and less likely to + * occur. If we encounter more errors than we can repair or all phases + * fail, we have no choice but to return an error. + */ + + /* + * If the number of errors we saw was correctable -- less than or equal + * to the number of parity disks read -- attempt to produce data that + * has a valid checksum. Naturally, this case applies in the absence of + * any errors. + */ + if (total_errors <= rm->rm_firstdatacol - parity_untried) { + if (data_errors == 0) { + if (raidz_checksum_verify(zio) == 0) { + /* + * If we read parity information (unnecessarily + * as it happens since no reconstruction was + * needed) regenerate and verify the parity. + * We also regenerate parity when resilvering + * so we can write it out to the failed device + * later. + */ + if (parity_errors + parity_untried < + rm->rm_firstdatacol || + (zio->io_flags & ZIO_FLAG_RESILVER)) { + n = raidz_parity_verify(zio, rm); + unexpected_errors += n; + ASSERT(parity_errors + n <= + rm->rm_firstdatacol); + } + goto done; + } + } else { + /* + * We either attempt to read all the parity columns or + * none of them. If we didn't try to read parity, we + * wouldn't be here in the correctable case. There must + * also have been fewer parity errors than parity + * columns or, again, we wouldn't be in this code path. + */ + ASSERT(parity_untried == 0); + ASSERT(parity_errors < rm->rm_firstdatacol); + + /* + * Identify the data columns that reported an error. + */ + n = 0; + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + rc = &rm->rm_col[c]; + if (rc->rc_error != 0) { + ASSERT(n < VDEV_RAIDZ_MAXPARITY); + tgts[n++] = c; + } + } + + ASSERT(rm->rm_firstdatacol >= n); + + code = vdev_raidz_reconstruct(rm, tgts, n); + + if (raidz_checksum_verify(zio) == 0) { + atomic_inc_64(&raidz_corrected[code]); + + /* + * If we read more parity disks than were used + * for reconstruction, confirm that the other + * parity disks produced correct data. This + * routine is suboptimal in that it regenerates + * the parity that we already used in addition + * to the parity that we're attempting to + * verify, but this should be a relatively + * uncommon case, and can be optimized if it + * becomes a problem. Note that we regenerate + * parity when resilvering so we can write it + * out to failed devices later. + */ + if (parity_errors < rm->rm_firstdatacol - n || + (zio->io_flags & ZIO_FLAG_RESILVER)) { + n = raidz_parity_verify(zio, rm); + unexpected_errors += n; + ASSERT(parity_errors + n <= + rm->rm_firstdatacol); + } + + goto done; + } + } + } + + /* + * This isn't a typical situation -- either we got a read error or + * a child silently returned bad data. Read every block so we can + * try again with as much data and parity as we can track down. If + * we've already been through once before, all children will be marked + * as tried so we'll proceed to combinatorial reconstruction. 
+ */ + unexpected_errors = 1; + rm->rm_missingdata = 0; + rm->rm_missingparity = 0; + + for (c = 0; c < rm->rm_cols; c++) { + if (rm->rm_col[c].rc_tried) + continue; + + zio_vdev_io_redone(zio); + do { + rc = &rm->rm_col[c]; + if (rc->rc_tried) + continue; + zio_nowait(zio_vdev_child_io(zio, NULL, + vd->vdev_child[rc->rc_devidx], + rc->rc_offset, rc->rc_abd, rc->rc_size, + zio->io_type, zio->io_priority, 0, + vdev_raidz_child_done, rc)); + } while (++c < rm->rm_cols); + + return; + } + + /* + * At this point we've attempted to reconstruct the data given the + * errors we detected, and we've attempted to read all columns. There + * must, therefore, be one or more additional problems -- silent errors + * resulting in invalid data rather than explicit I/O errors resulting + * in absent data. We check if there is enough additional data to + * possibly reconstruct the data and then perform combinatorial + * reconstruction over all possible combinations. If that fails, + * we're cooked. + */ + if (total_errors > rm->rm_firstdatacol) { + zio->io_error = vdev_raidz_worst_error(rm); + + } else if (total_errors < rm->rm_firstdatacol && + (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) { + /* + * If we didn't use all the available parity for the + * combinatorial reconstruction, verify that the remaining + * parity is correct. + */ + if (code != (1 << rm->rm_firstdatacol) - 1) + (void) raidz_parity_verify(zio, rm); + } else { + /* + * We're here because either: + * + * total_errors == rm_firstdatacol, or + * vdev_raidz_combrec() failed + * + * In either case, there is enough bad data to prevent + * reconstruction. + * + * Start checksum ereports for all children which haven't + * failed, provided the IO wasn't speculative. + */ + zio->io_error = SET_ERROR(ECKSUM); + + if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { + for (c = 0; c < rm->rm_cols; c++) { + rc = &rm->rm_col[c]; + if (rc->rc_error == 0) { + zio_bad_cksum_t zbc; + zbc.zbc_has_cksum = 0; + zbc.zbc_injected = + rm->rm_ecksuminjected; + + zfs_ereport_start_checksum( + zio->io_spa, + vd->vdev_child[rc->rc_devidx], + zio, rc->rc_offset, rc->rc_size, + (void *)(uintptr_t)c, &zbc); + } + } + } + } + +done: + zio_checksum_verified(zio); + + if (zio->io_error == 0 && spa_writeable(zio->io_spa) && + (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) { + /* + * Use the good data we have in hand to repair damaged children. + */ + for (c = 0; c < rm->rm_cols; c++) { + rc = &rm->rm_col[c]; + cvd = vd->vdev_child[rc->rc_devidx]; + + if (rc->rc_error == 0) + continue; + + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + rc->rc_offset, rc->rc_abd, rc->rc_size, + ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, + ZIO_FLAG_IO_REPAIR | (unexpected_errors ? + ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); + } + } +} + +static void +vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) +{ + if (faulted > vd->vdev_nparity) + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_NO_REPLICAS); + else if (degraded + faulted != 0) + vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); + else + vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); +} + +/* + * Determine if any portion of the provided block resides on a child vdev + * with a dirty DTL and therefore needs to be resilvered. The function + * assumes that at least one DTL is dirty, which implies that full stripe + * width blocks must be resilvered.
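+ * + * For example (illustrative): on a 6-wide raidz2 (dcols = 6, + * nparity = 2), a block spanning s = 4 child sectors touches + * s + nparity = 6 >= dcols children -- the full stripe width -- so + * the early check returns B_TRUE without consulting any child's DTL.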
+ */ +static boolean_t +vdev_raidz_need_resilver(vdev_t *vd, uint64_t offset, size_t psize) +{ + uint64_t dcols = vd->vdev_children; + uint64_t nparity = vd->vdev_nparity; + uint64_t ashift = vd->vdev_top->vdev_ashift; + /* The starting RAIDZ (parent) vdev sector of the block. */ + uint64_t b = offset >> ashift; + /* The zio's size in units of the vdev's minimum sector size. */ + uint64_t s = ((psize - 1) >> ashift) + 1; + /* The first column for this stripe. */ + uint64_t f = b % dcols; + + if (s + nparity >= dcols) + return (B_TRUE); + + for (uint64_t c = 0; c < s + nparity; c++) { + uint64_t devidx = (f + c) % dcols; + vdev_t *cvd = vd->vdev_child[devidx]; + + /* + * dsl_scan_need_resilver() already checked vd with + * vdev_dtl_contains(). So here just check cvd with + * vdev_dtl_empty(), cheaper and a good approximation. + */ + if (!vdev_dtl_empty(cvd, DTL_PARTIAL)) + return (B_TRUE); + } + + return (B_FALSE); +} + +static void +vdev_raidz_xlate(vdev_t *cvd, const range_seg_t *in, range_seg_t *res) +{ + vdev_t *raidvd = cvd->vdev_parent; + ASSERT(raidvd->vdev_ops == &vdev_raidz_ops); + + uint64_t width = raidvd->vdev_children; + uint64_t tgt_col = cvd->vdev_id; + uint64_t ashift = raidvd->vdev_top->vdev_ashift; + + /* make sure the offsets are block-aligned */ + ASSERT0(in->rs_start % (1 << ashift)); + ASSERT0(in->rs_end % (1 << ashift)); + uint64_t b_start = in->rs_start >> ashift; + uint64_t b_end = in->rs_end >> ashift; + + uint64_t start_row = 0; + if (b_start > tgt_col) /* avoid underflow */ + start_row = ((b_start - tgt_col - 1) / width) + 1; + + uint64_t end_row = 0; + if (b_end > tgt_col) + end_row = ((b_end - tgt_col - 1) / width) + 1; + + res->rs_start = start_row << ashift; + res->rs_end = end_row << ashift; + + ASSERT3U(res->rs_start, <=, in->rs_start); + ASSERT3U(res->rs_end - res->rs_start, <=, in->rs_end - in->rs_start); +} + +vdev_ops_t vdev_raidz_ops = { + vdev_raidz_open, + vdev_raidz_close, + vdev_raidz_asize, + vdev_raidz_io_start, + vdev_raidz_io_done, + vdev_raidz_state_change, + vdev_raidz_need_resilver, + NULL, + NULL, + NULL, + vdev_raidz_xlate, + VDEV_TYPE_RAIDZ, /* name of this vdev type */ + B_FALSE /* not a leaf vdev */ +}; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_removal.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_removal.c new file mode 100644 index 000000000000..20fa9c24db24 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_removal.c @@ -0,0 +1,2167 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. 
+ */ + +#include <sys/zfs_context.h> +#include <sys/spa_impl.h> +#include <sys/dmu.h> +#include <sys/dmu_tx.h> +#include <sys/zap.h> +#include <sys/vdev_impl.h> +#include <sys/metaslab.h> +#include <sys/metaslab_impl.h> +#include <sys/uberblock_impl.h> +#include <sys/txg.h> +#include <sys/avl.h> +#include <sys/bpobj.h> +#include <sys/dsl_pool.h> +#include <sys/dsl_synctask.h> +#include <sys/dsl_dir.h> +#include <sys/arc.h> +#include <sys/zfeature.h> +#include <sys/vdev_indirect_births.h> +#include <sys/vdev_indirect_mapping.h> +#include <sys/abd.h> +#include <sys/vdev_initialize.h> + +/* + * This file contains the necessary logic to remove vdevs from a + * storage pool. Currently, the only devices that can be removed + * are log, cache, and spare devices; and top level vdevs from a pool + * w/o raidz. (Note that members of a mirror can also be removed + * by the detach operation.) + * + * Log vdevs are removed by evacuating them and then turning the vdev + * into a hole vdev while holding spa config locks. + * + * Top level vdevs are removed and converted into an indirect vdev via + * a multi-step process: + * + * - Disable allocations from this device (spa_vdev_remove_top). + * + * - From a new thread (spa_vdev_remove_thread), copy data from + * the removing vdev to a different vdev. The copy happens in open + * context (spa_vdev_copy_impl) and issues a sync task + * (vdev_mapping_sync) so the sync thread can update the partial + * indirect mappings in core and on disk. + * + * - If a free happens during a removal, it is freed from the + * removing vdev, and if it has already been copied, from the new + * location as well (free_from_removing_vdev). + * + * - After the removal is completed, the copy thread converts the vdev + * into an indirect vdev (vdev_remove_complete) before instructing + * the sync thread to destroy the space maps and finish the removal + * (spa_finish_removal). + */ + +typedef struct vdev_copy_arg { + metaslab_t *vca_msp; + uint64_t vca_outstanding_bytes; + kcondvar_t vca_cv; + kmutex_t vca_lock; +} vdev_copy_arg_t; + +/* + * The maximum amount of memory we can use for outstanding i/o while + * doing a device removal. This determines how much i/o we can have + * in flight concurrently. + */ +int zfs_remove_max_copy_bytes = 64 * 1024 * 1024; + +/* + * The largest contiguous segment that we will attempt to allocate when + * removing a device. This can be no larger than SPA_MAXBLOCKSIZE. If + * there is a performance problem with attempting to allocate large blocks, + * consider decreasing this. + * + * Note: we will issue I/Os of up to this size. The mpt driver does not + * respond well to I/Os larger than 1MB, so we set this to 1MB. (When + * mpt processes an I/O larger than 1MB, it needs to do an allocation of + * 2 physically contiguous pages; if this allocation fails, mpt will drop + * the I/O and hang the device.) + */ +int zfs_remove_max_segment = 1024 * 1024; + +/* + * Allow a remap segment to span free chunks of at most this size. The main + * impact of a larger span is that we will read and write larger, more + * contiguous chunks, with more "unnecessary" data -- trading off bandwidth + * for iops. The value here was chosen to align with + * zfs_vdev_read_gap_limit, which is a similar concept when doing regular + * reads (but there's no reason it has to be the same). 
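+ * + * For example (illustrative): two 8KB allocated segments separated by + * a 24KB hole can be remapped and copied as a single 40KB segment, + * since the hole is within vdev_removal_max_span; were the hole 64KB, + * the two segments would be mapped and copied separately.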
+ * + * Additionally, a higher span will have the following relatively minor + * effects: + * - the mapping will be smaller, since one entry can cover more allocated + * segments + * - more of the fragmentation in the removing device will be preserved + * - we'll do larger allocations, which may fail and fall back on smaller + * allocations + */ +int vdev_removal_max_span = 32 * 1024; + +/* + * This is used by the test suite so that it can ensure that certain + * actions happen while in the middle of a removal. + */ +uint64_t zfs_remove_max_bytes_pause = UINT64_MAX; + +#define VDEV_REMOVAL_ZAP_OBJS "lzap" + +static void spa_vdev_remove_thread(void *arg); + +static void +spa_sync_removing_state(spa_t *spa, dmu_tx_t *tx) +{ + VERIFY0(zap_update(spa->spa_dsl_pool->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_REMOVING, sizeof (uint64_t), + sizeof (spa->spa_removing_phys) / sizeof (uint64_t), + &spa->spa_removing_phys, tx)); +} + +static nvlist_t * +spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid) +{ + for (int i = 0; i < count; i++) { + uint64_t guid = + fnvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID); + + if (guid == target_guid) + return (nvpp[i]); + } + + return (NULL); +} + +static void +spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, + nvlist_t *dev_to_remove) +{ + nvlist_t **newdev = NULL; + + if (count > 1) + newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP); + + for (int i = 0, j = 0; i < count; i++) { + if (dev[i] == dev_to_remove) + continue; + VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0); + } + + VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0); + VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0); + + for (int i = 0; i < count - 1; i++) + nvlist_free(newdev[i]); + + if (count > 1) + kmem_free(newdev, (count - 1) * sizeof (void *)); +} + +static spa_vdev_removal_t * +spa_vdev_removal_create(vdev_t *vd) +{ + spa_vdev_removal_t *svr = kmem_zalloc(sizeof (*svr), KM_SLEEP); + mutex_init(&svr->svr_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&svr->svr_cv, NULL, CV_DEFAULT, NULL); + svr->svr_allocd_segs = range_tree_create(NULL, NULL); + svr->svr_vdev_id = vd->vdev_id; + + for (int i = 0; i < TXG_SIZE; i++) { + svr->svr_frees[i] = range_tree_create(NULL, NULL); + list_create(&svr->svr_new_segments[i], + sizeof (vdev_indirect_mapping_entry_t), + offsetof(vdev_indirect_mapping_entry_t, vime_node)); + } + + return (svr); +} + +void +spa_vdev_removal_destroy(spa_vdev_removal_t *svr) +{ + for (int i = 0; i < TXG_SIZE; i++) { + ASSERT0(svr->svr_bytes_done[i]); + ASSERT0(svr->svr_max_offset_to_sync[i]); + range_tree_destroy(svr->svr_frees[i]); + list_destroy(&svr->svr_new_segments[i]); + } + + range_tree_destroy(svr->svr_allocd_segs); + mutex_destroy(&svr->svr_lock); + cv_destroy(&svr->svr_cv); + kmem_free(svr, sizeof (*svr)); +} + +/* + * This is called as a synctask in the txg in which we will mark this vdev + * as removing (in the config stored in the MOS). 
+ * + * It begins the evacuation of a toplevel vdev by: + * - initializing the spa_removing_phys which tracks this removal + * - computing the amount of space to remove for accounting purposes + * - dirtying all dbufs in the spa_config_object + * - creating the spa_vdev_removal + * - starting the spa_vdev_remove_thread + */ +static void +vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx) +{ + int vdev_id = (uintptr_t)arg; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + vdev_t *vd = vdev_lookup_top(spa, vdev_id); + vdev_indirect_config_t *vic = &vd->vdev_indirect_config; + objset_t *mos = spa->spa_dsl_pool->dp_meta_objset; + spa_vdev_removal_t *svr = NULL; + uint64_t txg = dmu_tx_get_txg(tx); + + ASSERT3P(vd->vdev_ops, !=, &vdev_raidz_ops); + svr = spa_vdev_removal_create(vd); + + ASSERT(vd->vdev_removing); + ASSERT3P(vd->vdev_indirect_mapping, ==, NULL); + + spa_feature_incr(spa, SPA_FEATURE_DEVICE_REMOVAL, tx); + if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) { + /* + * By activating the OBSOLETE_COUNTS feature, we prevent + * the pool from being downgraded and ensure that the + * refcounts are precise. + */ + spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); + uint64_t one = 1; + VERIFY0(zap_add(spa->spa_meta_objset, vd->vdev_top_zap, + VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, sizeof (one), 1, + &one, tx)); + ASSERT3U(vdev_obsolete_counts_are_precise(vd), !=, 0); + } + + vic->vic_mapping_object = vdev_indirect_mapping_alloc(mos, tx); + vd->vdev_indirect_mapping = + vdev_indirect_mapping_open(mos, vic->vic_mapping_object); + vic->vic_births_object = vdev_indirect_births_alloc(mos, tx); + vd->vdev_indirect_births = + vdev_indirect_births_open(mos, vic->vic_births_object); + spa->spa_removing_phys.sr_removing_vdev = vd->vdev_id; + spa->spa_removing_phys.sr_start_time = gethrestime_sec(); + spa->spa_removing_phys.sr_end_time = 0; + spa->spa_removing_phys.sr_state = DSS_SCANNING; + spa->spa_removing_phys.sr_to_copy = 0; + spa->spa_removing_phys.sr_copied = 0; + + /* + * Note: We can't use vdev_stat's vs_alloc for sr_to_copy, because + * there may be space in the defer tree, which is free, but still + * counted in vs_alloc. + */ + for (uint64_t i = 0; i < vd->vdev_ms_count; i++) { + metaslab_t *ms = vd->vdev_ms[i]; + if (ms->ms_sm == NULL) + continue; + + /* + * Sync tasks happen before metaslab_sync(), therefore + * smp_alloc and sm_alloc must be the same. + */ + ASSERT3U(space_map_allocated(ms->ms_sm), ==, + ms->ms_sm->sm_phys->smp_alloc); + + spa->spa_removing_phys.sr_to_copy += + space_map_allocated(ms->ms_sm); + + /* + * Space which we are freeing this txg does not need to + * be copied. + */ + spa->spa_removing_phys.sr_to_copy -= + range_tree_space(ms->ms_freeing); + + ASSERT0(range_tree_space(ms->ms_freed)); + for (int t = 0; t < TXG_SIZE; t++) + ASSERT0(range_tree_space(ms->ms_allocating[t])); + } + + /* + * Sync tasks are called before metaslab_sync(), so there should + * be no already-synced metaslabs in the TXG_CLEAN list. + */ + ASSERT3P(txg_list_head(&vd->vdev_ms_list, TXG_CLEAN(txg)), ==, NULL); + + spa_sync_removing_state(spa, tx); + + /* + * All blocks that we need to read the most recent mapping must be + * stored on concrete vdevs. Therefore, we must dirty anything that + * is read before spa_remove_init(). Specifically, the + * spa_config_object. (Note that although we already modified the + * spa_config_object in spa_sync_removing_state, that may not have + * modified all blocks of the object.) 
+ */ + dmu_object_info_t doi; + VERIFY0(dmu_object_info(mos, DMU_POOL_DIRECTORY_OBJECT, &doi)); + for (uint64_t offset = 0; offset < doi.doi_max_offset; ) { + dmu_buf_t *dbuf; + VERIFY0(dmu_buf_hold(mos, DMU_POOL_DIRECTORY_OBJECT, + offset, FTAG, &dbuf, 0)); + dmu_buf_will_dirty(dbuf, tx); + offset += dbuf->db_size; + dmu_buf_rele(dbuf, FTAG); + } + + /* + * Now that we've allocated the im_object, dirty the vdev to ensure + * that the object gets written to the config on disk. + */ + vdev_config_dirty(vd); + + zfs_dbgmsg("starting removal thread for vdev %llu (%p) in txg %llu " + "im_obj=%llu", vd->vdev_id, vd, dmu_tx_get_txg(tx), + vic->vic_mapping_object); + + spa_history_log_internal(spa, "vdev remove started", tx, + "%s vdev %llu %s", spa_name(spa), vd->vdev_id, + (vd->vdev_path != NULL) ? vd->vdev_path : "-"); + /* + * Setting spa_vdev_removal causes subsequent frees to call + * free_from_removing_vdev(). Note that we don't need any locking + * because we are the sync thread, and metaslab_free_impl() is only + * called from syncing context (potentially from a zio taskq thread, + * but in any case only when there are outstanding free i/os, which + * there are not). + */ + ASSERT3P(spa->spa_vdev_removal, ==, NULL); + spa->spa_vdev_removal = svr; + svr->svr_thread = thread_create(NULL, 0, + spa_vdev_remove_thread, spa, 0, &p0, TS_RUN, minclsyspri); +} + +/* + * When we are opening a pool, we must read the mapping for each + * indirect vdev in order from most recently removed to least + * recently removed. We do this because the blocks for the mapping + * of older indirect vdevs may be stored on more recently removed vdevs. + * In order to read each indirect mapping object, we must have + * initialized all more recently removed vdevs. + */ +int +spa_remove_init(spa_t *spa) +{ + int error; + + error = zap_lookup(spa->spa_dsl_pool->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_REMOVING, sizeof (uint64_t), + sizeof (spa->spa_removing_phys) / sizeof (uint64_t), + &spa->spa_removing_phys); + + if (error == ENOENT) { + spa->spa_removing_phys.sr_state = DSS_NONE; + spa->spa_removing_phys.sr_removing_vdev = -1; + spa->spa_removing_phys.sr_prev_indirect_vdev = -1; + spa->spa_indirect_vdevs_loaded = B_TRUE; + return (0); + } else if (error != 0) { + return (error); + } + + if (spa->spa_removing_phys.sr_state == DSS_SCANNING) { + /* + * We are currently removing a vdev. Create and + * initialize a spa_vdev_removal_t from the bonus + * buffer of the removing vdev's vdev_im_object, and + * initialize its partial mapping.
+ */ + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); + vdev_t *vd = vdev_lookup_top(spa, + spa->spa_removing_phys.sr_removing_vdev); + + if (vd == NULL) { + spa_config_exit(spa, SCL_STATE, FTAG); + return (EINVAL); + } + + vdev_indirect_config_t *vic = &vd->vdev_indirect_config; + + ASSERT(vdev_is_concrete(vd)); + spa_vdev_removal_t *svr = spa_vdev_removal_create(vd); + ASSERT3U(svr->svr_vdev_id, ==, vd->vdev_id); + ASSERT(vd->vdev_removing); + + vd->vdev_indirect_mapping = vdev_indirect_mapping_open( + spa->spa_meta_objset, vic->vic_mapping_object); + vd->vdev_indirect_births = vdev_indirect_births_open( + spa->spa_meta_objset, vic->vic_births_object); + spa_config_exit(spa, SCL_STATE, FTAG); + + spa->spa_vdev_removal = svr; + } + + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); + uint64_t indirect_vdev_id = + spa->spa_removing_phys.sr_prev_indirect_vdev; + while (indirect_vdev_id != UINT64_MAX) { + vdev_t *vd = vdev_lookup_top(spa, indirect_vdev_id); + vdev_indirect_config_t *vic = &vd->vdev_indirect_config; + + ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); + vd->vdev_indirect_mapping = vdev_indirect_mapping_open( + spa->spa_meta_objset, vic->vic_mapping_object); + vd->vdev_indirect_births = vdev_indirect_births_open( + spa->spa_meta_objset, vic->vic_births_object); + + indirect_vdev_id = vic->vic_prev_indirect_vdev; + } + spa_config_exit(spa, SCL_STATE, FTAG); + + /* + * Now that we've loaded all the indirect mappings, we can allow + * reads from other blocks (e.g. via predictive prefetch). + */ + spa->spa_indirect_vdevs_loaded = B_TRUE; + return (0); +} + +void +spa_restart_removal(spa_t *spa) +{ + spa_vdev_removal_t *svr = spa->spa_vdev_removal; + + if (svr == NULL) + return; + + /* + * In general when this function is called there is no + * removal thread running. The only scenario where this + * is not true is during spa_import() where this function + * is called twice [once from spa_import_impl() and + * once from spa_async_resume()]. Thus, in the scenario where we + * import a pool that has an ongoing removal we don't + * want to spawn a second thread. + */ + if (svr->svr_thread != NULL) + return; + + if (!spa_writeable(spa)) + return; + + zfs_dbgmsg("restarting removal of %llu", svr->svr_vdev_id); + svr->svr_thread = thread_create(NULL, 0, spa_vdev_remove_thread, spa, + 0, &p0, TS_RUN, minclsyspri); +} + +/* + * Process freeing from a device which is in the middle of being removed. + * We must handle this carefully so that we attempt to copy freed data, + * and we correctly free already-copied data. + */ +void +free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size) +{ + spa_t *spa = vd->vdev_spa; + spa_vdev_removal_t *svr = spa->spa_vdev_removal; + vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; + uint64_t txg = spa_syncing_txg(spa); + uint64_t max_offset_yet = 0; + + ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0); + ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, ==, + vdev_indirect_mapping_object(vim)); + ASSERT3U(vd->vdev_id, ==, svr->svr_vdev_id); + + mutex_enter(&svr->svr_lock); + + /* + * Remove the segment from the removing vdev's spacemap. This + * ensures that we will not attempt to copy this space (if the + * removal thread has not yet visited it), and also ensures + * that we know what is actually allocated on the new vdevs + * (needed if we cancel the removal).
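+ * + * For example (illustrative offsets): freeing [0x11000, 0x14000) when + * the mapping is synced through 0x12000 and the current txg's copy + * will sync through 0x13000 splits three ways: [0x11000, 0x12000) is + * freed from its new location immediately, [0x12000, 0x13000) is + * recorded in svr_frees and freed once that mapping syncs, and + * [0x13000, 0x14000) is cleared from svr_allocd_segs so it is never + * copied at all.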
+ * + * Note: we must do the metaslab_free_concrete() with the svr_lock + * held, so that the remove_thread can not load this metaslab and then + * visit this offset between the time that we metaslab_free_concrete() + * and when we check to see if it has been visited. + * + * Note: The checkpoint flag is set to false as having/taking + * a checkpoint and removing a device can't happen at the same + * time. + */ + ASSERT(!spa_has_checkpoint(spa)); + metaslab_free_concrete(vd, offset, size, B_FALSE); + + uint64_t synced_size = 0; + uint64_t synced_offset = 0; + uint64_t max_offset_synced = vdev_indirect_mapping_max_offset(vim); + if (offset < max_offset_synced) { + /* + * The mapping for this offset is already on disk. + * Free from the new location. + * + * Note that we use svr_max_synced_offset because it is + * updated atomically with respect to the in-core mapping. + * By contrast, vim_max_offset is not. + * + * This block may be split between a synced entry and an + * in-flight or unvisited entry. Only process the synced + * portion of it here. + */ + synced_size = MIN(size, max_offset_synced - offset); + synced_offset = offset; + + ASSERT3U(max_offset_yet, <=, max_offset_synced); + max_offset_yet = max_offset_synced; + + DTRACE_PROBE3(remove__free__synced, + spa_t *, spa, + uint64_t, offset, + uint64_t, synced_size); + + size -= synced_size; + offset += synced_size; + } + + /* + * Look at all in-flight txgs starting from the currently syncing one + * and see if a section of this free is being copied. By starting from + * this txg and iterating forward, we might find that this region + * was copied in two different txgs and handle it appropriately. + */ + for (int i = 0; i < TXG_CONCURRENT_STATES; i++) { + int txgoff = (txg + i) & TXG_MASK; + if (size > 0 && offset < svr->svr_max_offset_to_sync[txgoff]) { + /* + * The mapping for this offset is in flight, and + * will be synced in txg+i. + */ + uint64_t inflight_size = MIN(size, + svr->svr_max_offset_to_sync[txgoff] - offset); + + DTRACE_PROBE4(remove__free__inflight, + spa_t *, spa, + uint64_t, offset, + uint64_t, inflight_size, + uint64_t, txg + i); + + /* + * We copy data in order of increasing offset. + * Therefore the max_offset_to_sync[] must increase + * (or be zero, indicating that nothing is being + * copied in that txg). + */ + if (svr->svr_max_offset_to_sync[txgoff] != 0) { + ASSERT3U(svr->svr_max_offset_to_sync[txgoff], + >=, max_offset_yet); + max_offset_yet = + svr->svr_max_offset_to_sync[txgoff]; + } + + /* + * We've already committed to copying this segment: + * we have allocated space elsewhere in the pool for + * it and have an IO outstanding to copy the data. We + * cannot free the space before the copy has + * completed, or else the copy IO might overwrite any + * new data. To free that space, we record the + * segment in the appropriate svr_frees tree and free + * the mapped space later, in the txg where we have + * completed the copy and synced the mapping (see + * vdev_mapping_sync). + */ + range_tree_add(svr->svr_frees[txgoff], + offset, inflight_size); + size -= inflight_size; + offset += inflight_size; + + /* + * This space is already accounted for as being + * done, because it is being copied in txg+i. + * However, if i!=0, then it is being copied in + * a future txg. If we crash after this txg + * syncs but before txg+i syncs, then the space + * will be free. Therefore we must account + * for the space being done in *this* txg + * (when it is freed) rather than the future txg + * (when it will be copied). 
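+ * + * Concretely (illustrative): if 64KB of this free is being copied in + * txg+2, the adjustment below moves that 64KB of svr_bytes_done from + * the txg+2 slot to the current txg's slot, so the progress + * accounting never counts space whose copy might not survive a crash + * between the two txgs.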
+ */ + ASSERT3U(svr->svr_bytes_done[txgoff], >=, + inflight_size); + svr->svr_bytes_done[txgoff] -= inflight_size; + svr->svr_bytes_done[txg & TXG_MASK] += inflight_size; + } + } + ASSERT0(svr->svr_max_offset_to_sync[TXG_CLEAN(txg) & TXG_MASK]); + + if (size > 0) { + /* + * The copy thread has not yet visited this offset. Ensure + * that it doesn't. + */ + + DTRACE_PROBE3(remove__free__unvisited, + spa_t *, spa, + uint64_t, offset, + uint64_t, size); + + if (svr->svr_allocd_segs != NULL) + range_tree_clear(svr->svr_allocd_segs, offset, size); + + /* + * Since we now do not need to copy this data, for + * accounting purposes we have done our job and can count + * it as completed. + */ + svr->svr_bytes_done[txg & TXG_MASK] += size; + } + mutex_exit(&svr->svr_lock); + + /* + * Now that we have dropped svr_lock, process the synced portion + * of this free. + */ + if (synced_size > 0) { + vdev_indirect_mark_obsolete(vd, synced_offset, synced_size); + + /* + * Note: this can only be called from syncing context, + * and the vdev_indirect_mapping is only changed from the + * sync thread, so we don't need svr_lock while doing + * metaslab_free_impl_cb. + */ + boolean_t checkpoint = B_FALSE; + vdev_indirect_ops.vdev_op_remap(vd, synced_offset, synced_size, + metaslab_free_impl_cb, &checkpoint); + } +} + +/* + * Stop an active removal and update the spa_removing phys. + */ +static void +spa_finish_removal(spa_t *spa, dsl_scan_state_t state, dmu_tx_t *tx) +{ + spa_vdev_removal_t *svr = spa->spa_vdev_removal; + ASSERT3U(dmu_tx_get_txg(tx), ==, spa_syncing_txg(spa)); + + /* Ensure the removal thread has completed before we free the svr. */ + spa_vdev_remove_suspend(spa); + + ASSERT(state == DSS_FINISHED || state == DSS_CANCELED); + + if (state == DSS_FINISHED) { + spa_removing_phys_t *srp = &spa->spa_removing_phys; + vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); + vdev_indirect_config_t *vic = &vd->vdev_indirect_config; + + if (srp->sr_prev_indirect_vdev != UINT64_MAX) { + vdev_t *pvd = vdev_lookup_top(spa, + srp->sr_prev_indirect_vdev); + ASSERT3P(pvd->vdev_ops, ==, &vdev_indirect_ops); + } + + vic->vic_prev_indirect_vdev = srp->sr_prev_indirect_vdev; + srp->sr_prev_indirect_vdev = vd->vdev_id; + } + spa->spa_removing_phys.sr_state = state; + spa->spa_removing_phys.sr_end_time = gethrestime_sec(); + + spa->spa_vdev_removal = NULL; + spa_vdev_removal_destroy(svr); + + spa_sync_removing_state(spa, tx); + + vdev_config_dirty(spa->spa_root_vdev); +} + +static void +free_mapped_segment_cb(void *arg, uint64_t offset, uint64_t size) +{ + vdev_t *vd = arg; + vdev_indirect_mark_obsolete(vd, offset, size); + boolean_t checkpoint = B_FALSE; + vdev_indirect_ops.vdev_op_remap(vd, offset, size, + metaslab_free_impl_cb, &checkpoint); +} + +/* + * On behalf of the removal thread, syncs an incremental bit more of + * the indirect mapping to disk and updates the in-memory mapping. + * Called as a sync task in every txg that the removal thread makes progress. 
+ */ +static void +vdev_mapping_sync(void *arg, dmu_tx_t *tx) +{ + spa_vdev_removal_t *svr = arg; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); + vdev_indirect_config_t *vic = &vd->vdev_indirect_config; + uint64_t txg = dmu_tx_get_txg(tx); + vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; + + ASSERT(vic->vic_mapping_object != 0); + ASSERT3U(txg, ==, spa_syncing_txg(spa)); + + vdev_indirect_mapping_add_entries(vim, + &svr->svr_new_segments[txg & TXG_MASK], tx); + vdev_indirect_births_add_entry(vd->vdev_indirect_births, + vdev_indirect_mapping_max_offset(vim), dmu_tx_get_txg(tx), tx); + + /* + * Free the copied data for anything that was freed while the + * mapping entries were in flight. + */ + mutex_enter(&svr->svr_lock); + range_tree_vacate(svr->svr_frees[txg & TXG_MASK], + free_mapped_segment_cb, vd); + ASSERT3U(svr->svr_max_offset_to_sync[txg & TXG_MASK], >=, + vdev_indirect_mapping_max_offset(vim)); + svr->svr_max_offset_to_sync[txg & TXG_MASK] = 0; + mutex_exit(&svr->svr_lock); + + spa_sync_removing_state(spa, tx); +} + +typedef struct vdev_copy_segment_arg { + spa_t *vcsa_spa; + dva_t *vcsa_dest_dva; + uint64_t vcsa_txg; + range_tree_t *vcsa_obsolete_segs; +} vdev_copy_segment_arg_t; + +static void +unalloc_seg(void *arg, uint64_t start, uint64_t size) +{ + vdev_copy_segment_arg_t *vcsa = arg; + spa_t *spa = vcsa->vcsa_spa; + blkptr_t bp = { 0 }; + + BP_SET_BIRTH(&bp, TXG_INITIAL, TXG_INITIAL); + BP_SET_LSIZE(&bp, size); + BP_SET_PSIZE(&bp, size); + BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF); + BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_OFF); + BP_SET_TYPE(&bp, DMU_OT_NONE); + BP_SET_LEVEL(&bp, 0); + BP_SET_DEDUP(&bp, 0); + BP_SET_BYTEORDER(&bp, ZFS_HOST_BYTEORDER); + + DVA_SET_VDEV(&bp.blk_dva[0], DVA_GET_VDEV(vcsa->vcsa_dest_dva)); + DVA_SET_OFFSET(&bp.blk_dva[0], + DVA_GET_OFFSET(vcsa->vcsa_dest_dva) + start); + DVA_SET_ASIZE(&bp.blk_dva[0], size); + + zio_free(spa, vcsa->vcsa_txg, &bp); +} + +/* + * All reads and writes associated with a call to spa_vdev_copy_segment() + * are done. + */ +static void +spa_vdev_copy_segment_done(zio_t *zio) +{ + vdev_copy_segment_arg_t *vcsa = zio->io_private; + + range_tree_vacate(vcsa->vcsa_obsolete_segs, + unalloc_seg, vcsa); + range_tree_destroy(vcsa->vcsa_obsolete_segs); + kmem_free(vcsa, sizeof (*vcsa)); + + spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa); +} + +/* + * The write of the new location is done. + */ +static void +spa_vdev_copy_segment_write_done(zio_t *zio) +{ + vdev_copy_arg_t *vca = zio->io_private; + + abd_free(zio->io_abd); + + mutex_enter(&vca->vca_lock); + vca->vca_outstanding_bytes -= zio->io_size; + cv_signal(&vca->vca_cv); + mutex_exit(&vca->vca_lock); +} + +/* + * The read of the old location is done. The parent zio is the write to + * the new location. Allow it to start. + */ +static void +spa_vdev_copy_segment_read_done(zio_t *zio) +{ + zio_nowait(zio_unique_parent(zio)); +} + +/* + * If the old and new vdevs are mirrors, we will read both sides of the old + * mirror, and write each copy to the corresponding side of the new mirror. + * If the old and new vdevs have a different number of children, we will do + * this as best as possible. Since we aren't verifying checksums, this + * ensures that as long as there's a good copy of the data, we'll have a + * good copy after the removal, even if there's silent damage to one side + * of the mirror. 
If we're removing a mirror that has some silent damage, + * we'll have exactly the same damage in the new location (assuming that + * the new location is also a mirror). + * + * We accomplish this by creating a tree of zio_t's, with as many writes as + * there are "children" of the new vdev (a non-redundant vdev counts as one + * child, a 2-way mirror has 2 children, etc). Each write has an associated + * read from a child of the old vdev. Typically there will be the same + * number of children of the old and new vdevs. However, if there are more + * children of the new vdev, some child(ren) of the old vdev will be issued + * multiple reads. If there are more children of the old vdev, some copies + * will be dropped. + * + * For example, the tree of zio_t's for a 2-way mirror is: + * + * null + * / \ + * write(new vdev, child 0) write(new vdev, child 1) + * | | + * read(old vdev, child 0) read(old vdev, child 1) + * + * Child zio's complete before their parents complete. However, zio's + * created with zio_vdev_child_io() may be issued before their children + * complete. In this case we need to make sure that the children (reads) + * complete before the parents (writes) are *issued*. We do this by not + * calling zio_nowait() on each write until its corresponding read has + * completed. + * + * The spa_config_lock must be held while zio's created by + * zio_vdev_child_io() are in progress, to ensure that the vdev tree does + * not change (e.g. due to a concurrent "zpool attach/detach"). The "null" + * zio is needed to release the spa_config_lock after all the reads and + * writes complete. (Note that we can't grab the config lock for each read, + * because it is not reentrant - we could deadlock with a thread waiting + * for a write lock.) + */ +static void +spa_vdev_copy_one_child(vdev_copy_arg_t *vca, zio_t *nzio, + vdev_t *source_vd, uint64_t source_offset, + vdev_t *dest_child_vd, uint64_t dest_offset, int dest_id, uint64_t size) +{ + ASSERT3U(spa_config_held(nzio->io_spa, SCL_ALL, RW_READER), !=, 0); + + mutex_enter(&vca->vca_lock); + vca->vca_outstanding_bytes += size; + mutex_exit(&vca->vca_lock); + + abd_t *abd = abd_alloc_for_io(size, B_FALSE); + + vdev_t *source_child_vd; + if (source_vd->vdev_ops == &vdev_mirror_ops && dest_id != -1) { + /* + * Source and dest are both mirrors. Copy from the same + * child id as we are copying to (wrapping around if there + * are more dest children than source children). + */ + source_child_vd = + source_vd->vdev_child[dest_id % source_vd->vdev_children]; + } else { + source_child_vd = source_vd; + } + + zio_t *write_zio = zio_vdev_child_io(nzio, NULL, + dest_child_vd, dest_offset, abd, size, + ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL, + ZIO_FLAG_CANFAIL, + spa_vdev_copy_segment_write_done, vca); + + zio_nowait(zio_vdev_child_io(write_zio, NULL, + source_child_vd, source_offset, abd, size, + ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, + ZIO_FLAG_CANFAIL, + spa_vdev_copy_segment_read_done, vca)); +} + +/* + * Allocate a new location for this segment, and create the zio_t's to + * read from the old location and write to the new location. 
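+ * + * For example (illustrative): when copying from a 2-way mirror to a + * 3-way mirror, the three destination writes read from source + * children 0, 1, and 0 again (dest_id modulo the source child count + * in spa_vdev_copy_one_child() above), so every new side ends up with + * a complete copy even though the child counts differ.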
+ */ +static int +spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs, + uint64_t maxalloc, uint64_t txg, + vdev_copy_arg_t *vca, zio_alloc_list_t *zal) +{ + metaslab_group_t *mg = vd->vdev_mg; + spa_t *spa = vd->vdev_spa; + spa_vdev_removal_t *svr = spa->spa_vdev_removal; + vdev_indirect_mapping_entry_t *entry; + dva_t dst = { 0 }; + uint64_t start = range_tree_min(segs); + + ASSERT3U(maxalloc, <=, SPA_MAXBLOCKSIZE); + + uint64_t size = range_tree_span(segs); + if (range_tree_span(segs) > maxalloc) { + /* + * We can't allocate all the segments. Prefer to end + * the allocation at the end of a segment, thus avoiding + * additional split blocks. + */ + range_seg_t search; + avl_index_t where; + search.rs_start = start + maxalloc; + search.rs_end = search.rs_start; + range_seg_t *rs = avl_find(&segs->rt_root, &search, &where); + if (rs == NULL) { + rs = avl_nearest(&segs->rt_root, where, AVL_BEFORE); + } else { + rs = AVL_PREV(&segs->rt_root, rs); + } + if (rs != NULL) { + size = rs->rs_end - start; + } else { + /* + * There are no segments that end before maxalloc. + * I.e. the first segment is larger than maxalloc, + * so we must split it. + */ + size = maxalloc; + } + } + ASSERT3U(size, <=, maxalloc); + + /* + * We use allocator 0 for this I/O because we don't expect device remap + * to be the steady state of the system, so parallelizing is not as + * critical as it is for other allocation types. We also want to ensure + * that the IOs are allocated together as much as possible, to reduce + * mapping sizes. + */ + int error = metaslab_alloc_dva(spa, mg->mg_class, size, + &dst, 0, NULL, txg, 0, zal, 0); + if (error != 0) + return (error); + + /* + * Determine the ranges that are not actually needed. Offsets are + * relative to the start of the range to be copied (i.e. relative to the + * local variable "start"). + */ + range_tree_t *obsolete_segs = range_tree_create(NULL, NULL); + + range_seg_t *rs = avl_first(&segs->rt_root); + ASSERT3U(rs->rs_start, ==, start); + uint64_t prev_seg_end = rs->rs_end; + while ((rs = AVL_NEXT(&segs->rt_root, rs)) != NULL) { + if (rs->rs_start >= start + size) { + break; + } else { + range_tree_add(obsolete_segs, + prev_seg_end - start, + rs->rs_start - prev_seg_end); + } + prev_seg_end = rs->rs_end; + } + /* We don't end in the middle of an obsolete range */ + ASSERT3U(start + size, <=, prev_seg_end); + + range_tree_clear(segs, start, size); + + /* + * We can't have any padding of the allocated size, otherwise we will + * misunderstand what's allocated, and the size of the mapping. + * The caller ensures this will be true by passing in a size that is + * aligned to the worst (highest) ashift in the pool. + */ + ASSERT3U(DVA_GET_ASIZE(&dst), ==, size); + + entry = kmem_zalloc(sizeof (vdev_indirect_mapping_entry_t), KM_SLEEP); + DVA_MAPPING_SET_SRC_OFFSET(&entry->vime_mapping, start); + entry->vime_mapping.vimep_dst = dst; + if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) { + entry->vime_obsolete_count = range_tree_space(obsolete_segs); + } + + vdev_copy_segment_arg_t *vcsa = kmem_zalloc(sizeof (*vcsa), KM_SLEEP); + vcsa->vcsa_dest_dva = &entry->vime_mapping.vimep_dst; + vcsa->vcsa_obsolete_segs = obsolete_segs; + vcsa->vcsa_spa = spa; + vcsa->vcsa_txg = txg; + + /* + * See comment before spa_vdev_copy_one_child(). 
+ */ + spa_config_enter(spa, SCL_STATE, spa, RW_READER); + zio_t *nzio = zio_null(spa->spa_txg_zio[txg & TXG_MASK], spa, NULL, + spa_vdev_copy_segment_done, vcsa, 0); + vdev_t *dest_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dst)); + if (dest_vd->vdev_ops == &vdev_mirror_ops) { + for (int i = 0; i < dest_vd->vdev_children; i++) { + vdev_t *child = dest_vd->vdev_child[i]; + spa_vdev_copy_one_child(vca, nzio, vd, start, + child, DVA_GET_OFFSET(&dst), i, size); + } + } else { + spa_vdev_copy_one_child(vca, nzio, vd, start, + dest_vd, DVA_GET_OFFSET(&dst), -1, size); + } + zio_nowait(nzio); + + list_insert_tail(&svr->svr_new_segments[txg & TXG_MASK], entry); + ASSERT3U(start + size, <=, vd->vdev_ms_count << vd->vdev_ms_shift); + vdev_dirty(vd, 0, NULL, txg); + + return (0); +} + +/* + * Complete the removal of a toplevel vdev. This is called as a + * synctask in the same txg that we will sync out the new config (to the + * MOS object) which indicates that this vdev is indirect. + */ +static void +vdev_remove_complete_sync(void *arg, dmu_tx_t *tx) +{ + spa_vdev_removal_t *svr = arg; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); + + ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); + + for (int i = 0; i < TXG_SIZE; i++) { + ASSERT0(svr->svr_bytes_done[i]); + } + + ASSERT3U(spa->spa_removing_phys.sr_copied, ==, + spa->spa_removing_phys.sr_to_copy); + + vdev_destroy_spacemaps(vd, tx); + + /* destroy leaf zaps, if any */ + ASSERT3P(svr->svr_zaplist, !=, NULL); + for (nvpair_t *pair = nvlist_next_nvpair(svr->svr_zaplist, NULL); + pair != NULL; + pair = nvlist_next_nvpair(svr->svr_zaplist, pair)) { + vdev_destroy_unlink_zap(vd, fnvpair_value_uint64(pair), tx); + } + fnvlist_free(svr->svr_zaplist); + + spa_finish_removal(dmu_tx_pool(tx)->dp_spa, DSS_FINISHED, tx); + /* vd->vdev_path is not available here */ + spa_history_log_internal(spa, "vdev remove completed", tx, + "%s vdev %llu", spa_name(spa), vd->vdev_id); +} + +static void +vdev_remove_enlist_zaps(vdev_t *vd, nvlist_t *zlist) +{ + ASSERT3P(zlist, !=, NULL); + ASSERT3P(vd->vdev_ops, !=, &vdev_raidz_ops); + + if (vd->vdev_leaf_zap != 0) { + char zkey[32]; + (void) snprintf(zkey, sizeof (zkey), "%s-%ju", + VDEV_REMOVAL_ZAP_OBJS, (uintmax_t)vd->vdev_leaf_zap); + fnvlist_add_uint64(zlist, zkey, vd->vdev_leaf_zap); + } + + for (uint64_t id = 0; id < vd->vdev_children; id++) { + vdev_remove_enlist_zaps(vd->vdev_child[id], zlist); + } +} + +static void +vdev_remove_replace_with_indirect(vdev_t *vd, uint64_t txg) +{ + vdev_t *ivd; + dmu_tx_t *tx; + spa_t *spa = vd->vdev_spa; + spa_vdev_removal_t *svr = spa->spa_vdev_removal; + + /* + * First, build a list of leaf zaps to be destroyed. + * This is passed to the sync context thread, + * which does the actual unlinking. + */ + svr->svr_zaplist = fnvlist_alloc(); + vdev_remove_enlist_zaps(vd, svr->svr_zaplist); + + ivd = vdev_add_parent(vd, &vdev_indirect_ops); + ivd->vdev_removing = 0; + + vd->vdev_leaf_zap = 0; + + vdev_remove_child(ivd, vd); + vdev_compact_children(ivd); + + ASSERT(!list_link_active(&vd->vdev_state_dirty_node)); + + tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); + dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_remove_complete_sync, svr, + 0, ZFS_SPACE_CHECK_NONE, tx); + dmu_tx_commit(tx); + + /* + * Indicate that this thread has exited. + * After this, we can not use svr. 
+ */ + mutex_enter(&svr->svr_lock); + svr->svr_thread = NULL; + cv_broadcast(&svr->svr_cv); + mutex_exit(&svr->svr_lock); +} + +/* + * Complete the removal of a toplevel vdev. This is called in open + * context by the removal thread after we have copied all vdev's data. + */ +static void +vdev_remove_complete(spa_t *spa) +{ + uint64_t txg; + + /* + * Wait for any deferred frees to be synced before we call + * vdev_metaslab_fini() + */ + txg_wait_synced(spa->spa_dsl_pool, 0); + txg = spa_vdev_enter(spa); + vdev_t *vd = vdev_lookup_top(spa, spa->spa_vdev_removal->svr_vdev_id); + ASSERT3P(vd->vdev_initialize_thread, ==, NULL); + + sysevent_t *ev = spa_event_create(spa, vd, NULL, + ESC_ZFS_VDEV_REMOVE_DEV); + + zfs_dbgmsg("finishing device removal for vdev %llu in txg %llu", + vd->vdev_id, txg); + + /* + * Discard allocation state. + */ + if (vd->vdev_mg != NULL) { + vdev_metaslab_fini(vd); + metaslab_group_destroy(vd->vdev_mg); + vd->vdev_mg = NULL; + } + ASSERT0(vd->vdev_stat.vs_space); + ASSERT0(vd->vdev_stat.vs_dspace); + + vdev_remove_replace_with_indirect(vd, txg); + + /* + * We now release the locks, allowing spa_sync to run and finish the + * removal via vdev_remove_complete_sync in syncing context. + * + * Note that we hold on to the vdev_t that has been replaced. Since + * it isn't part of the vdev tree any longer, it can't be concurrently + * manipulated, even while we don't have the config lock. + */ + (void) spa_vdev_exit(spa, NULL, txg, 0); + + /* + * Top ZAP should have been transferred to the indirect vdev in + * vdev_remove_replace_with_indirect. + */ + ASSERT0(vd->vdev_top_zap); + + /* + * Leaf ZAP should have been moved in vdev_remove_replace_with_indirect. + */ + ASSERT0(vd->vdev_leaf_zap); + + txg = spa_vdev_enter(spa); + (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); + /* + * Request to update the config and the config cachefile. + */ + vdev_config_dirty(spa->spa_root_vdev); + (void) spa_vdev_exit(spa, vd, txg, 0); + + spa_event_post(ev); +} + +/* + * Evacuates a segment of size at most max_alloc from the vdev + * via repeated calls to spa_vdev_copy_segment. If an allocation + * fails, the pool is probably too fragmented to handle such a + * large size, so decrease max_alloc so that the caller will not try + * this size again this txg. + */ +static void +spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr, vdev_copy_arg_t *vca, + uint64_t *max_alloc, dmu_tx_t *tx) +{ + uint64_t txg = dmu_tx_get_txg(tx); + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + + mutex_enter(&svr->svr_lock); + + /* + * Determine how big of a chunk to copy. We can allocate up + * to max_alloc bytes, and we can span up to vdev_removal_max_span + * bytes of unallocated space at a time. "segs" will track the + * allocated segments that we are copying. We may also be copying + * free segments (of up to vdev_removal_max_span bytes). + */ + range_tree_t *segs = range_tree_create(NULL, NULL); + for (;;) { + range_seg_t *rs = avl_first(&svr->svr_allocd_segs->rt_root); + if (rs == NULL) + break; + + uint64_t seg_length; + + if (range_tree_is_empty(segs)) { + /* need to truncate the first seg based on max_alloc */ + seg_length = + MIN(rs->rs_end - rs->rs_start, *max_alloc); + } else { + if (rs->rs_start - range_tree_max(segs) > + vdev_removal_max_span) { + /* + * Including this segment would cause us to + * copy a larger unneeded chunk than is allowed. + */ + break; + } else if (rs->rs_end - range_tree_min(segs) > + *max_alloc) { + /* + * This additional segment would extend past + * max_alloc. 
Rather than splitting this + * segment, leave it for the next mapping. + */ + break; + } else { + seg_length = rs->rs_end - rs->rs_start; + } + } + + range_tree_add(segs, rs->rs_start, seg_length); + range_tree_remove(svr->svr_allocd_segs, + rs->rs_start, seg_length); + } + + if (range_tree_is_empty(segs)) { + mutex_exit(&svr->svr_lock); + range_tree_destroy(segs); + return; + } + + if (svr->svr_max_offset_to_sync[txg & TXG_MASK] == 0) { + dsl_sync_task_nowait(dmu_tx_pool(tx), vdev_mapping_sync, + svr, 0, ZFS_SPACE_CHECK_NONE, tx); + } + + svr->svr_max_offset_to_sync[txg & TXG_MASK] = range_tree_max(segs); + + /* + * Note: this is the amount of *allocated* space + * that we are taking care of each txg. + */ + svr->svr_bytes_done[txg & TXG_MASK] += range_tree_space(segs); + + mutex_exit(&svr->svr_lock); + + zio_alloc_list_t zal; + metaslab_trace_init(&zal); + uint64_t thismax = SPA_MAXBLOCKSIZE; + while (!range_tree_is_empty(segs)) { + int error = spa_vdev_copy_segment(vd, + segs, thismax, txg, vca, &zal); + + if (error == ENOSPC) { + /* + * Cut our segment in half, and don't try this + * segment size again this txg. Note that the + * allocation size must be aligned to the highest + * ashift in the pool, so that the allocation will + * not be padded out to a multiple of the ashift, + * which could cause us to think that this mapping + * is larger than we intended. + */ + ASSERT3U(spa->spa_max_ashift, >=, SPA_MINBLOCKSHIFT); + ASSERT3U(spa->spa_max_ashift, ==, spa->spa_min_ashift); + uint64_t attempted = + MIN(range_tree_span(segs), thismax); + thismax = P2ROUNDUP(attempted / 2, + 1 << spa->spa_max_ashift); + /* + * The minimum-size allocation can not fail. + */ + ASSERT3U(attempted, >, 1 << spa->spa_max_ashift); + *max_alloc = attempted - (1 << spa->spa_max_ashift); + } else { + ASSERT0(error); + + /* + * We've performed an allocation, so reset the + * alloc trace list. + */ + metaslab_trace_fini(&zal); + metaslab_trace_init(&zal); + } + } + metaslab_trace_fini(&zal); + range_tree_destroy(segs); +} + +/* + * The removal thread operates in open context. It iterates over all + * allocated space in the vdev, by loading each metaslab's spacemap. + * For each contiguous segment of allocated space (capping the segment + * size at SPA_MAXBLOCKSIZE), we: + * - Allocate space for it on another vdev. + * - Create a new mapping from the old location to the new location + * (as a record in svr_new_segments). + * - Initiate a logical read zio to get the data off the removing disk. + * - In the read zio's done callback, initiate a logical write zio to + * write it to the new vdev. + * Note that all of this will take effect when a particular TXG syncs. + * The sync thread ensures that all the phys reads and writes for the syncing + * TXG have completed (see spa_txg_zio) and writes the new mappings to disk + * (see vdev_mapping_sync()). 
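+ * + * A worked example of the ENOSPC backoff in spa_vdev_copy_impl() + * above (illustrative numbers): if a 1MB allocation fails on a pool + * with ashift 12, the segment is retried at P2ROUNDUP(512KB, 4KB) = + * 512KB, and *max_alloc drops to 1MB - 4KB so that no later chunk in + * this txg re-attempts the size that just failed.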
+ */ +static void +spa_vdev_remove_thread(void *arg) +{ + spa_t *spa = arg; + spa_vdev_removal_t *svr = spa->spa_vdev_removal; + vdev_copy_arg_t vca; + uint64_t max_alloc = zfs_remove_max_segment; + uint64_t last_txg = 0; + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); + vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; + uint64_t start_offset = vdev_indirect_mapping_max_offset(vim); + + ASSERT3P(vd->vdev_ops, !=, &vdev_indirect_ops); + ASSERT(vdev_is_concrete(vd)); + ASSERT(vd->vdev_removing); + ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0); + ASSERT(vim != NULL); + + mutex_init(&vca.vca_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&vca.vca_cv, NULL, CV_DEFAULT, NULL); + vca.vca_outstanding_bytes = 0; + + mutex_enter(&svr->svr_lock); + + /* + * Start from vim_max_offset so we pick up where we left off + * if we are restarting the removal after opening the pool. + */ + uint64_t msi; + for (msi = start_offset >> vd->vdev_ms_shift; + msi < vd->vdev_ms_count && !svr->svr_thread_exit; msi++) { + metaslab_t *msp = vd->vdev_ms[msi]; + ASSERT3U(msi, <=, vd->vdev_ms_count); + + ASSERT0(range_tree_space(svr->svr_allocd_segs)); + + mutex_enter(&msp->ms_sync_lock); + mutex_enter(&msp->ms_lock); + + /* + * Assert nothing in flight -- ms_*tree is empty. + */ + for (int i = 0; i < TXG_SIZE; i++) { + ASSERT0(range_tree_space(msp->ms_allocating[i])); + } + + /* + * If the metaslab has ever been allocated from (ms_sm!=NULL), + * read the allocated segments from the space map object + * into svr_allocd_segs. Since we do this while holding + * svr_lock and ms_sync_lock, concurrent frees (which + * would have modified the space map) will wait for us + * to finish loading the spacemap, and then take the + * appropriate action (see free_from_removing_vdev()). + */ + if (msp->ms_sm != NULL) { + space_map_t *sm = NULL; + + /* + * We have to open a new space map here, because + * ms_sm's sm_length and sm_alloc may not reflect + * what's in the object contents, if we are in between + * metaslab_sync() and metaslab_sync_done(). + */ + VERIFY0(space_map_open(&sm, + spa->spa_dsl_pool->dp_meta_objset, + msp->ms_sm->sm_object, msp->ms_sm->sm_start, + msp->ms_sm->sm_size, msp->ms_sm->sm_shift)); + space_map_update(sm); + VERIFY0(space_map_load(sm, svr->svr_allocd_segs, + SM_ALLOC)); + space_map_close(sm); + + range_tree_walk(msp->ms_freeing, + range_tree_remove, svr->svr_allocd_segs); + + /* + * When we are resuming from a paused removal (i.e. + * when importing a pool with a removal in progress), + * discard any state that we have already processed. + */ + range_tree_clear(svr->svr_allocd_segs, 0, start_offset); + } + mutex_exit(&msp->ms_lock); + mutex_exit(&msp->ms_sync_lock); + + vca.vca_msp = msp; + zfs_dbgmsg("copying %llu segments for metaslab %llu", + avl_numnodes(&svr->svr_allocd_segs->rt_root), + msp->ms_id); + + while (!svr->svr_thread_exit && + !range_tree_is_empty(svr->svr_allocd_segs)) { + + mutex_exit(&svr->svr_lock); + + /* + * We need to periodically drop the config lock so that + * writers can get in. Additionally, we can't wait + * for a txg to sync while holding a config lock + * (since a waiting writer could cause a 3-way deadlock + * with the sync thread, which also gets a config + * lock for reader). So we can't hold the config lock + * while calling dmu_tx_assign(). + */ + spa_config_exit(spa, SCL_CONFIG, FTAG); + + /* + * This delay will pause the removal around the point + * specified by zfs_remove_max_bytes_pause. 
We do this + * solely from the test suite or during debugging. + */ + uint64_t bytes_copied = + spa->spa_removing_phys.sr_copied; + for (int i = 0; i < TXG_SIZE; i++) + bytes_copied += svr->svr_bytes_done[i]; + while (zfs_remove_max_bytes_pause <= bytes_copied && + !svr->svr_thread_exit) + delay(hz); + + mutex_enter(&vca.vca_lock); + while (vca.vca_outstanding_bytes > + zfs_remove_max_copy_bytes) { + cv_wait(&vca.vca_cv, &vca.vca_lock); + } + mutex_exit(&vca.vca_lock); + + dmu_tx_t *tx = + dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); + + VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + uint64_t txg = dmu_tx_get_txg(tx); + + /* + * Reacquire the vdev_config lock. The vdev_t + * that we're removing may have changed, e.g. due + * to a vdev_attach or vdev_detach. + */ + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + vd = vdev_lookup_top(spa, svr->svr_vdev_id); + + if (txg != last_txg) + max_alloc = zfs_remove_max_segment; + last_txg = txg; + + spa_vdev_copy_impl(vd, svr, &vca, &max_alloc, tx); + + dmu_tx_commit(tx); + mutex_enter(&svr->svr_lock); + } + } + + mutex_exit(&svr->svr_lock); + + spa_config_exit(spa, SCL_CONFIG, FTAG); + + /* + * Wait for all copies to finish before cleaning up the vca. + */ + txg_wait_synced(spa->spa_dsl_pool, 0); + ASSERT0(vca.vca_outstanding_bytes); + + mutex_destroy(&vca.vca_lock); + cv_destroy(&vca.vca_cv); + + if (svr->svr_thread_exit) { + mutex_enter(&svr->svr_lock); + range_tree_vacate(svr->svr_allocd_segs, NULL, NULL); + svr->svr_thread = NULL; + cv_broadcast(&svr->svr_cv); + mutex_exit(&svr->svr_lock); + } else { + ASSERT0(range_tree_space(svr->svr_allocd_segs)); + vdev_remove_complete(spa); + } + thread_exit(); +} + +void +spa_vdev_remove_suspend(spa_t *spa) +{ + spa_vdev_removal_t *svr = spa->spa_vdev_removal; + + if (svr == NULL) + return; + + mutex_enter(&svr->svr_lock); + svr->svr_thread_exit = B_TRUE; + while (svr->svr_thread != NULL) + cv_wait(&svr->svr_cv, &svr->svr_lock); + svr->svr_thread_exit = B_FALSE; + mutex_exit(&svr->svr_lock); +} + +/* ARGSUSED */ +static int +spa_vdev_remove_cancel_check(void *arg, dmu_tx_t *tx) +{ + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + + if (spa->spa_vdev_removal == NULL) + return (ESRCH); + return (0); +} + +/* + * Cancel a removal by freeing all entries from the partial mapping + * and marking the vdev as no longer being removing. 
+ */ +/* ARGSUSED */ +static void +spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx) +{ + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + spa_vdev_removal_t *svr = spa->spa_vdev_removal; + vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); + vdev_indirect_config_t *vic = &vd->vdev_indirect_config; + vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; + objset_t *mos = spa->spa_meta_objset; + + ASSERT3P(svr->svr_thread, ==, NULL); + + spa_feature_decr(spa, SPA_FEATURE_DEVICE_REMOVAL, tx); + if (vdev_obsolete_counts_are_precise(vd)) { + spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); + VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap, + VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, tx)); + } + + if (vdev_obsolete_sm_object(vd) != 0) { + ASSERT(vd->vdev_obsolete_sm != NULL); + ASSERT3U(vdev_obsolete_sm_object(vd), ==, + space_map_object(vd->vdev_obsolete_sm)); + + space_map_free(vd->vdev_obsolete_sm, tx); + VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap, + VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, tx)); + space_map_close(vd->vdev_obsolete_sm); + vd->vdev_obsolete_sm = NULL; + spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); + } + for (int i = 0; i < TXG_SIZE; i++) { + ASSERT(list_is_empty(&svr->svr_new_segments[i])); + ASSERT3U(svr->svr_max_offset_to_sync[i], <=, + vdev_indirect_mapping_max_offset(vim)); + } + + for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) { + metaslab_t *msp = vd->vdev_ms[msi]; + + if (msp->ms_start >= vdev_indirect_mapping_max_offset(vim)) + break; + + ASSERT0(range_tree_space(svr->svr_allocd_segs)); + + mutex_enter(&msp->ms_lock); + + /* + * Assert nothing in flight -- ms_*tree is empty. + */ + for (int i = 0; i < TXG_SIZE; i++) + ASSERT0(range_tree_space(msp->ms_allocating[i])); + for (int i = 0; i < TXG_DEFER_SIZE; i++) + ASSERT0(range_tree_space(msp->ms_defer[i])); + ASSERT0(range_tree_space(msp->ms_freed)); + + if (msp->ms_sm != NULL) { + /* + * Assert that the in-core spacemap has the same + * length as the on-disk one, so we can use the + * existing in-core spacemap to load it from disk. + */ + ASSERT3U(msp->ms_sm->sm_alloc, ==, + msp->ms_sm->sm_phys->smp_alloc); + ASSERT3U(msp->ms_sm->sm_length, ==, + msp->ms_sm->sm_phys->smp_objsize); + + mutex_enter(&svr->svr_lock); + VERIFY0(space_map_load(msp->ms_sm, + svr->svr_allocd_segs, SM_ALLOC)); + range_tree_walk(msp->ms_freeing, + range_tree_remove, svr->svr_allocd_segs); + + /* + * Clear everything past what has been synced, + * because we have not allocated mappings for it yet. + */ + uint64_t syncd = vdev_indirect_mapping_max_offset(vim); + uint64_t sm_end = msp->ms_sm->sm_start + + msp->ms_sm->sm_size; + if (sm_end > syncd) + range_tree_clear(svr->svr_allocd_segs, + syncd, sm_end - syncd); + + mutex_exit(&svr->svr_lock); + } + mutex_exit(&msp->ms_lock); + + mutex_enter(&svr->svr_lock); + range_tree_vacate(svr->svr_allocd_segs, + free_mapped_segment_cb, vd); + mutex_exit(&svr->svr_lock); + } + + /* + * Note: this must happen after we invoke free_mapped_segment_cb, + * because it adds to the obsolete_segments. 
+ */ + range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL); + + ASSERT3U(vic->vic_mapping_object, ==, + vdev_indirect_mapping_object(vd->vdev_indirect_mapping)); + vdev_indirect_mapping_close(vd->vdev_indirect_mapping); + vd->vdev_indirect_mapping = NULL; + vdev_indirect_mapping_free(mos, vic->vic_mapping_object, tx); + vic->vic_mapping_object = 0; + + ASSERT3U(vic->vic_births_object, ==, + vdev_indirect_births_object(vd->vdev_indirect_births)); + vdev_indirect_births_close(vd->vdev_indirect_births); + vd->vdev_indirect_births = NULL; + vdev_indirect_births_free(mos, vic->vic_births_object, tx); + vic->vic_births_object = 0; + + /* + * We may have processed some frees from the removing vdev in this + * txg, thus increasing svr_bytes_done; discard that here to + * satisfy the assertions in spa_vdev_removal_destroy(). + * Note that future txg's can not have any bytes_done, because + * future TXG's are only modified from open context, and we have + * already shut down the copying thread. + */ + svr->svr_bytes_done[dmu_tx_get_txg(tx) & TXG_MASK] = 0; + spa_finish_removal(spa, DSS_CANCELED, tx); + + vd->vdev_removing = B_FALSE; + vdev_config_dirty(vd); + + zfs_dbgmsg("canceled device removal for vdev %llu in %llu", + vd->vdev_id, dmu_tx_get_txg(tx)); + spa_history_log_internal(spa, "vdev remove canceled", tx, + "%s vdev %llu %s", spa_name(spa), + vd->vdev_id, (vd->vdev_path != NULL) ? vd->vdev_path : "-"); +} + +int +spa_vdev_remove_cancel(spa_t *spa) +{ + spa_vdev_remove_suspend(spa); + + if (spa->spa_vdev_removal == NULL) + return (ESRCH); + + uint64_t vdid = spa->spa_vdev_removal->svr_vdev_id; + + int error = dsl_sync_task(spa->spa_name, spa_vdev_remove_cancel_check, + spa_vdev_remove_cancel_sync, NULL, 0, + ZFS_SPACE_CHECK_EXTRA_RESERVED); + + if (error == 0) { + spa_config_enter(spa, SCL_ALLOC | SCL_VDEV, FTAG, RW_WRITER); + vdev_t *vd = vdev_lookup_top(spa, vdid); + metaslab_group_activate(vd->vdev_mg); + spa_config_exit(spa, SCL_ALLOC | SCL_VDEV, FTAG); + } + + return (error); +} + +/* + * Called every sync pass of every txg if there's a svr. + */ +void +svr_sync(spa_t *spa, dmu_tx_t *tx) +{ + spa_vdev_removal_t *svr = spa->spa_vdev_removal; + int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; + + /* + * This check is necessary so that we do not dirty the + * DIRECTORY_OBJECT via spa_sync_removing_state() when there + * is nothing to do. Dirtying it every time would prevent us + * from syncing-to-convergence. + */ + if (svr->svr_bytes_done[txgoff] == 0) + return; + + /* + * Update progress accounting. + */ + spa->spa_removing_phys.sr_copied += svr->svr_bytes_done[txgoff]; + svr->svr_bytes_done[txgoff] = 0; + + spa_sync_removing_state(spa, tx); +} + +static void +vdev_remove_make_hole_and_free(vdev_t *vd) +{ + uint64_t id = vd->vdev_id; + spa_t *spa = vd->vdev_spa; + vdev_t *rvd = spa->spa_root_vdev; + boolean_t last_vdev = (id == (rvd->vdev_children - 1)); + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); + + vdev_free(vd); + + if (last_vdev) { + vdev_compact_children(rvd); + } else { + vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops); + vdev_add_child(rvd, vd); + } + vdev_config_dirty(rvd); + + /* + * Reassess the health of our root vdev. + */ + vdev_reopen(rvd); +} + +/* + * Remove a log device. The config lock is held for the specified TXG. 
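+ *
+ * Editorial aside, not part of the imported source: svr_sync() above
+ * indexes svr_bytes_done[] with (txg & TXG_MASK).  Because TXG_SIZE
+ * transaction groups can be in flight at once, a small power-of-two
+ * ring indexed by the low bits of the txg number gives each open txg
+ * its own accumulator.  A hedged, generic sketch of the idiom (the
+ * ex_* names and the size of 4 are illustrative):
+ */
+#if 0	/* illustrative sketch, never compiled */
+#include <stdint.h>
+
+#define	EX_TXG_SIZE	4		/* must be a power of two */
+#define	EX_TXG_MASK	(EX_TXG_SIZE - 1)
+
+static uint64_t ex_bytes_done[EX_TXG_SIZE];
+
+/* Open context: account work against the txg it was assigned to. */
+static void
+ex_note_bytes(uint64_t txg, uint64_t nbytes)
+{
+	ex_bytes_done[txg & EX_TXG_MASK] += nbytes;
+}
+
+/* Syncing context: fold this txg's slot into the durable total. */
+static uint64_t
+ex_fold_bytes(uint64_t txg)
+{
+	uint64_t n = ex_bytes_done[txg & EX_TXG_MASK];
+
+	ex_bytes_done[txg & EX_TXG_MASK] = 0;
+	return (n);
+}
+#endif
+/*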
+ */ +static int +spa_vdev_remove_log(vdev_t *vd, uint64_t *txg) +{ + metaslab_group_t *mg = vd->vdev_mg; + spa_t *spa = vd->vdev_spa; + int error = 0; + + ASSERT(vd->vdev_islog); + ASSERT(vd == vd->vdev_top); + + /* + * Stop allocating from this vdev. + */ + metaslab_group_passivate(mg); + + /* + * Wait for the youngest allocations and frees to sync, + * and then wait for the deferral of those frees to finish. + */ + spa_vdev_config_exit(spa, NULL, + *txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); + + /* + * Evacuate the device. We don't hold the config lock as writer + * since we need to do I/O but we do keep the + * spa_namespace_lock held. Once this completes the device + * should no longer have any blocks allocated on it. + */ + if (vd->vdev_islog) { + if (vd->vdev_stat.vs_alloc != 0) + error = spa_reset_logs(spa); + } + + *txg = spa_vdev_config_enter(spa); + + if (error != 0) { + metaslab_group_activate(mg); + return (error); + } + ASSERT0(vd->vdev_stat.vs_alloc); + + /* + * The evacuation succeeded. Remove any remaining MOS metadata + * associated with this vdev, and wait for these changes to sync. + */ + vd->vdev_removing = B_TRUE; + + vdev_dirty_leaves(vd, VDD_DTL, *txg); + vdev_config_dirty(vd); + + spa_history_log_internal(spa, "vdev remove", NULL, + "%s vdev %llu (log) %s", spa_name(spa), vd->vdev_id, + (vd->vdev_path != NULL) ? vd->vdev_path : "-"); + + /* Make sure these changes are sync'ed */ + spa_vdev_config_exit(spa, NULL, *txg, 0, FTAG); + + /* Stop initializing */ + (void) vdev_initialize_stop_all(vd, VDEV_INITIALIZE_CANCELED); + + *txg = spa_vdev_config_enter(spa); + + sysevent_t *ev = spa_event_create(spa, vd, NULL, + ESC_ZFS_VDEV_REMOVE_DEV); + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); + + /* The top ZAP should have been destroyed by vdev_remove_empty. */ + ASSERT0(vd->vdev_top_zap); + /* The leaf ZAP should have been destroyed by vdev_dtl_sync. */ + ASSERT0(vd->vdev_leaf_zap); + + (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); + + if (list_link_active(&vd->vdev_state_dirty_node)) + vdev_state_clean(vd); + if (list_link_active(&vd->vdev_config_dirty_node)) + vdev_config_clean(vd); + + /* + * Clean up the vdev namespace. + */ + vdev_remove_make_hole_and_free(vd); + + if (ev != NULL) + spa_event_post(ev); + + return (0); +} + +static int +spa_vdev_remove_top_check(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + + if (vd != vd->vdev_top) + return (SET_ERROR(ENOTSUP)); + + if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REMOVAL)) + return (SET_ERROR(ENOTSUP)); + + /* + * There has to be enough free space to remove the + * device and leave double the "slop" space (i.e. we + * must leave at least 3% of the pool free, in addition to + * the normal slop space). + */ + if (dsl_dir_space_available(spa->spa_dsl_pool->dp_root_dir, + NULL, 0, B_TRUE) < + vd->vdev_stat.vs_dspace + spa_get_slop_space(spa)) { + return (SET_ERROR(ENOSPC)); + } + + /* + * There can not be a removal in progress. + */ + if (spa->spa_removing_phys.sr_state == DSS_SCANNING) + return (SET_ERROR(EBUSY)); + + /* + * The device must have all its data. + */ + if (!vdev_dtl_empty(vd, DTL_MISSING) || + !vdev_dtl_empty(vd, DTL_OUTAGE)) + return (SET_ERROR(EBUSY)); + + /* + * The device must be healthy. + */ + if (!vdev_readable(vd)) + return (SET_ERROR(EIO)); + + /* + * All vdevs in normal class must have the same ashift. 
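+ * (Editorial note: segments copied off the removed vdev keep their
+ * original size and alignment when remapped, so every potential
+ * destination must honor the same minimum allocation size.)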
+ */ + if (spa->spa_max_ashift != spa->spa_min_ashift) { + return (SET_ERROR(EINVAL)); + } + + /* + * All vdevs in normal class must have the same ashift + * and not be raidz. + */ + vdev_t *rvd = spa->spa_root_vdev; + int num_indirect = 0; + for (uint64_t id = 0; id < rvd->vdev_children; id++) { + vdev_t *cvd = rvd->vdev_child[id]; + if (cvd->vdev_ashift != 0 && !cvd->vdev_islog) + ASSERT3U(cvd->vdev_ashift, ==, spa->spa_max_ashift); + if (cvd->vdev_ops == &vdev_indirect_ops) + num_indirect++; + if (!vdev_is_concrete(cvd)) + continue; + if (cvd->vdev_ops == &vdev_raidz_ops) + return (SET_ERROR(EINVAL)); + /* + * Need the mirror to be mirror of leaf vdevs only + */ + if (cvd->vdev_ops == &vdev_mirror_ops) { + for (uint64_t cid = 0; + cid < cvd->vdev_children; cid++) { + vdev_t *tmp = cvd->vdev_child[cid]; + if (!tmp->vdev_ops->vdev_op_leaf) + return (SET_ERROR(EINVAL)); + } + } + } + + return (0); +} + +/* + * Initiate removal of a top-level vdev, reducing the total space in the pool. + * The config lock is held for the specified TXG. Once initiated, + * evacuation of all allocated space (copying it to other vdevs) happens + * in the background (see spa_vdev_remove_thread()), and can be canceled + * (see spa_vdev_remove_cancel()). If successful, the vdev will + * be transformed to an indirect vdev (see spa_vdev_remove_complete()). + */ +static int +spa_vdev_remove_top(vdev_t *vd, uint64_t *txg) +{ + spa_t *spa = vd->vdev_spa; + int error; + + /* + * Check for errors up-front, so that we don't waste time + * passivating the metaslab group and clearing the ZIL if there + * are errors. + */ + error = spa_vdev_remove_top_check(vd); + if (error != 0) + return (error); + + /* + * Stop allocating from this vdev. Note that we must check + * that this is not the only device in the pool before + * passivating, otherwise we will not be able to make + * progress because we can't allocate from any vdevs. + * The above check for sufficient free space serves this + * purpose. + */ + metaslab_group_t *mg = vd->vdev_mg; + metaslab_group_passivate(mg); + + /* + * Wait for the youngest allocations and frees to sync, + * and then wait for the deferral of those frees to finish. + */ + spa_vdev_config_exit(spa, NULL, + *txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); + + /* + * We must ensure that no "stubby" log blocks are allocated + * on the device to be removed. These blocks could be + * written at any time, including while we are in the middle + * of copying them. + */ + error = spa_reset_logs(spa); + + /* + * We stop any initializing that is currently in progress but leave + * the state as "active". This will allow the initializing to resume + * if the removal is canceled sometime later. + */ + vdev_initialize_stop_all(vd, VDEV_INITIALIZE_ACTIVE); + + *txg = spa_vdev_config_enter(spa); + + /* + * Things might have changed while the config lock was dropped + * (e.g. space usage). Check for errors again. + */ + if (error == 0) + error = spa_vdev_remove_top_check(vd); + + if (error != 0) { + metaslab_group_activate(mg); + spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART); + return (error); + } + + vd->vdev_removing = B_TRUE; + + vdev_dirty_leaves(vd, VDD_DTL, *txg); + vdev_config_dirty(vd); + dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, *txg); + dsl_sync_task_nowait(spa->spa_dsl_pool, + vdev_remove_initiate_sync, + (void *)(uintptr_t)vd->vdev_id, 0, ZFS_SPACE_CHECK_NONE, tx); + dmu_tx_commit(tx); + + return (0); +} + +/* + * Remove a device from the pool. 
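+ *
+ * Editorial aside, not part of the imported source: note the shape of
+ * spa_vdev_remove_top() above: check cheaply up front, passivate,
+ * drop the lock to wait out in-flight txgs, reacquire, then re-run
+ * the same check because the world may have changed in the meantime.
+ * A hedged skeleton of that idiom (the ex_* types and helpers are
+ * invented for illustration):
+ */
+#if 0	/* illustrative sketch, never compiled */
+struct ex_pool;
+
+extern void ex_lock(struct ex_pool *);
+extern void ex_unlock(struct ex_pool *);
+extern void ex_wait_for_inflight(struct ex_pool *);
+extern int ex_check_preconditions(struct ex_pool *);
+extern void ex_passivate(struct ex_pool *);
+extern void ex_reactivate(struct ex_pool *);
+extern int ex_begin_removal(struct ex_pool *);
+
+static int
+ex_remove_with_recheck(struct ex_pool *p)
+{
+	int err = ex_check_preconditions(p);	/* cheap, up front */
+
+	if (err != 0)
+		return (err);
+
+	ex_passivate(p);	/* stop new allocations */
+	ex_unlock(p);		/* attach/detach may run while unlocked */
+	ex_wait_for_inflight(p);
+	ex_lock(p);
+
+	/* The first answer may be stale: re-validate before committing. */
+	err = ex_check_preconditions(p);
+	if (err != 0) {
+		ex_reactivate(p);	/* undo; leave the pool unchanged */
+		return (err);
+	}
+	return (ex_begin_removal(p));
+}
+#endif
+/*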
+ * + * Removing a device from the vdev namespace requires several steps + * and can take a significant amount of time. As a result we use + * the spa_vdev_config_[enter/exit] functions which allow us to + * grab and release the spa_config_lock while still holding the namespace + * lock. During each step the configuration is synced out. + */ +int +spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) +{ + vdev_t *vd; + nvlist_t **spares, **l2cache, *nv; + uint64_t txg = 0; + uint_t nspares, nl2cache; + int error = 0; + boolean_t locked = MUTEX_HELD(&spa_namespace_lock); + sysevent_t *ev = NULL; + + ASSERT(spa_writeable(spa)); + + if (!locked) + txg = spa_vdev_enter(spa); + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { + error = (spa_has_checkpoint(spa)) ? + ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; + + if (!locked) + return (spa_vdev_exit(spa, NULL, txg, error)); + + return (error); + } + + vd = spa_lookup_by_guid(spa, guid, B_FALSE); + + if (spa->spa_spares.sav_vdevs != NULL && + nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, + ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 && + (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) { + /* + * Only remove the hot spare if it's not currently in use + * in this pool. + */ + if (vd == NULL || unspare) { + char *nvstr = fnvlist_lookup_string(nv, + ZPOOL_CONFIG_PATH); + spa_history_log_internal(spa, "vdev remove", NULL, + "%s vdev (%s) %s", spa_name(spa), + VDEV_TYPE_SPARE, nvstr); + if (vd == NULL) + vd = spa_lookup_by_guid(spa, guid, B_TRUE); + ev = spa_event_create(spa, vd, NULL, + ESC_ZFS_VDEV_REMOVE_AUX); + spa_vdev_remove_aux(spa->spa_spares.sav_config, + ZPOOL_CONFIG_SPARES, spares, nspares, nv); + spa_load_spares(spa); + spa->spa_spares.sav_sync = B_TRUE; + } else { + error = SET_ERROR(EBUSY); + } + } else if (spa->spa_l2cache.sav_vdevs != NULL && + nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, + ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 && + (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) { + char *nvstr = fnvlist_lookup_string(nv, ZPOOL_CONFIG_PATH); + spa_history_log_internal(spa, "vdev remove", NULL, + "%s vdev (%s) %s", spa_name(spa), VDEV_TYPE_L2CACHE, nvstr); + /* + * Cache devices can always be removed. + */ + vd = spa_lookup_by_guid(spa, guid, B_TRUE); + ev = spa_event_create(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE_AUX); + spa_vdev_remove_aux(spa->spa_l2cache.sav_config, + ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv); + spa_load_l2cache(spa); + spa->spa_l2cache.sav_sync = B_TRUE; + } else if (vd != NULL && vd->vdev_islog) { + ASSERT(!locked); + error = spa_vdev_remove_log(vd, &txg); + } else if (vd != NULL) { + ASSERT(!locked); + error = spa_vdev_remove_top(vd, &txg); + } else { + /* + * There is no vdev of any kind with the specified guid. 
+ */ + error = SET_ERROR(ENOENT); + } + + if (!locked) + error = spa_vdev_exit(spa, NULL, txg, error); + + if (ev != NULL) { + if (error != 0) { + spa_event_discard(ev); + } else { + spa_event_post(ev); + } + } + + return (error); +} + +int +spa_removal_get_stats(spa_t *spa, pool_removal_stat_t *prs) +{ + prs->prs_state = spa->spa_removing_phys.sr_state; + + if (prs->prs_state == DSS_NONE) + return (SET_ERROR(ENOENT)); + + prs->prs_removing_vdev = spa->spa_removing_phys.sr_removing_vdev; + prs->prs_start_time = spa->spa_removing_phys.sr_start_time; + prs->prs_end_time = spa->spa_removing_phys.sr_end_time; + prs->prs_to_copy = spa->spa_removing_phys.sr_to_copy; + prs->prs_copied = spa->spa_removing_phys.sr_copied; + + if (spa->spa_vdev_removal != NULL) { + for (int i = 0; i < TXG_SIZE; i++) { + prs->prs_copied += + spa->spa_vdev_removal->svr_bytes_done[i]; + } + } + + prs->prs_mapping_memory = 0; + uint64_t indirect_vdev_id = + spa->spa_removing_phys.sr_prev_indirect_vdev; + while (indirect_vdev_id != -1) { + vdev_t *vd = spa->spa_root_vdev->vdev_child[indirect_vdev_id]; + vdev_indirect_config_t *vic = &vd->vdev_indirect_config; + vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; + + ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); + prs->prs_mapping_memory += vdev_indirect_mapping_size(vim); + indirect_vdev_id = vic->vic_prev_indirect_vdev; + } + + return (0); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c new file mode 100644 index 000000000000..a03d18704dfc --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c @@ -0,0 +1,157 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 2012, 2016 by Delphix. All rights reserved. + */ + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/vdev_impl.h> +#include <sys/zio.h> +#include <sys/fs/zfs.h> + +/* + * Virtual device vector for the pool's root vdev. + */ + +static uint64_t +vdev_root_core_tvds(vdev_t *vd) +{ + uint64_t tvds = 0; + + for (uint64_t c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + if (!cvd->vdev_ishole && !cvd->vdev_islog && + cvd->vdev_ops != &vdev_indirect_ops) { + tvds++; + } + } + + return (tvds); +} + +/* + * We should be able to tolerate one failure with absolutely no damage + * to our metadata. Two failures will take out space maps, a bunch of + * indirect block trees, meta dnodes, dnodes, etc. Probably not a happy + * place to live. When we get smarter, we can liberalize this policy. + * e.g. 
If we haven't lost two consecutive top-level vdevs, then we are + * probably fine. Adding bean counters during alloc/free can make this + * future guesswork more accurate. + */ +static boolean_t +too_many_errors(vdev_t *vd, uint64_t numerrors) +{ + uint64_t tvds; + + if (numerrors == 0) + return (B_FALSE); + + tvds = vdev_root_core_tvds(vd); + ASSERT3U(numerrors, <=, tvds); + + if (numerrors == tvds) + return (B_TRUE); + + return (numerrors > spa_missing_tvds_allowed(vd->vdev_spa)); +} + +static int +vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, + uint64_t *logical_ashift, uint64_t *physical_ashift) +{ + spa_t *spa = vd->vdev_spa; + int lasterror = 0; + int numerrors = 0; + + if (vd->vdev_children == 0) { + vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; + return (SET_ERROR(EINVAL)); + } + + vdev_open_children(vd); + + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + if (cvd->vdev_open_error && !cvd->vdev_islog) { + lasterror = cvd->vdev_open_error; + numerrors++; + } + } + + if (spa_load_state(spa) != SPA_LOAD_NONE) + spa_set_missing_tvds(spa, numerrors); + + if (too_many_errors(vd, numerrors)) { + vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; + return (lasterror); + } + + *asize = 0; + *max_asize = 0; + *logical_ashift = 0; + *physical_ashift = 0; + + return (0); +} + +static void +vdev_root_close(vdev_t *vd) +{ + for (int c = 0; c < vd->vdev_children; c++) + vdev_close(vd->vdev_child[c]); +} + +static void +vdev_root_state_change(vdev_t *vd, int faulted, int degraded) +{ + if (too_many_errors(vd, faulted)) { + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_NO_REPLICAS); + } else if (degraded || faulted) { + vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); + } else { + vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); + } +} + +vdev_ops_t vdev_root_ops = { + vdev_root_open, + vdev_root_close, + vdev_default_asize, + NULL, /* io_start - not applicable to the root */ + NULL, /* io_done - not applicable to the root */ + vdev_root_state_change, + NULL, + NULL, + NULL, + NULL, + NULL, + VDEV_TYPE_ROOT, /* name of this vdev type */ + B_FALSE /* not a leaf vdev */ +}; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c new file mode 100644 index 000000000000..9e905acbcf3b --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c @@ -0,0 +1,1334 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2016 by Delphix. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 
+ */ + +/* + * This file contains the top half of the zfs directory structure + * implementation. The bottom half is in zap_leaf.c. + * + * The zdir is an extendable hash data structure. There is a table of + * pointers to buckets (zap_t->zd_data->zd_leafs). The buckets are + * each a constant size and hold a variable number of directory entries. + * The buckets (aka "leaf nodes") are implemented in zap_leaf.c. + * + * The pointer table holds a power of 2 number of pointers. + * (1<<zap_t->zd_data->zd_phys->zd_prefix_len). The bucket pointed to + * by the pointer at index i in the table holds entries whose hash value + * has a zd_prefix_len - bit prefix + */ + +#include <sys/spa.h> +#include <sys/dmu.h> +#include <sys/zfs_context.h> +#include <sys/zfs_znode.h> +#include <sys/fs/zfs.h> +#include <sys/zap.h> +#include <sys/refcount.h> +#include <sys/zap_impl.h> +#include <sys/zap_leaf.h> + +int fzap_default_block_shift = 14; /* 16k blocksize */ + +extern inline zap_phys_t *zap_f_phys(zap_t *zap); + +static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks); + +void +fzap_byteswap(void *vbuf, size_t size) +{ + uint64_t block_type = *(uint64_t *)vbuf; + + if (block_type == ZBT_LEAF || block_type == BSWAP_64(ZBT_LEAF)) + zap_leaf_byteswap(vbuf, size); + else { + /* it's a ptrtbl block */ + byteswap_uint64_array(vbuf, size); + } +} + +void +fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags) +{ + ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); + zap->zap_ismicro = FALSE; + + zap->zap_dbu.dbu_evict_func_sync = zap_evict_sync; + zap->zap_dbu.dbu_evict_func_async = NULL; + + mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0); + zap->zap_f.zap_block_shift = highbit64(zap->zap_dbuf->db_size) - 1; + + zap_phys_t *zp = zap_f_phys(zap); + /* + * explicitly zero it since it might be coming from an + * initialized microzap + */ + bzero(zap->zap_dbuf->db_data, zap->zap_dbuf->db_size); + zp->zap_block_type = ZBT_HEADER; + zp->zap_magic = ZAP_MAGIC; + + zp->zap_ptrtbl.zt_shift = ZAP_EMBEDDED_PTRTBL_SHIFT(zap); + + zp->zap_freeblk = 2; /* block 1 will be the first leaf */ + zp->zap_num_leafs = 1; + zp->zap_num_entries = 0; + zp->zap_salt = zap->zap_salt; + zp->zap_normflags = zap->zap_normflags; + zp->zap_flags = flags; + + /* block 1 will be the first leaf */ + for (int i = 0; i < (1<<zp->zap_ptrtbl.zt_shift); i++) + ZAP_EMBEDDED_PTRTBL_ENT(zap, i) = 1; + + /* + * set up block 1 - the first leaf + */ + dmu_buf_t *db; + VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object, + 1<<FZAP_BLOCK_SHIFT(zap), FTAG, &db, DMU_READ_NO_PREFETCH)); + dmu_buf_will_dirty(db, tx); + + zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP); + l->l_dbuf = db; + + zap_leaf_init(l, zp->zap_normflags != 0); + + kmem_free(l, sizeof (zap_leaf_t)); + dmu_buf_rele(db, FTAG); +} + +static int +zap_tryupgradedir(zap_t *zap, dmu_tx_t *tx) +{ + if (RW_WRITE_HELD(&zap->zap_rwlock)) + return (1); + if (rw_tryupgrade(&zap->zap_rwlock)) { + dmu_buf_will_dirty(zap->zap_dbuf, tx); + return (1); + } + return (0); +} + +/* + * Generic routines for dealing with the pointer & cookie tables. 
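+ *
+ * Editorial aside, not part of the imported source: the pointer table
+ * is classic extendible hashing.  A slot is chosen by the top
+ * zt_shift bits of the 64-bit hash, and when the table doubles each
+ * old slot is duplicated into two adjacent slots (leaves then split
+ * lazily).  A hedged, self-contained sketch of the index math:
+ */
+#if 0	/* illustrative sketch, never compiled */
+#include <stdint.h>
+
+/* Slot index for a hash, given a table of (1 << shift) slots. */
+static uint64_t
+ex_hash_idx(uint64_t hash, int shift)
+{
+	return (shift == 0 ? 0 : hash >> (64 - shift));
+}
+
+/* After doubling, old slot i feeds new slots 2i and 2i+1. */
+static void
+ex_double_table(const uint64_t *src, uint64_t *dst, int srclen)
+{
+	for (int i = 0; i < srclen; i++) {
+		dst[2 * i + 0] = src[i];
+		dst[2 * i + 1] = src[i];
+	}
+}
+#endif
+/*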
+ */ + +static int +zap_table_grow(zap_t *zap, zap_table_phys_t *tbl, + void (*transfer_func)(const uint64_t *src, uint64_t *dst, int n), + dmu_tx_t *tx) +{ + uint64_t newblk; + int bs = FZAP_BLOCK_SHIFT(zap); + int hepb = 1<<(bs-4); + /* hepb = half the number of entries in a block */ + + ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); + ASSERT(tbl->zt_blk != 0); + ASSERT(tbl->zt_numblks > 0); + + if (tbl->zt_nextblk != 0) { + newblk = tbl->zt_nextblk; + } else { + newblk = zap_allocate_blocks(zap, tbl->zt_numblks * 2); + tbl->zt_nextblk = newblk; + ASSERT0(tbl->zt_blks_copied); + dmu_prefetch(zap->zap_objset, zap->zap_object, 0, + tbl->zt_blk << bs, tbl->zt_numblks << bs, + ZIO_PRIORITY_SYNC_READ); + } + + /* + * Copy the ptrtbl from the old to new location. + */ + + uint64_t b = tbl->zt_blks_copied; + dmu_buf_t *db_old; + int err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + (tbl->zt_blk + b) << bs, FTAG, &db_old, DMU_READ_NO_PREFETCH); + if (err != 0) + return (err); + + /* first half of entries in old[b] go to new[2*b+0] */ + dmu_buf_t *db_new; + VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object, + (newblk + 2*b+0) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH)); + dmu_buf_will_dirty(db_new, tx); + transfer_func(db_old->db_data, db_new->db_data, hepb); + dmu_buf_rele(db_new, FTAG); + + /* second half of entries in old[b] go to new[2*b+1] */ + VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object, + (newblk + 2*b+1) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH)); + dmu_buf_will_dirty(db_new, tx); + transfer_func((uint64_t *)db_old->db_data + hepb, + db_new->db_data, hepb); + dmu_buf_rele(db_new, FTAG); + + dmu_buf_rele(db_old, FTAG); + + tbl->zt_blks_copied++; + + dprintf("copied block %llu of %llu\n", + tbl->zt_blks_copied, tbl->zt_numblks); + + if (tbl->zt_blks_copied == tbl->zt_numblks) { + (void) dmu_free_range(zap->zap_objset, zap->zap_object, + tbl->zt_blk << bs, tbl->zt_numblks << bs, tx); + + tbl->zt_blk = newblk; + tbl->zt_numblks *= 2; + tbl->zt_shift++; + tbl->zt_nextblk = 0; + tbl->zt_blks_copied = 0; + + dprintf("finished; numblocks now %llu (%lluk entries)\n", + tbl->zt_numblks, 1<<(tbl->zt_shift-10)); + } + + return (0); +} + +static int +zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val, + dmu_tx_t *tx) +{ + int bs = FZAP_BLOCK_SHIFT(zap); + + ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + ASSERT(tbl->zt_blk != 0); + + dprintf("storing %llx at index %llx\n", val, idx); + + uint64_t blk = idx >> (bs-3); + uint64_t off = idx & ((1<<(bs-3))-1); + + dmu_buf_t *db; + int err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); + if (err != 0) + return (err); + dmu_buf_will_dirty(db, tx); + + if (tbl->zt_nextblk != 0) { + uint64_t idx2 = idx * 2; + uint64_t blk2 = idx2 >> (bs-3); + uint64_t off2 = idx2 & ((1<<(bs-3))-1); + dmu_buf_t *db2; + + err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + (tbl->zt_nextblk + blk2) << bs, FTAG, &db2, + DMU_READ_NO_PREFETCH); + if (err != 0) { + dmu_buf_rele(db, FTAG); + return (err); + } + dmu_buf_will_dirty(db2, tx); + ((uint64_t *)db2->db_data)[off2] = val; + ((uint64_t *)db2->db_data)[off2+1] = val; + dmu_buf_rele(db2, FTAG); + } + + ((uint64_t *)db->db_data)[off] = val; + dmu_buf_rele(db, FTAG); + + return (0); +} + +static int +zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp) +{ + int bs = FZAP_BLOCK_SHIFT(zap); + + ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + + uint64_t blk = idx >> (bs-3); + uint64_t off = idx & 
((1<<(bs-3))-1); + + /* + * Note: this is equivalent to dmu_buf_hold(), but we use + * _dnode_enter / _by_dnode because it's faster because we don't + * have to hold the dnode. + */ + dnode_t *dn = dmu_buf_dnode_enter(zap->zap_dbuf); + dmu_buf_t *db; + int err = dmu_buf_hold_by_dnode(dn, + (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); + dmu_buf_dnode_exit(zap->zap_dbuf); + if (err != 0) + return (err); + *valp = ((uint64_t *)db->db_data)[off]; + dmu_buf_rele(db, FTAG); + + if (tbl->zt_nextblk != 0) { + /* + * read the nextblk for the sake of i/o error checking, + * so that zap_table_load() will catch errors for + * zap_table_store. + */ + blk = (idx*2) >> (bs-3); + + dn = dmu_buf_dnode_enter(zap->zap_dbuf); + err = dmu_buf_hold_by_dnode(dn, + (tbl->zt_nextblk + blk) << bs, FTAG, &db, + DMU_READ_NO_PREFETCH); + dmu_buf_dnode_exit(zap->zap_dbuf); + if (err == 0) + dmu_buf_rele(db, FTAG); + } + return (err); +} + +/* + * Routines for growing the ptrtbl. + */ + +static void +zap_ptrtbl_transfer(const uint64_t *src, uint64_t *dst, int n) +{ + for (int i = 0; i < n; i++) { + uint64_t lb = src[i]; + dst[2 * i + 0] = lb; + dst[2 * i + 1] = lb; + } +} + +static int +zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx) +{ + /* + * The pointer table should never use more hash bits than we + * have (otherwise we'd be using useless zero bits to index it). + * If we are within 2 bits of running out, stop growing, since + * this is already an aberrant condition. + */ + if (zap_f_phys(zap)->zap_ptrtbl.zt_shift >= zap_hashbits(zap) - 2) + return (SET_ERROR(ENOSPC)); + + if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) { + /* + * We are outgrowing the "embedded" ptrtbl (the one + * stored in the header block). Give it its own entire + * block, which will double the size of the ptrtbl. 
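+ * (Editorial note: zap_ptrtbl_transfer() above is the doubling step
+ * of extendible hashing: both children of an old slot start out
+ * pointing at the same leaf, and leaves split lazily as they fill.)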
+ */ + ASSERT3U(zap_f_phys(zap)->zap_ptrtbl.zt_shift, ==, + ZAP_EMBEDDED_PTRTBL_SHIFT(zap)); + ASSERT0(zap_f_phys(zap)->zap_ptrtbl.zt_blk); + + uint64_t newblk = zap_allocate_blocks(zap, 1); + dmu_buf_t *db_new; + int err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + newblk << FZAP_BLOCK_SHIFT(zap), FTAG, &db_new, + DMU_READ_NO_PREFETCH); + if (err != 0) + return (err); + dmu_buf_will_dirty(db_new, tx); + zap_ptrtbl_transfer(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), + db_new->db_data, 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap)); + dmu_buf_rele(db_new, FTAG); + + zap_f_phys(zap)->zap_ptrtbl.zt_blk = newblk; + zap_f_phys(zap)->zap_ptrtbl.zt_numblks = 1; + zap_f_phys(zap)->zap_ptrtbl.zt_shift++; + + ASSERT3U(1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift, ==, + zap_f_phys(zap)->zap_ptrtbl.zt_numblks << + (FZAP_BLOCK_SHIFT(zap)-3)); + + return (0); + } else { + return (zap_table_grow(zap, &zap_f_phys(zap)->zap_ptrtbl, + zap_ptrtbl_transfer, tx)); + } +} + +static void +zap_increment_num_entries(zap_t *zap, int delta, dmu_tx_t *tx) +{ + dmu_buf_will_dirty(zap->zap_dbuf, tx); + mutex_enter(&zap->zap_f.zap_num_entries_mtx); + ASSERT(delta > 0 || zap_f_phys(zap)->zap_num_entries >= -delta); + zap_f_phys(zap)->zap_num_entries += delta; + mutex_exit(&zap->zap_f.zap_num_entries_mtx); +} + +static uint64_t +zap_allocate_blocks(zap_t *zap, int nblocks) +{ + ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); + uint64_t newblk = zap_f_phys(zap)->zap_freeblk; + zap_f_phys(zap)->zap_freeblk += nblocks; + return (newblk); +} + +static void +zap_leaf_evict_sync(void *dbu) +{ + zap_leaf_t *l = dbu; + + rw_destroy(&l->l_rwlock); + kmem_free(l, sizeof (zap_leaf_t)); +} + +static zap_leaf_t * +zap_create_leaf(zap_t *zap, dmu_tx_t *tx) +{ + zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP); + + ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); + + rw_init(&l->l_rwlock, 0, 0, 0); + rw_enter(&l->l_rwlock, RW_WRITER); + l->l_blkid = zap_allocate_blocks(zap, 1); + l->l_dbuf = NULL; + + VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object, + l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf, + DMU_READ_NO_PREFETCH)); + dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL, &l->l_dbuf); + VERIFY3P(NULL, ==, dmu_buf_set_user(l->l_dbuf, &l->l_dbu)); + dmu_buf_will_dirty(l->l_dbuf, tx); + + zap_leaf_init(l, zap->zap_normflags != 0); + + zap_f_phys(zap)->zap_num_leafs++; + + return (l); +} + +int +fzap_count(zap_t *zap, uint64_t *count) +{ + ASSERT(!zap->zap_ismicro); + mutex_enter(&zap->zap_f.zap_num_entries_mtx); /* unnecessary */ + *count = zap_f_phys(zap)->zap_num_entries; + mutex_exit(&zap->zap_f.zap_num_entries_mtx); + return (0); +} + +/* + * Routines for obtaining zap_leaf_t's + */ + +void +zap_put_leaf(zap_leaf_t *l) +{ + rw_exit(&l->l_rwlock); + dmu_buf_rele(l->l_dbuf, NULL); +} + +static zap_leaf_t * +zap_open_leaf(uint64_t blkid, dmu_buf_t *db) +{ + ASSERT(blkid != 0); + + zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP); + rw_init(&l->l_rwlock, 0, 0, 0); + rw_enter(&l->l_rwlock, RW_WRITER); + l->l_blkid = blkid; + l->l_bs = highbit64(db->db_size) - 1; + l->l_dbuf = db; + + dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL, &l->l_dbuf); + zap_leaf_t *winner = dmu_buf_set_user(db, &l->l_dbu); + + rw_exit(&l->l_rwlock); + if (winner != NULL) { + /* someone else set it first */ + zap_leaf_evict_sync(&l->l_dbu); + l = winner; + } + + /* + * lhr_pad was previously used for the next leaf in the leaf + * chain. There should be no chained leafs (as we have removed + * support for them). 
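+ * (Editorial note: the field asserted below is lh_pad1; a nonzero
+ * value would indicate an on-disk leaf chain from a format this
+ * code no longer understands.)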
+ */ + ASSERT0(zap_leaf_phys(l)->l_hdr.lh_pad1); + + /* + * There should be more hash entries than there can be + * chunks to put in the hash table + */ + ASSERT3U(ZAP_LEAF_HASH_NUMENTRIES(l), >, ZAP_LEAF_NUMCHUNKS(l) / 3); + + /* The chunks should begin at the end of the hash table */ + ASSERT3P(&ZAP_LEAF_CHUNK(l, 0), ==, + &zap_leaf_phys(l)->l_hash[ZAP_LEAF_HASH_NUMENTRIES(l)]); + + /* The chunks should end at the end of the block */ + ASSERT3U((uintptr_t)&ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)) - + (uintptr_t)zap_leaf_phys(l), ==, l->l_dbuf->db_size); + + return (l); +} + +static int +zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt, + zap_leaf_t **lp) +{ + dmu_buf_t *db; + + ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + + int bs = FZAP_BLOCK_SHIFT(zap); + dnode_t *dn = dmu_buf_dnode_enter(zap->zap_dbuf); + int err = dmu_buf_hold_by_dnode(dn, + blkid << bs, NULL, &db, DMU_READ_NO_PREFETCH); + dmu_buf_dnode_exit(zap->zap_dbuf); + if (err != 0) + return (err); + + ASSERT3U(db->db_object, ==, zap->zap_object); + ASSERT3U(db->db_offset, ==, blkid << bs); + ASSERT3U(db->db_size, ==, 1 << bs); + ASSERT(blkid != 0); + + zap_leaf_t *l = dmu_buf_get_user(db); + + if (l == NULL) + l = zap_open_leaf(blkid, db); + + rw_enter(&l->l_rwlock, lt); + /* + * Must lock before dirtying, otherwise zap_leaf_phys(l) could change, + * causing ASSERT below to fail. + */ + if (lt == RW_WRITER) + dmu_buf_will_dirty(db, tx); + ASSERT3U(l->l_blkid, ==, blkid); + ASSERT3P(l->l_dbuf, ==, db); + ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_block_type, ==, ZBT_LEAF); + ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC); + + *lp = l; + return (0); +} + +static int +zap_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t *valp) +{ + ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + + if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) { + ASSERT3U(idx, <, + (1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift)); + *valp = ZAP_EMBEDDED_PTRTBL_ENT(zap, idx); + return (0); + } else { + return (zap_table_load(zap, &zap_f_phys(zap)->zap_ptrtbl, + idx, valp)); + } +} + +static int +zap_set_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t blk, dmu_tx_t *tx) +{ + ASSERT(tx != NULL); + ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); + + if (zap_f_phys(zap)->zap_ptrtbl.zt_blk == 0) { + ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) = blk; + return (0); + } else { + return (zap_table_store(zap, &zap_f_phys(zap)->zap_ptrtbl, + idx, blk, tx)); + } +} + +static int +zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp) +{ + uint64_t blk; + + ASSERT(zap->zap_dbuf == NULL || + zap_f_phys(zap) == zap->zap_dbuf->db_data); + + /* Reality check for corrupt zap objects (leaf or header). 
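+ * A block type that is neither ZBT_LEAF nor ZBT_HEADER, or a bad
+ * magic number, means this object is not a fat zap at all, so fail
+ * with EIO rather than dereferencing garbage.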
*/ + if ((zap_f_phys(zap)->zap_block_type != ZBT_LEAF && + zap_f_phys(zap)->zap_block_type != ZBT_HEADER) || + zap_f_phys(zap)->zap_magic != ZAP_MAGIC) { + return (SET_ERROR(EIO)); + } + + uint64_t idx = ZAP_HASH_IDX(h, zap_f_phys(zap)->zap_ptrtbl.zt_shift); + int err = zap_idx_to_blk(zap, idx, &blk); + if (err != 0) + return (err); + err = zap_get_leaf_byblk(zap, blk, tx, lt, lp); + + ASSERT(err || + ZAP_HASH_IDX(h, zap_leaf_phys(*lp)->l_hdr.lh_prefix_len) == + zap_leaf_phys(*lp)->l_hdr.lh_prefix); + return (err); +} + +static int +zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l, + void *tag, dmu_tx_t *tx, zap_leaf_t **lp) +{ + zap_t *zap = zn->zn_zap; + uint64_t hash = zn->zn_hash; + int err; + int old_prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len; + + ASSERT3U(old_prefix_len, <=, zap_f_phys(zap)->zap_ptrtbl.zt_shift); + ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + + ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==, + zap_leaf_phys(l)->l_hdr.lh_prefix); + + if (zap_tryupgradedir(zap, tx) == 0 || + old_prefix_len == zap_f_phys(zap)->zap_ptrtbl.zt_shift) { + /* We failed to upgrade, or need to grow the pointer table */ + objset_t *os = zap->zap_objset; + uint64_t object = zap->zap_object; + + zap_put_leaf(l); + zap_unlockdir(zap, tag); + err = zap_lockdir(os, object, tx, RW_WRITER, + FALSE, FALSE, tag, &zn->zn_zap); + zap = zn->zn_zap; + if (err != 0) + return (err); + ASSERT(!zap->zap_ismicro); + + while (old_prefix_len == + zap_f_phys(zap)->zap_ptrtbl.zt_shift) { + err = zap_grow_ptrtbl(zap, tx); + if (err != 0) + return (err); + } + + err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l); + if (err != 0) + return (err); + + if (zap_leaf_phys(l)->l_hdr.lh_prefix_len != old_prefix_len) { + /* it split while our locks were down */ + *lp = l; + return (0); + } + } + ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); + ASSERT3U(old_prefix_len, <, zap_f_phys(zap)->zap_ptrtbl.zt_shift); + ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==, + zap_leaf_phys(l)->l_hdr.lh_prefix); + + int prefix_diff = zap_f_phys(zap)->zap_ptrtbl.zt_shift - + (old_prefix_len + 1); + uint64_t sibling = + (ZAP_HASH_IDX(hash, old_prefix_len + 1) | 1) << prefix_diff; + + /* check for i/o errors before doing zap_leaf_split */ + for (int i = 0; i < (1ULL << prefix_diff); i++) { + uint64_t blk; + err = zap_idx_to_blk(zap, sibling + i, &blk); + if (err != 0) + return (err); + ASSERT3U(blk, ==, l->l_blkid); + } + + zap_leaf_t *nl = zap_create_leaf(zap, tx); + zap_leaf_split(l, nl, zap->zap_normflags != 0); + + /* set sibling pointers */ + for (int i = 0; i < (1ULL << prefix_diff); i++) { + err = zap_set_idx_to_blk(zap, sibling + i, nl->l_blkid, tx); + ASSERT0(err); /* we checked for i/o errors above */ + } + + if (hash & (1ULL << (64 - zap_leaf_phys(l)->l_hdr.lh_prefix_len))) { + /* we want the sibling */ + zap_put_leaf(l); + *lp = nl; + } else { + zap_put_leaf(nl); + *lp = l; + } + + return (0); +} + +static void +zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l, + void *tag, dmu_tx_t *tx) +{ + zap_t *zap = zn->zn_zap; + int shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift; + int leaffull = (zap_leaf_phys(l)->l_hdr.lh_prefix_len == shift && + zap_leaf_phys(l)->l_hdr.lh_nfree < ZAP_LEAF_LOW_WATER); + + zap_put_leaf(l); + + if (leaffull || zap_f_phys(zap)->zap_ptrtbl.zt_nextblk) { + /* + * We are in the middle of growing the pointer table, or + * this leaf will soon make us grow it. 
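+ * ("leaffull" above means this leaf already consumes every hash bit
+ * the table provides and is nearly out of free chunks, so its next
+ * split would force the table to grow; growing it here keeps that
+ * work out of the insert path.)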
+ */ + if (zap_tryupgradedir(zap, tx) == 0) { + objset_t *os = zap->zap_objset; + uint64_t zapobj = zap->zap_object; + + zap_unlockdir(zap, tag); + int err = zap_lockdir(os, zapobj, tx, + RW_WRITER, FALSE, FALSE, tag, &zn->zn_zap); + zap = zn->zn_zap; + if (err != 0) + return; + } + + /* could have finished growing while our locks were down */ + if (zap_f_phys(zap)->zap_ptrtbl.zt_shift == shift) + (void) zap_grow_ptrtbl(zap, tx); + } +} + +static int +fzap_checkname(zap_name_t *zn) +{ + if (zn->zn_key_orig_numints * zn->zn_key_intlen > ZAP_MAXNAMELEN) + return (SET_ERROR(ENAMETOOLONG)); + return (0); +} + +static int +fzap_checksize(uint64_t integer_size, uint64_t num_integers) +{ + /* Only integer sizes supported by C */ + switch (integer_size) { + case 1: + case 2: + case 4: + case 8: + break; + default: + return (SET_ERROR(EINVAL)); + } + + if (integer_size * num_integers > ZAP_MAXVALUELEN) + return (E2BIG); + + return (0); +} + +static int +fzap_check(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers) +{ + int err = fzap_checkname(zn); + if (err != 0) + return (err); + return (fzap_checksize(integer_size, num_integers)); +} + +/* + * Routines for manipulating attributes. + */ +int +fzap_lookup(zap_name_t *zn, + uint64_t integer_size, uint64_t num_integers, void *buf, + char *realname, int rn_len, boolean_t *ncp) +{ + zap_leaf_t *l; + zap_entry_handle_t zeh; + + int err = fzap_checkname(zn); + if (err != 0) + return (err); + + err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, NULL, RW_READER, &l); + if (err != 0) + return (err); + err = zap_leaf_lookup(l, zn, &zeh); + if (err == 0) { + if ((err = fzap_checksize(integer_size, num_integers)) != 0) { + zap_put_leaf(l); + return (err); + } + + err = zap_entry_read(&zeh, integer_size, num_integers, buf); + (void) zap_entry_read_name(zn->zn_zap, &zeh, rn_len, realname); + if (ncp) { + *ncp = zap_entry_normalization_conflict(&zeh, + zn, NULL, zn->zn_zap); + } + } + + zap_put_leaf(l); + return (err); +} + +int +fzap_add_cd(zap_name_t *zn, + uint64_t integer_size, uint64_t num_integers, + const void *val, uint32_t cd, void *tag, dmu_tx_t *tx) +{ + zap_leaf_t *l; + int err; + zap_entry_handle_t zeh; + zap_t *zap = zn->zn_zap; + + ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + ASSERT(!zap->zap_ismicro); + ASSERT(fzap_check(zn, integer_size, num_integers) == 0); + + err = zap_deref_leaf(zap, zn->zn_hash, tx, RW_WRITER, &l); + if (err != 0) + return (err); +retry: + err = zap_leaf_lookup(l, zn, &zeh); + if (err == 0) { + err = SET_ERROR(EEXIST); + goto out; + } + if (err != ENOENT) + goto out; + + err = zap_entry_create(l, zn, cd, + integer_size, num_integers, val, &zeh); + + if (err == 0) { + zap_increment_num_entries(zap, 1, tx); + } else if (err == EAGAIN) { + err = zap_expand_leaf(zn, l, tag, tx, &l); + zap = zn->zn_zap; /* zap_expand_leaf() may change zap */ + if (err == 0) + goto retry; + } + +out: + if (zap != NULL) + zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx); + return (err); +} + +int +fzap_add(zap_name_t *zn, + uint64_t integer_size, uint64_t num_integers, + const void *val, void *tag, dmu_tx_t *tx) +{ + int err = fzap_check(zn, integer_size, num_integers); + if (err != 0) + return (err); + + return (fzap_add_cd(zn, integer_size, num_integers, + val, ZAP_NEED_CD, tag, tx)); +} + +int +fzap_update(zap_name_t *zn, + int integer_size, uint64_t num_integers, const void *val, + void *tag, dmu_tx_t *tx) +{ + zap_leaf_t *l; + int err; + boolean_t create; + zap_entry_handle_t zeh; + zap_t *zap = zn->zn_zap; + + 
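+	/*
+	 * Editorial note: as in fzap_add_cd() above, EAGAIN from the
+	 * leaf layer means the target leaf is full; zap_expand_leaf()
+	 * splits it and the lookup is retried against the new leaf.
+	 */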
ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + err = fzap_check(zn, integer_size, num_integers); + if (err != 0) + return (err); + + err = zap_deref_leaf(zap, zn->zn_hash, tx, RW_WRITER, &l); + if (err != 0) + return (err); +retry: + err = zap_leaf_lookup(l, zn, &zeh); + create = (err == ENOENT); + ASSERT(err == 0 || err == ENOENT); + + if (create) { + err = zap_entry_create(l, zn, ZAP_NEED_CD, + integer_size, num_integers, val, &zeh); + if (err == 0) + zap_increment_num_entries(zap, 1, tx); + } else { + err = zap_entry_update(&zeh, integer_size, num_integers, val); + } + + if (err == EAGAIN) { + err = zap_expand_leaf(zn, l, tag, tx, &l); + zap = zn->zn_zap; /* zap_expand_leaf() may change zap */ + if (err == 0) + goto retry; + } + + if (zap != NULL) + zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx); + return (err); +} + +int +fzap_length(zap_name_t *zn, + uint64_t *integer_size, uint64_t *num_integers) +{ + zap_leaf_t *l; + int err; + zap_entry_handle_t zeh; + + err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, NULL, RW_READER, &l); + if (err != 0) + return (err); + err = zap_leaf_lookup(l, zn, &zeh); + if (err != 0) + goto out; + + if (integer_size != 0) + *integer_size = zeh.zeh_integer_size; + if (num_integers != 0) + *num_integers = zeh.zeh_num_integers; +out: + zap_put_leaf(l); + return (err); +} + +int +fzap_remove(zap_name_t *zn, dmu_tx_t *tx) +{ + zap_leaf_t *l; + int err; + zap_entry_handle_t zeh; + + err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, tx, RW_WRITER, &l); + if (err != 0) + return (err); + err = zap_leaf_lookup(l, zn, &zeh); + if (err == 0) { + zap_entry_remove(&zeh); + zap_increment_num_entries(zn->zn_zap, -1, tx); + } + zap_put_leaf(l); + return (err); +} + +void +fzap_prefetch(zap_name_t *zn) +{ + uint64_t blk; + zap_t *zap = zn->zn_zap; + + uint64_t idx = ZAP_HASH_IDX(zn->zn_hash, + zap_f_phys(zap)->zap_ptrtbl.zt_shift); + if (zap_idx_to_blk(zap, idx, &blk) != 0) + return; + int bs = FZAP_BLOCK_SHIFT(zap); + dmu_prefetch(zap->zap_objset, zap->zap_object, 0, blk << bs, 1 << bs, + ZIO_PRIORITY_SYNC_READ); +} + +/* + * Helper functions for consumers. 
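+ *
+ * Editorial aside, not part of the imported source: a hedged sketch
+ * of how a consumer might use the integer-keyed wrappers defined
+ * below.  os, obj, and tx are assumed to be a valid objset, zap
+ * object, and assigned transaction; error handling is elided.
+ */
+#if 0	/* illustrative sketch, never compiled */
+static void
+ex_zap_usage(objset_t *os, uint64_t obj, dmu_tx_t *tx)
+{
+	uint64_t val;
+
+	/* Store the value 7 under the key 0x1234, then read it back. */
+	(void) zap_add_int_key(os, obj, 0x1234, 7, tx);
+	(void) zap_lookup_int_key(os, obj, 0x1234, &val);
+
+	/*
+	 * Bump a named counter; zap_increment() removes the entry if
+	 * the count ever returns to zero.
+	 */
+	(void) zap_increment(os, obj, "refcount", 1, tx);
+}
+#endif
+/*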
+ */ + +uint64_t +zap_create_link(objset_t *os, dmu_object_type_t ot, uint64_t parent_obj, + const char *name, dmu_tx_t *tx) +{ + return (zap_create_link_dnsize(os, ot, parent_obj, name, 0, tx)); +} + +uint64_t +zap_create_link_dnsize(objset_t *os, dmu_object_type_t ot, uint64_t parent_obj, + const char *name, int dnodesize, dmu_tx_t *tx) +{ + uint64_t new_obj; + + VERIFY((new_obj = zap_create_dnsize(os, ot, DMU_OT_NONE, 0, + dnodesize, tx)) > 0); + VERIFY0(zap_add(os, parent_obj, name, sizeof (uint64_t), 1, &new_obj, + tx)); + + return (new_obj); +} + +int +zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, uint64_t mask, + char *name) +{ + zap_cursor_t zc; + int err; + + if (mask == 0) + mask = -1ULL; + + zap_attribute_t *za = kmem_alloc(sizeof (*za), KM_SLEEP); + for (zap_cursor_init(&zc, os, zapobj); + (err = zap_cursor_retrieve(&zc, za)) == 0; + zap_cursor_advance(&zc)) { + if ((za->za_first_integer & mask) == (value & mask)) { + (void) strcpy(name, za->za_name); + break; + } + } + zap_cursor_fini(&zc); + kmem_free(za, sizeof (*za)); + return (err); +} + +int +zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx) +{ + zap_cursor_t zc; + int err = 0; + + zap_attribute_t *za = kmem_alloc(sizeof (*za), KM_SLEEP); + for (zap_cursor_init(&zc, os, fromobj); + zap_cursor_retrieve(&zc, za) == 0; + (void) zap_cursor_advance(&zc)) { + if (za->za_integer_length != 8 || za->za_num_integers != 1) { + err = SET_ERROR(EINVAL); + break; + } + err = zap_add(os, intoobj, za->za_name, + 8, 1, &za->za_first_integer, tx); + if (err != 0) + break; + } + zap_cursor_fini(&zc); + kmem_free(za, sizeof (*za)); + return (err); +} + +int +zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj, + uint64_t value, dmu_tx_t *tx) +{ + zap_cursor_t zc; + int err = 0; + + zap_attribute_t *za = kmem_alloc(sizeof (*za), KM_SLEEP); + for (zap_cursor_init(&zc, os, fromobj); + zap_cursor_retrieve(&zc, za) == 0; + (void) zap_cursor_advance(&zc)) { + if (za->za_integer_length != 8 || za->za_num_integers != 1) { + err = SET_ERROR(EINVAL); + break; + } + err = zap_add(os, intoobj, za->za_name, + 8, 1, &value, tx); + if (err != 0) + break; + } + zap_cursor_fini(&zc); + kmem_free(za, sizeof (*za)); + return (err); +} + +int +zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj, + dmu_tx_t *tx) +{ + zap_cursor_t zc; + int err = 0; + + zap_attribute_t *za = kmem_alloc(sizeof (*za), KM_SLEEP); + for (zap_cursor_init(&zc, os, fromobj); + zap_cursor_retrieve(&zc, za) == 0; + (void) zap_cursor_advance(&zc)) { + uint64_t delta = 0; + + if (za->za_integer_length != 8 || za->za_num_integers != 1) { + err = SET_ERROR(EINVAL); + break; + } + + err = zap_lookup(os, intoobj, za->za_name, 8, 1, &delta); + if (err != 0 && err != ENOENT) + break; + delta += za->za_first_integer; + err = zap_update(os, intoobj, za->za_name, 8, 1, &delta, tx); + if (err != 0) + break; + } + zap_cursor_fini(&zc); + kmem_free(za, sizeof (*za)); + return (err); +} + +int +zap_add_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx) +{ + char name[20]; + + (void) snprintf(name, sizeof (name), "%llx", (longlong_t)value); + return (zap_add(os, obj, name, 8, 1, &value, tx)); +} + +int +zap_remove_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx) +{ + char name[20]; + + (void) snprintf(name, sizeof (name), "%llx", (longlong_t)value); + return (zap_remove(os, obj, name, tx)); +} + +int +zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value) +{ + char name[20]; + + (void) snprintf(name, sizeof 
(name), "%llx", (longlong_t)value); + return (zap_lookup(os, obj, name, 8, 1, &value)); +} + +int +zap_add_int_key(objset_t *os, uint64_t obj, + uint64_t key, uint64_t value, dmu_tx_t *tx) +{ + char name[20]; + + (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key); + return (zap_add(os, obj, name, 8, 1, &value, tx)); +} + +int +zap_update_int_key(objset_t *os, uint64_t obj, + uint64_t key, uint64_t value, dmu_tx_t *tx) +{ + char name[20]; + + (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key); + return (zap_update(os, obj, name, 8, 1, &value, tx)); +} + +int +zap_lookup_int_key(objset_t *os, uint64_t obj, uint64_t key, uint64_t *valuep) +{ + char name[20]; + + (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key); + return (zap_lookup(os, obj, name, 8, 1, valuep)); +} + +int +zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta, + dmu_tx_t *tx) +{ + uint64_t value = 0; + + if (delta == 0) + return (0); + + int err = zap_lookup(os, obj, name, 8, 1, &value); + if (err != 0 && err != ENOENT) + return (err); + value += delta; + if (value == 0) + err = zap_remove(os, obj, name, tx); + else + err = zap_update(os, obj, name, 8, 1, &value, tx); + return (err); +} + +int +zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta, + dmu_tx_t *tx) +{ + char name[20]; + + (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key); + return (zap_increment(os, obj, name, delta, tx)); +} + +/* + * Routines for iterating over the attributes. + */ + +int +fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za) +{ + int err = ENOENT; + zap_entry_handle_t zeh; + zap_leaf_t *l; + + /* retrieve the next entry at or after zc_hash/zc_cd */ + /* if no entry, return ENOENT */ + + if (zc->zc_leaf && + (ZAP_HASH_IDX(zc->zc_hash, + zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix_len) != + zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix)) { + rw_enter(&zc->zc_leaf->l_rwlock, RW_READER); + zap_put_leaf(zc->zc_leaf); + zc->zc_leaf = NULL; + } + +again: + if (zc->zc_leaf == NULL) { + err = zap_deref_leaf(zap, zc->zc_hash, NULL, RW_READER, + &zc->zc_leaf); + if (err != 0) + return (err); + } else { + rw_enter(&zc->zc_leaf->l_rwlock, RW_READER); + } + l = zc->zc_leaf; + + err = zap_leaf_lookup_closest(l, zc->zc_hash, zc->zc_cd, &zeh); + + if (err == ENOENT) { + uint64_t nocare = + (1ULL << (64 - zap_leaf_phys(l)->l_hdr.lh_prefix_len)) - 1; + zc->zc_hash = (zc->zc_hash & ~nocare) + nocare + 1; + zc->zc_cd = 0; + if (zap_leaf_phys(l)->l_hdr.lh_prefix_len == 0 || + zc->zc_hash == 0) { + zc->zc_hash = -1ULL; + } else { + zap_put_leaf(zc->zc_leaf); + zc->zc_leaf = NULL; + goto again; + } + } + + if (err == 0) { + zc->zc_hash = zeh.zeh_hash; + zc->zc_cd = zeh.zeh_cd; + za->za_integer_length = zeh.zeh_integer_size; + za->za_num_integers = zeh.zeh_num_integers; + if (zeh.zeh_num_integers == 0) { + za->za_first_integer = 0; + } else { + err = zap_entry_read(&zeh, 8, 1, &za->za_first_integer); + ASSERT(err == 0 || err == EOVERFLOW); + } + err = zap_entry_read_name(zap, &zeh, + sizeof (za->za_name), za->za_name); + ASSERT(err == 0); + + za->za_normalization_conflict = + zap_entry_normalization_conflict(&zeh, + NULL, za->za_name, zap); + } + rw_exit(&zc->zc_leaf->l_rwlock); + return (err); +} + +static void +zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs) +{ + uint64_t lastblk = 0; + + /* + * NB: if a leaf has more pointers than an entire ptrtbl block + * can hold, then it'll be accounted for more than once, since + * we won't have 
lastblk. + */ + for (int i = 0; i < len; i++) { + zap_leaf_t *l; + + if (tbl[i] == lastblk) + continue; + lastblk = tbl[i]; + + int err = zap_get_leaf_byblk(zap, tbl[i], NULL, RW_READER, &l); + if (err == 0) { + zap_leaf_stats(zap, l, zs); + zap_put_leaf(l); + } + } +} + +int +fzap_cursor_move_to_key(zap_cursor_t *zc, zap_name_t *zn) +{ + int err; + zap_leaf_t *l; + zap_entry_handle_t zeh; + + if (zn->zn_key_orig_numints * zn->zn_key_intlen > ZAP_MAXNAMELEN) + return (SET_ERROR(ENAMETOOLONG)); + + err = zap_deref_leaf(zc->zc_zap, zn->zn_hash, NULL, RW_READER, &l); + if (err != 0) + return (err); + + err = zap_leaf_lookup(l, zn, &zeh); + if (err != 0) + return (err); + + zc->zc_leaf = l; + zc->zc_hash = zeh.zeh_hash; + zc->zc_cd = zeh.zeh_cd; + + return (err); +} + +void +fzap_get_stats(zap_t *zap, zap_stats_t *zs) +{ + int bs = FZAP_BLOCK_SHIFT(zap); + zs->zs_blocksize = 1ULL << bs; + + /* + * Set zap_phys_t fields + */ + zs->zs_num_leafs = zap_f_phys(zap)->zap_num_leafs; + zs->zs_num_entries = zap_f_phys(zap)->zap_num_entries; + zs->zs_num_blocks = zap_f_phys(zap)->zap_freeblk; + zs->zs_block_type = zap_f_phys(zap)->zap_block_type; + zs->zs_magic = zap_f_phys(zap)->zap_magic; + zs->zs_salt = zap_f_phys(zap)->zap_salt; + + /* + * Set zap_ptrtbl fields + */ + zs->zs_ptrtbl_len = 1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift; + zs->zs_ptrtbl_nextblk = zap_f_phys(zap)->zap_ptrtbl.zt_nextblk; + zs->zs_ptrtbl_blks_copied = + zap_f_phys(zap)->zap_ptrtbl.zt_blks_copied; + zs->zs_ptrtbl_zt_blk = zap_f_phys(zap)->zap_ptrtbl.zt_blk; + zs->zs_ptrtbl_zt_numblks = zap_f_phys(zap)->zap_ptrtbl.zt_numblks; + zs->zs_ptrtbl_zt_shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift; + + if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) { + /* the ptrtbl is entirely in the header block. */ + zap_stats_ptrtbl(zap, &ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), + 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap), zs); + } else { + dmu_prefetch(zap->zap_objset, zap->zap_object, 0, + zap_f_phys(zap)->zap_ptrtbl.zt_blk << bs, + zap_f_phys(zap)->zap_ptrtbl.zt_numblks << bs, + ZIO_PRIORITY_SYNC_READ); + + for (int b = 0; b < zap_f_phys(zap)->zap_ptrtbl.zt_numblks; + b++) { + dmu_buf_t *db; + int err; + + err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + (zap_f_phys(zap)->zap_ptrtbl.zt_blk + b) << bs, + FTAG, &db, DMU_READ_NO_PREFETCH); + if (err == 0) { + zap_stats_ptrtbl(zap, db->db_data, + 1<<(bs-3), zs); + dmu_buf_rele(db, FTAG); + } + } + } +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c new file mode 100644 index 000000000000..1c7c736d8e97 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c @@ -0,0 +1,849 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013, 2016 by Delphix. All rights reserved. + * Copyright 2017 Nexenta Systems, Inc. + */ + +/* + * The 512-byte leaf is broken into 32 16-byte chunks. + * chunk number n means l_chunk[n], even though the header precedes it. + * the names are stored null-terminated. + */ + +#include <sys/zio.h> +#include <sys/spa.h> +#include <sys/dmu.h> +#include <sys/zfs_context.h> +#include <sys/fs/zfs.h> +#include <sys/zap.h> +#include <sys/zap_impl.h> +#include <sys/zap_leaf.h> +#include <sys/arc.h> + +static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry); + +#define CHAIN_END 0xffff /* end of the chunk chain */ + +/* half the (current) minimum block size */ +#define MAX_ARRAY_BYTES (8<<10) + +#define LEAF_HASH(l, h) \ + ((ZAP_LEAF_HASH_NUMENTRIES(l)-1) & \ + ((h) >> \ + (64 - ZAP_LEAF_HASH_SHIFT(l) - zap_leaf_phys(l)->l_hdr.lh_prefix_len))) + +#define LEAF_HASH_ENTPTR(l, h) (&zap_leaf_phys(l)->l_hash[LEAF_HASH(l, h)]) + +extern inline zap_leaf_phys_t *zap_leaf_phys(zap_leaf_t *l); + +static void +zap_memset(void *a, int c, size_t n) +{ + char *cp = a; + char *cpend = cp + n; + + while (cp < cpend) + *cp++ = c; +} + +static void +stv(int len, void *addr, uint64_t value) +{ + switch (len) { + case 1: + *(uint8_t *)addr = value; + return; + case 2: + *(uint16_t *)addr = value; + return; + case 4: + *(uint32_t *)addr = value; + return; + case 8: + *(uint64_t *)addr = value; + return; + } + ASSERT(!"bad int len"); +} + +static uint64_t +ldv(int len, const void *addr) +{ + switch (len) { + case 1: + return (*(uint8_t *)addr); + case 2: + return (*(uint16_t *)addr); + case 4: + return (*(uint32_t *)addr); + case 8: + return (*(uint64_t *)addr); + } + ASSERT(!"bad int len"); + return (0xFEEDFACEDEADBEEFULL); +} + +void +zap_leaf_byteswap(zap_leaf_phys_t *buf, int size) +{ + zap_leaf_t l; + dmu_buf_t l_dbuf; + + l_dbuf.db_data = buf; + l.l_bs = highbit64(size) - 1; + l.l_dbuf = &l_dbuf; + + buf->l_hdr.lh_block_type = BSWAP_64(buf->l_hdr.lh_block_type); + buf->l_hdr.lh_prefix = BSWAP_64(buf->l_hdr.lh_prefix); + buf->l_hdr.lh_magic = BSWAP_32(buf->l_hdr.lh_magic); + buf->l_hdr.lh_nfree = BSWAP_16(buf->l_hdr.lh_nfree); + buf->l_hdr.lh_nentries = BSWAP_16(buf->l_hdr.lh_nentries); + buf->l_hdr.lh_prefix_len = BSWAP_16(buf->l_hdr.lh_prefix_len); + buf->l_hdr.lh_freelist = BSWAP_16(buf->l_hdr.lh_freelist); + + for (int i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(&l); i++) + buf->l_hash[i] = BSWAP_16(buf->l_hash[i]); + + for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(&l); i++) { + zap_leaf_chunk_t *lc = &ZAP_LEAF_CHUNK(&l, i); + struct zap_leaf_entry *le; + + switch (lc->l_free.lf_type) { + case ZAP_CHUNK_ENTRY: + le = &lc->l_entry; + + le->le_type = BSWAP_8(le->le_type); + le->le_value_intlen = BSWAP_8(le->le_value_intlen); + le->le_next = BSWAP_16(le->le_next); + le->le_name_chunk = BSWAP_16(le->le_name_chunk); + le->le_name_numints = BSWAP_16(le->le_name_numints); + le->le_value_chunk = BSWAP_16(le->le_value_chunk); + le->le_value_numints = BSWAP_16(le->le_value_numints); + le->le_cd = BSWAP_32(le->le_cd); + le->le_hash = BSWAP_64(le->le_hash); + break; + case ZAP_CHUNK_FREE: + lc->l_free.lf_type = BSWAP_8(lc->l_free.lf_type); + lc->l_free.lf_next = 
BSWAP_16(lc->l_free.lf_next); + break; + case ZAP_CHUNK_ARRAY: + lc->l_array.la_type = BSWAP_8(lc->l_array.la_type); + lc->l_array.la_next = BSWAP_16(lc->l_array.la_next); + /* la_array doesn't need swapping */ + break; + default: + ASSERT(!"bad leaf type"); + } + } +} + +void +zap_leaf_init(zap_leaf_t *l, boolean_t sort) +{ + l->l_bs = highbit64(l->l_dbuf->db_size) - 1; + zap_memset(&zap_leaf_phys(l)->l_hdr, 0, + sizeof (struct zap_leaf_header)); + zap_memset(zap_leaf_phys(l)->l_hash, CHAIN_END, + 2*ZAP_LEAF_HASH_NUMENTRIES(l)); + for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) { + ZAP_LEAF_CHUNK(l, i).l_free.lf_type = ZAP_CHUNK_FREE; + ZAP_LEAF_CHUNK(l, i).l_free.lf_next = i+1; + } + ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)-1).l_free.lf_next = CHAIN_END; + zap_leaf_phys(l)->l_hdr.lh_block_type = ZBT_LEAF; + zap_leaf_phys(l)->l_hdr.lh_magic = ZAP_LEAF_MAGIC; + zap_leaf_phys(l)->l_hdr.lh_nfree = ZAP_LEAF_NUMCHUNKS(l); + if (sort) + zap_leaf_phys(l)->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED; +} + +/* + * Routines which manipulate leaf chunks (l_chunk[]). + */ + +static uint16_t +zap_leaf_chunk_alloc(zap_leaf_t *l) +{ + ASSERT(zap_leaf_phys(l)->l_hdr.lh_nfree > 0); + + int chunk = zap_leaf_phys(l)->l_hdr.lh_freelist; + ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); + ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_free.lf_type, ==, ZAP_CHUNK_FREE); + + zap_leaf_phys(l)->l_hdr.lh_freelist = + ZAP_LEAF_CHUNK(l, chunk).l_free.lf_next; + + zap_leaf_phys(l)->l_hdr.lh_nfree--; + + return (chunk); +} + +static void +zap_leaf_chunk_free(zap_leaf_t *l, uint16_t chunk) +{ + struct zap_leaf_free *zlf = &ZAP_LEAF_CHUNK(l, chunk).l_free; + ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_nfree, <, ZAP_LEAF_NUMCHUNKS(l)); + ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); + ASSERT(zlf->lf_type != ZAP_CHUNK_FREE); + + zlf->lf_type = ZAP_CHUNK_FREE; + zlf->lf_next = zap_leaf_phys(l)->l_hdr.lh_freelist; + bzero(zlf->lf_pad, sizeof (zlf->lf_pad)); /* help it to compress */ + zap_leaf_phys(l)->l_hdr.lh_freelist = chunk; + + zap_leaf_phys(l)->l_hdr.lh_nfree++; +} + +/* + * Routines which manipulate leaf arrays (zap_leaf_array type chunks). 
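+ *
+ * Each array is a singly-linked chain of chunks: la_array[] carries
+ * ZAP_LEAF_ARRAY_BYTES of big-endian payload and la_next names the next
+ * chunk, terminated by CHAIN_END.  A minimal sketch of walking a chain
+ * (consume() is a hypothetical callback, not a function in this file):
+ *
+ *	for (uint16_t c = first; c != CHAIN_END;
+ *	    c = ZAP_LEAF_CHUNK(l, c).l_array.la_next)
+ *		consume(ZAP_LEAF_CHUNK(l, c).l_array.la_array);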
+ */ + +static uint16_t +zap_leaf_array_create(zap_leaf_t *l, const char *buf, + int integer_size, int num_integers) +{ + uint16_t chunk_head; + uint16_t *chunkp = &chunk_head; + int byten = 0; + uint64_t value = 0; + int shift = (integer_size - 1) * 8; + int len = num_integers; + + ASSERT3U(num_integers * integer_size, <, MAX_ARRAY_BYTES); + + while (len > 0) { + uint16_t chunk = zap_leaf_chunk_alloc(l); + struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; + + la->la_type = ZAP_CHUNK_ARRAY; + for (int i = 0; i < ZAP_LEAF_ARRAY_BYTES; i++) { + if (byten == 0) + value = ldv(integer_size, buf); + la->la_array[i] = value >> shift; + value <<= 8; + if (++byten == integer_size) { + byten = 0; + buf += integer_size; + if (--len == 0) + break; + } + } + + *chunkp = chunk; + chunkp = &la->la_next; + } + *chunkp = CHAIN_END; + + return (chunk_head); +} + +static void +zap_leaf_array_free(zap_leaf_t *l, uint16_t *chunkp) +{ + uint16_t chunk = *chunkp; + + *chunkp = CHAIN_END; + + while (chunk != CHAIN_END) { + int nextchunk = ZAP_LEAF_CHUNK(l, chunk).l_array.la_next; + ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_array.la_type, ==, + ZAP_CHUNK_ARRAY); + zap_leaf_chunk_free(l, chunk); + chunk = nextchunk; + } +} + +/* array_len and buf_len are in integers, not bytes */ +static void +zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk, + int array_int_len, int array_len, int buf_int_len, uint64_t buf_len, + void *buf) +{ + int len = MIN(array_len, buf_len); + int byten = 0; + uint64_t value = 0; + char *p = buf; + + ASSERT3U(array_int_len, <=, buf_int_len); + + /* Fast path for one 8-byte integer */ + if (array_int_len == 8 && buf_int_len == 8 && len == 1) { + struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; + uint8_t *ip = la->la_array; + uint64_t *buf64 = buf; + + *buf64 = (uint64_t)ip[0] << 56 | (uint64_t)ip[1] << 48 | + (uint64_t)ip[2] << 40 | (uint64_t)ip[3] << 32 | + (uint64_t)ip[4] << 24 | (uint64_t)ip[5] << 16 | + (uint64_t)ip[6] << 8 | (uint64_t)ip[7]; + return; + } + + /* Fast path for an array of 1-byte integers (eg. 
the entry name) */ + if (array_int_len == 1 && buf_int_len == 1 && + buf_len > array_len + ZAP_LEAF_ARRAY_BYTES) { + while (chunk != CHAIN_END) { + struct zap_leaf_array *la = + &ZAP_LEAF_CHUNK(l, chunk).l_array; + bcopy(la->la_array, p, ZAP_LEAF_ARRAY_BYTES); + p += ZAP_LEAF_ARRAY_BYTES; + chunk = la->la_next; + } + return; + } + + while (len > 0) { + struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; + + ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); + for (int i = 0; i < ZAP_LEAF_ARRAY_BYTES && len > 0; i++) { + value = (value << 8) | la->la_array[i]; + byten++; + if (byten == array_int_len) { + stv(buf_int_len, p, value); + byten = 0; + len--; + if (len == 0) + return; + p += buf_int_len; + } + } + chunk = la->la_next; + } +} + +static boolean_t +zap_leaf_array_match(zap_leaf_t *l, zap_name_t *zn, + int chunk, int array_numints) +{ + int bseen = 0; + + if (zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY) { + uint64_t *thiskey = + kmem_alloc(array_numints * sizeof (*thiskey), KM_SLEEP); + ASSERT(zn->zn_key_intlen == sizeof (*thiskey)); + + zap_leaf_array_read(l, chunk, sizeof (*thiskey), array_numints, + sizeof (*thiskey), array_numints, thiskey); + boolean_t match = bcmp(thiskey, zn->zn_key_orig, + array_numints * sizeof (*thiskey)) == 0; + kmem_free(thiskey, array_numints * sizeof (*thiskey)); + return (match); + } + + ASSERT(zn->zn_key_intlen == 1); + if (zn->zn_matchtype & MT_NORMALIZE) { + char *thisname = kmem_alloc(array_numints, KM_SLEEP); + + zap_leaf_array_read(l, chunk, sizeof (char), array_numints, + sizeof (char), array_numints, thisname); + boolean_t match = zap_match(zn, thisname); + kmem_free(thisname, array_numints); + return (match); + } + + /* + * Fast path for exact matching. + * First check that the lengths match, so that we don't read + * past the end of the zn_key_orig array. + */ + if (array_numints != zn->zn_key_orig_numints) + return (B_FALSE); + while (bseen < array_numints) { + struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; + int toread = MIN(array_numints - bseen, ZAP_LEAF_ARRAY_BYTES); + ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); + if (bcmp(la->la_array, (char *)zn->zn_key_orig + bseen, toread)) + break; + chunk = la->la_next; + bseen += toread; + } + return (bseen == array_numints); +} + +/* + * Routines which manipulate leaf entries. + */ + +int +zap_leaf_lookup(zap_leaf_t *l, zap_name_t *zn, zap_entry_handle_t *zeh) +{ + struct zap_leaf_entry *le; + + ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC); + + for (uint16_t *chunkp = LEAF_HASH_ENTPTR(l, zn->zn_hash); + *chunkp != CHAIN_END; chunkp = &le->le_next) { + uint16_t chunk = *chunkp; + le = ZAP_LEAF_ENTRY(l, chunk); + + ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); + ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); + + if (le->le_hash != zn->zn_hash) + continue; + + /* + * NB: the entry chain is always sorted by cd on + * normalized zap objects, so this will find the + * lowest-cd match for MT_NORMALIZE. + */ + ASSERT((zn->zn_matchtype == 0) || + (zap_leaf_phys(l)->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED)); + if (zap_leaf_array_match(l, zn, le->le_name_chunk, + le->le_name_numints)) { + zeh->zeh_num_integers = le->le_value_numints; + zeh->zeh_integer_size = le->le_value_intlen; + zeh->zeh_cd = le->le_cd; + zeh->zeh_hash = le->le_hash; + zeh->zeh_chunkp = chunkp; + zeh->zeh_leaf = l; + return (0); + } + } + + return (SET_ERROR(ENOENT)); +} + +/* Return (h1,cd1 >= h2,cd2) */ +#define HCD_GTEQ(h1, cd1, h2, cd2) \ + ((h1 > h2) ? TRUE : ((h1 == h2 && cd1 >= cd2) ? 
TRUE : FALSE)) + +int +zap_leaf_lookup_closest(zap_leaf_t *l, + uint64_t h, uint32_t cd, zap_entry_handle_t *zeh) +{ + uint64_t besth = -1ULL; + uint32_t bestcd = -1U; + uint16_t bestlh = ZAP_LEAF_HASH_NUMENTRIES(l)-1; + struct zap_leaf_entry *le; + + ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC); + + for (uint16_t lh = LEAF_HASH(l, h); lh <= bestlh; lh++) { + for (uint16_t chunk = zap_leaf_phys(l)->l_hash[lh]; + chunk != CHAIN_END; chunk = le->le_next) { + le = ZAP_LEAF_ENTRY(l, chunk); + + ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); + ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); + + if (HCD_GTEQ(le->le_hash, le->le_cd, h, cd) && + HCD_GTEQ(besth, bestcd, le->le_hash, le->le_cd)) { + ASSERT3U(bestlh, >=, lh); + bestlh = lh; + besth = le->le_hash; + bestcd = le->le_cd; + + zeh->zeh_num_integers = le->le_value_numints; + zeh->zeh_integer_size = le->le_value_intlen; + zeh->zeh_cd = le->le_cd; + zeh->zeh_hash = le->le_hash; + zeh->zeh_fakechunk = chunk; + zeh->zeh_chunkp = &zeh->zeh_fakechunk; + zeh->zeh_leaf = l; + } + } + } + + return (bestcd == -1U ? ENOENT : 0); +} + +int +zap_entry_read(const zap_entry_handle_t *zeh, + uint8_t integer_size, uint64_t num_integers, void *buf) +{ + struct zap_leaf_entry *le = + ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp); + ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); + + if (le->le_value_intlen > integer_size) + return (SET_ERROR(EINVAL)); + + zap_leaf_array_read(zeh->zeh_leaf, le->le_value_chunk, + le->le_value_intlen, le->le_value_numints, + integer_size, num_integers, buf); + + if (zeh->zeh_num_integers > num_integers) + return (SET_ERROR(EOVERFLOW)); + return (0); + +} + +int +zap_entry_read_name(zap_t *zap, const zap_entry_handle_t *zeh, uint16_t buflen, + char *buf) +{ + struct zap_leaf_entry *le = + ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp); + ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); + + if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) { + zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 8, + le->le_name_numints, 8, buflen / 8, buf); + } else { + zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 1, + le->le_name_numints, 1, buflen, buf); + } + if (le->le_name_numints > buflen) + return (SET_ERROR(EOVERFLOW)); + return (0); +} + +int +zap_entry_update(zap_entry_handle_t *zeh, + uint8_t integer_size, uint64_t num_integers, const void *buf) +{ + zap_leaf_t *l = zeh->zeh_leaf; + struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, *zeh->zeh_chunkp); + + int delta_chunks = ZAP_LEAF_ARRAY_NCHUNKS(num_integers * integer_size) - + ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_numints * le->le_value_intlen); + + if ((int)zap_leaf_phys(l)->l_hdr.lh_nfree < delta_chunks) + return (SET_ERROR(EAGAIN)); + + zap_leaf_array_free(l, &le->le_value_chunk); + le->le_value_chunk = + zap_leaf_array_create(l, buf, integer_size, num_integers); + le->le_value_numints = num_integers; + le->le_value_intlen = integer_size; + return (0); +} + +void +zap_entry_remove(zap_entry_handle_t *zeh) +{ + zap_leaf_t *l = zeh->zeh_leaf; + + ASSERT3P(zeh->zeh_chunkp, !=, &zeh->zeh_fakechunk); + + uint16_t entry_chunk = *zeh->zeh_chunkp; + struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry_chunk); + ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); + + zap_leaf_array_free(l, &le->le_name_chunk); + zap_leaf_array_free(l, &le->le_value_chunk); + + *zeh->zeh_chunkp = le->le_next; + zap_leaf_chunk_free(l, entry_chunk); + + zap_leaf_phys(l)->l_hdr.lh_nentries--; +} + +int +zap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd, + uint8_t integer_size, uint64_t num_integers, const 
void *buf, + zap_entry_handle_t *zeh) +{ + uint16_t chunk; + struct zap_leaf_entry *le; + uint64_t h = zn->zn_hash; + + uint64_t valuelen = integer_size * num_integers; + + int numchunks = 1 + ZAP_LEAF_ARRAY_NCHUNKS(zn->zn_key_orig_numints * + zn->zn_key_intlen) + ZAP_LEAF_ARRAY_NCHUNKS(valuelen); + if (numchunks > ZAP_LEAF_NUMCHUNKS(l)) + return (E2BIG); + + if (cd == ZAP_NEED_CD) { + /* find the lowest unused cd */ + if (zap_leaf_phys(l)->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED) { + cd = 0; + + for (chunk = *LEAF_HASH_ENTPTR(l, h); + chunk != CHAIN_END; chunk = le->le_next) { + le = ZAP_LEAF_ENTRY(l, chunk); + if (le->le_cd > cd) + break; + if (le->le_hash == h) { + ASSERT3U(cd, ==, le->le_cd); + cd++; + } + } + } else { + /* old unsorted format; do it the O(n^2) way */ + for (cd = 0; ; cd++) { + for (chunk = *LEAF_HASH_ENTPTR(l, h); + chunk != CHAIN_END; chunk = le->le_next) { + le = ZAP_LEAF_ENTRY(l, chunk); + if (le->le_hash == h && + le->le_cd == cd) { + break; + } + } + /* If this cd is not in use, we are good. */ + if (chunk == CHAIN_END) + break; + } + } + /* + * We would run out of space in a block before we could + * store enough entries to run out of CD values. + */ + ASSERT3U(cd, <, zap_maxcd(zn->zn_zap)); + } + + if (zap_leaf_phys(l)->l_hdr.lh_nfree < numchunks) + return (SET_ERROR(EAGAIN)); + + /* make the entry */ + chunk = zap_leaf_chunk_alloc(l); + le = ZAP_LEAF_ENTRY(l, chunk); + le->le_type = ZAP_CHUNK_ENTRY; + le->le_name_chunk = zap_leaf_array_create(l, zn->zn_key_orig, + zn->zn_key_intlen, zn->zn_key_orig_numints); + le->le_name_numints = zn->zn_key_orig_numints; + le->le_value_chunk = + zap_leaf_array_create(l, buf, integer_size, num_integers); + le->le_value_numints = num_integers; + le->le_value_intlen = integer_size; + le->le_hash = h; + le->le_cd = cd; + + /* link it into the hash chain */ + /* XXX if we did the search above, we could just use that */ + uint16_t *chunkp = zap_leaf_rehash_entry(l, chunk); + + zap_leaf_phys(l)->l_hdr.lh_nentries++; + + zeh->zeh_leaf = l; + zeh->zeh_num_integers = num_integers; + zeh->zeh_integer_size = le->le_value_intlen; + zeh->zeh_cd = le->le_cd; + zeh->zeh_hash = le->le_hash; + zeh->zeh_chunkp = chunkp; + + return (0); +} + +/* + * Determine if there is another entry with the same normalized form. + * For performance purposes, either zn or name must be provided (the + * other can be NULL). Note, there usually won't be any hash + * conflicts, in which case we don't need the concatenated/normalized + * form of the name. But all callers have one of these on hand anyway, + * so might as well take advantage. A cleaner but slower interface + * would accept neither argument, and compute the normalized name as + * needed (using zap_name_alloc(zap_entry_read_name(zeh))). 
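+ *
+ * A hedged sketch of the intended call pattern (one of zn and name may
+ * be NULL, but not both; the error chosen below is illustrative only):
+ *
+ *	if (zap_entry_normalization_conflict(&zeh, NULL, name, zap))
+ *		return (SET_ERROR(EEXIST));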
+ */ +boolean_t +zap_entry_normalization_conflict(zap_entry_handle_t *zeh, zap_name_t *zn, + const char *name, zap_t *zap) +{ + struct zap_leaf_entry *le; + boolean_t allocdzn = B_FALSE; + + if (zap->zap_normflags == 0) + return (B_FALSE); + + for (uint16_t chunk = *LEAF_HASH_ENTPTR(zeh->zeh_leaf, zeh->zeh_hash); + chunk != CHAIN_END; chunk = le->le_next) { + le = ZAP_LEAF_ENTRY(zeh->zeh_leaf, chunk); + if (le->le_hash != zeh->zeh_hash) + continue; + if (le->le_cd == zeh->zeh_cd) + continue; + + if (zn == NULL) { + zn = zap_name_alloc(zap, name, MT_NORMALIZE); + allocdzn = B_TRUE; + } + if (zap_leaf_array_match(zeh->zeh_leaf, zn, + le->le_name_chunk, le->le_name_numints)) { + if (allocdzn) + zap_name_free(zn); + return (B_TRUE); + } + } + if (allocdzn) + zap_name_free(zn); + return (B_FALSE); +} + +/* + * Routines for transferring entries between leafs. + */ + +static uint16_t * +zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry) +{ + struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry); + struct zap_leaf_entry *le2; + uint16_t *chunkp; + + /* + * keep the entry chain sorted by cd + * NB: this will not cause problems for unsorted leafs, though + * it is unnecessary there. + */ + for (chunkp = LEAF_HASH_ENTPTR(l, le->le_hash); + *chunkp != CHAIN_END; chunkp = &le2->le_next) { + le2 = ZAP_LEAF_ENTRY(l, *chunkp); + if (le2->le_cd > le->le_cd) + break; + } + + le->le_next = *chunkp; + *chunkp = entry; + return (chunkp); +} + +static uint16_t +zap_leaf_transfer_array(zap_leaf_t *l, uint16_t chunk, zap_leaf_t *nl) +{ + uint16_t new_chunk; + uint16_t *nchunkp = &new_chunk; + + while (chunk != CHAIN_END) { + uint16_t nchunk = zap_leaf_chunk_alloc(nl); + struct zap_leaf_array *nla = + &ZAP_LEAF_CHUNK(nl, nchunk).l_array; + struct zap_leaf_array *la = + &ZAP_LEAF_CHUNK(l, chunk).l_array; + int nextchunk = la->la_next; + + ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); + ASSERT3U(nchunk, <, ZAP_LEAF_NUMCHUNKS(l)); + + *nla = *la; /* structure assignment */ + + zap_leaf_chunk_free(l, chunk); + chunk = nextchunk; + *nchunkp = nchunk; + nchunkp = &nla->la_next; + } + *nchunkp = CHAIN_END; + return (new_chunk); +} + +static void +zap_leaf_transfer_entry(zap_leaf_t *l, int entry, zap_leaf_t *nl) +{ + struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry); + ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); + + uint16_t chunk = zap_leaf_chunk_alloc(nl); + struct zap_leaf_entry *nle = ZAP_LEAF_ENTRY(nl, chunk); + *nle = *le; /* structure assignment */ + + (void) zap_leaf_rehash_entry(nl, chunk); + + nle->le_name_chunk = zap_leaf_transfer_array(l, le->le_name_chunk, nl); + nle->le_value_chunk = + zap_leaf_transfer_array(l, le->le_value_chunk, nl); + + zap_leaf_chunk_free(l, entry); + + zap_leaf_phys(l)->l_hdr.lh_nentries--; + zap_leaf_phys(nl)->l_hdr.lh_nentries++; +} + +/* + * Transfer the entries whose hash prefix ends in 1 to the new leaf. 
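+ * For example, a leaf covering hash prefix 10 (lh_prefix_len 2) splits
+ * into one covering 100 and a new leaf nl covering 101 (lh_prefix_len 3);
+ * 'bit' below is the first hash bit past the old prefix.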
+ */ +void +zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort) +{ + int bit = 64 - 1 - zap_leaf_phys(l)->l_hdr.lh_prefix_len; + + /* set new prefix and prefix_len */ + zap_leaf_phys(l)->l_hdr.lh_prefix <<= 1; + zap_leaf_phys(l)->l_hdr.lh_prefix_len++; + zap_leaf_phys(nl)->l_hdr.lh_prefix = + zap_leaf_phys(l)->l_hdr.lh_prefix | 1; + zap_leaf_phys(nl)->l_hdr.lh_prefix_len = + zap_leaf_phys(l)->l_hdr.lh_prefix_len; + + /* break existing hash chains */ + zap_memset(zap_leaf_phys(l)->l_hash, CHAIN_END, + 2*ZAP_LEAF_HASH_NUMENTRIES(l)); + + if (sort) + zap_leaf_phys(l)->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED; + + /* + * Transfer entries whose hash bit 'bit' is set to nl; rehash + * the remaining entries + * + * NB: We could find entries via the hashtable instead. That + * would be O(hashents+numents) rather than O(numblks+numents), + * but this accesses memory more sequentially, and when we're + * called, the block is usually pretty full. + */ + for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) { + struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, i); + if (le->le_type != ZAP_CHUNK_ENTRY) + continue; + + if (le->le_hash & (1ULL << bit)) + zap_leaf_transfer_entry(l, i, nl); + else + (void) zap_leaf_rehash_entry(l, i); + } +} + +void +zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs) +{ + int n = zap_f_phys(zap)->zap_ptrtbl.zt_shift - + zap_leaf_phys(l)->l_hdr.lh_prefix_len; + n = MIN(n, ZAP_HISTOGRAM_SIZE-1); + zs->zs_leafs_with_2n_pointers[n]++; + + + n = zap_leaf_phys(l)->l_hdr.lh_nentries/5; + n = MIN(n, ZAP_HISTOGRAM_SIZE-1); + zs->zs_blocks_with_n5_entries[n]++; + + n = ((1<<FZAP_BLOCK_SHIFT(zap)) - + zap_leaf_phys(l)->l_hdr.lh_nfree * (ZAP_LEAF_ARRAY_BYTES+1))*10 / + (1<<FZAP_BLOCK_SHIFT(zap)); + n = MIN(n, ZAP_HISTOGRAM_SIZE-1); + zs->zs_blocks_n_tenths_full[n]++; + + for (int i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(l); i++) { + int nentries = 0; + int chunk = zap_leaf_phys(l)->l_hash[i]; + + while (chunk != CHAIN_END) { + struct zap_leaf_entry *le = + ZAP_LEAF_ENTRY(l, chunk); + + n = 1 + ZAP_LEAF_ARRAY_NCHUNKS(le->le_name_numints) + + ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_numints * + le->le_value_intlen); + n = MIN(n, ZAP_HISTOGRAM_SIZE-1); + zs->zs_entries_using_n_chunks[n]++; + + chunk = le->le_next; + nentries++; + } + + n = nentries; + n = MIN(n, ZAP_HISTOGRAM_SIZE-1); + zs->zs_buckets_with_n_entries[n]++; + } +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c new file mode 100644 index 000000000000..a2f7b08163c3 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c @@ -0,0 +1,1588 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2017 by Delphix. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + * Copyright 2017 Nexenta Systems, Inc. + */ + +#include <sys/zio.h> +#include <sys/spa.h> +#include <sys/dmu.h> +#include <sys/zfs_context.h> +#include <sys/zap.h> +#include <sys/refcount.h> +#include <sys/zap_impl.h> +#include <sys/zap_leaf.h> +#include <sys/avl.h> +#include <sys/arc.h> +#include <sys/dmu_objset.h> + +#ifdef _KERNEL +#include <sys/sunddi.h> +#endif + +extern inline mzap_phys_t *zap_m_phys(zap_t *zap); + +static int mzap_upgrade(zap_t **zapp, + void *tag, dmu_tx_t *tx, zap_flags_t flags); + +uint64_t +zap_getflags(zap_t *zap) +{ + if (zap->zap_ismicro) + return (0); + return (zap_f_phys(zap)->zap_flags); +} + +int +zap_hashbits(zap_t *zap) +{ + if (zap_getflags(zap) & ZAP_FLAG_HASH64) + return (48); + else + return (28); +} + +uint32_t +zap_maxcd(zap_t *zap) +{ + if (zap_getflags(zap) & ZAP_FLAG_HASH64) + return ((1<<16)-1); + else + return (-1U); +} + +static uint64_t +zap_hash(zap_name_t *zn) +{ + zap_t *zap = zn->zn_zap; + uint64_t h = 0; + + if (zap_getflags(zap) & ZAP_FLAG_PRE_HASHED_KEY) { + ASSERT(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY); + h = *(uint64_t *)zn->zn_key_orig; + } else { + h = zap->zap_salt; + ASSERT(h != 0); + ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); + + if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) { + const uint64_t *wp = zn->zn_key_norm; + + ASSERT(zn->zn_key_intlen == 8); + for (int i = 0; i < zn->zn_key_norm_numints; + wp++, i++) { + uint64_t word = *wp; + + for (int j = 0; j < zn->zn_key_intlen; j++) { + h = (h >> 8) ^ + zfs_crc64_table[(h ^ word) & 0xFF]; + word >>= NBBY; + } + } + } else { + const uint8_t *cp = zn->zn_key_norm; + + /* + * We previously stored the terminating null on + * disk, but didn't hash it, so we need to + * continue to not hash it. (The + * zn_key_*_numints includes the terminating + * null for non-binary keys.) + */ + int len = zn->zn_key_norm_numints - 1; + + ASSERT(zn->zn_key_intlen == 1); + for (int i = 0; i < len; cp++, i++) { + h = (h >> 8) ^ + zfs_crc64_table[(h ^ *cp) & 0xFF]; + } + } + } + /* + * Don't use all 64 bits, since we need some in the cookie for + * the collision differentiator. We MUST use the high bits, + * since those are the ones that we first pay attention to when + * choosing the bucket.
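+ *
+ * With the default 28-bit hash, for example, the statement below is
+ * h &= ~((1ULL << 36) - 1): the top 28 bits carry the hash and the
+ * low 36 bits are left zero for the collision differentiator.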
+ */ + h &= ~((1ULL << (64 - zap_hashbits(zap))) - 1); + + return (h); +} + +static int +zap_normalize(zap_t *zap, const char *name, char *namenorm, int normflags) +{ + ASSERT(!(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY)); + + size_t inlen = strlen(name) + 1; + size_t outlen = ZAP_MAXNAMELEN; + + int err = 0; + (void) u8_textprep_str((char *)name, &inlen, namenorm, &outlen, + normflags | U8_TEXTPREP_IGNORE_NULL | U8_TEXTPREP_IGNORE_INVALID, + U8_UNICODE_LATEST, &err); + + return (err); +} + +boolean_t +zap_match(zap_name_t *zn, const char *matchname) +{ + ASSERT(!(zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY)); + + if (zn->zn_matchtype & MT_NORMALIZE) { + char norm[ZAP_MAXNAMELEN]; + + if (zap_normalize(zn->zn_zap, matchname, norm, + zn->zn_normflags) != 0) + return (B_FALSE); + + return (strcmp(zn->zn_key_norm, norm) == 0); + } else { + return (strcmp(zn->zn_key_orig, matchname) == 0); + } +} + +void +zap_name_free(zap_name_t *zn) +{ + kmem_free(zn, sizeof (zap_name_t)); +} + +zap_name_t * +zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt) +{ + zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP); + + zn->zn_zap = zap; + zn->zn_key_intlen = sizeof (*key); + zn->zn_key_orig = key; + zn->zn_key_orig_numints = strlen(zn->zn_key_orig) + 1; + zn->zn_matchtype = mt; + zn->zn_normflags = zap->zap_normflags; + + /* + * If we're dealing with a case sensitive lookup on a mixed or + * insensitive fs, remove U8_TEXTPREP_TOUPPER or the lookup + * will fold case to all caps overriding the lookup request. + */ + if (mt & MT_MATCH_CASE) + zn->zn_normflags &= ~U8_TEXTPREP_TOUPPER; + + if (zap->zap_normflags) { + /* + * We *must* use zap_normflags because this normalization is + * what the hash is computed from. + */ + if (zap_normalize(zap, key, zn->zn_normbuf, + zap->zap_normflags) != 0) { + zap_name_free(zn); + return (NULL); + } + zn->zn_key_norm = zn->zn_normbuf; + zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1; + } else { + if (mt != 0) { + zap_name_free(zn); + return (NULL); + } + zn->zn_key_norm = zn->zn_key_orig; + zn->zn_key_norm_numints = zn->zn_key_orig_numints; + } + + zn->zn_hash = zap_hash(zn); + + if (zap->zap_normflags != zn->zn_normflags) { + /* + * We *must* use zn_normflags because this normalization is + * what the matching is based on. (Not the hash!) 
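+ *
+ * For example, on a case-insensitive filesystem an MT_MATCH_CASE
+ * lookup hashed the case-folded name above (via zap_normflags), but
+ * must compare against a form normalized without U8_TEXTPREP_TOUPPER,
+ * which was cleared from zn_normflags earlier.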
+ */ + if (zap_normalize(zap, key, zn->zn_normbuf, + zn->zn_normflags) != 0) { + zap_name_free(zn); + return (NULL); + } + zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1; + } + + return (zn); +} + +zap_name_t * +zap_name_alloc_uint64(zap_t *zap, const uint64_t *key, int numints) +{ + zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP); + + ASSERT(zap->zap_normflags == 0); + zn->zn_zap = zap; + zn->zn_key_intlen = sizeof (*key); + zn->zn_key_orig = zn->zn_key_norm = key; + zn->zn_key_orig_numints = zn->zn_key_norm_numints = numints; + zn->zn_matchtype = 0; + + zn->zn_hash = zap_hash(zn); + return (zn); +} + +static void +mzap_byteswap(mzap_phys_t *buf, size_t size) +{ + buf->mz_block_type = BSWAP_64(buf->mz_block_type); + buf->mz_salt = BSWAP_64(buf->mz_salt); + buf->mz_normflags = BSWAP_64(buf->mz_normflags); + int max = (size / MZAP_ENT_LEN) - 1; + for (int i = 0; i < max; i++) { + buf->mz_chunk[i].mze_value = + BSWAP_64(buf->mz_chunk[i].mze_value); + buf->mz_chunk[i].mze_cd = + BSWAP_32(buf->mz_chunk[i].mze_cd); + } +} + +void +zap_byteswap(void *buf, size_t size) +{ + uint64_t block_type = *(uint64_t *)buf; + + if (block_type == ZBT_MICRO || block_type == BSWAP_64(ZBT_MICRO)) { + /* ASSERT(magic == ZAP_LEAF_MAGIC); */ + mzap_byteswap(buf, size); + } else { + fzap_byteswap(buf, size); + } +} + +static int +mze_compare(const void *arg1, const void *arg2) +{ + const mzap_ent_t *mze1 = arg1; + const mzap_ent_t *mze2 = arg2; + + int cmp = AVL_CMP(mze1->mze_hash, mze2->mze_hash); + if (likely(cmp)) + return (cmp); + + return (AVL_CMP(mze1->mze_cd, mze2->mze_cd)); +} + +static int +mze_insert(zap_t *zap, int chunkid, uint64_t hash) +{ + avl_index_t idx; + + ASSERT(zap->zap_ismicro); + ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); + + mzap_ent_t *mze = kmem_alloc(sizeof (mzap_ent_t), KM_SLEEP); + mze->mze_chunkid = chunkid; + mze->mze_hash = hash; + mze->mze_cd = MZE_PHYS(zap, mze)->mze_cd; + ASSERT(MZE_PHYS(zap, mze)->mze_name[0] != 0); + if (avl_find(&zap->zap_m.zap_avl, mze, &idx) != NULL) { + kmem_free(mze, sizeof (mzap_ent_t)); + return (EEXIST); + } + avl_insert(&zap->zap_m.zap_avl, mze, idx); + return (0); +} + +static mzap_ent_t * +mze_find(zap_name_t *zn) +{ + mzap_ent_t mze_tofind; + mzap_ent_t *mze; + avl_index_t idx; + avl_tree_t *avl = &zn->zn_zap->zap_m.zap_avl; + + ASSERT(zn->zn_zap->zap_ismicro); + ASSERT(RW_LOCK_HELD(&zn->zn_zap->zap_rwlock)); + + mze_tofind.mze_hash = zn->zn_hash; + mze_tofind.mze_cd = 0; + + mze = avl_find(avl, &mze_tofind, &idx); + if (mze == NULL) + mze = avl_nearest(avl, idx, AVL_AFTER); + for (; mze && mze->mze_hash == zn->zn_hash; mze = AVL_NEXT(avl, mze)) { + ASSERT3U(mze->mze_cd, ==, MZE_PHYS(zn->zn_zap, mze)->mze_cd); + if (zap_match(zn, MZE_PHYS(zn->zn_zap, mze)->mze_name)) + return (mze); + } + + return (NULL); +} + +static uint32_t +mze_find_unused_cd(zap_t *zap, uint64_t hash) +{ + mzap_ent_t mze_tofind; + avl_index_t idx; + avl_tree_t *avl = &zap->zap_m.zap_avl; + + ASSERT(zap->zap_ismicro); + ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + + mze_tofind.mze_hash = hash; + mze_tofind.mze_cd = 0; + + uint32_t cd = 0; + for (mzap_ent_t *mze = avl_find(avl, &mze_tofind, &idx); + mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) { + if (mze->mze_cd != cd) + break; + cd++; + } + + return (cd); +} + +static void +mze_remove(zap_t *zap, mzap_ent_t *mze) +{ + ASSERT(zap->zap_ismicro); + ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); + + avl_remove(&zap->zap_m.zap_avl, mze); + kmem_free(mze, sizeof (mzap_ent_t)); +} + +static void +mze_destroy(zap_t 
*zap) +{ + mzap_ent_t *mze; + void *avlcookie = NULL; + + while (mze = avl_destroy_nodes(&zap->zap_m.zap_avl, &avlcookie)) + kmem_free(mze, sizeof (mzap_ent_t)); + avl_destroy(&zap->zap_m.zap_avl); +} + +static zap_t * +mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db) +{ + zap_t *winner; + uint64_t *zap_hdr = (uint64_t *)db->db_data; + uint64_t zap_block_type = zap_hdr[0]; + uint64_t zap_magic = zap_hdr[1]; + + ASSERT3U(MZAP_ENT_LEN, ==, sizeof (mzap_ent_phys_t)); + + zap_t *zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP); + rw_init(&zap->zap_rwlock, 0, 0, 0); + rw_enter(&zap->zap_rwlock, RW_WRITER); + zap->zap_objset = os; + zap->zap_object = obj; + zap->zap_dbuf = db; + + if (zap_block_type != ZBT_MICRO) { + mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0); + zap->zap_f.zap_block_shift = highbit64(db->db_size) - 1; + if (zap_block_type != ZBT_HEADER || zap_magic != ZAP_MAGIC) { + winner = NULL; /* No actual winner here... */ + goto handle_winner; + } + } else { + zap->zap_ismicro = TRUE; + } + + /* + * Make sure that zap_ismicro is set before we let others see + * it, because zap_lockdir() checks zap_ismicro without the lock + * held. + */ + dmu_buf_init_user(&zap->zap_dbu, zap_evict_sync, NULL, &zap->zap_dbuf); + winner = dmu_buf_set_user(db, &zap->zap_dbu); + + if (winner != NULL) + goto handle_winner; + + if (zap->zap_ismicro) { + zap->zap_salt = zap_m_phys(zap)->mz_salt; + zap->zap_normflags = zap_m_phys(zap)->mz_normflags; + zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1; + avl_create(&zap->zap_m.zap_avl, mze_compare, + sizeof (mzap_ent_t), offsetof(mzap_ent_t, mze_node)); + + for (int i = 0; i < zap->zap_m.zap_num_chunks; i++) { + mzap_ent_phys_t *mze = + &zap_m_phys(zap)->mz_chunk[i]; + if (mze->mze_name[0]) { + zap_name_t *zn; + + zn = zap_name_alloc(zap, mze->mze_name, 0); + if (mze_insert(zap, i, zn->zn_hash) == 0) + zap->zap_m.zap_num_entries++; + else { + printf("ZFS WARNING: Duplicated ZAP " + "entry detected (%s).\n", + mze->mze_name); + } + zap_name_free(zn); + } + } + } else { + zap->zap_salt = zap_f_phys(zap)->zap_salt; + zap->zap_normflags = zap_f_phys(zap)->zap_normflags; + + ASSERT3U(sizeof (struct zap_leaf_header), ==, + 2*ZAP_LEAF_CHUNKSIZE); + + /* + * The embedded pointer table should not overlap the + * other members. + */ + ASSERT3P(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), >, + &zap_f_phys(zap)->zap_salt); + + /* + * The embedded pointer table should end at the end of + * the block + */ + ASSERT3U((uintptr_t)&ZAP_EMBEDDED_PTRTBL_ENT(zap, + 1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)) - + (uintptr_t)zap_f_phys(zap), ==, + zap->zap_dbuf->db_size); + } + rw_exit(&zap->zap_rwlock); + return (zap); + +handle_winner: + rw_exit(&zap->zap_rwlock); + rw_destroy(&zap->zap_rwlock); + if (!zap->zap_ismicro) + mutex_destroy(&zap->zap_f.zap_num_entries_mtx); + kmem_free(zap, sizeof (zap_t)); + return (winner); +} + +/* + * This routine "consumes" the caller's hold on the dbuf, which must + * have the specified tag. + */ +static int +zap_lockdir_impl(dmu_buf_t *db, void *tag, dmu_tx_t *tx, + krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp) +{ + ASSERT0(db->db_offset); + objset_t *os = dmu_buf_get_objset(db); + uint64_t obj = db->db_object; + + *zapp = NULL; + + zap_t *zap = dmu_buf_get_user(db); + if (zap == NULL) { + zap = mzap_open(os, obj, db); + if (zap == NULL) { + /* + * mzap_open() didn't like what it saw on-disk. + * Check for corruption! 
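+ * (Here that means the header block began with neither ZBT_MICRO
+ * nor a valid ZBT_HEADER/ZAP_MAGIC pair; see mzap_open() above.)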
+ */ + return (SET_ERROR(EIO)); + } + } + + /* + * We're checking zap_ismicro without the lock held, in order to + * tell what type of lock we want. Once we have some sort of + * lock, see if it really is the right type. In practice this + * can only be different if it was upgraded from micro to fat, + * and micro wanted WRITER but fat only needs READER. + */ + krw_t lt = (!zap->zap_ismicro && fatreader) ? RW_READER : lti; + rw_enter(&zap->zap_rwlock, lt); + if (lt != ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)) { + /* it was upgraded, now we only need reader */ + ASSERT(lt == RW_WRITER); + ASSERT(RW_READER == + (!zap->zap_ismicro && fatreader) ? RW_READER : lti); + rw_downgrade(&zap->zap_rwlock); + lt = RW_READER; + } + + zap->zap_objset = os; + + if (lt == RW_WRITER) + dmu_buf_will_dirty(db, tx); + + ASSERT3P(zap->zap_dbuf, ==, db); + + ASSERT(!zap->zap_ismicro || + zap->zap_m.zap_num_entries <= zap->zap_m.zap_num_chunks); + if (zap->zap_ismicro && tx && adding && + zap->zap_m.zap_num_entries == zap->zap_m.zap_num_chunks) { + uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE; + if (newsz > MZAP_MAX_BLKSZ) { + dprintf("upgrading obj %llu: num_entries=%u\n", + obj, zap->zap_m.zap_num_entries); + *zapp = zap; + int err = mzap_upgrade(zapp, tag, tx, 0); + if (err != 0) + rw_exit(&zap->zap_rwlock); + return (err); + } + VERIFY0(dmu_object_set_blocksize(os, obj, newsz, 0, tx)); + zap->zap_m.zap_num_chunks = + db->db_size / MZAP_ENT_LEN - 1; + } + + *zapp = zap; + return (0); +} + +static int +zap_lockdir_by_dnode(dnode_t *dn, dmu_tx_t *tx, + krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp) +{ + dmu_buf_t *db; + + int err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH); + if (err != 0) { + return (err); + } +#ifdef ZFS_DEBUG + { + dmu_object_info_t doi; + dmu_object_info_from_db(db, &doi); + ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP); + } +#endif + + err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp); + if (err != 0) { + dmu_buf_rele(db, tag); + } + return (err); +} + +int +zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, + krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp) +{ + dmu_buf_t *db; + + int err = dmu_buf_hold(os, obj, 0, tag, &db, DMU_READ_NO_PREFETCH); + if (err != 0) + return (err); +#ifdef ZFS_DEBUG + { + dmu_object_info_t doi; + dmu_object_info_from_db(db, &doi); + ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP); + } +#endif + err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp); + if (err != 0) + dmu_buf_rele(db, tag); + return (err); +} + +void +zap_unlockdir(zap_t *zap, void *tag) +{ + rw_exit(&zap->zap_rwlock); + dmu_buf_rele(zap->zap_dbuf, tag); +} + +static int +mzap_upgrade(zap_t **zapp, void *tag, dmu_tx_t *tx, zap_flags_t flags) +{ + int err = 0; + zap_t *zap = *zapp; + + ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); + + int sz = zap->zap_dbuf->db_size; + mzap_phys_t *mzp = zio_buf_alloc(sz); + bcopy(zap->zap_dbuf->db_data, mzp, sz); + int nchunks = zap->zap_m.zap_num_chunks; + + if (!flags) { + err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object, + 1ULL << fzap_default_block_shift, 0, tx); + if (err != 0) { + zio_buf_free(mzp, sz); + return (err); + } + } + + dprintf("upgrading obj=%llu with %u chunks\n", + zap->zap_object, nchunks); + /* XXX destroy the avl later, so we can use the stored hash value */ + mze_destroy(zap); + + fzap_upgrade(zap, tx, flags); + + for (int i = 0; i < nchunks; i++) { + mzap_ent_phys_t *mze = 
&mzp->mz_chunk[i]; + if (mze->mze_name[0] == 0) + continue; + dprintf("adding %s=%llu\n", + mze->mze_name, mze->mze_value); + zap_name_t *zn = zap_name_alloc(zap, mze->mze_name, 0); + err = fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd, + tag, tx); + zap = zn->zn_zap; /* fzap_add_cd() may change zap */ + zap_name_free(zn); + if (err != 0) + break; + } + zio_buf_free(mzp, sz); + *zapp = zap; + return (err); +} + +/* + * The "normflags" determine the behavior of the matchtype_t which is + * passed to zap_lookup_norm(). Names which have the same normalized + * version will be stored with the same hash value, and therefore we can + * perform normalization-insensitive lookups. We can be Unicode form- + * insensitive and/or case-insensitive. The following flags are valid for + * "normflags": + * + * U8_TEXTPREP_NFC + * U8_TEXTPREP_NFD + * U8_TEXTPREP_NFKC + * U8_TEXTPREP_NFKD + * U8_TEXTPREP_TOUPPER + * + * The *_NF* (Normalization Form) flags are mutually exclusive; at most one + * of them may be supplied. + */ +void +mzap_create_impl(objset_t *os, uint64_t obj, int normflags, zap_flags_t flags, + dmu_tx_t *tx) +{ + dmu_buf_t *db; + + VERIFY0(dmu_buf_hold(os, obj, 0, FTAG, &db, DMU_READ_NO_PREFETCH)); + + dmu_buf_will_dirty(db, tx); + mzap_phys_t *zp = db->db_data; + zp->mz_block_type = ZBT_MICRO; + zp->mz_salt = ((uintptr_t)db ^ (uintptr_t)tx ^ (obj << 1)) | 1ULL; + zp->mz_normflags = normflags; + + if (flags != 0) { + zap_t *zap; + /* Only fat zap supports flags; upgrade immediately. */ + VERIFY0(zap_lockdir_impl(db, FTAG, tx, RW_WRITER, + B_FALSE, B_FALSE, &zap)); + VERIFY0(mzap_upgrade(&zap, FTAG, tx, flags)); + zap_unlockdir(zap, FTAG); + } else { + dmu_buf_rele(db, FTAG); + } +} + +int +zap_create_claim(objset_t *os, uint64_t obj, dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) +{ + return (zap_create_claim_dnsize(os, obj, ot, bonustype, bonuslen, + 0, tx)); +} + +int +zap_create_claim_dnsize(objset_t *os, uint64_t obj, dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) +{ + return (zap_create_claim_norm_dnsize(os, obj, + 0, ot, bonustype, bonuslen, dnodesize, tx)); +} + +int +zap_create_claim_norm(objset_t *os, uint64_t obj, int normflags, + dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) +{ + return (zap_create_claim_norm_dnsize(os, obj, normflags, ot, bonustype, + bonuslen, 0, tx)); +} + +int +zap_create_claim_norm_dnsize(objset_t *os, uint64_t obj, int normflags, + dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, + int dnodesize, dmu_tx_t *tx) + { + int err; + + err = dmu_object_claim_dnsize(os, obj, ot, 0, bonustype, bonuslen, + dnodesize, tx); + if (err != 0) + return (err); + mzap_create_impl(os, obj, normflags, 0, tx); + return (0); +} + +uint64_t +zap_create(objset_t *os, dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) +{ + return (zap_create_norm(os, 0, ot, bonustype, bonuslen, tx)); +} + +uint64_t +zap_create_dnsize(objset_t *os, dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) +{ + return (zap_create_norm_dnsize(os, 0, ot, bonustype, bonuslen, + dnodesize, tx)); +} + +uint64_t +zap_create_norm(objset_t *os, int normflags, dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) +{ + ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP); + return (zap_create_norm_dnsize(os, normflags, ot, bonustype, bonuslen, + 0, tx)); +} + +uint64_t 
+zap_create_norm_dnsize(objset_t *os, int normflags, dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) +{ + uint64_t obj = dmu_object_alloc_dnsize(os, ot, 0, bonustype, bonuslen, + dnodesize, tx); + + mzap_create_impl(os, obj, normflags, 0, tx); + return (obj); +} + +uint64_t +zap_create_flags(objset_t *os, int normflags, zap_flags_t flags, + dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) +{ + ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP); + return (zap_create_flags_dnsize(os, normflags, flags, ot, + leaf_blockshift, indirect_blockshift, bonustype, bonuslen, 0, tx)); +} + +uint64_t +zap_create_flags_dnsize(objset_t *os, int normflags, zap_flags_t flags, + dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, + dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) +{ + uint64_t obj = dmu_object_alloc_dnsize(os, ot, 0, bonustype, bonuslen, + dnodesize, tx); + + ASSERT(leaf_blockshift >= SPA_MINBLOCKSHIFT && + leaf_blockshift <= SPA_OLD_MAXBLOCKSHIFT && + indirect_blockshift >= SPA_MINBLOCKSHIFT && + indirect_blockshift <= SPA_OLD_MAXBLOCKSHIFT); + + VERIFY(dmu_object_set_blocksize(os, obj, + 1ULL << leaf_blockshift, indirect_blockshift, tx) == 0); + + mzap_create_impl(os, obj, normflags, flags, tx); + return (obj); +} + +int +zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx) +{ + /* + * dmu_object_free will free the object number and free the + * data. Freeing the data will cause our pageout function to be + * called, which will destroy our data (zap_leaf_t's and zap_t). + */ + + return (dmu_object_free(os, zapobj, tx)); +} + +void +zap_evict_sync(void *dbu) +{ + zap_t *zap = dbu; + + rw_destroy(&zap->zap_rwlock); + + if (zap->zap_ismicro) + mze_destroy(zap); + else + mutex_destroy(&zap->zap_f.zap_num_entries_mtx); + + kmem_free(zap, sizeof (zap_t)); +} + +int +zap_count(objset_t *os, uint64_t zapobj, uint64_t *count) +{ + zap_t *zap; + + int err = + zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); + if (err != 0) + return (err); + if (!zap->zap_ismicro) { + err = fzap_count(zap, count); + } else { + *count = zap->zap_m.zap_num_entries; + } + zap_unlockdir(zap, FTAG); + return (err); +} + +/* + * zn may be NULL; if not specified, it will be computed if needed. + * See also the comment above zap_entry_normalization_conflict(). + */ +static boolean_t +mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, mzap_ent_t *mze) +{ + int direction = AVL_BEFORE; + boolean_t allocdzn = B_FALSE; + + if (zap->zap_normflags == 0) + return (B_FALSE); + +again: + for (mzap_ent_t *other = avl_walk(&zap->zap_m.zap_avl, mze, direction); + other && other->mze_hash == mze->mze_hash; + other = avl_walk(&zap->zap_m.zap_avl, other, direction)) { + + if (zn == NULL) { + zn = zap_name_alloc(zap, MZE_PHYS(zap, mze)->mze_name, + MT_NORMALIZE); + allocdzn = B_TRUE; + } + if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) { + if (allocdzn) + zap_name_free(zn); + return (B_TRUE); + } + } + + if (direction == AVL_BEFORE) { + direction = AVL_AFTER; + goto again; + } + + if (allocdzn) + zap_name_free(zn); + return (B_FALSE); +} + +/* + * Routines for manipulating attributes. 
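+ *
+ * A minimal usage sketch (error handling elided; "myattr" is an
+ * illustrative name, not an attribute used anywhere in ZFS):
+ *
+ *	uint64_t val = 42, out;
+ *	VERIFY0(zap_add(os, zapobj, "myattr", 8, 1, &val, tx));
+ *	VERIFY0(zap_lookup(os, zapobj, "myattr", 8, 1, &out));
+ *
+ * A single 8-byte integer under a short name stays in the microzap;
+ * anything else upgrades the object to a fat zap via mzap_upgrade().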
+ */ + +int +zap_lookup(objset_t *os, uint64_t zapobj, const char *name, + uint64_t integer_size, uint64_t num_integers, void *buf) +{ + return (zap_lookup_norm(os, zapobj, name, integer_size, + num_integers, buf, 0, NULL, 0, NULL)); +} + +static int +zap_lookup_impl(zap_t *zap, const char *name, + uint64_t integer_size, uint64_t num_integers, void *buf, + matchtype_t mt, char *realname, int rn_len, + boolean_t *ncp) +{ + int err = 0; + + zap_name_t *zn = zap_name_alloc(zap, name, mt); + if (zn == NULL) + return (SET_ERROR(ENOTSUP)); + + if (!zap->zap_ismicro) { + err = fzap_lookup(zn, integer_size, num_integers, buf, + realname, rn_len, ncp); + } else { + mzap_ent_t *mze = mze_find(zn); + if (mze == NULL) { + err = SET_ERROR(ENOENT); + } else { + if (num_integers < 1) { + err = SET_ERROR(EOVERFLOW); + } else if (integer_size != 8) { + err = SET_ERROR(EINVAL); + } else { + *(uint64_t *)buf = + MZE_PHYS(zap, mze)->mze_value; + (void) strlcpy(realname, + MZE_PHYS(zap, mze)->mze_name, rn_len); + if (ncp) { + *ncp = mzap_normalization_conflict(zap, + zn, mze); + } + } + } + } + zap_name_free(zn); + return (err); +} + +int +zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name, + uint64_t integer_size, uint64_t num_integers, void *buf, + matchtype_t mt, char *realname, int rn_len, + boolean_t *ncp) +{ + zap_t *zap; + + int err = + zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); + if (err != 0) + return (err); + err = zap_lookup_impl(zap, name, integer_size, + num_integers, buf, mt, realname, rn_len, ncp); + zap_unlockdir(zap, FTAG); + return (err); +} + +int +zap_lookup_by_dnode(dnode_t *dn, const char *name, + uint64_t integer_size, uint64_t num_integers, void *buf) +{ + return (zap_lookup_norm_by_dnode(dn, name, integer_size, + num_integers, buf, 0, NULL, 0, NULL)); +} + +int +zap_lookup_norm_by_dnode(dnode_t *dn, const char *name, + uint64_t integer_size, uint64_t num_integers, void *buf, + matchtype_t mt, char *realname, int rn_len, + boolean_t *ncp) +{ + zap_t *zap; + + int err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, + FTAG, &zap); + if (err != 0) + return (err); + err = zap_lookup_impl(zap, name, integer_size, + num_integers, buf, mt, realname, rn_len, ncp); + zap_unlockdir(zap, FTAG); + return (err); +} + +int +zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints) +{ + zap_t *zap; + + int err = + zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); + if (err != 0) + return (err); + zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); + if (zn == NULL) { + zap_unlockdir(zap, FTAG); + return (SET_ERROR(ENOTSUP)); + } + + fzap_prefetch(zn); + zap_name_free(zn); + zap_unlockdir(zap, FTAG); + return (err); +} + +int +zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf) +{ + zap_t *zap; + + int err = + zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); + if (err != 0) + return (err); + zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); + if (zn == NULL) { + zap_unlockdir(zap, FTAG); + return (SET_ERROR(ENOTSUP)); + } + + err = fzap_lookup(zn, integer_size, num_integers, buf, + NULL, 0, NULL); + zap_name_free(zn); + zap_unlockdir(zap, FTAG); + return (err); +} + +int +zap_contains(objset_t *os, uint64_t zapobj, const char *name) +{ + int err = zap_lookup_norm(os, zapobj, name, 0, + 0, NULL, 0, NULL, 0, NULL); + if (err == EOVERFLOW || err == EINVAL) + err = 0; /* found, 
but skipped reading the value */ + return (err); +} + +int +zap_length(objset_t *os, uint64_t zapobj, const char *name, + uint64_t *integer_size, uint64_t *num_integers) +{ + zap_t *zap; + + int err = + zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); + if (err != 0) + return (err); + zap_name_t *zn = zap_name_alloc(zap, name, 0); + if (zn == NULL) { + zap_unlockdir(zap, FTAG); + return (SET_ERROR(ENOTSUP)); + } + if (!zap->zap_ismicro) { + err = fzap_length(zn, integer_size, num_integers); + } else { + mzap_ent_t *mze = mze_find(zn); + if (mze == NULL) { + err = SET_ERROR(ENOENT); + } else { + if (integer_size) + *integer_size = 8; + if (num_integers) + *num_integers = 1; + } + } + zap_name_free(zn); + zap_unlockdir(zap, FTAG); + return (err); +} + +int +zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints, uint64_t *integer_size, uint64_t *num_integers) +{ + zap_t *zap; + + int err = + zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); + if (err != 0) + return (err); + zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); + if (zn == NULL) { + zap_unlockdir(zap, FTAG); + return (SET_ERROR(ENOTSUP)); + } + err = fzap_length(zn, integer_size, num_integers); + zap_name_free(zn); + zap_unlockdir(zap, FTAG); + return (err); +} + +static void +mzap_addent(zap_name_t *zn, uint64_t value) +{ + zap_t *zap = zn->zn_zap; + int start = zap->zap_m.zap_alloc_next; + + ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); + +#ifdef ZFS_DEBUG + for (int i = 0; i < zap->zap_m.zap_num_chunks; i++) { + mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i]; + ASSERT(strcmp(zn->zn_key_orig, mze->mze_name) != 0); + } +#endif + + uint32_t cd = mze_find_unused_cd(zap, zn->zn_hash); + /* given the limited size of the microzap, this can't happen */ + ASSERT(cd < zap_maxcd(zap)); + +again: + for (int i = start; i < zap->zap_m.zap_num_chunks; i++) { + mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i]; + if (mze->mze_name[0] == 0) { + mze->mze_value = value; + mze->mze_cd = cd; + (void) strcpy(mze->mze_name, zn->zn_key_orig); + zap->zap_m.zap_num_entries++; + zap->zap_m.zap_alloc_next = i+1; + if (zap->zap_m.zap_alloc_next == + zap->zap_m.zap_num_chunks) + zap->zap_m.zap_alloc_next = 0; + VERIFY(0 == mze_insert(zap, i, zn->zn_hash)); + return; + } + } + if (start != 0) { + start = 0; + goto again; + } + ASSERT(!"out of entries!"); +} + +static int +zap_add_impl(zap_t *zap, const char *key, + int integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx, void *tag) +{ + const uint64_t *intval = val; + int err = 0; + + zap_name_t *zn = zap_name_alloc(zap, key, 0); + if (zn == NULL) { + zap_unlockdir(zap, tag); + return (SET_ERROR(ENOTSUP)); + } + if (!zap->zap_ismicro) { + err = fzap_add(zn, integer_size, num_integers, val, tag, tx); + zap = zn->zn_zap; /* fzap_add() may change zap */ + } else if (integer_size != 8 || num_integers != 1 || + strlen(key) >= MZAP_NAME_LEN) { + err = mzap_upgrade(&zn->zn_zap, tag, tx, 0); + if (err == 0) { + err = fzap_add(zn, integer_size, num_integers, val, + tag, tx); + } + zap = zn->zn_zap; /* fzap_add() may change zap */ + } else { + if (mze_find(zn) != NULL) { + err = SET_ERROR(EEXIST); + } else { + mzap_addent(zn, *intval); + } + } + ASSERT(zap == zn->zn_zap); + zap_name_free(zn); + if (zap != NULL) /* may be NULL if fzap_add() failed */ + zap_unlockdir(zap, tag); + return (err); +} + +int +zap_add(objset_t *os, uint64_t zapobj, const char *key, + int integer_size, uint64_t num_integers, + const void 
*val, dmu_tx_t *tx) +{ + zap_t *zap; + int err; + + err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); + if (err != 0) + return (err); + err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG); + /* zap_add_impl() calls zap_unlockdir() */ + return (err); +} + +int +zap_add_by_dnode(dnode_t *dn, const char *key, + int integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx) +{ + zap_t *zap; + int err; + + err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); + if (err != 0) + return (err); + err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG); + /* zap_add_impl() calls zap_unlockdir() */ + return (err); +} + +int +zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints, int integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx) +{ + zap_t *zap; + + int err = + zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); + if (err != 0) + return (err); + zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); + if (zn == NULL) { + zap_unlockdir(zap, FTAG); + return (SET_ERROR(ENOTSUP)); + } + err = fzap_add(zn, integer_size, num_integers, val, FTAG, tx); + zap = zn->zn_zap; /* fzap_add() may change zap */ + zap_name_free(zn); + if (zap != NULL) /* may be NULL if fzap_add() failed */ + zap_unlockdir(zap, FTAG); + return (err); +} + +int +zap_update(objset_t *os, uint64_t zapobj, const char *name, + int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) +{ + zap_t *zap; + uint64_t oldval; + const uint64_t *intval = val; + +#ifdef ZFS_DEBUG + /* + * If there is an old value, it shouldn't change across the + * lockdir (eg, due to bprewrite's xlation). + */ + if (integer_size == 8 && num_integers == 1) + (void) zap_lookup(os, zapobj, name, 8, 1, &oldval); +#endif + + int err = + zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); + if (err != 0) + return (err); + zap_name_t *zn = zap_name_alloc(zap, name, 0); + if (zn == NULL) { + zap_unlockdir(zap, FTAG); + return (SET_ERROR(ENOTSUP)); + } + if (!zap->zap_ismicro) { + err = fzap_update(zn, integer_size, num_integers, val, + FTAG, tx); + zap = zn->zn_zap; /* fzap_update() may change zap */ + } else if (integer_size != 8 || num_integers != 1 || + strlen(name) >= MZAP_NAME_LEN) { + dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n", + zapobj, integer_size, num_integers, name); + err = mzap_upgrade(&zn->zn_zap, FTAG, tx, 0); + if (err == 0) { + err = fzap_update(zn, integer_size, num_integers, + val, FTAG, tx); + } + zap = zn->zn_zap; /* fzap_update() may change zap */ + } else { + mzap_ent_t *mze = mze_find(zn); + if (mze != NULL) { + ASSERT3U(MZE_PHYS(zap, mze)->mze_value, ==, oldval); + MZE_PHYS(zap, mze)->mze_value = *intval; + } else { + mzap_addent(zn, *intval); + } + } + ASSERT(zap == zn->zn_zap); + zap_name_free(zn); + if (zap != NULL) /* may be NULL if fzap_upgrade() failed */ + zap_unlockdir(zap, FTAG); + return (err); +} + +int +zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints, + int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) +{ + zap_t *zap; + + int err = + zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); + if (err != 0) + return (err); + zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); + if (zn == NULL) { + zap_unlockdir(zap, FTAG); + return (SET_ERROR(ENOTSUP)); + } + err = fzap_update(zn, integer_size, num_integers, val, FTAG, tx); + zap = zn->zn_zap; /* fzap_update() 
may change zap */ + zap_name_free(zn); + if (zap != NULL) /* may be NULL if fzap_upgrade() failed */ + zap_unlockdir(zap, FTAG); + return (err); +} + +int +zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx) +{ + return (zap_remove_norm(os, zapobj, name, 0, tx)); +} + +static int +zap_remove_impl(zap_t *zap, const char *name, + matchtype_t mt, dmu_tx_t *tx) +{ + int err = 0; + + zap_name_t *zn = zap_name_alloc(zap, name, mt); + if (zn == NULL) + return (SET_ERROR(ENOTSUP)); + if (!zap->zap_ismicro) { + err = fzap_remove(zn, tx); + } else { + mzap_ent_t *mze = mze_find(zn); + if (mze == NULL) { + err = SET_ERROR(ENOENT); + } else { + zap->zap_m.zap_num_entries--; + bzero(&zap_m_phys(zap)->mz_chunk[mze->mze_chunkid], + sizeof (mzap_ent_phys_t)); + mze_remove(zap, mze); + } + } + zap_name_free(zn); + return (err); +} + +int +zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name, + matchtype_t mt, dmu_tx_t *tx) +{ + zap_t *zap; + int err; + + err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap); + if (err) + return (err); + err = zap_remove_impl(zap, name, mt, tx); + zap_unlockdir(zap, FTAG); + return (err); +} + +int +zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx) +{ + zap_t *zap; + int err; + + err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap); + if (err) + return (err); + err = zap_remove_impl(zap, name, 0, tx); + zap_unlockdir(zap, FTAG); + return (err); +} + +int +zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints, dmu_tx_t *tx) +{ + zap_t *zap; + + int err = + zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap); + if (err != 0) + return (err); + zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); + if (zn == NULL) { + zap_unlockdir(zap, FTAG); + return (SET_ERROR(ENOTSUP)); + } + err = fzap_remove(zn, tx); + zap_name_free(zn); + zap_unlockdir(zap, FTAG); + return (err); +} + +/* + * Routines for iterating over the attributes. + */ + +void +zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj, + uint64_t serialized) +{ + zc->zc_objset = os; + zc->zc_zap = NULL; + zc->zc_leaf = NULL; + zc->zc_zapobj = zapobj; + zc->zc_serialized = serialized; + zc->zc_hash = 0; + zc->zc_cd = 0; +} + +void +zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj) +{ + zap_cursor_init_serialized(zc, os, zapobj, 0); +} + +void +zap_cursor_fini(zap_cursor_t *zc) +{ + if (zc->zc_zap) { + rw_enter(&zc->zc_zap->zap_rwlock, RW_READER); + zap_unlockdir(zc->zc_zap, NULL); + zc->zc_zap = NULL; + } + if (zc->zc_leaf) { + rw_enter(&zc->zc_leaf->l_rwlock, RW_READER); + zap_put_leaf(zc->zc_leaf); + zc->zc_leaf = NULL; + } + zc->zc_objset = NULL; +} + +uint64_t +zap_cursor_serialize(zap_cursor_t *zc) +{ + if (zc->zc_hash == -1ULL) + return (-1ULL); + if (zc->zc_zap == NULL) + return (zc->zc_serialized); + ASSERT((zc->zc_hash & zap_maxcd(zc->zc_zap)) == 0); + ASSERT(zc->zc_cd < zap_maxcd(zc->zc_zap)); + + /* + * We want to keep the high 32 bits of the cursor zero if we can, so + * that 32-bit programs can access this. So usually use a small + * (28-bit) hash value so we can fit 4 bits of cd into the low 32-bits + * of the cursor. 
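+ * With the default 28-bit hash, for example, the value returned below
+ * is (zc_hash >> 36) | ((uint64_t)zc_cd << 28), so any cd below 16
+ * keeps the whole cookie within 32 bits.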
+ * + * [ collision differentiator | zap_hashbits()-bit hash value ] + */ + return ((zc->zc_hash >> (64 - zap_hashbits(zc->zc_zap))) | + ((uint64_t)zc->zc_cd << zap_hashbits(zc->zc_zap))); +} + +int +zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za) +{ + int err; + + if (zc->zc_hash == -1ULL) + return (SET_ERROR(ENOENT)); + + if (zc->zc_zap == NULL) { + int hb; + err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL, + RW_READER, TRUE, FALSE, NULL, &zc->zc_zap); + if (err != 0) + return (err); + + /* + * To support zap_cursor_init_serialized, advance, retrieve, + * we must add to the existing zc_cd, which may already + * be 1 due to the zap_cursor_advance. + */ + ASSERT(zc->zc_hash == 0); + hb = zap_hashbits(zc->zc_zap); + zc->zc_hash = zc->zc_serialized << (64 - hb); + zc->zc_cd += zc->zc_serialized >> hb; + if (zc->zc_cd >= zap_maxcd(zc->zc_zap)) /* corrupt serialized */ + zc->zc_cd = 0; + } else { + rw_enter(&zc->zc_zap->zap_rwlock, RW_READER); + } + if (!zc->zc_zap->zap_ismicro) { + err = fzap_cursor_retrieve(zc->zc_zap, zc, za); + } else { + avl_index_t idx; + mzap_ent_t mze_tofind; + + mze_tofind.mze_hash = zc->zc_hash; + mze_tofind.mze_cd = zc->zc_cd; + + mzap_ent_t *mze = + avl_find(&zc->zc_zap->zap_m.zap_avl, &mze_tofind, &idx); + if (mze == NULL) { + mze = avl_nearest(&zc->zc_zap->zap_m.zap_avl, + idx, AVL_AFTER); + } + if (mze) { + mzap_ent_phys_t *mzep = MZE_PHYS(zc->zc_zap, mze); + ASSERT3U(mze->mze_cd, ==, mzep->mze_cd); + za->za_normalization_conflict = + mzap_normalization_conflict(zc->zc_zap, NULL, mze); + za->za_integer_length = 8; + za->za_num_integers = 1; + za->za_first_integer = mzep->mze_value; + (void) strcpy(za->za_name, mzep->mze_name); + zc->zc_hash = mze->mze_hash; + zc->zc_cd = mze->mze_cd; + err = 0; + } else { + zc->zc_hash = -1ULL; + err = SET_ERROR(ENOENT); + } + } + rw_exit(&zc->zc_zap->zap_rwlock); + return (err); +} + +void +zap_cursor_advance(zap_cursor_t *zc) +{ + if (zc->zc_hash == -1ULL) + return; + zc->zc_cd++; +} + +int +zap_cursor_move_to_key(zap_cursor_t *zc, const char *name, matchtype_t mt) +{ + int err = 0; + mzap_ent_t *mze; + zap_name_t *zn; + + if (zc->zc_zap == NULL) { + err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL, + RW_READER, TRUE, FALSE, FTAG, &zc->zc_zap); + if (err) + return (err); + } else { + rw_enter(&zc->zc_zap->zap_rwlock, RW_READER); + } + + zn = zap_name_alloc(zc->zc_zap, name, mt); + if (zn == NULL) { + rw_exit(&zc->zc_zap->zap_rwlock); + return (SET_ERROR(ENOTSUP)); + } + + if (!zc->zc_zap->zap_ismicro) { + err = fzap_cursor_move_to_key(zc, zn); + } else { + mze = mze_find(zn); + if (mze == NULL) { + err = SET_ERROR(ENOENT); + goto out; + } + zc->zc_hash = mze->mze_hash; + zc->zc_cd = mze->mze_cd; + } + +out: + zap_name_free(zn); + rw_exit(&zc->zc_zap->zap_rwlock); + return (err); +} + +int +zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs) +{ + zap_t *zap; + + int err = + zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); + if (err != 0) + return (err); + + bzero(zs, sizeof (zap_stats_t)); + + if (zap->zap_ismicro) { + zs->zs_blocksize = zap->zap_dbuf->db_size; + zs->zs_num_entries = zap->zap_m.zap_num_entries; + zs->zs_num_blocks = 1; + } else { + fzap_get_stats(zap, zs); + } + zap_unlockdir(zap, FTAG); + return (0); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp.c new file mode 100644 index 000000000000..54bc638c6e98 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp.c @@ -0,0 +1,1420 
@@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2016, 2017 by Delphix. All rights reserved.
+ */
+
+/*
+ * ZFS Channel Programs (ZCP)
+ *
+ * The ZCP interface allows various ZFS administrative operations (e.g.
+ * creating and destroying snapshots, typically performed via an ioctl to
+ * /dev/zfs by the zfs(1M) command and libzfs/libzfs_core) to be run
+ * programmatically as a Lua script. A ZCP script is run as a dsl_sync_task
+ * and fully executed during one transaction group sync. This ensures that no
+ * other changes can be written concurrently with a running Lua script.
+ * Combining multiple calls to the exposed ZFS functions into one script
+ * gives a number of benefits:
+ *
+ * 1. Atomicity. For some compound or iterative operations, it's useful to be
+ * able to guarantee that the state of a pool has not changed between calls to
+ * ZFS.
+ *
+ * 2. Performance. If a large number of changes need to be made (e.g. deleting
+ * many filesystems), there can be a significant performance penalty as a
+ * result of the need to wait for a transaction group sync to pass for every
+ * single operation. When expressed as a single ZCP script, all these changes
+ * can be performed at once in one txg sync.
+ *
+ * A modified version of the Lua 5.2 interpreter is used to run channel program
+ * scripts. The Lua 5.2 manual can be found at:
+ *
+ * http://www.lua.org/manual/5.2/
+ *
+ * If being run by a user (via an ioctl syscall), executing a ZCP script
+ * requires root privileges in the global zone.
+ *
+ * Scripts are passed to zcp_eval() as a string, then run in a synctask by
+ * zcp_eval_sync(). Arguments can be passed into the Lua script as an nvlist,
+ * which will be converted to a Lua table. Similarly, values returned from
+ * a ZCP script will be converted to an nvlist. See zcp_lua_to_nvlist_impl()
+ * for details on exact allowed types and conversion.
+ *
+ * ZFS functionality is exposed to a ZCP script as a library of function calls.
+ * These calls are sorted into submodules, such as zfs.list and zfs.sync, for
+ * iterators and synctasks, respectively. Each of these submodules resides in
+ * its own source file, with a zcp_*_info structure describing each library
+ * call in the submodule.
+ *
+ * Error handling in ZCP scripts is handled by a number of different methods
+ * based on severity:
+ *
+ * 1. Memory and time limits are in place to prevent a channel program from
+ * consuming excessive system memory or running forever. If one of these
+ * limits is hit, the channel program will be stopped immediately and return
+ * from zcp_eval() with an error code. No attempt will be made to roll back or
+ * undo any changes made by the channel program before the error occurred.
+ * Consumers invoking zcp_eval() from elsewhere in the kernel may pass a time
+ * limit of 0, disabling the time limit.
+ *
+ * 2. Internal Lua errors can occur as a result of a syntax error, calling a
+ * library function with incorrect arguments, invoking the error() function,
+ * failing an assert(), or other runtime errors.
In these cases the channel
+ * program will stop executing and return from zcp_eval() with an error code.
+ * In place of a return value, an error message will also be returned in the
+ * 'result' nvlist containing information about the error. No attempt will be
+ * made to roll back or undo any changes made by the channel program before the
+ * error occurred.
+ *
+ * 3. If an error occurs inside a ZFS library call which returns an error code,
+ * the error is returned to the Lua script to be handled as desired.
+ *
+ * In the first two cases, Lua's error-throwing mechanism is used, which
+ * longjumps out of the script execution with luaL_error() and returns with the
+ * error.
+ *
+ * See zfs-program(1M) for more information on high-level usage.
+ */
+
+#include "lua.h"
+#include "lualib.h"
+#include "lauxlib.h"
+
+#include <sys/dsl_prop.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dsl_dataset.h>
+#include <sys/zcp.h>
+#include <sys/zcp_iter.h>
+#include <sys/zcp_prop.h>
+#include <sys/zcp_global.h>
+#ifdef illumos
+#include <util/sscanf.h>
+#endif
+
+#ifdef __FreeBSD__
+#define	ECHRNG EDOM
+#define	ETIME ETIMEDOUT
+#endif
+
+#define	ZCP_NVLIST_MAX_DEPTH 20
+
+uint64_t zfs_lua_check_instrlimit_interval = 100;
+uint64_t zfs_lua_max_instrlimit = ZCP_MAX_INSTRLIMIT;
+uint64_t zfs_lua_max_memlimit = ZCP_MAX_MEMLIMIT;
+
+/*
+ * Forward declarations for mutually recursive functions
+ */
+static int zcp_nvpair_value_to_lua(lua_State *, nvpair_t *, char *, int);
+static int zcp_lua_to_nvlist_impl(lua_State *, int, nvlist_t *, const char *,
+    int);
+
+typedef struct zcp_alloc_arg {
+	boolean_t	aa_must_succeed;
+	int64_t		aa_alloc_remaining;
+	int64_t		aa_alloc_limit;
+} zcp_alloc_arg_t;
+
+typedef struct zcp_eval_arg {
+	lua_State	*ea_state;
+	zcp_alloc_arg_t	*ea_allocargs;
+	cred_t		*ea_cred;
+	nvlist_t	*ea_outnvl;
+	int		ea_result;
+	uint64_t	ea_instrlimit;
+} zcp_eval_arg_t;
+
+/*
+ * The outer-most error callback handler for use with lua_pcall(). On
+ * error Lua will call this callback with a single argument that
+ * represents the error value. In most cases this will be a string
+ * containing an error message, but channel programs can use Lua's
+ * error() function to return arbitrary objects as errors. This callback
+ * returns (on the Lua stack) the original error object along with a traceback.
+ *
+ * Fatal Lua errors can occur while resources are held, so we also call any
+ * registered cleanup function here.
+ */
+static int
+zcp_error_handler(lua_State *state)
+{
+	const char *msg;
+
+	zcp_cleanup(state);
+
+	VERIFY3U(1, ==, lua_gettop(state));
+	msg = lua_tostring(state, 1);
+	luaL_traceback(state, state, msg, 1);
+	return (1);
+}
+
+int
+zcp_argerror(lua_State *state, int narg, const char *msg, ...)
+{
+	va_list alist;
+
+	va_start(alist, msg);
+	const char *buf = lua_pushvfstring(state, msg, alist);
+	va_end(alist);
+
+	return (luaL_argerror(state, narg, buf));
+}
+
+/*
+ * Install a new cleanup function, which will be invoked with the given
+ * opaque argument if a fatal error causes the Lua interpreter to longjump out
+ * of a function call.
+ *
+ * If an error occurs, the cleanup function will be invoked exactly once and
+ * then unregistered.
+ *
+ * Returns the registered cleanup handler so the caller can deregister it
+ * if no error occurs.
+ */ +zcp_cleanup_handler_t * +zcp_register_cleanup(lua_State *state, zcp_cleanup_t cleanfunc, void *cleanarg) +{ + zcp_run_info_t *ri = zcp_run_info(state); + + zcp_cleanup_handler_t *zch = kmem_alloc(sizeof (*zch), KM_SLEEP); + zch->zch_cleanup_func = cleanfunc; + zch->zch_cleanup_arg = cleanarg; + list_insert_head(&ri->zri_cleanup_handlers, zch); + + return (zch); +} + +void +zcp_deregister_cleanup(lua_State *state, zcp_cleanup_handler_t *zch) +{ + zcp_run_info_t *ri = zcp_run_info(state); + list_remove(&ri->zri_cleanup_handlers, zch); + kmem_free(zch, sizeof (*zch)); +} + +/* + * Execute the currently registered cleanup handlers then free them and + * destroy the handler list. + */ +void +zcp_cleanup(lua_State *state) +{ + zcp_run_info_t *ri = zcp_run_info(state); + + for (zcp_cleanup_handler_t *zch = + list_remove_head(&ri->zri_cleanup_handlers); zch != NULL; + zch = list_remove_head(&ri->zri_cleanup_handlers)) { + zch->zch_cleanup_func(zch->zch_cleanup_arg); + kmem_free(zch, sizeof (*zch)); + } +} + +/* + * Convert the lua table at the given index on the Lua stack to an nvlist + * and return it. + * + * If the table can not be converted for any reason, NULL is returned and + * an error message is pushed onto the Lua stack. + */ +static nvlist_t * +zcp_table_to_nvlist(lua_State *state, int index, int depth) +{ + nvlist_t *nvl; + /* + * Converting a Lua table to an nvlist with key uniqueness checking is + * O(n^2) in the number of keys in the nvlist, which can take a long + * time when we return a large table from a channel program. + * Furthermore, Lua's table interface *almost* guarantees unique keys + * on its own (details below). Therefore, we don't use fnvlist_alloc() + * here to avoid the built-in uniqueness checking. + * + * The *almost* is because it's possible to have key collisions between + * e.g. the string "1" and the number 1, or the string "true" and the + * boolean true, so we explicitly check that when we're looking at a + * key which is an integer / boolean or a string that can be parsed as + * one of those types. In the worst case this could still devolve into + * O(n^2), so we only start doing these checks on boolean/integer keys + * once we've seen a string key which fits this weird usage pattern. + * + * Ultimately, we still want callers to know that the keys in this + * nvlist are unique, so before we return this we set the nvlist's + * flags to reflect that. + */ + VERIFY0(nvlist_alloc(&nvl, 0, KM_SLEEP)); + + /* + * Push an empty stack slot where lua_next() will store each + * table key. + */ + lua_pushnil(state); + boolean_t saw_str_could_collide = B_FALSE; + while (lua_next(state, index) != 0) { + /* + * The next key-value pair from the table at index is + * now on the stack, with the key at stack slot -2 and + * the value at slot -1. + */ + int err = 0; + char buf[32]; + const char *key = NULL; + boolean_t key_could_collide = B_FALSE; + + switch (lua_type(state, -2)) { + case LUA_TSTRING: + key = lua_tostring(state, -2); + + /* check if this could collide with a number or bool */ + long long tmp; + int parselen; + if ((sscanf(key, "%lld%n", &tmp, &parselen) > 0 && + parselen == strlen(key)) || + strcmp(key, "true") == 0 || + strcmp(key, "false") == 0) { + key_could_collide = B_TRUE; + saw_str_could_collide = B_TRUE; + } + break; + case LUA_TBOOLEAN: + key = (lua_toboolean(state, -2) == B_TRUE ? 
+ "true" : "false"); + if (saw_str_could_collide) { + key_could_collide = B_TRUE; + } + break; + case LUA_TNUMBER: + VERIFY3U(sizeof (buf), >, + snprintf(buf, sizeof (buf), "%lld", + (longlong_t)lua_tonumber(state, -2))); + key = buf; + if (saw_str_could_collide) { + key_could_collide = B_TRUE; + } + break; + default: + fnvlist_free(nvl); + (void) lua_pushfstring(state, "Invalid key " + "type '%s' in table", + lua_typename(state, lua_type(state, -2))); + return (NULL); + } + /* + * Check for type-mismatched key collisions, and throw an error. + */ + if (key_could_collide && nvlist_exists(nvl, key)) { + fnvlist_free(nvl); + (void) lua_pushfstring(state, "Collision of " + "key '%s' in table", key); + return (NULL); + } + /* + * Recursively convert the table value and insert into + * the new nvlist with the parsed key. To prevent + * stack overflow on circular or heavily nested tables, + * we track the current nvlist depth. + */ + if (depth >= ZCP_NVLIST_MAX_DEPTH) { + fnvlist_free(nvl); + (void) lua_pushfstring(state, "Maximum table " + "depth (%d) exceeded for table", + ZCP_NVLIST_MAX_DEPTH); + return (NULL); + } + err = zcp_lua_to_nvlist_impl(state, -1, nvl, key, + depth + 1); + if (err != 0) { + fnvlist_free(nvl); + /* + * Error message has been pushed to the lua + * stack by the recursive call. + */ + return (NULL); + } + /* + * Pop the value pushed by lua_next(). + */ + lua_pop(state, 1); + } + + /* + * Mark the nvlist as having unique keys. This is a little ugly, but we + * ensured above that there are no duplicate keys in the nvlist. + */ + nvl->nvl_nvflag |= NV_UNIQUE_NAME; + + return (nvl); +} + +/* + * Convert a value from the given index into the lua stack to an nvpair, adding + * it to an nvlist with the given key. + * + * Values are converted as follows: + * + * string -> string + * number -> int64 + * boolean -> boolean + * nil -> boolean (no value) + * + * Lua tables are converted to nvlists and then inserted. The table's keys + * are converted to strings then used as keys in the nvlist to store each table + * element. Keys are converted as follows: + * + * string -> no change + * number -> "%lld" + * boolean -> "true" | "false" + * nil -> error + * + * In the case of a key collision, an error is thrown. + * + * If an error is encountered, a nonzero error code is returned, and an error + * string will be pushed onto the Lua stack. + */ +static int +zcp_lua_to_nvlist_impl(lua_State *state, int index, nvlist_t *nvl, + const char *key, int depth) +{ + /* + * Verify that we have enough remaining space in the lua stack to parse + * a key-value pair and push an error. 
+ */ + if (!lua_checkstack(state, 3)) { + (void) lua_pushstring(state, "Lua stack overflow"); + return (1); + } + + index = lua_absindex(state, index); + + switch (lua_type(state, index)) { + case LUA_TNIL: + fnvlist_add_boolean(nvl, key); + break; + case LUA_TBOOLEAN: + fnvlist_add_boolean_value(nvl, key, + lua_toboolean(state, index)); + break; + case LUA_TNUMBER: + fnvlist_add_int64(nvl, key, lua_tonumber(state, index)); + break; + case LUA_TSTRING: + fnvlist_add_string(nvl, key, lua_tostring(state, index)); + break; + case LUA_TTABLE: { + nvlist_t *value_nvl = zcp_table_to_nvlist(state, index, depth); + if (value_nvl == NULL) + return (EINVAL); + + fnvlist_add_nvlist(nvl, key, value_nvl); + fnvlist_free(value_nvl); + break; + } + default: + (void) lua_pushfstring(state, + "Invalid value type '%s' for key '%s'", + lua_typename(state, lua_type(state, index)), key); + return (EINVAL); + } + + return (0); +} + +/* + * Convert a lua value to an nvpair, adding it to an nvlist with the given key. + */ +static void +zcp_lua_to_nvlist(lua_State *state, int index, nvlist_t *nvl, const char *key) +{ + /* + * On error, zcp_lua_to_nvlist_impl pushes an error string onto the Lua + * stack before returning with a nonzero error code. If an error is + * returned, throw a fatal lua error with the given string. + */ + if (zcp_lua_to_nvlist_impl(state, index, nvl, key, 0) != 0) + (void) lua_error(state); +} + +static int +zcp_lua_to_nvlist_helper(lua_State *state) +{ + nvlist_t *nv = (nvlist_t *)lua_touserdata(state, 2); + const char *key = (const char *)lua_touserdata(state, 1); + zcp_lua_to_nvlist(state, 3, nv, key); + return (0); +} + +static void +zcp_convert_return_values(lua_State *state, nvlist_t *nvl, + const char *key, zcp_eval_arg_t *evalargs) +{ + int err; + VERIFY3U(1, ==, lua_gettop(state)); + lua_pushcfunction(state, zcp_lua_to_nvlist_helper); + lua_pushlightuserdata(state, (char *)key); + lua_pushlightuserdata(state, nvl); + lua_pushvalue(state, 1); + lua_remove(state, 1); + err = lua_pcall(state, 3, 0, 0); /* zcp_lua_to_nvlist_helper */ + if (err != 0) { + zcp_lua_to_nvlist(state, 1, nvl, ZCP_RET_ERROR); + evalargs->ea_result = SET_ERROR(ECHRNG); + } +} + +/* + * Push a Lua table representing nvl onto the stack. If it can't be + * converted, return EINVAL, fill in errbuf, and push nothing. errbuf may + * be specified as NULL, in which case no error string will be output. + * + * Most nvlists are converted as simple key->value Lua tables, but we make + * an exception for the case where all nvlist entries are BOOLEANs (a string + * key without a value). In Lua, a table key pointing to a value of Nil + * (no value) is equivalent to the key not existing, so a BOOLEAN nvlist + * entry can't be directly converted to a Lua table entry. Nvlists of entirely + * BOOLEAN entries are frequently used to pass around lists of datasets, so for + * convenience we check for this case, and convert it to a simple Lua array of + * strings. + */ +int +zcp_nvlist_to_lua(lua_State *state, nvlist_t *nvl, + char *errbuf, int errbuf_len) +{ + nvpair_t *pair; + lua_newtable(state); + boolean_t has_values = B_FALSE; + /* + * If the list doesn't have any values, just convert it to a string + * array. 
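+	 *
+	 * For example, if a (hypothetical) input nvlist holds only the
+	 * BOOLEAN keys "rpool/a" and "rpool/b", the script sees a plain
+	 * Lua array; a minimal sketch of what a channel program observes:
+	 *
+	 *     local t = ...                -- the converted table
+	 *     for i, name in ipairs(t) do
+	 *         zfs.debug(name)          -- "rpool/a", then "rpool/b"
+	 *     end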
+ */ + for (pair = nvlist_next_nvpair(nvl, NULL); + pair != NULL; pair = nvlist_next_nvpair(nvl, pair)) { + if (nvpair_type(pair) != DATA_TYPE_BOOLEAN) { + has_values = B_TRUE; + break; + } + } + if (!has_values) { + int i = 1; + for (pair = nvlist_next_nvpair(nvl, NULL); + pair != NULL; pair = nvlist_next_nvpair(nvl, pair)) { + (void) lua_pushinteger(state, i); + (void) lua_pushstring(state, nvpair_name(pair)); + (void) lua_settable(state, -3); + i++; + } + } else { + for (pair = nvlist_next_nvpair(nvl, NULL); + pair != NULL; pair = nvlist_next_nvpair(nvl, pair)) { + int err = zcp_nvpair_value_to_lua(state, pair, + errbuf, errbuf_len); + if (err != 0) { + lua_pop(state, 1); + return (err); + } + (void) lua_setfield(state, -2, nvpair_name(pair)); + } + } + return (0); +} + +/* + * Push a Lua object representing the value of "pair" onto the stack. + * + * Only understands boolean_value, string, int64, nvlist, + * string_array, and int64_array type values. For other + * types, returns EINVAL, fills in errbuf, and pushes nothing. + */ +static int +zcp_nvpair_value_to_lua(lua_State *state, nvpair_t *pair, + char *errbuf, int errbuf_len) +{ + int err = 0; + + if (pair == NULL) { + lua_pushnil(state); + return (0); + } + + switch (nvpair_type(pair)) { + case DATA_TYPE_BOOLEAN_VALUE: + (void) lua_pushboolean(state, + fnvpair_value_boolean_value(pair)); + break; + case DATA_TYPE_STRING: + (void) lua_pushstring(state, fnvpair_value_string(pair)); + break; + case DATA_TYPE_INT64: + (void) lua_pushinteger(state, fnvpair_value_int64(pair)); + break; + case DATA_TYPE_NVLIST: + err = zcp_nvlist_to_lua(state, + fnvpair_value_nvlist(pair), errbuf, errbuf_len); + break; + case DATA_TYPE_STRING_ARRAY: { + char **strarr; + uint_t nelem; + (void) nvpair_value_string_array(pair, &strarr, &nelem); + lua_newtable(state); + for (int i = 0; i < nelem; i++) { + (void) lua_pushinteger(state, i + 1); + (void) lua_pushstring(state, strarr[i]); + (void) lua_settable(state, -3); + } + break; + } + case DATA_TYPE_UINT64_ARRAY: { + uint64_t *intarr; + uint_t nelem; + (void) nvpair_value_uint64_array(pair, &intarr, &nelem); + lua_newtable(state); + for (int i = 0; i < nelem; i++) { + (void) lua_pushinteger(state, i + 1); + (void) lua_pushinteger(state, intarr[i]); + (void) lua_settable(state, -3); + } + break; + } + case DATA_TYPE_INT64_ARRAY: { + int64_t *intarr; + uint_t nelem; + (void) nvpair_value_int64_array(pair, &intarr, &nelem); + lua_newtable(state); + for (int i = 0; i < nelem; i++) { + (void) lua_pushinteger(state, i + 1); + (void) lua_pushinteger(state, intarr[i]); + (void) lua_settable(state, -3); + } + break; + } + default: { + if (errbuf != NULL) { + (void) snprintf(errbuf, errbuf_len, + "Unhandled nvpair type %d for key '%s'", + nvpair_type(pair), nvpair_name(pair)); + } + return (EINVAL); + } + } + return (err); +} + +int +zcp_dataset_hold_error(lua_State *state, dsl_pool_t *dp, const char *dsname, + int error) +{ + if (error == ENOENT) { + (void) zcp_argerror(state, 1, "no such dataset '%s'", dsname); + return (0); /* not reached; zcp_argerror will longjmp */ + } else if (error == EXDEV) { + (void) zcp_argerror(state, 1, + "dataset '%s' is not in the target pool '%s'", + dsname, spa_name(dp->dp_spa)); + return (0); /* not reached; zcp_argerror will longjmp */ + } else if (error == EIO) { + (void) luaL_error(state, + "I/O error while accessing dataset '%s'", dsname); + return (0); /* not reached; luaL_error will longjmp */ + } else if (error != 0) { + (void) luaL_error(state, + "unexpected error %d while 
accessing dataset '%s'", + error, dsname); + return (0); /* not reached; luaL_error will longjmp */ + } + return (0); +} + +/* + * Note: will longjmp (via lua_error()) on error. + * Assumes that the dsname is argument #1 (for error reporting purposes). + */ +dsl_dataset_t * +zcp_dataset_hold(lua_State *state, dsl_pool_t *dp, const char *dsname, + void *tag) +{ + dsl_dataset_t *ds; + int error = dsl_dataset_hold(dp, dsname, tag, &ds); + (void) zcp_dataset_hold_error(state, dp, dsname, error); + return (ds); +} + +static int zcp_debug(lua_State *); +static zcp_lib_info_t zcp_debug_info = { + .name = "debug", + .func = zcp_debug, + .pargs = { + { .za_name = "debug string", .za_lua_type = LUA_TSTRING}, + {NULL, 0} + }, + .kwargs = { + {NULL, 0} + } +}; + +static int +zcp_debug(lua_State *state) +{ + const char *dbgstring; + zcp_run_info_t *ri = zcp_run_info(state); + zcp_lib_info_t *libinfo = &zcp_debug_info; + + zcp_parse_args(state, libinfo->name, libinfo->pargs, libinfo->kwargs); + + dbgstring = lua_tostring(state, 1); + + zfs_dbgmsg("txg %lld ZCP: %s", ri->zri_tx->tx_txg, dbgstring); + + return (0); +} + +static int zcp_exists(lua_State *); +static zcp_lib_info_t zcp_exists_info = { + .name = "exists", + .func = zcp_exists, + .pargs = { + { .za_name = "dataset", .za_lua_type = LUA_TSTRING}, + {NULL, 0} + }, + .kwargs = { + {NULL, 0} + } +}; + +static int +zcp_exists(lua_State *state) +{ + zcp_run_info_t *ri = zcp_run_info(state); + dsl_pool_t *dp = ri->zri_pool; + zcp_lib_info_t *libinfo = &zcp_exists_info; + + zcp_parse_args(state, libinfo->name, libinfo->pargs, libinfo->kwargs); + + const char *dsname = lua_tostring(state, 1); + + dsl_dataset_t *ds; + int error = dsl_dataset_hold(dp, dsname, FTAG, &ds); + if (error == 0) { + dsl_dataset_rele(ds, FTAG); + lua_pushboolean(state, B_TRUE); + } else if (error == ENOENT) { + lua_pushboolean(state, B_FALSE); + } else if (error == EXDEV) { + return (luaL_error(state, "dataset '%s' is not in the " + "target pool", dsname)); + } else if (error == EIO) { + return (luaL_error(state, "I/O error opening dataset '%s'", + dsname)); + } else if (error != 0) { + return (luaL_error(state, "unexpected error %d", error)); + } + + return (1); +} + +/* + * Allocate/realloc/free a buffer for the lua interpreter. + * + * When nsize is 0, behaves as free() and returns NULL. + * + * If ptr is NULL, behaves as malloc() and returns an allocated buffer of size + * at least nsize. + * + * Otherwise, behaves as realloc(), changing the allocation from osize to nsize. + * Shrinking the buffer size never fails. + * + * The original allocated buffer size is stored as a uint64 at the beginning of + * the buffer to avoid actually reallocating when shrinking a buffer, since lua + * requires that this operation never fail. + */ +static void * +zcp_lua_alloc(void *ud, void *ptr, size_t osize, size_t nsize) +{ + zcp_alloc_arg_t *allocargs = ud; + int flags = (allocargs->aa_must_succeed) ? 
+ KM_SLEEP : (KM_NOSLEEP | KM_NORMALPRI); + + if (nsize == 0) { + if (ptr != NULL) { + int64_t *allocbuf = (int64_t *)ptr - 1; + int64_t allocsize = *allocbuf; + ASSERT3S(allocsize, >, 0); + ASSERT3S(allocargs->aa_alloc_remaining + allocsize, <=, + allocargs->aa_alloc_limit); + allocargs->aa_alloc_remaining += allocsize; + kmem_free(allocbuf, allocsize); + } + return (NULL); + } else if (ptr == NULL) { + int64_t *allocbuf; + int64_t allocsize = nsize + sizeof (int64_t); + + if (!allocargs->aa_must_succeed && + (allocsize <= 0 || + allocsize > allocargs->aa_alloc_remaining)) { + return (NULL); + } + + allocbuf = kmem_alloc(allocsize, flags); + if (allocbuf == NULL) { + return (NULL); + } + allocargs->aa_alloc_remaining -= allocsize; + + *allocbuf = allocsize; + return (allocbuf + 1); + } else if (nsize <= osize) { + /* + * If shrinking the buffer, lua requires that the reallocation + * never fail. + */ + return (ptr); + } else { + ASSERT3U(nsize, >, osize); + + uint64_t *luabuf = zcp_lua_alloc(ud, NULL, 0, nsize); + if (luabuf == NULL) { + return (NULL); + } + (void) memcpy(luabuf, ptr, osize); + VERIFY3P(zcp_lua_alloc(ud, ptr, osize, 0), ==, NULL); + return (luabuf); + } +} + +/* ARGSUSED */ +static void +zcp_lua_counthook(lua_State *state, lua_Debug *ar) +{ + /* + * If we're called, check how many instructions the channel program has + * executed so far, and compare against the limit. + */ + lua_getfield(state, LUA_REGISTRYINDEX, ZCP_RUN_INFO_KEY); + zcp_run_info_t *ri = lua_touserdata(state, -1); + + ri->zri_curinstrs += zfs_lua_check_instrlimit_interval; + if (ri->zri_maxinstrs != 0 && ri->zri_curinstrs > ri->zri_maxinstrs) { + ri->zri_timed_out = B_TRUE; + (void) lua_pushstring(state, + "Channel program timed out."); + (void) lua_error(state); + } +} + +static int +zcp_panic_cb(lua_State *state) +{ + panic("unprotected error in call to Lua API (%s)\n", + lua_tostring(state, -1)); + return (0); +} + +static void +zcp_eval_impl(dmu_tx_t *tx, boolean_t sync, zcp_eval_arg_t *evalargs) +{ + int err; + zcp_run_info_t ri; + lua_State *state = evalargs->ea_state; + + VERIFY3U(3, ==, lua_gettop(state)); + + /* + * Store the zcp_run_info_t struct for this run in the Lua registry. + * Registry entries are not directly accessible by the Lua scripts but + * can be accessed by our callbacks. + */ + ri.zri_space_used = 0; + ri.zri_pool = dmu_tx_pool(tx); + ri.zri_cred = evalargs->ea_cred; + ri.zri_tx = tx; + ri.zri_timed_out = B_FALSE; + ri.zri_sync = sync; + list_create(&ri.zri_cleanup_handlers, sizeof (zcp_cleanup_handler_t), + offsetof(zcp_cleanup_handler_t, zch_node)); + ri.zri_curinstrs = 0; + ri.zri_maxinstrs = evalargs->ea_instrlimit; + + lua_pushlightuserdata(state, &ri); + lua_setfield(state, LUA_REGISTRYINDEX, ZCP_RUN_INFO_KEY); + VERIFY3U(3, ==, lua_gettop(state)); + + /* + * Tell the Lua interpreter to call our handler every count + * instructions. Channel programs that execute too many instructions + * should die with ETIMEDOUT. + */ + (void) lua_sethook(state, zcp_lua_counthook, LUA_MASKCOUNT, + zfs_lua_check_instrlimit_interval); + + /* + * Tell the Lua memory allocator to stop using KM_SLEEP before handing + * off control to the channel program. Channel programs that use too + * much memory should die with ENOSPC. + */ + evalargs->ea_allocargs->aa_must_succeed = B_FALSE; + + /* + * Call the Lua function that open-context passed us. This pops the + * function and its input from the stack and pushes any return + * or error values. 
+ */ + err = lua_pcall(state, 1, LUA_MULTRET, 1); + + /* + * Let Lua use KM_SLEEP while we interpret the return values. + */ + evalargs->ea_allocargs->aa_must_succeed = B_TRUE; + + /* + * Remove the error handler callback from the stack. At this point, + * there shouldn't be any cleanup handler registered in the handler + * list (zri_cleanup_handlers), regardless of whether it ran or not. + */ + list_destroy(&ri.zri_cleanup_handlers); + lua_remove(state, 1); + + switch (err) { + case LUA_OK: { + /* + * Lua supports returning multiple values in a single return + * statement. Return values will have been pushed onto the + * stack: + * 1: Return value 1 + * 2: Return value 2 + * 3: etc... + * To simplify the process of retrieving a return value from a + * channel program, we disallow returning more than one value + * to ZFS from the Lua script, yielding a singleton return + * nvlist of the form { "return": Return value 1 }. + */ + int return_count = lua_gettop(state); + + if (return_count == 1) { + evalargs->ea_result = 0; + zcp_convert_return_values(state, evalargs->ea_outnvl, + ZCP_RET_RETURN, evalargs); + } else if (return_count > 1) { + evalargs->ea_result = SET_ERROR(ECHRNG); + lua_settop(state, 0); + (void) lua_pushfstring(state, "Multiple return " + "values not supported"); + zcp_convert_return_values(state, evalargs->ea_outnvl, + ZCP_RET_ERROR, evalargs); + } + break; + } + case LUA_ERRRUN: + case LUA_ERRGCMM: { + /* + * The channel program encountered a fatal error within the + * script, such as failing an assertion, or calling a function + * with incompatible arguments. The error value and the + * traceback generated by zcp_error_handler() should be on the + * stack. + */ + VERIFY3U(1, ==, lua_gettop(state)); + if (ri.zri_timed_out) { + evalargs->ea_result = SET_ERROR(ETIME); + } else { + evalargs->ea_result = SET_ERROR(ECHRNG); + } + + zcp_convert_return_values(state, evalargs->ea_outnvl, + ZCP_RET_ERROR, evalargs); + break; + } + case LUA_ERRERR: { + /* + * The channel program encountered a fatal error within the + * script, and we encountered another error while trying to + * compute the traceback in zcp_error_handler(). We can only + * return the error message. + */ + VERIFY3U(1, ==, lua_gettop(state)); + if (ri.zri_timed_out) { + evalargs->ea_result = SET_ERROR(ETIME); + } else { + evalargs->ea_result = SET_ERROR(ECHRNG); + } + + zcp_convert_return_values(state, evalargs->ea_outnvl, + ZCP_RET_ERROR, evalargs); + break; + } + case LUA_ERRMEM: + /* + * Lua ran out of memory while running the channel program. + * There's not much we can do. 
+ */ + evalargs->ea_result = SET_ERROR(ENOSPC); + break; + default: + VERIFY0(err); + } +} + +static void +zcp_pool_error(zcp_eval_arg_t *evalargs, const char *poolname) +{ + evalargs->ea_result = SET_ERROR(ECHRNG); + lua_settop(evalargs->ea_state, 0); + (void) lua_pushfstring(evalargs->ea_state, "Could not open pool: %s", + poolname); + zcp_convert_return_values(evalargs->ea_state, evalargs->ea_outnvl, + ZCP_RET_ERROR, evalargs); + +} + +static void +zcp_eval_sync(void *arg, dmu_tx_t *tx) +{ + zcp_eval_arg_t *evalargs = arg; + + /* + * Open context should have setup the stack to contain: + * 1: Error handler callback + * 2: Script to run (converted to a Lua function) + * 3: nvlist input to function (converted to Lua table or nil) + */ + VERIFY3U(3, ==, lua_gettop(evalargs->ea_state)); + + zcp_eval_impl(tx, B_TRUE, evalargs); +} + +static void +zcp_eval_open(zcp_eval_arg_t *evalargs, const char *poolname) +{ + + int error; + dsl_pool_t *dp; + dmu_tx_t *tx; + + /* + * See comment from the same assertion in zcp_eval_sync(). + */ + VERIFY3U(3, ==, lua_gettop(evalargs->ea_state)); + + error = dsl_pool_hold(poolname, FTAG, &dp); + if (error != 0) { + zcp_pool_error(evalargs, poolname); + return; + } + + /* + * As we are running in open-context, we have no transaction associated + * with the channel program. At the same time, functions from the + * zfs.check submodule need to be associated with a transaction as + * they are basically dry-runs of their counterparts in the zfs.sync + * submodule. These functions should be able to run in open-context. + * Therefore we create a new transaction that we later abort once + * the channel program has been evaluated. + */ + tx = dmu_tx_create_dd(dp->dp_mos_dir); + + zcp_eval_impl(tx, B_FALSE, evalargs); + + dmu_tx_abort(tx); + + dsl_pool_rele(dp, FTAG); +} + +int +zcp_eval(const char *poolname, const char *program, boolean_t sync, + uint64_t instrlimit, uint64_t memlimit, nvpair_t *nvarg, nvlist_t *outnvl) +{ + int err; + lua_State *state; + zcp_eval_arg_t evalargs; + + if (instrlimit > zfs_lua_max_instrlimit) + return (SET_ERROR(EINVAL)); + if (memlimit == 0 || memlimit > zfs_lua_max_memlimit) + return (SET_ERROR(EINVAL)); + + zcp_alloc_arg_t allocargs = { + .aa_must_succeed = B_TRUE, + .aa_alloc_remaining = (int64_t)memlimit, + .aa_alloc_limit = (int64_t)memlimit, + }; + + /* + * Creates a Lua state with a memory allocator that uses KM_SLEEP. + * This should never fail. + */ + state = lua_newstate(zcp_lua_alloc, &allocargs); + VERIFY(state != NULL); + (void) lua_atpanic(state, zcp_panic_cb); + + /* + * Load core Lua libraries we want access to. + */ + VERIFY3U(1, ==, luaopen_base(state)); + lua_pop(state, 1); + VERIFY3U(1, ==, luaopen_coroutine(state)); + lua_setglobal(state, LUA_COLIBNAME); + VERIFY0(lua_gettop(state)); + VERIFY3U(1, ==, luaopen_string(state)); + lua_setglobal(state, LUA_STRLIBNAME); + VERIFY0(lua_gettop(state)); + VERIFY3U(1, ==, luaopen_table(state)); + lua_setglobal(state, LUA_TABLIBNAME); + VERIFY0(lua_gettop(state)); + + /* + * Load globally visible variables such as errno aliases. + */ + zcp_load_globals(state); + VERIFY0(lua_gettop(state)); + + /* + * Load ZFS-specific modules. 
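	 *
	 * Once loaded, a script reaches these submodules through the global
	 * "zfs" table; a sketch (dataset name hypothetical):
	 *
	 *     for snap in zfs.list.snapshots("rpool/fs") do
	 *         zfs.sync.destroy{snap, defer=true}
	 *     end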
+ */ + lua_newtable(state); + VERIFY3U(1, ==, zcp_load_list_lib(state)); + lua_setfield(state, -2, "list"); + VERIFY3U(1, ==, zcp_load_synctask_lib(state, B_FALSE)); + lua_setfield(state, -2, "check"); + VERIFY3U(1, ==, zcp_load_synctask_lib(state, B_TRUE)); + lua_setfield(state, -2, "sync"); + VERIFY3U(1, ==, zcp_load_get_lib(state)); + lua_pushcclosure(state, zcp_debug_info.func, 0); + lua_setfield(state, -2, zcp_debug_info.name); + lua_pushcclosure(state, zcp_exists_info.func, 0); + lua_setfield(state, -2, zcp_exists_info.name); + lua_setglobal(state, "zfs"); + VERIFY0(lua_gettop(state)); + + /* + * Push the error-callback that calculates Lua stack traces on + * unexpected failures. + */ + lua_pushcfunction(state, zcp_error_handler); + VERIFY3U(1, ==, lua_gettop(state)); + + /* + * Load the actual script as a function onto the stack as text ("t"). + * The only valid error condition is a syntax error in the script. + * ERRMEM should not be possible because our allocator is using + * KM_SLEEP. ERRGCMM should not be possible because we have not added + * any objects with __gc metamethods to the interpreter that could + * fail. + */ + err = luaL_loadbufferx(state, program, strlen(program), + "channel program", "t"); + if (err == LUA_ERRSYNTAX) { + fnvlist_add_string(outnvl, ZCP_RET_ERROR, + lua_tostring(state, -1)); + lua_close(state); + return (SET_ERROR(EINVAL)); + } + VERIFY0(err); + VERIFY3U(2, ==, lua_gettop(state)); + + /* + * Convert the input nvlist to a Lua object and put it on top of the + * stack. + */ + char errmsg[128]; + err = zcp_nvpair_value_to_lua(state, nvarg, + errmsg, sizeof (errmsg)); + if (err != 0) { + fnvlist_add_string(outnvl, ZCP_RET_ERROR, errmsg); + lua_close(state); + return (SET_ERROR(EINVAL)); + } + VERIFY3U(3, ==, lua_gettop(state)); + + evalargs.ea_state = state; + evalargs.ea_allocargs = &allocargs; + evalargs.ea_instrlimit = instrlimit; + evalargs.ea_cred = CRED(); + evalargs.ea_outnvl = outnvl; + evalargs.ea_result = 0; + + if (sync) { + err = dsl_sync_task(poolname, NULL, + zcp_eval_sync, &evalargs, 0, ZFS_SPACE_CHECK_ZCP_EVAL); + if (err != 0) + zcp_pool_error(&evalargs, poolname); + } else { + zcp_eval_open(&evalargs, poolname); + } + lua_close(state); + + return (evalargs.ea_result); +} + +/* + * Retrieve metadata about the currently running channel program. + */ +zcp_run_info_t * +zcp_run_info(lua_State *state) +{ + zcp_run_info_t *ri; + + lua_getfield(state, LUA_REGISTRYINDEX, ZCP_RUN_INFO_KEY); + ri = lua_touserdata(state, -1); + lua_pop(state, 1); + return (ri); +} + +/* + * Argument Parsing + * ================ + * + * The Lua language allows methods to be called with any number + * of arguments of any type. When calling back into ZFS we need to sanitize + * arguments from channel programs to make sure unexpected arguments or + * arguments of the wrong type result in clear error messages. To do this + * in a uniform way all callbacks from channel programs should use the + * zcp_parse_args() function to interpret inputs. + * + * Positional vs Keyword Arguments + * =============================== + * + * Every callback function takes a fixed set of required positional arguments + * and optional keyword arguments. For example, the destroy function takes + * a single positional string argument (the name of the dataset to destroy) + * and an optional "defer" keyword boolean argument. 
When calling lua functions
+ * with parentheses, only positional arguments can be used:
+ *
+ *     zfs.sync.snapshot("rpool@snap")
+ *
+ * To use keyword arguments, functions should be called with a single argument
+ * that is a lua table containing mappings of integer -> positional arguments
+ * and string -> keyword arguments:
+ *
+ *     zfs.sync.snapshot({[1]="rpool@snap", defer=true})
+ *
+ * The lua language allows curly braces to be used in place of parentheses as
+ * syntactic sugar for this calling convention:
+ *
+ *     zfs.sync.snapshot{"rpool@snap", defer=true}
+ */
+
+/*
+ * Throw an error and print the given arguments. If there are too many
+ * arguments to fit in the output buffer, only the error format string is
+ * output.
+ */
+static void
+zcp_args_error(lua_State *state, const char *fname, const zcp_arg_t *pargs,
+    const zcp_arg_t *kwargs, const char *fmt, ...)
+{
+	int i;
+	char errmsg[512];
+	size_t len = sizeof (errmsg);
+	size_t msglen = 0;
+	va_list argp;
+
+	va_start(argp, fmt);
+	VERIFY3U(len, >, vsnprintf(errmsg, len, fmt, argp));
+	va_end(argp);
+
+	/*
+	 * Calculate the total length of the final string, including extra
+	 * formatting characters. If the argument dump would be too large,
+	 * only print the error string.
+	 */
+	msglen = strlen(errmsg);
+	msglen += strlen(fname) + 4; /* : + {} + null terminator */
+	for (i = 0; pargs[i].za_name != NULL; i++) {
+		msglen += strlen(pargs[i].za_name);
+		msglen += strlen(lua_typename(state, pargs[i].za_lua_type));
+		if (pargs[i + 1].za_name != NULL || kwargs[0].za_name != NULL)
+			msglen += 5; /* < + ( + )> + , */
+		else
+			msglen += 4; /* < + ( + )> */
+	}
+	for (i = 0; kwargs[i].za_name != NULL; i++) {
+		msglen += strlen(kwargs[i].za_name);
+		msglen += strlen(lua_typename(state, kwargs[i].za_lua_type));
+		if (kwargs[i + 1].za_name != NULL)
+			msglen += 4; /* =( + ) + , */
+		else
+			msglen += 3; /* =( + ) */
+	}
+
+	if (msglen >= len)
+		(void) luaL_error(state, errmsg);
+
+	VERIFY3U(len, >, strlcat(errmsg, ": ", len));
+	VERIFY3U(len, >, strlcat(errmsg, fname, len));
+	VERIFY3U(len, >, strlcat(errmsg, "{", len));
+	for (i = 0; pargs[i].za_name != NULL; i++) {
+		VERIFY3U(len, >, strlcat(errmsg, "<", len));
+		VERIFY3U(len, >, strlcat(errmsg, pargs[i].za_name, len));
+		VERIFY3U(len, >, strlcat(errmsg, "(", len));
+		VERIFY3U(len, >, strlcat(errmsg,
+		    lua_typename(state, pargs[i].za_lua_type), len));
+		VERIFY3U(len, >, strlcat(errmsg, ")>", len));
+		if (pargs[i + 1].za_name != NULL || kwargs[0].za_name != NULL) {
+			VERIFY3U(len, >, strlcat(errmsg, ", ", len));
+		}
+	}
+	for (i = 0; kwargs[i].za_name != NULL; i++) {
+		VERIFY3U(len, >, strlcat(errmsg, kwargs[i].za_name, len));
+		VERIFY3U(len, >, strlcat(errmsg, "=(", len));
+		VERIFY3U(len, >, strlcat(errmsg,
+		    lua_typename(state, kwargs[i].za_lua_type), len));
+		VERIFY3U(len, >, strlcat(errmsg, ")", len));
+		if (kwargs[i + 1].za_name != NULL) {
+			VERIFY3U(len, >, strlcat(errmsg, ", ", len));
+		}
+	}
+	VERIFY3U(len, >, strlcat(errmsg, "}", len));
+
+	(void) luaL_error(state, errmsg);
+	panic("unreachable code");
+}
+
+static void
+zcp_parse_table_args(lua_State *state, const char *fname,
+    const zcp_arg_t *pargs, const zcp_arg_t *kwargs)
+{
+	int i;
+	int type;
+
+	for (i = 0; pargs[i].za_name != NULL; i++) {
+		/*
+		 * Check the table for this positional argument, leaving it
+		 * on the top of the stack once we finish validating it.
+ */ + lua_pushinteger(state, i + 1); + lua_gettable(state, 1); + + type = lua_type(state, -1); + if (type == LUA_TNIL) { + zcp_args_error(state, fname, pargs, kwargs, + "too few arguments"); + panic("unreachable code"); + } else if (type != pargs[i].za_lua_type) { + zcp_args_error(state, fname, pargs, kwargs, + "arg %d wrong type (is '%s', expected '%s')", + i + 1, lua_typename(state, type), + lua_typename(state, pargs[i].za_lua_type)); + panic("unreachable code"); + } + + /* + * Remove the positional argument from the table. + */ + lua_pushinteger(state, i + 1); + lua_pushnil(state); + lua_settable(state, 1); + } + + for (i = 0; kwargs[i].za_name != NULL; i++) { + /* + * Check the table for this keyword argument, which may be + * nil if it was omitted. Leave the value on the top of + * the stack after validating it. + */ + lua_getfield(state, 1, kwargs[i].za_name); + + type = lua_type(state, -1); + if (type != LUA_TNIL && type != kwargs[i].za_lua_type) { + zcp_args_error(state, fname, pargs, kwargs, + "kwarg '%s' wrong type (is '%s', expected '%s')", + kwargs[i].za_name, lua_typename(state, type), + lua_typename(state, kwargs[i].za_lua_type)); + panic("unreachable code"); + } + + /* + * Remove the keyword argument from the table. + */ + lua_pushnil(state); + lua_setfield(state, 1, kwargs[i].za_name); + } + + /* + * Any entries remaining in the table are invalid inputs, print + * an error message based on what the entry is. + */ + lua_pushnil(state); + if (lua_next(state, 1)) { + if (lua_isnumber(state, -2) && lua_tointeger(state, -2) > 0) { + zcp_args_error(state, fname, pargs, kwargs, + "too many positional arguments"); + } else if (lua_isstring(state, -2)) { + zcp_args_error(state, fname, pargs, kwargs, + "invalid kwarg '%s'", lua_tostring(state, -2)); + } else { + zcp_args_error(state, fname, pargs, kwargs, + "kwarg keys must be strings"); + } + panic("unreachable code"); + } + + lua_remove(state, 1); +} + +static void +zcp_parse_pos_args(lua_State *state, const char *fname, const zcp_arg_t *pargs, + const zcp_arg_t *kwargs) +{ + int i; + int type; + + for (i = 0; pargs[i].za_name != NULL; i++) { + type = lua_type(state, i + 1); + if (type == LUA_TNONE) { + zcp_args_error(state, fname, pargs, kwargs, + "too few arguments"); + panic("unreachable code"); + } else if (type != pargs[i].za_lua_type) { + zcp_args_error(state, fname, pargs, kwargs, + "arg %d wrong type (is '%s', expected '%s')", + i + 1, lua_typename(state, type), + lua_typename(state, pargs[i].za_lua_type)); + panic("unreachable code"); + } + } + if (lua_gettop(state) != i) { + zcp_args_error(state, fname, pargs, kwargs, + "too many positional arguments"); + panic("unreachable code"); + } + + for (i = 0; kwargs[i].za_name != NULL; i++) { + lua_pushnil(state); + } +} + +/* + * Checks the current Lua stack against an expected set of positional and + * keyword arguments. If the stack does not match the expected arguments + * aborts the current channel program with a useful error message, otherwise + * it re-arranges the stack so that it contains the positional arguments + * followed by the keyword argument values in declaration order. Any missing + * keyword argument will be represented by a nil value on the stack. + * + * If the stack contains exactly one argument of type LUA_TTABLE the curly + * braces calling convention is assumed, otherwise the stack is parsed for + * positional arguments only. + * + * This function should be used by every function callback. 
It should be called + * before the callback manipulates the Lua stack as it assumes the stack + * represents the function arguments. + */ +void +zcp_parse_args(lua_State *state, const char *fname, const zcp_arg_t *pargs, + const zcp_arg_t *kwargs) +{ + if (lua_gettop(state) == 1 && lua_istable(state, 1)) { + zcp_parse_table_args(state, fname, pargs, kwargs); + } else { + zcp_parse_pos_args(state, fname, pargs, kwargs); + } +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_get.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_get.c new file mode 100644 index 000000000000..964b5f1037ac --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_get.c @@ -0,0 +1,860 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2016 by Delphix. All rights reserved. + */ + +#include "lua.h" +#include "lualib.h" +#include "lauxlib.h" + +#include <zfs_prop.h> + +#include <sys/dsl_prop.h> +#include <sys/dsl_synctask.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_dir.h> +#include <sys/dmu_objset.h> +#include <sys/mntent.h> +#include <sys/sunddi.h> +#include <sys/zap.h> +#include <sys/zcp.h> +#include <sys/zcp_iter.h> +#include <sys/zcp_global.h> +#include <sys/zfs_ioctl.h> +#include <sys/zfs_znode.h> +#include <sys/zvol.h> + +#ifdef _KERNEL +#include <sys/zfs_vfsops.h> +#endif + +static int +get_objset_type(dsl_dataset_t *ds, zfs_type_t *type) +{ + int error; + objset_t *os; + error = dmu_objset_from_ds(ds, &os); + if (error != 0) + return (error); + if (ds->ds_is_snapshot) { + *type = ZFS_TYPE_SNAPSHOT; + } else { + switch (os->os_phys->os_type) { + case DMU_OST_ZFS: + *type = ZFS_TYPE_FILESYSTEM; + break; + case DMU_OST_ZVOL: + *type = ZFS_TYPE_VOLUME; + break; + default: + return (EINVAL); + } + } + return (0); +} + +/* + * Returns the string name of ds's type in str (a buffer which should be + * at least 12 bytes long). + */ +static int +get_objset_type_name(dsl_dataset_t *ds, char *str) +{ + int error; + zfs_type_t type; + error = get_objset_type(ds, &type); + if (error != 0) + return (error); + switch (type) { + case ZFS_TYPE_SNAPSHOT: + (void) strcpy(str, "snapshot"); + break; + case ZFS_TYPE_FILESYSTEM: + (void) strcpy(str, "filesystem"); + break; + case ZFS_TYPE_VOLUME: + (void) strcpy(str, "volume"); + break; + default: + return (EINVAL); + } + return (0); +} + +/* + * Determines the source of a property given its setpoint and + * property type. It pushes the source to the lua stack. + */ +static void +get_prop_src(lua_State *state, const char *setpoint, zfs_prop_t prop) +{ + if (zfs_prop_readonly(prop) || (prop == ZFS_PROP_VERSION)) { + lua_pushnil(state); + } else { + const char *src; + if (strcmp("", setpoint) == 0) { + src = "default"; + } else { + src = setpoint; + } + (void) lua_pushstring(state, src); + } +} + +/* + * Given an error encountered while getting properties, either longjmp's for + * a fatal error or pushes nothing to the stack for a non fatal one. 
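+ *
+ * Because ENOENT pushes nothing, scripts can probe for properties that may
+ * be absent; a sketch (names hypothetical):
+ *
+ *     local val = zfs.get_prop("rpool/fs", "userused@1001")
+ *     if val == nil then
+ *         zfs.debug("property not present")
+ *     end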
+ */
+static int
+zcp_handle_error(lua_State *state, const char *dataset_name,
+    const char *property_name, int error)
+{
+	ASSERT3S(error, !=, 0);
+	if (error == ENOENT) {
+		return (0);
+	} else if (error == EINVAL) {
+		return (luaL_error(state,
+		    "property '%s' is not a valid property on dataset '%s'",
+		    property_name, dataset_name));
+	} else if (error == EIO) {
+		return (luaL_error(state,
+		    "I/O error while retrieving property '%s' on dataset '%s'",
+		    property_name, dataset_name));
+	} else {
+		return (luaL_error(state, "unexpected error %d while "
+		    "retrieving property '%s' on dataset '%s'",
+		    error, property_name, dataset_name));
+	}
+}
+
+/*
+ * Look up a user defined property in the zap object. If it exists, push it
+ * and the setpoint onto the stack, otherwise don't push anything.
+ */
+static int
+zcp_get_user_prop(lua_State *state, dsl_pool_t *dp, const char *dataset_name,
+    const char *property_name)
+{
+	int error;
+	char *buf;
+	char setpoint[ZFS_MAX_DATASET_NAME_LEN];
+	/*
+	 * zcp_dataset_hold will either successfully return the requested
+	 * dataset or throw a lua error and longjmp out of the zfs.get_prop call
+	 * without returning.
+	 */
+	dsl_dataset_t *ds = zcp_dataset_hold(state, dp, dataset_name, FTAG);
+	if (ds == NULL)
+		return (1); /* not reached; zcp_dataset_hold() longjmp'd */
+
+	buf = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP);
+	error = dsl_prop_get_ds(ds, property_name, 1, ZAP_MAXVALUELEN,
+	    buf, setpoint);
+	dsl_dataset_rele(ds, FTAG);
+
+	if (error != 0) {
+		kmem_free(buf, ZAP_MAXVALUELEN);
+		return (zcp_handle_error(state, dataset_name, property_name,
+		    error));
+	}
+	(void) lua_pushstring(state, buf);
+	(void) lua_pushstring(state, setpoint);
+	kmem_free(buf, ZAP_MAXVALUELEN);
+	return (2);
+}
+
+/*
+ * Check if the property we're looking for is stored in the ds_dir. If so,
+ * return it in the 'val' argument. Return 0 on success, or ENOENT if
+ * the property is not present.
+ */
+static int
+get_dsl_dir_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop,
+    uint64_t *val)
+{
+	dsl_dir_t *dd = ds->ds_dir;
+	mutex_enter(&dd->dd_lock);
+	switch (zfs_prop) {
+	case ZFS_PROP_USEDSNAP:
+		*val = dsl_dir_get_usedsnap(dd);
+		break;
+	case ZFS_PROP_USEDCHILD:
+		*val = dsl_dir_get_usedchild(dd);
+		break;
+	case ZFS_PROP_USEDDS:
+		*val = dsl_dir_get_usedds(dd);
+		break;
+	case ZFS_PROP_USEDREFRESERV:
+		*val = dsl_dir_get_usedrefreserv(dd);
+		break;
+	case ZFS_PROP_LOGICALUSED:
+		*val = dsl_dir_get_logicalused(dd);
+		break;
+	default:
+		mutex_exit(&dd->dd_lock);
+		return (ENOENT);
+	}
+	mutex_exit(&dd->dd_lock);
+	return (0);
+}
+
+/*
+ * Takes a dataset, a property, a value and that value's setpoint as
+ * found in the ZAP. Checks if the property has been changed in the vfs.
+ * If so, val and setpoint will be overwritten with updated content.
+ * Otherwise, they are left unchanged.
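+ *
+ * For example, with a (hypothetical) filesystem temporarily remounted
+ * read-only, a script might observe:
+ *
+ *     local val, src = zfs.get_prop("rpool/fs", "readonly")
+ *     -- val == "on", src == "temporary"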
+ */ +static int +get_temporary_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, uint64_t *val, + char *setpoint) +{ +#ifndef _KERNEL + return (0); +#else + int error; +#ifdef illumos + zfsvfs_t *zfvp; +#endif + vfs_t *vfsp; + objset_t *os; + uint64_t tmp = *val; + + error = dmu_objset_from_ds(ds, &os); + if (error != 0) + return (error); + + error = getzfsvfs_impl(os, &vfsp); + if (error != 0) + return (error); +#ifdef illumos + vfsp = zfvp->z_vfs; +#endif + switch (zfs_prop) { + case ZFS_PROP_ATIME: + if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) + tmp = 0; + if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) + tmp = 1; + break; + case ZFS_PROP_DEVICES: + if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) + tmp = 0; + if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL)) + tmp = 1; + break; + case ZFS_PROP_EXEC: + if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) + tmp = 0; + if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) + tmp = 1; + break; + case ZFS_PROP_SETUID: + if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) + tmp = 0; + if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) + tmp = 1; + break; + case ZFS_PROP_READONLY: + if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) + tmp = 0; + if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) + tmp = 1; + break; + case ZFS_PROP_XATTR: + if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) + tmp = 0; + if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) + tmp = 1; + break; + case ZFS_PROP_NBMAND: + if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) + tmp = 0; + if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) + tmp = 1; + break; + default: +#ifdef illumos + VFS_RELE(vfsp); +#else + vfs_rel(vfsp); +#endif + return (ENOENT); + } + +#ifdef illumos + VFS_RELE(vfsp); +#else + vfs_rel(vfsp); +#endif + if (tmp != *val) { + (void) strcpy(setpoint, "temporary"); + *val = tmp; + } + return (0); +#endif +} + +/* + * Check if the property we're looking for is stored at the dsl_dataset or + * dsl_dir level. If so, push the property value and source onto the lua stack + * and return 0. If it is not present or a failure occurs in lookup, return a + * non-zero error value. 
+ */ +static int +get_special_prop(lua_State *state, dsl_dataset_t *ds, const char *dsname, + zfs_prop_t zfs_prop) +{ + int error = 0; + objset_t *os; + uint64_t numval; + char *strval = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP); + char setpoint[ZFS_MAX_DATASET_NAME_LEN] = + "Internal error - setpoint not determined"; + zfs_type_t ds_type; + zprop_type_t prop_type = zfs_prop_get_type(zfs_prop); + (void) get_objset_type(ds, &ds_type); + + switch (zfs_prop) { + case ZFS_PROP_REFRATIO: + numval = dsl_get_refratio(ds); + break; + case ZFS_PROP_USED: + numval = dsl_get_used(ds); + break; + case ZFS_PROP_CLONES: { + nvlist_t *clones = fnvlist_alloc(); + error = get_clones_stat_impl(ds, clones); + if (error == 0) { + /* push list to lua stack */ + VERIFY0(zcp_nvlist_to_lua(state, clones, NULL, 0)); + /* source */ + (void) lua_pushnil(state); + } + nvlist_free(clones); + kmem_free(strval, ZAP_MAXVALUELEN); + return (error); + } + case ZFS_PROP_COMPRESSRATIO: + numval = dsl_get_compressratio(ds); + break; + case ZFS_PROP_CREATION: + numval = dsl_get_creation(ds); + break; + case ZFS_PROP_REFERENCED: + numval = dsl_get_referenced(ds); + break; + case ZFS_PROP_AVAILABLE: + numval = dsl_get_available(ds); + break; + case ZFS_PROP_LOGICALREFERENCED: + numval = dsl_get_logicalreferenced(ds); + break; + case ZFS_PROP_CREATETXG: + numval = dsl_get_creationtxg(ds); + break; + case ZFS_PROP_GUID: + numval = dsl_get_guid(ds); + break; + case ZFS_PROP_UNIQUE: + numval = dsl_get_unique(ds); + break; + case ZFS_PROP_OBJSETID: + numval = dsl_get_objsetid(ds); + break; + case ZFS_PROP_ORIGIN: + dsl_dir_get_origin(ds->ds_dir, strval); + break; + case ZFS_PROP_USERACCOUNTING: + error = dmu_objset_from_ds(ds, &os); + if (error == 0) + numval = dmu_objset_userspace_present(os); + break; + case ZFS_PROP_WRITTEN: + error = dsl_get_written(ds, &numval); + break; + case ZFS_PROP_TYPE: + error = get_objset_type_name(ds, strval); + break; + case ZFS_PROP_PREV_SNAP: + error = dsl_get_prev_snap(ds, strval); + break; + case ZFS_PROP_NAME: + dsl_dataset_name(ds, strval); + break; + case ZFS_PROP_MOUNTPOINT: + error = dsl_get_mountpoint(ds, dsname, strval, setpoint); + break; + case ZFS_PROP_VERSION: + /* should be a snapshot or filesystem */ + ASSERT(ds_type != ZFS_TYPE_VOLUME); + error = dmu_objset_from_ds(ds, &os); + /* look in the master node for the version */ + if (error == 0) { + error = zap_lookup(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, + sizeof (numval), 1, &numval); + } + break; + case ZFS_PROP_DEFER_DESTROY: + numval = dsl_get_defer_destroy(ds); + break; + case ZFS_PROP_USERREFS: + numval = dsl_get_userrefs(ds); + break; + case ZFS_PROP_FILESYSTEM_COUNT: + error = dsl_dir_get_filesystem_count(ds->ds_dir, &numval); + (void) strcpy(setpoint, ""); + break; + case ZFS_PROP_SNAPSHOT_COUNT: + error = dsl_dir_get_snapshot_count(ds->ds_dir, &numval); + (void) strcpy(setpoint, ""); + break; + case ZFS_PROP_REMAPTXG: + error = dsl_dir_get_remaptxg(ds->ds_dir, &numval); + break; + case ZFS_PROP_NUMCLONES: + numval = dsl_get_numclones(ds); + break; + case ZFS_PROP_INCONSISTENT: + numval = dsl_get_inconsistent(ds); + break; + case ZFS_PROP_RECEIVE_RESUME_TOKEN: + VERIFY3U(strlcpy(strval, get_receive_resume_stats_impl(ds), + ZAP_MAXVALUELEN), <, ZAP_MAXVALUELEN); + if (strcmp(strval, "") == 0) { + VERIFY3U(strlcpy(strval, get_child_receive_stats(ds), + ZAP_MAXVALUELEN), <, ZAP_MAXVALUELEN); + if (strcmp(strval, "") == 0) + error = ENOENT; + } + break; + case ZFS_PROP_VOLSIZE: + ASSERT(ds_type == ZFS_TYPE_VOLUME); + error = 
dmu_objset_from_ds(ds, &os);
+		if (error == 0) {
+			error = zap_lookup(os, ZVOL_ZAP_OBJ, "size",
+			    sizeof (numval), 1, &numval);
+		}
+		if (error == 0)
+			(void) strcpy(setpoint, dsname);
+
+		break;
+	case ZFS_PROP_VOLBLOCKSIZE: {
+		ASSERT(ds_type == ZFS_TYPE_VOLUME);
+		dmu_object_info_t doi;
+		error = dmu_objset_from_ds(ds, &os);
+		if (error == 0) {
+			error = dmu_object_info(os, ZVOL_OBJ, &doi);
+			if (error == 0)
+				numval = doi.doi_data_block_size;
+		}
+		break;
+	}
+	default:
+		/* Did not match these props, check in the dsl_dir */
+		error = get_dsl_dir_prop(ds, zfs_prop, &numval);
+	}
+	if (error != 0) {
+		kmem_free(strval, ZAP_MAXVALUELEN);
+		return (error);
+	}
+
+	switch (prop_type) {
+	case PROP_TYPE_NUMBER: {
+		(void) lua_pushnumber(state, numval);
+		break;
+	}
+	case PROP_TYPE_STRING: {
+		(void) lua_pushstring(state, strval);
+		break;
+	}
+	case PROP_TYPE_INDEX: {
+		const char *propval;
+		error = zfs_prop_index_to_string(zfs_prop, numval, &propval);
+		if (error != 0) {
+			kmem_free(strval, ZAP_MAXVALUELEN);
+			return (error);
+		}
+		(void) lua_pushstring(state, propval);
+		break;
+	}
+	}
+	kmem_free(strval, ZAP_MAXVALUELEN);
+
+	/* Push the source to the stack */
+	get_prop_src(state, setpoint, zfs_prop);
+	return (0);
+}
+
+/*
+ * Look up a property and its source in the zap object. If the value is
+ * present and successfully retrieved, push the value and source on the
+ * lua stack and return 0. On failure, return a non-zero error value.
+ */
+static int
+get_zap_prop(lua_State *state, dsl_dataset_t *ds, zfs_prop_t zfs_prop)
+{
+	int error = 0;
+	char setpoint[ZFS_MAX_DATASET_NAME_LEN];
+	char *strval = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP);
+	uint64_t numval;
+	const char *prop_name = zfs_prop_to_name(zfs_prop);
+	zprop_type_t prop_type = zfs_prop_get_type(zfs_prop);
+
+	if (prop_type == PROP_TYPE_STRING) {
+		/* Push value to lua stack */
+		error = dsl_prop_get_ds(ds, prop_name, 1,
+		    ZAP_MAXVALUELEN, strval, setpoint);
+		if (error == 0)
+			(void) lua_pushstring(state, strval);
+	} else {
+		error = dsl_prop_get_ds(ds, prop_name, sizeof (numval),
+		    1, &numval, setpoint);
+
+		/* Fill in temporary value for prop, if applicable */
+		(void) get_temporary_prop(ds, zfs_prop, &numval, setpoint);
+
+		/* Push value to lua stack */
+		if (prop_type == PROP_TYPE_INDEX) {
+			const char *propval;
+			error = zfs_prop_index_to_string(zfs_prop, numval,
+			    &propval);
+			if (error == 0)
+				(void) lua_pushstring(state, propval);
+		} else {
+			if (error == 0)
+				(void) lua_pushnumber(state, numval);
+		}
+	}
+	kmem_free(strval, ZAP_MAXVALUELEN);
+	if (error == 0)
+		get_prop_src(state, setpoint, zfs_prop);
+	return (error);
+}
+
+/*
+ * Determine whether property is valid for a given dataset
+ */
+boolean_t
+prop_valid_for_ds(dsl_dataset_t *ds, zfs_prop_t zfs_prop)
+{
+	int error;
+	zfs_type_t zfs_type;
+
+	/* properties not supported */
+	if ((zfs_prop == ZFS_PROP_ISCSIOPTIONS) ||
+	    (zfs_prop == ZFS_PROP_MOUNTED))
+		return (B_FALSE);
+
+	/* if we want the origin prop, ds must be a clone */
+	if ((zfs_prop == ZFS_PROP_ORIGIN) && (!dsl_dir_is_clone(ds->ds_dir)))
+		return (B_FALSE);
+
+	error = get_objset_type(ds, &zfs_type);
+	if (error != 0)
+		return (B_FALSE);
+	return (zfs_prop_valid_for_type(zfs_prop, zfs_type));
+}
+
+/*
+ * Look up a given dataset property. On success return 2, the number of
+ * values pushed to the lua stack (property value and source). On a fatal
+ * error, longjmp. On a non-fatal error push nothing.
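+ *
+ * A sketch of the caller-visible behavior (names hypothetical):
+ *
+ *     local val, src = zfs.get_prop("rpool/fs", "compression")
+ *     -- e.g. val == "lz4"; src is the setpoint dataset, or "default"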
+ */ +static int +zcp_get_system_prop(lua_State *state, dsl_pool_t *dp, const char *dataset_name, + zfs_prop_t zfs_prop) +{ + int error; + /* + * zcp_dataset_hold will either successfully return the requested + * dataset or throw a lua error and longjmp out of the zfs.get_prop call + * without returning. + */ + dsl_dataset_t *ds = zcp_dataset_hold(state, dp, dataset_name, FTAG); + if (ds == NULL) + return (1); /* not reached; zcp_dataset_hold() longjmp'd */ + + /* Check that the property is valid for the given dataset */ + const char *prop_name = zfs_prop_to_name(zfs_prop); + if (!prop_valid_for_ds(ds, zfs_prop)) { + dsl_dataset_rele(ds, FTAG); + return (0); + } + + /* Check if the property can be accessed directly */ + error = get_special_prop(state, ds, dataset_name, zfs_prop); + if (error == 0) { + dsl_dataset_rele(ds, FTAG); + /* The value and source have been pushed by get_special_prop */ + return (2); + } + if (error != ENOENT) { + dsl_dataset_rele(ds, FTAG); + return (zcp_handle_error(state, dataset_name, + prop_name, error)); + } + + /* If we were unable to find it, look in the zap object */ + error = get_zap_prop(state, ds, zfs_prop); + dsl_dataset_rele(ds, FTAG); + if (error != 0) { + return (zcp_handle_error(state, dataset_name, + prop_name, error)); + } + /* The value and source have been pushed by get_zap_prop */ + return (2); +} + +static zfs_userquota_prop_t +get_userquota_prop(const char *prop_name) +{ + zfs_userquota_prop_t type; + /* Figure out the property type ({user|group}{quota|used}) */ + for (type = 0; type < ZFS_NUM_USERQUOTA_PROPS; type++) { + if (strncmp(prop_name, zfs_userquota_prop_prefixes[type], + strlen(zfs_userquota_prop_prefixes[type])) == 0) + break; + } + return (type); +} + +#ifdef _KERNEL +/* + * Given the name of a zfs_userquota_prop, this function determines the + * prop type as well as the numeric group/user ids based on the string + * following the '@' in the property name. On success, returns 0. On failure, + * returns a non-zero error. + * 'domain' must be freed by the caller using strfree(). + */ +static int +parse_userquota_prop(const char *prop_name, zfs_userquota_prop_t *type, + char **domain, uint64_t *rid) +{ + char *cp, *end, *domain_val; + + *type = get_userquota_prop(prop_name); + if (*type >= ZFS_NUM_USERQUOTA_PROPS) + return (EINVAL); + + *rid = 0; + cp = strchr(prop_name, '@') + 1; + if (strncmp(cp, "S-1-", 4) == 0) { + /* + * It's a numeric SID (e.g. "S-1-234-567-89") and we want to + * separate the domain id and the rid + */ + int domain_len = strrchr(cp, '-') - cp; + domain_val = kmem_alloc(domain_len + 1, KM_SLEEP); + (void) strncpy(domain_val, cp, domain_len); + domain_val[domain_len] = '\0'; + cp += domain_len + 1; + + (void) ddi_strtoll(cp, &end, 10, (longlong_t *)rid); + if (*end != '\0') { + strfree(domain_val); + return (EINVAL); + } + } else { + /* It's only a user/group ID (e.g. "12345"), just get the rid */ + domain_val = NULL; + (void) ddi_strtoll(cp, &end, 10, (longlong_t *)rid); + if (*end != '\0') + return (EINVAL); + } + *domain = domain_val; + return (0); +} + +/* + * Look up {user|group}{quota|used} property for given dataset. On success + * push the value (quota or used amount) and the setpoint. On failure, push + * a lua error.
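+ * + * For example (the dataset name and numeric id are illustrative): + * + *     used, src = zfs.get_prop("rpool/fs", "userused@123")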
+ */ +static int +zcp_get_userquota_prop(lua_State *state, dsl_pool_t *dp, + const char *dataset_name, const char *prop_name) +{ + zfsvfs_t *zfvp; + zfsvfs_t *zfsvfs; + int error; + zfs_userquota_prop_t type; + char *domain; + uint64_t rid, value; + objset_t *os; + + dsl_dataset_t *ds = zcp_dataset_hold(state, dp, dataset_name, FTAG); + if (ds == NULL) + return (1); /* not reached; zcp_dataset_hold() longjmp'd */ + + error = parse_userquota_prop(prop_name, &type, &domain, &rid); + if (error == 0) { + error = dmu_objset_from_ds(ds, &os); + if (error == 0) { + zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); + error = zfsvfs_create_impl(&zfvp, zfsvfs, os); + if (error == 0) { + error = zfs_userspace_one(zfvp, type, domain, + rid, &value); + zfsvfs_free(zfvp); + } + } + if (domain != NULL) + strfree(domain); + } + dsl_dataset_rele(ds, FTAG); + + /* + * 'value' is only valid if the lookup above succeeded; a quota of + * zero means no quota is set. + */ + if ((error == 0) && (value == 0) && ((type == ZFS_PROP_USERQUOTA) || + (type == ZFS_PROP_GROUPQUOTA))) + error = ENOENT; + if (error != 0) { + return (zcp_handle_error(state, dataset_name, + prop_name, error)); + } + + (void) lua_pushnumber(state, value); + (void) lua_pushstring(state, dataset_name); + return (2); +} +#endif + +/* + * Determines the name of the snapshot referenced in the written property + * name. Returns the snapshot name in snap_name, a buffer that must be at + * least as large as ZFS_MAX_DATASET_NAME_LEN. + */ +static void +parse_written_prop(const char *dataset_name, const char *prop_name, + char *snap_name) +{ + ASSERT(zfs_prop_written(prop_name)); + const char *name = prop_name + ZFS_WRITTEN_PROP_PREFIX_LEN; + if (strchr(name, '@') == NULL) { + (void) sprintf(snap_name, "%s@%s", dataset_name, name); + } else { + (void) strcpy(snap_name, name); + } +} + +/* + * Look up written@ property for given dataset. On success + * push the value and the setpoint. If error is fatal, we will + * longjmp, otherwise push nothing.
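+ * + * For example, for a hypothetical snapshot "rpool/fs@monday": + * + *     written, src = zfs.get_prop("rpool/fs", "written@monday")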
+ */ +static int +zcp_get_written_prop(lua_State *state, dsl_pool_t *dp, + const char *dataset_name, const char *prop_name) +{ + char snap_name[ZFS_MAX_DATASET_NAME_LEN]; + uint64_t used, comp, uncomp; + dsl_dataset_t *old; + int error = 0; + + parse_written_prop(dataset_name, prop_name, snap_name); + dsl_dataset_t *new = zcp_dataset_hold(state, dp, dataset_name, FTAG); + if (new == NULL) + return (1); /* not reached; zcp_dataset_hold() longjmp'd */ + + error = dsl_dataset_hold(dp, snap_name, FTAG, &old); + if (error != 0) { + dsl_dataset_rele(new, FTAG); + return (zcp_dataset_hold_error(state, dp, snap_name, + error)); + } + error = dsl_dataset_space_written(old, new, + &used, &comp, &uncomp); + + dsl_dataset_rele(old, FTAG); + dsl_dataset_rele(new, FTAG); + + if (error != 0) { + return (zcp_handle_error(state, dataset_name, + snap_name, error)); + } + (void) lua_pushnumber(state, used); + (void) lua_pushstring(state, dataset_name); + return (2); +} + +static int zcp_get_prop(lua_State *state); +static zcp_lib_info_t zcp_get_prop_info = { + .name = "get_prop", + .func = zcp_get_prop, + .pargs = { + { .za_name = "dataset", .za_lua_type = LUA_TSTRING}, + { .za_name = "property", .za_lua_type = LUA_TSTRING}, + {NULL, 0} + }, + .kwargs = { + {NULL, 0} + } +}; + +static int +zcp_get_prop(lua_State *state) +{ + const char *dataset_name; + const char *property_name; + dsl_pool_t *dp = zcp_run_info(state)->zri_pool; + zcp_lib_info_t *libinfo = &zcp_get_prop_info; + + zcp_parse_args(state, libinfo->name, libinfo->pargs, libinfo->kwargs); + + dataset_name = lua_tostring(state, 1); + property_name = lua_tostring(state, 2); + + /* User defined property */ + if (zfs_prop_user(property_name)) { + return (zcp_get_user_prop(state, dp, + dataset_name, property_name)); + } + /* userspace property */ + if (zfs_prop_userquota(property_name)) { +#ifdef _KERNEL + return (zcp_get_userquota_prop(state, dp, + dataset_name, property_name)); +#else + return (luaL_error(state, + "user quota properties only supported in kernel mode", + property_name)); +#endif + } + /* written@ property */ + if (zfs_prop_written(property_name)) { + return (zcp_get_written_prop(state, dp, + dataset_name, property_name)); + } + + zfs_prop_t zfs_prop = zfs_name_to_prop(property_name); + /* Valid system property */ + if (zfs_prop != ZPROP_INVAL) { + return (zcp_get_system_prop(state, dp, dataset_name, + zfs_prop)); + } + + /* Invalid property name */ + return (luaL_error(state, + "'%s' is not a valid property", property_name)); +} + +int +zcp_load_get_lib(lua_State *state) +{ + lua_pushcclosure(state, zcp_get_prop_info.func, 0); + lua_setfield(state, -2, zcp_get_prop_info.name); + + return (1); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_global.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_global.c new file mode 100644 index 000000000000..c25431fd6703 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_global.c @@ -0,0 +1,89 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2016, 2017 by Delphix. All rights reserved. 
+ */ + +#include <sys/zcp_global.h> + +#include "lua.h" +#include "lauxlib.h" + +typedef struct zcp_errno_global { + const char *zeg_name; + int zeg_errno; +} zcp_errno_global_t; + +static const zcp_errno_global_t errno_globals[] = { + {"EPERM", EPERM}, + {"ENOENT", ENOENT}, + {"ESRCH", ESRCH}, + {"EINTR", EINTR}, + {"EIO", EIO}, + {"ENXIO", ENXIO}, + {"E2BIG", E2BIG}, + {"ENOEXEC", ENOEXEC}, + {"EBADF", EBADF}, + {"ECHILD", ECHILD}, + {"EAGAIN", EAGAIN}, + {"ENOMEM", ENOMEM}, + {"EACCES", EACCES}, + {"EFAULT", EFAULT}, + {"ENOTBLK", ENOTBLK}, + {"EBUSY", EBUSY}, + {"EEXIST", EEXIST}, + {"EXDEV", EXDEV}, + {"ENODEV", ENODEV}, + {"ENOTDIR", ENOTDIR}, + {"EISDIR", EISDIR}, + {"EINVAL", EINVAL}, + {"ENFILE", ENFILE}, + {"EMFILE", EMFILE}, + {"ENOTTY", ENOTTY}, + {"ETXTBSY", ETXTBSY}, + {"EFBIG", EFBIG}, + {"ENOSPC", ENOSPC}, + {"ESPIPE", ESPIPE}, + {"EROFS", EROFS}, + {"EMLINK", EMLINK}, + {"EPIPE", EPIPE}, + {"EDOM", EDOM}, + {"ERANGE", ERANGE}, + {"EDEADLK", EDEADLK}, + {"ENOLCK", ENOLCK}, + {"ECANCELED", ECANCELED}, + {"ENOTSUP", ENOTSUP}, + {"EDQUOT", EDQUOT}, + {"ENAMETOOLONG", ENAMETOOLONG}, + {NULL, 0} +}; + +static void +zcp_load_errno_globals(lua_State *state) +{ + const zcp_errno_global_t *global = errno_globals; + while (global->zeg_name != NULL) { + lua_pushnumber(state, (lua_Number)global->zeg_errno); + lua_setglobal(state, global->zeg_name); + global++; + } +} + +void +zcp_load_globals(lua_State *state) +{ + zcp_load_errno_globals(state); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_iter.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_iter.c new file mode 100644 index 000000000000..0236c6474ef6 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_iter.c @@ -0,0 +1,531 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2016 by Delphix. All rights reserved. 
+ */ + +#include "lua.h" +#include "lauxlib.h" + +#include <sys/dmu.h> +#include <sys/dsl_prop.h> +#include <sys/dsl_synctask.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_pool.h> +#include <sys/dmu_tx.h> +#include <sys/dmu_objset.h> +#include <sys/zap.h> +#include <sys/dsl_dir.h> +#include <sys/zcp_prop.h> + +#include <sys/zcp.h> + +typedef int (zcp_list_func_t)(lua_State *); +typedef struct zcp_list_info { + const char *name; + zcp_list_func_t *func; + zcp_list_func_t *gc; + const zcp_arg_t pargs[4]; + const zcp_arg_t kwargs[2]; +} zcp_list_info_t; + +static int +zcp_clones_iter(lua_State *state) +{ + int err; + char clonename[ZFS_MAX_DATASET_NAME_LEN]; + uint64_t dsobj = lua_tonumber(state, lua_upvalueindex(1)); + uint64_t cursor = lua_tonumber(state, lua_upvalueindex(2)); + dsl_pool_t *dp = zcp_run_info(state)->zri_pool; + dsl_dataset_t *ds, *clone; + zap_attribute_t za; + zap_cursor_t zc; + + err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds); + if (err == ENOENT) { + return (0); + } else if (err != 0) { + return (luaL_error(state, + "unexpected error %d from dsl_dataset_hold_obj(dsobj)", + err)); + } + + if (dsl_dataset_phys(ds)->ds_next_clones_obj == 0) { + dsl_dataset_rele(ds, FTAG); + return (0); + } + + zap_cursor_init_serialized(&zc, dp->dp_meta_objset, + dsl_dataset_phys(ds)->ds_next_clones_obj, cursor); + dsl_dataset_rele(ds, FTAG); + + err = zap_cursor_retrieve(&zc, &za); + if (err != 0) { + zap_cursor_fini(&zc); + if (err != ENOENT) { + return (luaL_error(state, + "unexpected error %d from zap_cursor_retrieve()", + err)); + } + return (0); + } + zap_cursor_advance(&zc); + cursor = zap_cursor_serialize(&zc); + zap_cursor_fini(&zc); + + err = dsl_dataset_hold_obj(dp, za.za_first_integer, FTAG, &clone); + if (err != 0) { + return (luaL_error(state, + "unexpected error %d from " + "dsl_dataset_hold_obj(za_first_integer)", err)); + } + + dsl_dir_name(clone->ds_dir, clonename); + dsl_dataset_rele(clone, FTAG); + + lua_pushnumber(state, cursor); + lua_replace(state, lua_upvalueindex(2)); + + (void) lua_pushstring(state, clonename); + return (1); +} + +static int zcp_clones_list(lua_State *); +static zcp_list_info_t zcp_clones_list_info = { + .name = "clones", + .func = zcp_clones_list, + .gc = NULL, + .pargs = { + { .za_name = "snapshot", .za_lua_type = LUA_TSTRING}, + {NULL, 0} + }, + .kwargs = { + {NULL, 0} + } +}; + +static int +zcp_clones_list(lua_State *state) +{ + const char *snapname = lua_tostring(state, 1); + dsl_pool_t *dp = zcp_run_info(state)->zri_pool; + boolean_t issnap; + uint64_t dsobj, cursor; + + /* + * zcp_dataset_hold will either successfully return the requested + * dataset or throw a lua error and longjmp out of the zfs.list.clones + * call without returning. 
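+ * + * A typical (illustrative) use of the returned iterator from Lua: + * + *     for clone in zfs.list.clones("rpool/fs@snap") do ... end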
+ */ + dsl_dataset_t *ds = zcp_dataset_hold(state, dp, snapname, FTAG); + if (ds == NULL) + return (1); /* not reached; zcp_dataset_hold() longjmp'd */ + cursor = 0; + issnap = ds->ds_is_snapshot; + dsobj = ds->ds_object; + dsl_dataset_rele(ds, FTAG); + + if (!issnap) { + return (zcp_argerror(state, 1, "%s is not a snapshot", + snapname)); + } + + lua_pushnumber(state, dsobj); + lua_pushnumber(state, cursor); + lua_pushcclosure(state, &zcp_clones_iter, 2); + return (1); +} + +static int +zcp_snapshots_iter(lua_State *state) +{ + int err; + char snapname[ZFS_MAX_DATASET_NAME_LEN]; + uint64_t dsobj = lua_tonumber(state, lua_upvalueindex(1)); + uint64_t cursor = lua_tonumber(state, lua_upvalueindex(2)); + dsl_pool_t *dp = zcp_run_info(state)->zri_pool; + dsl_dataset_t *ds; + objset_t *os; + char *p; + + err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds); + if (err != 0) { + return (luaL_error(state, + "unexpected error %d from dsl_dataset_hold_obj(dsobj)", + err)); + } + + dsl_dataset_name(ds, snapname); + VERIFY3U(sizeof (snapname), >, + strlcat(snapname, "@", sizeof (snapname))); + + p = strchr(snapname, '\0'); + VERIFY0(dmu_objset_from_ds(ds, &os)); + err = dmu_snapshot_list_next(os, + sizeof (snapname) - (p - snapname), p, NULL, &cursor, NULL); + dsl_dataset_rele(ds, FTAG); + + if (err == ENOENT) { + return (0); + } else if (err != 0) { + return (luaL_error(state, + "unexpected error %d from dmu_snapshot_list_next()", err)); + } + + lua_pushnumber(state, cursor); + lua_replace(state, lua_upvalueindex(2)); + + (void) lua_pushstring(state, snapname); + return (1); +} + +static int zcp_snapshots_list(lua_State *); +static zcp_list_info_t zcp_snapshots_list_info = { + .name = "snapshots", + .func = zcp_snapshots_list, + .gc = NULL, + .pargs = { + { .za_name = "filesystem | volume", .za_lua_type = LUA_TSTRING}, + {NULL, 0} + }, + .kwargs = { + {NULL, 0} + } +}; + +static int +zcp_snapshots_list(lua_State *state) +{ + const char *fsname = lua_tostring(state, 1); + dsl_pool_t *dp = zcp_run_info(state)->zri_pool; + boolean_t issnap; + uint64_t dsobj; + + dsl_dataset_t *ds = zcp_dataset_hold(state, dp, fsname, FTAG); + if (ds == NULL) + return (1); /* not reached; zcp_dataset_hold() longjmp'd */ + issnap = ds->ds_is_snapshot; + dsobj = ds->ds_object; + dsl_dataset_rele(ds, FTAG); + + if (issnap) { + return (zcp_argerror(state, 1, + "argument %s cannot be a snapshot", fsname)); + } + + lua_pushnumber(state, dsobj); + lua_pushnumber(state, 0); + lua_pushcclosure(state, &zcp_snapshots_iter, 2); + return (1); +} + +/* + * Note: channel programs only run in the global zone, so all datasets + * are visible to this zone. 
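+ * Hidden internal datasets (names containing '$', e.g. "$ORIGIN", or + * '%', which marks temporary clones) are still filtered out below.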
+ */ +static boolean_t +dataset_name_hidden(const char *name) +{ + if (strchr(name, '$') != NULL) + return (B_TRUE); + if (strchr(name, '%') != NULL) + return (B_TRUE); + return (B_FALSE); +} + +static int +zcp_children_iter(lua_State *state) +{ + int err; + char childname[ZFS_MAX_DATASET_NAME_LEN]; + uint64_t dsobj = lua_tonumber(state, lua_upvalueindex(1)); + uint64_t cursor = lua_tonumber(state, lua_upvalueindex(2)); + zcp_run_info_t *ri = zcp_run_info(state); + dsl_pool_t *dp = ri->zri_pool; + dsl_dataset_t *ds; + objset_t *os; + char *p; + + err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds); + if (err != 0) { + return (luaL_error(state, + "unexpected error %d from dsl_dataset_hold_obj(dsobj)", + err)); + } + + dsl_dataset_name(ds, childname); + VERIFY3U(sizeof (childname), >, + strlcat(childname, "/", sizeof (childname))); + p = strchr(childname, '\0'); + + VERIFY0(dmu_objset_from_ds(ds, &os)); + do { + err = dmu_dir_list_next(os, + sizeof (childname) - (p - childname), p, NULL, &cursor); + } while (err == 0 && dataset_name_hidden(childname)); + dsl_dataset_rele(ds, FTAG); + + if (err == ENOENT) { + return (0); + } else if (err != 0) { + return (luaL_error(state, + "unexpected error %d from dmu_dir_list_next()", + err)); + } + + lua_pushnumber(state, cursor); + lua_replace(state, lua_upvalueindex(2)); + + (void) lua_pushstring(state, childname); + return (1); +} + +static int zcp_children_list(lua_State *); +static zcp_list_info_t zcp_children_list_info = { + .name = "children", + .func = zcp_children_list, + .gc = NULL, + .pargs = { + { .za_name = "filesystem | volume", .za_lua_type = LUA_TSTRING}, + {NULL, 0} + }, + .kwargs = { + {NULL, 0} + } +}; + +static int +zcp_children_list(lua_State *state) +{ + const char *fsname = lua_tostring(state, 1); + dsl_pool_t *dp = zcp_run_info(state)->zri_pool; + boolean_t issnap; + uint64_t dsobj; + + dsl_dataset_t *ds = zcp_dataset_hold(state, dp, fsname, FTAG); + if (ds == NULL) + return (1); /* not reached; zcp_dataset_hold() longjmp'd */ + + issnap = ds->ds_is_snapshot; + dsobj = ds->ds_object; + dsl_dataset_rele(ds, FTAG); + + if (issnap) { + return (zcp_argerror(state, 1, + "argument %s cannot be a snapshot", fsname)); + } + + lua_pushnumber(state, dsobj); + lua_pushnumber(state, 0); + lua_pushcclosure(state, &zcp_children_iter, 2); + return (1); +} + +static int +zcp_props_list_gc(lua_State *state) +{ + nvlist_t **props = lua_touserdata(state, 1); + if (*props != NULL) + fnvlist_free(*props); + return (0); +} + +static int +zcp_props_iter(lua_State *state) +{ + char *source, *val; + nvlist_t *nvprop; + nvlist_t **props = lua_touserdata(state, lua_upvalueindex(1)); + nvpair_t *pair = lua_touserdata(state, lua_upvalueindex(2)); + + do { + pair = nvlist_next_nvpair(*props, pair); + if (pair == NULL) { + fnvlist_free(*props); + *props = NULL; + return (0); + } + } while (!zfs_prop_user(nvpair_name(pair))); + + lua_pushlightuserdata(state, pair); + lua_replace(state, lua_upvalueindex(2)); + + nvprop = fnvpair_value_nvlist(pair); + val = fnvlist_lookup_string(nvprop, ZPROP_VALUE); + source = fnvlist_lookup_string(nvprop, ZPROP_SOURCE); + + (void) lua_pushstring(state, nvpair_name(pair)); + (void) lua_pushstring(state, val); + (void) lua_pushstring(state, source); + return (3); +} + +static int zcp_props_list(lua_State *); +static zcp_list_info_t zcp_props_list_info = { + .name = "properties", + .func = zcp_props_list, + .gc = zcp_props_list_gc, + .pargs = { + { .za_name = "filesystem | snapshot | volume", + .za_lua_type = LUA_TSTRING}, + {NULL, 0} 
+ }, + .kwargs = { + {NULL, 0} + } +}; + +static int +zcp_props_list(lua_State *state) +{ + const char *dsname = lua_tostring(state, 1); + dsl_pool_t *dp = zcp_run_info(state)->zri_pool; + objset_t *os; + nvlist_t **props = lua_newuserdata(state, sizeof (nvlist_t *)); + + dsl_dataset_t *ds = zcp_dataset_hold(state, dp, dsname, FTAG); + if (ds == NULL) + return (1); /* not reached; zcp_dataset_hold() longjmp'd */ + VERIFY0(dmu_objset_from_ds(ds, &os)); + VERIFY0(dsl_prop_get_all(os, props)); + dsl_dataset_rele(ds, FTAG); + + /* + * Set the metatable for the properties list to free it on completion. + */ + luaL_getmetatable(state, zcp_props_list_info.name); + (void) lua_setmetatable(state, -2); + + lua_pushlightuserdata(state, NULL); + lua_pushcclosure(state, &zcp_props_iter, 2); + return (1); +} + + +/* + * Populate nv with all valid properties and their values for the given + * dataset. + */ +static void +zcp_dataset_props(dsl_dataset_t *ds, nvlist_t *nv) +{ + for (int prop = ZFS_PROP_TYPE; prop < ZFS_NUM_PROPS; prop++) { + /* Do not display hidden props */ + if (!zfs_prop_visible(prop)) + continue; + /* Do not display props not valid for this dataset */ + if (!prop_valid_for_ds(ds, prop)) + continue; + fnvlist_add_boolean(nv, zfs_prop_to_name(prop)); + } +} + +static int zcp_system_props_list(lua_State *); +static zcp_list_info_t zcp_system_props_list_info = { + .name = "system_properties", + .func = zcp_system_props_list, + .pargs = { + { .za_name = "dataset", .za_lua_type = LUA_TSTRING}, + {NULL, 0} + }, + .kwargs = { + {NULL, 0} + } +}; + +/* + * Get a list of all visible properties and their values for a given dataset. + * Returned on the stack as a Lua table. + */ +static int +zcp_system_props_list(lua_State *state) +{ + int error; + char errbuf[128]; + const char *dataset_name; + dsl_pool_t *dp = zcp_run_info(state)->zri_pool; + zcp_list_info_t *libinfo = &zcp_system_props_list_info; + zcp_parse_args(state, libinfo->name, libinfo->pargs, libinfo->kwargs); + dataset_name = lua_tostring(state, 1); + nvlist_t *nv = fnvlist_alloc(); + + dsl_dataset_t *ds = zcp_dataset_hold(state, dp, dataset_name, FTAG); + if (ds == NULL) + return (1); /* not reached; zcp_dataset_hold() longjmp'd */ + + /* Get the names of all valid properties for this dataset */ + zcp_dataset_props(ds, nv); + dsl_dataset_rele(ds, FTAG); + + /* push list as lua table */ + error = zcp_nvlist_to_lua(state, nv, errbuf, sizeof (errbuf)); + nvlist_free(nv); + if (error != 0) { + return (luaL_error(state, + "Error returning nvlist: %s", errbuf)); + } + return (1); +} + +static int +zcp_list_func(lua_State *state) +{ + zcp_list_info_t *info = lua_touserdata(state, lua_upvalueindex(1)); + + zcp_parse_args(state, info->name, info->pargs, info->kwargs); + + return (info->func(state)); +} + +int +zcp_load_list_lib(lua_State *state) +{ + int i; + zcp_list_info_t *zcp_list_funcs[] = { + &zcp_children_list_info, + &zcp_snapshots_list_info, + &zcp_props_list_info, + &zcp_clones_list_info, + &zcp_system_props_list_info, + NULL + }; + + lua_newtable(state); + + for (i = 0; zcp_list_funcs[i] != NULL; i++) { + zcp_list_info_t *info = zcp_list_funcs[i]; + + if (info->gc != NULL) { + /* + * If the function requires garbage collection, create + * a metatable with its name and register the __gc + * function.
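+ * Lua invokes the __gc metamethod when the userdata is collected, so + * e.g. the nvlist backing zfs.list.properties is freed even if the + * caller abandons the iterator before it is exhausted.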
+ */ + (void) luaL_newmetatable(state, info->name); + (void) lua_pushstring(state, "__gc"); + lua_pushcfunction(state, info->gc); + lua_settable(state, -3); + lua_pop(state, 1); + } + + lua_pushlightuserdata(state, info); + lua_pushcclosure(state, &zcp_list_func, 1); + lua_setfield(state, -2, info->name); + info++; + } + + return (1); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_synctask.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_synctask.c new file mode 100644 index 000000000000..25d970ec0888 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_synctask.c @@ -0,0 +1,360 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2016, 2017 by Delphix. All rights reserved. + */ + +#include "lua.h" +#include "lauxlib.h" + +#include <sys/zcp.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_pool.h> +#include <sys/dsl_prop.h> +#include <sys/dsl_synctask.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_bookmark.h> +#include <sys/dsl_destroy.h> +#include <sys/dmu_objset.h> +#include <sys/zfs_znode.h> +#include <sys/zfeature.h> +#include <sys/metaslab.h> + +#define DST_AVG_BLKSHIFT 14 + +typedef int (zcp_synctask_func_t)(lua_State *, boolean_t, nvlist_t *); +typedef struct zcp_synctask_info { + const char *name; + zcp_synctask_func_t *func; + const zcp_arg_t pargs[4]; + const zcp_arg_t kwargs[2]; + zfs_space_check_t space_check; + int blocks_modified; +} zcp_synctask_info_t; + +/* + * Generic synctask interface for channel program syncfuncs. + * + * To perform some action in syncing context, we'd generally call + * dsl_sync_task(), but since the Lua script is already running inside a + * synctask we need to leave out some actions (such as acquiring the config + * rwlock and performing space checks). + * + * If 'sync' is false, executes a dry run and returns the error code. + * + * If we are not running in syncing context and we are not doing a dry run + * (meaning we are running a zfs.sync function in open-context) then we + * return a Lua error. + * + * This function also handles common fatal error cases for channel program + * library functions. If a fatal error occurs, err_dsname will be the dataset + * name reported in error messages, if supplied. + */ +static int +zcp_sync_task(lua_State *state, dsl_checkfunc_t *checkfunc, + dsl_syncfunc_t *syncfunc, void *arg, boolean_t sync, const char *err_dsname) +{ + int err; + zcp_run_info_t *ri = zcp_run_info(state); + + err = checkfunc(arg, ri->zri_tx); + if (!sync) + return (err); + + if (!ri->zri_sync) { + return (luaL_error(state, "running functions from the zfs.sync " + "submodule requires passing sync=TRUE to " + "lzc_channel_program() (i.e. 
do not specify the \"-n\" " + "command line argument)")); + } + + if (err == 0) { + syncfunc(arg, ri->zri_tx); + } else if (err == EIO) { + if (err_dsname != NULL) { + return (luaL_error(state, + "I/O error while accessing dataset '%s'", + err_dsname)); + } else { + return (luaL_error(state, + "I/O error while accessing dataset.")); + } + } + + return (err); +} + + +static int zcp_synctask_destroy(lua_State *, boolean_t, nvlist_t *); +static zcp_synctask_info_t zcp_synctask_destroy_info = { + .name = "destroy", + .func = zcp_synctask_destroy, + .pargs = { + {.za_name = "filesystem | snapshot", .za_lua_type = LUA_TSTRING}, + {NULL, 0} + }, + .kwargs = { + {.za_name = "defer", .za_lua_type = LUA_TBOOLEAN}, + {NULL, 0} + }, + .space_check = ZFS_SPACE_CHECK_DESTROY, + .blocks_modified = 0 +}; + +/* ARGSUSED */ +static int +zcp_synctask_destroy(lua_State *state, boolean_t sync, nvlist_t *err_details) +{ + int err; + const char *dsname = lua_tostring(state, 1); + + boolean_t issnap = (strchr(dsname, '@') != NULL); + + if (!issnap && !lua_isnil(state, 2)) { + return (luaL_error(state, + "'defer' kwarg only supported for snapshots: %s", + dsname)); + } + + if (issnap) { + dsl_destroy_snapshot_arg_t ddsa = { 0 }; + ddsa.ddsa_name = dsname; + if (!lua_isnil(state, 2)) { + ddsa.ddsa_defer = lua_toboolean(state, 2); + } else { + ddsa.ddsa_defer = B_FALSE; + } + + err = zcp_sync_task(state, dsl_destroy_snapshot_check, + dsl_destroy_snapshot_sync, &ddsa, sync, dsname); + } else { + dsl_destroy_head_arg_t ddha = { 0 }; + ddha.ddha_name = dsname; + + err = zcp_sync_task(state, dsl_destroy_head_check, + dsl_destroy_head_sync, &ddha, sync, dsname); + } + + return (err); +} + +static int zcp_synctask_promote(lua_State *, boolean_t, nvlist_t *); +static zcp_synctask_info_t zcp_synctask_promote_info = { + .name = "promote", + .func = zcp_synctask_promote, + .pargs = { + {.za_name = "clone", .za_lua_type = LUA_TSTRING}, + {NULL, 0} + }, + .kwargs = { + {NULL, 0} + }, + .space_check = ZFS_SPACE_CHECK_RESERVED, + .blocks_modified = 3 +}; + +static int +zcp_synctask_promote(lua_State *state, boolean_t sync, nvlist_t *err_details) +{ + int err; + dsl_dataset_promote_arg_t ddpa = { 0 }; + const char *dsname = lua_tostring(state, 1); + zcp_run_info_t *ri = zcp_run_info(state); + + ddpa.ddpa_clonename = dsname; + ddpa.err_ds = err_details; + ddpa.cr = ri->zri_cred; + + /* + * If there was a snapshot name conflict, then err_ds will be filled + * with a list of conflicting snapshot names.
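+ * Those names are in turn returned to the Lua caller, e.g. + * (the clone name is illustrative): + * + *     err, conflicts = zfs.sync.promote("rpool/clone")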
+ */ + err = zcp_sync_task(state, dsl_dataset_promote_check, + dsl_dataset_promote_sync, &ddpa, sync, dsname); + + return (err); +} + +static int zcp_synctask_rollback(lua_State *, boolean_t, nvlist_t *err_details); +static zcp_synctask_info_t zcp_synctask_rollback_info = { + .name = "rollback", + .func = zcp_synctask_rollback, + .space_check = ZFS_SPACE_CHECK_RESERVED, + .blocks_modified = 1, + .pargs = { + {.za_name = "filesystem", .za_lua_type = LUA_TSTRING}, + {NULL, 0} + }, + .kwargs = { + {NULL, 0} + } +}; + +static int +zcp_synctask_rollback(lua_State *state, boolean_t sync, nvlist_t *err_details) +{ + int err; + const char *dsname = lua_tostring(state, 1); + dsl_dataset_rollback_arg_t ddra = { 0 }; + + ddra.ddra_fsname = dsname; + ddra.ddra_result = err_details; + + err = zcp_sync_task(state, dsl_dataset_rollback_check, + dsl_dataset_rollback_sync, &ddra, sync, dsname); + + return (err); +} + +static int zcp_synctask_snapshot(lua_State *, boolean_t, nvlist_t *); +static zcp_synctask_info_t zcp_synctask_snapshot_info = { + .name = "snapshot", + .func = zcp_synctask_snapshot, + .pargs = { + {.za_name = "filesystem@snapname | volume@snapname", + .za_lua_type = LUA_TSTRING}, + {NULL, 0} + }, + .kwargs = { + {NULL, 0} + }, + .space_check = ZFS_SPACE_CHECK_NORMAL, + .blocks_modified = 3 +}; + +/* ARGSUSED */ +static int +zcp_synctask_snapshot(lua_State *state, boolean_t sync, nvlist_t *err_details) +{ + int err; + dsl_dataset_snapshot_arg_t ddsa = { 0 }; + const char *dsname = lua_tostring(state, 1); + zcp_run_info_t *ri = zcp_run_info(state); + + /* + * On old pools, the ZIL must not be active when a snapshot is created, + * but we can't suspend the ZIL because we're already in syncing + * context. + */ + if (spa_version(ri->zri_pool->dp_spa) < SPA_VERSION_FAST_SNAP) { + return (ENOTSUP); + } + + /* + * We only allow for a single snapshot rather than a list, so the + * error list output is unnecessary. + */ + ddsa.ddsa_errors = NULL; + ddsa.ddsa_props = NULL; + ddsa.ddsa_cr = ri->zri_cred; + ddsa.ddsa_snaps = fnvlist_alloc(); + fnvlist_add_boolean(ddsa.ddsa_snaps, dsname); + + zcp_cleanup_handler_t *zch = zcp_register_cleanup(state, + (zcp_cleanup_t *)&fnvlist_free, ddsa.ddsa_snaps); + + err = zcp_sync_task(state, dsl_dataset_snapshot_check, + dsl_dataset_snapshot_sync, &ddsa, sync, dsname); + + zcp_deregister_cleanup(state, zch); + fnvlist_free(ddsa.ddsa_snaps); + + return (err); +} + +static int +zcp_synctask_wrapper(lua_State *state) +{ + int err; + zcp_cleanup_handler_t *zch; + int num_ret = 1; + nvlist_t *err_details = fnvlist_alloc(); + + /* + * Make sure err_details is properly freed, even if a fatal error is + * thrown during the synctask. + */ + zch = zcp_register_cleanup(state, + (zcp_cleanup_t *)&fnvlist_free, err_details); + + zcp_synctask_info_t *info = lua_touserdata(state, lua_upvalueindex(1)); + boolean_t sync = lua_toboolean(state, lua_upvalueindex(2)); + + zcp_run_info_t *ri = zcp_run_info(state); + dsl_pool_t *dp = ri->zri_pool; + + /* MOS space is triple-dittoed, so we multiply by 3. 
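(Each of the blocks_modified blocks is charged an average of 2^DST_AVG_BLKSHIFT bytes, i.e. 16KB, and each MOS block is stored as three copies.)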
*/ + uint64_t funcspace = (info->blocks_modified << DST_AVG_BLKSHIFT) * 3; + + zcp_parse_args(state, info->name, info->pargs, info->kwargs); + + err = 0; + if (info->space_check != ZFS_SPACE_CHECK_NONE) { + uint64_t quota = dsl_pool_unreserved_space(dp, + info->space_check); + uint64_t used = dsl_dir_phys(dp->dp_root_dir)->dd_used_bytes + + ri->zri_space_used; + + if (used + funcspace > quota) { + err = SET_ERROR(ENOSPC); + } + } + + if (err == 0) { + err = info->func(state, sync, err_details); + } + + if (err == 0) { + ri->zri_space_used += funcspace; + } + + lua_pushnumber(state, (lua_Number)err); + if (fnvlist_num_pairs(err_details) > 0) { + (void) zcp_nvlist_to_lua(state, err_details, NULL, 0); + num_ret++; + } + + zcp_deregister_cleanup(state, zch); + fnvlist_free(err_details); + + return (num_ret); +} + +int +zcp_load_synctask_lib(lua_State *state, boolean_t sync) +{ + int i; + zcp_synctask_info_t *zcp_synctask_funcs[] = { + &zcp_synctask_destroy_info, + &zcp_synctask_promote_info, + &zcp_synctask_rollback_info, + &zcp_synctask_snapshot_info, + NULL + }; + + lua_newtable(state); + + for (i = 0; zcp_synctask_funcs[i] != NULL; i++) { + zcp_synctask_info_t *info = zcp_synctask_funcs[i]; + lua_pushlightuserdata(state, info); + lua_pushboolean(state, sync); + lua_pushcclosure(state, &zcp_synctask_wrapper, 2); + lua_setfield(state, -2, info->name); + info++; + } + + return (1); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfeature.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfeature.c new file mode 100644 index 000000000000..76003e3544f4 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfeature.c @@ -0,0 +1,509 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2011, 2015 by Delphix. All rights reserved. + */ + +#include <sys/zfs_context.h> +#include <sys/zfeature.h> +#include <sys/dmu.h> +#include <sys/nvpair.h> +#include <sys/zap.h> +#include <sys/dmu_tx.h> +#include "zfeature_common.h" +#include <sys/spa_impl.h> + +/* + * ZFS Feature Flags + * ----------------- + * + * ZFS feature flags are used to provide fine-grained versioning to the ZFS + * on-disk format. Once enabled on a pool feature flags replace the old + * spa_version() number. + * + * Each new on-disk format change will be given a uniquely identifying string + * guid rather than a version number. This avoids the problem of different + * organizations creating new on-disk formats with the same version number. To + * keep feature guids unique they should consist of the reverse dns name of the + * organization which implemented the feature and a short name for the feature, + * separated by a colon (e.g. com.delphix:async_destroy). 
+ * + * Reference Counts + * ---------------- + * + * Within each pool, features can be in one of three states: disabled, enabled, + * or active. These states are differentiated by a reference count stored on + * disk for each feature: + * + * 1) If there is no reference count stored on disk the feature is disabled. + * 2) If the reference count is 0 a system administrator has enabled the + * feature, but the feature has not been used yet, so no on-disk + * format changes have been made. + * 3) If the reference count is greater than 0 the feature is active. + * The format changes required by the feature are currently on disk. + * Note that if the feature's format changes are reversed the feature + * may choose to set its reference count back to 0. + * + * Feature flags make no differentiation between non-zero reference counts + * for an active feature (e.g. a reference count of 1 means the same thing as a + * reference count of 27834721), but feature implementations may choose to use + * the reference count to store meaningful information. For example, a new RAID + * implementation might set the reference count to the number of vdevs using + * it. If all those disks are removed from the pool the feature goes back to + * having a reference count of 0. + * + * It is the responsibility of the individual features to maintain a non-zero + * reference count as long as the feature's format changes are present on disk. + * + * Dependencies + * ------------ + * + * Each feature may depend on other features. The only effect of this + * relationship is that when a feature is enabled all of its dependencies are + * automatically enabled as well. Any future work to support disabling of + * features would need to ensure that features cannot be disabled if other + * enabled features depend on them. + * + * On-disk Format + * -------------- + * + * When feature flags are enabled spa_version() is set to SPA_VERSION_FEATURES + * (5000). In order for this to work, the pool is automatically upgraded to + * SPA_VERSION_BEFORE_FEATURES (28) first, so all pre-feature-flags on-disk + * format changes will be in use. + * + * Information about features is stored in 3 ZAP objects in the pool's MOS. + * These objects are linked to by the following names in the pool directory + * object: + * + * 1) features_for_read: feature guid -> reference count + * Features needed to open the pool for reading. + * 2) features_for_write: feature guid -> reference count + * Features needed to open the pool for writing. + * 3) feature_descriptions: feature guid -> descriptive string + * A human-readable string. + * + * All enabled features appear in either features_for_read or + * features_for_write, but not both. + * + * To open a pool in read-only mode, only the features listed in + * features_for_read need to be supported. + * + * To open the pool in read-write mode, features in both features_for_read and + * features_for_write need to be supported. + * + * Some features may be required to read the ZAP objects containing feature + * information. To allow software to check for compatibility with these features + * before the pool is opened, their names must be stored in the label in a + * new "features_for_read" entry (note that features that are only required + * to write to a pool never need to be stored in the label since the + * features_for_write ZAP object can be read before the pool is written to). + * To save space in the label, features must be explicitly marked as needing to + * be written to the label.
Also, reference counts are not stored in the label; + * instead, any feature whose reference count drops to 0 is removed from the + * label. + * + * Adding New Features + * ------------------- + * + * Features must be registered in the zpool_feature_init() function in + * zfeature_common.c using the zfeature_register() function. This function + * has arguments to specify if the feature should be stored in the + * features_for_read or features_for_write ZAP object and if it needs to be + * written to the label when active. + * + * Once a feature is registered, it will appear as a "feature@<feature name>" + * property which can be set by an administrator. Feature implementors should + * use the spa_feature_is_enabled() and spa_feature_is_active() functions to + * query the state of a feature and the spa_feature_incr() and + * spa_feature_decr() functions to change an enabled feature's reference count. + * Reference counts may only be updated in the syncing context. + * + * Features may not perform enable-time initialization. Instead, any such + * initialization should occur when the feature is first used. This design + * enforces that on-disk changes be made only when features are used. Code + * should only check if a feature is enabled using spa_feature_is_enabled(), + * not by relying on any feature-specific metadata existing. If a feature is + * enabled, but the feature's metadata is not on disk yet, then it should be + * created as needed. + * + * As an example, consider the com.delphix:async_destroy feature. This feature + * relies on the existence of a bptree in the MOS that stores blocks for + * asynchronous freeing. This bptree is not created when async_destroy is + * enabled. Instead, when a dataset is destroyed, spa_feature_is_enabled() is + * called to check if async_destroy is enabled. If it is and the bptree object + * does not exist yet, the bptree object is created as part of the dataset + * destroy and async_destroy's reference count is incremented to indicate it + * has made an on-disk format change. Later, after the destroyed dataset's + * blocks have all been asynchronously freed, there is no longer any use for the + * bptree object, so it is destroyed and async_destroy's reference count is + * decremented back to 0 to indicate that it has undone its on-disk format + * changes. + */ + +typedef enum { + FEATURE_ACTION_INCR, + FEATURE_ACTION_DECR, +} feature_action_t; + +/* + * Checks that the active features in the pool are supported by + * this software. Adds each unsupported feature (name -> description) to + * the supplied nvlist. + */ +boolean_t +spa_features_check(spa_t *spa, boolean_t for_write, + nvlist_t *unsup_feat, nvlist_t *enabled_feat) +{ + objset_t *os = spa->spa_meta_objset; + boolean_t supported; + zap_cursor_t zc; + zap_attribute_t za; + uint64_t obj = for_write ?
+ spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj; + + supported = B_TRUE; + for (zap_cursor_init(&zc, os, obj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + ASSERT(za.za_integer_length == sizeof (uint64_t) && + za.za_num_integers == 1); + + if (NULL != enabled_feat) { + fnvlist_add_uint64(enabled_feat, za.za_name, + za.za_first_integer); + } + + if (za.za_first_integer != 0 && + !zfeature_is_supported(za.za_name)) { + supported = B_FALSE; + + if (NULL != unsup_feat) { + char *desc = ""; + char buf[MAXPATHLEN]; + + if (zap_lookup(os, spa->spa_feat_desc_obj, + za.za_name, 1, sizeof (buf), buf) == 0) + desc = buf; + + VERIFY(nvlist_add_string(unsup_feat, za.za_name, + desc) == 0); + } + } + } + zap_cursor_fini(&zc); + + return (supported); +} + +/* + * Use an in-memory cache of feature refcounts for quick retrieval. + * + * Note: well-designed features will not need to use this; they should + * use spa_feature_is_enabled() and spa_feature_is_active() instead. + * However, this is non-static for zdb, zhack, and spa_add_feature_stats(). + */ +int +feature_get_refcount(spa_t *spa, zfeature_info_t *feature, uint64_t *res) +{ + ASSERT(VALID_FEATURE_FID(feature->fi_feature)); + if (spa->spa_feat_refcount_cache[feature->fi_feature] == + SPA_FEATURE_DISABLED) { + return (SET_ERROR(ENOTSUP)); + } + *res = spa->spa_feat_refcount_cache[feature->fi_feature]; + return (0); +} + +/* + * Note: well-designed features will not need to use this; they should + * use spa_feature_is_enabled() and spa_feature_is_active() instead. + * However, this is non-static for zdb and zhack. + */ +int +feature_get_refcount_from_disk(spa_t *spa, zfeature_info_t *feature, + uint64_t *res) +{ + int err; + uint64_t refcount; + uint64_t zapobj = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ? + spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj; + + /* + * If the pool is currently being created, the feature objects may not + * have been allocated yet. Act as though all features are disabled. + */ + if (zapobj == 0) + return (SET_ERROR(ENOTSUP)); + + err = zap_lookup(spa->spa_meta_objset, zapobj, + feature->fi_guid, sizeof (uint64_t), 1, &refcount); + if (err != 0) { + if (err == ENOENT) + return (SET_ERROR(ENOTSUP)); + else + return (err); + } + *res = refcount; + return (0); +} + + +static int +feature_get_enabled_txg(spa_t *spa, zfeature_info_t *feature, uint64_t *res) +{ + uint64_t enabled_txg_obj = spa->spa_feat_enabled_txg_obj; + + ASSERT(zfeature_depends_on(feature->fi_feature, + SPA_FEATURE_ENABLED_TXG)); + + if (!spa_feature_is_enabled(spa, feature->fi_feature)) { + return (SET_ERROR(ENOTSUP)); + } + + ASSERT(enabled_txg_obj != 0); + + VERIFY0(zap_lookup(spa->spa_meta_objset, spa->spa_feat_enabled_txg_obj, + feature->fi_guid, sizeof (uint64_t), 1, res)); + + return (0); +} + +/* + * This function is non-static for zhack; it should otherwise not be used + * outside this file. + */ +void +feature_sync(spa_t *spa, zfeature_info_t *feature, uint64_t refcount, + dmu_tx_t *tx) +{ + ASSERT(VALID_FEATURE_OR_NONE(feature->fi_feature)); + uint64_t zapobj = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ? + spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj; + + VERIFY0(zap_update(spa->spa_meta_objset, zapobj, feature->fi_guid, + sizeof (uint64_t), 1, &refcount, tx)); + + /* + * feature_sync is called directly from zhack, allowing the + * creation of arbitrary features whose fi_feature field may + * be greater than SPA_FEATURES. 
When called from zhack, the + * zfeature_info_t object's fi_feature field will be set to + * SPA_FEATURE_NONE. + */ + if (feature->fi_feature != SPA_FEATURE_NONE) { + uint64_t *refcount_cache = + &spa->spa_feat_refcount_cache[feature->fi_feature]; +#ifdef atomic_swap_64 + VERIFY3U(*refcount_cache, ==, + atomic_swap_64(refcount_cache, refcount)); +#else + *refcount_cache = refcount; +#endif + } + + if (refcount == 0) + spa_deactivate_mos_feature(spa, feature->fi_guid); + else if (feature->fi_flags & ZFEATURE_FLAG_MOS) + spa_activate_mos_feature(spa, feature->fi_guid, tx); +} + +/* + * This function is non-static for zhack; it should otherwise not be used + * outside this file. + */ +void +feature_enable_sync(spa_t *spa, zfeature_info_t *feature, dmu_tx_t *tx) +{ + uint64_t initial_refcount = + (feature->fi_flags & ZFEATURE_FLAG_ACTIVATE_ON_ENABLE) ? 1 : 0; + uint64_t zapobj = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ? + spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj; + + ASSERT(0 != zapobj); + ASSERT(zfeature_is_valid_guid(feature->fi_guid)); + ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES); + + /* + * If the feature is already enabled, ignore the request. + */ + if (zap_contains(spa->spa_meta_objset, zapobj, feature->fi_guid) == 0) + return; + + for (int i = 0; feature->fi_depends[i] != SPA_FEATURE_NONE; i++) + spa_feature_enable(spa, feature->fi_depends[i], tx); + + VERIFY0(zap_update(spa->spa_meta_objset, spa->spa_feat_desc_obj, + feature->fi_guid, 1, strlen(feature->fi_desc) + 1, + feature->fi_desc, tx)); + + feature_sync(spa, feature, initial_refcount, tx); + + if (spa_feature_is_enabled(spa, SPA_FEATURE_ENABLED_TXG)) { + uint64_t enabling_txg = dmu_tx_get_txg(tx); + + if (spa->spa_feat_enabled_txg_obj == 0ULL) { + spa->spa_feat_enabled_txg_obj = + zap_create_link(spa->spa_meta_objset, + DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_FEATURE_ENABLED_TXG, tx); + } + spa_feature_incr(spa, SPA_FEATURE_ENABLED_TXG, tx); + + VERIFY0(zap_add(spa->spa_meta_objset, + spa->spa_feat_enabled_txg_obj, feature->fi_guid, + sizeof (uint64_t), 1, &enabling_txg, tx)); + } +} + +static void +feature_do_action(spa_t *spa, spa_feature_t fid, feature_action_t action, + dmu_tx_t *tx) +{ + uint64_t refcount; + zfeature_info_t *feature = &spa_feature_table[fid]; + uint64_t zapobj = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ? + spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj; + + ASSERT(VALID_FEATURE_FID(fid)); + ASSERT(0 != zapobj); + ASSERT(zfeature_is_valid_guid(feature->fi_guid)); + + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES); + + VERIFY3U(feature_get_refcount(spa, feature, &refcount), !=, ENOTSUP); + + switch (action) { + case FEATURE_ACTION_INCR: + VERIFY3U(refcount, !=, UINT64_MAX); + refcount++; + break; + case FEATURE_ACTION_DECR: + VERIFY3U(refcount, !=, 0); + refcount--; + break; + default: + ASSERT(0); + break; + } + + feature_sync(spa, feature, refcount, tx); +} + +void +spa_feature_create_zap_objects(spa_t *spa, dmu_tx_t *tx) +{ + /* + * We create feature flags ZAP objects in two instances: during pool + * creation and during pool upgrade. 
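+ * The ASSERT below checks exactly that: either we are in the pool's + * sync context (the upgrade path), or syncing has not started yet and + * this is the initial txg (the creation path).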
+ */ + ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)) || (!spa->spa_sync_on && + tx->tx_txg == TXG_INITIAL)); + + spa->spa_feat_for_read_obj = zap_create_link(spa->spa_meta_objset, + DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_FEATURES_FOR_READ, tx); + spa->spa_feat_for_write_obj = zap_create_link(spa->spa_meta_objset, + DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_FEATURES_FOR_WRITE, tx); + spa->spa_feat_desc_obj = zap_create_link(spa->spa_meta_objset, + DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_FEATURE_DESCRIPTIONS, tx); +} + +/* + * Enable any required dependencies, then enable the requested feature. + */ +void +spa_feature_enable(spa_t *spa, spa_feature_t fid, dmu_tx_t *tx) +{ + ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES); + ASSERT(VALID_FEATURE_FID(fid)); + feature_enable_sync(spa, &spa_feature_table[fid], tx); +} + +void +spa_feature_incr(spa_t *spa, spa_feature_t fid, dmu_tx_t *tx) +{ + feature_do_action(spa, fid, FEATURE_ACTION_INCR, tx); +} + +void +spa_feature_decr(spa_t *spa, spa_feature_t fid, dmu_tx_t *tx) +{ + feature_do_action(spa, fid, FEATURE_ACTION_DECR, tx); +} + +boolean_t +spa_feature_is_enabled(spa_t *spa, spa_feature_t fid) +{ + int err; + uint64_t refcount; + + ASSERT(VALID_FEATURE_FID(fid)); + if (spa_version(spa) < SPA_VERSION_FEATURES) + return (B_FALSE); + + err = feature_get_refcount(spa, &spa_feature_table[fid], &refcount); + ASSERT(err == 0 || err == ENOTSUP); + return (err == 0); +} + +boolean_t +spa_feature_is_active(spa_t *spa, spa_feature_t fid) +{ + int err; + uint64_t refcount; + + ASSERT(VALID_FEATURE_FID(fid)); + if (spa_version(spa) < SPA_VERSION_FEATURES) + return (B_FALSE); + + err = feature_get_refcount(spa, &spa_feature_table[fid], &refcount); + ASSERT(err == 0 || err == ENOTSUP); + return (err == 0 && refcount > 0); +} + +/* + * For the feature specified by fid (which must depend on + * SPA_FEATURE_ENABLED_TXG), return the TXG at which it was enabled in the + * OUT txg argument. + * + * Returns B_TRUE if the feature is enabled, in which case txg will be filled + * with the transaction group in which the specified feature was enabled. + * Returns B_FALSE otherwise (i.e. if the feature is not enabled). + */ +boolean_t +spa_feature_enabled_txg(spa_t *spa, spa_feature_t fid, uint64_t *txg) +{ + int err; + + ASSERT(VALID_FEATURE_FID(fid)); + if (spa_version(spa) < SPA_VERSION_FEATURES) + return (B_FALSE); + + err = feature_get_enabled_txg(spa, &spa_feature_table[fid], txg); + ASSERT(err == 0 || err == ENOTSUP); + + return (err == 0); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs.conf b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs.conf new file mode 100644 index 000000000000..09881909b804 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs.conf @@ -0,0 +1,28 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License, Version 1.0 only +# (the "License"). You may not use this file except in compliance +# with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright 2005 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +# ident "%Z%%M% %I% %E% SMI" +# +name="zfs" parent="pseudo"; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c new file mode 100644 index 000000000000..e0bc7422c5ad --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c @@ -0,0 +1,2740 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright 2017 Nexenta Systems, Inc. All rights reserved. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/time.h> +#include <sys/systm.h> +#include <sys/sysmacros.h> +#include <sys/resource.h> +#include <sys/vfs.h> +#include <sys/vnode.h> +#include <sys/file.h> +#include <sys/stat.h> +#include <sys/kmem.h> +#include <sys/cmn_err.h> +#include <sys/errno.h> +#include <sys/unistd.h> +#include <sys/sdt.h> +#include <sys/fs/zfs.h> +#include <sys/policy.h> +#include <sys/zfs_znode.h> +#include <sys/zfs_fuid.h> +#include <sys/zfs_acl.h> +#include <sys/zfs_dir.h> +#include <sys/zfs_vfsops.h> +#include <sys/dmu.h> +#include <sys/dnode.h> +#include <sys/zap.h> +#include <sys/sa.h> +#include <acl/acl_common.h> + +#define ALLOW ACE_ACCESS_ALLOWED_ACE_TYPE +#define DENY ACE_ACCESS_DENIED_ACE_TYPE +#define MAX_ACE_TYPE ACE_SYSTEM_ALARM_CALLBACK_OBJECT_ACE_TYPE +#define MIN_ACE_TYPE ALLOW + +#define OWNING_GROUP (ACE_GROUP|ACE_IDENTIFIER_GROUP) +#define EVERYONE_ALLOW_MASK (ACE_READ_ACL|ACE_READ_ATTRIBUTES | \ + ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE) +#define EVERYONE_DENY_MASK (ACE_WRITE_ACL|ACE_WRITE_OWNER | \ + ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS) +#define OWNER_ALLOW_MASK (ACE_WRITE_ACL | ACE_WRITE_OWNER | \ + ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS) + +#define ZFS_CHECKED_MASKS (ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_DATA| \ + ACE_READ_NAMED_ATTRS|ACE_WRITE_DATA|ACE_WRITE_ATTRIBUTES| \ + ACE_WRITE_NAMED_ATTRS|ACE_APPEND_DATA|ACE_EXECUTE|ACE_WRITE_OWNER| \ + ACE_WRITE_ACL|ACE_DELETE|ACE_DELETE_CHILD|ACE_SYNCHRONIZE) + +#define WRITE_MASK_DATA (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_NAMED_ATTRS) +#define WRITE_MASK_ATTRS (ACE_WRITE_ACL|ACE_WRITE_OWNER|ACE_WRITE_ATTRIBUTES| \ + ACE_DELETE|ACE_DELETE_CHILD) +#define WRITE_MASK (WRITE_MASK_DATA|WRITE_MASK_ATTRS) + +#define OGE_CLEAR (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ + 
ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE) + +#define OKAY_MASK_BITS (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ + ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE) + +#define ALL_INHERIT (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE | \ + ACE_NO_PROPAGATE_INHERIT_ACE|ACE_INHERIT_ONLY_ACE|ACE_INHERITED_ACE) + +#define RESTRICTED_CLEAR (ACE_WRITE_ACL|ACE_WRITE_OWNER) + +#define V4_ACL_WIDE_FLAGS (ZFS_ACL_AUTO_INHERIT|ZFS_ACL_DEFAULTED|\ + ZFS_ACL_PROTECTED) + +#define ZFS_ACL_WIDE_FLAGS (V4_ACL_WIDE_FLAGS|ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|\ + ZFS_ACL_OBJ_ACE) + +#define ALL_MODE_EXECS (S_IXUSR | S_IXGRP | S_IXOTH) + +static uint16_t +zfs_ace_v0_get_type(void *acep) +{ + return (((zfs_oldace_t *)acep)->z_type); +} + +static uint16_t +zfs_ace_v0_get_flags(void *acep) +{ + return (((zfs_oldace_t *)acep)->z_flags); +} + +static uint32_t +zfs_ace_v0_get_mask(void *acep) +{ + return (((zfs_oldace_t *)acep)->z_access_mask); +} + +static uint64_t +zfs_ace_v0_get_who(void *acep) +{ + return (((zfs_oldace_t *)acep)->z_fuid); +} + +static void +zfs_ace_v0_set_type(void *acep, uint16_t type) +{ + ((zfs_oldace_t *)acep)->z_type = type; +} + +static void +zfs_ace_v0_set_flags(void *acep, uint16_t flags) +{ + ((zfs_oldace_t *)acep)->z_flags = flags; +} + +static void +zfs_ace_v0_set_mask(void *acep, uint32_t mask) +{ + ((zfs_oldace_t *)acep)->z_access_mask = mask; +} + +static void +zfs_ace_v0_set_who(void *acep, uint64_t who) +{ + ((zfs_oldace_t *)acep)->z_fuid = who; +} + +/*ARGSUSED*/ +static size_t +zfs_ace_v0_size(void *acep) +{ + return (sizeof (zfs_oldace_t)); +} + +static size_t +zfs_ace_v0_abstract_size(void) +{ + return (sizeof (zfs_oldace_t)); +} + +static int +zfs_ace_v0_mask_off(void) +{ + return (offsetof(zfs_oldace_t, z_access_mask)); +} + +/*ARGSUSED*/ +static int +zfs_ace_v0_data(void *acep, void **datap) +{ + *datap = NULL; + return (0); +} + +static acl_ops_t zfs_acl_v0_ops = { + zfs_ace_v0_get_mask, + zfs_ace_v0_set_mask, + zfs_ace_v0_get_flags, + zfs_ace_v0_set_flags, + zfs_ace_v0_get_type, + zfs_ace_v0_set_type, + zfs_ace_v0_get_who, + zfs_ace_v0_set_who, + zfs_ace_v0_size, + zfs_ace_v0_abstract_size, + zfs_ace_v0_mask_off, + zfs_ace_v0_data +}; + +static uint16_t +zfs_ace_fuid_get_type(void *acep) +{ + return (((zfs_ace_hdr_t *)acep)->z_type); +} + +static uint16_t +zfs_ace_fuid_get_flags(void *acep) +{ + return (((zfs_ace_hdr_t *)acep)->z_flags); +} + +static uint32_t +zfs_ace_fuid_get_mask(void *acep) +{ + return (((zfs_ace_hdr_t *)acep)->z_access_mask); +} + +static uint64_t +zfs_ace_fuid_get_who(void *args) +{ + uint16_t entry_type; + zfs_ace_t *acep = args; + + entry_type = acep->z_hdr.z_flags & ACE_TYPE_FLAGS; + + if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP || + entry_type == ACE_EVERYONE) + return (-1); + return (((zfs_ace_t *)acep)->z_fuid); +} + +static void +zfs_ace_fuid_set_type(void *acep, uint16_t type) +{ + ((zfs_ace_hdr_t *)acep)->z_type = type; +} + +static void +zfs_ace_fuid_set_flags(void *acep, uint16_t flags) +{ + ((zfs_ace_hdr_t *)acep)->z_flags = flags; +} + +static void +zfs_ace_fuid_set_mask(void *acep, uint32_t mask) +{ + ((zfs_ace_hdr_t *)acep)->z_access_mask = mask; +} + +static void +zfs_ace_fuid_set_who(void *arg, uint64_t who) +{ + zfs_ace_t *acep = arg; + + uint16_t entry_type = acep->z_hdr.z_flags & ACE_TYPE_FLAGS; + + if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP || + entry_type == ACE_EVERYONE) + return; + acep->z_fuid = who; +} + +static size_t +zfs_ace_fuid_size(void *acep) +{ + 
	zfs_ace_hdr_t *zacep = acep;
+	uint16_t entry_type;
+
+	switch (zacep->z_type) {
+	case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+	case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+	case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+	case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+		return (sizeof (zfs_object_ace_t));
+	case ALLOW:
+	case DENY:
+		entry_type =
+		    (((zfs_ace_hdr_t *)acep)->z_flags & ACE_TYPE_FLAGS);
+		if (entry_type == ACE_OWNER ||
+		    entry_type == OWNING_GROUP ||
+		    entry_type == ACE_EVERYONE)
+			return (sizeof (zfs_ace_hdr_t));
+		/*FALLTHROUGH*/
+	default:
+		return (sizeof (zfs_ace_t));
+	}
+}
+
+static size_t
+zfs_ace_fuid_abstract_size(void)
+{
+	return (sizeof (zfs_ace_hdr_t));
+}
+
+static int
+zfs_ace_fuid_mask_off(void)
+{
+	return (offsetof(zfs_ace_hdr_t, z_access_mask));
+}
+
+static int
+zfs_ace_fuid_data(void *acep, void **datap)
+{
+	zfs_ace_t *zacep = acep;
+	zfs_object_ace_t *zobjp;
+
+	switch (zacep->z_hdr.z_type) {
+	case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+	case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+	case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+	case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+		zobjp = acep;
+		*datap = (caddr_t)zobjp + sizeof (zfs_ace_t);
+		return (sizeof (zfs_object_ace_t) - sizeof (zfs_ace_t));
+	default:
+		*datap = NULL;
+		return (0);
+	}
+}
+
+static acl_ops_t zfs_acl_fuid_ops = {
+	zfs_ace_fuid_get_mask,
+	zfs_ace_fuid_set_mask,
+	zfs_ace_fuid_get_flags,
+	zfs_ace_fuid_set_flags,
+	zfs_ace_fuid_get_type,
+	zfs_ace_fuid_set_type,
+	zfs_ace_fuid_get_who,
+	zfs_ace_fuid_set_who,
+	zfs_ace_fuid_size,
+	zfs_ace_fuid_abstract_size,
+	zfs_ace_fuid_mask_off,
+	zfs_ace_fuid_data
+};
+
+/*
+ * The following three functions are provided for compatibility with
+ * older ZPL versions in order to determine if the file used to have
+ * an external ACL and what version of ACL previously existed on the
+ * file.  Would really be nice to not need this, sigh.
+ */
+uint64_t
+zfs_external_acl(znode_t *zp)
+{
+	zfs_acl_phys_t acl_phys;
+	int error;
+
+	if (zp->z_is_sa)
+		return (0);
+
+	/*
+	 * Need to deal with a potential
+	 * race where zfs_sa_upgrade could cause
+	 * z_is_sa to change.
+	 *
+	 * If the lookup fails then the state of z_is_sa should have
+	 * changed.
+	 */
+
+	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zp->z_zfsvfs),
+	    &acl_phys, sizeof (acl_phys))) == 0)
+		return (acl_phys.z_acl_extern_obj);
+	else {
+		/*
+		 * After upgrade, SA_ZPL_ZNODE_ACL should have been
+		 * removed.
+		 */
+		VERIFY(zp->z_is_sa && error == ENOENT);
+		return (0);
+	}
+}
+
+/*
+ * Determine the size of the ACL in bytes.
+ *
+ * This is more complicated than it should be since we have to deal
+ * with old external ACLs.
+ */ +static int +zfs_acl_znode_info(znode_t *zp, int *aclsize, int *aclcount, + zfs_acl_phys_t *aclphys) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + uint64_t acl_count; + int size; + int error; + + ASSERT(MUTEX_HELD(&zp->z_acl_lock)); + if (zp->z_is_sa) { + if ((error = sa_size(zp->z_sa_hdl, SA_ZPL_DACL_ACES(zfsvfs), + &size)) != 0) + return (error); + *aclsize = size; + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_COUNT(zfsvfs), + &acl_count, sizeof (acl_count))) != 0) + return (error); + *aclcount = acl_count; + } else { + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs), + aclphys, sizeof (*aclphys))) != 0) + return (error); + + if (aclphys->z_acl_version == ZFS_ACL_VERSION_INITIAL) { + *aclsize = ZFS_ACL_SIZE(aclphys->z_acl_size); + *aclcount = aclphys->z_acl_size; + } else { + *aclsize = aclphys->z_acl_size; + *aclcount = aclphys->z_acl_count; + } + } + return (0); +} + +int +zfs_znode_acl_version(znode_t *zp) +{ + zfs_acl_phys_t acl_phys; + + if (zp->z_is_sa) + return (ZFS_ACL_VERSION_FUID); + else { + int error; + + /* + * Need to deal with a potential + * race where zfs_sa_upgrade could cause + * z_isa_sa to change. + * + * If the lookup fails then the state of z_is_sa should have + * changed. + */ + if ((error = sa_lookup(zp->z_sa_hdl, + SA_ZPL_ZNODE_ACL(zp->z_zfsvfs), + &acl_phys, sizeof (acl_phys))) == 0) + return (acl_phys.z_acl_version); + else { + /* + * After upgrade SA_ZPL_ZNODE_ACL should have + * been removed. + */ + VERIFY(zp->z_is_sa && error == ENOENT); + return (ZFS_ACL_VERSION_FUID); + } + } +} + +static int +zfs_acl_version(int version) +{ + if (version < ZPL_VERSION_FUID) + return (ZFS_ACL_VERSION_INITIAL); + else + return (ZFS_ACL_VERSION_FUID); +} + +static int +zfs_acl_version_zp(znode_t *zp) +{ + return (zfs_acl_version(zp->z_zfsvfs->z_version)); +} + +zfs_acl_t * +zfs_acl_alloc(int vers) +{ + zfs_acl_t *aclp; + + aclp = kmem_zalloc(sizeof (zfs_acl_t), KM_SLEEP); + list_create(&aclp->z_acl, sizeof (zfs_acl_node_t), + offsetof(zfs_acl_node_t, z_next)); + aclp->z_version = vers; + if (vers == ZFS_ACL_VERSION_FUID) + aclp->z_ops = zfs_acl_fuid_ops; + else + aclp->z_ops = zfs_acl_v0_ops; + return (aclp); +} + +zfs_acl_node_t * +zfs_acl_node_alloc(size_t bytes) +{ + zfs_acl_node_t *aclnode; + + aclnode = kmem_zalloc(sizeof (zfs_acl_node_t), KM_SLEEP); + if (bytes) { + aclnode->z_acldata = kmem_alloc(bytes, KM_SLEEP); + aclnode->z_allocdata = aclnode->z_acldata; + aclnode->z_allocsize = bytes; + aclnode->z_size = bytes; + } + + return (aclnode); +} + +static void +zfs_acl_node_free(zfs_acl_node_t *aclnode) +{ + if (aclnode->z_allocsize) + kmem_free(aclnode->z_allocdata, aclnode->z_allocsize); + kmem_free(aclnode, sizeof (zfs_acl_node_t)); +} + +static void +zfs_acl_release_nodes(zfs_acl_t *aclp) +{ + zfs_acl_node_t *aclnode; + + while (aclnode = list_head(&aclp->z_acl)) { + list_remove(&aclp->z_acl, aclnode); + zfs_acl_node_free(aclnode); + } + aclp->z_acl_count = 0; + aclp->z_acl_bytes = 0; +} + +void +zfs_acl_free(zfs_acl_t *aclp) +{ + zfs_acl_release_nodes(aclp); + list_destroy(&aclp->z_acl); + kmem_free(aclp, sizeof (zfs_acl_t)); +} + +static boolean_t +zfs_acl_valid_ace_type(uint_t type, uint_t flags) +{ + uint16_t entry_type; + + switch (type) { + case ALLOW: + case DENY: + case ACE_SYSTEM_AUDIT_ACE_TYPE: + case ACE_SYSTEM_ALARM_ACE_TYPE: + entry_type = flags & ACE_TYPE_FLAGS; + return (entry_type == ACE_OWNER || + entry_type == OWNING_GROUP || + entry_type == ACE_EVERYONE || entry_type == 0 || + entry_type == ACE_IDENTIFIER_GROUP); + default: + 
if (type >= MIN_ACE_TYPE && type <= MAX_ACE_TYPE) + return (B_TRUE); + } + return (B_FALSE); +} + +static boolean_t +zfs_ace_valid(vtype_t obj_type, zfs_acl_t *aclp, uint16_t type, uint16_t iflags) +{ + /* + * first check type of entry + */ + + if (!zfs_acl_valid_ace_type(type, iflags)) + return (B_FALSE); + + switch (type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + if (aclp->z_version < ZFS_ACL_VERSION_FUID) + return (B_FALSE); + aclp->z_hints |= ZFS_ACL_OBJ_ACE; + } + + /* + * next check inheritance level flags + */ + + if (obj_type == VDIR && + (iflags & (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE))) + aclp->z_hints |= ZFS_INHERIT_ACE; + + if (iflags & (ACE_INHERIT_ONLY_ACE|ACE_NO_PROPAGATE_INHERIT_ACE)) { + if ((iflags & (ACE_FILE_INHERIT_ACE| + ACE_DIRECTORY_INHERIT_ACE)) == 0) { + return (B_FALSE); + } + } + + return (B_TRUE); +} + +static void * +zfs_acl_next_ace(zfs_acl_t *aclp, void *start, uint64_t *who, + uint32_t *access_mask, uint16_t *iflags, uint16_t *type) +{ + zfs_acl_node_t *aclnode; + + ASSERT(aclp); + + if (start == NULL) { + aclnode = list_head(&aclp->z_acl); + if (aclnode == NULL) + return (NULL); + + aclp->z_next_ace = aclnode->z_acldata; + aclp->z_curr_node = aclnode; + aclnode->z_ace_idx = 0; + } + + aclnode = aclp->z_curr_node; + + if (aclnode == NULL) + return (NULL); + + if (aclnode->z_ace_idx >= aclnode->z_ace_count) { + aclnode = list_next(&aclp->z_acl, aclnode); + if (aclnode == NULL) + return (NULL); + else { + aclp->z_curr_node = aclnode; + aclnode->z_ace_idx = 0; + aclp->z_next_ace = aclnode->z_acldata; + } + } + + if (aclnode->z_ace_idx < aclnode->z_ace_count) { + void *acep = aclp->z_next_ace; + size_t ace_size; + + /* + * Make sure we don't overstep our bounds + */ + ace_size = aclp->z_ops.ace_size(acep); + + if (((caddr_t)acep + ace_size) > + ((caddr_t)aclnode->z_acldata + aclnode->z_size)) { + return (NULL); + } + + *iflags = aclp->z_ops.ace_flags_get(acep); + *type = aclp->z_ops.ace_type_get(acep); + *access_mask = aclp->z_ops.ace_mask_get(acep); + *who = aclp->z_ops.ace_who_get(acep); + aclp->z_next_ace = (caddr_t)aclp->z_next_ace + ace_size; + aclnode->z_ace_idx++; + + return ((void *)acep); + } + return (NULL); +} + +/*ARGSUSED*/ +static uint64_t +zfs_ace_walk(void *datap, uint64_t cookie, int aclcnt, + uint16_t *flags, uint16_t *type, uint32_t *mask) +{ + zfs_acl_t *aclp = datap; + zfs_ace_hdr_t *acep = (zfs_ace_hdr_t *)(uintptr_t)cookie; + uint64_t who; + + acep = zfs_acl_next_ace(aclp, acep, &who, mask, + flags, type); + return ((uint64_t)(uintptr_t)acep); +} + +static zfs_acl_node_t * +zfs_acl_curr_node(zfs_acl_t *aclp) +{ + ASSERT(aclp->z_curr_node); + return (aclp->z_curr_node); +} + +/* + * Copy ACE to internal ZFS format. + * While processing the ACL each ACE will be validated for correctness. + * ACE FUIDs will be created later. 
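+ *
+ * For example, a plain user or group ace_t carries an explicit a_who;
+ * it is converted to a FUID via zfs_fuid_create() and stored in
+ * z_fuid.  owner@, group@ and everyone@ entries carry no identity, so
+ * only the zfs_ace_hdr_t fields are filled in for them.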
+ */ +int +zfs_copy_ace_2_fuid(zfsvfs_t *zfsvfs, vtype_t obj_type, zfs_acl_t *aclp, + void *datap, zfs_ace_t *z_acl, uint64_t aclcnt, size_t *size, + zfs_fuid_info_t **fuidp, cred_t *cr) +{ + int i; + uint16_t entry_type; + zfs_ace_t *aceptr = z_acl; + ace_t *acep = datap; + zfs_object_ace_t *zobjacep; + ace_object_t *aceobjp; + + for (i = 0; i != aclcnt; i++) { + aceptr->z_hdr.z_access_mask = acep->a_access_mask; + aceptr->z_hdr.z_flags = acep->a_flags; + aceptr->z_hdr.z_type = acep->a_type; + entry_type = aceptr->z_hdr.z_flags & ACE_TYPE_FLAGS; + if (entry_type != ACE_OWNER && entry_type != OWNING_GROUP && + entry_type != ACE_EVERYONE) { + aceptr->z_fuid = zfs_fuid_create(zfsvfs, acep->a_who, + cr, (entry_type == 0) ? + ZFS_ACE_USER : ZFS_ACE_GROUP, fuidp); + } + + /* + * Make sure ACE is valid + */ + if (zfs_ace_valid(obj_type, aclp, aceptr->z_hdr.z_type, + aceptr->z_hdr.z_flags) != B_TRUE) + return (SET_ERROR(EINVAL)); + + switch (acep->a_type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + zobjacep = (zfs_object_ace_t *)aceptr; + aceobjp = (ace_object_t *)acep; + + bcopy(aceobjp->a_obj_type, zobjacep->z_object_type, + sizeof (aceobjp->a_obj_type)); + bcopy(aceobjp->a_inherit_obj_type, + zobjacep->z_inherit_type, + sizeof (aceobjp->a_inherit_obj_type)); + acep = (ace_t *)((caddr_t)acep + sizeof (ace_object_t)); + break; + default: + acep = (ace_t *)((caddr_t)acep + sizeof (ace_t)); + } + + aceptr = (zfs_ace_t *)((caddr_t)aceptr + + aclp->z_ops.ace_size(aceptr)); + } + + *size = (caddr_t)aceptr - (caddr_t)z_acl; + + return (0); +} + +/* + * Copy ZFS ACEs to fixed size ace_t layout + */ +static void +zfs_copy_fuid_2_ace(zfsvfs_t *zfsvfs, zfs_acl_t *aclp, cred_t *cr, + void *datap, int filter) +{ + uint64_t who; + uint32_t access_mask; + uint16_t iflags, type; + zfs_ace_hdr_t *zacep = NULL; + ace_t *acep = datap; + ace_object_t *objacep; + zfs_object_ace_t *zobjacep; + size_t ace_size; + uint16_t entry_type; + + while (zacep = zfs_acl_next_ace(aclp, zacep, + &who, &access_mask, &iflags, &type)) { + + switch (type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + if (filter) { + continue; + } + zobjacep = (zfs_object_ace_t *)zacep; + objacep = (ace_object_t *)acep; + bcopy(zobjacep->z_object_type, + objacep->a_obj_type, + sizeof (zobjacep->z_object_type)); + bcopy(zobjacep->z_inherit_type, + objacep->a_inherit_obj_type, + sizeof (zobjacep->z_inherit_type)); + ace_size = sizeof (ace_object_t); + break; + default: + ace_size = sizeof (ace_t); + break; + } + + entry_type = (iflags & ACE_TYPE_FLAGS); + if ((entry_type != ACE_OWNER && + entry_type != OWNING_GROUP && + entry_type != ACE_EVERYONE)) { + acep->a_who = zfs_fuid_map_id(zfsvfs, who, + cr, (entry_type & ACE_IDENTIFIER_GROUP) ? 
+ ZFS_ACE_GROUP : ZFS_ACE_USER); + } else { + acep->a_who = (uid_t)(int64_t)who; + } + acep->a_access_mask = access_mask; + acep->a_flags = iflags; + acep->a_type = type; + acep = (ace_t *)((caddr_t)acep + ace_size); + } +} + +static int +zfs_copy_ace_2_oldace(vtype_t obj_type, zfs_acl_t *aclp, ace_t *acep, + zfs_oldace_t *z_acl, int aclcnt, size_t *size) +{ + int i; + zfs_oldace_t *aceptr = z_acl; + + for (i = 0; i != aclcnt; i++, aceptr++) { + aceptr->z_access_mask = acep[i].a_access_mask; + aceptr->z_type = acep[i].a_type; + aceptr->z_flags = acep[i].a_flags; + aceptr->z_fuid = acep[i].a_who; + /* + * Make sure ACE is valid + */ + if (zfs_ace_valid(obj_type, aclp, aceptr->z_type, + aceptr->z_flags) != B_TRUE) + return (SET_ERROR(EINVAL)); + } + *size = (caddr_t)aceptr - (caddr_t)z_acl; + return (0); +} + +/* + * convert old ACL format to new + */ +void +zfs_acl_xform(znode_t *zp, zfs_acl_t *aclp, cred_t *cr) +{ + zfs_oldace_t *oldaclp; + int i; + uint16_t type, iflags; + uint32_t access_mask; + uint64_t who; + void *cookie = NULL; + zfs_acl_node_t *newaclnode; + + ASSERT(aclp->z_version == ZFS_ACL_VERSION_INITIAL); + /* + * First create the ACE in a contiguous piece of memory + * for zfs_copy_ace_2_fuid(). + * + * We only convert an ACL once, so this won't happen + * everytime. + */ + oldaclp = kmem_alloc(sizeof (zfs_oldace_t) * aclp->z_acl_count, + KM_SLEEP); + i = 0; + while (cookie = zfs_acl_next_ace(aclp, cookie, &who, + &access_mask, &iflags, &type)) { + oldaclp[i].z_flags = iflags; + oldaclp[i].z_type = type; + oldaclp[i].z_fuid = who; + oldaclp[i++].z_access_mask = access_mask; + } + + newaclnode = zfs_acl_node_alloc(aclp->z_acl_count * + sizeof (zfs_object_ace_t)); + aclp->z_ops = zfs_acl_fuid_ops; + VERIFY(zfs_copy_ace_2_fuid(zp->z_zfsvfs, ZTOV(zp)->v_type, aclp, + oldaclp, newaclnode->z_acldata, aclp->z_acl_count, + &newaclnode->z_size, NULL, cr) == 0); + newaclnode->z_ace_count = aclp->z_acl_count; + aclp->z_version = ZFS_ACL_VERSION; + kmem_free(oldaclp, aclp->z_acl_count * sizeof (zfs_oldace_t)); + + /* + * Release all previous ACL nodes + */ + + zfs_acl_release_nodes(aclp); + + list_insert_head(&aclp->z_acl, newaclnode); + + aclp->z_acl_bytes = newaclnode->z_size; + aclp->z_acl_count = newaclnode->z_ace_count; + +} + +/* + * Convert unix access mask to v4 access mask + */ +static uint32_t +zfs_unix_to_v4(uint32_t access_mask) +{ + uint32_t new_mask = 0; + + if (access_mask & S_IXOTH) + new_mask |= ACE_EXECUTE; + if (access_mask & S_IWOTH) + new_mask |= ACE_WRITE_DATA; + if (access_mask & S_IROTH) + new_mask |= ACE_READ_DATA; + return (new_mask); +} + +static void +zfs_set_ace(zfs_acl_t *aclp, void *acep, uint32_t access_mask, + uint16_t access_type, uint64_t fuid, uint16_t entry_type) +{ + uint16_t type = entry_type & ACE_TYPE_FLAGS; + + aclp->z_ops.ace_mask_set(acep, access_mask); + aclp->z_ops.ace_type_set(acep, access_type); + aclp->z_ops.ace_flags_set(acep, entry_type); + if ((type != ACE_OWNER && type != OWNING_GROUP && + type != ACE_EVERYONE)) + aclp->z_ops.ace_who_set(acep, fuid); +} + +/* + * Determine mode of file based on ACL. 
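+ *
+ * Each mode bit is decided by the first ACE that mentions it (tracked
+ * in the "seen" mask).  For example, with the hypothetical ACL
+ * owner@:read_data/write_data:allow,everyone@:read_data:deny, the
+ * owner ends up with rw- and the later everyone@ deny cannot take
+ * those bits away again.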
+ */ +uint64_t +zfs_mode_compute(uint64_t fmode, zfs_acl_t *aclp, + uint64_t *pflags, uint64_t fuid, uint64_t fgid) +{ + int entry_type; + mode_t mode; + mode_t seen = 0; + zfs_ace_hdr_t *acep = NULL; + uint64_t who; + uint16_t iflags, type; + uint32_t access_mask; + boolean_t an_exec_denied = B_FALSE; + + mode = (fmode & (S_IFMT | S_ISUID | S_ISGID | S_ISVTX)); + + while (acep = zfs_acl_next_ace(aclp, acep, &who, + &access_mask, &iflags, &type)) { + + if (!zfs_acl_valid_ace_type(type, iflags)) + continue; + + entry_type = (iflags & ACE_TYPE_FLAGS); + + /* + * Skip over any inherit_only ACEs + */ + if (iflags & ACE_INHERIT_ONLY_ACE) + continue; + + if (entry_type == ACE_OWNER || (entry_type == 0 && + who == fuid)) { + if ((access_mask & ACE_READ_DATA) && + (!(seen & S_IRUSR))) { + seen |= S_IRUSR; + if (type == ALLOW) { + mode |= S_IRUSR; + } + } + if ((access_mask & ACE_WRITE_DATA) && + (!(seen & S_IWUSR))) { + seen |= S_IWUSR; + if (type == ALLOW) { + mode |= S_IWUSR; + } + } + if ((access_mask & ACE_EXECUTE) && + (!(seen & S_IXUSR))) { + seen |= S_IXUSR; + if (type == ALLOW) { + mode |= S_IXUSR; + } + } + } else if (entry_type == OWNING_GROUP || + (entry_type == ACE_IDENTIFIER_GROUP && who == fgid)) { + if ((access_mask & ACE_READ_DATA) && + (!(seen & S_IRGRP))) { + seen |= S_IRGRP; + if (type == ALLOW) { + mode |= S_IRGRP; + } + } + if ((access_mask & ACE_WRITE_DATA) && + (!(seen & S_IWGRP))) { + seen |= S_IWGRP; + if (type == ALLOW) { + mode |= S_IWGRP; + } + } + if ((access_mask & ACE_EXECUTE) && + (!(seen & S_IXGRP))) { + seen |= S_IXGRP; + if (type == ALLOW) { + mode |= S_IXGRP; + } + } + } else if (entry_type == ACE_EVERYONE) { + if ((access_mask & ACE_READ_DATA)) { + if (!(seen & S_IRUSR)) { + seen |= S_IRUSR; + if (type == ALLOW) { + mode |= S_IRUSR; + } + } + if (!(seen & S_IRGRP)) { + seen |= S_IRGRP; + if (type == ALLOW) { + mode |= S_IRGRP; + } + } + if (!(seen & S_IROTH)) { + seen |= S_IROTH; + if (type == ALLOW) { + mode |= S_IROTH; + } + } + } + if ((access_mask & ACE_WRITE_DATA)) { + if (!(seen & S_IWUSR)) { + seen |= S_IWUSR; + if (type == ALLOW) { + mode |= S_IWUSR; + } + } + if (!(seen & S_IWGRP)) { + seen |= S_IWGRP; + if (type == ALLOW) { + mode |= S_IWGRP; + } + } + if (!(seen & S_IWOTH)) { + seen |= S_IWOTH; + if (type == ALLOW) { + mode |= S_IWOTH; + } + } + } + if ((access_mask & ACE_EXECUTE)) { + if (!(seen & S_IXUSR)) { + seen |= S_IXUSR; + if (type == ALLOW) { + mode |= S_IXUSR; + } + } + if (!(seen & S_IXGRP)) { + seen |= S_IXGRP; + if (type == ALLOW) { + mode |= S_IXGRP; + } + } + if (!(seen & S_IXOTH)) { + seen |= S_IXOTH; + if (type == ALLOW) { + mode |= S_IXOTH; + } + } + } + } else { + /* + * Only care if this IDENTIFIER_GROUP or + * USER ACE denies execute access to someone, + * mode is not affected + */ + if ((access_mask & ACE_EXECUTE) && type == DENY) + an_exec_denied = B_TRUE; + } + } + + /* + * Failure to allow is effectively a deny, so execute permission + * is denied if it was never mentioned or if we explicitly + * weren't allowed it. + */ + if (!an_exec_denied && + ((seen & ALL_MODE_EXECS) != ALL_MODE_EXECS || + (mode & ALL_MODE_EXECS) != ALL_MODE_EXECS)) + an_exec_denied = B_TRUE; + + if (an_exec_denied) + *pflags &= ~ZFS_NO_EXECS_DENIED; + else + *pflags |= ZFS_NO_EXECS_DENIED; + + return (mode); +} + +/* + * Read an external acl object. If the intent is to modify, always + * create a new acl and leave any cached acl in place. 
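+ * Readers share the copy cached in zp->z_acl_cached; a caller that
+ * will modify the ACL gets a private copy, so a failed or in-progress
+ * update never disturbs what other threads see.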
+ */ +static int +zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify) +{ + zfs_acl_t *aclp; + int aclsize; + int acl_count; + zfs_acl_node_t *aclnode; + zfs_acl_phys_t znode_acl; + int version; + int error; + + ASSERT(MUTEX_HELD(&zp->z_acl_lock)); + ASSERT_VOP_LOCKED(ZTOV(zp), __func__); + + if (zp->z_acl_cached && !will_modify) { + *aclpp = zp->z_acl_cached; + return (0); + } + + version = zfs_znode_acl_version(zp); + + if ((error = zfs_acl_znode_info(zp, &aclsize, + &acl_count, &znode_acl)) != 0) { + goto done; + } + + aclp = zfs_acl_alloc(version); + + aclp->z_acl_count = acl_count; + aclp->z_acl_bytes = aclsize; + + aclnode = zfs_acl_node_alloc(aclsize); + aclnode->z_ace_count = aclp->z_acl_count; + aclnode->z_size = aclsize; + + if (!zp->z_is_sa) { + if (znode_acl.z_acl_extern_obj) { + error = dmu_read(zp->z_zfsvfs->z_os, + znode_acl.z_acl_extern_obj, 0, aclnode->z_size, + aclnode->z_acldata, DMU_READ_PREFETCH); + } else { + bcopy(znode_acl.z_ace_data, aclnode->z_acldata, + aclnode->z_size); + } + } else { + error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_ACES(zp->z_zfsvfs), + aclnode->z_acldata, aclnode->z_size); + } + + if (error != 0) { + zfs_acl_free(aclp); + zfs_acl_node_free(aclnode); + /* convert checksum errors into IO errors */ + if (error == ECKSUM) + error = SET_ERROR(EIO); + goto done; + } + + list_insert_head(&aclp->z_acl, aclnode); + + *aclpp = aclp; + if (!will_modify) + zp->z_acl_cached = aclp; +done: + return (error); +} + +/*ARGSUSED*/ +void +zfs_acl_data_locator(void **dataptr, uint32_t *length, uint32_t buflen, + boolean_t start, void *userdata) +{ + zfs_acl_locator_cb_t *cb = (zfs_acl_locator_cb_t *)userdata; + + if (start) { + cb->cb_acl_node = list_head(&cb->cb_aclp->z_acl); + } else { + cb->cb_acl_node = list_next(&cb->cb_aclp->z_acl, + cb->cb_acl_node); + } + *dataptr = cb->cb_acl_node->z_acldata; + *length = cb->cb_acl_node->z_size; +} + +int +zfs_acl_chown_setattr(znode_t *zp) +{ + int error; + zfs_acl_t *aclp; + + ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); + ASSERT(MUTEX_HELD(&zp->z_acl_lock)); + + if ((error = zfs_acl_node_read(zp, &aclp, B_FALSE)) == 0) + zp->z_mode = zfs_mode_compute(zp->z_mode, aclp, + &zp->z_pflags, zp->z_uid, zp->z_gid); + return (error); +} + +/* + * common code for setting ACLs. + * + * This function is called from zfs_mode_update, zfs_perm_init, and zfs_setacl. + * zfs_setacl passes a non-NULL inherit pointer (ihp) to indicate that it's + * already checked the acl and knows whether to inherit. + */ +int +zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx) +{ + int error; + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + dmu_object_type_t otype; + zfs_acl_locator_cb_t locate = { 0 }; + uint64_t mode; + sa_bulk_attr_t bulk[5]; + uint64_t ctime[2]; + int count = 0; + zfs_acl_phys_t acl_phys; + + mode = zp->z_mode; + + mode = zfs_mode_compute(mode, aclp, &zp->z_pflags, + zp->z_uid, zp->z_gid); + + zp->z_mode = mode; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, + &mode, sizeof (mode)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, sizeof (zp->z_pflags)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + &ctime, sizeof (ctime)); + + if (zp->z_acl_cached) { + zfs_acl_free(zp->z_acl_cached); + zp->z_acl_cached = NULL; + } + + /* + * Upgrade needed? 
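+	 * If the filesystem understands FUIDs but the ACL is still in
+	 * the initial (pre-FUID) layout, transform it via
+	 * zfs_acl_xform() before choosing the object type to write.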
+ */ + if (!zfsvfs->z_use_fuids) { + otype = DMU_OT_OLDACL; + } else { + if ((aclp->z_version == ZFS_ACL_VERSION_INITIAL) && + (zfsvfs->z_version >= ZPL_VERSION_FUID)) + zfs_acl_xform(zp, aclp, cr); + ASSERT(aclp->z_version >= ZFS_ACL_VERSION_FUID); + otype = DMU_OT_ACL; + } + + /* + * Arrgh, we have to handle old on disk format + * as well as newer (preferred) SA format. + */ + + if (zp->z_is_sa) { /* the easy case, just update the ACL attribute */ + locate.cb_aclp = aclp; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_ACES(zfsvfs), + zfs_acl_data_locator, &locate, aclp->z_acl_bytes); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_COUNT(zfsvfs), + NULL, &aclp->z_acl_count, sizeof (uint64_t)); + } else { /* Painful legacy way */ + zfs_acl_node_t *aclnode; + uint64_t off = 0; + uint64_t aoid; + + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs), + &acl_phys, sizeof (acl_phys))) != 0) + return (error); + + aoid = acl_phys.z_acl_extern_obj; + + if (aclp->z_acl_bytes > ZFS_ACE_SPACE) { + /* + * If ACL was previously external and we are now + * converting to new ACL format then release old + * ACL object and create a new one. + */ + if (aoid && + aclp->z_version != acl_phys.z_acl_version) { + error = dmu_object_free(zfsvfs->z_os, aoid, tx); + if (error) + return (error); + aoid = 0; + } + if (aoid == 0) { + aoid = dmu_object_alloc(zfsvfs->z_os, + otype, aclp->z_acl_bytes, + otype == DMU_OT_ACL ? + DMU_OT_SYSACL : DMU_OT_NONE, + otype == DMU_OT_ACL ? + DN_OLD_MAX_BONUSLEN : 0, tx); + } else { + (void) dmu_object_set_blocksize(zfsvfs->z_os, + aoid, aclp->z_acl_bytes, 0, tx); + } + acl_phys.z_acl_extern_obj = aoid; + for (aclnode = list_head(&aclp->z_acl); aclnode; + aclnode = list_next(&aclp->z_acl, aclnode)) { + if (aclnode->z_ace_count == 0) + continue; + dmu_write(zfsvfs->z_os, aoid, off, + aclnode->z_size, aclnode->z_acldata, tx); + off += aclnode->z_size; + } + } else { + void *start = acl_phys.z_ace_data; + /* + * Migrating back embedded? + */ + if (acl_phys.z_acl_extern_obj) { + error = dmu_object_free(zfsvfs->z_os, + acl_phys.z_acl_extern_obj, tx); + if (error) + return (error); + acl_phys.z_acl_extern_obj = 0; + } + + for (aclnode = list_head(&aclp->z_acl); aclnode; + aclnode = list_next(&aclp->z_acl, aclnode)) { + if (aclnode->z_ace_count == 0) + continue; + bcopy(aclnode->z_acldata, start, + aclnode->z_size); + start = (caddr_t)start + aclnode->z_size; + } + } + /* + * If Old version then swap count/bytes to match old + * layout of znode_acl_phys_t. + */ + if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) { + acl_phys.z_acl_size = aclp->z_acl_count; + acl_phys.z_acl_count = aclp->z_acl_bytes; + } else { + acl_phys.z_acl_size = aclp->z_acl_bytes; + acl_phys.z_acl_count = aclp->z_acl_count; + } + acl_phys.z_acl_version = aclp->z_version; + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ZNODE_ACL(zfsvfs), NULL, + &acl_phys, sizeof (acl_phys)); + } + + /* + * Replace ACL wide bits, but first clear them. 
+ */ + zp->z_pflags &= ~ZFS_ACL_WIDE_FLAGS; + + zp->z_pflags |= aclp->z_hints; + + if (ace_trivial_common(aclp, 0, zfs_ace_walk) == 0) + zp->z_pflags |= ZFS_ACL_TRIVIAL; + + zfs_tstamp_update_setup(zp, STATE_CHANGED, NULL, ctime, B_TRUE); + return (sa_bulk_update(zp->z_sa_hdl, bulk, count, tx)); +} + +static void +zfs_acl_chmod(vtype_t vtype, uint64_t mode, boolean_t split, boolean_t trim, + zfs_acl_t *aclp) +{ + void *acep = NULL; + uint64_t who; + int new_count, new_bytes; + int ace_size; + int entry_type; + uint16_t iflags, type; + uint32_t access_mask; + zfs_acl_node_t *newnode; + size_t abstract_size = aclp->z_ops.ace_abstract_size(); + void *zacep; + boolean_t isdir; + trivial_acl_t masks; + + new_count = new_bytes = 0; + + isdir = (vtype == VDIR); + + acl_trivial_access_masks((mode_t)mode, isdir, &masks); + + newnode = zfs_acl_node_alloc((abstract_size * 6) + aclp->z_acl_bytes); + + zacep = newnode->z_acldata; + if (masks.allow0) { + zfs_set_ace(aclp, zacep, masks.allow0, ALLOW, -1, ACE_OWNER); + zacep = (void *)((uintptr_t)zacep + abstract_size); + new_count++; + new_bytes += abstract_size; + } + if (masks.deny1) { + zfs_set_ace(aclp, zacep, masks.deny1, DENY, -1, ACE_OWNER); + zacep = (void *)((uintptr_t)zacep + abstract_size); + new_count++; + new_bytes += abstract_size; + } + if (masks.deny2) { + zfs_set_ace(aclp, zacep, masks.deny2, DENY, -1, OWNING_GROUP); + zacep = (void *)((uintptr_t)zacep + abstract_size); + new_count++; + new_bytes += abstract_size; + } + + while (acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask, + &iflags, &type)) { + entry_type = (iflags & ACE_TYPE_FLAGS); + /* + * ACEs used to represent the file mode may be divided + * into an equivalent pair of inherit-only and regular + * ACEs, if they are inheritable. + * Skip regular ACEs, which are replaced by the new mode. + */ + if (split && (entry_type == ACE_OWNER || + entry_type == OWNING_GROUP || + entry_type == ACE_EVERYONE)) { + if (!isdir || !(iflags & + (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE))) + continue; + /* + * We preserve owner@, group@, or @everyone + * permissions, if they are inheritable, by + * copying them to inherit_only ACEs. This + * prevents inheritable permissions from being + * altered along with the file mode. + */ + iflags |= ACE_INHERIT_ONLY_ACE; + } + + /* + * If this ACL has any inheritable ACEs, mark that in + * the hints (which are later masked into the pflags) + * so create knows to do inheritance. + */ + if (isdir && (iflags & + (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE))) + aclp->z_hints |= ZFS_INHERIT_ACE; + + if ((type != ALLOW && type != DENY) || + (iflags & ACE_INHERIT_ONLY_ACE)) { + switch (type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + aclp->z_hints |= ZFS_ACL_OBJ_ACE; + break; + } + } else { + /* + * Limit permissions granted by ACEs to be no greater + * than permissions of the requested group mode. + * Applies when the "aclmode" property is set to + * "groupmask". 
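+			 * For example (hypothetical entry): with a
+			 * requested mode of 0750, an inherited
+			 * user:joe:rwx:allow ACE is masked down to the
+			 * group permissions, leaving joe with r-x at
+			 * most.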
+ */ + if ((type == ALLOW) && trim) + access_mask &= masks.group; + } + zfs_set_ace(aclp, zacep, access_mask, type, who, iflags); + ace_size = aclp->z_ops.ace_size(acep); + zacep = (void *)((uintptr_t)zacep + ace_size); + new_count++; + new_bytes += ace_size; + } + zfs_set_ace(aclp, zacep, masks.owner, ALLOW, -1, ACE_OWNER); + zacep = (void *)((uintptr_t)zacep + abstract_size); + zfs_set_ace(aclp, zacep, masks.group, ALLOW, -1, OWNING_GROUP); + zacep = (void *)((uintptr_t)zacep + abstract_size); + zfs_set_ace(aclp, zacep, masks.everyone, ALLOW, -1, ACE_EVERYONE); + + new_count += 3; + new_bytes += abstract_size * 3; + zfs_acl_release_nodes(aclp); + aclp->z_acl_count = new_count; + aclp->z_acl_bytes = new_bytes; + newnode->z_ace_count = new_count; + newnode->z_size = new_bytes; + list_insert_tail(&aclp->z_acl, newnode); +} + +int +zfs_acl_chmod_setattr(znode_t *zp, zfs_acl_t **aclp, uint64_t mode) +{ + int error = 0; + + mutex_enter(&zp->z_acl_lock); + ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); + if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_DISCARD) + *aclp = zfs_acl_alloc(zfs_acl_version_zp(zp)); + else + error = zfs_acl_node_read(zp, aclp, B_TRUE); + + if (error == 0) { + (*aclp)->z_hints = zp->z_pflags & V4_ACL_WIDE_FLAGS; + zfs_acl_chmod(ZTOV(zp)->v_type, mode, B_TRUE, + (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK), *aclp); + } + mutex_exit(&zp->z_acl_lock); + + return (error); +} + +/* + * Should ACE be inherited? + */ +static int +zfs_ace_can_use(vtype_t vtype, uint16_t acep_flags) +{ + int iflags = (acep_flags & 0xf); + + if ((vtype == VDIR) && (iflags & ACE_DIRECTORY_INHERIT_ACE)) + return (1); + else if (iflags & ACE_FILE_INHERIT_ACE) + return (!((vtype == VDIR) && + (iflags & ACE_NO_PROPAGATE_INHERIT_ACE))); + return (0); +} + +/* + * inherit inheritable ACEs from parent + */ +static zfs_acl_t * +zfs_acl_inherit(zfsvfs_t *zfsvfs, vtype_t vtype, zfs_acl_t *paclp, + uint64_t mode, boolean_t *need_chmod) +{ + void *pacep = NULL; + void *acep; + zfs_acl_node_t *aclnode; + zfs_acl_t *aclp = NULL; + uint64_t who; + uint32_t access_mask; + uint16_t iflags, newflags, type; + size_t ace_size; + void *data1, *data2; + size_t data1sz, data2sz; + uint_t aclinherit; + boolean_t isdir = (vtype == VDIR); + boolean_t isreg = (vtype == VREG); + + *need_chmod = B_TRUE; + + aclp = zfs_acl_alloc(paclp->z_version); + aclinherit = zfsvfs->z_acl_inherit; + if (aclinherit == ZFS_ACL_DISCARD || vtype == VLNK) + return (aclp); + + while (pacep = zfs_acl_next_ace(paclp, pacep, &who, + &access_mask, &iflags, &type)) { + + /* + * don't inherit bogus ACEs + */ + if (!zfs_acl_valid_ace_type(type, iflags)) + continue; + + /* + * Check if ACE is inheritable by this vnode + */ + if ((aclinherit == ZFS_ACL_NOALLOW && type == ALLOW) || + !zfs_ace_can_use(vtype, iflags)) + continue; + + /* + * If owner@, group@, or everyone@ inheritable + * then zfs_acl_chmod() isn't needed. 
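+		 * The new object then takes its permissions entirely
+		 * from the inherited entries instead of having
+		 * mode-derived ACEs appended.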
+ */ + if ((aclinherit == ZFS_ACL_PASSTHROUGH || + aclinherit == ZFS_ACL_PASSTHROUGH_X) && + ((iflags & (ACE_OWNER|ACE_EVERYONE)) || + ((iflags & OWNING_GROUP) == OWNING_GROUP)) && + (isreg || (isdir && (iflags & ACE_DIRECTORY_INHERIT_ACE)))) + *need_chmod = B_FALSE; + + /* + * Strip inherited execute permission from file if + * not in mode + */ + if (aclinherit == ZFS_ACL_PASSTHROUGH_X && type == ALLOW && + !isdir && ((mode & (S_IXUSR|S_IXGRP|S_IXOTH)) == 0)) { + access_mask &= ~ACE_EXECUTE; + } + + /* + * Strip write_acl and write_owner from permissions + * when inheriting an ACE + */ + if (aclinherit == ZFS_ACL_RESTRICTED && type == ALLOW) { + access_mask &= ~RESTRICTED_CLEAR; + } + + ace_size = aclp->z_ops.ace_size(pacep); + aclnode = zfs_acl_node_alloc(ace_size); + list_insert_tail(&aclp->z_acl, aclnode); + acep = aclnode->z_acldata; + + zfs_set_ace(aclp, acep, access_mask, type, + who, iflags|ACE_INHERITED_ACE); + + /* + * Copy special opaque data if any + */ + if ((data1sz = paclp->z_ops.ace_data(pacep, &data1)) != 0) { + VERIFY((data2sz = aclp->z_ops.ace_data(acep, + &data2)) == data1sz); + bcopy(data1, data2, data2sz); + } + + aclp->z_acl_count++; + aclnode->z_ace_count++; + aclp->z_acl_bytes += aclnode->z_size; + newflags = aclp->z_ops.ace_flags_get(acep); + + /* + * If ACE is not to be inherited further, or if the vnode is + * not a directory, remove all inheritance flags + */ + if (!isdir || (iflags & ACE_NO_PROPAGATE_INHERIT_ACE)) { + newflags &= ~ALL_INHERIT; + aclp->z_ops.ace_flags_set(acep, + newflags|ACE_INHERITED_ACE); + continue; + } + + /* + * This directory has an inheritable ACE + */ + aclp->z_hints |= ZFS_INHERIT_ACE; + + /* + * If only FILE_INHERIT is set then turn on + * inherit_only + */ + if ((iflags & (ACE_FILE_INHERIT_ACE | + ACE_DIRECTORY_INHERIT_ACE)) == ACE_FILE_INHERIT_ACE) { + newflags |= ACE_INHERIT_ONLY_ACE; + aclp->z_ops.ace_flags_set(acep, + newflags|ACE_INHERITED_ACE); + } else { + newflags &= ~ACE_INHERIT_ONLY_ACE; + aclp->z_ops.ace_flags_set(acep, + newflags|ACE_INHERITED_ACE); + } + } + + return (aclp); +} + +/* + * Create file system object initial permissions + * including inheritable ACEs. + * Also, create FUIDs for owner and group. + */ +int +zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr, + vsecattr_t *vsecp, zfs_acl_ids_t *acl_ids) +{ + int error; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + zfs_acl_t *paclp; + gid_t gid; + boolean_t need_chmod = B_TRUE; + boolean_t trim = B_FALSE; + boolean_t inherited = B_FALSE; + + if ((flag & IS_ROOT_NODE) == 0) + ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__); + else + ASSERT(dzp->z_vnode == NULL); + bzero(acl_ids, sizeof (zfs_acl_ids_t)); + acl_ids->z_mode = MAKEIMODE(vap->va_type, vap->va_mode); + + if (vsecp) + if ((error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, cr, + &acl_ids->z_fuidp, &acl_ids->z_aclp)) != 0) + return (error); + /* + * Determine uid and gid. 
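+	 * During replay, for the root node, and for extended attribute
+	 * directories the IDs come straight from the supplied vattr;
+	 * otherwise they are derived from the caller's credentials
+	 * below.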
+ */ + if ((flag & IS_ROOT_NODE) || zfsvfs->z_replay || + ((flag & IS_XATTR) && (vap->va_type == VDIR))) { + acl_ids->z_fuid = zfs_fuid_create(zfsvfs, + (uint64_t)vap->va_uid, cr, + ZFS_OWNER, &acl_ids->z_fuidp); + acl_ids->z_fgid = zfs_fuid_create(zfsvfs, + (uint64_t)vap->va_gid, cr, + ZFS_GROUP, &acl_ids->z_fuidp); + gid = vap->va_gid; + } else { + acl_ids->z_fuid = zfs_fuid_create_cred(zfsvfs, ZFS_OWNER, + cr, &acl_ids->z_fuidp); + acl_ids->z_fgid = 0; + if (vap->va_mask & AT_GID) { + acl_ids->z_fgid = zfs_fuid_create(zfsvfs, + (uint64_t)vap->va_gid, + cr, ZFS_GROUP, &acl_ids->z_fuidp); + gid = vap->va_gid; + if (acl_ids->z_fgid != dzp->z_gid && + !groupmember(vap->va_gid, cr) && + secpolicy_vnode_create_gid(cr) != 0) + acl_ids->z_fgid = 0; + } + if (acl_ids->z_fgid == 0) { +#ifndef __FreeBSD_kernel__ + if (dzp->z_mode & S_ISGID) { +#endif + char *domain; + uint32_t rid; + + acl_ids->z_fgid = dzp->z_gid; + gid = zfs_fuid_map_id(zfsvfs, acl_ids->z_fgid, + cr, ZFS_GROUP); + + if (zfsvfs->z_use_fuids && + IS_EPHEMERAL(acl_ids->z_fgid)) { + domain = zfs_fuid_idx_domain( + &zfsvfs->z_fuid_idx, + FUID_INDEX(acl_ids->z_fgid)); + rid = FUID_RID(acl_ids->z_fgid); + zfs_fuid_node_add(&acl_ids->z_fuidp, + domain, rid, + FUID_INDEX(acl_ids->z_fgid), + acl_ids->z_fgid, ZFS_GROUP); + } +#ifndef __FreeBSD_kernel__ + } else { + acl_ids->z_fgid = zfs_fuid_create_cred(zfsvfs, + ZFS_GROUP, cr, &acl_ids->z_fuidp); + gid = crgetgid(cr); + } +#endif + } + } + + /* + * If we're creating a directory, and the parent directory has the + * set-GID bit set, set in on the new directory. + * Otherwise, if the user is neither privileged nor a member of the + * file's new group, clear the file's set-GID bit. + */ + + if (!(flag & IS_ROOT_NODE) && (dzp->z_mode & S_ISGID) && + (vap->va_type == VDIR)) { + acl_ids->z_mode |= S_ISGID; + } else { + if ((acl_ids->z_mode & S_ISGID) && + secpolicy_vnode_setids_setgids(ZTOV(dzp), cr, gid) != 0) + acl_ids->z_mode &= ~S_ISGID; + } + + if (acl_ids->z_aclp == NULL) { + mutex_enter(&dzp->z_acl_lock); + if (!(flag & IS_ROOT_NODE) && + (dzp->z_pflags & ZFS_INHERIT_ACE) && + !(dzp->z_pflags & ZFS_XATTR)) { + VERIFY(0 == zfs_acl_node_read(dzp, &paclp, B_FALSE)); + acl_ids->z_aclp = zfs_acl_inherit(zfsvfs, + vap->va_type, paclp, acl_ids->z_mode, &need_chmod); + inherited = B_TRUE; + } else { + acl_ids->z_aclp = + zfs_acl_alloc(zfs_acl_version_zp(dzp)); + acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL; + } + mutex_exit(&dzp->z_acl_lock); + + if (need_chmod) { + if (vap->va_type == VDIR) + acl_ids->z_aclp->z_hints |= + ZFS_ACL_AUTO_INHERIT; + + if (zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK && + zfsvfs->z_acl_inherit != ZFS_ACL_PASSTHROUGH && + zfsvfs->z_acl_inherit != ZFS_ACL_PASSTHROUGH_X) + trim = B_TRUE; + zfs_acl_chmod(vap->va_type, acl_ids->z_mode, B_FALSE, + trim, acl_ids->z_aclp); + } + } + + if (inherited || vsecp) { + acl_ids->z_mode = zfs_mode_compute(acl_ids->z_mode, + acl_ids->z_aclp, &acl_ids->z_aclp->z_hints, + acl_ids->z_fuid, acl_ids->z_fgid); + if (ace_trivial_common(acl_ids->z_aclp, 0, zfs_ace_walk) == 0) + acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL; + } + + return (0); +} + +/* + * Free ACL and fuid_infop, but not the acl_ids structure + */ +void +zfs_acl_ids_free(zfs_acl_ids_t *acl_ids) +{ + if (acl_ids->z_aclp) + zfs_acl_free(acl_ids->z_aclp); + if (acl_ids->z_fuidp) + zfs_fuid_info_free(acl_ids->z_fuidp); + acl_ids->z_aclp = NULL; + acl_ids->z_fuidp = NULL; +} + +boolean_t +zfs_acl_ids_overquota(zfsvfs_t *zfsvfs, zfs_acl_ids_t *acl_ids) +{ + return 
(zfs_fuid_overquota(zfsvfs, B_FALSE, acl_ids->z_fuid) || + zfs_fuid_overquota(zfsvfs, B_TRUE, acl_ids->z_fgid)); +} + +/* + * Retrieve a file's ACL + */ +int +zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) +{ + zfs_acl_t *aclp; + ulong_t mask; + int error; + int count = 0; + int largeace = 0; + + mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT | + VSA_ACE_ACLFLAGS | VSA_ACE_ALLTYPES); + + if (mask == 0) + return (SET_ERROR(ENOSYS)); + + if (error = zfs_zaccess(zp, ACE_READ_ACL, 0, skipaclchk, cr)) + return (error); + + mutex_enter(&zp->z_acl_lock); + + ASSERT_VOP_LOCKED(ZTOV(zp), __func__); + error = zfs_acl_node_read(zp, &aclp, B_FALSE); + if (error != 0) { + mutex_exit(&zp->z_acl_lock); + return (error); + } + + /* + * Scan ACL to determine number of ACEs + */ + if ((zp->z_pflags & ZFS_ACL_OBJ_ACE) && !(mask & VSA_ACE_ALLTYPES)) { + void *zacep = NULL; + uint64_t who; + uint32_t access_mask; + uint16_t type, iflags; + + while (zacep = zfs_acl_next_ace(aclp, zacep, + &who, &access_mask, &iflags, &type)) { + switch (type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + largeace++; + continue; + default: + count++; + } + } + vsecp->vsa_aclcnt = count; + } else + count = (int)aclp->z_acl_count; + + if (mask & VSA_ACECNT) { + vsecp->vsa_aclcnt = count; + } + + if (mask & VSA_ACE) { + size_t aclsz; + + aclsz = count * sizeof (ace_t) + + sizeof (ace_object_t) * largeace; + + vsecp->vsa_aclentp = kmem_alloc(aclsz, KM_SLEEP); + vsecp->vsa_aclentsz = aclsz; + + if (aclp->z_version == ZFS_ACL_VERSION_FUID) + zfs_copy_fuid_2_ace(zp->z_zfsvfs, aclp, cr, + vsecp->vsa_aclentp, !(mask & VSA_ACE_ALLTYPES)); + else { + zfs_acl_node_t *aclnode; + void *start = vsecp->vsa_aclentp; + + for (aclnode = list_head(&aclp->z_acl); aclnode; + aclnode = list_next(&aclp->z_acl, aclnode)) { + bcopy(aclnode->z_acldata, start, + aclnode->z_size); + start = (caddr_t)start + aclnode->z_size; + } + ASSERT((caddr_t)start - (caddr_t)vsecp->vsa_aclentp == + aclp->z_acl_bytes); + } + } + if (mask & VSA_ACE_ACLFLAGS) { + vsecp->vsa_aclflags = 0; + if (zp->z_pflags & ZFS_ACL_DEFAULTED) + vsecp->vsa_aclflags |= ACL_DEFAULTED; + if (zp->z_pflags & ZFS_ACL_PROTECTED) + vsecp->vsa_aclflags |= ACL_PROTECTED; + if (zp->z_pflags & ZFS_ACL_AUTO_INHERIT) + vsecp->vsa_aclflags |= ACL_AUTO_INHERIT; + } + + mutex_exit(&zp->z_acl_lock); + + return (0); +} + +int +zfs_vsec_2_aclp(zfsvfs_t *zfsvfs, vtype_t obj_type, + vsecattr_t *vsecp, cred_t *cr, zfs_fuid_info_t **fuidp, zfs_acl_t **zaclp) +{ + zfs_acl_t *aclp; + zfs_acl_node_t *aclnode; + int aclcnt = vsecp->vsa_aclcnt; + int error; + + if (vsecp->vsa_aclcnt > MAX_ACL_ENTRIES || vsecp->vsa_aclcnt <= 0) + return (SET_ERROR(EINVAL)); + + aclp = zfs_acl_alloc(zfs_acl_version(zfsvfs->z_version)); + + aclp->z_hints = 0; + aclnode = zfs_acl_node_alloc(aclcnt * sizeof (zfs_object_ace_t)); + if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) { + if ((error = zfs_copy_ace_2_oldace(obj_type, aclp, + (ace_t *)vsecp->vsa_aclentp, aclnode->z_acldata, + aclcnt, &aclnode->z_size)) != 0) { + zfs_acl_free(aclp); + zfs_acl_node_free(aclnode); + return (error); + } + } else { + if ((error = zfs_copy_ace_2_fuid(zfsvfs, obj_type, aclp, + vsecp->vsa_aclentp, aclnode->z_acldata, aclcnt, + &aclnode->z_size, fuidp, cr)) != 0) { + zfs_acl_free(aclp); + zfs_acl_node_free(aclnode); + return (error); + } + } + aclp->z_acl_bytes = aclnode->z_size; + aclnode->z_ace_count = 
aclcnt; + aclp->z_acl_count = aclcnt; + list_insert_head(&aclp->z_acl, aclnode); + + /* + * If flags are being set then add them to z_hints + */ + if (vsecp->vsa_mask & VSA_ACE_ACLFLAGS) { + if (vsecp->vsa_aclflags & ACL_PROTECTED) + aclp->z_hints |= ZFS_ACL_PROTECTED; + if (vsecp->vsa_aclflags & ACL_DEFAULTED) + aclp->z_hints |= ZFS_ACL_DEFAULTED; + if (vsecp->vsa_aclflags & ACL_AUTO_INHERIT) + aclp->z_hints |= ZFS_ACL_AUTO_INHERIT; + } + + *zaclp = aclp; + + return (0); +} + +/* + * Set a file's ACL + */ +int +zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + zilog_t *zilog = zfsvfs->z_log; + ulong_t mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT); + dmu_tx_t *tx; + int error; + zfs_acl_t *aclp; + zfs_fuid_info_t *fuidp = NULL; + boolean_t fuid_dirtied; + uint64_t acl_obj; + + ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); + if (mask == 0) + return (SET_ERROR(ENOSYS)); + + if (zp->z_pflags & ZFS_IMMUTABLE) + return (SET_ERROR(EPERM)); + + if (error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr)) + return (error); + + error = zfs_vsec_2_aclp(zfsvfs, ZTOV(zp)->v_type, vsecp, cr, &fuidp, + &aclp); + if (error) + return (error); + + /* + * If ACL wide flags aren't being set then preserve any + * existing flags. + */ + if (!(vsecp->vsa_mask & VSA_ACE_ACLFLAGS)) { + aclp->z_hints |= + (zp->z_pflags & V4_ACL_WIDE_FLAGS); + } +top: + mutex_enter(&zp->z_acl_lock); + + tx = dmu_tx_create(zfsvfs->z_os); + + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + + /* + * If old version and ACL won't fit in bonus and we aren't + * upgrading then take out necessary DMU holds + */ + + if ((acl_obj = zfs_external_acl(zp)) != 0) { + if (zfsvfs->z_version >= ZPL_VERSION_FUID && + zfs_znode_acl_version(zp) <= ZFS_ACL_VERSION_INITIAL) { + dmu_tx_hold_free(tx, acl_obj, 0, + DMU_OBJECT_END); + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, + aclp->z_acl_bytes); + } else { + dmu_tx_hold_write(tx, acl_obj, 0, aclp->z_acl_bytes); + } + } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); + } + + zfs_sa_upgrade_txholds(tx, zp); + error = dmu_tx_assign(tx, TXG_NOWAIT); + if (error) { + mutex_exit(&zp->z_acl_lock); + + if (error == ERESTART) { + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } + dmu_tx_abort(tx); + zfs_acl_free(aclp); + return (error); + } + + error = zfs_aclset_common(zp, aclp, cr, tx); + ASSERT(error == 0); + ASSERT(zp->z_acl_cached == NULL); + zp->z_acl_cached = aclp; + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + + zfs_log_acl(zilog, tx, zp, vsecp, fuidp); + + if (fuidp) + zfs_fuid_info_free(fuidp); + dmu_tx_commit(tx); + mutex_exit(&zp->z_acl_lock); + + return (error); +} + +/* + * Check accesses of interest (AoI) against attributes of the dataset + * such as read-only. Returns zero if no AoI conflict with dataset + * attributes, otherwise an appropriate errno is returned. + */ +static int +zfs_zaccess_dataset_check(znode_t *zp, uint32_t v4_mode) +{ + if ((v4_mode & WRITE_MASK) && + (zp->z_zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) && + (!IS_DEVVP(ZTOV(zp)) || + (IS_DEVVP(ZTOV(zp)) && (v4_mode & WRITE_MASK_ATTRS)))) { + return (SET_ERROR(EROFS)); + } + + /* + * Intentionally allow ZFS_READONLY through here. + * See zfs_zaccess_common(). 
+	 */
+	if ((v4_mode & WRITE_MASK_DATA) &&
+	    (zp->z_pflags & ZFS_IMMUTABLE)) {
+		return (SET_ERROR(EPERM));
+	}
+
+#ifdef illumos
+	if ((v4_mode & (ACE_DELETE | ACE_DELETE_CHILD)) &&
+	    (zp->z_pflags & ZFS_NOUNLINK)) {
+		return (SET_ERROR(EPERM));
+	}
+#else
+	/*
+	 * In FreeBSD we allow modifying a directory's contents when
+	 * ZFS_NOUNLINK (sunlnk) is set.  We just don't allow directory
+	 * removal, which is handled in zfs_zaccess_delete().
+	 */
+	if ((v4_mode & ACE_DELETE) &&
+	    (zp->z_pflags & ZFS_NOUNLINK)) {
+		return (EPERM);
+	}
+#endif
+
+	if (((v4_mode & (ACE_READ_DATA|ACE_EXECUTE)) &&
+	    (zp->z_pflags & ZFS_AV_QUARANTINED))) {
+		return (SET_ERROR(EACCES));
+	}
+
+	return (0);
+}
+
+/*
+ * The primary usage of this function is to loop through all of the
+ * ACEs in the znode, determining what accesses of interest (AoI) to
+ * the caller are allowed or denied.  The AoI are expressed as bits in
+ * the working_mode parameter.  As each ACE is processed, bits covered
+ * by that ACE are removed from the working_mode.  This removal
+ * facilitates two things.  The first is that when the working mode is
+ * empty (= 0), we know we've looked at all the AoI.  The second is
+ * that the ACE interpretation rules don't allow a later ACE to undo
+ * something granted or denied by an earlier ACE.  Removing the
+ * discovered access or denial enforces this rule.  At the end of
+ * processing the ACEs, all AoI that were found to be denied are
+ * placed into the working_mode, giving the caller a mask of denied
+ * accesses.  Returns:
+ *	0		if all AoI granted
+ *	EACCES		if the denied mask is non-zero
+ *	other error	if abnormal failure (e.g., IO error)
+ *
+ * A secondary usage of the function is to determine if any of the
+ * AoI are granted.  If an ACE grants any access in
+ * the working_mode, we immediately short circuit out of the function.
+ * This mode is chosen by setting anyaccess to B_TRUE.  The
+ * working_mode is not a denied access mask upon exit if the function
+ * is used in this manner.
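+ *
+ * For example, a request for ACE_READ_DATA|ACE_WRITE_DATA checked
+ * against a (hypothetical) ACL of
+ * user:joe:read_data:allow,user:joe:write_data:deny, with joe as the
+ * caller, clears the read bit at the allow ACE, records the write bit
+ * in deny_mask at the deny ACE, and returns EACCES with working_mode
+ * left holding ACE_WRITE_DATA.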
+ */ +static int +zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode, + boolean_t anyaccess, cred_t *cr) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + zfs_acl_t *aclp; + int error; + uid_t uid = crgetuid(cr); + uint64_t who; + uint16_t type, iflags; + uint16_t entry_type; + uint32_t access_mask; + uint32_t deny_mask = 0; + zfs_ace_hdr_t *acep = NULL; + boolean_t checkit; + uid_t gowner; + uid_t fowner; + + zfs_fuid_map_ids(zp, cr, &fowner, &gowner); + + mutex_enter(&zp->z_acl_lock); + + ASSERT_VOP_LOCKED(ZTOV(zp), __func__); + error = zfs_acl_node_read(zp, &aclp, B_FALSE); + if (error != 0) { + mutex_exit(&zp->z_acl_lock); + return (error); + } + + ASSERT(zp->z_acl_cached); + + while (acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask, + &iflags, &type)) { + uint32_t mask_matched; + + if (!zfs_acl_valid_ace_type(type, iflags)) + continue; + + if (ZTOV(zp)->v_type == VDIR && (iflags & ACE_INHERIT_ONLY_ACE)) + continue; + + /* Skip ACE if it does not affect any AoI */ + mask_matched = (access_mask & *working_mode); + if (!mask_matched) + continue; + + entry_type = (iflags & ACE_TYPE_FLAGS); + + checkit = B_FALSE; + + switch (entry_type) { + case ACE_OWNER: + if (uid == fowner) + checkit = B_TRUE; + break; + case OWNING_GROUP: + who = gowner; + /*FALLTHROUGH*/ + case ACE_IDENTIFIER_GROUP: + checkit = zfs_groupmember(zfsvfs, who, cr); + break; + case ACE_EVERYONE: + checkit = B_TRUE; + break; + + /* USER Entry */ + default: + if (entry_type == 0) { + uid_t newid; + + newid = zfs_fuid_map_id(zfsvfs, who, cr, + ZFS_ACE_USER); + if (newid != IDMAP_WK_CREATOR_OWNER_UID && + uid == newid) + checkit = B_TRUE; + break; + } else { + mutex_exit(&zp->z_acl_lock); + return (SET_ERROR(EIO)); + } + } + + if (checkit) { + if (type == DENY) { + DTRACE_PROBE3(zfs__ace__denies, + znode_t *, zp, + zfs_ace_hdr_t *, acep, + uint32_t, mask_matched); + deny_mask |= mask_matched; + } else { + DTRACE_PROBE3(zfs__ace__allows, + znode_t *, zp, + zfs_ace_hdr_t *, acep, + uint32_t, mask_matched); + if (anyaccess) { + mutex_exit(&zp->z_acl_lock); + return (0); + } + } + *working_mode &= ~mask_matched; + } + + /* Are we done? */ + if (*working_mode == 0) + break; + } + + mutex_exit(&zp->z_acl_lock); + + /* Put the found 'denies' back on the working mode */ + if (deny_mask) { + *working_mode |= deny_mask; + return (SET_ERROR(EACCES)); + } else if (*working_mode) { + return (-1); + } + + return (0); +} + +/* + * Return true if any access whatsoever granted, we don't actually + * care what access is granted. + */ +boolean_t +zfs_has_access(znode_t *zp, cred_t *cr) +{ + uint32_t have = ACE_ALL_PERMS; + + if (zfs_zaccess_aces_check(zp, &have, B_TRUE, cr) != 0) { + uid_t owner; + + owner = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER); + return (secpolicy_vnode_any_access(cr, ZTOV(zp), owner) == 0); + } + return (B_TRUE); +} + +static int +zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode, + boolean_t *check_privs, boolean_t skipaclchk, cred_t *cr) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int err; + + *working_mode = v4_mode; + *check_privs = B_TRUE; + + /* + * Short circuit empty requests + */ + if (v4_mode == 0 || zfsvfs->z_replay) { + *working_mode = 0; + return (0); + } + + if ((err = zfs_zaccess_dataset_check(zp, v4_mode)) != 0) { + *check_privs = B_FALSE; + return (err); + } + + /* + * The caller requested that the ACL check be skipped. This + * would only happen if the caller checked VOP_ACCESS() with a + * 32 bit ACE mask and already had the appropriate permissions. 
+ */ + if (skipaclchk) { + *working_mode = 0; + return (0); + } + + /* + * Note: ZFS_READONLY represents the "DOS R/O" attribute. + * When that flag is set, we should behave as if write access + * were not granted by anything in the ACL. In particular: + * We _must_ allow writes after opening the file r/w, then + * setting the DOS R/O attribute, and writing some more. + * (Similar to how you can write after fchmod(fd, 0444).) + * + * Therefore ZFS_READONLY is ignored in the dataset check + * above, and checked here as if part of the ACL check. + * Also note: DOS R/O is ignored for directories. + */ + if ((v4_mode & WRITE_MASK_DATA) && + (ZTOV(zp)->v_type != VDIR) && + (zp->z_pflags & ZFS_READONLY)) { + return (SET_ERROR(EPERM)); + } + + return (zfs_zaccess_aces_check(zp, working_mode, B_FALSE, cr)); +} + +static int +zfs_zaccess_append(znode_t *zp, uint32_t *working_mode, boolean_t *check_privs, + cred_t *cr) +{ + if (*working_mode != ACE_WRITE_DATA) + return (SET_ERROR(EACCES)); + + return (zfs_zaccess_common(zp, ACE_APPEND_DATA, working_mode, + check_privs, B_FALSE, cr)); +} + +int +zfs_fastaccesschk_execute(znode_t *zdp, cred_t *cr) +{ + boolean_t owner = B_FALSE; + boolean_t groupmbr = B_FALSE; + boolean_t is_attr; + uid_t uid = crgetuid(cr); + int error; + + if (zdp->z_pflags & ZFS_AV_QUARANTINED) + return (SET_ERROR(EACCES)); + + is_attr = ((zdp->z_pflags & ZFS_XATTR) && + (ZTOV(zdp)->v_type == VDIR)); + if (is_attr) + goto slow; + + + mutex_enter(&zdp->z_acl_lock); + + if (zdp->z_pflags & ZFS_NO_EXECS_DENIED) { + mutex_exit(&zdp->z_acl_lock); + return (0); + } + + if (FUID_INDEX(zdp->z_uid) != 0 || FUID_INDEX(zdp->z_gid) != 0) { + mutex_exit(&zdp->z_acl_lock); + goto slow; + } + + if (uid == zdp->z_uid) { + owner = B_TRUE; + if (zdp->z_mode & S_IXUSR) { + mutex_exit(&zdp->z_acl_lock); + return (0); + } else { + mutex_exit(&zdp->z_acl_lock); + goto slow; + } + } + if (groupmember(zdp->z_gid, cr)) { + groupmbr = B_TRUE; + if (zdp->z_mode & S_IXGRP) { + mutex_exit(&zdp->z_acl_lock); + return (0); + } else { + mutex_exit(&zdp->z_acl_lock); + goto slow; + } + } + if (!owner && !groupmbr) { + if (zdp->z_mode & S_IXOTH) { + mutex_exit(&zdp->z_acl_lock); + return (0); + } + } + + mutex_exit(&zdp->z_acl_lock); + +slow: + DTRACE_PROBE(zfs__fastpath__execute__access__miss); + ZFS_ENTER(zdp->z_zfsvfs); + error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr); + ZFS_EXIT(zdp->z_zfsvfs); + return (error); +} + +/* + * Determine whether Access should be granted/denied. + * + * The least priv subsystem is always consulted as a basic privilege + * can define any form of access. + */ +int +zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr) +{ + uint32_t working_mode; + int error; + int is_attr; + boolean_t check_privs; + znode_t *xzp; + znode_t *check_zp = zp; + mode_t needed_bits; + uid_t owner; + + is_attr = ((zp->z_pflags & ZFS_XATTR) && (ZTOV(zp)->v_type == VDIR)); + +#ifdef __FreeBSD_kernel__ + /* + * In FreeBSD, we don't care about permissions of individual ADS. + * Note that not checking them is not just an optimization - without + * this shortcut, EA operations may bogusly fail with EACCES. 
+ */ + if (zp->z_pflags & ZFS_XATTR) + return (0); +#else + /* + * If attribute then validate against base file + */ + if (is_attr) { + uint64_t parent; + + if ((error = sa_lookup(zp->z_sa_hdl, + SA_ZPL_PARENT(zp->z_zfsvfs), &parent, + sizeof (parent))) != 0) + return (error); + + if ((error = zfs_zget(zp->z_zfsvfs, + parent, &xzp)) != 0) { + return (error); + } + + check_zp = xzp; + + /* + * fixup mode to map to xattr perms + */ + + if (mode & (ACE_WRITE_DATA|ACE_APPEND_DATA)) { + mode &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA); + mode |= ACE_WRITE_NAMED_ATTRS; + } + + if (mode & (ACE_READ_DATA|ACE_EXECUTE)) { + mode &= ~(ACE_READ_DATA|ACE_EXECUTE); + mode |= ACE_READ_NAMED_ATTRS; + } + } +#endif + + owner = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER); + /* + * Map the bits required to the standard vnode flags VREAD|VWRITE|VEXEC + * in needed_bits. Map the bits mapped by working_mode (currently + * missing) in missing_bits. + * Call secpolicy_vnode_access2() with (needed_bits & ~checkmode), + * needed_bits. + */ + needed_bits = 0; + + working_mode = mode; + if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) && + owner == crgetuid(cr)) + working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES); + + if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS| + ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_SYNCHRONIZE)) + needed_bits |= VREAD; + if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS| + ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES|ACE_SYNCHRONIZE)) + needed_bits |= VWRITE; + if (working_mode & ACE_EXECUTE) + needed_bits |= VEXEC; + + if ((error = zfs_zaccess_common(check_zp, mode, &working_mode, + &check_privs, skipaclchk, cr)) == 0) { + if (is_attr) + VN_RELE(ZTOV(xzp)); + return (secpolicy_vnode_access2(cr, ZTOV(zp), owner, + needed_bits, needed_bits)); + } + + if (error && !check_privs) { + if (is_attr) + VN_RELE(ZTOV(xzp)); + return (error); + } + + if (error && (flags & V_APPEND)) { + error = zfs_zaccess_append(zp, &working_mode, &check_privs, cr); + } + + if (error && check_privs) { + mode_t checkmode = 0; + + /* + * First check for implicit owner permission on + * read_acl/read_attributes + */ + + error = 0; + ASSERT(working_mode != 0); + + if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES) && + owner == crgetuid(cr))) + working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES); + + if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS| + ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_SYNCHRONIZE)) + checkmode |= VREAD; + if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS| + ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES|ACE_SYNCHRONIZE)) + checkmode |= VWRITE; + if (working_mode & ACE_EXECUTE) + checkmode |= VEXEC; + + error = secpolicy_vnode_access2(cr, ZTOV(check_zp), owner, + needed_bits & ~checkmode, needed_bits); + + if (error == 0 && (working_mode & ACE_WRITE_OWNER)) + error = secpolicy_vnode_chown(ZTOV(check_zp), cr, owner); + if (error == 0 && (working_mode & ACE_WRITE_ACL)) + error = secpolicy_vnode_setdac(ZTOV(check_zp), cr, owner); + + if (error == 0 && (working_mode & + (ACE_DELETE|ACE_DELETE_CHILD))) + error = secpolicy_vnode_remove(ZTOV(check_zp), cr); + + if (error == 0 && (working_mode & ACE_SYNCHRONIZE)) { + error = secpolicy_vnode_chown(ZTOV(check_zp), cr, owner); + } + if (error == 0) { + /* + * See if any bits other than those already checked + * for are still present. 
If so then return EACCES.
+			 */
+			if (working_mode & ~(ZFS_CHECKED_MASKS)) {
+				error = SET_ERROR(EACCES);
+			}
+		}
+	} else if (error == 0) {
+		error = secpolicy_vnode_access2(cr, ZTOV(zp), owner,
+		    needed_bits, needed_bits);
+	}
+
+
+	if (is_attr)
+		VN_RELE(ZTOV(xzp));
+
+	return (error);
+}
+
+/*
+ * Translate traditional unix VREAD/VWRITE/VEXEC mode into
+ * native ACL format and call zfs_zaccess().
+ */
+int
+zfs_zaccess_rwx(znode_t *zp, mode_t mode, int flags, cred_t *cr)
+{
+	return (zfs_zaccess(zp, zfs_unix_to_v4(mode >> 6), flags, B_FALSE, cr));
+}
+
+/*
+ * Access function for secpolicy_vnode_setattr
+ */
+int
+zfs_zaccess_unix(znode_t *zp, mode_t mode, cred_t *cr)
+{
+	int v4_mode = zfs_unix_to_v4(mode >> 6);
+
+	return (zfs_zaccess(zp, v4_mode, 0, B_FALSE, cr));
+}
+
+static int
+zfs_delete_final_check(znode_t *zp, znode_t *dzp,
+    mode_t available_perms, cred_t *cr)
+{
+	int error;
+	uid_t downer;
+
+	downer = zfs_fuid_map_id(dzp->z_zfsvfs, dzp->z_uid, cr, ZFS_OWNER);
+
+	error = secpolicy_vnode_access2(cr, ZTOV(dzp),
+	    downer, available_perms, VWRITE|VEXEC);
+
+	if (error == 0)
+		error = zfs_sticky_remove_access(dzp, zp, cr);
+
+	return (error);
+}
+
+/*
+ * Determine whether access should be granted or denied, without
+ * consulting the least priv subsystem.
+ *
+ * The following chart is the recommended NFSv4 enforcement for
+ * ability to delete an object.
+ *
+ *	-------------------------------------------------------
+ *	|   Parent Dir  |      Target Object Permissions      |
+ *	|  permissions  |                                     |
+ *	-------------------------------------------------------
+ *	|               | ACL Allows | ACL Denies| Delete     |
+ *	|               |  Delete    |  Delete   | unspecified|
+ *	-------------------------------------------------------
+ *	|  ACL Allows   | Permit     | Permit    | Permit     |
+ *	|  DELETE_CHILD |                                     |
+ *	-------------------------------------------------------
+ *	|  ACL Denies   | Permit     | Deny      | Deny       |
+ *	|  DELETE_CHILD |            |           |            |
+ *	-------------------------------------------------------
+ *	| ACL specifies |            |           |            |
+ *	| only allow    | Permit     | Permit    | Permit     |
+ *	| write and     |            |           |            |
+ *	| execute       |            |           |            |
+ *	-------------------------------------------------------
+ *	| ACL denies    |            |           |            |
+ *	| write and     | Permit     | Deny      | Deny       |
+ *	| execute       |            |           |            |
+ *	-------------------------------------------------------
+ *	   ^
+ *	   |
+ *	   No search privilege, can't even look up file?
+ *
+ */
+int
+zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr)
+{
+	uint32_t dzp_working_mode = 0;
+	uint32_t zp_working_mode = 0;
+	int dzp_error, zp_error;
+	mode_t available_perms;
+	boolean_t dzpcheck_privs = B_TRUE;
+	boolean_t zpcheck_privs = B_TRUE;
+
+	/*
+	 * We want specific DELETE permissions to
+	 * take precedence over WRITE/EXECUTE.  We don't
+	 * want an ACL such as this to mess us up.
+	 * user:joe:write_data:deny,user:joe:delete:allow
+	 *
+	 * However, deny permissions may ultimately be overridden
+	 * by secpolicy_vnode_access().
+	 *
+	 * We will ask for all of the necessary permissions and then
+	 * look at the working modes from the directory and target object
+	 * to determine what was found.
+	 */
+
+	if (zp->z_pflags & (ZFS_IMMUTABLE | ZFS_NOUNLINK))
+		return (SET_ERROR(EPERM));
+
+	/*
+	 * First row
+	 * If the directory permissions allow the delete, we are done.
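+	 * For example, user:joe:delete_child:allow on the parent
+	 * directory permits joe to unlink any child, regardless of the
+	 * child's own ACL (first row of the chart above).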
+ */ + if ((dzp_error = zfs_zaccess_common(dzp, ACE_DELETE_CHILD, + &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr)) == 0) + return (0); + + /* + * If target object has delete permission then we are done + */ + if ((zp_error = zfs_zaccess_common(zp, ACE_DELETE, &zp_working_mode, + &zpcheck_privs, B_FALSE, cr)) == 0) + return (0); + + ASSERT(dzp_error && zp_error); + + if (!dzpcheck_privs) + return (dzp_error); + if (!zpcheck_privs) + return (zp_error); + + /* + * Second row + * + * If directory returns EACCES then delete_child was denied + * due to deny delete_child. In this case send the request through + * secpolicy_vnode_remove(). We don't use zfs_delete_final_check() + * since that *could* allow the delete based on write/execute permission + * and we want delete permissions to override write/execute. + */ + + if (dzp_error == EACCES) + return (secpolicy_vnode_remove(ZTOV(dzp), cr)); /* XXXPJD: s/dzp/zp/ ? */ + + /* + * Third Row + * only need to see if we have write/execute on directory. + */ + + dzp_error = zfs_zaccess_common(dzp, ACE_EXECUTE|ACE_WRITE_DATA, + &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr); + + if (dzp_error != 0 && !dzpcheck_privs) + return (dzp_error); + + /* + * Fourth row + */ + + available_perms = (dzp_working_mode & ACE_WRITE_DATA) ? 0 : VWRITE; + available_perms |= (dzp_working_mode & ACE_EXECUTE) ? 0 : VEXEC; + + return (zfs_delete_final_check(zp, dzp, available_perms, cr)); + +} + +int +zfs_zaccess_rename(znode_t *sdzp, znode_t *szp, znode_t *tdzp, + znode_t *tzp, cred_t *cr) +{ + int add_perm; + int error; + + if (szp->z_pflags & ZFS_AV_QUARANTINED) + return (SET_ERROR(EACCES)); + + add_perm = (ZTOV(szp)->v_type == VDIR) ? + ACE_ADD_SUBDIRECTORY : ACE_ADD_FILE; + + /* + * Rename permissions are combination of delete permission + + * add file/subdir permission. + * + * BSD operating systems also require write permission + * on the directory being moved from one parent directory + * to another. + */ + if (ZTOV(szp)->v_type == VDIR && ZTOV(sdzp) != ZTOV(tdzp)) { + if (error = zfs_zaccess(szp, ACE_WRITE_DATA, 0, B_FALSE, cr)) + return (error); + } + + /* + * first make sure we do the delete portion. + * + * If that succeeds then check for add_file/add_subdir permissions + */ + + if (error = zfs_zaccess_delete(sdzp, szp, cr)) + return (error); + + /* + * If we have a tzp, see if we can delete it? + */ + if (tzp) { + if (error = zfs_zaccess_delete(tdzp, tzp, cr)) + return (error); + } + + /* + * Now check for add permissions + */ + error = zfs_zaccess(tdzp, add_perm, 0, B_FALSE, cr); + + return (error); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c new file mode 100644 index 000000000000..6048eb124525 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c @@ -0,0 +1,199 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/vfs.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_sa.h>
+#include <sys/zfs_acl.h>
+
+void
+zfs_oldace_byteswap(ace_t *ace, int ace_cnt)
+{
+ int i;
+
+ for (i = 0; i != ace_cnt; i++, ace++) {
+ ace->a_who = BSWAP_32(ace->a_who);
+ ace->a_access_mask = BSWAP_32(ace->a_access_mask);
+ ace->a_flags = BSWAP_16(ace->a_flags);
+ ace->a_type = BSWAP_16(ace->a_type);
+ }
+}
+
+/*
+ * swap ace_t and ace_object_t
+ */
+void
+zfs_ace_byteswap(void *buf, size_t size, boolean_t zfs_layout)
+{
+ caddr_t end;
+ caddr_t ptr;
+ zfs_ace_t *zacep = NULL;
+ ace_t *acep;
+ uint16_t entry_type;
+ size_t entry_size;
+ int ace_type;
+
+ end = (caddr_t)buf + size;
+ ptr = buf;
+
+ while (ptr < end) {
+ if (zfs_layout) {
+ /*
+ * Avoid overrun. Embedded aces can have one
+ * of several sizes. We don't know exactly
+ * how many are present, only the size of the
+ * buffer containing them. That size may be
+ * larger than needed to hold the aces
+ * present. As long as we do not do any
+ * swapping beyond the end of our block we are
+ * okay. It is safe to swap any non-ace data
+ * within the block since it is just zeros.
+ */
+ if (ptr + sizeof (zfs_ace_hdr_t) > end) {
+ break;
+ }
+ zacep = (zfs_ace_t *)ptr;
+ zacep->z_hdr.z_access_mask =
+ BSWAP_32(zacep->z_hdr.z_access_mask);
+ zacep->z_hdr.z_flags = BSWAP_16(zacep->z_hdr.z_flags);
+ ace_type = zacep->z_hdr.z_type =
+ BSWAP_16(zacep->z_hdr.z_type);
+ entry_type = zacep->z_hdr.z_flags & ACE_TYPE_FLAGS;
+ } else {
+ /* Overrun avoidance */
+ if (ptr + sizeof (ace_t) > end) {
+ break;
+ }
+ acep = (ace_t *)ptr;
+ acep->a_access_mask = BSWAP_32(acep->a_access_mask);
+ acep->a_flags = BSWAP_16(acep->a_flags);
+ ace_type = acep->a_type = BSWAP_16(acep->a_type);
+ acep->a_who = BSWAP_32(acep->a_who);
+ entry_type = acep->a_flags & ACE_TYPE_FLAGS;
+ }
+ switch (entry_type) {
+ case ACE_OWNER:
+ case ACE_EVERYONE:
+ case (ACE_IDENTIFIER_GROUP | ACE_GROUP):
+ entry_size = zfs_layout ?
+ sizeof (zfs_ace_hdr_t) : sizeof (ace_t);
+ break;
+ case ACE_IDENTIFIER_GROUP:
+ default:
+ /* Overrun avoidance */
+ if (zfs_layout) {
+ if (ptr + sizeof (zfs_ace_t) <= end) {
+ zacep->z_fuid = BSWAP_64(zacep->z_fuid);
+ } else {
+ entry_size = sizeof (zfs_ace_t);
+ break;
+ }
+ }
+ switch (ace_type) {
+ case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+ case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+ entry_size = zfs_layout ?
+ sizeof (zfs_object_ace_t) :
+ sizeof (ace_object_t);
+ break;
+ default:
+ entry_size = zfs_layout ?
sizeof (zfs_ace_t) : + sizeof (ace_t); + break; + } + } + ptr = ptr + entry_size; + } +} + +/* ARGSUSED */ +void +zfs_oldacl_byteswap(void *buf, size_t size) +{ + int cnt; + + /* + * Arggh, since we don't know how many ACEs are in + * the array, we have to swap the entire block + */ + + cnt = size / sizeof (ace_t); + + zfs_oldace_byteswap((ace_t *)buf, cnt); +} + +/* ARGSUSED */ +void +zfs_acl_byteswap(void *buf, size_t size) +{ + zfs_ace_byteswap(buf, size, B_TRUE); +} + +void +zfs_znode_byteswap(void *buf, size_t size) +{ + znode_phys_t *zp = buf; + + ASSERT(size >= sizeof (znode_phys_t)); + + zp->zp_crtime[0] = BSWAP_64(zp->zp_crtime[0]); + zp->zp_crtime[1] = BSWAP_64(zp->zp_crtime[1]); + zp->zp_atime[0] = BSWAP_64(zp->zp_atime[0]); + zp->zp_atime[1] = BSWAP_64(zp->zp_atime[1]); + zp->zp_mtime[0] = BSWAP_64(zp->zp_mtime[0]); + zp->zp_mtime[1] = BSWAP_64(zp->zp_mtime[1]); + zp->zp_ctime[0] = BSWAP_64(zp->zp_ctime[0]); + zp->zp_ctime[1] = BSWAP_64(zp->zp_ctime[1]); + zp->zp_gen = BSWAP_64(zp->zp_gen); + zp->zp_mode = BSWAP_64(zp->zp_mode); + zp->zp_size = BSWAP_64(zp->zp_size); + zp->zp_parent = BSWAP_64(zp->zp_parent); + zp->zp_links = BSWAP_64(zp->zp_links); + zp->zp_xattr = BSWAP_64(zp->zp_xattr); + zp->zp_rdev = BSWAP_64(zp->zp_rdev); + zp->zp_flags = BSWAP_64(zp->zp_flags); + zp->zp_uid = BSWAP_64(zp->zp_uid); + zp->zp_gid = BSWAP_64(zp->zp_gid); + zp->zp_zap = BSWAP_64(zp->zp_zap); + zp->zp_pad[0] = BSWAP_64(zp->zp_pad[0]); + zp->zp_pad[1] = BSWAP_64(zp->zp_pad[1]); + zp->zp_pad[2] = BSWAP_64(zp->zp_pad[2]); + + zp->zp_acl.z_acl_extern_obj = BSWAP_64(zp->zp_acl.z_acl_extern_obj); + zp->zp_acl.z_acl_size = BSWAP_32(zp->zp_acl.z_acl_size); + zp->zp_acl.z_acl_version = BSWAP_16(zp->zp_acl.z_acl_version); + zp->zp_acl.z_acl_count = BSWAP_16(zp->zp_acl.z_acl_count); + if (zp->zp_acl.z_acl_version == ZFS_ACL_VERSION) { + zfs_acl_byteswap((void *)&zp->zp_acl.z_ace_data[0], + ZFS_ACE_SPACE); + } else { + zfs_oldace_byteswap((ace_t *)&zp->zp_acl.z_ace_data[0], + ACE_SLOT_CNT); + } +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c new file mode 100644 index 000000000000..381dd75f2d0c --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c @@ -0,0 +1,1358 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved. + */ + +/* + * ZFS control directory (a.k.a. ".zfs") + * + * This directory provides a common location for all ZFS meta-objects. 
+ * Currently, this is only the 'snapshot' directory, but this may expand in the + * future. The elements are built using the GFS primitives, as the hierarchy + * does not actually exist on disk. + * + * For 'snapshot', we don't want to have all snapshots always mounted, because + * this would take up a huge amount of space in /etc/mnttab. We have three + * types of objects: + * + * ctldir ------> snapshotdir -------> snapshot + * | + * | + * V + * mounted fs + * + * The 'snapshot' node contains just enough information to lookup '..' and act + * as a mountpoint for the snapshot. Whenever we lookup a specific snapshot, we + * perform an automount of the underlying filesystem and return the + * corresponding vnode. + * + * All mounts are handled automatically by the kernel, but unmounts are + * (currently) handled from user land. The main reason is that there is no + * reliable way to auto-unmount the filesystem when it's "no longer in use". + * When the user unmounts a filesystem, we call zfsctl_unmount(), which + * unmounts any snapshots within the snapshot directory. + * + * The '.zfs', '.zfs/snapshot', and all directories created under + * '.zfs/snapshot' (ie: '.zfs/snapshot/<snapname>') are all GFS nodes and + * share the same vfs_t as the head filesystem (what '.zfs' lives under). + * + * File systems mounted ontop of the GFS nodes '.zfs/snapshot/<snapname>' + * (ie: snapshots) are ZFS nodes and have their own unique vfs_t. + * However, vnodes within these mounted on file systems have their v_vfsp + * fields set to the head filesystem to make NFS happy (see + * zfsctl_snapdir_lookup()). We VFS_HOLD the head filesystem's vfs_t + * so that it cannot be freed until all snapshots have been unmounted. + */ + +#include <sys/zfs_context.h> +#include <sys/zfs_ctldir.h> +#include <sys/zfs_ioctl.h> +#include <sys/zfs_vfsops.h> +#include <sys/namei.h> +#include <sys/stat.h> +#include <sys/dmu.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_destroy.h> +#include <sys/dsl_deleg.h> +#include <sys/mount.h> +#include <sys/zap.h> + +#include "zfs_namecheck.h" + +/* Common access mode for all virtual directories under the ctldir */ +const u_short zfsctl_ctldir_mode = S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP | + S_IROTH | S_IXOTH; + +/* + * "Synthetic" filesystem implementation. + */ + +/* + * Assert that A implies B. + */ +#define KASSERT_IMPLY(A, B, msg) KASSERT(!(A) || (B), (msg)); + +static MALLOC_DEFINE(M_SFSNODES, "sfs_nodes", "synthetic-fs nodes"); + +typedef struct sfs_node { + char sn_name[ZFS_MAX_DATASET_NAME_LEN]; + uint64_t sn_parent_id; + uint64_t sn_id; +} sfs_node_t; + +/* + * Check the parent's ID as well as the node's to account for a chance + * that IDs originating from different domains (snapshot IDs, artifical + * IDs, znode IDs) may clash. + */ +static int +sfs_compare_ids(struct vnode *vp, void *arg) +{ + sfs_node_t *n1 = vp->v_data; + sfs_node_t *n2 = arg; + bool equal; + + equal = n1->sn_id == n2->sn_id && + n1->sn_parent_id == n2->sn_parent_id; + + /* Zero means equality. 
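+ * vfs_hash_get() treats a comparator return of 0 as a match,
+ * hence the inversion below.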
*/
+ return (!equal);
+}
+
+static int
+sfs_vnode_get(const struct mount *mp, int flags, uint64_t parent_id,
+ uint64_t id, struct vnode **vpp)
+{
+ sfs_node_t search;
+ int err;
+
+ search.sn_id = id;
+ search.sn_parent_id = parent_id;
+ err = vfs_hash_get(mp, (u_int)id, flags, curthread, vpp,
+ sfs_compare_ids, &search);
+ return (err);
+}
+
+static int
+sfs_vnode_insert(struct vnode *vp, int flags, uint64_t parent_id,
+ uint64_t id, struct vnode **vpp)
+{
+ int err;
+
+ KASSERT(vp->v_data != NULL, ("sfs_vnode_insert with NULL v_data"));
+ err = vfs_hash_insert(vp, (u_int)id, flags, curthread, vpp,
+ sfs_compare_ids, vp->v_data);
+ return (err);
+}
+
+static void
+sfs_vnode_remove(struct vnode *vp)
+{
+ vfs_hash_remove(vp);
+}
+
+typedef void sfs_vnode_setup_fn(vnode_t *vp, void *arg);
+
+static int
+sfs_vgetx(struct mount *mp, int flags, uint64_t parent_id, uint64_t id,
+ const char *tag, struct vop_vector *vops,
+ sfs_vnode_setup_fn setup, void *arg,
+ struct vnode **vpp)
+{
+ struct vnode *vp;
+ int error;
+
+ error = sfs_vnode_get(mp, flags, parent_id, id, vpp);
+ if (error != 0 || *vpp != NULL) {
+ KASSERT_IMPLY(error == 0, (*vpp)->v_data != NULL,
+ "sfs vnode with no data");
+ return (error);
+ }
+
+ /* Allocate a new vnode/inode. */
+ error = getnewvnode(tag, mp, vops, &vp);
+ if (error != 0) {
+ *vpp = NULL;
+ return (error);
+ }
+
+ /*
+ * Exclusively lock the vnode while it's being constructed.
+ */
+ lockmgr(vp->v_vnlock, LK_EXCLUSIVE, NULL);
+ error = insmntque(vp, mp);
+ if (error != 0) {
+ /* insmntque() destroys the vnode itself on failure. */
+ *vpp = NULL;
+ return (error);
+ }
+
+ setup(vp, arg);
+
+ error = sfs_vnode_insert(vp, flags, parent_id, id, vpp);
+ if (error != 0 || *vpp != NULL) {
+ KASSERT_IMPLY(error == 0, (*vpp)->v_data != NULL,
+ "sfs vnode with no data");
+ return (error);
+ }
+
+ *vpp = vp;
+ return (0);
+}
+
+static void
+sfs_print_node(sfs_node_t *node)
+{
+ printf("\tname = %s\n", node->sn_name);
+ printf("\tparent_id = %ju\n", (uintmax_t)node->sn_parent_id);
+ printf("\tid = %ju\n", (uintmax_t)node->sn_id);
+}
+
+static sfs_node_t *
+sfs_alloc_node(size_t size, const char *name, uint64_t parent_id, uint64_t id)
+{
+ struct sfs_node *node;
+
+ KASSERT(strlen(name) < sizeof(node->sn_name),
+ ("sfs node name is too long"));
+ KASSERT(size >= sizeof(*node), ("sfs node size is too small"));
+ node = malloc(size, M_SFSNODES, M_WAITOK | M_ZERO);
+ strlcpy(node->sn_name, name, sizeof(node->sn_name));
+ node->sn_parent_id = parent_id;
+ node->sn_id = id;
+
+ return (node);
+}
+
+static void
+sfs_destroy_node(sfs_node_t *node)
+{
+ free(node, M_SFSNODES);
+}
+
+static void *
+sfs_reclaim_vnode(vnode_t *vp)
+{
+ void *data;
+
+ sfs_vnode_remove(vp);
+ data = vp->v_data;
+ vp->v_data = NULL;
+ return (data);
+}
+
+static int
+sfs_readdir_common(uint64_t parent_id, uint64_t id, struct vop_readdir_args *ap,
+ uio_t *uio, off_t *offp)
+{
+ struct dirent entry;
+ int error;
+
+ /* Reset ncookies for subsequent use of vfs_read_dirent.
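+ * vfs_read_dirent() appends a cookie and bumps *a_ncookies for
+ * every entry it emits, so the count must start at zero.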
*/ + if (ap->a_ncookies != NULL) + *ap->a_ncookies = 0; + + if (uio->uio_resid < sizeof(entry)) + return (SET_ERROR(EINVAL)); + + if (uio->uio_offset < 0) + return (SET_ERROR(EINVAL)); + if (uio->uio_offset == 0) { + entry.d_fileno = id; + entry.d_type = DT_DIR; + entry.d_name[0] = '.'; + entry.d_namlen = 1; + entry.d_reclen = sizeof(entry); + dirent_terminate(&entry); + error = vfs_read_dirent(ap, &entry, uio->uio_offset); + if (error != 0) + return (SET_ERROR(error)); + } + + if (uio->uio_offset < sizeof(entry)) + return (SET_ERROR(EINVAL)); + if (uio->uio_offset == sizeof(entry)) { + entry.d_fileno = parent_id; + entry.d_type = DT_DIR; + entry.d_name[0] = '.'; + entry.d_name[1] = '.'; + entry.d_namlen = 2; + entry.d_reclen = sizeof(entry); + dirent_terminate(&entry); + error = vfs_read_dirent(ap, &entry, uio->uio_offset); + if (error != 0) + return (SET_ERROR(error)); + } + + if (offp != NULL) + *offp = 2 * sizeof(entry); + return (0); +} + + +/* + * .zfs inode namespace + * + * We need to generate unique inode numbers for all files and directories + * within the .zfs pseudo-filesystem. We use the following scheme: + * + * ENTRY ZFSCTL_INODE + * .zfs 1 + * .zfs/snapshot 2 + * .zfs/snapshot/<snap> objectid(snap) + */ +#define ZFSCTL_INO_SNAP(id) (id) + +static struct vop_vector zfsctl_ops_root; +static struct vop_vector zfsctl_ops_snapdir; +static struct vop_vector zfsctl_ops_snapshot; +static struct vop_vector zfsctl_ops_shares_dir; + +void +zfsctl_init(void) +{ +} + +void +zfsctl_fini(void) +{ +} + +boolean_t +zfsctl_is_node(vnode_t *vp) +{ + return (vn_matchops(vp, zfsctl_ops_root) || + vn_matchops(vp, zfsctl_ops_snapdir) || + vn_matchops(vp, zfsctl_ops_snapshot) || + vn_matchops(vp, zfsctl_ops_shares_dir)); + +} + +typedef struct zfsctl_root { + sfs_node_t node; + sfs_node_t *snapdir; + timestruc_t cmtime; +} zfsctl_root_t; + + +/* + * Create the '.zfs' directory. + */ +void +zfsctl_create(zfsvfs_t *zfsvfs) +{ + zfsctl_root_t *dot_zfs; + sfs_node_t *snapdir; + vnode_t *rvp; + uint64_t crtime[2]; + + ASSERT(zfsvfs->z_ctldir == NULL); + + snapdir = sfs_alloc_node(sizeof(*snapdir), "snapshot", ZFSCTL_INO_ROOT, + ZFSCTL_INO_SNAPDIR); + dot_zfs = (zfsctl_root_t *)sfs_alloc_node(sizeof(*dot_zfs), ".zfs", 0, + ZFSCTL_INO_ROOT); + dot_zfs->snapdir = snapdir; + + VERIFY(VFS_ROOT(zfsvfs->z_vfs, LK_EXCLUSIVE, &rvp) == 0); + VERIFY(0 == sa_lookup(VTOZ(rvp)->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs), + &crtime, sizeof(crtime))); + ZFS_TIME_DECODE(&dot_zfs->cmtime, crtime); + vput(rvp); + + zfsvfs->z_ctldir = dot_zfs; +} + +/* + * Destroy the '.zfs' directory. Only called when the filesystem is unmounted. + * The nodes must not have any associated vnodes by now as they should be + * vflush-ed. + */ +void +zfsctl_destroy(zfsvfs_t *zfsvfs) +{ + sfs_destroy_node(zfsvfs->z_ctldir->snapdir); + sfs_destroy_node((sfs_node_t *)zfsvfs->z_ctldir); + zfsvfs->z_ctldir = NULL; +} + +static int +zfsctl_fs_root_vnode(struct mount *mp, void *arg __unused, int flags, + struct vnode **vpp) +{ + return (VFS_ROOT(mp, flags, vpp)); +} + +static void +zfsctl_common_vnode_setup(vnode_t *vp, void *arg) +{ + ASSERT_VOP_ELOCKED(vp, __func__); + + /* We support shared locking. 
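+ * getnewvnode() hands out vnode locks with LK_NOSHARE set, so
+ * shared access has to be enabled explicitly via VN_LOCK_ASHARE().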
*/ + VN_LOCK_ASHARE(vp); + vp->v_type = VDIR; + vp->v_data = arg; +} + +static int +zfsctl_root_vnode(struct mount *mp, void *arg __unused, int flags, + struct vnode **vpp) +{ + void *node; + int err; + + node = ((zfsvfs_t*)mp->mnt_data)->z_ctldir; + err = sfs_vgetx(mp, flags, 0, ZFSCTL_INO_ROOT, "zfs", &zfsctl_ops_root, + zfsctl_common_vnode_setup, node, vpp); + return (err); +} + +static int +zfsctl_snapdir_vnode(struct mount *mp, void *arg __unused, int flags, + struct vnode **vpp) +{ + void *node; + int err; + + node = ((zfsvfs_t*)mp->mnt_data)->z_ctldir->snapdir; + err = sfs_vgetx(mp, flags, ZFSCTL_INO_ROOT, ZFSCTL_INO_SNAPDIR, "zfs", + &zfsctl_ops_snapdir, zfsctl_common_vnode_setup, node, vpp); + return (err); +} + +/* + * Given a root znode, retrieve the associated .zfs directory. + * Add a hold to the vnode and return it. + */ +int +zfsctl_root(zfsvfs_t *zfsvfs, int flags, vnode_t **vpp) +{ + vnode_t *vp; + int error; + + error = zfsctl_root_vnode(zfsvfs->z_vfs, NULL, flags, vpp); + return (error); +} + +/* + * Common open routine. Disallow any write access. + */ +static int +zfsctl_common_open(struct vop_open_args *ap) +{ + int flags = ap->a_mode; + + if (flags & FWRITE) + return (SET_ERROR(EACCES)); + + return (0); +} + +/* + * Common close routine. Nothing to do here. + */ +/* ARGSUSED */ +static int +zfsctl_common_close(struct vop_close_args *ap) +{ + return (0); +} + +/* + * Common access routine. Disallow writes. + */ +static int +zfsctl_common_access(ap) + struct vop_access_args /* { + struct vnode *a_vp; + accmode_t a_accmode; + struct ucred *a_cred; + struct thread *a_td; + } */ *ap; +{ + accmode_t accmode = ap->a_accmode; + + if (accmode & VWRITE) + return (SET_ERROR(EACCES)); + return (0); +} + +/* + * Common getattr function. Fill in basic information. + */ +static void +zfsctl_common_getattr(vnode_t *vp, vattr_t *vap) +{ + timestruc_t now; + sfs_node_t *node; + + node = vp->v_data; + + vap->va_uid = 0; + vap->va_gid = 0; + vap->va_rdev = 0; + /* + * We are a purely virtual object, so we have no + * blocksize or allocated blocks. + */ + vap->va_blksize = 0; + vap->va_nblocks = 0; + vap->va_seq = 0; + vn_fsid(vp, vap); + vap->va_mode = zfsctl_ctldir_mode; + vap->va_type = VDIR; + /* + * We live in the now (for atime). + */ + gethrestime(&now); + vap->va_atime = now; + /* FreeBSD: Reset chflags(2) flags. */ + vap->va_flags = 0; + + vap->va_nodeid = node->sn_id; + + /* At least '.' and '..'. */ + vap->va_nlink = 2; +} + +static int +zfsctl_common_fid(ap) + struct vop_fid_args /* { + struct vnode *a_vp; + struct fid *a_fid; + } */ *ap; +{ + vnode_t *vp = ap->a_vp; + fid_t *fidp = (void *)ap->a_fid; + sfs_node_t *node = vp->v_data; + uint64_t object = node->sn_id; + zfid_short_t *zfid; + int i; + + zfid = (zfid_short_t *)fidp; + zfid->zf_len = SHORT_FID_LEN; + + for (i = 0; i < sizeof(zfid->zf_object); i++) + zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); + + /* .zfs nodes always have a generation number of 0 */ + for (i = 0; i < sizeof(zfid->zf_gen); i++) + zfid->zf_gen[i] = 0; + + return (0); +} + +static int +zfsctl_common_reclaim(ap) + struct vop_reclaim_args /* { + struct vnode *a_vp; + struct thread *a_td; + } */ *ap; +{ + vnode_t *vp = ap->a_vp; + + (void) sfs_reclaim_vnode(vp); + return (0); +} + +static int +zfsctl_common_print(ap) + struct vop_print_args /* { + struct vnode *a_vp; + } */ *ap; +{ + sfs_print_node(ap->a_vp->v_data); + return (0); +} + +/* + * Get root directory attributes. 
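+ * The reported times come from the root dataset's creation time,
+ * captured as cmtime in zfsctl_create().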
+ */ +static int +zfsctl_root_getattr(ap) + struct vop_getattr_args /* { + struct vnode *a_vp; + struct vattr *a_vap; + struct ucred *a_cred; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + struct vattr *vap = ap->a_vap; + zfsctl_root_t *node = vp->v_data; + + zfsctl_common_getattr(vp, vap); + vap->va_ctime = node->cmtime; + vap->va_mtime = vap->va_ctime; + vap->va_birthtime = vap->va_ctime; + vap->va_nlink += 1; /* snapdir */ + vap->va_size = vap->va_nlink; + return (0); +} + +/* + * When we lookup "." we still can be asked to lock it + * differently, can't we? + */ +int +zfsctl_relock_dot(vnode_t *dvp, int ltype) +{ + vref(dvp); + if (ltype != VOP_ISLOCKED(dvp)) { + if (ltype == LK_EXCLUSIVE) + vn_lock(dvp, LK_UPGRADE | LK_RETRY); + else /* if (ltype == LK_SHARED) */ + vn_lock(dvp, LK_DOWNGRADE | LK_RETRY); + + /* Relock for the "." case may left us with reclaimed vnode. */ + if ((dvp->v_iflag & VI_DOOMED) != 0) { + vrele(dvp); + return (SET_ERROR(ENOENT)); + } + } + return (0); +} + +/* + * Special case the handling of "..". + */ +int +zfsctl_root_lookup(ap) + struct vop_lookup_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + } */ *ap; +{ + struct componentname *cnp = ap->a_cnp; + vnode_t *dvp = ap->a_dvp; + vnode_t **vpp = ap->a_vpp; + cred_t *cr = ap->a_cnp->cn_cred; + int flags = ap->a_cnp->cn_flags; + int lkflags = ap->a_cnp->cn_lkflags; + int nameiop = ap->a_cnp->cn_nameiop; + int err; + int ltype; + + ASSERT(dvp->v_type == VDIR); + + if ((flags & ISLASTCN) != 0 && nameiop != LOOKUP) + return (SET_ERROR(ENOTSUP)); + + if (cnp->cn_namelen == 1 && *cnp->cn_nameptr == '.') { + err = zfsctl_relock_dot(dvp, lkflags & LK_TYPE_MASK); + if (err == 0) + *vpp = dvp; + } else if ((flags & ISDOTDOT) != 0) { + err = vn_vget_ino_gen(dvp, zfsctl_fs_root_vnode, NULL, + lkflags, vpp); + } else if (strncmp(cnp->cn_nameptr, "snapshot", cnp->cn_namelen) == 0) { + err = zfsctl_snapdir_vnode(dvp->v_mount, NULL, lkflags, vpp); + } else { + err = SET_ERROR(ENOENT); + } + if (err != 0) + *vpp = NULL; + return (err); +} + +static int +zfsctl_root_readdir(ap) + struct vop_readdir_args /* { + struct vnode *a_vp; + struct uio *a_uio; + struct ucred *a_cred; + int *a_eofflag; + int *ncookies; + u_long **a_cookies; + } */ *ap; +{ + struct dirent entry; + vnode_t *vp = ap->a_vp; + zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; + zfsctl_root_t *node = vp->v_data; + uio_t *uio = ap->a_uio; + int *eofp = ap->a_eofflag; + off_t dots_offset; + int error; + + ASSERT(vp->v_type == VDIR); + + error = sfs_readdir_common(zfsvfs->z_root, ZFSCTL_INO_ROOT, ap, uio, + &dots_offset); + if (error != 0) { + if (error == ENAMETOOLONG) /* ran out of destination space */ + error = 0; + return (error); + } + if (uio->uio_offset != dots_offset) + return (SET_ERROR(EINVAL)); + + CTASSERT(sizeof(node->snapdir->sn_name) <= sizeof(entry.d_name)); + entry.d_fileno = node->snapdir->sn_id; + entry.d_type = DT_DIR; + strcpy(entry.d_name, node->snapdir->sn_name); + entry.d_namlen = strlen(entry.d_name); + entry.d_reclen = sizeof(entry); + dirent_terminate(&entry); + error = vfs_read_dirent(ap, &entry, uio->uio_offset); + if (error != 0) { + if (error == ENAMETOOLONG) + error = 0; + return (SET_ERROR(error)); + } + if (eofp != NULL) + *eofp = 1; + return (0); +} + +static int +zfsctl_root_vptocnp(struct vop_vptocnp_args *ap) +{ + static const char dotzfs_name[4] = ".zfs"; + vnode_t *dvp; + int error; + + if (*ap->a_buflen < sizeof (dotzfs_name)) + return (SET_ERROR(ENOMEM)); + + error = 
vn_vget_ino_gen(ap->a_vp, zfsctl_fs_root_vnode, NULL, + LK_SHARED, &dvp); + if (error != 0) + return (SET_ERROR(error)); + + VOP_UNLOCK(dvp, 0); + *ap->a_vpp = dvp; + *ap->a_buflen -= sizeof (dotzfs_name); + bcopy(dotzfs_name, ap->a_buf + *ap->a_buflen, sizeof (dotzfs_name)); + return (0); +} + +static int +zfsctl_common_pathconf(ap) + struct vop_pathconf_args /* { + struct vnode *a_vp; + int a_name; + int *a_retval; + } */ *ap; +{ + /* + * We care about ACL variables so that user land utilities like ls + * can display them correctly. Since the ctldir's st_dev is set to be + * the same as the parent dataset, we must support all variables that + * it supports. + */ + switch (ap->a_name) { + case _PC_LINK_MAX: + *ap->a_retval = MIN(LONG_MAX, ZFS_LINK_MAX); + return (0); + + case _PC_FILESIZEBITS: + *ap->a_retval = 64; + return (0); + + case _PC_MIN_HOLE_SIZE: + *ap->a_retval = (int)SPA_MINBLOCKSIZE; + return (0); + + case _PC_ACL_NFS4: + *ap->a_retval = 1; + return (0); + + case _PC_ACL_PATH_MAX: + *ap->a_retval = ACL_MAX_ENTRIES; + return (0); + + case _PC_NAME_MAX: + *ap->a_retval = NAME_MAX; + return (0); + + default: + return (vop_stdpathconf(ap)); + } +} + +/** + * Returns a trivial ACL + */ +int +zfsctl_common_getacl(ap) + struct vop_getacl_args /* { + struct vnode *vp; + acl_type_t a_type; + struct acl *a_aclp; + struct ucred *cred; + struct thread *td; + } */ *ap; +{ + int i; + + if (ap->a_type != ACL_TYPE_NFS4) + return (EINVAL); + + acl_nfs4_sync_acl_from_mode(ap->a_aclp, zfsctl_ctldir_mode, 0); + /* + * acl_nfs4_sync_acl_from_mode assumes that the owner can always modify + * attributes. That is not the case for the ctldir, so we must clear + * those bits. We also must clear ACL_READ_NAMED_ATTRS, because xattrs + * aren't supported by the ctldir. + */ + for (i = 0; i < ap->a_aclp->acl_cnt; i++) { + struct acl_entry *entry; + entry = &(ap->a_aclp->acl_entry[i]); + uint32_t old_perm = entry->ae_perm; + entry->ae_perm &= ~(ACL_WRITE_ACL | ACL_WRITE_OWNER | + ACL_WRITE_ATTRIBUTES | ACL_WRITE_NAMED_ATTRS | + ACL_READ_NAMED_ATTRS ); + } + + return (0); +} + +static struct vop_vector zfsctl_ops_root = { + .vop_default = &default_vnodeops, + .vop_open = zfsctl_common_open, + .vop_close = zfsctl_common_close, + .vop_ioctl = VOP_EINVAL, + .vop_getattr = zfsctl_root_getattr, + .vop_access = zfsctl_common_access, + .vop_readdir = zfsctl_root_readdir, + .vop_lookup = zfsctl_root_lookup, + .vop_inactive = VOP_NULL, + .vop_reclaim = zfsctl_common_reclaim, + .vop_fid = zfsctl_common_fid, + .vop_print = zfsctl_common_print, + .vop_vptocnp = zfsctl_root_vptocnp, + .vop_pathconf = zfsctl_common_pathconf, + .vop_getacl = zfsctl_common_getacl, +}; + +static int +zfsctl_snapshot_zname(vnode_t *vp, const char *name, int len, char *zname) +{ + objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os; + + dmu_objset_name(os, zname); + if (strlen(zname) + 1 + strlen(name) >= len) + return (SET_ERROR(ENAMETOOLONG)); + (void) strcat(zname, "@"); + (void) strcat(zname, name); + return (0); +} + +static int +zfsctl_snapshot_lookup(vnode_t *vp, const char *name, uint64_t *id) +{ + objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os; + int err; + + err = dsl_dataset_snap_lookup(dmu_objset_ds(os), name, id); + return (err); +} + +/* + * Given a vnode get a root vnode of a filesystem mounted on top of + * the vnode, if any. The root vnode is referenced and locked. + * If no filesystem is mounted then the orinal vnode remains referenced + * and locked. 
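+ * (The no-filesystem case is signalled to the caller by
+ * EJUSTRETURN.)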
If any error happens the orinal vnode is unlocked and + * released. + */ +static int +zfsctl_mounted_here(vnode_t **vpp, int flags) +{ + struct mount *mp; + int err; + + ASSERT_VOP_LOCKED(*vpp, __func__); + ASSERT3S((*vpp)->v_type, ==, VDIR); + + if ((mp = (*vpp)->v_mountedhere) != NULL) { + err = vfs_busy(mp, 0); + KASSERT(err == 0, ("vfs_busy(mp, 0) failed with %d", err)); + KASSERT(vrefcnt(*vpp) > 1, ("unreferenced mountpoint")); + vput(*vpp); + err = VFS_ROOT(mp, flags, vpp); + vfs_unbusy(mp); + return (err); + } + return (EJUSTRETURN); +} + +typedef struct { + const char *snap_name; + uint64_t snap_id; +} snapshot_setup_arg_t; + +static void +zfsctl_snapshot_vnode_setup(vnode_t *vp, void *arg) +{ + snapshot_setup_arg_t *ssa = arg; + sfs_node_t *node; + + ASSERT_VOP_ELOCKED(vp, __func__); + + node = sfs_alloc_node(sizeof(sfs_node_t), + ssa->snap_name, ZFSCTL_INO_SNAPDIR, ssa->snap_id); + zfsctl_common_vnode_setup(vp, node); + + /* We have to support recursive locking. */ + VN_LOCK_AREC(vp); +} + +/* + * Lookup entry point for the 'snapshot' directory. Try to open the + * snapshot if it exist, creating the pseudo filesystem vnode as necessary. + * Perform a mount of the associated dataset on top of the vnode. + * There are four possibilities: + * - the snapshot node and vnode do not exist + * - the snapshot vnode is covered by the mounted snapshot + * - the snapshot vnode is not covered yet, the mount operation is in progress + * - the snapshot vnode is not covered, because the snapshot has been unmounted + * The last two states are transient and should be relatively short-lived. + */ +int +zfsctl_snapdir_lookup(ap) + struct vop_lookup_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + } */ *ap; +{ + vnode_t *dvp = ap->a_dvp; + vnode_t **vpp = ap->a_vpp; + struct componentname *cnp = ap->a_cnp; + char name[NAME_MAX + 1]; + char fullname[ZFS_MAX_DATASET_NAME_LEN]; + char *mountpoint; + size_t mountpoint_len; + zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data; + uint64_t snap_id; + int nameiop = cnp->cn_nameiop; + int lkflags = cnp->cn_lkflags; + int flags = cnp->cn_flags; + int err; + + ASSERT(dvp->v_type == VDIR); + + if ((flags & ISLASTCN) != 0 && nameiop != LOOKUP) + return (SET_ERROR(ENOTSUP)); + + if (cnp->cn_namelen == 1 && *cnp->cn_nameptr == '.') { + err = zfsctl_relock_dot(dvp, lkflags & LK_TYPE_MASK); + if (err == 0) + *vpp = dvp; + return (err); + } + if (flags & ISDOTDOT) { + err = vn_vget_ino_gen(dvp, zfsctl_root_vnode, NULL, lkflags, + vpp); + return (err); + } + + if (cnp->cn_namelen >= sizeof(name)) + return (SET_ERROR(ENAMETOOLONG)); + + strlcpy(name, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen + 1); + err = zfsctl_snapshot_lookup(dvp, name, &snap_id); + if (err != 0) + return (SET_ERROR(ENOENT)); + + for (;;) { + snapshot_setup_arg_t ssa; + + ssa.snap_name = name; + ssa.snap_id = snap_id; + err = sfs_vgetx(dvp->v_mount, LK_SHARED, ZFSCTL_INO_SNAPDIR, + snap_id, "zfs", &zfsctl_ops_snapshot, + zfsctl_snapshot_vnode_setup, &ssa, vpp); + if (err != 0) + return (err); + + /* Check if a new vnode has just been created. */ + if (VOP_ISLOCKED(*vpp) == LK_EXCLUSIVE) + break; + + /* + * Check if a snapshot is already mounted on top of the vnode. + */ + err = zfsctl_mounted_here(vpp, lkflags); + if (err != EJUSTRETURN) + return (err); + + /* + * If the vnode is not covered, then either the mount operation + * is in progress or the snapshot has already been unmounted + * but the vnode hasn't been inactivated and reclaimed yet. 
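+ * (The VI_MOUNT check below tells these two cases apart: the
+ * flag is set on the covered vnode while a mount is in flight.)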
+ * We can try to re-use the vnode in the latter case. + */ + VI_LOCK(*vpp); + if (((*vpp)->v_iflag & VI_MOUNT) == 0) { + /* Upgrade to exclusive lock in order to: + * - avoid race conditions + * - satisfy the contract of mount_snapshot() + */ + err = VOP_LOCK(*vpp, LK_TRYUPGRADE | LK_INTERLOCK); + if (err == 0) + break; + } else { + VI_UNLOCK(*vpp); + } + + /* + * In this state we can loop on uncontested locks and starve + * the thread doing the lengthy, non-trivial mount operation. + * So, yield to prevent that from happening. + */ + vput(*vpp); + kern_yield(PRI_USER); + } + + VERIFY0(zfsctl_snapshot_zname(dvp, name, sizeof(fullname), fullname)); + + mountpoint_len = strlen(dvp->v_vfsp->mnt_stat.f_mntonname) + + strlen("/" ZFS_CTLDIR_NAME "/snapshot/") + strlen(name) + 1; + mountpoint = kmem_alloc(mountpoint_len, KM_SLEEP); + (void) snprintf(mountpoint, mountpoint_len, + "%s/" ZFS_CTLDIR_NAME "/snapshot/%s", + dvp->v_vfsp->mnt_stat.f_mntonname, name); + + err = mount_snapshot(curthread, vpp, "zfs", mountpoint, fullname, 0); + kmem_free(mountpoint, mountpoint_len); + if (err == 0) { + /* + * Fix up the root vnode mounted on .zfs/snapshot/<snapname>. + * + * This is where we lie about our v_vfsp in order to + * make .zfs/snapshot/<snapname> accessible over NFS + * without requiring manual mounts of <snapname>. + */ + ASSERT(VTOZ(*vpp)->z_zfsvfs != zfsvfs); + VTOZ(*vpp)->z_zfsvfs->z_parent = zfsvfs; + + /* Clear the root flag (set via VFS_ROOT) as well. */ + (*vpp)->v_vflag &= ~VV_ROOT; + } + + if (err != 0) + *vpp = NULL; + return (err); +} + +static int +zfsctl_snapdir_readdir(ap) + struct vop_readdir_args /* { + struct vnode *a_vp; + struct uio *a_uio; + struct ucred *a_cred; + int *a_eofflag; + int *ncookies; + u_long **a_cookies; + } */ *ap; +{ + char snapname[ZFS_MAX_DATASET_NAME_LEN]; + struct dirent entry; + vnode_t *vp = ap->a_vp; + zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; + uio_t *uio = ap->a_uio; + int *eofp = ap->a_eofflag; + off_t dots_offset; + int error; + + ASSERT(vp->v_type == VDIR); + + error = sfs_readdir_common(ZFSCTL_INO_ROOT, ZFSCTL_INO_SNAPDIR, ap, uio, + &dots_offset); + if (error != 0) { + if (error == ENAMETOOLONG) /* ran out of destination space */ + error = 0; + return (error); + } + + ZFS_ENTER(zfsvfs); + for (;;) { + uint64_t cookie; + uint64_t id; + + cookie = uio->uio_offset - dots_offset; + + dsl_pool_config_enter(dmu_objset_pool(zfsvfs->z_os), FTAG); + error = dmu_snapshot_list_next(zfsvfs->z_os, sizeof(snapname), + snapname, &id, &cookie, NULL); + dsl_pool_config_exit(dmu_objset_pool(zfsvfs->z_os), FTAG); + if (error != 0) { + if (error == ENOENT) { + if (eofp != NULL) + *eofp = 1; + error = 0; + } + ZFS_EXIT(zfsvfs); + return (error); + } + + entry.d_fileno = id; + entry.d_type = DT_DIR; + strcpy(entry.d_name, snapname); + entry.d_namlen = strlen(entry.d_name); + entry.d_reclen = sizeof(entry); + /* NOTE: d_off is the offset for the *next* entry. 
*/ + entry.d_off = cookie + dots_offset; + dirent_terminate(&entry); + error = vfs_read_dirent(ap, &entry, uio->uio_offset); + if (error != 0) { + if (error == ENAMETOOLONG) + error = 0; + ZFS_EXIT(zfsvfs); + return (SET_ERROR(error)); + } + uio->uio_offset = cookie + dots_offset; + } + /* NOTREACHED */ +} + +static int +zfsctl_snapdir_getattr(ap) + struct vop_getattr_args /* { + struct vnode *a_vp; + struct vattr *a_vap; + struct ucred *a_cred; + } */ *ap; +{ + vnode_t *vp = ap->a_vp; + vattr_t *vap = ap->a_vap; + zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; + dsl_dataset_t *ds = dmu_objset_ds(zfsvfs->z_os); + sfs_node_t *node = vp->v_data; + uint64_t snap_count; + int err; + + ZFS_ENTER(zfsvfs); + zfsctl_common_getattr(vp, vap); + vap->va_ctime = dmu_objset_snap_cmtime(zfsvfs->z_os); + vap->va_mtime = vap->va_ctime; + vap->va_birthtime = vap->va_ctime; + if (dsl_dataset_phys(ds)->ds_snapnames_zapobj != 0) { + err = zap_count(dmu_objset_pool(ds->ds_objset)->dp_meta_objset, + dsl_dataset_phys(ds)->ds_snapnames_zapobj, &snap_count); + if (err != 0) { + ZFS_EXIT(zfsvfs); + return (err); + } + vap->va_nlink += snap_count; + } + vap->va_size = vap->va_nlink; + + ZFS_EXIT(zfsvfs); + return (0); +} + +static struct vop_vector zfsctl_ops_snapdir = { + .vop_default = &default_vnodeops, + .vop_open = zfsctl_common_open, + .vop_close = zfsctl_common_close, + .vop_getattr = zfsctl_snapdir_getattr, + .vop_access = zfsctl_common_access, + .vop_readdir = zfsctl_snapdir_readdir, + .vop_lookup = zfsctl_snapdir_lookup, + .vop_reclaim = zfsctl_common_reclaim, + .vop_fid = zfsctl_common_fid, + .vop_print = zfsctl_common_print, + .vop_pathconf = zfsctl_common_pathconf, + .vop_getacl = zfsctl_common_getacl, +}; + +static int +zfsctl_snapshot_inactive(ap) + struct vop_inactive_args /* { + struct vnode *a_vp; + struct thread *a_td; + } */ *ap; +{ + vnode_t *vp = ap->a_vp; + + VERIFY(vrecycle(vp) == 1); + return (0); +} + +static int +zfsctl_snapshot_reclaim(ap) + struct vop_reclaim_args /* { + struct vnode *a_vp; + struct thread *a_td; + } */ *ap; +{ + vnode_t *vp = ap->a_vp; + void *data = vp->v_data; + + sfs_reclaim_vnode(vp); + sfs_destroy_node(data); + return (0); +} + +static int +zfsctl_snapshot_vptocnp(struct vop_vptocnp_args *ap) +{ + struct mount *mp; + vnode_t *dvp; + vnode_t *vp; + sfs_node_t *node; + size_t len; + int locked; + int error; + + vp = ap->a_vp; + node = vp->v_data; + len = strlen(node->sn_name); + if (*ap->a_buflen < len) + return (SET_ERROR(ENOMEM)); + + /* + * Prevent unmounting of the snapshot while the vnode lock + * is not held. That is not strictly required, but allows + * us to assert that an uncovered snapshot vnode is never + * "leaked". + */ + mp = vp->v_mountedhere; + if (mp == NULL) + return (SET_ERROR(ENOENT)); + error = vfs_busy(mp, 0); + KASSERT(error == 0, ("vfs_busy(mp, 0) failed with %d", error)); + + /* + * We can vput the vnode as we can now depend on the reference owned + * by the busied mp. But we also need to hold the vnode, because + * the reference may go after vfs_unbusy() which has to be called + * before we can lock the vnode again. + */ + locked = VOP_ISLOCKED(vp); + vhold(vp); + vput(vp); + + /* Look up .zfs/snapshot, our parent. 
*/ + error = zfsctl_snapdir_vnode(vp->v_mount, NULL, LK_SHARED, &dvp); + if (error == 0) { + VOP_UNLOCK(dvp, 0); + *ap->a_vpp = dvp; + *ap->a_buflen -= len; + bcopy(node->sn_name, ap->a_buf + *ap->a_buflen, len); + } + vfs_unbusy(mp); + vget(vp, locked | LK_VNHELD | LK_RETRY, curthread); + return (error); +} + +/* + * These VP's should never see the light of day. They should always + * be covered. + */ +static struct vop_vector zfsctl_ops_snapshot = { + .vop_default = NULL, /* ensure very restricted access */ + .vop_inactive = zfsctl_snapshot_inactive, + .vop_reclaim = zfsctl_snapshot_reclaim, + .vop_vptocnp = zfsctl_snapshot_vptocnp, + .vop_lock1 = vop_stdlock, + .vop_unlock = vop_stdunlock, + .vop_islocked = vop_stdislocked, + .vop_advlockpurge = vop_stdadvlockpurge, /* called by vgone */ + .vop_print = zfsctl_common_print, +}; + +int +zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp) +{ + struct mount *mp; + zfsvfs_t *zfsvfs = vfsp->vfs_data; + vnode_t *vp; + int error; + + ASSERT(zfsvfs->z_ctldir != NULL); + *zfsvfsp = NULL; + error = sfs_vnode_get(vfsp, LK_EXCLUSIVE, + ZFSCTL_INO_SNAPDIR, objsetid, &vp); + if (error == 0 && vp != NULL) { + /* + * XXX Probably need to at least reference, if not busy, the mp. + */ + if (vp->v_mountedhere != NULL) + *zfsvfsp = vp->v_mountedhere->mnt_data; + vput(vp); + } + if (*zfsvfsp == NULL) + return (SET_ERROR(EINVAL)); + return (0); +} + +/* + * Unmount any snapshots for the given filesystem. This is called from + * zfs_umount() - if we have a ctldir, then go through and unmount all the + * snapshots. + */ +int +zfsctl_umount_snapshots(vfs_t *vfsp, int fflags, cred_t *cr) +{ + char snapname[ZFS_MAX_DATASET_NAME_LEN]; + zfsvfs_t *zfsvfs = vfsp->vfs_data; + struct mount *mp; + vnode_t *dvp; + vnode_t *vp; + sfs_node_t *node; + sfs_node_t *snap; + uint64_t cookie; + int error; + + ASSERT(zfsvfs->z_ctldir != NULL); + + cookie = 0; + for (;;) { + uint64_t id; + + dsl_pool_config_enter(dmu_objset_pool(zfsvfs->z_os), FTAG); + error = dmu_snapshot_list_next(zfsvfs->z_os, sizeof(snapname), + snapname, &id, &cookie, NULL); + dsl_pool_config_exit(dmu_objset_pool(zfsvfs->z_os), FTAG); + if (error != 0) { + if (error == ENOENT) + error = 0; + break; + } + + for (;;) { + error = sfs_vnode_get(vfsp, LK_EXCLUSIVE, + ZFSCTL_INO_SNAPDIR, id, &vp); + if (error != 0 || vp == NULL) + break; + + mp = vp->v_mountedhere; + + /* + * v_mountedhere being NULL means that the + * (uncovered) vnode is in a transient state + * (mounting or unmounting), so loop until it + * settles down. + */ + if (mp != NULL) + break; + vput(vp); + } + if (error != 0) + break; + if (vp == NULL) + continue; /* no mountpoint, nothing to do */ + + /* + * The mount-point vnode is kept locked to avoid spurious EBUSY + * from a concurrent umount. + * The vnode lock must have recursive locking enabled. 
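+ * (Recursion is enabled in zfsctl_snapshot_vnode_setup() via
+ * VN_LOCK_AREC(); dounmount() needs to relock the covered
+ * vnode that we are still holding.)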
+ */ + vfs_ref(mp); + error = dounmount(mp, fflags, curthread); + KASSERT_IMPLY(error == 0, vrefcnt(vp) == 1, + ("extra references after unmount")); + vput(vp); + if (error != 0) + break; + } + KASSERT_IMPLY((fflags & MS_FORCE) != 0, error == 0, + ("force unmounting failed")); + return (error); +} + diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_debug.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_debug.c new file mode 100644 index 000000000000..a9cbe4dfe392 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_debug.c @@ -0,0 +1,112 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + */ + +#include <sys/zfs_context.h> + +list_t zfs_dbgmsgs; +int zfs_dbgmsg_size; +kmutex_t zfs_dbgmsgs_lock; +int zfs_dbgmsg_maxsize = 4<<20; /* 4MB */ + +void +zfs_dbgmsg_init(void) +{ + list_create(&zfs_dbgmsgs, sizeof (zfs_dbgmsg_t), + offsetof(zfs_dbgmsg_t, zdm_node)); + mutex_init(&zfs_dbgmsgs_lock, NULL, MUTEX_DEFAULT, NULL); +} + +void +zfs_dbgmsg_fini(void) +{ + zfs_dbgmsg_t *zdm; + + while ((zdm = list_remove_head(&zfs_dbgmsgs)) != NULL) { + int size = sizeof (zfs_dbgmsg_t) + strlen(zdm->zdm_msg); + kmem_free(zdm, size); + zfs_dbgmsg_size -= size; + } + mutex_destroy(&zfs_dbgmsgs_lock); + ASSERT0(zfs_dbgmsg_size); +} + +/* + * Print these messages by running: + * echo ::zfs_dbgmsg | mdb -k + * + * Monitor these messages by running: + * dtrace -qn 'zfs-dbgmsg{printf("%s\n", stringof(arg0))}' + * + * When used with libzpool, monitor with: + * dtrace -qn 'zfs$pid::zfs_dbgmsg:probe1{printf("%s\n", copyinstr(arg1))}' + */ +void +zfs_dbgmsg(const char *fmt, ...) +{ + int size; + va_list adx; + zfs_dbgmsg_t *zdm; + + va_start(adx, fmt); + size = vsnprintf(NULL, 0, fmt, adx); + va_end(adx); + + /* + * There is one byte of string in sizeof (zfs_dbgmsg_t), used + * for the terminating null. 
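+ * Together with the length computed by the probing vsnprintf()
+ * above, the allocation below therefore provides exactly
+ * size + 1 bytes for the formatted string.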
+ */
+ zdm = kmem_alloc(sizeof (zfs_dbgmsg_t) + size, KM_SLEEP);
+ zdm->zdm_timestamp = gethrestime_sec();
+
+ va_start(adx, fmt);
+ (void) vsnprintf(zdm->zdm_msg, size + 1, fmt, adx);
+ va_end(adx);
+
+ DTRACE_PROBE1(zfs__dbgmsg, char *, zdm->zdm_msg);
+
+ mutex_enter(&zfs_dbgmsgs_lock);
+ list_insert_tail(&zfs_dbgmsgs, zdm);
+ zfs_dbgmsg_size += sizeof (zfs_dbgmsg_t) + size;
+ while (zfs_dbgmsg_size > zfs_dbgmsg_maxsize) {
+ zdm = list_remove_head(&zfs_dbgmsgs);
+ size = sizeof (zfs_dbgmsg_t) + strlen(zdm->zdm_msg);
+ kmem_free(zdm, size);
+ zfs_dbgmsg_size -= size;
+ }
+ mutex_exit(&zfs_dbgmsgs_lock);
+}
+
+void
+zfs_dbgmsg_print(const char *tag)
+{
+ zfs_dbgmsg_t *zdm;
+
+ (void) printf("ZFS_DBGMSG(%s):\n", tag);
+ mutex_enter(&zfs_dbgmsgs_lock);
+ for (zdm = list_head(&zfs_dbgmsgs); zdm;
+ zdm = list_next(&zfs_dbgmsgs, zdm))
+ (void) printf("%s\n", zdm->zdm_msg);
+ mutex_exit(&zfs_dbgmsgs_lock);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c
new file mode 100644
index 000000000000..775697867f62
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c
@@ -0,0 +1,967 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
+ * Copyright 2017 Nexenta Systems, Inc.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/resource.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/kmem.h>
+#include <sys/uio.h>
+#include <sys/cmn_err.h>
+#include <sys/errno.h>
+#include <sys/stat.h>
+#include <sys/unistd.h>
+#include <sys/sunddi.h>
+#include <sys/random.h>
+#include <sys/policy.h>
+#include <sys/kcondvar.h>
+#include <sys/callb.h>
+#include <sys/smp.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_acl.h>
+#include <sys/fs/zfs.h>
+#include <sys/zap.h>
+#include <sys/dmu.h>
+#include <sys/atomic.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zfs_fuid.h>
+#include <sys/sa.h>
+#include <sys/zfs_sa.h>
+#include <sys/dnlc.h>
+#include <sys/extdirent.h>
+
+/*
+ * zfs_match_find() is used by zfs_dirent_lookup() to perform zap lookups
+ * of names after deciding which is the appropriate lookup interface.
+ */
+static int
+zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, const char *name,
+ matchtype_t mt, uint64_t *zoid)
+{
+ int error;
+
+ if (zfsvfs->z_norm) {
+
+ /*
+ * In the non-mixed case we only expect there would ever
+ * be one match, but we need to use the normalizing lookup.
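+ * Either way the directory ZAP value is a single 8-byte
+ * integer: the object number with the entry's type packed
+ * into the top bits (see zfs_dirent()), which
+ * ZFS_DIRENT_OBJ() strips off below.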
+ */ + error = zap_lookup_norm(zfsvfs->z_os, dzp->z_id, name, 8, 1, + zoid, mt, NULL, 0, NULL); + } else { + error = zap_lookup(zfsvfs->z_os, dzp->z_id, name, 8, 1, zoid); + } + *zoid = ZFS_DIRENT_OBJ(*zoid); + + return (error); +} + +/* + * Look up a directory entry under a locked vnode. + * dvp being locked gives us a guarantee that there are no concurrent + * modification of the directory and, thus, if a node can be found in + * the directory, then it must not be unlinked. + * + * Input arguments: + * dzp - znode for directory + * name - name of entry to lock + * flag - ZNEW: if the entry already exists, fail with EEXIST. + * ZEXISTS: if the entry does not exist, fail with ENOENT. + * ZXATTR: we want dzp's xattr directory + * + * Output arguments: + * zpp - pointer to the znode for the entry (NULL if there isn't one) + * + * Return value: 0 on success or errno on failure. + * + * NOTE: Always checks for, and rejects, '.' and '..'. + */ +int +zfs_dirent_lookup(znode_t *dzp, const char *name, znode_t **zpp, int flag) +{ + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + matchtype_t mt = 0; + uint64_t zoid; + vnode_t *vp = NULL; + int error = 0; + + ASSERT_VOP_LOCKED(ZTOV(dzp), __func__); + + *zpp = NULL; + + /* + * Verify that we are not trying to lock '.', '..', or '.zfs' + */ + if (name[0] == '.' && + (name[1] == '\0' || (name[1] == '.' && name[2] == '\0')) || + zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0) + return (SET_ERROR(EEXIST)); + + /* + * Case sensitivity and normalization preferences are set when + * the file system is created. These are stored in the + * zfsvfs->z_case and zfsvfs->z_norm fields. These choices + * affect how we perform zap lookups. + * + * When matching we may need to normalize & change case according to + * FS settings. + * + * Note that a normalized match is necessary for a case insensitive + * filesystem when the lookup request is not exact because normalization + * can fold case independent of normalizing code point sequences. + * + * See the table above zfs_dropname(). + */ + if (zfsvfs->z_norm != 0) { + mt = MT_NORMALIZE; + + /* + * Determine if the match needs to honor the case specified in + * lookup, and if so keep track of that so that during + * normalization we don't fold case. + */ + if (zfsvfs->z_case == ZFS_CASE_MIXED) { + mt |= MT_MATCH_CASE; + } + } + + /* + * Only look in or update the DNLC if we are looking for the + * name on a file system that does not require normalization + * or case folding. We can also look there if we happen to be + * on a non-normalizing, mixed sensitivity file system IF we + * are looking for the exact name. + * + * NB: we do not need to worry about this flag for ZFS_CASE_SENSITIVE + * because in that case MT_EXACT and MT_FIRST should produce exactly + * the same result. + */ + + if (dzp->z_unlinked && !(flag & ZXATTR)) + return (ENOENT); + if (flag & ZXATTR) { + error = sa_lookup(dzp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &zoid, + sizeof (zoid)); + if (error == 0) + error = (zoid == 0 ? 
ENOENT : 0);
+ } else {
+ error = zfs_match_find(zfsvfs, dzp, name, mt, &zoid);
+ }
+ if (error) {
+ if (error != ENOENT || (flag & ZEXISTS)) {
+ return (error);
+ }
+ } else {
+ if (flag & ZNEW) {
+ return (SET_ERROR(EEXIST));
+ }
+ error = zfs_zget(zfsvfs, zoid, zpp);
+ if (error)
+ return (error);
+ ASSERT(!(*zpp)->z_unlinked);
+ }
+
+ return (0);
+}
+
+static int
+zfs_dd_lookup(znode_t *dzp, znode_t **zpp)
+{
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ znode_t *zp;
+ uint64_t parent;
+ int error;
+
+ ASSERT_VOP_LOCKED(ZTOV(dzp), __func__);
+ ASSERT(RRM_READ_HELD(&zfsvfs->z_teardown_lock));
+
+ if (dzp->z_unlinked)
+ return (ENOENT);
+
+ if ((error = sa_lookup(dzp->z_sa_hdl,
+ SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
+ return (error);
+
+ error = zfs_zget(zfsvfs, parent, &zp);
+ if (error == 0)
+ *zpp = zp;
+ return (error);
+}
+
+int
+zfs_dirlook(znode_t *dzp, const char *name, znode_t **zpp)
+{
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ znode_t *zp;
+ int error = 0;
+
+ ASSERT_VOP_LOCKED(ZTOV(dzp), __func__);
+ ASSERT(RRM_READ_HELD(&zfsvfs->z_teardown_lock));
+
+ if (dzp->z_unlinked)
+ return (SET_ERROR(ENOENT));
+
+ if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
+ *zpp = dzp;
+ } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
+ error = zfs_dd_lookup(dzp, zpp);
+ } else {
+ error = zfs_dirent_lookup(dzp, name, &zp, ZEXISTS);
+ if (error == 0) {
+ dzp->z_zn_prefetch = B_TRUE; /* enable prefetching */
+ *zpp = zp;
+ }
+ }
+ return (error);
+}
+
+/*
+ * Unlinked set (formerly known as the "delete queue") error handling
+ *
+ * When dealing with the unlinked set, we dmu_tx_hold_zap(), but we
+ * don't specify the name of the entry that we will be manipulating. We
+ * also fib and say that we won't be adding any new entries to the
+ * unlinked set, even though we might (this is to lower the minimum file
+ * size that can be deleted in a full filesystem). So on the small
+ * chance that the unlinked set is using a fat zap (i.e. has more than
+ * 2000 entries), we *may* not pre-read a block that's needed.
+ * Therefore it is remotely possible for some of the assertions
+ * regarding the unlinked set below to fail due to i/o error. On a
+ * nondebug system, this will result in the space being leaked.
+ */
+void
+zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+
+ ASSERT(zp->z_unlinked);
+ ASSERT(zp->z_links == 0);
+
+ VERIFY3U(0, ==,
+ zap_add_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx));
+}
+
+/*
+ * Clean up any znodes that had no links when we either crashed or
+ * (force) umounted the file system.
+ */
+void
+zfs_unlinked_drain(zfsvfs_t *zfsvfs)
+{
+ zap_cursor_t zc;
+ zap_attribute_t zap;
+ dmu_object_info_t doi;
+ znode_t *zp;
+ dmu_tx_t *tx;
+ int error;
+
+ /*
+ * Iterate over the contents of the unlinked set.
+ */
+ for (zap_cursor_init(&zc, zfsvfs->z_os, zfsvfs->z_unlinkedobj);
+ zap_cursor_retrieve(&zc, &zap) == 0;
+ zap_cursor_advance(&zc)) {
+
+ /*
+ * See what kind of object we have in the list
+ */
+
+ error = dmu_object_info(zfsvfs->z_os,
+ zap.za_first_integer, &doi);
+ if (error != 0)
+ continue;
+
+ ASSERT((doi.doi_type == DMU_OT_PLAIN_FILE_CONTENTS) ||
+ (doi.doi_type == DMU_OT_DIRECTORY_CONTENTS));
+ /*
+ * We need to re-mark these list entries for deletion,
+ * so we pull them back into core and set zp->z_unlinked.
+ */
+ error = zfs_zget(zfsvfs, zap.za_first_integer, &zp);
+
+ /*
+ * We may pick up znodes that are already marked for deletion.
+ * This could happen during the purge of an extended attribute + * directory. All we need to do is skip over them, since they + * are already in the system marked z_unlinked. + */ + if (error != 0) + continue; + + vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_RETRY); +#if defined(__FreeBSD__) + /* + * Due to changes in zfs_rmnode we need to make sure the + * link count is set to zero here. + */ + if (zp->z_links != 0) { + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error != 0) { + dmu_tx_abort(tx); + vput(ZTOV(zp)); + continue; + } + zp->z_links = 0; + VERIFY0(sa_update(zp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs), + &zp->z_links, sizeof (zp->z_links), tx)); + dmu_tx_commit(tx); + } +#endif + zp->z_unlinked = B_TRUE; + vput(ZTOV(zp)); + } + zap_cursor_fini(&zc); +} + +/* + * Delete the entire contents of a directory. Return a count + * of the number of entries that could not be deleted. If we encounter + * an error, return a count of at least one so that the directory stays + * in the unlinked set. + * + * NOTE: this function assumes that the directory is inactive, + * so there is no need to lock its entries before deletion. + * Also, it assumes the directory contents is *only* regular + * files. + */ +static int +zfs_purgedir(znode_t *dzp) +{ + zap_cursor_t zc; + zap_attribute_t zap; + znode_t *xzp; + dmu_tx_t *tx; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + int skipped = 0; + int error; + + for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id); + (error = zap_cursor_retrieve(&zc, &zap)) == 0; + zap_cursor_advance(&zc)) { + error = zfs_zget(zfsvfs, + ZFS_DIRENT_OBJ(zap.za_first_integer), &xzp); + if (error) { + skipped += 1; + continue; + } + + vn_lock(ZTOV(xzp), LK_EXCLUSIVE | LK_RETRY); + ASSERT((ZTOV(xzp)->v_type == VREG) || + (ZTOV(xzp)->v_type == VLNK)); + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); + dmu_tx_hold_zap(tx, dzp->z_id, FALSE, zap.za_name); + dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); + /* Is this really needed ? */ + zfs_sa_upgrade_txholds(tx, xzp); + dmu_tx_mark_netfree(tx); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + vput(ZTOV(xzp)); + skipped += 1; + continue; + } + + error = zfs_link_destroy(dzp, zap.za_name, xzp, tx, 0, NULL); + if (error) + skipped += 1; + dmu_tx_commit(tx); + + vput(ZTOV(xzp)); + } + zap_cursor_fini(&zc); + if (error != ENOENT) + skipped += 1; + return (skipped); +} + +#if defined(__FreeBSD__) +extern taskq_t *zfsvfs_taskq; +#endif + +void +zfs_rmnode(znode_t *zp) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + objset_t *os = zfsvfs->z_os; + dmu_tx_t *tx; + uint64_t acl_obj; + uint64_t xattr_obj; + int error; + + ASSERT(zp->z_links == 0); + ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); + + /* + * If this is an attribute directory, purge its contents. + */ + if (ZTOV(zp) != NULL && ZTOV(zp)->v_type == VDIR && + (zp->z_pflags & ZFS_XATTR)) { + if (zfs_purgedir(zp) != 0) { + /* + * Not enough space to delete some xattrs. + * Leave it in the unlinked set. + */ + zfs_znode_dmu_fini(zp); + zfs_znode_free(zp); + return; + } + } else { + /* + * Free up all the data in the file. We don't do this for + * XATTR directories because we need truncate and remove to be + * in the same tx, like in zfs_znode_delete(). Otherwise, if + * we crash here we'll end up with an inconsistent truncated + * zap object in the delete queue. Note a truncated file is + * harmless since it only contains user data. 
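+		 * (dmu_free_long_range() below frees the file data in chunks
+		 * spread over multiple transactions, which is why an unmount
+		 * can interrupt it partway through.)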
+ error = dmu_free_long_range(os, zp->z_id, 0, DMU_OBJECT_END); + if (error) { + /* + * Not enough space or we were interrupted by unmount. + * Leave the file in the unlinked set. + */ + zfs_znode_dmu_fini(zp); + zfs_znode_free(zp); + return; + } + } + + /* + * If the file has extended attributes, we're going to unlink + * the xattr dir. + */ + error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), + &xattr_obj, sizeof (xattr_obj)); + if (error) + xattr_obj = 0; + + acl_obj = zfs_external_acl(zp); + + /* + * Set up the final transaction. + */ + tx = dmu_tx_create(os); + dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END); + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); + if (xattr_obj) + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, TRUE, NULL); + if (acl_obj) + dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); + + zfs_sa_upgrade_txholds(tx, zp); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + /* + * Not enough space to delete the file. Leave it in the + * unlinked set, leaking it until the fs is remounted (at + * which point we'll call zfs_unlinked_drain() to process it). + */ + dmu_tx_abort(tx); + zfs_znode_dmu_fini(zp); + zfs_znode_free(zp); + return; + } + +#if defined(__FreeBSD__) + /* + * FreeBSD's implementation of zfs_zget requires a vnode to back it. + * This means that we could end up calling into getnewvnode while + * calling zfs_rmnode as a result of a prior call to getnewvnode + * trying to clear vnodes out of the cache. If this repeats, we can + * recurse deeply enough to overflow our stack. To avoid this, we + * avoid calling zfs_zget on the xattr znode and instead simply add + * it to the unlinked set and schedule a call to zfs_unlinked_drain. + */ + if (xattr_obj) { + /* Add extended attribute directory to the unlinked set. */ + VERIFY3U(0, ==, + zap_add_int(os, zfsvfs->z_unlinkedobj, xattr_obj, tx)); + } +#else + if (xzp) { + ASSERT(error == 0); + xzp->z_unlinked = B_TRUE; /* mark xzp for deletion */ + xzp->z_links = 0; /* no more links to it */ + VERIFY(0 == sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs), + &xzp->z_links, sizeof (xzp->z_links), tx)); + zfs_unlinked_add(xzp, tx); + } +#endif + + /* Remove this znode from the unlinked set */ + VERIFY3U(0, ==, + zap_remove_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx)); + + zfs_znode_delete(zp, tx); + + dmu_tx_commit(tx); + +#if defined(__FreeBSD__) + if (xattr_obj) { + /* + * We're using the FreeBSD taskqueue API here instead of + * the Solaris taskq API since the FreeBSD API allows for a + * task to be enqueued multiple times but executed once. + */ + taskqueue_enqueue(zfsvfs_taskq->tq_queue, + &zfsvfs->z_unlinked_drain_task); + } +#endif +} + +static uint64_t +zfs_dirent(znode_t *zp, uint64_t mode) +{ + uint64_t de = zp->z_id; + + if (zp->z_zfsvfs->z_version >= ZPL_VERSION_DIRENT_TYPE) + de |= IFTODT(mode) << 60; + return (de); +} + +/* + * Link zp into dzp. Can only fail if zp has been unlinked.
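zfs_dirent() just above packs the d_type hint into the top four bits of the 64-bit directory entry value when the ZPL version supports it, while the object number occupies the low bits (the ZFS_DIRENT_OBJ() macro masks to the low 48). A self-contained illustration of the arithmetic, using plain shifts and masks where the kernel uses the ZFS_DIRENT_OBJ()/ZFS_DIRENT_TYPE() macros:

    #include <stdint.h>

    /* Illustrative only; mirrors the dirent layout described above. */
    static uint64_t
    dirent_pack(uint64_t obj, unsigned dtype)
    {
        return (obj | ((uint64_t)dtype << 60));     /* type in bits 60-63 */
    }

    static unsigned
    dirent_type(uint64_t de)
    {
        return ((unsigned)(de >> 60) & 0xf);
    }

    static uint64_t
    dirent_obj(uint64_t de)
    {
        return (de & ((1ULL << 48) - 1));           /* object # in low 48 bits */
    }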
+ */ +int +zfs_link_create(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx, + int flag) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + vnode_t *vp = ZTOV(zp); + uint64_t value; + int zp_is_dir = (vp->v_type == VDIR); + sa_bulk_attr_t bulk[5]; + uint64_t mtime[2], ctime[2]; + int count = 0; + int error; + + ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__); + ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); +#ifdef __FreeBSD__ + if (zp_is_dir) { + if (dzp->z_links >= ZFS_LINK_MAX) + return (SET_ERROR(EMLINK)); + } +#endif + if (!(flag & ZRENAMING)) { + if (zp->z_unlinked) { /* no new links to unlinked zp */ + ASSERT(!(flag & (ZNEW | ZEXISTS))); + return (SET_ERROR(ENOENT)); + } +#ifdef __FreeBSD__ + if (zp->z_links >= ZFS_LINK_MAX - zp_is_dir) { + return (SET_ERROR(EMLINK)); + } +#endif + zp->z_links++; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, + &zp->z_links, sizeof (zp->z_links)); + + } else { + ASSERT(zp->z_unlinked == 0); + } + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, + &dzp->z_id, sizeof (dzp->z_id)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, sizeof (zp->z_pflags)); + + if (!(flag & ZNEW)) { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + ctime, sizeof (ctime)); + zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, + ctime, B_TRUE); + } + error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + ASSERT0(error); + + dzp->z_size++; + dzp->z_links += zp_is_dir; + count = 0; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, + &dzp->z_size, sizeof (dzp->z_size)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, + &dzp->z_links, sizeof (dzp->z_links)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, + mtime, sizeof (mtime)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + ctime, sizeof (ctime)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &dzp->z_pflags, sizeof (dzp->z_pflags)); + zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime, B_TRUE); + error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx); + ASSERT0(error); + + value = zfs_dirent(zp, zp->z_mode); + error = zap_add(zp->z_zfsvfs->z_os, dzp->z_id, name, + 8, 1, &value, tx); + VERIFY0(error); + + return (0); +} + +/* + * The match type in the code for this function should conform to: + * + * ------------------------------------------------------------------------ + * fs type | z_norm | lookup type | match type + * ---------|-------------|-------------|---------------------------------- + * CS !norm | 0 | 0 | 0 (exact) + * CS norm | formX | 0 | MT_NORMALIZE + * CI !norm | upper | !ZCIEXACT | MT_NORMALIZE + * CI !norm | upper | ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE + * CI norm | upper|formX | !ZCIEXACT | MT_NORMALIZE + * CI norm | upper|formX | ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE + * CM !norm | upper | !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE + * CM !norm | upper | ZCILOOK | MT_NORMALIZE + * CM norm | upper|formX | !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE + * CM norm | upper|formX | ZCILOOK | MT_NORMALIZE + * + * Abbreviations: + * CS = Case Sensitive, CI = Case Insensitive, CM = Case Mixed + * upper = case folding set by fs type on creation (U8_TEXTPREP_TOUPPER) + * formX = unicode normalization form set on fs creation + */ +static int +zfs_dropname(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx, + int flag) +{ + int error; + + if (zp->z_zfsvfs->z_norm) { + matchtype_t mt = MT_NORMALIZE; + + if (zp->z_zfsvfs->z_case == ZFS_CASE_MIXED) { + mt |= MT_MATCH_CASE; + } + + error = 
zap_remove_norm(zp->z_zfsvfs->z_os, dzp->z_id, + name, mt, tx); + } else { + error = zap_remove(zp->z_zfsvfs->z_os, dzp->z_id, name, tx); + } + + return (error); +} + +/* + * Unlink zp from dzp, and mark zp for deletion if this was the last link. + * Can fail if zp is a mount point (EBUSY) or a non-empty directory (EEXIST). + * If 'unlinkedp' is NULL, we put unlinked znodes on the unlinked list. + * If it's non-NULL, we use it to indicate whether the znode needs deletion, + * and it's the caller's job to do it. + */ +int +zfs_link_destroy(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx, + int flag, boolean_t *unlinkedp) +{ + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + vnode_t *vp = ZTOV(zp); + int zp_is_dir = (vp->v_type == VDIR); + boolean_t unlinked = B_FALSE; + sa_bulk_attr_t bulk[5]; + uint64_t mtime[2], ctime[2]; + int count = 0; + int error; + + ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__); + ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); + + if (!(flag & ZRENAMING)) { + + if (zp_is_dir && !zfs_dirempty(zp)) { +#ifdef illumos + return (SET_ERROR(EEXIST)); +#else + return (SET_ERROR(ENOTEMPTY)); +#endif + } + + /* + * If we get here, we are going to try to remove the object. + * First try removing the name from the directory; if that + * fails, return the error. + */ + error = zfs_dropname(dzp, name, zp, tx, flag); + if (error != 0) { + return (error); + } + + if (zp->z_links <= zp_is_dir) { + zfs_panic_recover("zfs: link count on vnode %p is %u, " + "should be at least %u", zp->z_vnode, + (int)zp->z_links, + zp_is_dir + 1); + zp->z_links = zp_is_dir + 1; + } + if (--zp->z_links == zp_is_dir) { + zp->z_unlinked = B_TRUE; + zp->z_links = 0; + unlinked = B_TRUE; + } else { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), + NULL, &ctime, sizeof (ctime)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), + NULL, &zp->z_pflags, sizeof (zp->z_pflags)); + zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime, + B_TRUE); + } + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), + NULL, &zp->z_links, sizeof (zp->z_links)); + error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + count = 0; + ASSERT0(error); + } else { + ASSERT(zp->z_unlinked == 0); + error = zfs_dropname(dzp, name, zp, tx, flag); + if (error != 0) + return (error); + } + + dzp->z_size--; /* one dirent removed */ + dzp->z_links -= zp_is_dir; /* ".." link from zp */ + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), + NULL, &dzp->z_links, sizeof (dzp->z_links)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), + NULL, &dzp->z_size, sizeof (dzp->z_size)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), + NULL, ctime, sizeof (ctime)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), + NULL, mtime, sizeof (mtime)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), + NULL, &dzp->z_pflags, sizeof (dzp->z_pflags)); + zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime, B_TRUE); + error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx); + ASSERT0(error); + + if (unlinkedp != NULL) + *unlinkedp = unlinked; + else if (unlinked) + zfs_unlinked_add(zp, tx); + + return (0); +} + +/* + * Indicate whether the directory is empty. 
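The 'unlinkedp' out-parameter documented above lets a caller defer the unlinked-set insertion and batch it with other work in the same transaction. A hypothetical caller-side sketch, inside an already-assigned tx:

    boolean_t unlinked;
    int error;

    /* Collect the decision instead of letting zfs_link_destroy() queue zp. */
    error = zfs_link_destroy(dzp, name, zp, tx, 0, &unlinked);
    if (error == 0 && unlinked)
        zfs_unlinked_add(zp, tx);   /* the caller's job when unlinkedp != NULL */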
+ */ +boolean_t +zfs_dirempty(znode_t *dzp) +{ + return (dzp->z_size == 2); +} + +int +zfs_make_xattrdir(znode_t *zp, vattr_t *vap, vnode_t **xvpp, cred_t *cr) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + znode_t *xzp; + dmu_tx_t *tx; + int error; + zfs_acl_ids_t acl_ids; + boolean_t fuid_dirtied; + uint64_t parent; + + *xvpp = NULL; + + /* + * In FreeBSD, access checking for creating an EA is being done + * in zfs_setextattr(). + */ +#ifndef __FreeBSD_kernel__ + if (error = zfs_zaccess(zp, ACE_WRITE_NAMED_ATTRS, 0, B_FALSE, cr)) + return (error); +#endif + + if ((error = zfs_acl_ids_create(zp, IS_XATTR, vap, cr, NULL, + &acl_ids)) != 0) + return (error); + if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { + zfs_acl_ids_free(&acl_ids); + return (SET_ERROR(EDQUOT)); + } + + getnewvnode_reserve(1); + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + + ZFS_SA_BASE_ATTR_SIZE); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + zfs_acl_ids_free(&acl_ids); + dmu_tx_abort(tx); + return (error); + } + zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, &acl_ids); + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + +#ifdef DEBUG + error = sa_lookup(xzp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), + &parent, sizeof (parent)); + ASSERT(error == 0 && parent == zp->z_id); +#endif + + VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xzp->z_id, + sizeof (xzp->z_id), tx)); + + (void) zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp, + xzp, "", NULL, acl_ids.z_fuidp, vap); + + zfs_acl_ids_free(&acl_ids); + dmu_tx_commit(tx); + + getnewvnode_drop_reserve(); + + *xvpp = ZTOV(xzp); + + return (0); +} + +/* + * Return the vnode of the extended attribute directory for zp. + * ** If the directory does not already exist, it is created ** + * + * IN: zp - znode to obtain attribute directory from + * cr - credentials of caller + * flags - flags from the VOP_LOOKUP call + * + * OUT: xvpp - pointer to extended attribute vnode + * + * RETURN: 0 on success + * error number on failure + */ +int +zfs_get_xattrdir(znode_t *zp, vnode_t **xvpp, cred_t *cr, int flags) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + znode_t *xzp; + vattr_t va; + int error; +top: + error = zfs_dirent_lookup(zp, "", &xzp, ZXATTR); + if (error) + return (error); + + if (xzp != NULL) { + *xvpp = ZTOV(xzp); + return (0); + } + + if (!(flags & CREATE_XATTR_DIR)) { +#ifdef illumos + return (SET_ERROR(ENOENT)); +#else + return (SET_ERROR(ENOATTR)); +#endif + } + + if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { + return (SET_ERROR(EROFS)); + } + + /* + * The ability to 'create' files in an attribute + * directory comes from the write_xattr permission on the base file. + * + * The ability to 'search' an attribute directory requires + * read_xattr permission on the base file. + * + * Once in a directory, the ability to read/write attributes + * is controlled by the permissions on the attribute file. + */ + va.va_mask = AT_TYPE | AT_MODE | AT_UID | AT_GID; + va.va_type = VDIR; + va.va_mode = S_IFDIR | S_ISVTX | 0777; + zfs_fuid_map_ids(zp, cr, &va.va_uid, &va.va_gid); + + error = zfs_make_xattrdir(zp, &va, xvpp, cr); + + if (error == ERESTART) { + /* NB: we already did dmu_tx_wait() if necessary */ + goto top; + } + if (error == 0) + VOP_UNLOCK(*xvpp, 0); + + return (error); +} + +/* + * Decide whether it is okay to remove within a sticky directory.
+ * + * In sticky directories, write access is not sufficient; + * you can remove entries from a directory only if: + * + * you own the directory, + * you own the entry, + * the entry is a plain file and you have write access, + * or you are privileged (checked in secpolicy...). + * + * The function returns 0 if remove access is granted. + */ +int +zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr) +{ + uid_t uid; + uid_t downer; + uid_t fowner; + zfsvfs_t *zfsvfs = zdp->z_zfsvfs; + + if (zdp->z_zfsvfs->z_replay) + return (0); + + if ((zdp->z_mode & S_ISVTX) == 0) + return (0); + + downer = zfs_fuid_map_id(zfsvfs, zdp->z_uid, cr, ZFS_OWNER); + fowner = zfs_fuid_map_id(zfsvfs, zp->z_uid, cr, ZFS_OWNER); + + if ((uid = crgetuid(cr)) == downer || uid == fowner || + (ZTOV(zp)->v_type == VREG && + zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr) == 0)) + return (0); + else + return (secpolicy_vnode_remove(ZTOV(zp), cr)); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c new file mode 100644 index 000000000000..398a3d04aa6e --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c @@ -0,0 +1,871 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + +#include <sys/spa.h> +#include <sys/spa_impl.h> +#include <sys/vdev.h> +#include <sys/vdev_impl.h> +#include <sys/zio.h> +#include <sys/zio_checksum.h> + +#include <sys/fm/fs/zfs.h> +#include <sys/fm/protocol.h> +#include <sys/fm/util.h> +#include <sys/sysevent.h> + +/* + * This general routine is responsible for generating all the different ZFS + * ereports. The payload is dependent on the class, and which arguments are + * supplied to the function: + * + * EREPORT POOL VDEV IO + * block X X X + * data X X + * device X X + * pool X + * + * If we are in a loading state, all errors are chained together by the same + * SPA-wide ENA (Error Numeric Association). + * + * For isolated I/O requests, we get the ENA from the zio_t. The propagation + * gets very complicated due to RAID-Z, gang blocks, and vdev caching. We want + * to chain together all ereports associated with a logical piece of data. For + * read I/Os, there are basically three 'types' of I/O, which form a roughly + * layered diagram: + * + * +---------------+ + * | Aggregate I/O | No associated logical data or device + * +---------------+ + * | + * V + * +---------------+ Reads associated with a piece of logical data. + * | Read I/O | This includes reads on behalf of RAID-Z, + * +---------------+ mirrors, gang blocks, retries, etc. 
+ * | + * V + * +---------------+ Reads associated with a particular device, but + * | Physical I/O | no logical data. Issued as part of vdev caching + * +---------------+ and I/O aggregation. + * + * Note that 'physical I/O' here is not the same terminology as used in the rest + * of ZIO. Typically, 'physical I/O' simply means that there is no attached + * block pointer. But I/O with no associated block pointer can still be related + * to a logical piece of data (i.e. RAID-Z requests). + * + * Purely physical I/Os always have unique ENAs. They are not related to a + * particular piece of logical data, and therefore cannot be chained together. + * We still generate an ereport, but the DE doesn't correlate it with any + * logical piece of data. When such an I/O fails, the delegated I/O requests + * will issue a retry, which will trigger the 'real' ereport with the correct + * ENA. + * + * We keep track of the ENA for a ZIO chain through the 'io_logical' member. + * When a new logical I/O is issued, we set this to point to itself. Child I/Os + * then inherit this pointer, so that once it is first set, subsequent failures + * will use the same ENA. For vdev cache fill and queue aggregation I/O, + * this pointer is set to NULL, and no ereport will be generated (since it + * doesn't actually correspond to any particular device or piece of data, + * and the caller will always retry without caching or queueing anyway). + * + * For checksum errors, we want to include more information about the actual + * error which occurs. Accordingly, we build an ereport when the error is + * noticed, but instead of sending it immediately, we hang it off of the + * io_cksum_report field of the logical I/O. When the logical I/O completes + * (successfully or not), zfs_ereport_finish_checksum() is called with the + * good and bad versions of the buffer (if available), and we annotate the + * ereport with information about the differences. + */ +#ifdef _KERNEL +static void +zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, + const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, + uint64_t stateoroffset, uint64_t size) +{ + nvlist_t *ereport, *detector; + + uint64_t ena; + char class[64]; + + /* + * If we are doing a spa_tryimport() or in recovery mode, + * ignore errors. + */ + if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT || + spa_load_state(spa) == SPA_LOAD_RECOVER) + return; + + /* + * If we are in the middle of opening a pool, and the previous attempt + * failed, don't bother logging any new ereports - we're just going to + * get the same diagnosis anyway. + */ + if (spa_load_state(spa) != SPA_LOAD_NONE && + spa->spa_last_open_failed) + return; + + if (zio != NULL) { + /* + * If this is not a read or write zio, ignore the error. This + * can occur if the DKIOCFLUSHWRITECACHE ioctl fails. + */ + if (zio->io_type != ZIO_TYPE_READ && + zio->io_type != ZIO_TYPE_WRITE) + return; + + /* + * Ignore any errors from speculative I/Os, as failure is an + * expected result. + */ + if (zio->io_flags & ZIO_FLAG_SPECULATIVE) + return; + + /* + * If this I/O is not a retry I/O, don't post an ereport. + * Otherwise, we risk making bad diagnoses based on B_FAILFAST + * I/Os. + */ + if (zio->io_error == EIO && + !(zio->io_flags & ZIO_FLAG_IO_RETRY)) + return; + + if (vd != NULL) { + /* + * If the vdev has already been marked as failing due + * to a failed probe, then ignore any subsequent I/O + * errors, as the DE will automatically fault the vdev + * on the first such failure.
This also catches cases + * where vdev_remove_wanted is set and the device has + * not yet been asynchronously placed into the REMOVED + * state. + */ + if (zio->io_vd == vd && !vdev_accessible(vd, zio)) + return; + + /* + * Ignore checksum errors for reads from DTL regions of + * leaf vdevs. + */ + if (zio->io_type == ZIO_TYPE_READ && + zio->io_error == ECKSUM && + vd->vdev_ops->vdev_op_leaf && + vdev_dtl_contains(vd, DTL_MISSING, zio->io_txg, 1)) + return; + } + } + + /* + * For probe failure, we want to avoid posting ereports if we've + * already removed the device in the meantime. + */ + if (vd != NULL && + strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) == 0 && + (vd->vdev_remove_wanted || vd->vdev_state == VDEV_STATE_REMOVED)) + return; + + if ((ereport = fm_nvlist_create(NULL)) == NULL) + return; + + if ((detector = fm_nvlist_create(NULL)) == NULL) { + fm_nvlist_destroy(ereport, FM_NVA_FREE); + return; + } + + /* + * Serialize ereport generation + */ + mutex_enter(&spa->spa_errlist_lock); + + /* + * Determine the ENA to use for this event. If we are in a loading + * state, use a SPA-wide ENA. Otherwise, if we are in an I/O state, use + * a root zio-wide ENA. Otherwise, simply use a unique ENA. + */ + if (spa_load_state(spa) != SPA_LOAD_NONE) { + if (spa->spa_ena == 0) + spa->spa_ena = fm_ena_generate(0, FM_ENA_FMT1); + ena = spa->spa_ena; + } else if (zio != NULL && zio->io_logical != NULL) { + if (zio->io_logical->io_ena == 0) + zio->io_logical->io_ena = + fm_ena_generate(0, FM_ENA_FMT1); + ena = zio->io_logical->io_ena; + } else { + ena = fm_ena_generate(0, FM_ENA_FMT1); + } + + /* + * Construct the full class, detector, and other standard FMA fields. + */ + (void) snprintf(class, sizeof (class), "%s.%s", + ZFS_ERROR_CLASS, subclass); + + fm_fmri_zfs_set(detector, FM_ZFS_SCHEME_VERSION, spa_guid(spa), + vd != NULL ? vd->vdev_guid : 0); + + fm_ereport_set(ereport, FM_EREPORT_VERSION, class, ena, detector, NULL); + + /* + * Construct the per-ereport payload, depending on which parameters are + * passed in. + */ + + /* + * Generic payload members common to all ereports. + */ + fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL, + DATA_TYPE_STRING, spa_name(spa), FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, + DATA_TYPE_UINT64, spa_guid(spa), + FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, DATA_TYPE_INT32, + spa_load_state(spa), NULL); + + if (spa != NULL) { + fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE, + DATA_TYPE_STRING, + spa_get_failmode(spa) == ZIO_FAILURE_MODE_WAIT ? + FM_EREPORT_FAILMODE_WAIT : + spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE ? 
+ FM_EREPORT_FAILMODE_CONTINUE : FM_EREPORT_FAILMODE_PANIC, + NULL); + } + + if (vd != NULL) { + vdev_t *pvd = vd->vdev_parent; + + fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, + DATA_TYPE_UINT64, vd->vdev_guid, + FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, + DATA_TYPE_STRING, vd->vdev_ops->vdev_op_type, NULL); + if (vd->vdev_path != NULL) + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH, + DATA_TYPE_STRING, vd->vdev_path, NULL); + if (vd->vdev_devid != NULL) + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID, + DATA_TYPE_STRING, vd->vdev_devid, NULL); + if (vd->vdev_fru != NULL) + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU, + DATA_TYPE_STRING, vd->vdev_fru, NULL); + + if (pvd != NULL) { + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID, + DATA_TYPE_UINT64, pvd->vdev_guid, + FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE, + DATA_TYPE_STRING, pvd->vdev_ops->vdev_op_type, + NULL); + if (pvd->vdev_path) + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH, + DATA_TYPE_STRING, pvd->vdev_path, NULL); + if (pvd->vdev_devid) + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID, + DATA_TYPE_STRING, pvd->vdev_devid, NULL); + } + } + + if (zio != NULL) { + /* + * Payload common to all I/Os. + */ + fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR, + DATA_TYPE_INT32, zio->io_error, NULL); + + /* + * If the 'size' parameter is non-zero, it indicates this is a + * RAID-Z or other I/O where the physical offset and length are + * provided for us, instead of within the zio_t. + */ + if (vd != NULL) { + if (size) + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET, + DATA_TYPE_UINT64, stateoroffset, + FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE, + DATA_TYPE_UINT64, size, NULL); + else + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET, + DATA_TYPE_UINT64, zio->io_offset, + FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE, + DATA_TYPE_UINT64, zio->io_size, NULL); + } + + /* + * Payload for I/Os with corresponding logical information. + */ + if (zio->io_logical != NULL) + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET, + DATA_TYPE_UINT64, + zio->io_logical->io_bookmark.zb_objset, + FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT, + DATA_TYPE_UINT64, + zio->io_logical->io_bookmark.zb_object, + FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL, + DATA_TYPE_INT64, + zio->io_logical->io_bookmark.zb_level, + FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID, + DATA_TYPE_UINT64, + zio->io_logical->io_bookmark.zb_blkid, NULL); + } else if (vd != NULL) { + /* + * If we have a vdev but no zio, this is a device fault, and the + * 'stateoroffset' parameter indicates the previous state of the + * vdev. + */ + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_PREV_STATE, + DATA_TYPE_UINT64, stateoroffset, NULL); + } + + mutex_exit(&spa->spa_errlist_lock); + + *ereport_out = ereport; + *detector_out = detector; +} + +/* if it's <= 128 bytes, save the corruption directly */ +#define ZFM_MAX_INLINE (128 / sizeof (uint64_t)) + +#define MAX_RANGES 16 + +typedef struct zfs_ecksum_info { + /* histograms of set and cleared bits by bit number in a 64-bit word */ + uint32_t zei_histogram_set[sizeof (uint64_t) * NBBY]; + uint32_t zei_histogram_cleared[sizeof (uint64_t) * NBBY]; + + /* inline arrays of bits set and cleared. */ + uint64_t zei_bits_set[ZFM_MAX_INLINE]; + uint64_t zei_bits_cleared[ZFM_MAX_INLINE]; + + /* + * for each range, the number of bits set and cleared. The Hamming + * distance between the good and bad buffers is the sum of them all. 
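Since the per-range set/clear counts sum to the Hamming distance, each 64-bit word contributes popcount(good XOR bad) changed bits. A standalone sketch of that arithmetic:

    #include <stdint.h>

    /* Hamming distance between two 64-bit words: popcount of their XOR. */
    static unsigned
    hamming64(uint64_t good, uint64_t bad)
    {
        uint64_t diff = good ^ bad;
        unsigned bits = 0;

        while (diff != 0) {
            diff &= diff - 1;   /* clear the lowest set bit */
            bits++;
        }
        return (bits);
    }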
+ */ + uint32_t zei_range_sets[MAX_RANGES]; + uint32_t zei_range_clears[MAX_RANGES]; + + struct zei_ranges { + uint32_t zr_start; + uint32_t zr_end; + } zei_ranges[MAX_RANGES]; + + size_t zei_range_count; + uint32_t zei_mingap; + uint32_t zei_allowed_mingap; + +} zfs_ecksum_info_t; + +static void +update_histogram(uint64_t value_arg, uint32_t *hist, uint32_t *count) +{ + size_t i; + size_t bits = 0; + uint64_t value = BE_64(value_arg); + + /* We store the bits in big-endian (largest-first) order */ + for (i = 0; i < 64; i++) { + if (value & (1ull << i)) { + hist[63 - i]++; + ++bits; + } + } + /* update the count of bits changed */ + *count += bits; +} + +/* + * We've now filled up the range array, and need to increase "mingap" and + * shrink the range list accordingly. zei_mingap is always the smallest + * distance between array entries, so we set the new_allowed_gap to be + * one greater than that. We then go through the list, joining together + * any ranges which are closer than the new_allowed_gap. + * + * By construction, there will be at least one. We also update zei_mingap + * to the new smallest gap, to prepare for our next invocation. + */ +static void +shrink_ranges(zfs_ecksum_info_t *eip) +{ + uint32_t mingap = UINT32_MAX; + uint32_t new_allowed_gap = eip->zei_mingap + 1; + + size_t idx, output; + size_t max = eip->zei_range_count; + + struct zei_ranges *r = eip->zei_ranges; + + ASSERT3U(eip->zei_range_count, >, 0); + ASSERT3U(eip->zei_range_count, <=, MAX_RANGES); + + output = idx = 0; + while (idx < max - 1) { + uint32_t start = r[idx].zr_start; + uint32_t end = r[idx].zr_end; + + while (idx < max - 1) { + idx++; + + uint32_t nstart = r[idx].zr_start; + uint32_t nend = r[idx].zr_end; + + uint32_t gap = nstart - end; + if (gap < new_allowed_gap) { + end = nend; + continue; + } + if (gap < mingap) + mingap = gap; + break; + } + r[output].zr_start = start; + r[output].zr_end = end; + output++; + } + ASSERT3U(output, <, eip->zei_range_count); + eip->zei_range_count = output; + eip->zei_mingap = mingap; + eip->zei_allowed_mingap = new_allowed_gap; +} + +static void +add_range(zfs_ecksum_info_t *eip, int start, int end) +{ + struct zei_ranges *r = eip->zei_ranges; + size_t count = eip->zei_range_count; + + if (count >= MAX_RANGES) { + shrink_ranges(eip); + count = eip->zei_range_count; + } + if (count == 0) { + eip->zei_mingap = UINT32_MAX; + eip->zei_allowed_mingap = 1; + } else { + int gap = start - r[count - 1].zr_end; + + if (gap < eip->zei_allowed_mingap) { + r[count - 1].zr_end = end; + return; + } + if (gap < eip->zei_mingap) + eip->zei_mingap = gap; + } + r[count].zr_start = start; + r[count].zr_end = end; + eip->zei_range_count++; +} + +static size_t +range_total_size(zfs_ecksum_info_t *eip) +{ + struct zei_ranges *r = eip->zei_ranges; + size_t count = eip->zei_range_count; + size_t result = 0; + size_t idx; + + for (idx = 0; idx < count; idx++) + result += (r[idx].zr_end - r[idx].zr_start); + + return (result); +} + +static zfs_ecksum_info_t * +annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info, + const uint8_t *goodbuf, const uint8_t *badbuf, size_t size, + boolean_t drop_if_identical) +{ + const uint64_t *good = (const uint64_t *)goodbuf; + const uint64_t *bad = (const uint64_t *)badbuf; + + uint64_t allset = 0; + uint64_t allcleared = 0; + + size_t nui64s = size / sizeof (uint64_t); + + size_t inline_size; + int no_inline = 0; + size_t idx; + size_t range; + + size_t offset = 0; + ssize_t start = -1; + + zfs_ecksum_info_t *eip = kmem_zalloc(sizeof (*eip), 
KM_SLEEP); + + /* don't do any annotation for injected checksum errors */ + if (info != NULL && info->zbc_injected) + return (eip); + + if (info != NULL && info->zbc_has_cksum) { + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED, + DATA_TYPE_UINT64_ARRAY, + sizeof (info->zbc_expected) / sizeof (uint64_t), + (uint64_t *)&info->zbc_expected, + FM_EREPORT_PAYLOAD_ZFS_CKSUM_ACTUAL, + DATA_TYPE_UINT64_ARRAY, + sizeof (info->zbc_actual) / sizeof (uint64_t), + (uint64_t *)&info->zbc_actual, + FM_EREPORT_PAYLOAD_ZFS_CKSUM_ALGO, + DATA_TYPE_STRING, + info->zbc_checksum_name, + NULL); + + if (info->zbc_byteswapped) { + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_CKSUM_BYTESWAP, + DATA_TYPE_BOOLEAN, 1, + NULL); + } + } + + if (badbuf == NULL || goodbuf == NULL) + return (eip); + + ASSERT3U(nui64s, <=, UINT32_MAX); + ASSERT3U(size, ==, nui64s * sizeof (uint64_t)); + ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); + ASSERT3U(size, <=, UINT32_MAX); + + /* build up the range list by comparing the two buffers. */ + for (idx = 0; idx < nui64s; idx++) { + if (good[idx] == bad[idx]) { + if (start == -1) + continue; + + add_range(eip, start, idx); + start = -1; + } else { + if (start != -1) + continue; + + start = idx; + } + } + if (start != -1) + add_range(eip, start, idx); + + /* See if it will fit in our inline buffers */ + inline_size = range_total_size(eip); + if (inline_size > ZFM_MAX_INLINE) + no_inline = 1; + + /* + * If there is no change and we want to drop if the buffers are + * identical, do so. + */ + if (inline_size == 0 && drop_if_identical) { + kmem_free(eip, sizeof (*eip)); + return (NULL); + } + + /* + * Now walk through the ranges, filling in the details of the + * differences. Also convert our uint64_t-array offsets to byte + * offsets. + */ + for (range = 0; range < eip->zei_range_count; range++) { + size_t start = eip->zei_ranges[range].zr_start; + size_t end = eip->zei_ranges[range].zr_end; + + for (idx = start; idx < end; idx++) { + uint64_t set, cleared; + + // bits set in bad, but not in good + set = ((~good[idx]) & bad[idx]); + // bits set in good, but not in bad + cleared = (good[idx] & (~bad[idx])); + + allset |= set; + allcleared |= cleared; + + if (!no_inline) { + ASSERT3U(offset, <, inline_size); + eip->zei_bits_set[offset] = set; + eip->zei_bits_cleared[offset] = cleared; + offset++; + } + + update_histogram(set, eip->zei_histogram_set, + &eip->zei_range_sets[range]); + update_histogram(cleared, eip->zei_histogram_cleared, + &eip->zei_range_clears[range]); + } + + /* convert to byte offsets */ + eip->zei_ranges[range].zr_start *= sizeof (uint64_t); + eip->zei_ranges[range].zr_end *= sizeof (uint64_t); + } + eip->zei_allowed_mingap *= sizeof (uint64_t); + inline_size *= sizeof (uint64_t); + + /* fill in ereport */ + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_BAD_OFFSET_RANGES, + DATA_TYPE_UINT32_ARRAY, 2 * eip->zei_range_count, + (uint32_t *)eip->zei_ranges, + FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_MIN_GAP, + DATA_TYPE_UINT32, eip->zei_allowed_mingap, + FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_SETS, + DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_sets, + FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_CLEARS, + DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_clears, + NULL); + + if (!no_inline) { + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_BAD_SET_BITS, + DATA_TYPE_UINT8_ARRAY, + inline_size, (uint8_t *)eip->zei_bits_set, + FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_BITS, + DATA_TYPE_UINT8_ARRAY, + inline_size, (uint8_t *)eip->zei_bits_cleared, + NULL); + } else 
{ + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_BAD_SET_HISTOGRAM, + DATA_TYPE_UINT32_ARRAY, + NBBY * sizeof (uint64_t), eip->zei_histogram_set, + FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_HISTOGRAM, + DATA_TYPE_UINT32_ARRAY, + NBBY * sizeof (uint64_t), eip->zei_histogram_cleared, + NULL); + } + return (eip); +} +#endif + +void +zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, + uint64_t stateoroffset, uint64_t size) +{ +#ifdef _KERNEL + nvlist_t *ereport = NULL; + nvlist_t *detector = NULL; + + zfs_ereport_start(&ereport, &detector, + subclass, spa, vd, zio, stateoroffset, size); + + if (ereport == NULL) + return; + + fm_ereport_post(ereport, EVCH_SLEEP); + + fm_nvlist_destroy(ereport, FM_NVA_FREE); + fm_nvlist_destroy(detector, FM_NVA_FREE); +#endif +} + +void +zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, + struct zio *zio, uint64_t offset, uint64_t length, void *arg, + zio_bad_cksum_t *info) +{ + zio_cksum_report_t *report = kmem_zalloc(sizeof (*report), KM_SLEEP); + + if (zio->io_vsd != NULL) + zio->io_vsd_ops->vsd_cksum_report(zio, report, arg); + else + zio_vsd_default_cksum_report(zio, report, arg); + + /* copy the checksum failure information if it was provided */ + if (info != NULL) { + report->zcr_ckinfo = kmem_zalloc(sizeof (*info), KM_SLEEP); + bcopy(info, report->zcr_ckinfo, sizeof (*info)); + } + + report->zcr_align = 1ULL << vd->vdev_top->vdev_ashift; + report->zcr_length = length; + +#ifdef _KERNEL + zfs_ereport_start(&report->zcr_ereport, &report->zcr_detector, + FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio, offset, length); + + if (report->zcr_ereport == NULL) { + report->zcr_free(report->zcr_cbdata, report->zcr_cbinfo); + if (report->zcr_ckinfo != NULL) { + kmem_free(report->zcr_ckinfo, + sizeof (*report->zcr_ckinfo)); + } + kmem_free(report, sizeof (*report)); + return; + } +#endif + + mutex_enter(&spa->spa_errlist_lock); + report->zcr_next = zio->io_logical->io_cksum_report; + zio->io_logical->io_cksum_report = report; + mutex_exit(&spa->spa_errlist_lock); +} + +void +zfs_ereport_finish_checksum(zio_cksum_report_t *report, + const void *good_data, const void *bad_data, boolean_t drop_if_identical) +{ +#ifdef _KERNEL + zfs_ecksum_info_t *info = NULL; + info = annotate_ecksum(report->zcr_ereport, report->zcr_ckinfo, + good_data, bad_data, report->zcr_length, drop_if_identical); + + if (info != NULL) + fm_ereport_post(report->zcr_ereport, EVCH_SLEEP); + + fm_nvlist_destroy(report->zcr_ereport, FM_NVA_FREE); + fm_nvlist_destroy(report->zcr_detector, FM_NVA_FREE); + report->zcr_ereport = report->zcr_detector = NULL; + + if (info != NULL) + kmem_free(info, sizeof (*info)); +#endif +} + +void +zfs_ereport_free_checksum(zio_cksum_report_t *rpt) +{ +#ifdef _KERNEL + if (rpt->zcr_ereport != NULL) { + fm_nvlist_destroy(rpt->zcr_ereport, + FM_NVA_FREE); + fm_nvlist_destroy(rpt->zcr_detector, + FM_NVA_FREE); + } +#endif + rpt->zcr_free(rpt->zcr_cbdata, rpt->zcr_cbinfo); + + if (rpt->zcr_ckinfo != NULL) + kmem_free(rpt->zcr_ckinfo, sizeof (*rpt->zcr_ckinfo)); + + kmem_free(rpt, sizeof (*rpt)); +} + +void +zfs_ereport_send_interim_checksum(zio_cksum_report_t *report) +{ +#ifdef _KERNEL + fm_ereport_post(report->zcr_ereport, EVCH_SLEEP); +#endif +} + +void +zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, + struct zio *zio, uint64_t offset, uint64_t length, + const void *good_data, const void *bad_data, zio_bad_cksum_t *zbc) +{ +#ifdef _KERNEL + nvlist_t *ereport = NULL; + nvlist_t *detector = NULL; + zfs_ecksum_info_t *info; + + zfs_ereport_start(&ereport, 
&detector, + FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio, offset, length); + + if (ereport == NULL) + return; + + info = annotate_ecksum(ereport, zbc, good_data, bad_data, length, + B_FALSE); + + if (info != NULL) + fm_ereport_post(ereport, EVCH_SLEEP); + + fm_nvlist_destroy(ereport, FM_NVA_FREE); + fm_nvlist_destroy(detector, FM_NVA_FREE); + + if (info != NULL) + kmem_free(info, sizeof (*info)); +#endif +} + +static void +zfs_post_common(spa_t *spa, vdev_t *vd, const char *name) +{ +#ifdef _KERNEL + nvlist_t *resource; + char class[64]; + + if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT) + return; + + if ((resource = fm_nvlist_create(NULL)) == NULL) + return; + + (void) snprintf(class, sizeof (class), "%s.%s.%s", FM_RSRC_RESOURCE, + ZFS_ERROR_CLASS, name); + VERIFY(nvlist_add_uint8(resource, FM_VERSION, FM_RSRC_VERSION) == 0); + VERIFY(nvlist_add_string(resource, FM_CLASS, class) == 0); + VERIFY(nvlist_add_uint64(resource, + FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, spa_guid(spa)) == 0); + if (vd) + VERIFY(nvlist_add_uint64(resource, + FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, vd->vdev_guid) == 0); + + fm_ereport_post(resource, EVCH_SLEEP); + + fm_nvlist_destroy(resource, FM_NVA_FREE); +#endif +} + +/* + * The 'resource.fs.zfs.removed' event is an internal signal that the given vdev + * has been removed from the system. This will cause the DE to ignore any + * recent I/O errors, inferring that they are due to the asynchronous device + * removal. + */ +void +zfs_post_remove(spa_t *spa, vdev_t *vd) +{ + zfs_post_common(spa, vd, FM_RESOURCE_REMOVED); +} + +/* + * The 'resource.fs.zfs.autoreplace' event is an internal signal that the pool + * has the 'autoreplace' property set, and therefore any broken vdevs will be + * handled by higher level logic, and no vdev fault should be generated. + */ +void +zfs_post_autoreplace(spa_t *spa, vdev_t *vd) +{ + zfs_post_common(spa, vd, FM_RESOURCE_AUTOREPLACE); +} + +/* + * The 'resource.fs.zfs.statechange' event is an internal signal that the + * given vdev has transitioned its state to DEGRADED or HEALTHY. This will + * cause the retire agent to repair any outstanding fault management cases + * open because the device was not found (fault.fs.zfs.device). + */ +void +zfs_post_state_change(spa_t *spa, vdev_t *vd) +{ + zfs_post_common(spa, vd, FM_RESOURCE_STATECHANGE); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c new file mode 100644 index 000000000000..581b6b1bfb64 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c @@ -0,0 +1,762 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. 
+ */ + +#include <sys/zfs_context.h> +#include <sys/dmu.h> +#include <sys/avl.h> +#include <sys/zap.h> +#include <sys/refcount.h> +#include <sys/nvpair.h> +#ifdef _KERNEL +#include <sys/kidmap.h> +#include <sys/sid.h> +#include <sys/zfs_vfsops.h> +#include <sys/zfs_znode.h> +#endif +#include <sys/zfs_fuid.h> + +/* + * FUID Domain table(s). + * + * The FUID table is stored as a packed nvlist of an array + * of nvlists which contain an index, domain string and offset + * + * During file system initialization the nvlist(s) are read and + * two AVL trees are created. One tree is keyed by the index number + * and the other by the domain string. Nodes are never removed from + * trees, but new entries may be added. If a new entry is added then + * the zfsvfs->z_fuid_dirty flag is set to true and the caller will then + * be responsible for calling zfs_fuid_sync() to sync the changes to disk. + * + */ + +#define FUID_IDX "fuid_idx" +#define FUID_DOMAIN "fuid_domain" +#define FUID_OFFSET "fuid_offset" +#define FUID_NVP_ARRAY "fuid_nvlist" + +typedef struct fuid_domain { + avl_node_t f_domnode; + avl_node_t f_idxnode; + ksiddomain_t *f_ksid; + uint64_t f_idx; +} fuid_domain_t; + +static char *nulldomain = ""; + +/* + * Compare two indexes. + */ +static int +idx_compare(const void *arg1, const void *arg2) +{ + const fuid_domain_t *node1 = (const fuid_domain_t *)arg1; + const fuid_domain_t *node2 = (const fuid_domain_t *)arg2; + + return (AVL_CMP(node1->f_idx, node2->f_idx)); +} + +/* + * Compare two domain strings. + */ +static int +domain_compare(const void *arg1, const void *arg2) +{ + const fuid_domain_t *node1 = (const fuid_domain_t *)arg1; + const fuid_domain_t *node2 = (const fuid_domain_t *)arg2; + int val; + + val = strcmp(node1->f_ksid->kd_name, node2->f_ksid->kd_name); + + return (AVL_ISIGN(val)); +} + +void +zfs_fuid_avl_tree_create(avl_tree_t *idx_tree, avl_tree_t *domain_tree) +{ + avl_create(idx_tree, idx_compare, + sizeof (fuid_domain_t), offsetof(fuid_domain_t, f_idxnode)); + avl_create(domain_tree, domain_compare, + sizeof (fuid_domain_t), offsetof(fuid_domain_t, f_domnode)); +} + +/* + * load initial fuid domain and idx trees. This function is used by + * both the kernel and zdb. 
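Given the layout described above, each table entry is a small nvlist keyed by FUID_IDX, FUID_DOMAIN, and FUID_OFFSET, and the entries are collected under FUID_NVP_ARRAY before being packed. A userland-style sketch of building one entry with libnvpair ('idx' and 'domain' stand in for the entry's values; error handling elided; the kernel paths below do the same with KM_SLEEP):

    nvlist_t *entry;

    VERIFY(nvlist_alloc(&entry, NV_UNIQUE_NAME, 0) == 0);
    VERIFY(nvlist_add_uint64(entry, FUID_IDX, idx) == 0);
    VERIFY(nvlist_add_uint64(entry, FUID_OFFSET, 0) == 0);
    VERIFY(nvlist_add_string(entry, FUID_DOMAIN, domain) == 0);
    /* entries are then gathered into FUID_NVP_ARRAY and nvlist_pack()ed */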
+ */ +uint64_t +zfs_fuid_table_load(objset_t *os, uint64_t fuid_obj, avl_tree_t *idx_tree, + avl_tree_t *domain_tree) +{ + dmu_buf_t *db; + uint64_t fuid_size; + + ASSERT(fuid_obj != 0); + VERIFY(0 == dmu_bonus_hold(os, fuid_obj, + FTAG, &db)); + fuid_size = *(uint64_t *)db->db_data; + dmu_buf_rele(db, FTAG); + + if (fuid_size) { + nvlist_t **fuidnvp; + nvlist_t *nvp = NULL; + uint_t count; + char *packed; + int i; + + packed = kmem_alloc(fuid_size, KM_SLEEP); + VERIFY(dmu_read(os, fuid_obj, 0, + fuid_size, packed, DMU_READ_PREFETCH) == 0); + VERIFY(nvlist_unpack(packed, fuid_size, + &nvp, 0) == 0); + VERIFY(nvlist_lookup_nvlist_array(nvp, FUID_NVP_ARRAY, + &fuidnvp, &count) == 0); + + for (i = 0; i != count; i++) { + fuid_domain_t *domnode; + char *domain; + uint64_t idx; + + VERIFY(nvlist_lookup_string(fuidnvp[i], FUID_DOMAIN, + &domain) == 0); + VERIFY(nvlist_lookup_uint64(fuidnvp[i], FUID_IDX, + &idx) == 0); + + domnode = kmem_alloc(sizeof (fuid_domain_t), KM_SLEEP); + + domnode->f_idx = idx; + domnode->f_ksid = ksid_lookupdomain(domain); + avl_add(idx_tree, domnode); + avl_add(domain_tree, domnode); + } + nvlist_free(nvp); + kmem_free(packed, fuid_size); + } + return (fuid_size); +} + +void +zfs_fuid_table_destroy(avl_tree_t *idx_tree, avl_tree_t *domain_tree) +{ + fuid_domain_t *domnode; + void *cookie; + + cookie = NULL; + while (domnode = avl_destroy_nodes(domain_tree, &cookie)) + ksiddomain_rele(domnode->f_ksid); + + avl_destroy(domain_tree); + cookie = NULL; + while (domnode = avl_destroy_nodes(idx_tree, &cookie)) + kmem_free(domnode, sizeof (fuid_domain_t)); + avl_destroy(idx_tree); +} + +char * +zfs_fuid_idx_domain(avl_tree_t *idx_tree, uint32_t idx) +{ + fuid_domain_t searchnode, *findnode; + avl_index_t loc; + + searchnode.f_idx = idx; + + findnode = avl_find(idx_tree, &searchnode, &loc); + + return (findnode ? findnode->f_ksid->kd_name : nulldomain); +} + +#ifdef _KERNEL +/* + * Load the fuid table(s) into memory. + */ +static void +zfs_fuid_init(zfsvfs_t *zfsvfs) +{ + rw_enter(&zfsvfs->z_fuid_lock, RW_WRITER); + + if (zfsvfs->z_fuid_loaded) { + rw_exit(&zfsvfs->z_fuid_lock); + return; + } + + zfs_fuid_avl_tree_create(&zfsvfs->z_fuid_idx, &zfsvfs->z_fuid_domain); + + (void) zap_lookup(zfsvfs->z_os, MASTER_NODE_OBJ, + ZFS_FUID_TABLES, 8, 1, &zfsvfs->z_fuid_obj); + if (zfsvfs->z_fuid_obj != 0) { + zfsvfs->z_fuid_size = zfs_fuid_table_load(zfsvfs->z_os, + zfsvfs->z_fuid_obj, &zfsvfs->z_fuid_idx, + &zfsvfs->z_fuid_domain); + } + + zfsvfs->z_fuid_loaded = B_TRUE; + rw_exit(&zfsvfs->z_fuid_lock); +} + +/* + * sync out AVL trees to persistent storage. + */ +void +zfs_fuid_sync(zfsvfs_t *zfsvfs, dmu_tx_t *tx) +{ + nvlist_t *nvp; + nvlist_t **fuids; + size_t nvsize = 0; + char *packed; + dmu_buf_t *db; + fuid_domain_t *domnode; + int numnodes; + int i; + + if (!zfsvfs->z_fuid_dirty) { + return; + } + + rw_enter(&zfsvfs->z_fuid_lock, RW_WRITER); + + /* + * First see if table needs to be created? 
+ */ + if (zfsvfs->z_fuid_obj == 0) { + zfsvfs->z_fuid_obj = dmu_object_alloc(zfsvfs->z_os, + DMU_OT_FUID, 1 << 14, DMU_OT_FUID_SIZE, + sizeof (uint64_t), tx); + VERIFY(zap_add(zfsvfs->z_os, MASTER_NODE_OBJ, + ZFS_FUID_TABLES, sizeof (uint64_t), 1, + &zfsvfs->z_fuid_obj, tx) == 0); + } + + VERIFY(nvlist_alloc(&nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + numnodes = avl_numnodes(&zfsvfs->z_fuid_idx); + fuids = kmem_alloc(numnodes * sizeof (void *), KM_SLEEP); + for (i = 0, domnode = avl_first(&zfsvfs->z_fuid_domain); domnode; i++, + domnode = AVL_NEXT(&zfsvfs->z_fuid_domain, domnode)) { + VERIFY(nvlist_alloc(&fuids[i], NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_uint64(fuids[i], FUID_IDX, + domnode->f_idx) == 0); + VERIFY(nvlist_add_uint64(fuids[i], FUID_OFFSET, 0) == 0); + VERIFY(nvlist_add_string(fuids[i], FUID_DOMAIN, + domnode->f_ksid->kd_name) == 0); + } + VERIFY(nvlist_add_nvlist_array(nvp, FUID_NVP_ARRAY, + fuids, numnodes) == 0); + for (i = 0; i != numnodes; i++) + nvlist_free(fuids[i]); + kmem_free(fuids, numnodes * sizeof (void *)); + VERIFY(nvlist_size(nvp, &nvsize, NV_ENCODE_XDR) == 0); + packed = kmem_alloc(nvsize, KM_SLEEP); + VERIFY(nvlist_pack(nvp, &packed, &nvsize, + NV_ENCODE_XDR, KM_SLEEP) == 0); + nvlist_free(nvp); + zfsvfs->z_fuid_size = nvsize; + dmu_write(zfsvfs->z_os, zfsvfs->z_fuid_obj, 0, + zfsvfs->z_fuid_size, packed, tx); + kmem_free(packed, zfsvfs->z_fuid_size); + VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, zfsvfs->z_fuid_obj, + FTAG, &db)); + dmu_buf_will_dirty(db, tx); + *(uint64_t *)db->db_data = zfsvfs->z_fuid_size; + dmu_buf_rele(db, FTAG); + + zfsvfs->z_fuid_dirty = B_FALSE; + rw_exit(&zfsvfs->z_fuid_lock); +} + +/* + * Query the domain table for a given domain. + * + * If the domain isn't found and addok is set, it is added to the AVL trees + * and the zfsvfs->z_fuid_dirty flag will be set to TRUE. It will then be + * necessary for the caller or another thread to detect the dirty table + * and sync out the changes. + */ +int +zfs_fuid_find_by_domain(zfsvfs_t *zfsvfs, const char *domain, + char **retdomain, boolean_t addok) +{ + fuid_domain_t searchnode, *findnode; + avl_index_t loc; + krw_t rw = RW_READER; + + /* + * If this is the dummy "nobody" domain, return an index of 0 + * to cause the created FUID to be a standard POSIX id + * for the user nobody. + */ + if (domain[0] == '\0') { + if (retdomain) + *retdomain = nulldomain; + return (0); + } + + searchnode.f_ksid = ksid_lookupdomain(domain); + if (retdomain) + *retdomain = searchnode.f_ksid->kd_name; + if (!zfsvfs->z_fuid_loaded) + zfs_fuid_init(zfsvfs); + +retry: + rw_enter(&zfsvfs->z_fuid_lock, rw); + findnode = avl_find(&zfsvfs->z_fuid_domain, &searchnode, &loc); + + if (findnode) { + rw_exit(&zfsvfs->z_fuid_lock); + ksiddomain_rele(searchnode.f_ksid); + return (findnode->f_idx); + } else if (addok) { + fuid_domain_t *domnode; + uint64_t retidx; + + if (rw == RW_READER && !rw_tryupgrade(&zfsvfs->z_fuid_lock)) { + rw_exit(&zfsvfs->z_fuid_lock); + rw = RW_WRITER; + goto retry; + } + + domnode = kmem_alloc(sizeof (fuid_domain_t), KM_SLEEP); + domnode->f_ksid = searchnode.f_ksid; + + retidx = domnode->f_idx = avl_numnodes(&zfsvfs->z_fuid_idx) + 1; + + avl_add(&zfsvfs->z_fuid_domain, domnode); + avl_add(&zfsvfs->z_fuid_idx, domnode); + zfsvfs->z_fuid_dirty = B_TRUE; + rw_exit(&zfsvfs->z_fuid_lock); + return (retidx); + } else { + rw_exit(&zfsvfs->z_fuid_lock); + return (-1); + } +} + +/* + * Query the domain table by index, returning the domain string. + * + * Returns a pointer to the domain string held in an AVL node. + */
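zfs_fuid_find_by_domain() above takes the lock as a reader and upgrades only when it must insert; if rw_tryupgrade() fails, it restarts the lookup as a writer. The idiom in isolation (need_to_modify() is a hypothetical predicate):

    extern boolean_t need_to_modify(void);

    static void
    lookup_or_insert(krwlock_t *lock)
    {
        krw_t rw = RW_READER;
    retry:
        rw_enter(lock, rw);
        if (need_to_modify() && rw == RW_READER && !rw_tryupgrade(lock)) {
            rw_exit(lock);
            rw = RW_WRITER;     /* restart with the write lock */
            goto retry;
        }
        /* read, or modify now that the write lock is held */
        rw_exit(lock);
    }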
+const char * +zfs_fuid_find_by_idx(zfsvfs_t *zfsvfs, uint32_t idx) +{ + char *domain; + + if (idx == 0 || !zfsvfs->z_use_fuids) + return (NULL); + + if (!zfsvfs->z_fuid_loaded) + zfs_fuid_init(zfsvfs); + + rw_enter(&zfsvfs->z_fuid_lock, RW_READER); + + if (zfsvfs->z_fuid_obj || zfsvfs->z_fuid_dirty) + domain = zfs_fuid_idx_domain(&zfsvfs->z_fuid_idx, idx); + else + domain = nulldomain; + rw_exit(&zfsvfs->z_fuid_lock); + + ASSERT(domain); + return (domain); +} + +void +zfs_fuid_map_ids(znode_t *zp, cred_t *cr, uid_t *uidp, uid_t *gidp) +{ + *uidp = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER); + *gidp = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_gid, cr, ZFS_GROUP); +} + +uid_t +zfs_fuid_map_id(zfsvfs_t *zfsvfs, uint64_t fuid, + cred_t *cr, zfs_fuid_type_t type) +{ + uint32_t index = FUID_INDEX(fuid); + const char *domain; + uid_t id; + + if (index == 0) + return (fuid); + + domain = zfs_fuid_find_by_idx(zfsvfs, index); + ASSERT(domain != NULL); + +#ifdef illumos + if (type == ZFS_OWNER || type == ZFS_ACE_USER) { + (void) kidmap_getuidbysid(crgetzone(cr), domain, + FUID_RID(fuid), &id); + } else { + (void) kidmap_getgidbysid(crgetzone(cr), domain, + FUID_RID(fuid), &id); + } +#else + id = UID_NOBODY; +#endif + return (id); +} + +/* + * Add a FUID node to the list of FUIDs being created for this + * ACL. + * + * If the ACL has multiple domains, keep only one copy of each unique + * domain. + */ +void +zfs_fuid_node_add(zfs_fuid_info_t **fuidpp, const char *domain, uint32_t rid, + uint64_t idx, uint64_t id, zfs_fuid_type_t type) +{ + zfs_fuid_t *fuid; + zfs_fuid_domain_t *fuid_domain; + zfs_fuid_info_t *fuidp; + uint64_t fuididx; + boolean_t found = B_FALSE; + + if (*fuidpp == NULL) + *fuidpp = zfs_fuid_info_alloc(); + + fuidp = *fuidpp; + /* + * First, find the fuid domain index in the linked list. + * + * If one isn't found, create an entry. + */ + + for (fuididx = 1, fuid_domain = list_head(&fuidp->z_domains); + fuid_domain; fuid_domain = list_next(&fuidp->z_domains, + fuid_domain), fuididx++) { + if (idx == fuid_domain->z_domidx) { + found = B_TRUE; + break; + } + } + + if (!found) { + fuid_domain = kmem_alloc(sizeof (zfs_fuid_domain_t), KM_SLEEP); + fuid_domain->z_domain = domain; + fuid_domain->z_domidx = idx; + list_insert_tail(&fuidp->z_domains, fuid_domain); + fuidp->z_domain_str_sz += strlen(domain) + 1; + fuidp->z_domain_cnt++; + } + + if (type == ZFS_ACE_USER || type == ZFS_ACE_GROUP) { + + /* + * Now allocate a fuid entry and add it to the end of the list. + */ + + fuid = kmem_alloc(sizeof (zfs_fuid_t), KM_SLEEP); + fuid->z_id = id; + fuid->z_domidx = idx; + fuid->z_logfuid = FUID_ENCODE(fuididx, rid); + + list_insert_tail(&fuidp->z_fuids, fuid); + fuidp->z_fuid_cnt++; + } else { + if (type == ZFS_OWNER) + fuidp->z_fuid_owner = FUID_ENCODE(fuididx, rid); + else + fuidp->z_fuid_group = FUID_ENCODE(fuididx, rid); + } +} + +/* + * Create a file system FUID, based on information in the user's cred. + * + * If cred contains KSID_OWNER, it should be used to determine + * the uid; otherwise cred's uid will be used. By default cred's gid + * is used unless it's an ephemeral ID, in which case KSID_GROUP will + * be used if it exists. + */ +uint64_t +zfs_fuid_create_cred(zfsvfs_t *zfsvfs, zfs_fuid_type_t type, + cred_t *cr, zfs_fuid_info_t **fuidp) +{ + uint64_t idx; + ksid_t *ksid; + uint32_t rid; + char *kdomain; + const char *domain; + uid_t id; + + VERIFY(type == ZFS_OWNER || type == ZFS_GROUP); + + ksid = crgetsid(cr, (type == ZFS_OWNER) ?
KSID_OWNER : KSID_GROUP); + + if (!zfsvfs->z_use_fuids || (ksid == NULL)) { + id = (type == ZFS_OWNER) ? crgetuid(cr) : crgetgid(cr); + + if (IS_EPHEMERAL(id)) + return ((type == ZFS_OWNER) ? UID_NOBODY : GID_NOBODY); + + return ((uint64_t)id); + } + + /* + * ksid is present and FUID is supported + */ + id = (type == ZFS_OWNER) ? ksid_getid(ksid) : crgetgid(cr); + + if (!IS_EPHEMERAL(id)) + return ((uint64_t)id); + + if (type == ZFS_GROUP) + id = ksid_getid(ksid); + + rid = ksid_getrid(ksid); + domain = ksid_getdomain(ksid); + + idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, B_TRUE); + + zfs_fuid_node_add(fuidp, kdomain, rid, idx, id, type); + + return (FUID_ENCODE(idx, rid)); +} + +/* + * Create a file system FUID for an ACL ace + * or a chown/chgrp of the file. + * This is similar to zfs_fuid_create_cred, except that + * we can't find the domain + rid information in the + * cred. Instead we have to query Winchester for the + * domain and rid. + * + * During replay operations the domain+rid information is + * found in the zfs_fuid_info_t that the replay code has + * attached to the zfsvfs of the file system. + */ +uint64_t +zfs_fuid_create(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr, + zfs_fuid_type_t type, zfs_fuid_info_t **fuidpp) +{ + const char *domain; + char *kdomain; + uint32_t fuid_idx = FUID_INDEX(id); + uint32_t rid; + idmap_stat status; + uint64_t idx = 0; + zfs_fuid_t *zfuid = NULL; + zfs_fuid_info_t *fuidp = NULL; + + /* + * If POSIX ID, or entry is already a FUID then + * just return the id + * + * We may also be handed an already FUID'ized id via + * chmod. + */ + + if (!zfsvfs->z_use_fuids || !IS_EPHEMERAL(id) || fuid_idx != 0) + return (id); + + if (zfsvfs->z_replay) { + fuidp = zfsvfs->z_fuid_replay; + + /* + * If we are passed an ephemeral id, but no + * fuid_info was logged then return NOBODY. + * This is most likely a result of idmap service + * not being available. + */ + if (fuidp == NULL) + return (UID_NOBODY); + + VERIFY3U(type, >=, ZFS_OWNER); + VERIFY3U(type, <=, ZFS_ACE_GROUP); + + switch (type) { + case ZFS_ACE_USER: + case ZFS_ACE_GROUP: + zfuid = list_head(&fuidp->z_fuids); + rid = FUID_RID(zfuid->z_logfuid); + idx = FUID_INDEX(zfuid->z_logfuid); + break; + case ZFS_OWNER: + rid = FUID_RID(fuidp->z_fuid_owner); + idx = FUID_INDEX(fuidp->z_fuid_owner); + break; + case ZFS_GROUP: + rid = FUID_RID(fuidp->z_fuid_group); + idx = FUID_INDEX(fuidp->z_fuid_group); + break; + }; + domain = fuidp->z_domain_table[idx - 1]; + } else { + if (type == ZFS_OWNER || type == ZFS_ACE_USER) + status = kidmap_getsidbyuid(crgetzone(cr), id, + &domain, &rid); + else + status = kidmap_getsidbygid(crgetzone(cr), id, + &domain, &rid); + + if (status != 0) { + /* + * When returning nobody we will need to + * make a dummy fuid table entry for logging + * purposes. 
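The FUID_ENCODE()/FUID_INDEX()/FUID_RID() macros used throughout split the 64-bit FUID into a domain-table index in the upper 32 bits and a RID in the lower 32; an index of 0 means a plain POSIX id, which is why zfs_fuid_map_id() returns the fuid unchanged in that case. A worked illustration with plain shifts (a sketch of the layout, not the kernel macros themselves):

    #include <stdint.h>

    static uint64_t
    fuid_encode(uint32_t idx, uint32_t rid)
    {
        return (((uint64_t)idx << 32) | rid);
    }

    static uint32_t fuid_index(uint64_t fuid) { return ((uint32_t)(fuid >> 32)); }
    static uint32_t fuid_rid(uint64_t fuid) { return ((uint32_t)(fuid & 0xffffffffu)); }

    /* fuid_encode(2, 1234) -> index 2, rid 1234; index 0 -> ordinary POSIX id */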
+ */ + rid = UID_NOBODY; + domain = nulldomain; + } + } + + idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, B_TRUE); + + if (!zfsvfs->z_replay) + zfs_fuid_node_add(fuidpp, kdomain, + rid, idx, id, type); + else if (zfuid != NULL) { + list_remove(&fuidp->z_fuids, zfuid); + kmem_free(zfuid, sizeof (zfs_fuid_t)); + } + return (FUID_ENCODE(idx, rid)); +} + +void +zfs_fuid_destroy(zfsvfs_t *zfsvfs) +{ + rw_enter(&zfsvfs->z_fuid_lock, RW_WRITER); + if (!zfsvfs->z_fuid_loaded) { + rw_exit(&zfsvfs->z_fuid_lock); + return; + } + zfs_fuid_table_destroy(&zfsvfs->z_fuid_idx, &zfsvfs->z_fuid_domain); + rw_exit(&zfsvfs->z_fuid_lock); +} + +/* + * Allocate zfs_fuid_info for tracking FUIDs created during + * zfs_mknode, VOP_SETATTR() or VOP_SETSECATTR() + */ +zfs_fuid_info_t * +zfs_fuid_info_alloc(void) +{ + zfs_fuid_info_t *fuidp; + + fuidp = kmem_zalloc(sizeof (zfs_fuid_info_t), KM_SLEEP); + list_create(&fuidp->z_domains, sizeof (zfs_fuid_domain_t), + offsetof(zfs_fuid_domain_t, z_next)); + list_create(&fuidp->z_fuids, sizeof (zfs_fuid_t), + offsetof(zfs_fuid_t, z_next)); + return (fuidp); +} + +/* + * Release all memory associated with zfs_fuid_info_t + */ +void +zfs_fuid_info_free(zfs_fuid_info_t *fuidp) +{ + zfs_fuid_t *zfuid; + zfs_fuid_domain_t *zdomain; + + while ((zfuid = list_head(&fuidp->z_fuids)) != NULL) { + list_remove(&fuidp->z_fuids, zfuid); + kmem_free(zfuid, sizeof (zfs_fuid_t)); + } + + if (fuidp->z_domain_table != NULL) + kmem_free(fuidp->z_domain_table, + (sizeof (char **)) * fuidp->z_domain_cnt); + + while ((zdomain = list_head(&fuidp->z_domains)) != NULL) { + list_remove(&fuidp->z_domains, zdomain); + kmem_free(zdomain, sizeof (zfs_fuid_domain_t)); + } + + kmem_free(fuidp, sizeof (zfs_fuid_info_t)); +} + +/* + * Check to see if id is a groupmember. If cred + * has ksid info then sidlist is checked first + * and if still not found then POSIX groups are checked + * + * Will use a straight FUID compare when possible. 
+ */ +boolean_t +zfs_groupmember(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr) +{ +#ifdef illumos + ksid_t *ksid = crgetsid(cr, KSID_GROUP); + ksidlist_t *ksidlist = crgetsidlist(cr); +#endif + uid_t gid; + +#ifdef illumos + if (ksid && ksidlist) { + int i; + ksid_t *ksid_groups; + uint32_t idx = FUID_INDEX(id); + uint32_t rid = FUID_RID(id); + + ksid_groups = ksidlist->ksl_sids; + + for (i = 0; i != ksidlist->ksl_nsid; i++) { + if (idx == 0) { + if (id != IDMAP_WK_CREATOR_GROUP_GID && + id == ksid_groups[i].ks_id) { + return (B_TRUE); + } + } else { + const char *domain; + + domain = zfs_fuid_find_by_idx(zfsvfs, idx); + ASSERT(domain != NULL); + + if (strcmp(domain, + IDMAP_WK_CREATOR_SID_AUTHORITY) == 0) + return (B_FALSE); + + if ((strcmp(domain, + ksid_groups[i].ks_domain->kd_name) == 0) && + rid == ksid_groups[i].ks_rid) + return (B_TRUE); + } + } + } +#endif /* illumos */ + + /* + * Not found in ksidlist, check posix groups + */ + gid = zfs_fuid_map_id(zfsvfs, id, cr, ZFS_GROUP); + return (groupmember(gid, cr)); +} + +void +zfs_fuid_txhold(zfsvfs_t *zfsvfs, dmu_tx_t *tx) +{ + if (zfsvfs->z_fuid_obj == 0) { + dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, + FUID_SIZE_ESTIMATE(zfsvfs)); + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL); + } else { + dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); + dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, + FUID_SIZE_ESTIMATE(zfsvfs)); + } +} +#endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c new file mode 100644 index 000000000000..c9adb54380e0 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c @@ -0,0 +1,7174 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011-2012 Pawel Jakub Dawidek. All rights reserved. + * Copyright 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved. + * Copyright 2014 Xin Li <delphij@FreeBSD.org>. All rights reserved. + * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved. + * Copyright 2015 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2014, 2016 Joyent, Inc. All rights reserved. + * Copyright (c) 2011, 2017 by Delphix. All rights reserved. + * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. + * Copyright (c) 2013 Steven Hartland. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + * Copyright 2016 Toomas Soome <tsoome@me.com> + * Copyright 2017 RackTop Systems. + * Copyright (c) 2017 Datto Inc. + */ + +/* + * ZFS ioctls. 
+ * + * This file handles the ioctls to /dev/zfs, used for configuring ZFS storage + * pools and filesystems, e.g. with /sbin/zfs and /sbin/zpool. + * + * There are two ways that we handle ioctls: the legacy way where almost + * all of the logic is in the ioctl callback, and the new way where most + * of the marshalling is handled in the common entry point, zfsdev_ioctl(). + * + * Non-legacy ioctls should be registered by calling + * zfs_ioctl_register() from zfs_ioctl_init(). The ioctl is invoked + * from userland by lzc_ioctl(). + * + * The registration arguments are as follows: + * + * const char *name + * The name of the ioctl. This is used for history logging. If the + * ioctl returns successfully (the callback returns 0), and allow_log + * is true, then a history log entry will be recorded with the input & + * output nvlists. The log entry can be printed with "zpool history -i". + * + * zfs_ioc_t ioc + * The ioctl request number, which userland will pass to ioctl(2). + * The ioctl numbers can change from release to release, because + * the caller (libzfs) must be matched to the kernel. + * + * zfs_secpolicy_func_t *secpolicy + * This function will be called before the zfs_ioc_func_t, to + * determine if this operation is permitted. It should return EPERM + * on failure, and 0 on success. Checks include determining if the + * dataset is visible in this zone, and if the user has either all + * zfs privileges in the zone (SYS_MOUNT), or has been granted permission + * to do this operation on this dataset with "zfs allow". + * + * zfs_ioc_namecheck_t namecheck + * This specifies what to expect in the zfs_cmd_t:zc_name -- a pool + * name, a dataset name, or nothing. If the name is not well-formed, + * the ioctl will fail and the callback will not be called. + * Therefore, the callback can assume that the name is well-formed + * (e.g. is null-terminated, doesn't have more than one '@' character, + * doesn't have invalid characters). + * + * zfs_ioc_poolcheck_t pool_check + * This specifies requirements on the pool state. If the pool does + * not meet them (is suspended or is readonly), the ioctl will fail + * and the callback will not be called. If any checks are specified + * (i.e. it is not POOL_CHECK_NONE), namecheck must not be NO_NAME. + * Multiple checks can be or-ed together (e.g. POOL_CHECK_SUSPENDED | + * POOL_CHECK_READONLY). + * + * boolean_t smush_outnvlist + * If smush_outnvlist is true, then the output is presumed to be a + * list of errors, and it will be "smushed" down to fit into the + * caller's buffer, by removing some entries and replacing them with a + * single "N_MORE_ERRORS" entry indicating how many were removed. See + * nvlist_smush() for details. If smush_outnvlist is false, and the + * outnvlist does not fit into the userland-provided buffer, then the + * ioctl will fail with ENOMEM. + * + * zfs_ioc_func_t *func + * The callback function that will perform the operation. + * + * The callback should return 0 on success, or an error number on + * failure. If the function fails, the userland ioctl will return -1, + * and errno will be set to the callback's return value. The callback + * will be called with the following arguments: + * + * const char *name + * The name of the pool or dataset to operate on, from + * zfs_cmd_t:zc_name. The 'namecheck' argument specifies the + * expected type (pool, dataset, or none). + * + * nvlist_t *innvl + * The input nvlist, deserialized from zfs_cmd_t:zc_nvlist_src. Or + * NULL if no input nvlist was provided. 
Changes to this nvlist are + * ignored. If the input nvlist could not be deserialized, the + * ioctl will fail and the callback will not be called. + * + * nvlist_t *outnvl + * The output nvlist, initially empty. The callback can fill it in, + * and it will be returned to userland by serializing it into + * zfs_cmd_t:zc_nvlist_dst. If it is non-empty, and serialization + * fails (e.g. because the caller didn't supply a large enough + * buffer), then the overall ioctl will fail. See the + * 'smush_nvlist' argument above for additional behaviors. + * + * There are two typical uses of the output nvlist: + * - To return state, e.g. property values. In this case, + * smush_outnvlist should be false. If the buffer was not large + * enough, the caller will reallocate a larger buffer and try + * the ioctl again. + * + * - To return multiple errors from an ioctl which makes on-disk + * changes. In this case, smush_outnvlist should be true. + * Ioctls which make on-disk modifications should generally not + * use the outnvl if they succeed, because the caller can not + * distinguish between the operation failing, and + * deserialization failing. + */ +#ifdef __FreeBSD__ +#include "opt_kstack_pages.h" +#endif + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/conf.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/errno.h> +#include <sys/uio.h> +#include <sys/buf.h> +#include <sys/file.h> +#include <sys/kmem.h> +#include <sys/conf.h> +#include <sys/cmn_err.h> +#include <sys/stat.h> +#include <sys/zfs_ioctl.h> +#include <sys/zfs_vfsops.h> +#include <sys/zfs_znode.h> +#include <sys/zap.h> +#include <sys/spa.h> +#include <sys/spa_impl.h> +#include <sys/vdev.h> +#include <sys/dmu.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_prop.h> +#include <sys/dsl_deleg.h> +#include <sys/dmu_objset.h> +#include <sys/dmu_impl.h> +#include <sys/dmu_tx.h> +#include <sys/sunddi.h> +#include <sys/policy.h> +#include <sys/zone.h> +#include <sys/nvpair.h> +#include <sys/mount.h> +#include <sys/taskqueue.h> +#include <sys/sdt.h> +#include <sys/varargs.h> +#include <sys/fs/zfs.h> +#include <sys/zfs_ctldir.h> +#include <sys/zfs_dir.h> +#include <sys/zfs_onexit.h> +#include <sys/zvol.h> +#include <sys/dsl_scan.h> +#include <sys/dmu_objset.h> +#include <sys/dmu_send.h> +#include <sys/dsl_destroy.h> +#include <sys/dsl_bookmark.h> +#include <sys/dsl_userhold.h> +#include <sys/zfeature.h> +#include <sys/zcp.h> +#include <sys/zio_checksum.h> +#include <sys/vdev_removal.h> +#include <sys/vdev_impl.h> +#include <sys/vdev_initialize.h> + +#include "zfs_namecheck.h" +#include "zfs_prop.h" +#include "zfs_deleg.h" +#include "zfs_comutil.h" +#include "zfs_ioctl_compat.h" + +#include "lua.h" +#include "lauxlib.h" + +static struct cdev *zfsdev; + +extern void zfs_init(void); +extern void zfs_fini(void); + +uint_t zfs_fsyncer_key; +extern uint_t rrw_tsd_key; +static uint_t zfs_allow_log_key; +extern uint_t zfs_geom_probe_vdev_key; + +typedef int zfs_ioc_legacy_func_t(zfs_cmd_t *); +typedef int zfs_ioc_func_t(const char *, nvlist_t *, nvlist_t *); +typedef int zfs_secpolicy_func_t(zfs_cmd_t *, nvlist_t *, cred_t *); + +typedef enum { + NO_NAME, + POOL_NAME, + DATASET_NAME +} zfs_ioc_namecheck_t; + +typedef enum { + POOL_CHECK_NONE = 1 << 0, + POOL_CHECK_SUSPENDED = 1 << 1, + POOL_CHECK_READONLY = 1 << 2, +} zfs_ioc_poolcheck_t; + +typedef struct zfs_ioc_vec { + zfs_ioc_legacy_func_t 
*zvec_legacy_func; + zfs_ioc_func_t *zvec_func; + zfs_secpolicy_func_t *zvec_secpolicy; + zfs_ioc_namecheck_t zvec_namecheck; + boolean_t zvec_allow_log; + zfs_ioc_poolcheck_t zvec_pool_check; + boolean_t zvec_smush_outnvlist; + const char *zvec_name; +} zfs_ioc_vec_t; + +/* This array is indexed by zfs_userquota_prop_t */ +static const char *userquota_perms[] = { + ZFS_DELEG_PERM_USERUSED, + ZFS_DELEG_PERM_USERQUOTA, + ZFS_DELEG_PERM_GROUPUSED, + ZFS_DELEG_PERM_GROUPQUOTA, +}; + +static int zfs_ioc_userspace_upgrade(zfs_cmd_t *zc); +static int zfs_check_settable(const char *name, nvpair_t *property, + cred_t *cr); +static int zfs_check_clearable(char *dataset, nvlist_t *props, + nvlist_t **errors); +static int zfs_fill_zplprops_root(uint64_t, nvlist_t *, nvlist_t *, + boolean_t *); +int zfs_set_prop_nvlist(const char *, zprop_source_t, nvlist_t *, nvlist_t *); +static int get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp); + +static void zfsdev_close(void *data); + +static int zfs_prop_activate_feature(spa_t *spa, spa_feature_t feature); + +/* _NOTE(PRINTFLIKE(4)) - this is printf-like, but lint is too whiney */ +void +__dprintf(const char *file, const char *func, int line, const char *fmt, ...) +{ + const char *newfile; + char buf[512]; + va_list adx; + + /* + * Get rid of annoying "../common/" prefix to filename. + */ + newfile = strrchr(file, '/'); + if (newfile != NULL) { + newfile = newfile + 1; /* Get rid of leading / */ + } else { + newfile = file; + } + + va_start(adx, fmt); + (void) vsnprintf(buf, sizeof (buf), fmt, adx); + va_end(adx); + + /* + * To get this data, use the zfs-dprintf probe as so: + * dtrace -q -n 'zfs-dprintf \ + * /stringof(arg0) == "dbuf.c"/ \ + * {printf("%s: %s", stringof(arg1), stringof(arg3))}' + * arg0 = file name + * arg1 = function name + * arg2 = line number + * arg3 = message + */ + DTRACE_PROBE4(zfs__dprintf, + char *, newfile, char *, func, int, line, char *, buf); +} + +static void +history_str_free(char *buf) +{ + kmem_free(buf, HIS_MAX_RECORD_LEN); +} + +static char * +history_str_get(zfs_cmd_t *zc) +{ + char *buf; + + if (zc->zc_history == 0) + return (NULL); + + buf = kmem_alloc(HIS_MAX_RECORD_LEN, KM_SLEEP); + if (copyinstr((void *)(uintptr_t)zc->zc_history, + buf, HIS_MAX_RECORD_LEN, NULL) != 0) { + history_str_free(buf); + return (NULL); + } + + buf[HIS_MAX_RECORD_LEN -1] = '\0'; + + return (buf); +} + +/* + * Check to see if the named dataset is currently defined as bootable + */ +static boolean_t +zfs_is_bootfs(const char *name) +{ + objset_t *os; + + if (dmu_objset_hold(name, FTAG, &os) == 0) { + boolean_t ret; + ret = (dmu_objset_id(os) == spa_bootfs(dmu_objset_spa(os))); + dmu_objset_rele(os, FTAG); + return (ret); + } + return (B_FALSE); +} + +/* + * Return non-zero if the spa version is less than requested version. + */ +static int +zfs_earlier_version(const char *name, int version) +{ + spa_t *spa; + + if (spa_open(name, &spa, FTAG) == 0) { + if (spa_version(spa) < version) { + spa_close(spa, FTAG); + return (1); + } + spa_close(spa, FTAG); + } + return (0); +} + +/* + * Return TRUE if the ZPL version is less than requested version. 
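+ *
+ * For example, later in this file zfs_check_settable() gates the
+ * sharesmb property on the ZPL FUID feature in roughly this way
+ * (an illustrative sketch, not a verbatim excerpt):
+ *
+ *	if (zpl_earlier_version(dsname, ZPL_VERSION_FUID))
+ *		return (SET_ERROR(ENOTSUP));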
+ */ +static boolean_t +zpl_earlier_version(const char *name, int version) +{ + objset_t *os; + boolean_t rc = B_TRUE; + + if (dmu_objset_hold(name, FTAG, &os) == 0) { + uint64_t zplversion; + + if (dmu_objset_type(os) != DMU_OST_ZFS) { + dmu_objset_rele(os, FTAG); + return (B_TRUE); + } + /* XXX reading from non-owned objset */ + if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &zplversion) == 0) + rc = zplversion < version; + dmu_objset_rele(os, FTAG); + } + return (rc); +} + +static void +zfs_log_history(zfs_cmd_t *zc) +{ + spa_t *spa; + char *buf; + + if ((buf = history_str_get(zc)) == NULL) + return; + + if (spa_open(zc->zc_name, &spa, FTAG) == 0) { + if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY) + (void) spa_history_log(spa, buf); + spa_close(spa, FTAG); + } + history_str_free(buf); +} + +/* + * Policy for top-level read operations (list pools). Requires no privileges, + * and can be used in the local zone, as there is no associated dataset. + */ +/* ARGSUSED */ +static int +zfs_secpolicy_none(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) +{ + return (0); +} + +/* + * Policy for dataset read operations (list children, get statistics). Requires + * no privileges, but must be visible in the local zone. + */ +/* ARGSUSED */ +static int +zfs_secpolicy_read(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) +{ + if (INGLOBALZONE(curthread) || + zone_dataset_visible(zc->zc_name, NULL)) + return (0); + + return (SET_ERROR(ENOENT)); +} + +static int +zfs_dozonecheck_impl(const char *dataset, uint64_t zoned, cred_t *cr) +{ + int writable = 1; + + /* + * The dataset must be visible by this zone -- check this first + * so they don't see EPERM on something they shouldn't know about. + */ + if (!INGLOBALZONE(curthread) && + !zone_dataset_visible(dataset, &writable)) + return (SET_ERROR(ENOENT)); + + if (INGLOBALZONE(curthread)) { + /* + * If the fs is zoned, only root can access it from the + * global zone. + */ + if (secpolicy_zfs(cr) && zoned) + return (SET_ERROR(EPERM)); + } else { + /* + * If we are in a local zone, the 'zoned' property must be set. + */ + if (!zoned) + return (SET_ERROR(EPERM)); + + /* must be writable by this zone */ + if (!writable) + return (SET_ERROR(EPERM)); + } + return (0); +} + +static int +zfs_dozonecheck(const char *dataset, cred_t *cr) +{ + uint64_t zoned; + + if (dsl_prop_get_integer(dataset, "jailed", &zoned, NULL)) + return (SET_ERROR(ENOENT)); + + return (zfs_dozonecheck_impl(dataset, zoned, cr)); +} + +static int +zfs_dozonecheck_ds(const char *dataset, dsl_dataset_t *ds, cred_t *cr) +{ + uint64_t zoned; + + if (dsl_prop_get_int_ds(ds, "jailed", &zoned)) + return (SET_ERROR(ENOENT)); + + return (zfs_dozonecheck_impl(dataset, zoned, cr)); +} + +static int +zfs_secpolicy_write_perms_ds(const char *name, dsl_dataset_t *ds, + const char *perm, cred_t *cr) +{ + int error; + + error = zfs_dozonecheck_ds(name, ds, cr); + if (error == 0) { + error = secpolicy_zfs(cr); + if (error != 0) + error = dsl_deleg_access_impl(ds, perm, cr); + } + return (error); +} + +static int +zfs_secpolicy_write_perms(const char *name, const char *perm, cred_t *cr) +{ + int error; + dsl_dataset_t *ds; + dsl_pool_t *dp; + + /* + * First do a quick check for root in the global zone, which + * is allowed to do all write_perms. This ensures that zfs_ioc_* + * will get to handle nonexistent datasets. 
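+ *
+ * When this fast path does not apply, the check proceeds through
+ * zfs_secpolicy_write_perms_ds() above, which tries secpolicy_zfs()
+ * and then the delegation tables via dsl_deleg_access_impl(), i.e.
+ * the machinery behind "zfs allow".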
+ */ + if (INGLOBALZONE(curthread) && secpolicy_zfs(cr) == 0) + return (0); + + error = dsl_pool_hold(name, FTAG, &dp); + if (error != 0) + return (error); + + error = dsl_dataset_hold(dp, name, FTAG, &ds); + if (error != 0) { + dsl_pool_rele(dp, FTAG); + return (error); + } + + error = zfs_secpolicy_write_perms_ds(name, ds, perm, cr); + + dsl_dataset_rele(ds, FTAG); + dsl_pool_rele(dp, FTAG); + return (error); +} + +#ifdef SECLABEL +/* + * Policy for setting the security label property. + * + * Returns 0 for success, non-zero for access and other errors. + */ +static int +zfs_set_slabel_policy(const char *name, char *strval, cred_t *cr) +{ + char ds_hexsl[MAXNAMELEN]; + bslabel_t ds_sl, new_sl; + boolean_t new_default = FALSE; + uint64_t zoned; + int needed_priv = -1; + int error; + + /* First get the existing dataset label. */ + error = dsl_prop_get(name, zfs_prop_to_name(ZFS_PROP_MLSLABEL), + 1, sizeof (ds_hexsl), &ds_hexsl, NULL); + if (error != 0) + return (SET_ERROR(EPERM)); + + if (strcasecmp(strval, ZFS_MLSLABEL_DEFAULT) == 0) + new_default = TRUE; + + /* The label must be translatable */ + if (!new_default && (hexstr_to_label(strval, &new_sl) != 0)) + return (SET_ERROR(EINVAL)); + + /* + * In a non-global zone, disallow attempts to set a label that + * doesn't match that of the zone; otherwise no other checks + * are needed. + */ + if (!INGLOBALZONE(curproc)) { + if (new_default || !blequal(&new_sl, CR_SL(CRED()))) + return (SET_ERROR(EPERM)); + return (0); + } + + /* + * For global-zone datasets (i.e., those whose zoned property is + * "off", verify that the specified new label is valid for the + * global zone. + */ + if (dsl_prop_get_integer(name, + zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL)) + return (SET_ERROR(EPERM)); + if (!zoned) { + if (zfs_check_global_label(name, strval) != 0) + return (SET_ERROR(EPERM)); + } + + /* + * If the existing dataset label is nondefault, check if the + * dataset is mounted (label cannot be changed while mounted). + * Get the zfsvfs; if there isn't one, then the dataset isn't + * mounted (or isn't a dataset, doesn't exist, ...). + */ + if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) != 0) { + objset_t *os; + static char *setsl_tag = "setsl_tag"; + + /* + * Try to own the dataset; abort if there is any error, + * (e.g., already mounted, in use, or other error). + */ + error = dmu_objset_own(name, DMU_OST_ZFS, B_TRUE, + setsl_tag, &os); + if (error != 0) + return (SET_ERROR(EPERM)); + + dmu_objset_disown(os, setsl_tag); + + if (new_default) { + needed_priv = PRIV_FILE_DOWNGRADE_SL; + goto out_check; + } + + if (hexstr_to_label(strval, &new_sl) != 0) + return (SET_ERROR(EPERM)); + + if (blstrictdom(&ds_sl, &new_sl)) + needed_priv = PRIV_FILE_DOWNGRADE_SL; + else if (blstrictdom(&new_sl, &ds_sl)) + needed_priv = PRIV_FILE_UPGRADE_SL; + } else { + /* dataset currently has a default label */ + if (!new_default) + needed_priv = PRIV_FILE_UPGRADE_SL; + } + +out_check: + if (needed_priv != -1) + return (PRIV_POLICY(cr, needed_priv, B_FALSE, EPERM, NULL)); + return (0); +} +#endif /* SECLABEL */ + +static int +zfs_secpolicy_setprop(const char *dsname, zfs_prop_t prop, nvpair_t *propval, + cred_t *cr) +{ + char *strval; + + /* + * Check permissions for special properties. + */ + switch (prop) { + case ZFS_PROP_ZONED: + /* + * Disallow setting of 'zoned' from within a local zone. 
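+ * (On FreeBSD the same knob surfaces as the 'jailed' property,
+ * which is what zfs_dozonecheck() above reads.)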
+ */ + if (!INGLOBALZONE(curthread)) + return (SET_ERROR(EPERM)); + break; + + case ZFS_PROP_QUOTA: + case ZFS_PROP_FILESYSTEM_LIMIT: + case ZFS_PROP_SNAPSHOT_LIMIT: + if (!INGLOBALZONE(curthread)) { + uint64_t zoned; + char setpoint[ZFS_MAX_DATASET_NAME_LEN]; + /* + * Unprivileged users are allowed to modify the + * limit on things *under* (ie. contained by) + * the thing they own. + */ + if (dsl_prop_get_integer(dsname, "jailed", &zoned, + setpoint)) + return (SET_ERROR(EPERM)); + if (!zoned || strlen(dsname) <= strlen(setpoint)) + return (SET_ERROR(EPERM)); + } + break; + + case ZFS_PROP_MLSLABEL: +#ifdef SECLABEL + if (!is_system_labeled()) + return (SET_ERROR(EPERM)); + + if (nvpair_value_string(propval, &strval) == 0) { + int err; + + err = zfs_set_slabel_policy(dsname, strval, CRED()); + if (err != 0) + return (err); + } +#else + return (EOPNOTSUPP); +#endif + break; + } + + return (zfs_secpolicy_write_perms(dsname, zfs_prop_to_name(prop), cr)); +} + +/* ARGSUSED */ +static int +zfs_secpolicy_set_fsacl(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) +{ + int error; + + error = zfs_dozonecheck(zc->zc_name, cr); + if (error != 0) + return (error); + + /* + * permission to set permissions will be evaluated later in + * dsl_deleg_can_allow() + */ + return (0); +} + +/* ARGSUSED */ +static int +zfs_secpolicy_rollback(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) +{ + return (zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_ROLLBACK, cr)); +} + +/* ARGSUSED */ +static int +zfs_secpolicy_send(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) +{ + dsl_pool_t *dp; + dsl_dataset_t *ds; + char *cp; + int error; + + /* + * Generate the current snapshot name from the given objsetid, then + * use that name for the secpolicy/zone checks. + */ + cp = strchr(zc->zc_name, '@'); + if (cp == NULL) + return (SET_ERROR(EINVAL)); + error = dsl_pool_hold(zc->zc_name, FTAG, &dp); + if (error != 0) + return (error); + + error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &ds); + if (error != 0) { + dsl_pool_rele(dp, FTAG); + return (error); + } + + dsl_dataset_name(ds, zc->zc_name); + + error = zfs_secpolicy_write_perms_ds(zc->zc_name, ds, + ZFS_DELEG_PERM_SEND, cr); + dsl_dataset_rele(ds, FTAG); + dsl_pool_rele(dp, FTAG); + + return (error); +} + +/* ARGSUSED */ +static int +zfs_secpolicy_send_new(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) +{ + return (zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_SEND, cr)); +} + +/* ARGSUSED */ +static int +zfs_secpolicy_deleg_share(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) +{ + vnode_t *vp; + int error; + + if ((error = lookupname(zc->zc_value, UIO_SYSSPACE, + NO_FOLLOW, NULL, &vp)) != 0) + return (error); + + /* Now make sure mntpnt and dataset are ZFS */ + + if (strcmp(vp->v_vfsp->mnt_stat.f_fstypename, "zfs") != 0 || + (strcmp((char *)refstr_value(vp->v_vfsp->vfs_resource), + zc->zc_name) != 0)) { + VN_RELE(vp); + return (SET_ERROR(EPERM)); + } + + VN_RELE(vp); + return (dsl_deleg_access(zc->zc_name, + ZFS_DELEG_PERM_SHARE, cr)); +} + +int +zfs_secpolicy_share(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) +{ + if (!INGLOBALZONE(curthread)) + return (SET_ERROR(EPERM)); + + if (secpolicy_nfs(cr) == 0) { + return (0); + } else { + return (zfs_secpolicy_deleg_share(zc, innvl, cr)); + } +} + +int +zfs_secpolicy_smb_acl(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) +{ + if (!INGLOBALZONE(curthread)) + return (SET_ERROR(EPERM)); + + if (secpolicy_smb(cr) == 0) { + return (0); + } else { + return (zfs_secpolicy_deleg_share(zc, innvl, cr)); + } +} + +static int 
+zfs_get_parent(const char *datasetname, char *parent, int parentsize) +{ + char *cp; + + /* + * Remove the @bla or /bla from the end of the name to get the parent. + */ + (void) strncpy(parent, datasetname, parentsize); + cp = strrchr(parent, '@'); + if (cp != NULL) { + cp[0] = '\0'; + } else { + cp = strrchr(parent, '/'); + if (cp == NULL) + return (SET_ERROR(ENOENT)); + cp[0] = '\0'; + } + + return (0); +} + +int +zfs_secpolicy_destroy_perms(const char *name, cred_t *cr) +{ + int error; + + if ((error = zfs_secpolicy_write_perms(name, + ZFS_DELEG_PERM_MOUNT, cr)) != 0) + return (error); + + return (zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_DESTROY, cr)); +} + +/* ARGSUSED */ +static int +zfs_secpolicy_destroy(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) +{ + return (zfs_secpolicy_destroy_perms(zc->zc_name, cr)); +} + +/* + * Destroying snapshots with delegated permissions requires + * descendant mount and destroy permissions. + */ +/* ARGSUSED */ +static int +zfs_secpolicy_destroy_snaps(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) +{ + nvlist_t *snaps; + nvpair_t *pair, *nextpair; + int error = 0; + + if (nvlist_lookup_nvlist(innvl, "snaps", &snaps) != 0) + return (SET_ERROR(EINVAL)); + for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; + pair = nextpair) { + nextpair = nvlist_next_nvpair(snaps, pair); + error = zfs_secpolicy_destroy_perms(nvpair_name(pair), cr); + if (error == ENOENT) { + /* + * Ignore any snapshots that don't exist (we consider + * them "already destroyed"). Remove the name from the + * nvl here in case the snapshot is created between + * now and when we try to destroy it (in which case + * we don't want to destroy it since we haven't + * checked for permission). + */ + fnvlist_remove_nvpair(snaps, pair); + error = 0; + } + if (error != 0) + break; + } + + return (error); +} + +int +zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr) +{ + char parentname[ZFS_MAX_DATASET_NAME_LEN]; + int error; + + if ((error = zfs_secpolicy_write_perms(from, + ZFS_DELEG_PERM_RENAME, cr)) != 0) + return (error); + + if ((error = zfs_secpolicy_write_perms(from, + ZFS_DELEG_PERM_MOUNT, cr)) != 0) + return (error); + + if ((error = zfs_get_parent(to, parentname, + sizeof (parentname))) != 0) + return (error); + + if ((error = zfs_secpolicy_write_perms(parentname, + ZFS_DELEG_PERM_CREATE, cr)) != 0) + return (error); + + if ((error = zfs_secpolicy_write_perms(parentname, + ZFS_DELEG_PERM_MOUNT, cr)) != 0) + return (error); + + return (error); +} + +/* ARGSUSED */ +static int +zfs_secpolicy_rename(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) +{ + char *at = NULL; + int error; + + if ((zc->zc_cookie & 1) != 0) { + /* + * This is recursive rename, so the starting snapshot might + * not exist. Check file system or volume permission instead. 
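+		 * For example, a recursive rename of pool/fs@snap1 (an
+		 * illustrative name) strips the '@snap1' part and then
+		 * checks 'rename' and 'mount' on pool/fs plus 'create'
+		 * and 'mount' on the new parent, per
+		 * zfs_secpolicy_rename_perms() above.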
+		 */
+		at = strchr(zc->zc_name, '@');
+		if (at == NULL)
+			return (EINVAL);
+		*at = '\0';
+	}
+
+	error = zfs_secpolicy_rename_perms(zc->zc_name, zc->zc_value, cr);
+
+	if (at != NULL)
+		*at = '@';
+
+	return (error);
+}
+
+/* ARGSUSED */
+static int
+zfs_secpolicy_promote(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+	dsl_pool_t *dp;
+	dsl_dataset_t *clone;
+	int error;
+
+	error = zfs_secpolicy_write_perms(zc->zc_name,
+	    ZFS_DELEG_PERM_PROMOTE, cr);
+	if (error != 0)
+		return (error);
+
+	error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
+	if (error != 0)
+		return (error);
+
+	error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &clone);
+
+	if (error == 0) {
+		char parentname[ZFS_MAX_DATASET_NAME_LEN];
+		dsl_dataset_t *origin = NULL;
+		dsl_dir_t *dd;
+		dd = clone->ds_dir;
+
+		error = dsl_dataset_hold_obj(dd->dd_pool,
+		    dsl_dir_phys(dd)->dd_origin_obj, FTAG, &origin);
+		if (error != 0) {
+			dsl_dataset_rele(clone, FTAG);
+			dsl_pool_rele(dp, FTAG);
+			return (error);
+		}
+
+		error = zfs_secpolicy_write_perms_ds(zc->zc_name, clone,
+		    ZFS_DELEG_PERM_MOUNT, cr);
+
+		dsl_dataset_name(origin, parentname);
+		if (error == 0) {
+			error = zfs_secpolicy_write_perms_ds(parentname, origin,
+			    ZFS_DELEG_PERM_PROMOTE, cr);
+		}
+		dsl_dataset_rele(clone, FTAG);
+		dsl_dataset_rele(origin, FTAG);
+	}
+	dsl_pool_rele(dp, FTAG);
+	return (error);
+}
+
+/* ARGSUSED */
+static int
+zfs_secpolicy_recv(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+	int error;
+
+	if ((error = zfs_secpolicy_write_perms(zc->zc_name,
+	    ZFS_DELEG_PERM_RECEIVE, cr)) != 0)
+		return (error);
+
+	if ((error = zfs_secpolicy_write_perms(zc->zc_name,
+	    ZFS_DELEG_PERM_MOUNT, cr)) != 0)
+		return (error);
+
+	return (zfs_secpolicy_write_perms(zc->zc_name,
+	    ZFS_DELEG_PERM_CREATE, cr));
+}
+
+int
+zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr)
+{
+	return (zfs_secpolicy_write_perms(name,
+	    ZFS_DELEG_PERM_SNAPSHOT, cr));
+}
+
+/*
+ * Check for permission to create each snapshot in the nvlist.
+ */
+/* ARGSUSED */
+static int
+zfs_secpolicy_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+	nvlist_t *snaps;
+	int error;
+	nvpair_t *pair;
+
+	if (nvlist_lookup_nvlist(innvl, "snaps", &snaps) != 0)
+		return (SET_ERROR(EINVAL));
+	for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
+	    pair = nvlist_next_nvpair(snaps, pair)) {
+		char *name = nvpair_name(pair);
+		char *atp = strchr(name, '@');
+
+		if (atp == NULL) {
+			error = SET_ERROR(EINVAL);
+			break;
+		}
+		*atp = '\0';
+		error = zfs_secpolicy_snapshot_perms(name, cr);
+		*atp = '@';
+		if (error != 0)
+			break;
+	}
+	return (error);
+}
+
+/*
+ * Check for permission to create each bookmark in the nvlist.
+ */ +/* ARGSUSED */ +static int +zfs_secpolicy_bookmark(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) +{ + int error = 0; + + for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL); + pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { + char *name = nvpair_name(pair); + char *hashp = strchr(name, '#'); + + if (hashp == NULL) { + error = SET_ERROR(EINVAL); + break; + } + *hashp = '\0'; + error = zfs_secpolicy_write_perms(name, + ZFS_DELEG_PERM_BOOKMARK, cr); + *hashp = '#'; + if (error != 0) + break; + } + return (error); +} + +/* ARGSUSED */ +static int +zfs_secpolicy_remap(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) +{ + return (zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_REMAP, cr)); +} + +/* ARGSUSED */ +static int +zfs_secpolicy_destroy_bookmarks(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) +{ + nvpair_t *pair, *nextpair; + int error = 0; + + for (pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; + pair = nextpair) { + char *name = nvpair_name(pair); + char *hashp = strchr(name, '#'); + nextpair = nvlist_next_nvpair(innvl, pair); + + if (hashp == NULL) { + error = SET_ERROR(EINVAL); + break; + } + + *hashp = '\0'; + error = zfs_secpolicy_write_perms(name, + ZFS_DELEG_PERM_DESTROY, cr); + *hashp = '#'; + if (error == ENOENT) { + /* + * Ignore any filesystems that don't exist (we consider + * their bookmarks "already destroyed"). Remove + * the name from the nvl here in case the filesystem + * is created between now and when we try to destroy + * the bookmark (in which case we don't want to + * destroy it since we haven't checked for permission). + */ + fnvlist_remove_nvpair(innvl, pair); + error = 0; + } + if (error != 0) + break; + } + + return (error); +} + +/* ARGSUSED */ +static int +zfs_secpolicy_log_history(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) +{ + /* + * Even root must have a proper TSD so that we know what pool + * to log to. + */ + if (tsd_get(zfs_allow_log_key) == NULL) + return (SET_ERROR(EPERM)); + return (0); +} + +static int +zfs_secpolicy_create_clone(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) +{ + char parentname[ZFS_MAX_DATASET_NAME_LEN]; + int error; + char *origin; + + if ((error = zfs_get_parent(zc->zc_name, parentname, + sizeof (parentname))) != 0) + return (error); + + if (nvlist_lookup_string(innvl, "origin", &origin) == 0 && + (error = zfs_secpolicy_write_perms(origin, + ZFS_DELEG_PERM_CLONE, cr)) != 0) + return (error); + + if ((error = zfs_secpolicy_write_perms(parentname, + ZFS_DELEG_PERM_CREATE, cr)) != 0) + return (error); + + return (zfs_secpolicy_write_perms(parentname, + ZFS_DELEG_PERM_MOUNT, cr)); +} + +/* + * Policy for pool operations - create/destroy pools, add vdevs, etc. Requires + * SYS_CONFIG privilege, which is not available in a local zone. + */ +/* ARGSUSED */ +static int +zfs_secpolicy_config(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) +{ + if (secpolicy_sys_config(cr, B_FALSE) != 0) + return (SET_ERROR(EPERM)); + + return (0); +} + +/* + * Policy for object to name lookups. + */ +/* ARGSUSED */ +static int +zfs_secpolicy_diff(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) +{ + int error; + + if ((error = secpolicy_sys_config(cr, B_FALSE)) == 0) + return (0); + + error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_DIFF, cr); + return (error); +} + +/* + * Policy for fault injection. Requires all privileges. 
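+ * (These requests are issued by the zinject test utility; an
+ * illustrative invocation would be "zinject -d da0 -e io tank".)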
+ */
+/* ARGSUSED */
+static int
+zfs_secpolicy_inject(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+	return (secpolicy_zinject(cr));
+}
+
+/* ARGSUSED */
+static int
+zfs_secpolicy_inherit_prop(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+	zfs_prop_t prop = zfs_name_to_prop(zc->zc_value);
+
+	if (prop == ZPROP_INVAL) {
+		if (!zfs_prop_user(zc->zc_value))
+			return (SET_ERROR(EINVAL));
+		return (zfs_secpolicy_write_perms(zc->zc_name,
+		    ZFS_DELEG_PERM_USERPROP, cr));
+	} else {
+		return (zfs_secpolicy_setprop(zc->zc_name, prop,
+		    NULL, cr));
+	}
+}
+
+static int
+zfs_secpolicy_userspace_one(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+	int err = zfs_secpolicy_read(zc, innvl, cr);
+	if (err)
+		return (err);
+
+	if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS)
+		return (SET_ERROR(EINVAL));
+
+	if (zc->zc_value[0] == 0) {
+		/*
+		 * They are asking about a POSIX uid/gid. If it's
+		 * their own, allow it.
+		 */
+		if (zc->zc_objset_type == ZFS_PROP_USERUSED ||
+		    zc->zc_objset_type == ZFS_PROP_USERQUOTA) {
+			if (zc->zc_guid == crgetuid(cr))
+				return (0);
+		} else {
+			if (groupmember(zc->zc_guid, cr))
+				return (0);
+		}
+	}
+
+	return (zfs_secpolicy_write_perms(zc->zc_name,
+	    userquota_perms[zc->zc_objset_type], cr));
+}
+
+static int
+zfs_secpolicy_userspace_many(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+	int err = zfs_secpolicy_read(zc, innvl, cr);
+	if (err)
+		return (err);
+
+	if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS)
+		return (SET_ERROR(EINVAL));
+
+	return (zfs_secpolicy_write_perms(zc->zc_name,
+	    userquota_perms[zc->zc_objset_type], cr));
+}
+
+/* ARGSUSED */
+static int
+zfs_secpolicy_userspace_upgrade(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+	return (zfs_secpolicy_setprop(zc->zc_name, ZFS_PROP_VERSION,
+	    NULL, cr));
+}
+
+/* ARGSUSED */
+static int
+zfs_secpolicy_hold(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+	nvpair_t *pair;
+	nvlist_t *holds;
+	int error;
+
+	error = nvlist_lookup_nvlist(innvl, "holds", &holds);
+	if (error != 0)
+		return (SET_ERROR(EINVAL));
+
+	for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL;
+	    pair = nvlist_next_nvpair(holds, pair)) {
+		char fsname[ZFS_MAX_DATASET_NAME_LEN];
+		error = dmu_fsname(nvpair_name(pair), fsname);
+		if (error != 0)
+			return (error);
+		error = zfs_secpolicy_write_perms(fsname,
+		    ZFS_DELEG_PERM_HOLD, cr);
+		if (error != 0)
+			return (error);
+	}
+	return (0);
+}
+
+/* ARGSUSED */
+static int
+zfs_secpolicy_release(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+	nvpair_t *pair;
+	int error;
+
+	for (pair = nvlist_next_nvpair(innvl, NULL); pair != NULL;
+	    pair = nvlist_next_nvpair(innvl, pair)) {
+		char fsname[ZFS_MAX_DATASET_NAME_LEN];
+		error = dmu_fsname(nvpair_name(pair), fsname);
+		if (error != 0)
+			return (error);
+		error = zfs_secpolicy_write_perms(fsname,
+		    ZFS_DELEG_PERM_RELEASE, cr);
+		if (error != 0)
+			return (error);
+	}
+	return (0);
+}
+
+/*
+ * Policy for allowing temporary snapshots to be taken or released.
+ */
+static int
+zfs_secpolicy_tmp_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+	/*
+	 * A temporary snapshot is the same as a snapshot,
+	 * hold, destroy and release all rolled into one.
+	 * Delegated 'diff' permission alone is sufficient for us to
+	 * allow this.
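+	 *
+	 * Concretely, the code below first tries ZFS_DELEG_PERM_DIFF
+	 * and only falls back to requiring the full set of snapshot,
+	 * hold, release and destroy permissions when that check fails.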
+ */ + int error; + + if ((error = zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_DIFF, cr)) == 0) + return (0); + + error = zfs_secpolicy_snapshot_perms(zc->zc_name, cr); + if (error == 0) + error = zfs_secpolicy_hold(zc, innvl, cr); + if (error == 0) + error = zfs_secpolicy_release(zc, innvl, cr); + if (error == 0) + error = zfs_secpolicy_destroy(zc, innvl, cr); + return (error); +} + +/* + * Returns the nvlist as specified by the user in the zfs_cmd_t. + */ +static int +get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp) +{ + char *packed; + int error; + nvlist_t *list = NULL; + + /* + * Read in and unpack the user-supplied nvlist. + */ + if (size == 0) + return (SET_ERROR(EINVAL)); + + packed = kmem_alloc(size, KM_SLEEP); + + if ((error = ddi_copyin((void *)(uintptr_t)nvl, packed, size, + iflag)) != 0) { + kmem_free(packed, size); + return (SET_ERROR(EFAULT)); + } + + if ((error = nvlist_unpack(packed, size, &list, 0)) != 0) { + kmem_free(packed, size); + return (error); + } + + kmem_free(packed, size); + + *nvp = list; + return (0); +} + +/* + * Reduce the size of this nvlist until it can be serialized in 'max' bytes. + * Entries will be removed from the end of the nvlist, and one int32 entry + * named "N_MORE_ERRORS" will be added indicating how many entries were + * removed. + */ +static int +nvlist_smush(nvlist_t *errors, size_t max) +{ + size_t size; + + size = fnvlist_size(errors); + + if (size > max) { + nvpair_t *more_errors; + int n = 0; + + if (max < 1024) + return (SET_ERROR(ENOMEM)); + + fnvlist_add_int32(errors, ZPROP_N_MORE_ERRORS, 0); + more_errors = nvlist_prev_nvpair(errors, NULL); + + do { + nvpair_t *pair = nvlist_prev_nvpair(errors, + more_errors); + fnvlist_remove_nvpair(errors, pair); + n++; + size = fnvlist_size(errors); + } while (size > max); + + fnvlist_remove_nvpair(errors, more_errors); + fnvlist_add_int32(errors, ZPROP_N_MORE_ERRORS, n); + ASSERT3U(fnvlist_size(errors), <=, max); + } + + return (0); +} + +static int +put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl) +{ + char *packed = NULL; + int error = 0; + size_t size; + + size = fnvlist_size(nvl); + + if (size > zc->zc_nvlist_dst_size) { + /* + * Solaris returns ENOMEM here, because even if an error is + * returned from an ioctl(2), new zc_nvlist_dst_size will be + * passed to the userland. This is not the case for FreeBSD. + * We need to return 0, so the kernel will copy the + * zc_nvlist_dst_size back and the userland can discover that a + * bigger buffer is needed. 
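+		 *
+		 * A consumer-side sketch of the resulting retry loop
+		 * (hypothetical pseudocode, not a verbatim libzfs excerpt):
+		 *
+		 *	if (ioctl(fd, request, zc) == 0 &&
+		 *	    zc->zc_nvlist_dst_size > buflen)
+		 *		enlarge the buffer, reissue the ioctl;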
+ */ + error = 0; + } else { + packed = fnvlist_pack(nvl, &size); + if (ddi_copyout(packed, (void *)(uintptr_t)zc->zc_nvlist_dst, + size, zc->zc_iflags) != 0) + error = SET_ERROR(EFAULT); + fnvlist_pack_free(packed, size); + } + + zc->zc_nvlist_dst_size = size; + zc->zc_nvlist_dst_filled = B_TRUE; + return (error); +} + +int +getzfsvfs_impl(objset_t *os, vfs_t **vfsp) +{ + zfsvfs_t *zfvp; + int error = 0; + + if (dmu_objset_type(os) != DMU_OST_ZFS) { + return (SET_ERROR(EINVAL)); + } + + mutex_enter(&os->os_user_ptr_lock); + zfvp = dmu_objset_get_user(os); + if (zfvp) { + *vfsp = zfvp->z_vfs; + vfs_ref(zfvp->z_vfs); + } else { + error = SET_ERROR(ESRCH); + } + mutex_exit(&os->os_user_ptr_lock); + return (error); +} + +int +getzfsvfs(const char *dsname, zfsvfs_t **zfvp) +{ + objset_t *os; + vfs_t *vfsp; + int error; + + error = dmu_objset_hold(dsname, FTAG, &os); + if (error != 0) + return (error); + error = getzfsvfs_impl(os, &vfsp); + dmu_objset_rele(os, FTAG); + if (error != 0) + return (error); + + error = vfs_busy(vfsp, 0); + vfs_rel(vfsp); + if (error != 0) { + *zfvp = NULL; + error = SET_ERROR(ESRCH); + } else { + *zfvp = vfsp->vfs_data; + } + return (error); +} + +/* + * Find a zfsvfs_t for a mounted filesystem, or create our own, in which + * case its z_vfs will be NULL, and it will be opened as the owner. + * If 'writer' is set, the z_teardown_lock will be held for RW_WRITER, + * which prevents all vnode ops from running. + */ +static int +zfsvfs_hold(const char *name, void *tag, zfsvfs_t **zfvp, boolean_t writer) +{ + int error = 0; + + if (getzfsvfs(name, zfvp) != 0) + error = zfsvfs_create(name, zfvp); + if (error == 0) { + rrm_enter(&(*zfvp)->z_teardown_lock, (writer) ? RW_WRITER : + RW_READER, tag); +#ifdef illumos + if ((*zfvp)->z_unmounted) { + /* + * XXX we could probably try again, since the unmounting + * thread should be just about to disassociate the + * objset from the zfsvfs. + */ + rrm_exit(&(*zfvp)->z_teardown_lock, tag); + return (SET_ERROR(EBUSY)); + } +#else + /* + * vfs_busy() ensures that the filesystem is not and + * can not be unmounted. 
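+		 * Unlike the illumos branch above, encountering an
+		 * unmounted zfsvfs here would therefore be a programming
+		 * error, hence the ASSERT that follows.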
+ */ + ASSERT(!(*zfvp)->z_unmounted); +#endif + } + return (error); +} + +static void +zfsvfs_rele(zfsvfs_t *zfsvfs, void *tag) +{ + rrm_exit(&zfsvfs->z_teardown_lock, tag); + + if (zfsvfs->z_vfs) { +#ifdef illumos + VFS_RELE(zfsvfs->z_vfs); +#else + vfs_unbusy(zfsvfs->z_vfs); +#endif + } else { + dmu_objset_disown(zfsvfs->z_os, zfsvfs); + zfsvfs_free(zfsvfs); + } +} + +static int +zfs_ioc_pool_create(zfs_cmd_t *zc) +{ + int error; + nvlist_t *config, *props = NULL; + nvlist_t *rootprops = NULL; + nvlist_t *zplprops = NULL; + char *spa_name = zc->zc_name; + + if (error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, + zc->zc_iflags, &config)) + return (error); + + if (zc->zc_nvlist_src_size != 0 && (error = + get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + zc->zc_iflags, &props))) { + nvlist_free(config); + return (error); + } + + if (props) { + nvlist_t *nvl = NULL; + uint64_t version = SPA_VERSION; + char *tname; + + (void) nvlist_lookup_uint64(props, + zpool_prop_to_name(ZPOOL_PROP_VERSION), &version); + if (!SPA_VERSION_IS_SUPPORTED(version)) { + error = SET_ERROR(EINVAL); + goto pool_props_bad; + } + (void) nvlist_lookup_nvlist(props, ZPOOL_ROOTFS_PROPS, &nvl); + if (nvl) { + error = nvlist_dup(nvl, &rootprops, KM_SLEEP); + if (error != 0) { + nvlist_free(config); + nvlist_free(props); + return (error); + } + (void) nvlist_remove_all(props, ZPOOL_ROOTFS_PROPS); + } + VERIFY(nvlist_alloc(&zplprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); + error = zfs_fill_zplprops_root(version, rootprops, + zplprops, NULL); + if (error != 0) + goto pool_props_bad; + + if (nvlist_lookup_string(props, + zpool_prop_to_name(ZPOOL_PROP_TNAME), &tname) == 0) + spa_name = tname; + } + + error = spa_create(zc->zc_name, config, props, zplprops); + + /* + * Set the remaining root properties + */ + if (!error && (error = zfs_set_prop_nvlist(spa_name, + ZPROP_SRC_LOCAL, rootprops, NULL)) != 0) + (void) spa_destroy(spa_name); + +pool_props_bad: + nvlist_free(rootprops); + nvlist_free(zplprops); + nvlist_free(config); + nvlist_free(props); + + return (error); +} + +static int +zfs_ioc_pool_destroy(zfs_cmd_t *zc) +{ + int error; + zfs_log_history(zc); + error = spa_destroy(zc->zc_name); + if (error == 0) + zvol_remove_minors(zc->zc_name); + return (error); +} + +static int +zfs_ioc_pool_import(zfs_cmd_t *zc) +{ + nvlist_t *config, *props = NULL; + uint64_t guid; + int error; + + if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, + zc->zc_iflags, &config)) != 0) + return (error); + + if (zc->zc_nvlist_src_size != 0 && (error = + get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + zc->zc_iflags, &props))) { + nvlist_free(config); + return (error); + } + + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) != 0 || + guid != zc->zc_guid) + error = SET_ERROR(EINVAL); + else + error = spa_import(zc->zc_name, config, props, zc->zc_cookie); + + if (zc->zc_nvlist_dst != 0) { + int err; + + if ((err = put_nvlist(zc, config)) != 0) + error = err; + } + + nvlist_free(config); + + nvlist_free(props); + + return (error); +} + +static int +zfs_ioc_pool_export(zfs_cmd_t *zc) +{ + int error; + boolean_t force = (boolean_t)zc->zc_cookie; + boolean_t hardforce = (boolean_t)zc->zc_guid; + + zfs_log_history(zc); + error = spa_export(zc->zc_name, NULL, force, hardforce); + if (error == 0) + zvol_remove_minors(zc->zc_name); + return (error); +} + +static int +zfs_ioc_pool_configs(zfs_cmd_t *zc) +{ + nvlist_t *configs; + int error; + + if ((configs = spa_all_configs(&zc->zc_cookie)) == NULL) + 
return (SET_ERROR(EEXIST)); + + error = put_nvlist(zc, configs); + + nvlist_free(configs); + + return (error); +} + +/* + * inputs: + * zc_name name of the pool + * + * outputs: + * zc_cookie real errno + * zc_nvlist_dst config nvlist + * zc_nvlist_dst_size size of config nvlist + */ +static int +zfs_ioc_pool_stats(zfs_cmd_t *zc) +{ + nvlist_t *config; + int error; + int ret = 0; + + error = spa_get_stats(zc->zc_name, &config, zc->zc_value, + sizeof (zc->zc_value)); + + if (config != NULL) { + ret = put_nvlist(zc, config); + nvlist_free(config); + + /* + * The config may be present even if 'error' is non-zero. + * In this case we return success, and preserve the real errno + * in 'zc_cookie'. + */ + zc->zc_cookie = error; + } else { + ret = error; + } + + return (ret); +} + +/* + * Try to import the given pool, returning pool stats as appropriate so that + * user land knows which devices are available and overall pool health. + */ +static int +zfs_ioc_pool_tryimport(zfs_cmd_t *zc) +{ + nvlist_t *tryconfig, *config; + int error; + + if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, + zc->zc_iflags, &tryconfig)) != 0) + return (error); + + config = spa_tryimport(tryconfig); + + nvlist_free(tryconfig); + + if (config == NULL) + return (SET_ERROR(EINVAL)); + + error = put_nvlist(zc, config); + nvlist_free(config); + + return (error); +} + +/* + * inputs: + * zc_name name of the pool + * zc_cookie scan func (pool_scan_func_t) + * zc_flags scrub pause/resume flag (pool_scrub_cmd_t) + */ +static int +zfs_ioc_pool_scan(zfs_cmd_t *zc) +{ + spa_t *spa; + int error; + + if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) + return (error); + + if (zc->zc_flags >= POOL_SCRUB_FLAGS_END) + return (SET_ERROR(EINVAL)); + + if (zc->zc_flags == POOL_SCRUB_PAUSE) + error = spa_scrub_pause_resume(spa, POOL_SCRUB_PAUSE); + else if (zc->zc_cookie == POOL_SCAN_NONE) + error = spa_scan_stop(spa); + else + error = spa_scan(spa, zc->zc_cookie); + + spa_close(spa, FTAG); + + return (error); +} + +static int +zfs_ioc_pool_freeze(zfs_cmd_t *zc) +{ + spa_t *spa; + int error; + + error = spa_open(zc->zc_name, &spa, FTAG); + if (error == 0) { + spa_freeze(spa); + spa_close(spa, FTAG); + } + return (error); +} + +static int +zfs_ioc_pool_upgrade(zfs_cmd_t *zc) +{ + spa_t *spa; + int error; + + if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) + return (error); + + if (zc->zc_cookie < spa_version(spa) || + !SPA_VERSION_IS_SUPPORTED(zc->zc_cookie)) { + spa_close(spa, FTAG); + return (SET_ERROR(EINVAL)); + } + + spa_upgrade(spa, zc->zc_cookie); + spa_close(spa, FTAG); + + return (error); +} + +static int +zfs_ioc_pool_get_history(zfs_cmd_t *zc) +{ + spa_t *spa; + char *hist_buf; + uint64_t size; + int error; + + if ((size = zc->zc_history_len) == 0) + return (SET_ERROR(EINVAL)); + + if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) + return (error); + + if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) { + spa_close(spa, FTAG); + return (SET_ERROR(ENOTSUP)); + } + + hist_buf = kmem_alloc(size, KM_SLEEP); + if ((error = spa_history_get(spa, &zc->zc_history_offset, + &zc->zc_history_len, hist_buf)) == 0) { + error = ddi_copyout(hist_buf, + (void *)(uintptr_t)zc->zc_history, + zc->zc_history_len, zc->zc_iflags); + } + + spa_close(spa, FTAG); + kmem_free(hist_buf, size); + return (error); +} + +static int +zfs_ioc_pool_reguid(zfs_cmd_t *zc) +{ + spa_t *spa; + int error; + + error = spa_open(zc->zc_name, &spa, FTAG); + if (error == 0) { + error = spa_change_guid(spa); + spa_close(spa, FTAG); + } + 
return (error);
+}
+
+static int
+zfs_ioc_dsobj_to_dsname(zfs_cmd_t *zc)
+{
+	return (dsl_dsobj_to_dsname(zc->zc_name, zc->zc_obj, zc->zc_value));
+}
+
+/*
+ * inputs:
+ * zc_name		name of filesystem
+ * zc_obj		object to find
+ *
+ * outputs:
+ * zc_value		name of object
+ */
+static int
+zfs_ioc_obj_to_path(zfs_cmd_t *zc)
+{
+	objset_t *os;
+	int error;
+
+	/* XXX reading from objset not owned */
+	if ((error = dmu_objset_hold(zc->zc_name, FTAG, &os)) != 0)
+		return (error);
+	if (dmu_objset_type(os) != DMU_OST_ZFS) {
+		dmu_objset_rele(os, FTAG);
+		return (SET_ERROR(EINVAL));
+	}
+	error = zfs_obj_to_path(os, zc->zc_obj, zc->zc_value,
+	    sizeof (zc->zc_value));
+	dmu_objset_rele(os, FTAG);
+
+	return (error);
+}
+
+/*
+ * inputs:
+ * zc_name		name of filesystem
+ * zc_obj		object to find
+ *
+ * outputs:
+ * zc_stat		stats on object
+ * zc_value		path to object
+ */
+static int
+zfs_ioc_obj_to_stats(zfs_cmd_t *zc)
+{
+	objset_t *os;
+	int error;
+
+	/* XXX reading from objset not owned */
+	if ((error = dmu_objset_hold(zc->zc_name, FTAG, &os)) != 0)
+		return (error);
+	if (dmu_objset_type(os) != DMU_OST_ZFS) {
+		dmu_objset_rele(os, FTAG);
+		return (SET_ERROR(EINVAL));
+	}
+	error = zfs_obj_to_stats(os, zc->zc_obj, &zc->zc_stat, zc->zc_value,
+	    sizeof (zc->zc_value));
+	dmu_objset_rele(os, FTAG);
+
+	return (error);
+}
+
+static int
+zfs_ioc_vdev_add(zfs_cmd_t *zc)
+{
+	spa_t *spa;
+	int error;
+	nvlist_t *config, **l2cache, **spares;
+	uint_t nl2cache = 0, nspares = 0;
+
+	error = spa_open(zc->zc_name, &spa, FTAG);
+	if (error != 0)
+		return (error);
+
+	error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
+	    zc->zc_iflags, &config);
+	(void) nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_L2CACHE,
+	    &l2cache, &nl2cache);
+
+	(void) nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_SPARES,
+	    &spares, &nspares);
+
+#ifdef illumos
+	/*
+	 * A root pool with concatenated devices is not supported, so a
+	 * device cannot be added to a root pool.
+	 *
+	 * An intent log device cannot be added to a root pool either,
+	 * because the ZIL is replayed during mountroot and a separate
+	 * log device is not accessible at that time.
+	 *
+	 * l2cache and spare devices are OK to add to a root pool.
+ */ + if (spa_bootfs(spa) != 0 && nl2cache == 0 && nspares == 0) { + nvlist_free(config); + spa_close(spa, FTAG); + return (SET_ERROR(EDOM)); + } +#endif /* illumos */ + + if (error == 0) { + error = spa_vdev_add(spa, config); + nvlist_free(config); + } + spa_close(spa, FTAG); + return (error); +} + +/* + * inputs: + * zc_name name of the pool + * zc_guid guid of vdev to remove + * zc_cookie cancel removal + */ +static int +zfs_ioc_vdev_remove(zfs_cmd_t *zc) +{ + spa_t *spa; + int error; + + error = spa_open(zc->zc_name, &spa, FTAG); + if (error != 0) + return (error); + if (zc->zc_cookie != 0) { + error = spa_vdev_remove_cancel(spa); + } else { + error = spa_vdev_remove(spa, zc->zc_guid, B_FALSE); + } + spa_close(spa, FTAG); + return (error); +} + +static int +zfs_ioc_vdev_set_state(zfs_cmd_t *zc) +{ + spa_t *spa; + int error; + vdev_state_t newstate = VDEV_STATE_UNKNOWN; + + if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) + return (error); + switch (zc->zc_cookie) { + case VDEV_STATE_ONLINE: + error = vdev_online(spa, zc->zc_guid, zc->zc_obj, &newstate); + break; + + case VDEV_STATE_OFFLINE: + error = vdev_offline(spa, zc->zc_guid, zc->zc_obj); + break; + + case VDEV_STATE_FAULTED: + if (zc->zc_obj != VDEV_AUX_ERR_EXCEEDED && + zc->zc_obj != VDEV_AUX_EXTERNAL) + zc->zc_obj = VDEV_AUX_ERR_EXCEEDED; + + error = vdev_fault(spa, zc->zc_guid, zc->zc_obj); + break; + + case VDEV_STATE_DEGRADED: + if (zc->zc_obj != VDEV_AUX_ERR_EXCEEDED && + zc->zc_obj != VDEV_AUX_EXTERNAL) + zc->zc_obj = VDEV_AUX_ERR_EXCEEDED; + + error = vdev_degrade(spa, zc->zc_guid, zc->zc_obj); + break; + + default: + error = SET_ERROR(EINVAL); + } + zc->zc_cookie = newstate; + spa_close(spa, FTAG); + return (error); +} + +static int +zfs_ioc_vdev_attach(zfs_cmd_t *zc) +{ + spa_t *spa; + int replacing = zc->zc_cookie; + nvlist_t *config; + int error; + + if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) + return (error); + + if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, + zc->zc_iflags, &config)) == 0) { + error = spa_vdev_attach(spa, zc->zc_guid, config, replacing); + nvlist_free(config); + } + + spa_close(spa, FTAG); + return (error); +} + +static int +zfs_ioc_vdev_detach(zfs_cmd_t *zc) +{ + spa_t *spa; + int error; + + if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) + return (error); + + error = spa_vdev_detach(spa, zc->zc_guid, 0, B_FALSE); + + spa_close(spa, FTAG); + return (error); +} + +static int +zfs_ioc_vdev_split(zfs_cmd_t *zc) +{ + spa_t *spa; + nvlist_t *config, *props = NULL; + int error; + boolean_t exp = !!(zc->zc_cookie & ZPOOL_EXPORT_AFTER_SPLIT); + + if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) + return (error); + + if (error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, + zc->zc_iflags, &config)) { + spa_close(spa, FTAG); + return (error); + } + + if (zc->zc_nvlist_src_size != 0 && (error = + get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + zc->zc_iflags, &props))) { + spa_close(spa, FTAG); + nvlist_free(config); + return (error); + } + + error = spa_vdev_split_mirror(spa, zc->zc_string, config, props, exp); + + spa_close(spa, FTAG); + + nvlist_free(config); + nvlist_free(props); + + return (error); +} + +static int +zfs_ioc_vdev_setpath(zfs_cmd_t *zc) +{ + spa_t *spa; + char *path = zc->zc_value; + uint64_t guid = zc->zc_guid; + int error; + + error = spa_open(zc->zc_name, &spa, FTAG); + if (error != 0) + return (error); + + error = spa_vdev_setpath(spa, guid, path); + spa_close(spa, FTAG); + return (error); +} + +static int 
+zfs_ioc_vdev_setfru(zfs_cmd_t *zc)
+{
+	spa_t *spa;
+	char *fru = zc->zc_value;
+	uint64_t guid = zc->zc_guid;
+	int error;
+
+	error = spa_open(zc->zc_name, &spa, FTAG);
+	if (error != 0)
+		return (error);
+
+	error = spa_vdev_setfru(spa, guid, fru);
+	spa_close(spa, FTAG);
+	return (error);
+}
+
+static int
+zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os)
+{
+	int error = 0;
+	nvlist_t *nv;
+
+	dmu_objset_fast_stat(os, &zc->zc_objset_stats);
+
+	if (zc->zc_nvlist_dst != 0 &&
+	    (error = dsl_prop_get_all(os, &nv)) == 0) {
+		dmu_objset_stats(os, nv);
+		/*
+		 * NB: zvol_get_stats() will read the objset contents,
+		 * which we aren't supposed to do with a
+		 * DS_MODE_USER hold, because it could be
+		 * inconsistent. So this is a bit of a workaround...
+		 * XXX reading without owning
+		 */
+		if (!zc->zc_objset_stats.dds_inconsistent &&
+		    dmu_objset_type(os) == DMU_OST_ZVOL) {
+			error = zvol_get_stats(os, nv);
+			if (error == EIO)
+				return (error);
+			VERIFY0(error);
+		}
+		error = put_nvlist(zc, nv);
+		nvlist_free(nv);
+	}
+
+	return (error);
+}
+
+/*
+ * inputs:
+ * zc_name		name of filesystem
+ * zc_nvlist_dst_size	size of buffer for property nvlist
+ *
+ * outputs:
+ * zc_objset_stats	stats
+ * zc_nvlist_dst	property nvlist
+ * zc_nvlist_dst_size	size of property nvlist
+ */
+static int
+zfs_ioc_objset_stats(zfs_cmd_t *zc)
+{
+	objset_t *os;
+	int error;
+
+	error = dmu_objset_hold(zc->zc_name, FTAG, &os);
+	if (error == 0) {
+		error = zfs_ioc_objset_stats_impl(zc, os);
+		dmu_objset_rele(os, FTAG);
+	}
+
+	if (error == ENOMEM)
+		error = 0;
+	return (error);
+}
+
+/*
+ * inputs:
+ * zc_name		name of filesystem
+ * zc_nvlist_dst_size	size of buffer for property nvlist
+ *
+ * outputs:
+ * zc_nvlist_dst	received property nvlist
+ * zc_nvlist_dst_size	size of received property nvlist
+ *
+ * Gets received properties (distinct from local properties on or after
+ * SPA_VERSION_RECVD_PROPS) for callers who want to differentiate received from
+ * local property values.
+ */
+static int
+zfs_ioc_objset_recvd_props(zfs_cmd_t *zc)
+{
+	int error = 0;
+	nvlist_t *nv;
+
+	/*
+	 * Without this check, we would return local property values if the
+	 * caller has not already received properties on or after
+	 * SPA_VERSION_RECVD_PROPS.
+	 */
+	if (!dsl_prop_get_hasrecvd(zc->zc_name))
+		return (SET_ERROR(ENOTSUP));
+
+	if (zc->zc_nvlist_dst != 0 &&
+	    (error = dsl_prop_get_received(zc->zc_name, &nv)) == 0) {
+		error = put_nvlist(zc, nv);
+		nvlist_free(nv);
+	}
+
+	return (error);
+}
+
+static int
+nvl_add_zplprop(objset_t *os, nvlist_t *props, zfs_prop_t prop)
+{
+	uint64_t value;
+	int error;
+
+	/*
+	 * zfs_get_zplprop() will either find a value or give us
+	 * the default value (if there is one).
+	 */
+	if ((error = zfs_get_zplprop(os, prop, &value)) != 0)
+		return (error);
+	VERIFY(nvlist_add_uint64(props, zfs_prop_to_name(prop), value) == 0);
+	return (0);
+}
+
+/*
+ * inputs:
+ * zc_name		name of filesystem
+ * zc_nvlist_dst_size	size of buffer for zpl property nvlist
+ *
+ * outputs:
+ * zc_nvlist_dst	zpl property nvlist
+ * zc_nvlist_dst_size	size of zpl property nvlist
+ */
+static int
+zfs_ioc_objset_zplprops(zfs_cmd_t *zc)
+{
+	objset_t *os;
+	int err;
+
+	/* XXX reading without owning */
+	if (err = dmu_objset_hold(zc->zc_name, FTAG, &os))
+		return (err);
+
+	dmu_objset_fast_stat(os, &zc->zc_objset_stats);
+
+	/*
+	 * NB: nvl_add_zplprop() will read the objset contents,
+	 * which we aren't supposed to do with a DS_MODE_USER
+	 * hold, because it could be inconsistent.
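+	 * That is why the code below only walks the ZPL properties
+	 * when dds_inconsistent is clear and the objset really is a
+	 * DMU_OST_ZFS filesystem.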
+ */ + if (zc->zc_nvlist_dst != 0 && + !zc->zc_objset_stats.dds_inconsistent && + dmu_objset_type(os) == DMU_OST_ZFS) { + nvlist_t *nv; + + VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); + if ((err = nvl_add_zplprop(os, nv, ZFS_PROP_VERSION)) == 0 && + (err = nvl_add_zplprop(os, nv, ZFS_PROP_NORMALIZE)) == 0 && + (err = nvl_add_zplprop(os, nv, ZFS_PROP_UTF8ONLY)) == 0 && + (err = nvl_add_zplprop(os, nv, ZFS_PROP_CASE)) == 0) + err = put_nvlist(zc, nv); + nvlist_free(nv); + } else { + err = SET_ERROR(ENOENT); + } + dmu_objset_rele(os, FTAG); + return (err); +} + +boolean_t +dataset_name_hidden(const char *name) +{ + /* + * Skip over datasets that are not visible in this zone, + * internal datasets (which have a $ in their name), and + * temporary datasets (which have a % in their name). + */ + if (strchr(name, '$') != NULL) + return (B_TRUE); + if (strchr(name, '%') != NULL) + return (B_TRUE); + if (!INGLOBALZONE(curthread) && !zone_dataset_visible(name, NULL)) + return (B_TRUE); + return (B_FALSE); +} + +/* + * inputs: + * zc_name name of filesystem + * zc_cookie zap cursor + * zc_nvlist_dst_size size of buffer for property nvlist + * + * outputs: + * zc_name name of next filesystem + * zc_cookie zap cursor + * zc_objset_stats stats + * zc_nvlist_dst property nvlist + * zc_nvlist_dst_size size of property nvlist + */ +static int +zfs_ioc_dataset_list_next(zfs_cmd_t *zc) +{ + objset_t *os; + int error; + char *p; + size_t orig_len = strlen(zc->zc_name); + +top: + if (error = dmu_objset_hold(zc->zc_name, FTAG, &os)) { + if (error == ENOENT) + error = SET_ERROR(ESRCH); + return (error); + } + + p = strrchr(zc->zc_name, '/'); + if (p == NULL || p[1] != '\0') + (void) strlcat(zc->zc_name, "/", sizeof (zc->zc_name)); + p = zc->zc_name + strlen(zc->zc_name); + + do { + error = dmu_dir_list_next(os, + sizeof (zc->zc_name) - (p - zc->zc_name), p, + NULL, &zc->zc_cookie); + if (error == ENOENT) + error = SET_ERROR(ESRCH); + } while (error == 0 && dataset_name_hidden(zc->zc_name)); + dmu_objset_rele(os, FTAG); + + /* + * If it's an internal dataset (ie. with a '$' in its name), + * don't try to get stats for it, otherwise we'll return ENOENT. + */ + if (error == 0 && strchr(zc->zc_name, '$') == NULL) { + error = zfs_ioc_objset_stats(zc); /* fill in the stats */ + if (error == ENOENT) { + /* We lost a race with destroy, get the next one. */ + zc->zc_name[orig_len] = '\0'; + goto top; + } + } + return (error); +} + +/* + * inputs: + * zc_name name of filesystem + * zc_cookie zap cursor + * zc_nvlist_dst_size size of buffer for property nvlist + * zc_simple when set, only name is requested + * + * outputs: + * zc_name name of next snapshot + * zc_objset_stats stats + * zc_nvlist_dst property nvlist + * zc_nvlist_dst_size size of property nvlist + */ +static int +zfs_ioc_snapshot_list_next(zfs_cmd_t *zc) +{ + objset_t *os; + int error; + + error = dmu_objset_hold(zc->zc_name, FTAG, &os); + if (error != 0) { + return (error == ENOENT ? ESRCH : error); + } + + /* + * A dataset name of maximum length cannot have any snapshots, + * so exit immediately. 
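+	 * (Once strlcat() has appended the '@' separator, a result that
+	 * already reaches ZFS_MAX_DATASET_NAME_LEN leaves no room for a
+	 * snapshot component.)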
+ */ + if (strlcat(zc->zc_name, "@", sizeof (zc->zc_name)) >= + ZFS_MAX_DATASET_NAME_LEN) { + dmu_objset_rele(os, FTAG); + return (SET_ERROR(ESRCH)); + } + + error = dmu_snapshot_list_next(os, + sizeof (zc->zc_name) - strlen(zc->zc_name), + zc->zc_name + strlen(zc->zc_name), &zc->zc_obj, &zc->zc_cookie, + NULL); + + if (error == 0 && !zc->zc_simple) { + dsl_dataset_t *ds; + dsl_pool_t *dp = os->os_dsl_dataset->ds_dir->dd_pool; + + error = dsl_dataset_hold_obj(dp, zc->zc_obj, FTAG, &ds); + if (error == 0) { + objset_t *ossnap; + + error = dmu_objset_from_ds(ds, &ossnap); + if (error == 0) + error = zfs_ioc_objset_stats_impl(zc, ossnap); + dsl_dataset_rele(ds, FTAG); + } + } else if (error == ENOENT) { + error = SET_ERROR(ESRCH); + } + + dmu_objset_rele(os, FTAG); + /* if we failed, undo the @ that we tacked on to zc_name */ + if (error != 0) + *strchr(zc->zc_name, '@') = '\0'; + return (error); +} + +static int +zfs_prop_set_userquota(const char *dsname, nvpair_t *pair) +{ + const char *propname = nvpair_name(pair); + uint64_t *valary; + unsigned int vallen; + const char *domain; + char *dash; + zfs_userquota_prop_t type; + uint64_t rid; + uint64_t quota; + zfsvfs_t *zfsvfs; + int err; + + if (nvpair_type(pair) == DATA_TYPE_NVLIST) { + nvlist_t *attrs; + VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); + if (nvlist_lookup_nvpair(attrs, ZPROP_VALUE, + &pair) != 0) + return (SET_ERROR(EINVAL)); + } + + /* + * A correctly constructed propname is encoded as + * userquota@<rid>-<domain>. + */ + if ((dash = strchr(propname, '-')) == NULL || + nvpair_value_uint64_array(pair, &valary, &vallen) != 0 || + vallen != 3) + return (SET_ERROR(EINVAL)); + + domain = dash + 1; + type = valary[0]; + rid = valary[1]; + quota = valary[2]; + + err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_FALSE); + if (err == 0) { + err = zfs_set_userquota(zfsvfs, type, domain, rid, quota); + zfsvfs_rele(zfsvfs, FTAG); + } + + return (err); +} + +/* + * If the named property is one that has a special function to set its value, + * return 0 on success and a positive error code on failure; otherwise if it is + * not one of the special properties handled by this function, return -1. + * + * XXX: It would be better for callers of the property interface if we handled + * these special cases in dsl_prop.c (in the dsl layer). + */ +static int +zfs_prop_set_special(const char *dsname, zprop_source_t source, + nvpair_t *pair) +{ + const char *propname = nvpair_name(pair); + zfs_prop_t prop = zfs_name_to_prop(propname); + uint64_t intval; + int err = -1; + + if (prop == ZPROP_INVAL) { + if (zfs_prop_userquota(propname)) + return (zfs_prop_set_userquota(dsname, pair)); + return (-1); + } + + if (nvpair_type(pair) == DATA_TYPE_NVLIST) { + nvlist_t *attrs; + VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); + VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, + &pair) == 0); + } + + if (zfs_prop_get_type(prop) == PROP_TYPE_STRING) + return (-1); + + VERIFY(0 == nvpair_value_uint64(pair, &intval)); + + switch (prop) { + case ZFS_PROP_QUOTA: + err = dsl_dir_set_quota(dsname, source, intval); + break; + case ZFS_PROP_REFQUOTA: + err = dsl_dataset_set_refquota(dsname, source, intval); + break; + case ZFS_PROP_FILESYSTEM_LIMIT: + case ZFS_PROP_SNAPSHOT_LIMIT: + if (intval == UINT64_MAX) { + /* clearing the limit, just do it */ + err = 0; + } else { + err = dsl_dir_activate_fs_ss_limit(dsname); + } + /* + * Set err to -1 to force the zfs_set_prop_nvlist code down the + * default path to set the value in the nvlist. 
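+		 * (A return of -1 means "not a special property" to the
+		 * caller, which then stores the limit as an ordinary dsl
+		 * property.)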
+ */ + if (err == 0) + err = -1; + break; + case ZFS_PROP_RESERVATION: + err = dsl_dir_set_reservation(dsname, source, intval); + break; + case ZFS_PROP_REFRESERVATION: + err = dsl_dataset_set_refreservation(dsname, source, intval); + break; + case ZFS_PROP_VOLSIZE: + err = zvol_set_volsize(dsname, intval); + break; + case ZFS_PROP_VERSION: + { + zfsvfs_t *zfsvfs; + + if ((err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_TRUE)) != 0) + break; + + err = zfs_set_version(zfsvfs, intval); + zfsvfs_rele(zfsvfs, FTAG); + + if (err == 0 && intval >= ZPL_VERSION_USERSPACE) { + zfs_cmd_t *zc; + + zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP); + (void) strcpy(zc->zc_name, dsname); + (void) zfs_ioc_userspace_upgrade(zc); + kmem_free(zc, sizeof (zfs_cmd_t)); + } + break; + } + default: + err = -1; + } + + return (err); +} + +/* + * This function is best effort. If it fails to set any of the given properties, + * it continues to set as many as it can and returns the last error + * encountered. If the caller provides a non-NULL errlist, it will be filled in + * with the list of names of all the properties that failed along with the + * corresponding error numbers. + * + * If every property is set successfully, zero is returned and errlist is not + * modified. + */ +int +zfs_set_prop_nvlist(const char *dsname, zprop_source_t source, nvlist_t *nvl, + nvlist_t *errlist) +{ + nvpair_t *pair; + nvpair_t *propval; + int rv = 0; + uint64_t intval; + char *strval; + nvlist_t *genericnvl = fnvlist_alloc(); + nvlist_t *retrynvl = fnvlist_alloc(); + +retry: + pair = NULL; + while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) { + const char *propname = nvpair_name(pair); + zfs_prop_t prop = zfs_name_to_prop(propname); + int err = 0; + + /* decode the property value */ + propval = pair; + if (nvpair_type(pair) == DATA_TYPE_NVLIST) { + nvlist_t *attrs; + attrs = fnvpair_value_nvlist(pair); + if (nvlist_lookup_nvpair(attrs, ZPROP_VALUE, + &propval) != 0) + err = SET_ERROR(EINVAL); + } + + /* Validate value type */ + if (err == 0 && prop == ZPROP_INVAL) { + if (zfs_prop_user(propname)) { + if (nvpair_type(propval) != DATA_TYPE_STRING) + err = SET_ERROR(EINVAL); + } else if (zfs_prop_userquota(propname)) { + if (nvpair_type(propval) != + DATA_TYPE_UINT64_ARRAY) + err = SET_ERROR(EINVAL); + } else { + err = SET_ERROR(EINVAL); + } + } else if (err == 0) { + if (nvpair_type(propval) == DATA_TYPE_STRING) { + if (zfs_prop_get_type(prop) != PROP_TYPE_STRING) + err = SET_ERROR(EINVAL); + } else if (nvpair_type(propval) == DATA_TYPE_UINT64) { + const char *unused; + + intval = fnvpair_value_uint64(propval); + + switch (zfs_prop_get_type(prop)) { + case PROP_TYPE_NUMBER: + break; + case PROP_TYPE_STRING: + err = SET_ERROR(EINVAL); + break; + case PROP_TYPE_INDEX: + if (zfs_prop_index_to_string(prop, + intval, &unused) != 0) + err = SET_ERROR(EINVAL); + break; + default: + cmn_err(CE_PANIC, + "unknown property type"); + } + } else { + err = SET_ERROR(EINVAL); + } + } + + /* Validate permissions */ + if (err == 0) + err = zfs_check_settable(dsname, pair, CRED()); + + if (err == 0) { + err = zfs_prop_set_special(dsname, source, pair); + if (err == -1) { + /* + * For better performance we build up a list of + * properties to set in a single transaction. + */ + err = nvlist_add_nvpair(genericnvl, pair); + } else if (err != 0 && nvl != retrynvl) { + /* + * This may be a spurious error caused by + * receiving quota and reservation out of order. + * Try again in a second pass. 
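+				 * For example, when quota and
+				 * reservation are both being lowered,
+				 * the new quota can be rejected while
+				 * the old, larger reservation is still
+				 * in effect; it will succeed once the
+				 * reservation has been applied.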
+ */ + err = nvlist_add_nvpair(retrynvl, pair); + } + } + + if (err != 0) { + if (errlist != NULL) + fnvlist_add_int32(errlist, propname, err); + rv = err; + } + } + + if (nvl != retrynvl && !nvlist_empty(retrynvl)) { + nvl = retrynvl; + goto retry; + } + + if (!nvlist_empty(genericnvl) && + dsl_props_set(dsname, source, genericnvl) != 0) { + /* + * If this fails, we still want to set as many properties as we + * can, so try setting them individually. + */ + pair = NULL; + while ((pair = nvlist_next_nvpair(genericnvl, pair)) != NULL) { + const char *propname = nvpair_name(pair); + int err = 0; + + propval = pair; + if (nvpair_type(pair) == DATA_TYPE_NVLIST) { + nvlist_t *attrs; + attrs = fnvpair_value_nvlist(pair); + propval = fnvlist_lookup_nvpair(attrs, + ZPROP_VALUE); + } + + if (nvpair_type(propval) == DATA_TYPE_STRING) { + strval = fnvpair_value_string(propval); + err = dsl_prop_set_string(dsname, propname, + source, strval); + } else { + intval = fnvpair_value_uint64(propval); + err = dsl_prop_set_int(dsname, propname, source, + intval); + } + + if (err != 0) { + if (errlist != NULL) { + fnvlist_add_int32(errlist, propname, + err); + } + rv = err; + } + } + } + nvlist_free(genericnvl); + nvlist_free(retrynvl); + + return (rv); +} + +/* + * Check that all the properties are valid user properties. + */ +static int +zfs_check_userprops(const char *fsname, nvlist_t *nvl) +{ + nvpair_t *pair = NULL; + int error = 0; + + while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) { + const char *propname = nvpair_name(pair); + + if (!zfs_prop_user(propname) || + nvpair_type(pair) != DATA_TYPE_STRING) + return (SET_ERROR(EINVAL)); + + if (error = zfs_secpolicy_write_perms(fsname, + ZFS_DELEG_PERM_USERPROP, CRED())) + return (error); + + if (strlen(propname) >= ZAP_MAXNAMELEN) + return (SET_ERROR(ENAMETOOLONG)); + + if (strlen(fnvpair_value_string(pair)) >= ZAP_MAXVALUELEN) + return (E2BIG); + } + return (0); +} + +static void +props_skip(nvlist_t *props, nvlist_t *skipped, nvlist_t **newprops) +{ + nvpair_t *pair; + + VERIFY(nvlist_alloc(newprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + pair = NULL; + while ((pair = nvlist_next_nvpair(props, pair)) != NULL) { + if (nvlist_exists(skipped, nvpair_name(pair))) + continue; + + VERIFY(nvlist_add_nvpair(*newprops, pair) == 0); + } +} + +static int +clear_received_props(const char *dsname, nvlist_t *props, + nvlist_t *skipped) +{ + int err = 0; + nvlist_t *cleared_props = NULL; + props_skip(props, skipped, &cleared_props); + if (!nvlist_empty(cleared_props)) { + /* + * Acts on local properties until the dataset has received + * properties at least once on or after SPA_VERSION_RECVD_PROPS. + */ + zprop_source_t flags = (ZPROP_SRC_NONE | + (dsl_prop_get_hasrecvd(dsname) ? ZPROP_SRC_RECEIVED : 0)); + err = zfs_set_prop_nvlist(dsname, flags, cleared_props, NULL); + } + nvlist_free(cleared_props); + return (err); +} + +/* + * inputs: + * zc_name name of filesystem + * zc_value name of property to set + * zc_nvlist_src{_size} nvlist of properties to apply + * zc_cookie received properties flag + * + * outputs: + * zc_nvlist_dst{_size} error for each unapplied received property + */ +static int +zfs_ioc_set_prop(zfs_cmd_t *zc) +{ + nvlist_t *nvl; + boolean_t received = zc->zc_cookie; + zprop_source_t source = (received ? 
ZPROP_SRC_RECEIVED : + ZPROP_SRC_LOCAL); + nvlist_t *errors; + int error; + + if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + zc->zc_iflags, &nvl)) != 0) + return (error); + + if (received) { + nvlist_t *origprops; + + if (dsl_prop_get_received(zc->zc_name, &origprops) == 0) { + (void) clear_received_props(zc->zc_name, + origprops, nvl); + nvlist_free(origprops); + } + + error = dsl_prop_set_hasrecvd(zc->zc_name); + } + + errors = fnvlist_alloc(); + if (error == 0) + error = zfs_set_prop_nvlist(zc->zc_name, source, nvl, errors); + + if (zc->zc_nvlist_dst != 0 && errors != NULL) { + (void) put_nvlist(zc, errors); + } + + nvlist_free(errors); + nvlist_free(nvl); + return (error); +} + +/* + * inputs: + * zc_name name of filesystem + * zc_value name of property to inherit + * zc_cookie revert to received value if TRUE + * + * outputs: none + */ +static int +zfs_ioc_inherit_prop(zfs_cmd_t *zc) +{ + const char *propname = zc->zc_value; + zfs_prop_t prop = zfs_name_to_prop(propname); + boolean_t received = zc->zc_cookie; + zprop_source_t source = (received + ? ZPROP_SRC_NONE /* revert to received value, if any */ + : ZPROP_SRC_INHERITED); /* explicitly inherit */ + + if (received) { + nvlist_t *dummy; + nvpair_t *pair; + zprop_type_t type; + int err; + + /* + * zfs_prop_set_special() expects properties in the form of an + * nvpair with type info. + */ + if (prop == ZPROP_INVAL) { + if (!zfs_prop_user(propname)) + return (SET_ERROR(EINVAL)); + + type = PROP_TYPE_STRING; + } else if (prop == ZFS_PROP_VOLSIZE || + prop == ZFS_PROP_VERSION) { + return (SET_ERROR(EINVAL)); + } else { + type = zfs_prop_get_type(prop); + } + + VERIFY(nvlist_alloc(&dummy, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + switch (type) { + case PROP_TYPE_STRING: + VERIFY(0 == nvlist_add_string(dummy, propname, "")); + break; + case PROP_TYPE_NUMBER: + case PROP_TYPE_INDEX: + VERIFY(0 == nvlist_add_uint64(dummy, propname, 0)); + break; + default: + nvlist_free(dummy); + return (SET_ERROR(EINVAL)); + } + + pair = nvlist_next_nvpair(dummy, NULL); + err = zfs_prop_set_special(zc->zc_name, source, pair); + nvlist_free(dummy); + if (err != -1) + return (err); /* special property already handled */ + } else { + /* + * Only check this in the non-received case. We want to allow + * 'inherit -S' to revert non-inheritable properties like quota + * and reservation to the received or default values even though + * they are not considered inheritable. + */ + if (prop != ZPROP_INVAL && !zfs_prop_inheritable(prop)) + return (SET_ERROR(EINVAL)); + } + + /* property name has been validated by zfs_secpolicy_inherit_prop() */ + return (dsl_prop_inherit(zc->zc_name, zc->zc_value, source)); +} + +static int +zfs_ioc_pool_set_props(zfs_cmd_t *zc) +{ + nvlist_t *props; + spa_t *spa; + int error; + nvpair_t *pair; + + if (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + zc->zc_iflags, &props)) + return (error); + + /* + * If the only property is the configfile, then just do a spa_lookup() + * to handle the faulted case. 
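+	 * (Setting the cachefile path does not require the pool to be
+	 * open, so it can be honored even when the pool is faulted.)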
+ */ + pair = nvlist_next_nvpair(props, NULL); + if (pair != NULL && strcmp(nvpair_name(pair), + zpool_prop_to_name(ZPOOL_PROP_CACHEFILE)) == 0 && + nvlist_next_nvpair(props, pair) == NULL) { + mutex_enter(&spa_namespace_lock); + if ((spa = spa_lookup(zc->zc_name)) != NULL) { + spa_configfile_set(spa, props, B_FALSE); + spa_write_cachefile(spa, B_FALSE, B_TRUE); + } + mutex_exit(&spa_namespace_lock); + if (spa != NULL) { + nvlist_free(props); + return (0); + } + } + + if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) { + nvlist_free(props); + return (error); + } + + error = spa_prop_set(spa, props); + + nvlist_free(props); + spa_close(spa, FTAG); + + return (error); +} + +static int +zfs_ioc_pool_get_props(zfs_cmd_t *zc) +{ + spa_t *spa; + int error; + nvlist_t *nvp = NULL; + + if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) { + /* + * If the pool is faulted, there may be properties we can still + * get (such as altroot and cachefile), so attempt to get them + * anyway. + */ + mutex_enter(&spa_namespace_lock); + if ((spa = spa_lookup(zc->zc_name)) != NULL) + error = spa_prop_get(spa, &nvp); + mutex_exit(&spa_namespace_lock); + } else { + error = spa_prop_get(spa, &nvp); + spa_close(spa, FTAG); + } + + if (error == 0 && zc->zc_nvlist_dst != 0) + error = put_nvlist(zc, nvp); + else + error = SET_ERROR(EFAULT); + + nvlist_free(nvp); + return (error); +} + +/* + * inputs: + * zc_name name of filesystem + * zc_nvlist_src{_size} nvlist of delegated permissions + * zc_perm_action allow/unallow flag + * + * outputs: none + */ +static int +zfs_ioc_set_fsacl(zfs_cmd_t *zc) +{ + int error; + nvlist_t *fsaclnv = NULL; + + if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + zc->zc_iflags, &fsaclnv)) != 0) + return (error); + + /* + * Verify nvlist is constructed correctly + */ + if ((error = zfs_deleg_verify_nvlist(fsaclnv)) != 0) { + nvlist_free(fsaclnv); + return (SET_ERROR(EINVAL)); + } + + /* + * If we don't have PRIV_SYS_MOUNT, then validate + * that user is allowed to hand out each permission in + * the nvlist(s) + */ + + error = secpolicy_zfs(CRED()); + if (error != 0) { + if (zc->zc_perm_action == B_FALSE) { + error = dsl_deleg_can_allow(zc->zc_name, + fsaclnv, CRED()); + } else { + error = dsl_deleg_can_unallow(zc->zc_name, + fsaclnv, CRED()); + } + } + + if (error == 0) + error = dsl_deleg_set(zc->zc_name, fsaclnv, zc->zc_perm_action); + + nvlist_free(fsaclnv); + return (error); +} + +/* + * inputs: + * zc_name name of filesystem + * + * outputs: + * zc_nvlist_src{_size} nvlist of delegated permissions + */ +static int +zfs_ioc_get_fsacl(zfs_cmd_t *zc) +{ + nvlist_t *nvp; + int error; + + if ((error = dsl_deleg_get(zc->zc_name, &nvp)) == 0) { + error = put_nvlist(zc, nvp); + nvlist_free(nvp); + } + + return (error); +} + +/* ARGSUSED */ +static void +zfs_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) +{ + zfs_creat_t *zct = arg; + + zfs_create_fs(os, cr, zct->zct_zplprops, tx); +} + +#define ZFS_PROP_UNDEFINED ((uint64_t)-1) + +/* + * inputs: + * os parent objset pointer (NULL if root fs) + * fuids_ok fuids allowed in this version of the spa? + * sa_ok SAs allowed in this version of the spa? + * createprops list of properties requested by creator + * + * outputs: + * zplprops values for the zplprops we attach to the master node object + * is_ci true if requested file system will be purely case-insensitive + * + * Determine the settings for utf8only, normalization and + * casesensitivity. 
Specific values may have been requested by the
+ * creator and/or we can inherit values from the parent dataset.  If
+ * the file system is of too early a vintage, a creator cannot
+ * request settings for these properties, even if the requested
+ * setting is the default value.  We don't actually want to create dsl
+ * properties for these, so remove them from the source nvlist after
+ * processing.
+ */
+static int
+zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver,
+    boolean_t fuids_ok, boolean_t sa_ok, nvlist_t *createprops,
+    nvlist_t *zplprops, boolean_t *is_ci)
+{
+	uint64_t sense = ZFS_PROP_UNDEFINED;
+	uint64_t norm = ZFS_PROP_UNDEFINED;
+	uint64_t u8 = ZFS_PROP_UNDEFINED;
+
+	ASSERT(zplprops != NULL);
+
+	if (os != NULL && os->os_phys->os_type != DMU_OST_ZFS)
+		return (SET_ERROR(EINVAL));
+
+	/*
+	 * Pull out creator prop choices, if any.
+	 */
+	if (createprops) {
+		(void) nvlist_lookup_uint64(createprops,
+		    zfs_prop_to_name(ZFS_PROP_VERSION), &zplver);
+		(void) nvlist_lookup_uint64(createprops,
+		    zfs_prop_to_name(ZFS_PROP_NORMALIZE), &norm);
+		(void) nvlist_remove_all(createprops,
+		    zfs_prop_to_name(ZFS_PROP_NORMALIZE));
+		(void) nvlist_lookup_uint64(createprops,
+		    zfs_prop_to_name(ZFS_PROP_UTF8ONLY), &u8);
+		(void) nvlist_remove_all(createprops,
+		    zfs_prop_to_name(ZFS_PROP_UTF8ONLY));
+		(void) nvlist_lookup_uint64(createprops,
+		    zfs_prop_to_name(ZFS_PROP_CASE), &sense);
+		(void) nvlist_remove_all(createprops,
+		    zfs_prop_to_name(ZFS_PROP_CASE));
+	}
+
+	/*
+	 * If the zpl version requested is whacky or the file system
+	 * or pool version is too "young" to support normalization
+	 * and the creator tried to set a value for one of the props,
+	 * error out.
+	 */
+	if ((zplver < ZPL_VERSION_INITIAL || zplver > ZPL_VERSION) ||
+	    (zplver >= ZPL_VERSION_FUID && !fuids_ok) ||
+	    (zplver >= ZPL_VERSION_SA && !sa_ok) ||
+	    (zplver < ZPL_VERSION_NORMALIZATION &&
+	    (norm != ZFS_PROP_UNDEFINED || u8 != ZFS_PROP_UNDEFINED ||
+	    sense != ZFS_PROP_UNDEFINED)))
+		return (SET_ERROR(ENOTSUP));
+
+	/*
+	 * Put the version in the zplprops
+	 */
+	VERIFY(nvlist_add_uint64(zplprops,
+	    zfs_prop_to_name(ZFS_PROP_VERSION), zplver) == 0);
+
+	if (norm == ZFS_PROP_UNDEFINED)
+		VERIFY(zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &norm) == 0);
+	VERIFY(nvlist_add_uint64(zplprops,
+	    zfs_prop_to_name(ZFS_PROP_NORMALIZE), norm) == 0);
+
+	/*
+	 * If we're normalizing, names must always be valid UTF-8 strings.
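+	 * (Unicode normalization is only defined over valid UTF-8, which
+	 * is why a nonzero norm setting forces utf8only on here.)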
+ */ + if (norm) + u8 = 1; + if (u8 == ZFS_PROP_UNDEFINED) + VERIFY(zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &u8) == 0); + VERIFY(nvlist_add_uint64(zplprops, + zfs_prop_to_name(ZFS_PROP_UTF8ONLY), u8) == 0); + + if (sense == ZFS_PROP_UNDEFINED) + VERIFY(zfs_get_zplprop(os, ZFS_PROP_CASE, &sense) == 0); + VERIFY(nvlist_add_uint64(zplprops, + zfs_prop_to_name(ZFS_PROP_CASE), sense) == 0); + + if (is_ci) + *is_ci = (sense == ZFS_CASE_INSENSITIVE); + + return (0); +} + +static int +zfs_fill_zplprops(const char *dataset, nvlist_t *createprops, + nvlist_t *zplprops, boolean_t *is_ci) +{ + boolean_t fuids_ok, sa_ok; + uint64_t zplver = ZPL_VERSION; + objset_t *os = NULL; + char parentname[ZFS_MAX_DATASET_NAME_LEN]; + char *cp; + spa_t *spa; + uint64_t spa_vers; + int error; + + (void) strlcpy(parentname, dataset, sizeof (parentname)); + cp = strrchr(parentname, '/'); + ASSERT(cp != NULL); + cp[0] = '\0'; + + if ((error = spa_open(dataset, &spa, FTAG)) != 0) + return (error); + + spa_vers = spa_version(spa); + spa_close(spa, FTAG); + + zplver = zfs_zpl_version_map(spa_vers); + fuids_ok = (zplver >= ZPL_VERSION_FUID); + sa_ok = (zplver >= ZPL_VERSION_SA); + + /* + * Open parent object set so we can inherit zplprop values. + */ + if ((error = dmu_objset_hold(parentname, FTAG, &os)) != 0) + return (error); + + error = zfs_fill_zplprops_impl(os, zplver, fuids_ok, sa_ok, createprops, + zplprops, is_ci); + dmu_objset_rele(os, FTAG); + return (error); +} + +static int +zfs_fill_zplprops_root(uint64_t spa_vers, nvlist_t *createprops, + nvlist_t *zplprops, boolean_t *is_ci) +{ + boolean_t fuids_ok; + boolean_t sa_ok; + uint64_t zplver = ZPL_VERSION; + int error; + + zplver = zfs_zpl_version_map(spa_vers); + fuids_ok = (zplver >= ZPL_VERSION_FUID); + sa_ok = (zplver >= ZPL_VERSION_SA); + + error = zfs_fill_zplprops_impl(NULL, zplver, fuids_ok, sa_ok, + createprops, zplprops, is_ci); + return (error); +} + +/* + * innvl: { + * "type" -> dmu_objset_type_t (int32) + * (optional) "props" -> { prop -> value } + * } + * + * outnvl: propname -> error code (int32) + */ +static int +zfs_ioc_create(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) +{ + int error = 0; + zfs_creat_t zct = { 0 }; + nvlist_t *nvprops = NULL; + void (*cbfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); + int32_t type32; + dmu_objset_type_t type; + boolean_t is_insensitive = B_FALSE; + + if (nvlist_lookup_int32(innvl, "type", &type32) != 0) + return (SET_ERROR(EINVAL)); + type = type32; + (void) nvlist_lookup_nvlist(innvl, "props", &nvprops); + + switch (type) { + case DMU_OST_ZFS: + cbfunc = zfs_create_cb; + break; + + case DMU_OST_ZVOL: + cbfunc = zvol_create_cb; + break; + + default: + cbfunc = NULL; + break; + } + if (strchr(fsname, '@') || + strchr(fsname, '%')) + return (SET_ERROR(EINVAL)); + + zct.zct_props = nvprops; + + if (cbfunc == NULL) + return (SET_ERROR(EINVAL)); + + if (type == DMU_OST_ZVOL) { + uint64_t volsize, volblocksize; + + if (nvprops == NULL) + return (SET_ERROR(EINVAL)); + if (nvlist_lookup_uint64(nvprops, + zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) != 0) + return (SET_ERROR(EINVAL)); + + if ((error = nvlist_lookup_uint64(nvprops, + zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), + &volblocksize)) != 0 && error != ENOENT) + return (SET_ERROR(EINVAL)); + + if (error != 0) + volblocksize = zfs_prop_default_numeric( + ZFS_PROP_VOLBLOCKSIZE); + + if ((error = zvol_check_volblocksize( + volblocksize)) != 0 || + (error = zvol_check_volsize(volsize, + volblocksize)) != 0) + return (error); + } else if (type == 
DMU_OST_ZFS) { + int error; + + /* + * We have to have normalization and + * case-folding flags correct when we do the + * file system creation, so go figure them out + * now. + */ + VERIFY(nvlist_alloc(&zct.zct_zplprops, + NV_UNIQUE_NAME, KM_SLEEP) == 0); + error = zfs_fill_zplprops(fsname, nvprops, + zct.zct_zplprops, &is_insensitive); + if (error != 0) { + nvlist_free(zct.zct_zplprops); + return (error); + } + } + + error = dmu_objset_create(fsname, type, + is_insensitive ? DS_FLAG_CI_DATASET : 0, cbfunc, &zct); + nvlist_free(zct.zct_zplprops); + + /* + * It would be nice to do this atomically. + */ + if (error == 0) { + error = zfs_set_prop_nvlist(fsname, ZPROP_SRC_LOCAL, + nvprops, outnvl); + if (error != 0) + (void) dsl_destroy_head(fsname); + } +#ifdef __FreeBSD__ + if (error == 0 && type == DMU_OST_ZVOL) + zvol_create_minors(fsname); +#endif + return (error); +} + +/* + * innvl: { + * "origin" -> name of origin snapshot + * (optional) "props" -> { prop -> value } + * } + * + * outnvl: propname -> error code (int32) + */ +static int +zfs_ioc_clone(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) +{ + int error = 0; + nvlist_t *nvprops = NULL; + char *origin_name; + + if (nvlist_lookup_string(innvl, "origin", &origin_name) != 0) + return (SET_ERROR(EINVAL)); + (void) nvlist_lookup_nvlist(innvl, "props", &nvprops); + + if (strchr(fsname, '@') || + strchr(fsname, '%')) + return (SET_ERROR(EINVAL)); + + if (dataset_namecheck(origin_name, NULL, NULL) != 0) + return (SET_ERROR(EINVAL)); + error = dmu_objset_clone(fsname, origin_name); + if (error != 0) + return (error); + + /* + * It would be nice to do this atomically. + */ + if (error == 0) { + error = zfs_set_prop_nvlist(fsname, ZPROP_SRC_LOCAL, + nvprops, outnvl); + if (error != 0) + (void) dsl_destroy_head(fsname); + } +#ifdef __FreeBSD__ + if (error == 0) + zvol_create_minors(fsname); +#endif + return (error); +} + +/* ARGSUSED */ +static int +zfs_ioc_remap(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) +{ + if (strchr(fsname, '@') || + strchr(fsname, '%')) + return (SET_ERROR(EINVAL)); + + return (dmu_objset_remap_indirects(fsname)); +} + +/* + * innvl: { + * "snaps" -> { snapshot1, snapshot2 } + * (optional) "props" -> { prop -> value (string) } + * } + * + * outnvl: snapshot -> error code (int32) + */ +static int +zfs_ioc_snapshot(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) +{ + nvlist_t *snaps; + nvlist_t *props = NULL; + int error, poollen; + nvpair_t *pair; + + (void) nvlist_lookup_nvlist(innvl, "props", &props); + if ((error = zfs_check_userprops(poolname, props)) != 0) + return (error); + + if (!nvlist_empty(props) && + zfs_earlier_version(poolname, SPA_VERSION_SNAP_PROPS)) + return (SET_ERROR(ENOTSUP)); + + if (nvlist_lookup_nvlist(innvl, "snaps", &snaps) != 0) + return (SET_ERROR(EINVAL)); + poollen = strlen(poolname); + for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; + pair = nvlist_next_nvpair(snaps, pair)) { + const char *name = nvpair_name(pair); + const char *cp = strchr(name, '@'); + + /* + * The snap name must contain an @, and the part after it must + * contain only valid characters. + */ + if (cp == NULL || + zfs_component_namecheck(cp + 1, NULL, NULL) != 0) + return (SET_ERROR(EINVAL)); + + /* + * The snap must be in the specified pool. + */ + if (strncmp(name, poolname, poollen) != 0 || + (name[poollen] != '/' && name[poollen] != '@')) + return (SET_ERROR(EXDEV)); + + /* This must be the only snap of this fs. 
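+		 * The prefix comparison below covers the filesystem name
+		 * plus the trailing '@' (cp - name + 1 characters), so a
+		 * match means a second snapshot of the same filesystem.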
+		 */
+		for (nvpair_t *pair2 = nvlist_next_nvpair(snaps, pair);
+		    pair2 != NULL; pair2 = nvlist_next_nvpair(snaps, pair2)) {
+			if (strncmp(name, nvpair_name(pair2), cp - name + 1)
+			    == 0) {
+				return (SET_ERROR(EXDEV));
+			}
+		}
+	}
+
+	error = dsl_dataset_snapshot(snaps, props, outnvl);
+	return (error);
+}
+
+/*
+ * innvl: "message" -> string
+ */
+/* ARGSUSED */
+static int
+zfs_ioc_log_history(const char *unused, nvlist_t *innvl, nvlist_t *outnvl)
+{
+	char *message;
+	spa_t *spa;
+	int error;
+	char *poolname;
+
+	/*
+	 * The poolname in the ioctl is not set, we get it from the TSD,
+	 * which was set at the end of the last successful ioctl that allows
+	 * logging.  The secpolicy func already checked that it is set.
+	 * Only one log ioctl is allowed after each successful ioctl, so
+	 * we clear the TSD here.
+	 */
+	poolname = tsd_get(zfs_allow_log_key);
+	(void) tsd_set(zfs_allow_log_key, NULL);
+	error = spa_open(poolname, &spa, FTAG);
+	strfree(poolname);
+	if (error != 0)
+		return (error);
+
+	if (nvlist_lookup_string(innvl, "message", &message) != 0) {
+		spa_close(spa, FTAG);
+		return (SET_ERROR(EINVAL));
+	}
+
+	if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) {
+		spa_close(spa, FTAG);
+		return (SET_ERROR(ENOTSUP));
+	}
+
+	error = spa_history_log(spa, message);
+	spa_close(spa, FTAG);
+	return (error);
+}
+
+#ifdef __FreeBSD__
+static int
+zfs_ioc_nextboot(const char *unused, nvlist_t *innvl, nvlist_t *outnvl)
+{
+	char name[MAXNAMELEN];
+	spa_t *spa;
+	vdev_t *vd;
+	char *command;
+	uint64_t pool_guid;
+	uint64_t vdev_guid;
+	int error;
+
+	if (nvlist_lookup_uint64(innvl,
+	    ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0)
+		return (EINVAL);
+	if (nvlist_lookup_uint64(innvl,
+	    ZPOOL_CONFIG_GUID, &vdev_guid) != 0)
+		return (EINVAL);
+	if (nvlist_lookup_string(innvl,
+	    "command", &command) != 0)
+		return (EINVAL);
+
+	mutex_enter(&spa_namespace_lock);
+	spa = spa_by_guid(pool_guid, vdev_guid);
+	if (spa != NULL)
+		strcpy(name, spa_name(spa));
+	mutex_exit(&spa_namespace_lock);
+	if (spa == NULL)
+		return (ENOENT);
+
+	if ((error = spa_open(name, &spa, FTAG)) != 0)
+		return (error);
+	spa_vdev_state_enter(spa, SCL_ALL);
+	vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE);
+	if (vd == NULL) {
+		(void) spa_vdev_state_exit(spa, NULL, ENXIO);
+		spa_close(spa, FTAG);
+		return (ENODEV);
+	}
+	error = vdev_label_write_pad2(vd, command, strlen(command));
+	(void) spa_vdev_state_exit(spa, NULL, 0);
+	txg_wait_synced(spa->spa_dsl_pool, 0);
+	spa_close(spa, FTAG);
+	return (error);
+}
+#endif
+
+/*
+ * The dp_config_rwlock must not be held when calling this, because the
+ * unmount may need to write out data.
+ *
+ * This function is best-effort.  Callers must deal gracefully if it
+ * remains mounted (or is remounted after this call).
+ *
+ * Returns nothing: if the argument is not a snapshot, or it is not
+ * currently mounted, the function simply returns, and any error from
+ * the forced unmount itself is ignored.
+ */
+void
+zfs_unmount_snap(const char *snapname)
+{
+	vfs_t *vfsp = NULL;
+	zfsvfs_t *zfsvfs = NULL;
+
+	if (strchr(snapname, '@') == NULL)
+		return;
+
+	int err = getzfsvfs(snapname, &zfsvfs);
+	if (err != 0) {
+		ASSERT3P(zfsvfs, ==, NULL);
+		return;
+	}
+	vfsp = zfsvfs->z_vfs;
+
+	ASSERT(!dsl_pool_config_held(dmu_objset_pool(zfsvfs->z_os)));
+
+#ifdef illumos
+	err = vn_vfswlock(vfsp->vfs_vnodecovered);
+	VFS_RELE(vfsp);
+	if (err != 0)
+		return;
+#endif
+
+	/*
+	 * Always force the unmount for snapshots.
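+	 * Snapshot mounts are read-only, so forcing the unmount cannot
+	 * throw away unwritten data.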
+ */ +#ifdef illumos + (void) dounmount(vfsp, MS_FORCE, kcred); +#else + vfs_ref(vfsp); + vfs_unbusy(vfsp); + (void) dounmount(vfsp, MS_FORCE, curthread); +#endif +} + +/* ARGSUSED */ +static int +zfs_unmount_snap_cb(const char *snapname, void *arg) +{ + zfs_unmount_snap(snapname); + return (0); +} + +/* + * When a clone is destroyed, its origin may also need to be destroyed, + * in which case it must be unmounted. This routine will do that unmount + * if necessary. + */ +void +zfs_destroy_unmount_origin(const char *fsname) +{ + int error; + objset_t *os; + dsl_dataset_t *ds; + + error = dmu_objset_hold(fsname, FTAG, &os); + if (error != 0) + return; + ds = dmu_objset_ds(os); + if (dsl_dir_is_clone(ds->ds_dir) && DS_IS_DEFER_DESTROY(ds->ds_prev)) { + char originname[ZFS_MAX_DATASET_NAME_LEN]; + dsl_dataset_name(ds->ds_prev, originname); + dmu_objset_rele(os, FTAG); + zfs_unmount_snap(originname); + } else { + dmu_objset_rele(os, FTAG); + } +} + +/* + * innvl: { + * "snaps" -> { snapshot1, snapshot2 } + * (optional boolean) "defer" + * } + * + * outnvl: snapshot -> error code (int32) + * + */ +/* ARGSUSED */ +static int +zfs_ioc_destroy_snaps(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) +{ + int error, poollen; + nvlist_t *snaps; + nvpair_t *pair; + boolean_t defer; + + if (nvlist_lookup_nvlist(innvl, "snaps", &snaps) != 0) + return (SET_ERROR(EINVAL)); + defer = nvlist_exists(innvl, "defer"); + + poollen = strlen(poolname); + for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; + pair = nvlist_next_nvpair(snaps, pair)) { + const char *name = nvpair_name(pair); + + /* + * The snap must be in the specified pool to prevent the + * invalid removal of zvol minors below. + */ + if (strncmp(name, poolname, poollen) != 0 || + (name[poollen] != '/' && name[poollen] != '@')) + return (SET_ERROR(EXDEV)); + + zfs_unmount_snap(nvpair_name(pair)); +#if defined(__FreeBSD__) + zvol_remove_minors(name); +#endif + } + + return (dsl_destroy_snapshots_nvl(snaps, defer, outnvl)); +} + +/* + * Create bookmarks. Bookmark names are of the form <fs>#<bmark>. + * All bookmarks must be in the same pool. + * + * innvl: { + * bookmark1 -> snapshot1, bookmark2 -> snapshot2 + * } + * + * outnvl: bookmark -> error code (int32) + * + */ +/* ARGSUSED */ +static int +zfs_ioc_bookmark(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) +{ + for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL); + pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { + char *snap_name; + + /* + * Verify the snapshot argument. + */ + if (nvpair_value_string(pair, &snap_name) != 0) + return (SET_ERROR(EINVAL)); + + + /* Verify that the keys (bookmarks) are unique */ + for (nvpair_t *pair2 = nvlist_next_nvpair(innvl, pair); + pair2 != NULL; pair2 = nvlist_next_nvpair(innvl, pair2)) { + if (strcmp(nvpair_name(pair), nvpair_name(pair2)) == 0) + return (SET_ERROR(EINVAL)); + } + } + + return (dsl_bookmark_create(innvl, outnvl)); +} + +/* + * innvl: { + * property 1, property 2, ... + * } + * + * outnvl: { + * bookmark name 1 -> { property 1, property 2, ... }, + * bookmark name 2 -> { property 1, property 2, ... 
} + * } + * + */ +static int +zfs_ioc_get_bookmarks(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) +{ + return (dsl_get_bookmarks(fsname, innvl, outnvl)); +} + +/* + * innvl: { + * bookmark name 1, bookmark name 2 + * } + * + * outnvl: bookmark -> error code (int32) + * + */ +static int +zfs_ioc_destroy_bookmarks(const char *poolname, nvlist_t *innvl, + nvlist_t *outnvl) +{ + int error, poollen; + + poollen = strlen(poolname); + for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL); + pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { + const char *name = nvpair_name(pair); + const char *cp = strchr(name, '#'); + + /* + * The bookmark name must contain an #, and the part after it + * must contain only valid characters. + */ + if (cp == NULL || + zfs_component_namecheck(cp + 1, NULL, NULL) != 0) + return (SET_ERROR(EINVAL)); + + /* + * The bookmark must be in the specified pool. + */ + if (strncmp(name, poolname, poollen) != 0 || + (name[poollen] != '/' && name[poollen] != '#')) + return (SET_ERROR(EXDEV)); + } + + error = dsl_bookmark_destroy(innvl, outnvl); + return (error); +} + +static int +zfs_ioc_channel_program(const char *poolname, nvlist_t *innvl, + nvlist_t *outnvl) +{ + char *program; + uint64_t instrlimit, memlimit; + boolean_t sync_flag; + nvpair_t *nvarg = NULL; + + if (0 != nvlist_lookup_string(innvl, ZCP_ARG_PROGRAM, &program)) { + return (EINVAL); + } + if (0 != nvlist_lookup_boolean_value(innvl, ZCP_ARG_SYNC, &sync_flag)) { + sync_flag = B_TRUE; + } + if (0 != nvlist_lookup_uint64(innvl, ZCP_ARG_INSTRLIMIT, &instrlimit)) { + instrlimit = ZCP_DEFAULT_INSTRLIMIT; + } + if (0 != nvlist_lookup_uint64(innvl, ZCP_ARG_MEMLIMIT, &memlimit)) { + memlimit = ZCP_DEFAULT_MEMLIMIT; + } + if (0 != nvlist_lookup_nvpair(innvl, ZCP_ARG_ARGLIST, &nvarg)) { + return (EINVAL); + } + + if (instrlimit == 0 || instrlimit > zfs_lua_max_instrlimit) + return (EINVAL); + if (memlimit == 0 || memlimit > zfs_lua_max_memlimit) + return (EINVAL); + + return (zcp_eval(poolname, program, sync_flag, instrlimit, memlimit, + nvarg, outnvl)); +} + +/* + * innvl: unused + * outnvl: empty + */ +/* ARGSUSED */ +static int +zfs_ioc_pool_checkpoint(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) +{ + return (spa_checkpoint(poolname)); +} + +/* + * innvl: unused + * outnvl: empty + */ +/* ARGSUSED */ +static int +zfs_ioc_pool_discard_checkpoint(const char *poolname, nvlist_t *innvl, + nvlist_t *outnvl) +{ + return (spa_checkpoint_discard(poolname)); +} + +/* + * inputs: + * zc_name name of dataset to destroy + * zc_defer_destroy mark for deferred destroy + * + * outputs: none + */ +static int +zfs_ioc_destroy(zfs_cmd_t *zc) +{ + objset_t *os; + dmu_objset_type_t ost; + int err; + + err = dmu_objset_hold(zc->zc_name, FTAG, &os); + if (err != 0) + return (err); + ost = dmu_objset_type(os); + dmu_objset_rele(os, FTAG); + + if (ost == DMU_OST_ZFS) + zfs_unmount_snap(zc->zc_name); + + if (strchr(zc->zc_name, '@')) + err = dsl_destroy_snapshot(zc->zc_name, zc->zc_defer_destroy); + else + err = dsl_destroy_head(zc->zc_name); + if (ost == DMU_OST_ZVOL && err == 0) +#ifdef __FreeBSD__ + zvol_remove_minors(zc->zc_name); +#else + (void) zvol_remove_minor(zc->zc_name); +#endif + return (err); +} + +/* + * innvl: { + * vdevs: { + * guid 1, guid 2, ... + * }, + * func: POOL_INITIALIZE_{CANCEL|DO|SUSPEND} + * } + * + * outnvl: { + * [func: EINVAL (if provided command type didn't make sense)], + * [vdevs: { + * guid1: errno, (see function body for possible errnos) + * ... 
+ * }] + * } + * + */ +static int +zfs_ioc_pool_initialize(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) +{ + spa_t *spa; + int error; + + error = spa_open(poolname, &spa, FTAG); + if (error != 0) + return (error); + + uint64_t cmd_type; + if (nvlist_lookup_uint64(innvl, ZPOOL_INITIALIZE_COMMAND, + &cmd_type) != 0) { + spa_close(spa, FTAG); + return (SET_ERROR(EINVAL)); + } + if (!(cmd_type == POOL_INITIALIZE_CANCEL || + cmd_type == POOL_INITIALIZE_DO || + cmd_type == POOL_INITIALIZE_SUSPEND)) { + spa_close(spa, FTAG); + return (SET_ERROR(EINVAL)); + } + + nvlist_t *vdev_guids; + if (nvlist_lookup_nvlist(innvl, ZPOOL_INITIALIZE_VDEVS, + &vdev_guids) != 0) { + spa_close(spa, FTAG); + return (SET_ERROR(EINVAL)); + } + + nvlist_t *vdev_errlist = fnvlist_alloc(); + int total_errors = 0; + + for (nvpair_t *pair = nvlist_next_nvpair(vdev_guids, NULL); + pair != NULL; pair = nvlist_next_nvpair(vdev_guids, pair)) { + uint64_t vdev_guid = fnvpair_value_uint64(pair); + + error = spa_vdev_initialize(spa, vdev_guid, cmd_type); + if (error != 0) { + char guid_as_str[MAXNAMELEN]; + + (void) snprintf(guid_as_str, sizeof (guid_as_str), + "%llu", (unsigned long long)vdev_guid); + fnvlist_add_int64(vdev_errlist, guid_as_str, error); + total_errors++; + } + } + if (fnvlist_size(vdev_errlist) > 0) { + fnvlist_add_nvlist(outnvl, ZPOOL_INITIALIZE_VDEVS, + vdev_errlist); + } + fnvlist_free(vdev_errlist); + + spa_close(spa, FTAG); + return (total_errors > 0 ? EINVAL : 0); +} + +/* + * fsname is name of dataset to rollback (to most recent snapshot) + * + * innvl may contain name of expected target snapshot + * + * outnvl: "target" -> name of most recent snapshot + * } + */ +/* ARGSUSED */ +static int +zfs_ioc_rollback(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) +{ + zfsvfs_t *zfsvfs; + char *target = NULL; + int error; + + (void) nvlist_lookup_string(innvl, "target", &target); + if (target != NULL) { + const char *cp = strchr(target, '@'); + + /* + * The snap name must contain an @, and the part after it must + * contain only valid characters. + */ + if (cp == NULL || + zfs_component_namecheck(cp + 1, NULL, NULL) != 0) + return (SET_ERROR(EINVAL)); + } + + if (getzfsvfs(fsname, &zfsvfs) == 0) { + dsl_dataset_t *ds; + + ds = dmu_objset_ds(zfsvfs->z_os); + error = zfs_suspend_fs(zfsvfs); + if (error == 0) { + int resume_err; + + error = dsl_dataset_rollback(fsname, target, zfsvfs, + outnvl); + resume_err = zfs_resume_fs(zfsvfs, ds); + error = error ? 
error : resume_err; + } +#ifdef illumos + VFS_RELE(zfsvfs->z_vfs); +#else + vfs_unbusy(zfsvfs->z_vfs); +#endif + } else { + error = dsl_dataset_rollback(fsname, target, NULL, outnvl); + } + return (error); +} + +static int +recursive_unmount(const char *fsname, void *arg) +{ + const char *snapname = arg; + char fullname[ZFS_MAX_DATASET_NAME_LEN]; + + (void) snprintf(fullname, sizeof (fullname), "%s@%s", fsname, snapname); + zfs_unmount_snap(fullname); + + return (0); +} + +/* + * inputs: + * zc_name old name of dataset + * zc_value new name of dataset + * zc_cookie recursive flag (only valid for snapshots) + * + * outputs: none + */ +static int +zfs_ioc_rename(zfs_cmd_t *zc) +{ + objset_t *os; + dmu_objset_type_t ost; + boolean_t recursive = zc->zc_cookie & 1; + char *at; + boolean_t allow_mounted = B_TRUE; + int err; + +#ifdef __FreeBSD__ + allow_mounted = (zc->zc_cookie & 2) != 0; +#endif + + /* "zfs rename" from and to ...%recv datasets should both fail */ + zc->zc_name[sizeof (zc->zc_name) - 1] = '\0'; + zc->zc_value[sizeof (zc->zc_value) - 1] = '\0'; + if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0 || + dataset_namecheck(zc->zc_value, NULL, NULL) != 0 || + strchr(zc->zc_name, '%') || strchr(zc->zc_value, '%')) + return (SET_ERROR(EINVAL)); + + err = dmu_objset_hold(zc->zc_name, FTAG, &os); + if (err != 0) + return (err); + ost = dmu_objset_type(os); + dmu_objset_rele(os, FTAG); + + at = strchr(zc->zc_name, '@'); + if (at != NULL) { + /* snaps must be in same fs */ + int error; + + if (strncmp(zc->zc_name, zc->zc_value, at - zc->zc_name + 1)) + return (SET_ERROR(EXDEV)); + *at = '\0'; + if (ost == DMU_OST_ZFS && !allow_mounted) { + error = dmu_objset_find(zc->zc_name, + recursive_unmount, at + 1, + recursive ? DS_FIND_CHILDREN : 0); + if (error != 0) { + *at = '@'; + return (error); + } + } + error = dsl_dataset_rename_snapshot(zc->zc_name, + at + 1, strchr(zc->zc_value, '@') + 1, recursive); + *at = '@'; + + return (error); + } else { +#ifdef illumos + if (ost == DMU_OST_ZVOL) + (void) zvol_remove_minor(zc->zc_name); +#endif + return (dsl_dir_rename(zc->zc_name, zc->zc_value)); + } +} + +static int +zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) +{ + const char *propname = nvpair_name(pair); + boolean_t issnap = (strchr(dsname, '@') != NULL); + zfs_prop_t prop = zfs_name_to_prop(propname); + uint64_t intval; + int err; + + if (prop == ZPROP_INVAL) { + if (zfs_prop_user(propname)) { + if (err = zfs_secpolicy_write_perms(dsname, + ZFS_DELEG_PERM_USERPROP, cr)) + return (err); + return (0); + } + + if (!issnap && zfs_prop_userquota(propname)) { + const char *perm = NULL; + const char *uq_prefix = + zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA]; + const char *gq_prefix = + zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA]; + + if (strncmp(propname, uq_prefix, + strlen(uq_prefix)) == 0) { + perm = ZFS_DELEG_PERM_USERQUOTA; + } else if (strncmp(propname, gq_prefix, + strlen(gq_prefix)) == 0) { + perm = ZFS_DELEG_PERM_GROUPQUOTA; + } else { + /* USERUSED and GROUPUSED are read-only */ + return (SET_ERROR(EINVAL)); + } + + if (err = zfs_secpolicy_write_perms(dsname, perm, cr)) + return (err); + return (0); + } + + return (SET_ERROR(EINVAL)); + } + + if (issnap) + return (SET_ERROR(EINVAL)); + + if (nvpair_type(pair) == DATA_TYPE_NVLIST) { + /* + * dsl_prop_get_all_impl() returns properties in this + * format. 
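+		 * (Each property is wrapped in an nvlist carrying the
+		 * actual value under ZPROP_VALUE, so the value pair is
+		 * unwrapped here before the checks below.)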
+ */ + nvlist_t *attrs; + VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); + VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, + &pair) == 0); + } + + /* + * Check that this value is valid for this pool version + */ + switch (prop) { + case ZFS_PROP_COMPRESSION: + /* + * If the user specified gzip compression, make sure + * the SPA supports it. We ignore any errors here since + * we'll catch them later. + */ + if (nvpair_value_uint64(pair, &intval) == 0) { + if (intval >= ZIO_COMPRESS_GZIP_1 && + intval <= ZIO_COMPRESS_GZIP_9 && + zfs_earlier_version(dsname, + SPA_VERSION_GZIP_COMPRESSION)) { + return (SET_ERROR(ENOTSUP)); + } + + if (intval == ZIO_COMPRESS_ZLE && + zfs_earlier_version(dsname, + SPA_VERSION_ZLE_COMPRESSION)) + return (SET_ERROR(ENOTSUP)); + + if (intval == ZIO_COMPRESS_LZ4) { + spa_t *spa; + + if ((err = spa_open(dsname, &spa, FTAG)) != 0) + return (err); + + if (!spa_feature_is_enabled(spa, + SPA_FEATURE_LZ4_COMPRESS)) { + spa_close(spa, FTAG); + return (SET_ERROR(ENOTSUP)); + } + spa_close(spa, FTAG); + } + + /* + * If this is a bootable dataset then + * verify that the compression algorithm + * is supported for booting. We must return + * something other than ENOTSUP since it + * implies a downrev pool version. + */ + if (zfs_is_bootfs(dsname) && + !BOOTFS_COMPRESS_VALID(intval)) { + return (SET_ERROR(ERANGE)); + } + } + break; + + case ZFS_PROP_COPIES: + if (zfs_earlier_version(dsname, SPA_VERSION_DITTO_BLOCKS)) + return (SET_ERROR(ENOTSUP)); + break; + + case ZFS_PROP_RECORDSIZE: + /* Record sizes above 128k need the feature to be enabled */ + if (nvpair_value_uint64(pair, &intval) == 0 && + intval > SPA_OLD_MAXBLOCKSIZE) { + spa_t *spa; + + /* + * We don't allow setting the property above 1MB, + * unless the tunable has been changed. + */ + if (intval > zfs_max_recordsize || + intval > SPA_MAXBLOCKSIZE) + return (SET_ERROR(ERANGE)); + + if ((err = spa_open(dsname, &spa, FTAG)) != 0) + return (err); + + if (!spa_feature_is_enabled(spa, + SPA_FEATURE_LARGE_BLOCKS)) { + spa_close(spa, FTAG); + return (SET_ERROR(ENOTSUP)); + } + spa_close(spa, FTAG); + } + break; + + case ZFS_PROP_DNODESIZE: + /* Dnode sizes above 512 need the feature to be enabled */ + if (nvpair_value_uint64(pair, &intval) == 0 && + intval != ZFS_DNSIZE_LEGACY) { + spa_t *spa; + + if ((err = spa_open(dsname, &spa, FTAG)) != 0) + return (err); + + if (!spa_feature_is_enabled(spa, + SPA_FEATURE_LARGE_DNODE)) { + spa_close(spa, FTAG); + return (SET_ERROR(ENOTSUP)); + } + spa_close(spa, FTAG); + } + break; + + case ZFS_PROP_SHARESMB: + if (zpl_earlier_version(dsname, ZPL_VERSION_FUID)) + return (SET_ERROR(ENOTSUP)); + break; + + case ZFS_PROP_ACLINHERIT: + if (nvpair_type(pair) == DATA_TYPE_UINT64 && + nvpair_value_uint64(pair, &intval) == 0) { + if (intval == ZFS_ACL_PASSTHROUGH_X && + zfs_earlier_version(dsname, + SPA_VERSION_PASSTHROUGH_X)) + return (SET_ERROR(ENOTSUP)); + } + break; + + case ZFS_PROP_CHECKSUM: + case ZFS_PROP_DEDUP: + { + spa_feature_t feature; + spa_t *spa; + + /* dedup feature version checks */ + if (prop == ZFS_PROP_DEDUP && + zfs_earlier_version(dsname, SPA_VERSION_DEDUP)) + return (SET_ERROR(ENOTSUP)); + + if (nvpair_value_uint64(pair, &intval) != 0) + return (SET_ERROR(EINVAL)); + + /* check prop value is enabled in features */ + feature = zio_checksum_to_feature(intval & ZIO_CHECKSUM_MASK); + if (feature == SPA_FEATURE_NONE) + break; + + if ((err = spa_open(dsname, &spa, FTAG)) != 0) + return (err); + /* + * Salted checksums are not supported on root pools. 
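+		 * (The boot chain has no access to the pool's checksum
+		 * salt, so it would be unable to verify such checksums
+		 * when reading the boot filesystem.)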
+ */ + if (spa_bootfs(spa) != 0 && + intval < ZIO_CHECKSUM_FUNCTIONS && + (zio_checksum_table[intval].ci_flags & + ZCHECKSUM_FLAG_SALTED)) { + spa_close(spa, FTAG); + return (SET_ERROR(ERANGE)); + } + if (!spa_feature_is_enabled(spa, feature)) { + spa_close(spa, FTAG); + return (SET_ERROR(ENOTSUP)); + } + spa_close(spa, FTAG); + break; + } + } + + return (zfs_secpolicy_setprop(dsname, prop, pair, CRED())); +} + +/* + * Checks for a race condition to make sure we don't increment a feature flag + * multiple times. + */ +static int +zfs_prop_activate_feature_check(void *arg, dmu_tx_t *tx) +{ + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + spa_feature_t *featurep = arg; + + if (!spa_feature_is_active(spa, *featurep)) + return (0); + else + return (SET_ERROR(EBUSY)); +} + +/* + * The callback invoked on feature activation in the sync task caused by + * zfs_prop_activate_feature. + */ +static void +zfs_prop_activate_feature_sync(void *arg, dmu_tx_t *tx) +{ + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + spa_feature_t *featurep = arg; + + spa_feature_incr(spa, *featurep, tx); +} + +/* + * Activates a feature on a pool in response to a property setting. This + * creates a new sync task which modifies the pool to reflect the feature + * as being active. + */ +static int +zfs_prop_activate_feature(spa_t *spa, spa_feature_t feature) +{ + int err; + + /* EBUSY here indicates that the feature is already active */ + err = dsl_sync_task(spa_name(spa), + zfs_prop_activate_feature_check, zfs_prop_activate_feature_sync, + &feature, 2, ZFS_SPACE_CHECK_RESERVED); + + if (err != 0 && err != EBUSY) + return (err); + else + return (0); +} + +/* + * Removes properties from the given props list that fail permission checks + * needed to clear them and to restore them in case of a receive error. For each + * property, make sure we have both set and inherit permissions. + * + * Returns the first error encountered if any permission checks fail. If the + * caller provides a non-NULL errlist, it also gives the complete list of names + * of all the properties that failed a permission check along with the + * corresponding error numbers. The caller is responsible for freeing the + * returned errlist. + * + * If every property checks out successfully, zero is returned and the list + * pointed at by errlist is NULL. 
+ */ +static int +zfs_check_clearable(char *dataset, nvlist_t *props, nvlist_t **errlist) +{ + zfs_cmd_t *zc; + nvpair_t *pair, *next_pair; + nvlist_t *errors; + int err, rv = 0; + + if (props == NULL) + return (0); + + VERIFY(nvlist_alloc(&errors, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + zc = kmem_alloc(sizeof (zfs_cmd_t), KM_SLEEP); + (void) strcpy(zc->zc_name, dataset); + pair = nvlist_next_nvpair(props, NULL); + while (pair != NULL) { + next_pair = nvlist_next_nvpair(props, pair); + + (void) strcpy(zc->zc_value, nvpair_name(pair)); + if ((err = zfs_check_settable(dataset, pair, CRED())) != 0 || + (err = zfs_secpolicy_inherit_prop(zc, NULL, CRED())) != 0) { + VERIFY(nvlist_remove_nvpair(props, pair) == 0); + VERIFY(nvlist_add_int32(errors, + zc->zc_value, err) == 0); + } + pair = next_pair; + } + kmem_free(zc, sizeof (zfs_cmd_t)); + + if ((pair = nvlist_next_nvpair(errors, NULL)) == NULL) { + nvlist_free(errors); + errors = NULL; + } else { + VERIFY(nvpair_value_int32(pair, &rv) == 0); + } + + if (errlist == NULL) + nvlist_free(errors); + else + *errlist = errors; + + return (rv); +} + +static boolean_t +propval_equals(nvpair_t *p1, nvpair_t *p2) +{ + if (nvpair_type(p1) == DATA_TYPE_NVLIST) { + /* dsl_prop_get_all_impl() format */ + nvlist_t *attrs; + VERIFY(nvpair_value_nvlist(p1, &attrs) == 0); + VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, + &p1) == 0); + } + + if (nvpair_type(p2) == DATA_TYPE_NVLIST) { + nvlist_t *attrs; + VERIFY(nvpair_value_nvlist(p2, &attrs) == 0); + VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, + &p2) == 0); + } + + if (nvpair_type(p1) != nvpair_type(p2)) + return (B_FALSE); + + if (nvpair_type(p1) == DATA_TYPE_STRING) { + char *valstr1, *valstr2; + + VERIFY(nvpair_value_string(p1, (char **)&valstr1) == 0); + VERIFY(nvpair_value_string(p2, (char **)&valstr2) == 0); + return (strcmp(valstr1, valstr2) == 0); + } else { + uint64_t intval1, intval2; + + VERIFY(nvpair_value_uint64(p1, &intval1) == 0); + VERIFY(nvpair_value_uint64(p2, &intval2) == 0); + return (intval1 == intval2); + } +} + +/* + * Remove properties from props if they are not going to change (as determined + * by comparison with origprops). Remove them from origprops as well, since we + * do not need to clear or restore properties that won't change. + */ +static void +props_reduce(nvlist_t *props, nvlist_t *origprops) +{ + nvpair_t *pair, *next_pair; + + if (origprops == NULL) + return; /* all props need to be received */ + + pair = nvlist_next_nvpair(props, NULL); + while (pair != NULL) { + const char *propname = nvpair_name(pair); + nvpair_t *match; + + next_pair = nvlist_next_nvpair(props, pair); + + if ((nvlist_lookup_nvpair(origprops, propname, + &match) != 0) || !propval_equals(pair, match)) + goto next; /* need to set received value */ + + /* don't clear the existing received value */ + (void) nvlist_remove_nvpair(origprops, match); + /* don't bother receiving the property */ + (void) nvlist_remove_nvpair(props, pair); +next: + pair = next_pair; + } +} + +/* + * Extract properties that cannot be set PRIOR to the receipt of a dataset. + * For example, refquota cannot be set until after the receipt of a dataset, + * because in replication streams, an older/earlier snapshot may exceed the + * refquota. We want to receive the older/earlier snapshot, but setting + * refquota pre-receipt will set the dsl's ACTUAL quota, which will prevent + * the older/earlier snapshot from being received (with EDQUOT). + * + * The ZFS test "zfs_receive_011_pos" demonstrates such a scenario. 
+ *
+ * libzfs will need to be judicious in handling errors encountered by
+ * properties extracted by this function.
+ */
+static nvlist_t *
+extract_delay_props(nvlist_t *props)
+{
+	nvlist_t *delayprops;
+	nvpair_t *nvp, *tmp;
+	static const zfs_prop_t delayable[] = { ZFS_PROP_REFQUOTA, 0 };
+	int i;
+
+	VERIFY(nvlist_alloc(&delayprops, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+	for (nvp = nvlist_next_nvpair(props, NULL); nvp != NULL;
+	    nvp = nvlist_next_nvpair(props, nvp)) {
+		/*
+		 * strcmp() is safe because zfs_prop_to_name() always returns
+		 * a bounded string.
+		 */
+		for (i = 0; delayable[i] != 0; i++) {
+			if (strcmp(zfs_prop_to_name(delayable[i]),
+			    nvpair_name(nvp)) == 0) {
+				break;
+			}
+		}
+		if (delayable[i] != 0) {
+			tmp = nvlist_prev_nvpair(props, nvp);
+			VERIFY(nvlist_add_nvpair(delayprops, nvp) == 0);
+			VERIFY(nvlist_remove_nvpair(props, nvp) == 0);
+			nvp = tmp;
+		}
+	}
+
+	if (nvlist_empty(delayprops)) {
+		nvlist_free(delayprops);
+		delayprops = NULL;
+	}
+	return (delayprops);
+}
+
+#ifdef	DEBUG
+static boolean_t zfs_ioc_recv_inject_err;
+#endif
+
+/*
+ * inputs:
+ * zc_name		name of containing filesystem
+ * zc_nvlist_src{_size}	nvlist of properties to apply
+ * zc_value		name of snapshot to create
+ * zc_string		name of clone origin (if DRR_FLAG_CLONE)
+ * zc_cookie		file descriptor to recv from
+ * zc_begin_record	the BEGIN record of the stream (not byteswapped)
+ * zc_guid		force flag
+ * zc_cleanup_fd	cleanup-on-exit file descriptor
+ * zc_action_handle	handle for this guid/ds mapping (or zero on first call)
+ * zc_resumable		if data is incomplete assume sender will resume
+ *
+ * outputs:
+ * zc_cookie		number of bytes read
+ * zc_nvlist_dst{_size} error for each unapplied received property
+ * zc_obj		zprop_errflags_t
+ * zc_action_handle	handle for this guid/ds mapping
+ */
+static int
+zfs_ioc_recv(zfs_cmd_t *zc)
+{
+	file_t *fp;
+	dmu_recv_cookie_t drc;
+	boolean_t force = (boolean_t)zc->zc_guid;
+	int fd;
+	int error = 0;
+	int props_error = 0;
+	nvlist_t *errors;
+	offset_t off;
+	nvlist_t *props = NULL; /* sent properties */
+	nvlist_t *origprops = NULL; /* existing properties */
+	nvlist_t *delayprops = NULL; /* sent properties applied post-receive */
+	char *origin = NULL;
+	char *tosnap;
+	char tofs[ZFS_MAX_DATASET_NAME_LEN];
+	boolean_t first_recvd_props = B_FALSE;
+
+	if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 ||
+	    strchr(zc->zc_value, '@') == NULL ||
+	    strchr(zc->zc_value, '%'))
+		return (SET_ERROR(EINVAL));
+
+	(void) strcpy(tofs, zc->zc_value);
+	tosnap = strchr(tofs, '@');
+	*tosnap++ = '\0';
+
+	if (zc->zc_nvlist_src != 0 &&
+	    (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
+	    zc->zc_iflags, &props)) != 0)
+		return (error);
+
+	fd = zc->zc_cookie;
+#ifdef illumos
+	fp = getf(fd);
+#else
+	fget_read(curthread, fd, &cap_pread_rights, &fp);
+#endif
+	if (fp == NULL) {
+		nvlist_free(props);
+		return (SET_ERROR(EBADF));
+	}
+
+	errors = fnvlist_alloc();
+
+	if (zc->zc_string[0])
+		origin = zc->zc_string;
+
+	error = dmu_recv_begin(tofs, tosnap,
+	    &zc->zc_begin_record, force, zc->zc_resumable, origin, &drc);
+	if (error != 0)
+		goto out;
+
+	/*
+	 * Set properties before we receive the stream so that they are applied
+	 * to the new data.  Note that we must call dmu_recv_stream() if
+	 * dmu_recv_begin() succeeds.
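+	 * (dmu_recv_begin() returns with the receive dataset held in drc,
+	 * and only the dmu_recv_stream()/dmu_recv_end() path releases that
+	 * hold.)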
+ */ + if (props != NULL && !drc.drc_newfs) { + if (spa_version(dsl_dataset_get_spa(drc.drc_ds)) >= + SPA_VERSION_RECVD_PROPS && + !dsl_prop_get_hasrecvd(tofs)) + first_recvd_props = B_TRUE; + + /* + * If new received properties are supplied, they are to + * completely replace the existing received properties, so stash + * away the existing ones. + */ + if (dsl_prop_get_received(tofs, &origprops) == 0) { + nvlist_t *errlist = NULL; + /* + * Don't bother writing a property if its value won't + * change (and avoid the unnecessary security checks). + * + * The first receive after SPA_VERSION_RECVD_PROPS is a + * special case where we blow away all local properties + * regardless. + */ + if (!first_recvd_props) + props_reduce(props, origprops); + if (zfs_check_clearable(tofs, origprops, &errlist) != 0) + (void) nvlist_merge(errors, errlist, 0); + nvlist_free(errlist); + + if (clear_received_props(tofs, origprops, + first_recvd_props ? NULL : props) != 0) + zc->zc_obj |= ZPROP_ERR_NOCLEAR; + } else { + zc->zc_obj |= ZPROP_ERR_NOCLEAR; + } + } + + if (props != NULL) { + props_error = dsl_prop_set_hasrecvd(tofs); + + if (props_error == 0) { + delayprops = extract_delay_props(props); + (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED, + props, errors); + } + } + + off = fp->f_offset; + error = dmu_recv_stream(&drc, fp, &off, zc->zc_cleanup_fd, + &zc->zc_action_handle); + + if (error == 0) { + zfsvfs_t *zfsvfs = NULL; + + if (getzfsvfs(tofs, &zfsvfs) == 0) { + /* online recv */ + dsl_dataset_t *ds; + int end_err; + + ds = dmu_objset_ds(zfsvfs->z_os); + error = zfs_suspend_fs(zfsvfs); + /* + * If the suspend fails, then the recv_end will + * likely also fail, and clean up after itself. + */ + end_err = dmu_recv_end(&drc, zfsvfs); + if (error == 0) + error = zfs_resume_fs(zfsvfs, ds); + error = error ? error : end_err; +#ifdef illumos + VFS_RELE(zfsvfs->z_vfs); +#else + vfs_unbusy(zfsvfs->z_vfs); +#endif + } else { + error = dmu_recv_end(&drc, NULL); + } + + /* Set delayed properties now, after we're done receiving. */ + if (delayprops != NULL && error == 0) { + (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED, + delayprops, errors); + } + } + + if (delayprops != NULL) { + /* + * Merge delayed props back in with initial props, in case + * we're DEBUG and zfs_ioc_recv_inject_err is set (which means + * we have to make sure clear_received_props() includes + * the delayed properties). + * + * Since zfs_ioc_recv_inject_err is only in DEBUG kernels, + * using ASSERT() will be just like a VERIFY. + */ + ASSERT(nvlist_merge(props, delayprops, 0) == 0); + nvlist_free(delayprops); + } + + /* + * Now that all props, initial and delayed, are set, report the prop + * errors to the caller. + */ + if (zc->zc_nvlist_dst_size != 0 && + (nvlist_smush(errors, zc->zc_nvlist_dst_size) != 0 || + put_nvlist(zc, errors) != 0)) { + /* + * Caller made zc->zc_nvlist_dst less than the minimum expected + * size or supplied an invalid address. + */ + props_error = SET_ERROR(EINVAL); + } + + zc->zc_cookie = off - fp->f_offset; + if (off >= 0 && off <= MAXOFFSET_T) + fp->f_offset = off; + +#ifdef DEBUG + if (zfs_ioc_recv_inject_err) { + zfs_ioc_recv_inject_err = B_FALSE; + error = 1; + } +#endif + +#ifdef __FreeBSD__ + if (error == 0) + zvol_create_minors(tofs); +#endif + + /* + * On error, restore the original props. + */ + if (error != 0 && props != NULL && !drc.drc_newfs) { + if (clear_received_props(tofs, props, NULL) != 0) { + /* + * We failed to clear the received properties. 
+ * Since we may have left a $recvd value on the
+ * system, we can't clear the $hasrecvd flag.
+ */
+ zc->zc_obj |= ZPROP_ERR_NORESTORE;
+ } else if (first_recvd_props) {
+ dsl_prop_unset_hasrecvd(tofs);
+ }
+
+ if (origprops == NULL && !drc.drc_newfs) {
+ /* We failed to stash the original properties. */
+ zc->zc_obj |= ZPROP_ERR_NORESTORE;
+ }
+
+ /*
+ * dsl_props_set() will not convert RECEIVED to LOCAL on or
+ * after SPA_VERSION_RECVD_PROPS, so we need to specify LOCAL
+ * explicitly if we're restoring local properties cleared in the
+ * first new-style receive.
+ */
+ if (origprops != NULL &&
+ zfs_set_prop_nvlist(tofs, (first_recvd_props ?
+ ZPROP_SRC_LOCAL : ZPROP_SRC_RECEIVED),
+ origprops, NULL) != 0) {
+ /*
+ * We stashed the original properties but failed to
+ * restore them.
+ */
+ zc->zc_obj |= ZPROP_ERR_NORESTORE;
+ }
+out:
+ nvlist_free(props);
+ nvlist_free(origprops);
+ nvlist_free(errors);
+ releasef(fd);
+
+ if (error == 0)
+ error = props_error;
+
+ return (error);
+}
+
+/*
+ * inputs:
+ * zc_name name of snapshot to send
+ * zc_cookie file descriptor to send stream to
+ * zc_obj fromorigin flag (mutually exclusive with zc_fromobj)
+ * zc_sendobj objsetid of snapshot to send
+ * zc_fromobj objsetid of incremental fromsnap (may be zero)
+ * zc_guid if set, estimate size of stream only. zc_cookie is ignored.
+ * output size in zc_objset_type.
+ * zc_flags lzc_send_flags
+ *
+ * outputs:
+ * zc_objset_type estimated size, if zc_guid is set
+ */
+static int
+zfs_ioc_send(zfs_cmd_t *zc)
+{
+ int error;
+ offset_t off;
+ boolean_t estimate = (zc->zc_guid != 0);
+ boolean_t embedok = (zc->zc_flags & 0x1);
+ boolean_t large_block_ok = (zc->zc_flags & 0x2);
+ boolean_t compressok = (zc->zc_flags & 0x4);
+
+ if (zc->zc_obj != 0) {
+ dsl_pool_t *dp;
+ dsl_dataset_t *tosnap;
+
+ error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
+ if (error != 0)
+ return (error);
+
+ error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &tosnap);
+ if (error != 0) {
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+ }
+
+ if (dsl_dir_is_clone(tosnap->ds_dir))
+ zc->zc_fromobj =
+ dsl_dir_phys(tosnap->ds_dir)->dd_origin_obj;
+ dsl_dataset_rele(tosnap, FTAG);
+ dsl_pool_rele(dp, FTAG);
+ }
+
+ if (estimate) {
+ dsl_pool_t *dp;
+ dsl_dataset_t *tosnap;
+ dsl_dataset_t *fromsnap = NULL;
+
+ error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
+ if (error != 0)
+ return (error);
+
+ error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &tosnap);
+ if (error != 0) {
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+ }
+
+ if (zc->zc_fromobj != 0) {
+ error = dsl_dataset_hold_obj(dp, zc->zc_fromobj,
+ FTAG, &fromsnap);
+ if (error != 0) {
+ dsl_dataset_rele(tosnap, FTAG);
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+ }
+ }
+
+ error = dmu_send_estimate(tosnap, fromsnap, compressok,
+ &zc->zc_objset_type);
+
+ if (fromsnap != NULL)
+ dsl_dataset_rele(fromsnap, FTAG);
+ dsl_dataset_rele(tosnap, FTAG);
+ dsl_pool_rele(dp, FTAG);
+ } else {
+ file_t *fp;
+
+#ifdef illumos
+ fp = getf(zc->zc_cookie);
+#else
+ fget_write(curthread, zc->zc_cookie, &cap_write_rights, &fp);
+#endif
+ if (fp == NULL)
+ return (SET_ERROR(EBADF));
+
+ off = fp->f_offset;
+ error = dmu_send_obj(zc->zc_name, zc->zc_sendobj,
+ zc->zc_fromobj, embedok, large_block_ok, compressok,
+#ifdef illumos
+ zc->zc_cookie, fp->f_vnode, &off);
+#else
+ zc->zc_cookie, fp, &off);
+#endif
+
+ if (off >= 0 && off <= MAXOFFSET_T)
+ fp->f_offset = off;
+ releasef(zc->zc_cookie);
+ }
+ return (error);
+}
+
+/*
+ * inputs:
+ * zc_name name of snapshot on which to
report progress + * zc_cookie file descriptor of send stream + * + * outputs: + * zc_cookie number of bytes written in send stream thus far + */ +static int +zfs_ioc_send_progress(zfs_cmd_t *zc) +{ + dsl_pool_t *dp; + dsl_dataset_t *ds; + dmu_sendarg_t *dsp = NULL; + int error; + + error = dsl_pool_hold(zc->zc_name, FTAG, &dp); + if (error != 0) + return (error); + + error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &ds); + if (error != 0) { + dsl_pool_rele(dp, FTAG); + return (error); + } + + mutex_enter(&ds->ds_sendstream_lock); + + /* + * Iterate over all the send streams currently active on this dataset. + * If there's one which matches the specified file descriptor _and_ the + * stream was started by the current process, return the progress of + * that stream. + */ + for (dsp = list_head(&ds->ds_sendstreams); dsp != NULL; + dsp = list_next(&ds->ds_sendstreams, dsp)) { + if (dsp->dsa_outfd == zc->zc_cookie && + dsp->dsa_proc == curproc) + break; + } + + if (dsp != NULL) + zc->zc_cookie = *(dsp->dsa_off); + else + error = SET_ERROR(ENOENT); + + mutex_exit(&ds->ds_sendstream_lock); + dsl_dataset_rele(ds, FTAG); + dsl_pool_rele(dp, FTAG); + return (error); +} + +static int +zfs_ioc_inject_fault(zfs_cmd_t *zc) +{ + int id, error; + + error = zio_inject_fault(zc->zc_name, (int)zc->zc_guid, &id, + &zc->zc_inject_record); + + if (error == 0) + zc->zc_guid = (uint64_t)id; + + return (error); +} + +static int +zfs_ioc_clear_fault(zfs_cmd_t *zc) +{ + return (zio_clear_fault((int)zc->zc_guid)); +} + +static int +zfs_ioc_inject_list_next(zfs_cmd_t *zc) +{ + int id = (int)zc->zc_guid; + int error; + + error = zio_inject_list_next(&id, zc->zc_name, sizeof (zc->zc_name), + &zc->zc_inject_record); + + zc->zc_guid = id; + + return (error); +} + +static int +zfs_ioc_error_log(zfs_cmd_t *zc) +{ + spa_t *spa; + int error; + size_t count = (size_t)zc->zc_nvlist_dst_size; + + if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) + return (error); + + error = spa_get_errlog(spa, (void *)(uintptr_t)zc->zc_nvlist_dst, + &count); + if (error == 0) + zc->zc_nvlist_dst_size = count; + else + zc->zc_nvlist_dst_size = spa_get_errlog_size(spa); + + spa_close(spa, FTAG); + + return (error); +} + +static int +zfs_ioc_clear(zfs_cmd_t *zc) +{ + spa_t *spa; + vdev_t *vd; + int error; + + /* + * On zpool clear we also fix up missing slogs + */ + mutex_enter(&spa_namespace_lock); + spa = spa_lookup(zc->zc_name); + if (spa == NULL) { + mutex_exit(&spa_namespace_lock); + return (SET_ERROR(EIO)); + } + if (spa_get_log_state(spa) == SPA_LOG_MISSING) { + /* we need to let spa_open/spa_load clear the chains */ + spa_set_log_state(spa, SPA_LOG_CLEAR); + } + spa->spa_last_open_failed = 0; + mutex_exit(&spa_namespace_lock); + + if (zc->zc_cookie & ZPOOL_NO_REWIND) { + error = spa_open(zc->zc_name, &spa, FTAG); + } else { + nvlist_t *policy; + nvlist_t *config = NULL; + + if (zc->zc_nvlist_src == 0) + return (SET_ERROR(EINVAL)); + + if ((error = get_nvlist(zc->zc_nvlist_src, + zc->zc_nvlist_src_size, zc->zc_iflags, &policy)) == 0) { + error = spa_open_rewind(zc->zc_name, &spa, FTAG, + policy, &config); + if (config != NULL) { + int err; + + if ((err = put_nvlist(zc, config)) != 0) + error = err; + nvlist_free(config); + } + nvlist_free(policy); + } + } + + if (error != 0) + return (error); + + spa_vdev_state_enter(spa, SCL_NONE); + + if (zc->zc_guid == 0) { + vd = NULL; + } else { + vd = spa_lookup_by_guid(spa, zc->zc_guid, B_TRUE); + if (vd == NULL) { + (void) spa_vdev_state_exit(spa, NULL, ENODEV); + spa_close(spa, FTAG); + return 
(SET_ERROR(ENODEV));
+ }
+ }
+
+ vdev_clear(spa, vd);
+
+ (void) spa_vdev_state_exit(spa, NULL, 0);
+
+ /*
+ * Resume any suspended I/Os.
+ */
+ if (zio_resume(spa) != 0)
+ error = SET_ERROR(EIO);
+
+ spa_close(spa, FTAG);
+
+ return (error);
+}
+
+static int
+zfs_ioc_pool_reopen(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ int error;
+
+ error = spa_open(zc->zc_name, &spa, FTAG);
+ if (error != 0)
+ return (error);
+
+ spa_vdev_state_enter(spa, SCL_NONE);
+
+ /*
+ * If a resilver is already in progress then set the
+ * spa_scrub_reopen flag to B_TRUE so that we don't restart
+ * the scan as a side effect of the reopen. Otherwise, let
+ * vdev_open() decide if a resilver is required.
+ */
+ spa->spa_scrub_reopen = dsl_scan_resilvering(spa->spa_dsl_pool);
+ vdev_reopen(spa->spa_root_vdev);
+ spa->spa_scrub_reopen = B_FALSE;
+
+ (void) spa_vdev_state_exit(spa, NULL, 0);
+ spa_close(spa, FTAG);
+ return (0);
+}
+
+/*
+ * inputs:
+ * zc_name name of filesystem
+ *
+ * outputs:
+ * zc_string name of conflicting snapshot, if there is one
+ */
+static int
+zfs_ioc_promote(zfs_cmd_t *zc)
+{
+ dsl_pool_t *dp;
+ dsl_dataset_t *ds, *ods;
+ char origin[ZFS_MAX_DATASET_NAME_LEN];
+ char *cp;
+ int error;
+
+ zc->zc_name[sizeof (zc->zc_name) - 1] = '\0';
+ if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0 ||
+ strchr(zc->zc_name, '%'))
+ return (SET_ERROR(EINVAL));
+
+ error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
+ if (error != 0)
+ return (error);
+
+ error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &ds);
+ if (error != 0) {
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+ }
+
+ if (!dsl_dir_is_clone(ds->ds_dir)) {
+ dsl_dataset_rele(ds, FTAG);
+ dsl_pool_rele(dp, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+
+ error = dsl_dataset_hold_obj(dp,
+ dsl_dir_phys(ds->ds_dir)->dd_origin_obj, FTAG, &ods);
+ if (error != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+ }
+
+ dsl_dataset_name(ods, origin);
+ dsl_dataset_rele(ods, FTAG);
+ dsl_dataset_rele(ds, FTAG);
+ dsl_pool_rele(dp, FTAG);
+
+ /*
+ * We don't need to unmount *all* the origin fs's snapshots, but
+ * it's easier.
+ */
+ cp = strchr(origin, '@');
+ if (cp)
+ *cp = '\0';
+ (void) dmu_objset_find(origin,
+ zfs_unmount_snap_cb, NULL, DS_FIND_SNAPSHOTS);
+ return (dsl_dataset_promote(zc->zc_name, zc->zc_string));
+}
+
+/*
+ * Retrieve a single {user|group}{used|quota}@... property.
+ *
+ * inputs:
+ * zc_name name of filesystem
+ * zc_objset_type zfs_userquota_prop_t
+ * zc_value domain name (eg.
"S-1-234-567-89") + * zc_guid RID/UID/GID + * + * outputs: + * zc_cookie property value + */ +static int +zfs_ioc_userspace_one(zfs_cmd_t *zc) +{ + zfsvfs_t *zfsvfs; + int error; + + if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) + return (SET_ERROR(EINVAL)); + + error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE); + if (error != 0) + return (error); + + error = zfs_userspace_one(zfsvfs, + zc->zc_objset_type, zc->zc_value, zc->zc_guid, &zc->zc_cookie); + zfsvfs_rele(zfsvfs, FTAG); + + return (error); +} + +/* + * inputs: + * zc_name name of filesystem + * zc_cookie zap cursor + * zc_objset_type zfs_userquota_prop_t + * zc_nvlist_dst[_size] buffer to fill (not really an nvlist) + * + * outputs: + * zc_nvlist_dst[_size] data buffer (array of zfs_useracct_t) + * zc_cookie zap cursor + */ +static int +zfs_ioc_userspace_many(zfs_cmd_t *zc) +{ + zfsvfs_t *zfsvfs; + int bufsize = zc->zc_nvlist_dst_size; + + if (bufsize <= 0) + return (SET_ERROR(ENOMEM)); + + int error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE); + if (error != 0) + return (error); + + void *buf = kmem_alloc(bufsize, KM_SLEEP); + + error = zfs_userspace_many(zfsvfs, zc->zc_objset_type, &zc->zc_cookie, + buf, &zc->zc_nvlist_dst_size); + + if (error == 0) { + error = ddi_copyout(buf, + (void *)(uintptr_t)zc->zc_nvlist_dst, + zc->zc_nvlist_dst_size, zc->zc_iflags); + } + kmem_free(buf, bufsize); + zfsvfs_rele(zfsvfs, FTAG); + + return (error); +} + +/* + * inputs: + * zc_name name of filesystem + * + * outputs: + * none + */ +static int +zfs_ioc_userspace_upgrade(zfs_cmd_t *zc) +{ + objset_t *os; + int error = 0; + zfsvfs_t *zfsvfs; + + if (getzfsvfs(zc->zc_name, &zfsvfs) == 0) { + if (!dmu_objset_userused_enabled(zfsvfs->z_os)) { + /* + * If userused is not enabled, it may be because the + * objset needs to be closed & reopened (to grow the + * objset_phys_t). Suspend/resume the fs will do that. + */ + dsl_dataset_t *ds, *newds; + + ds = dmu_objset_ds(zfsvfs->z_os); + error = zfs_suspend_fs(zfsvfs); + if (error == 0) { + dmu_objset_refresh_ownership(ds, &newds, + zfsvfs); + error = zfs_resume_fs(zfsvfs, newds); + } + } + if (error == 0) + error = dmu_objset_userspace_upgrade(zfsvfs->z_os); +#ifdef illumos + VFS_RELE(zfsvfs->z_vfs); +#else + vfs_unbusy(zfsvfs->z_vfs); +#endif + } else { + /* XXX kind of reading contents without owning */ + error = dmu_objset_hold(zc->zc_name, FTAG, &os); + if (error != 0) + return (error); + + error = dmu_objset_userspace_upgrade(os); + dmu_objset_rele(os, FTAG); + } + + return (error); +} + +#ifdef illumos +/* + * We don't want to have a hard dependency + * against some special symbols in sharefs + * nfs, and smbsrv. Determine them if needed when + * the first file system is shared. + * Neither sharefs, nfs or smbsrv are unloadable modules. + */ +int (*znfsexport_fs)(void *arg); +int (*zshare_fs)(enum sharefs_sys_op, share_t *, uint32_t); +int (*zsmbexport_fs)(void *arg, boolean_t add_share); + +int zfs_nfsshare_inited; +int zfs_smbshare_inited; + +ddi_modhandle_t nfs_mod; +ddi_modhandle_t sharefs_mod; +ddi_modhandle_t smbsrv_mod; +#endif /* illumos */ +kmutex_t zfs_share_lock; + +#ifdef illumos +static int +zfs_init_sharefs() +{ + int error; + + ASSERT(MUTEX_HELD(&zfs_share_lock)); + /* Both NFS and SMB shares also require sharetab support. 
*/ + if (sharefs_mod == NULL && ((sharefs_mod = + ddi_modopen("fs/sharefs", + KRTLD_MODE_FIRST, &error)) == NULL)) { + return (SET_ERROR(ENOSYS)); + } + if (zshare_fs == NULL && ((zshare_fs = + (int (*)(enum sharefs_sys_op, share_t *, uint32_t)) + ddi_modsym(sharefs_mod, "sharefs_impl", &error)) == NULL)) { + return (SET_ERROR(ENOSYS)); + } + return (0); +} +#endif /* illumos */ + +static int +zfs_ioc_share(zfs_cmd_t *zc) +{ +#ifdef illumos + int error; + int opcode; + + switch (zc->zc_share.z_sharetype) { + case ZFS_SHARE_NFS: + case ZFS_UNSHARE_NFS: + if (zfs_nfsshare_inited == 0) { + mutex_enter(&zfs_share_lock); + if (nfs_mod == NULL && ((nfs_mod = ddi_modopen("fs/nfs", + KRTLD_MODE_FIRST, &error)) == NULL)) { + mutex_exit(&zfs_share_lock); + return (SET_ERROR(ENOSYS)); + } + if (znfsexport_fs == NULL && + ((znfsexport_fs = (int (*)(void *)) + ddi_modsym(nfs_mod, + "nfs_export", &error)) == NULL)) { + mutex_exit(&zfs_share_lock); + return (SET_ERROR(ENOSYS)); + } + error = zfs_init_sharefs(); + if (error != 0) { + mutex_exit(&zfs_share_lock); + return (SET_ERROR(ENOSYS)); + } + zfs_nfsshare_inited = 1; + mutex_exit(&zfs_share_lock); + } + break; + case ZFS_SHARE_SMB: + case ZFS_UNSHARE_SMB: + if (zfs_smbshare_inited == 0) { + mutex_enter(&zfs_share_lock); + if (smbsrv_mod == NULL && ((smbsrv_mod = + ddi_modopen("drv/smbsrv", + KRTLD_MODE_FIRST, &error)) == NULL)) { + mutex_exit(&zfs_share_lock); + return (SET_ERROR(ENOSYS)); + } + if (zsmbexport_fs == NULL && ((zsmbexport_fs = + (int (*)(void *, boolean_t))ddi_modsym(smbsrv_mod, + "smb_server_share", &error)) == NULL)) { + mutex_exit(&zfs_share_lock); + return (SET_ERROR(ENOSYS)); + } + error = zfs_init_sharefs(); + if (error != 0) { + mutex_exit(&zfs_share_lock); + return (SET_ERROR(ENOSYS)); + } + zfs_smbshare_inited = 1; + mutex_exit(&zfs_share_lock); + } + break; + default: + return (SET_ERROR(EINVAL)); + } + + switch (zc->zc_share.z_sharetype) { + case ZFS_SHARE_NFS: + case ZFS_UNSHARE_NFS: + if (error = + znfsexport_fs((void *) + (uintptr_t)zc->zc_share.z_exportdata)) + return (error); + break; + case ZFS_SHARE_SMB: + case ZFS_UNSHARE_SMB: + if (error = zsmbexport_fs((void *) + (uintptr_t)zc->zc_share.z_exportdata, + zc->zc_share.z_sharetype == ZFS_SHARE_SMB ? + B_TRUE: B_FALSE)) { + return (error); + } + break; + } + + opcode = (zc->zc_share.z_sharetype == ZFS_SHARE_NFS || + zc->zc_share.z_sharetype == ZFS_SHARE_SMB) ? 
+ SHAREFS_ADD : SHAREFS_REMOVE; + + /* + * Add or remove share from sharetab + */ + error = zshare_fs(opcode, + (void *)(uintptr_t)zc->zc_share.z_sharedata, + zc->zc_share.z_sharemax); + + return (error); + +#else /* !illumos */ + return (ENOSYS); +#endif /* illumos */ +} + +ace_t full_access[] = { + {(uid_t)-1, ACE_ALL_PERMS, ACE_EVERYONE, 0} +}; + +/* + * inputs: + * zc_name name of containing filesystem + * zc_obj object # beyond which we want next in-use object # + * + * outputs: + * zc_obj next in-use object # + */ +static int +zfs_ioc_next_obj(zfs_cmd_t *zc) +{ + objset_t *os = NULL; + int error; + + error = dmu_objset_hold(zc->zc_name, FTAG, &os); + if (error != 0) + return (error); + + error = dmu_object_next(os, &zc->zc_obj, B_FALSE, + dsl_dataset_phys(os->os_dsl_dataset)->ds_prev_snap_txg); + + dmu_objset_rele(os, FTAG); + return (error); +} + +/* + * inputs: + * zc_name name of filesystem + * zc_value prefix name for snapshot + * zc_cleanup_fd cleanup-on-exit file descriptor for calling process + * + * outputs: + * zc_value short name of new snapshot + */ +static int +zfs_ioc_tmp_snapshot(zfs_cmd_t *zc) +{ + char *snap_name; + char *hold_name; + int error; + minor_t minor; + + error = zfs_onexit_fd_hold(zc->zc_cleanup_fd, &minor); + if (error != 0) + return (error); + + snap_name = kmem_asprintf("%s-%016llx", zc->zc_value, + (u_longlong_t)ddi_get_lbolt64()); + hold_name = kmem_asprintf("%%%s", zc->zc_value); + + error = dsl_dataset_snapshot_tmp(zc->zc_name, snap_name, minor, + hold_name); + if (error == 0) + (void) strcpy(zc->zc_value, snap_name); + strfree(snap_name); + strfree(hold_name); + zfs_onexit_fd_rele(zc->zc_cleanup_fd); + return (error); +} + +/* + * inputs: + * zc_name name of "to" snapshot + * zc_value name of "from" snapshot + * zc_cookie file descriptor to write diff data on + * + * outputs: + * dmu_diff_record_t's to the file descriptor + */ +static int +zfs_ioc_diff(zfs_cmd_t *zc) +{ + file_t *fp; + offset_t off; + int error; + +#ifdef illumos + fp = getf(zc->zc_cookie); +#else + fget_write(curthread, zc->zc_cookie, &cap_write_rights, &fp); +#endif + if (fp == NULL) + return (SET_ERROR(EBADF)); + + off = fp->f_offset; + +#ifdef illumos + error = dmu_diff(zc->zc_name, zc->zc_value, fp->f_vnode, &off); +#else + error = dmu_diff(zc->zc_name, zc->zc_value, fp, &off); +#endif + + if (off >= 0 && off <= MAXOFFSET_T) + fp->f_offset = off; + releasef(zc->zc_cookie); + + return (error); +} + +#ifdef illumos +/* + * Remove all ACL files in shares dir + */ +static int +zfs_smb_acl_purge(znode_t *dzp) +{ + zap_cursor_t zc; + zap_attribute_t zap; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + int error; + + for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id); + (error = zap_cursor_retrieve(&zc, &zap)) == 0; + zap_cursor_advance(&zc)) { + if ((error = VOP_REMOVE(ZTOV(dzp), zap.za_name, kcred, + NULL, 0)) != 0) + break; + } + zap_cursor_fini(&zc); + return (error); +} +#endif /* illumos */ + +static int +zfs_ioc_smb_acl(zfs_cmd_t *zc) +{ +#ifdef illumos + vnode_t *vp; + znode_t *dzp; + vnode_t *resourcevp = NULL; + znode_t *sharedir; + zfsvfs_t *zfsvfs; + nvlist_t *nvlist; + char *src, *target; + vattr_t vattr; + vsecattr_t vsec; + int error = 0; + + if ((error = lookupname(zc->zc_value, UIO_SYSSPACE, + NO_FOLLOW, NULL, &vp)) != 0) + return (error); + + /* Now make sure mntpnt and dataset are ZFS */ + + if (strcmp(vp->v_vfsp->mnt_stat.f_fstypename, "zfs") != 0 || + (strcmp((char *)refstr_value(vp->v_vfsp->vfs_resource), + zc->zc_name) != 0)) { + VN_RELE(vp); + return 
(SET_ERROR(EINVAL));
+ }
+
+ dzp = VTOZ(vp);
+ zfsvfs = dzp->z_zfsvfs;
+ ZFS_ENTER(zfsvfs);
+
+ /*
+ * Create share dir if it's missing.
+ */
+ mutex_enter(&zfsvfs->z_lock);
+ if (zfsvfs->z_shares_dir == 0) {
+ dmu_tx_t *tx;
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, TRUE,
+ ZFS_SHARES_DIR);
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error != 0) {
+ dmu_tx_abort(tx);
+ } else {
+ error = zfs_create_share_dir(zfsvfs, tx);
+ dmu_tx_commit(tx);
+ }
+ if (error != 0) {
+ mutex_exit(&zfsvfs->z_lock);
+ VN_RELE(vp);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ }
+ mutex_exit(&zfsvfs->z_lock);
+
+ ASSERT(zfsvfs->z_shares_dir);
+ if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &sharedir)) != 0) {
+ VN_RELE(vp);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ switch (zc->zc_cookie) {
+ case ZFS_SMB_ACL_ADD:
+ vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
+ vattr.va_type = VREG;
+ vattr.va_mode = S_IFREG|0777;
+ vattr.va_uid = 0;
+ vattr.va_gid = 0;
+
+ vsec.vsa_mask = VSA_ACE;
+ vsec.vsa_aclentp = &full_access;
+ vsec.vsa_aclentsz = sizeof (full_access);
+ vsec.vsa_aclcnt = 1;
+
+ error = VOP_CREATE(ZTOV(sharedir), zc->zc_string,
+ &vattr, EXCL, 0, &resourcevp, kcred, 0, NULL, &vsec);
+ if (resourcevp)
+ VN_RELE(resourcevp);
+ break;
+
+ case ZFS_SMB_ACL_REMOVE:
+ error = VOP_REMOVE(ZTOV(sharedir), zc->zc_string, kcred,
+ NULL, 0);
+ break;
+
+ case ZFS_SMB_ACL_RENAME:
+ if ((error = get_nvlist(zc->zc_nvlist_src,
+ zc->zc_nvlist_src_size, zc->zc_iflags, &nvlist)) != 0) {
+ VN_RELE(vp);
+ VN_RELE(ZTOV(sharedir));
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ if (nvlist_lookup_string(nvlist, ZFS_SMB_ACL_SRC, &src) ||
+ nvlist_lookup_string(nvlist, ZFS_SMB_ACL_TARGET,
+ &target)) {
+ VN_RELE(vp);
+ VN_RELE(ZTOV(sharedir));
+ ZFS_EXIT(zfsvfs);
+ nvlist_free(nvlist);
+ return (SET_ERROR(EINVAL));
+ }
+ error = VOP_RENAME(ZTOV(sharedir), src, ZTOV(sharedir), target,
+ kcred, NULL, 0);
+ nvlist_free(nvlist);
+ break;
+
+ case ZFS_SMB_ACL_PURGE:
+ error = zfs_smb_acl_purge(sharedir);
+ break;
+
+ default:
+ error = SET_ERROR(EINVAL);
+ break;
+ }
+
+ VN_RELE(vp);
+ VN_RELE(ZTOV(sharedir));
+
+ ZFS_EXIT(zfsvfs);
+
+ return (error);
+#else /* !illumos */
+ return (EOPNOTSUPP);
+#endif /* illumos */
+}
+
+/*
+ * innvl: {
+ * "holds" -> { snapname -> holdname (string), ... }
+ * (optional) "cleanup_fd" -> fd (int32)
+ * }
+ *
+ * outnvl: {
+ * snapname -> error value (int32)
+ * ...
+ * }
+ */
+/* ARGSUSED */
+static int
+zfs_ioc_hold(const char *pool, nvlist_t *args, nvlist_t *errlist)
+{
+ nvpair_t *pair;
+ nvlist_t *holds;
+ int cleanup_fd = -1;
+ int error;
+ minor_t minor = 0;
+
+ error = nvlist_lookup_nvlist(args, "holds", &holds);
+ if (error != 0)
+ return (SET_ERROR(EINVAL));
+
+ /* make sure the user didn't pass us any invalid (empty) tags */
+ for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL;
+ pair = nvlist_next_nvpair(holds, pair)) {
+ char *htag;
+
+ error = nvpair_value_string(pair, &htag);
+ if (error != 0)
+ return (SET_ERROR(error));
+
+ if (strlen(htag) == 0)
+ return (SET_ERROR(EINVAL));
+ }
+
+ if (nvlist_lookup_int32(args, "cleanup_fd", &cleanup_fd) == 0) {
+ error = zfs_onexit_fd_hold(cleanup_fd, &minor);
+ if (error != 0)
+ return (error);
+ }
+
+ error = dsl_dataset_user_hold(holds, minor, errlist);
+ if (minor != 0)
+ zfs_onexit_fd_rele(cleanup_fd);
+ return (error);
+}
+
+/*
+ * innvl is not used.
+ *
+ * outnvl: {
+ * holdname -> time added (uint64 seconds since epoch)
+ * ...
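+ * e.g. { "mytag" -> 1428939361 } (illustrative hold name and timestamp)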
+ * } + */ +/* ARGSUSED */ +static int +zfs_ioc_get_holds(const char *snapname, nvlist_t *args, nvlist_t *outnvl) +{ + return (dsl_dataset_get_holds(snapname, outnvl)); +} + +/* + * innvl: { + * snapname -> { holdname, ... } + * ... + * } + * + * outnvl: { + * snapname -> error value (int32) + * ... + * } + */ +/* ARGSUSED */ +static int +zfs_ioc_release(const char *pool, nvlist_t *holds, nvlist_t *errlist) +{ + return (dsl_dataset_user_release(holds, errlist)); +} + +/* + * inputs: + * zc_name name of new filesystem or snapshot + * zc_value full name of old snapshot + * + * outputs: + * zc_cookie space in bytes + * zc_objset_type compressed space in bytes + * zc_perm_action uncompressed space in bytes + */ +static int +zfs_ioc_space_written(zfs_cmd_t *zc) +{ + int error; + dsl_pool_t *dp; + dsl_dataset_t *new, *old; + + error = dsl_pool_hold(zc->zc_name, FTAG, &dp); + if (error != 0) + return (error); + error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &new); + if (error != 0) { + dsl_pool_rele(dp, FTAG); + return (error); + } + error = dsl_dataset_hold(dp, zc->zc_value, FTAG, &old); + if (error != 0) { + dsl_dataset_rele(new, FTAG); + dsl_pool_rele(dp, FTAG); + return (error); + } + + error = dsl_dataset_space_written(old, new, &zc->zc_cookie, + &zc->zc_objset_type, &zc->zc_perm_action); + dsl_dataset_rele(old, FTAG); + dsl_dataset_rele(new, FTAG); + dsl_pool_rele(dp, FTAG); + return (error); +} + +/* + * innvl: { + * "firstsnap" -> snapshot name + * } + * + * outnvl: { + * "used" -> space in bytes + * "compressed" -> compressed space in bytes + * "uncompressed" -> uncompressed space in bytes + * } + */ +static int +zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl) +{ + int error; + dsl_pool_t *dp; + dsl_dataset_t *new, *old; + char *firstsnap; + uint64_t used, comp, uncomp; + + if (nvlist_lookup_string(innvl, "firstsnap", &firstsnap) != 0) + return (SET_ERROR(EINVAL)); + + error = dsl_pool_hold(lastsnap, FTAG, &dp); + if (error != 0) + return (error); + + error = dsl_dataset_hold(dp, lastsnap, FTAG, &new); + if (error == 0 && !new->ds_is_snapshot) { + dsl_dataset_rele(new, FTAG); + error = SET_ERROR(EINVAL); + } + if (error != 0) { + dsl_pool_rele(dp, FTAG); + return (error); + } + error = dsl_dataset_hold(dp, firstsnap, FTAG, &old); + if (error == 0 && !old->ds_is_snapshot) { + dsl_dataset_rele(old, FTAG); + error = SET_ERROR(EINVAL); + } + if (error != 0) { + dsl_dataset_rele(new, FTAG); + dsl_pool_rele(dp, FTAG); + return (error); + } + + error = dsl_dataset_space_wouldfree(old, new, &used, &comp, &uncomp); + dsl_dataset_rele(old, FTAG); + dsl_dataset_rele(new, FTAG); + dsl_pool_rele(dp, FTAG); + fnvlist_add_uint64(outnvl, "used", used); + fnvlist_add_uint64(outnvl, "compressed", comp); + fnvlist_add_uint64(outnvl, "uncompressed", uncomp); + return (error); +} + +static int +zfs_ioc_jail(zfs_cmd_t *zc) +{ + + return (zone_dataset_attach(curthread->td_ucred, zc->zc_name, + (int)zc->zc_jailid)); +} + +static int +zfs_ioc_unjail(zfs_cmd_t *zc) +{ + + return (zone_dataset_detach(curthread->td_ucred, zc->zc_name, + (int)zc->zc_jailid)); +} + +/* + * innvl: { + * "fd" -> file descriptor to write stream to (int32) + * (optional) "fromsnap" -> full snap name to send an incremental from + * (optional) "largeblockok" -> (value ignored) + * indicates that blocks > 128KB are permitted + * (optional) "embedok" -> (value ignored) + * presence indicates DRR_WRITE_EMBEDDED records are permitted + * (optional) "compressok" -> (value ignored) + * presence indicates compressed 
DRR_WRITE records are permitted
+ * (optional) "resume_object" and "resume_offset" -> (uint64)
+ * if present, resume send stream from specified object and offset.
+ * }
+ *
+ * outnvl is unused
+ */
+/* ARGSUSED */
+static int
+zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ file_t *fp;
+ int error;
+ offset_t off;
+ char *fromname = NULL;
+ int fd;
+ boolean_t largeblockok;
+ boolean_t embedok;
+ boolean_t compressok;
+ uint64_t resumeobj = 0;
+ uint64_t resumeoff = 0;
+
+ error = nvlist_lookup_int32(innvl, "fd", &fd);
+ if (error != 0)
+ return (SET_ERROR(EINVAL));
+
+ (void) nvlist_lookup_string(innvl, "fromsnap", &fromname);
+
+ largeblockok = nvlist_exists(innvl, "largeblockok");
+ embedok = nvlist_exists(innvl, "embedok");
+ compressok = nvlist_exists(innvl, "compressok");
+
+ (void) nvlist_lookup_uint64(innvl, "resume_object", &resumeobj);
+ (void) nvlist_lookup_uint64(innvl, "resume_offset", &resumeoff);
+
+#ifdef illumos
+ fp = getf(fd);
+#else
+ fget_write(curthread, fd, &cap_write_rights, &fp);
+#endif
+ if (fp == NULL)
+ return (SET_ERROR(EBADF));
+
+ off = fp->f_offset;
+ error = dmu_send(snapname, fromname, embedok, largeblockok, compressok,
+#ifdef illumos
+ fd, resumeobj, resumeoff, fp->f_vnode, &off);
+#else
+ fd, resumeobj, resumeoff, fp, &off);
+#endif
+
+#ifdef illumos
+ if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0)
+ fp->f_offset = off;
+#else
+ fp->f_offset = off;
+#endif
+
+ releasef(fd);
+ return (error);
+}
+
+/*
+ * Determine approximately how large a zfs send stream will be -- the number
+ * of bytes that will be written to the fd supplied to zfs_ioc_send_new().
+ *
+ * innvl: {
+ * (optional) "from" -> full snap or bookmark name to send an incremental
+ * from
+ * (optional) "largeblockok" -> (value ignored)
+ * indicates that blocks > 128KB are permitted
+ * (optional) "embedok" -> (value ignored)
+ * presence indicates DRR_WRITE_EMBEDDED records are permitted
+ * (optional) "compressok" -> (value ignored)
+ * presence indicates compressed DRR_WRITE records are permitted
+ * }
+ *
+ * outnvl: {
+ * "space" -> bytes of space (uint64)
+ * }
+ */
+static int
+zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ dsl_pool_t *dp;
+ dsl_dataset_t *tosnap;
+ int error;
+ char *fromname;
+ boolean_t compressok;
+ uint64_t space;
+
+ error = dsl_pool_hold(snapname, FTAG, &dp);
+ if (error != 0)
+ return (error);
+
+ error = dsl_dataset_hold(dp, snapname, FTAG, &tosnap);
+ if (error != 0) {
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+ }
+
+ compressok = nvlist_exists(innvl, "compressok");
+
+ error = nvlist_lookup_string(innvl, "from", &fromname);
+ if (error == 0) {
+ if (strchr(fromname, '@') != NULL) {
+ /*
+ * If from is a snapshot, hold it and use the more
+ * efficient dmu_send_estimate to estimate send space
+ * size using deadlists.
+ */
+ dsl_dataset_t *fromsnap;
+ error = dsl_dataset_hold(dp, fromname, FTAG, &fromsnap);
+ if (error != 0)
+ goto out;
+ error = dmu_send_estimate(tosnap, fromsnap, compressok,
+ &space);
+ dsl_dataset_rele(fromsnap, FTAG);
+ } else if (strchr(fromname, '#') != NULL) {
+ /*
+ * If from is a bookmark, fetch the creation TXG of the
+ * snapshot it was created from and use that to find
+ * blocks that were born after it.
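+ * (Only the bookmark's zbm_creation_txg is needed for this, which is
+ * why dmu_send_estimate_from_txg() is used here rather than the
+ * deadlist-based dmu_send_estimate().)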
+ */ + zfs_bookmark_phys_t frombm; + + error = dsl_bookmark_lookup(dp, fromname, tosnap, + &frombm); + if (error != 0) + goto out; + error = dmu_send_estimate_from_txg(tosnap, + frombm.zbm_creation_txg, compressok, &space); + } else { + /* + * from is not properly formatted as a snapshot or + * bookmark + */ + error = SET_ERROR(EINVAL); + goto out; + } + } else { + /* + * If estimating the size of a full send, use dmu_send_estimate. + */ + error = dmu_send_estimate(tosnap, NULL, compressok, &space); + } + + fnvlist_add_uint64(outnvl, "space", space); + +out: + dsl_dataset_rele(tosnap, FTAG); + dsl_pool_rele(dp, FTAG); + return (error); +} + +static zfs_ioc_vec_t zfs_ioc_vec[ZFS_IOC_LAST - ZFS_IOC_FIRST]; + +static void +zfs_ioctl_register_legacy(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, + zfs_secpolicy_func_t *secpolicy, zfs_ioc_namecheck_t namecheck, + boolean_t log_history, zfs_ioc_poolcheck_t pool_check) +{ + zfs_ioc_vec_t *vec = &zfs_ioc_vec[ioc - ZFS_IOC_FIRST]; + + ASSERT3U(ioc, >=, ZFS_IOC_FIRST); + ASSERT3U(ioc, <, ZFS_IOC_LAST); + ASSERT3P(vec->zvec_legacy_func, ==, NULL); + ASSERT3P(vec->zvec_func, ==, NULL); + + vec->zvec_legacy_func = func; + vec->zvec_secpolicy = secpolicy; + vec->zvec_namecheck = namecheck; + vec->zvec_allow_log = log_history; + vec->zvec_pool_check = pool_check; +} + +/* + * See the block comment at the beginning of this file for details on + * each argument to this function. + */ +static void +zfs_ioctl_register(const char *name, zfs_ioc_t ioc, zfs_ioc_func_t *func, + zfs_secpolicy_func_t *secpolicy, zfs_ioc_namecheck_t namecheck, + zfs_ioc_poolcheck_t pool_check, boolean_t smush_outnvlist, + boolean_t allow_log) +{ + zfs_ioc_vec_t *vec = &zfs_ioc_vec[ioc - ZFS_IOC_FIRST]; + + ASSERT3U(ioc, >=, ZFS_IOC_FIRST); + ASSERT3U(ioc, <, ZFS_IOC_LAST); + ASSERT3P(vec->zvec_legacy_func, ==, NULL); + ASSERT3P(vec->zvec_func, ==, NULL); + + /* if we are logging, the name must be valid */ + ASSERT(!allow_log || namecheck != NO_NAME); + + vec->zvec_name = name; + vec->zvec_func = func; + vec->zvec_secpolicy = secpolicy; + vec->zvec_namecheck = namecheck; + vec->zvec_pool_check = pool_check; + vec->zvec_smush_outnvlist = smush_outnvlist; + vec->zvec_allow_log = allow_log; +} + +static void +zfs_ioctl_register_pool(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, + zfs_secpolicy_func_t *secpolicy, boolean_t log_history, + zfs_ioc_poolcheck_t pool_check) +{ + zfs_ioctl_register_legacy(ioc, func, secpolicy, + POOL_NAME, log_history, pool_check); +} + +static void +zfs_ioctl_register_dataset_nolog(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, + zfs_secpolicy_func_t *secpolicy, zfs_ioc_poolcheck_t pool_check) +{ + zfs_ioctl_register_legacy(ioc, func, secpolicy, + DATASET_NAME, B_FALSE, pool_check); +} + +static void +zfs_ioctl_register_pool_modify(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func) +{ + zfs_ioctl_register_legacy(ioc, func, zfs_secpolicy_config, + POOL_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); +} + +static void +zfs_ioctl_register_pool_meta(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, + zfs_secpolicy_func_t *secpolicy) +{ + zfs_ioctl_register_legacy(ioc, func, secpolicy, + NO_NAME, B_FALSE, POOL_CHECK_NONE); +} + +static void +zfs_ioctl_register_dataset_read_secpolicy(zfs_ioc_t ioc, + zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy) +{ + zfs_ioctl_register_legacy(ioc, func, secpolicy, + DATASET_NAME, B_FALSE, POOL_CHECK_SUSPENDED); +} + +static void +zfs_ioctl_register_dataset_read(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func) +{ + 
zfs_ioctl_register_dataset_read_secpolicy(ioc, func, + zfs_secpolicy_read); +} + +static void +zfs_ioctl_register_dataset_modify(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, + zfs_secpolicy_func_t *secpolicy) +{ + zfs_ioctl_register_legacy(ioc, func, secpolicy, + DATASET_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); +} + +static void +zfs_ioctl_init(void) +{ + zfs_ioctl_register("snapshot", ZFS_IOC_SNAPSHOT, + zfs_ioc_snapshot, zfs_secpolicy_snapshot, POOL_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); + + zfs_ioctl_register("log_history", ZFS_IOC_LOG_HISTORY, + zfs_ioc_log_history, zfs_secpolicy_log_history, NO_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE); + + zfs_ioctl_register("space_snaps", ZFS_IOC_SPACE_SNAPS, + zfs_ioc_space_snaps, zfs_secpolicy_read, DATASET_NAME, + POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE); + + zfs_ioctl_register("send", ZFS_IOC_SEND_NEW, + zfs_ioc_send_new, zfs_secpolicy_send_new, DATASET_NAME, + POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE); + + zfs_ioctl_register("send_space", ZFS_IOC_SEND_SPACE, + zfs_ioc_send_space, zfs_secpolicy_read, DATASET_NAME, + POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE); + + zfs_ioctl_register("create", ZFS_IOC_CREATE, + zfs_ioc_create, zfs_secpolicy_create_clone, DATASET_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); + + zfs_ioctl_register("clone", ZFS_IOC_CLONE, + zfs_ioc_clone, zfs_secpolicy_create_clone, DATASET_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); + + zfs_ioctl_register("remap", ZFS_IOC_REMAP, + zfs_ioc_remap, zfs_secpolicy_remap, DATASET_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE); + + zfs_ioctl_register("destroy_snaps", ZFS_IOC_DESTROY_SNAPS, + zfs_ioc_destroy_snaps, zfs_secpolicy_destroy_snaps, POOL_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); + + zfs_ioctl_register("hold", ZFS_IOC_HOLD, + zfs_ioc_hold, zfs_secpolicy_hold, POOL_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); + zfs_ioctl_register("release", ZFS_IOC_RELEASE, + zfs_ioc_release, zfs_secpolicy_release, POOL_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); + + zfs_ioctl_register("get_holds", ZFS_IOC_GET_HOLDS, + zfs_ioc_get_holds, zfs_secpolicy_read, DATASET_NAME, + POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE); + + zfs_ioctl_register("rollback", ZFS_IOC_ROLLBACK, + zfs_ioc_rollback, zfs_secpolicy_rollback, DATASET_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE); + + zfs_ioctl_register("bookmark", ZFS_IOC_BOOKMARK, + zfs_ioc_bookmark, zfs_secpolicy_bookmark, POOL_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); + + zfs_ioctl_register("get_bookmarks", ZFS_IOC_GET_BOOKMARKS, + zfs_ioc_get_bookmarks, zfs_secpolicy_read, DATASET_NAME, + POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE); + + zfs_ioctl_register("destroy_bookmarks", ZFS_IOC_DESTROY_BOOKMARKS, + zfs_ioc_destroy_bookmarks, zfs_secpolicy_destroy_bookmarks, + POOL_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); + + zfs_ioctl_register("channel_program", ZFS_IOC_CHANNEL_PROGRAM, + zfs_ioc_channel_program, zfs_secpolicy_config, + POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, + B_TRUE); + + zfs_ioctl_register("zpool_checkpoint", ZFS_IOC_POOL_CHECKPOINT, + zfs_ioc_pool_checkpoint, zfs_secpolicy_config, POOL_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); + + zfs_ioctl_register("zpool_discard_checkpoint", + ZFS_IOC_POOL_DISCARD_CHECKPOINT, 
zfs_ioc_pool_discard_checkpoint, + zfs_secpolicy_config, POOL_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); + + zfs_ioctl_register("initialize", ZFS_IOC_POOL_INITIALIZE, + zfs_ioc_pool_initialize, zfs_secpolicy_config, POOL_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); + + /* IOCTLS that use the legacy function signature */ + + zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze, + zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_READONLY); + + zfs_ioctl_register_pool(ZFS_IOC_POOL_CREATE, zfs_ioc_pool_create, + zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE); + zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_SCAN, + zfs_ioc_pool_scan); + zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_UPGRADE, + zfs_ioc_pool_upgrade); + zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_ADD, + zfs_ioc_vdev_add); + zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_REMOVE, + zfs_ioc_vdev_remove); + zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SET_STATE, + zfs_ioc_vdev_set_state); + zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_ATTACH, + zfs_ioc_vdev_attach); + zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_DETACH, + zfs_ioc_vdev_detach); + zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SETPATH, + zfs_ioc_vdev_setpath); + zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SETFRU, + zfs_ioc_vdev_setfru); + zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_SET_PROPS, + zfs_ioc_pool_set_props); + zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SPLIT, + zfs_ioc_vdev_split); + zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_REGUID, + zfs_ioc_pool_reguid); + + zfs_ioctl_register_pool_meta(ZFS_IOC_POOL_CONFIGS, + zfs_ioc_pool_configs, zfs_secpolicy_none); + zfs_ioctl_register_pool_meta(ZFS_IOC_POOL_TRYIMPORT, + zfs_ioc_pool_tryimport, zfs_secpolicy_config); + zfs_ioctl_register_pool_meta(ZFS_IOC_INJECT_FAULT, + zfs_ioc_inject_fault, zfs_secpolicy_inject); + zfs_ioctl_register_pool_meta(ZFS_IOC_CLEAR_FAULT, + zfs_ioc_clear_fault, zfs_secpolicy_inject); + zfs_ioctl_register_pool_meta(ZFS_IOC_INJECT_LIST_NEXT, + zfs_ioc_inject_list_next, zfs_secpolicy_inject); + + /* + * pool destroy, and export don't log the history as part of + * zfsdev_ioctl, but rather zfs_ioc_pool_export + * does the logging of those commands. 
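+ * (Hence both are registered with log_history = B_FALSE below.)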
+ */ + zfs_ioctl_register_pool(ZFS_IOC_POOL_DESTROY, zfs_ioc_pool_destroy, + zfs_secpolicy_config, B_FALSE, POOL_CHECK_NONE); + zfs_ioctl_register_pool(ZFS_IOC_POOL_EXPORT, zfs_ioc_pool_export, + zfs_secpolicy_config, B_FALSE, POOL_CHECK_NONE); + + zfs_ioctl_register_pool(ZFS_IOC_POOL_STATS, zfs_ioc_pool_stats, + zfs_secpolicy_read, B_FALSE, POOL_CHECK_NONE); + zfs_ioctl_register_pool(ZFS_IOC_POOL_GET_PROPS, zfs_ioc_pool_get_props, + zfs_secpolicy_read, B_FALSE, POOL_CHECK_NONE); + + zfs_ioctl_register_pool(ZFS_IOC_ERROR_LOG, zfs_ioc_error_log, + zfs_secpolicy_inject, B_FALSE, POOL_CHECK_NONE); + zfs_ioctl_register_pool(ZFS_IOC_DSOBJ_TO_DSNAME, + zfs_ioc_dsobj_to_dsname, + zfs_secpolicy_diff, B_FALSE, POOL_CHECK_NONE); + zfs_ioctl_register_pool(ZFS_IOC_POOL_GET_HISTORY, + zfs_ioc_pool_get_history, + zfs_secpolicy_config, B_FALSE, POOL_CHECK_SUSPENDED); + + zfs_ioctl_register_pool(ZFS_IOC_POOL_IMPORT, zfs_ioc_pool_import, + zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE); + + zfs_ioctl_register_pool(ZFS_IOC_CLEAR, zfs_ioc_clear, + zfs_secpolicy_config, B_TRUE, POOL_CHECK_READONLY); + zfs_ioctl_register_pool(ZFS_IOC_POOL_REOPEN, zfs_ioc_pool_reopen, + zfs_secpolicy_config, B_TRUE, POOL_CHECK_SUSPENDED); + + zfs_ioctl_register_dataset_read(ZFS_IOC_SPACE_WRITTEN, + zfs_ioc_space_written); + zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_RECVD_PROPS, + zfs_ioc_objset_recvd_props); + zfs_ioctl_register_dataset_read(ZFS_IOC_NEXT_OBJ, + zfs_ioc_next_obj); + zfs_ioctl_register_dataset_read(ZFS_IOC_GET_FSACL, + zfs_ioc_get_fsacl); + zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_STATS, + zfs_ioc_objset_stats); + zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_ZPLPROPS, + zfs_ioc_objset_zplprops); + zfs_ioctl_register_dataset_read(ZFS_IOC_DATASET_LIST_NEXT, + zfs_ioc_dataset_list_next); + zfs_ioctl_register_dataset_read(ZFS_IOC_SNAPSHOT_LIST_NEXT, + zfs_ioc_snapshot_list_next); + zfs_ioctl_register_dataset_read(ZFS_IOC_SEND_PROGRESS, + zfs_ioc_send_progress); + + zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_DIFF, + zfs_ioc_diff, zfs_secpolicy_diff); + zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_OBJ_TO_STATS, + zfs_ioc_obj_to_stats, zfs_secpolicy_diff); + zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_OBJ_TO_PATH, + zfs_ioc_obj_to_path, zfs_secpolicy_diff); + zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_USERSPACE_ONE, + zfs_ioc_userspace_one, zfs_secpolicy_userspace_one); + zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_USERSPACE_MANY, + zfs_ioc_userspace_many, zfs_secpolicy_userspace_many); + zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_SEND, + zfs_ioc_send, zfs_secpolicy_send); + + zfs_ioctl_register_dataset_modify(ZFS_IOC_SET_PROP, zfs_ioc_set_prop, + zfs_secpolicy_none); + zfs_ioctl_register_dataset_modify(ZFS_IOC_DESTROY, zfs_ioc_destroy, + zfs_secpolicy_destroy); + zfs_ioctl_register_dataset_modify(ZFS_IOC_RENAME, zfs_ioc_rename, + zfs_secpolicy_rename); + zfs_ioctl_register_dataset_modify(ZFS_IOC_RECV, zfs_ioc_recv, + zfs_secpolicy_recv); + zfs_ioctl_register_dataset_modify(ZFS_IOC_PROMOTE, zfs_ioc_promote, + zfs_secpolicy_promote); + zfs_ioctl_register_dataset_modify(ZFS_IOC_INHERIT_PROP, + zfs_ioc_inherit_prop, zfs_secpolicy_inherit_prop); + zfs_ioctl_register_dataset_modify(ZFS_IOC_SET_FSACL, zfs_ioc_set_fsacl, + zfs_secpolicy_set_fsacl); + + zfs_ioctl_register_dataset_nolog(ZFS_IOC_SHARE, zfs_ioc_share, + zfs_secpolicy_share, POOL_CHECK_NONE); + zfs_ioctl_register_dataset_nolog(ZFS_IOC_SMB_ACL, zfs_ioc_smb_acl, + zfs_secpolicy_smb_acl, POOL_CHECK_NONE); + 
zfs_ioctl_register_dataset_nolog(ZFS_IOC_USERSPACE_UPGRADE, + zfs_ioc_userspace_upgrade, zfs_secpolicy_userspace_upgrade, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); + zfs_ioctl_register_dataset_nolog(ZFS_IOC_TMP_SNAPSHOT, + zfs_ioc_tmp_snapshot, zfs_secpolicy_tmp_snapshot, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); + +#ifdef __FreeBSD__ + zfs_ioctl_register_dataset_nolog(ZFS_IOC_JAIL, zfs_ioc_jail, + zfs_secpolicy_config, POOL_CHECK_NONE); + zfs_ioctl_register_dataset_nolog(ZFS_IOC_UNJAIL, zfs_ioc_unjail, + zfs_secpolicy_config, POOL_CHECK_NONE); + zfs_ioctl_register("fbsd_nextboot", ZFS_IOC_NEXTBOOT, + zfs_ioc_nextboot, zfs_secpolicy_config, NO_NAME, + POOL_CHECK_NONE, B_FALSE, B_FALSE); +#endif +} + +int +pool_status_check(const char *name, zfs_ioc_namecheck_t type, + zfs_ioc_poolcheck_t check) +{ + spa_t *spa; + int error; + + ASSERT(type == POOL_NAME || type == DATASET_NAME); + + if (check & POOL_CHECK_NONE) + return (0); + + error = spa_open(name, &spa, FTAG); + if (error == 0) { + if ((check & POOL_CHECK_SUSPENDED) && spa_suspended(spa)) + error = SET_ERROR(EAGAIN); + else if ((check & POOL_CHECK_READONLY) && !spa_writeable(spa)) + error = SET_ERROR(EROFS); + spa_close(spa, FTAG); + } + return (error); +} + +/* + * Find a free minor number. + */ +minor_t +zfsdev_minor_alloc(void) +{ + static minor_t last_minor; + minor_t m; + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + for (m = last_minor + 1; m != last_minor; m++) { + if (m > ZFSDEV_MAX_MINOR) + m = 1; + if (ddi_get_soft_state(zfsdev_state, m) == NULL) { + last_minor = m; + return (m); + } + } + + return (0); +} + +static int +zfs_ctldev_init(struct cdev *devp) +{ + minor_t minor; + zfs_soft_state_t *zs; + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + minor = zfsdev_minor_alloc(); + if (minor == 0) + return (SET_ERROR(ENXIO)); + + if (ddi_soft_state_zalloc(zfsdev_state, minor) != DDI_SUCCESS) + return (SET_ERROR(EAGAIN)); + + devfs_set_cdevpriv((void *)(uintptr_t)minor, zfsdev_close); + + zs = ddi_get_soft_state(zfsdev_state, minor); + zs->zss_type = ZSST_CTLDEV; + zfs_onexit_init((zfs_onexit_t **)&zs->zss_data); + + return (0); +} + +static void +zfs_ctldev_destroy(zfs_onexit_t *zo, minor_t minor) +{ + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + zfs_onexit_destroy(zo); + ddi_soft_state_free(zfsdev_state, minor); +} + +void * +zfsdev_get_soft_state(minor_t minor, enum zfs_soft_state_type which) +{ + zfs_soft_state_t *zp; + + zp = ddi_get_soft_state(zfsdev_state, minor); + if (zp == NULL || zp->zss_type != which) + return (NULL); + + return (zp->zss_data); +} + +static int +zfsdev_open(struct cdev *devp, int flag, int mode, struct thread *td) +{ + int error = 0; + +#ifdef illumos + if (getminor(*devp) != 0) + return (zvol_open(devp, flag, otyp, cr)); +#endif + + /* This is the control device. Allocate a new minor if requested. 
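*/
+ /*
+ * A minor allocated here carries zfs_onexit_t state, so cleanup
+ * actions registered against this open (for example, temporary
+ * holds) are torn down in zfsdev_close(); plain opens share
+ * minor 0 and have no per-open state.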
*/ + if (flag & FEXCL) { + mutex_enter(&spa_namespace_lock); + error = zfs_ctldev_init(devp); + mutex_exit(&spa_namespace_lock); + } + + return (error); +} + +static void +zfsdev_close(void *data) +{ + zfs_onexit_t *zo; + minor_t minor = (minor_t)(uintptr_t)data; + + if (minor == 0) + return; + + mutex_enter(&spa_namespace_lock); + zo = zfsdev_get_soft_state(minor, ZSST_CTLDEV); + if (zo == NULL) { + mutex_exit(&spa_namespace_lock); + return; + } + zfs_ctldev_destroy(zo, minor); + mutex_exit(&spa_namespace_lock); +} + +static int +zfsdev_ioctl(struct cdev *dev, u_long zcmd, caddr_t arg, int flag, + struct thread *td) +{ + zfs_cmd_t *zc; + uint_t vecnum; + int error, rc, len; +#ifdef illumos + minor_t minor = getminor(dev); +#else + zfs_iocparm_t *zc_iocparm; + int cflag, cmd, oldvecnum; + boolean_t newioc, compat; + void *compat_zc = NULL; + cred_t *cr = td->td_ucred; +#endif + const zfs_ioc_vec_t *vec; + char *saved_poolname = NULL; + nvlist_t *innvl = NULL; + + cflag = ZFS_CMD_COMPAT_NONE; + compat = B_FALSE; + newioc = B_TRUE; /* "new" style (zfs_iocparm_t) ioctl */ + + len = IOCPARM_LEN(zcmd); + vecnum = cmd = zcmd & 0xff; + + /* + * Check if we are talking to supported older binaries + * and translate zfs_cmd if necessary + */ + if (len != sizeof(zfs_iocparm_t)) { + newioc = B_FALSE; + compat = B_TRUE; + + vecnum = cmd; + + switch (len) { + case sizeof(zfs_cmd_zcmd_t): + cflag = ZFS_CMD_COMPAT_LZC; + break; + case sizeof(zfs_cmd_deadman_t): + cflag = ZFS_CMD_COMPAT_DEADMAN; + break; + case sizeof(zfs_cmd_v28_t): + cflag = ZFS_CMD_COMPAT_V28; + break; + case sizeof(zfs_cmd_v15_t): + if (cmd >= sizeof(zfs_ioctl_v15_to_v28) / + sizeof(zfs_ioctl_v15_to_v28[0])) + return (EINVAL); + + cflag = ZFS_CMD_COMPAT_V15; + vecnum = zfs_ioctl_v15_to_v28[cmd]; + + /* + * Return without further handling + * if the command is blacklisted. 
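+ * (ZFS_IOC_COMPAT_PASS entries report success without doing any work;
+ * ZFS_IOC_COMPAT_FAIL entries are rejected with ENOTSUP.)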
+ */ + if (vecnum == ZFS_IOC_COMPAT_PASS) + return (0); + else if (vecnum == ZFS_IOC_COMPAT_FAIL) + return (ENOTSUP); + break; + default: + return (EINVAL); + } + } + +#ifdef illumos + vecnum = cmd - ZFS_IOC_FIRST; + ASSERT3U(getmajor(dev), ==, ddi_driver_major(zfs_dip)); +#endif + + if (vecnum >= sizeof (zfs_ioc_vec) / sizeof (zfs_ioc_vec[0])) + return (SET_ERROR(EINVAL)); + vec = &zfs_ioc_vec[vecnum]; + + zc = kmem_zalloc(sizeof(zfs_cmd_t), KM_SLEEP); + +#ifdef illumos + error = ddi_copyin((void *)arg, zc, sizeof (zfs_cmd_t), flag); + if (error != 0) { + error = SET_ERROR(EFAULT); + goto out; + } +#else /* !illumos */ + bzero(zc, sizeof(zfs_cmd_t)); + + if (newioc) { + zc_iocparm = (void *)arg; + + switch (zc_iocparm->zfs_ioctl_version) { + case ZFS_IOCVER_CURRENT: + if (zc_iocparm->zfs_cmd_size != sizeof(zfs_cmd_t)) { + error = SET_ERROR(EINVAL); + goto out; + } + break; + case ZFS_IOCVER_INLANES: + if (zc_iocparm->zfs_cmd_size != sizeof(zfs_cmd_inlanes_t)) { + error = SET_ERROR(EFAULT); + goto out; + } + compat = B_TRUE; + cflag = ZFS_CMD_COMPAT_INLANES; + break; + case ZFS_IOCVER_RESUME: + if (zc_iocparm->zfs_cmd_size != sizeof(zfs_cmd_resume_t)) { + error = SET_ERROR(EFAULT); + goto out; + } + compat = B_TRUE; + cflag = ZFS_CMD_COMPAT_RESUME; + break; + case ZFS_IOCVER_EDBP: + if (zc_iocparm->zfs_cmd_size != sizeof(zfs_cmd_edbp_t)) { + error = SET_ERROR(EFAULT); + goto out; + } + compat = B_TRUE; + cflag = ZFS_CMD_COMPAT_EDBP; + break; + case ZFS_IOCVER_ZCMD: + if (zc_iocparm->zfs_cmd_size > sizeof(zfs_cmd_t) || + zc_iocparm->zfs_cmd_size < sizeof(zfs_cmd_zcmd_t)) { + error = SET_ERROR(EFAULT); + goto out; + } + compat = B_TRUE; + cflag = ZFS_CMD_COMPAT_ZCMD; + break; + default: + error = SET_ERROR(EINVAL); + goto out; + /* NOTREACHED */ + } + + if (compat) { + ASSERT(sizeof(zfs_cmd_t) >= zc_iocparm->zfs_cmd_size); + compat_zc = kmem_zalloc(sizeof(zfs_cmd_t), KM_SLEEP); + bzero(compat_zc, sizeof(zfs_cmd_t)); + + error = ddi_copyin((void *)(uintptr_t)zc_iocparm->zfs_cmd, + compat_zc, zc_iocparm->zfs_cmd_size, flag); + if (error != 0) { + error = SET_ERROR(EFAULT); + goto out; + } + } else { + error = ddi_copyin((void *)(uintptr_t)zc_iocparm->zfs_cmd, + zc, zc_iocparm->zfs_cmd_size, flag); + if (error != 0) { + error = SET_ERROR(EFAULT); + goto out; + } + } + } + + if (compat) { + if (newioc) { + ASSERT(compat_zc != NULL); + zfs_cmd_compat_get(zc, compat_zc, cflag); + } else { + ASSERT(compat_zc == NULL); + zfs_cmd_compat_get(zc, arg, cflag); + } + oldvecnum = vecnum; + error = zfs_ioctl_compat_pre(zc, &vecnum, cflag); + if (error != 0) + goto out; + if (oldvecnum != vecnum) + vec = &zfs_ioc_vec[vecnum]; + } +#endif /* !illumos */ + + zc->zc_iflags = flag & FKIOCTL; + if (zc->zc_nvlist_src_size != 0) { + error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + zc->zc_iflags, &innvl); + if (error != 0) + goto out; + } + + /* rewrite innvl for backwards compatibility */ + if (compat) + innvl = zfs_ioctl_compat_innvl(zc, innvl, vecnum, cflag); + + /* + * Ensure that all pool/dataset names are valid before we pass down to + * the lower layers. 
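+ * POOL_NAME vectors accept only a pool name, DATASET_NAME vectors
+ * accept any dataset name, and NO_NAME vectors skip both the name
+ * check and the pool status check.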
+ */
+ zc->zc_name[sizeof (zc->zc_name) - 1] = '\0';
+ switch (vec->zvec_namecheck) {
+ case POOL_NAME:
+ if (pool_namecheck(zc->zc_name, NULL, NULL) != 0)
+ error = SET_ERROR(EINVAL);
+ else
+ error = pool_status_check(zc->zc_name,
+ vec->zvec_namecheck, vec->zvec_pool_check);
+ break;
+
+ case DATASET_NAME:
+ if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0)
+ error = SET_ERROR(EINVAL);
+ else
+ error = pool_status_check(zc->zc_name,
+ vec->zvec_namecheck, vec->zvec_pool_check);
+ break;
+
+ case NO_NAME:
+ break;
+ }
+
+ if (error == 0)
+ error = vec->zvec_secpolicy(zc, innvl, cr);
+
+ if (error != 0)
+ goto out;
+
+ /* legacy ioctls can modify zc_name */
+ len = strcspn(zc->zc_name, "/@#") + 1;
+ saved_poolname = kmem_alloc(len, KM_SLEEP);
+ (void) strlcpy(saved_poolname, zc->zc_name, len);
+
+ if (vec->zvec_func != NULL) {
+ nvlist_t *outnvl;
+ int puterror = 0;
+ spa_t *spa;
+ nvlist_t *lognv = NULL;
+
+ ASSERT(vec->zvec_legacy_func == NULL);
+
+ /*
+ * Add the innvl to the lognv before calling the func,
+ * in case the func changes the innvl.
+ */
+ if (vec->zvec_allow_log) {
+ lognv = fnvlist_alloc();
+ fnvlist_add_string(lognv, ZPOOL_HIST_IOCTL,
+ vec->zvec_name);
+ if (!nvlist_empty(innvl)) {
+ fnvlist_add_nvlist(lognv, ZPOOL_HIST_INPUT_NVL,
+ innvl);
+ }
+ }
+
+ outnvl = fnvlist_alloc();
+ error = vec->zvec_func(zc->zc_name, innvl, outnvl);
+
+ /*
+ * Some commands can partially execute, modify state, and still
+ * return an error. In these cases, attempt to record what
+ * was modified.
+ */
+ if ((error == 0 ||
+ (cmd == ZFS_IOC_CHANNEL_PROGRAM && error != EINVAL)) &&
+ vec->zvec_allow_log &&
+ spa_open(zc->zc_name, &spa, FTAG) == 0) {
+ if (!nvlist_empty(outnvl)) {
+ fnvlist_add_nvlist(lognv, ZPOOL_HIST_OUTPUT_NVL,
+ outnvl);
+ }
+ if (error != 0) {
+ fnvlist_add_int64(lognv, ZPOOL_HIST_ERRNO,
+ error);
+ }
+ (void) spa_history_log_nvl(spa, lognv);
+ spa_close(spa, FTAG);
+ }
+ fnvlist_free(lognv);
+
+ /* rewrite outnvl for backwards compatibility */
+ if (compat)
+ outnvl = zfs_ioctl_compat_outnvl(zc, outnvl, vecnum,
+ cflag);
+
+ if (!nvlist_empty(outnvl) || zc->zc_nvlist_dst_size != 0) {
+ int smusherror = 0;
+ if (vec->zvec_smush_outnvlist) {
+ smusherror = nvlist_smush(outnvl,
+ zc->zc_nvlist_dst_size);
+ }
+ if (smusherror == 0)
+ puterror = put_nvlist(zc, outnvl);
+ }
+
+ if (puterror != 0)
+ error = puterror;
+
+ nvlist_free(outnvl);
+ } else {
+ error = vec->zvec_legacy_func(zc);
+ }
+
+out:
+ nvlist_free(innvl);
+
+#ifdef illumos
+ rc = ddi_copyout(zc, (void *)arg, sizeof (zfs_cmd_t), flag);
+ if (error == 0 && rc != 0)
+ error = SET_ERROR(EFAULT);
+#else
+ if (compat) {
+ zfs_ioctl_compat_post(zc, cmd, cflag);
+ if (newioc) {
+ ASSERT(compat_zc != NULL);
+ ASSERT(sizeof(zfs_cmd_t) >= zc_iocparm->zfs_cmd_size);
+
+ zfs_cmd_compat_put(zc, compat_zc, vecnum, cflag);
+ rc = ddi_copyout(compat_zc,
+ (void *)(uintptr_t)zc_iocparm->zfs_cmd,
+ zc_iocparm->zfs_cmd_size, flag);
+ if (error == 0 && rc != 0)
+ error = SET_ERROR(EFAULT);
+ kmem_free(compat_zc, sizeof (zfs_cmd_t));
+ } else {
+ zfs_cmd_compat_put(zc, arg, vecnum, cflag);
+ }
+ } else {
+ ASSERT(newioc);
+
+ rc = ddi_copyout(zc, (void *)(uintptr_t)zc_iocparm->zfs_cmd,
+ sizeof (zfs_cmd_t), flag);
+ if (error == 0 && rc != 0)
+ error = SET_ERROR(EFAULT);
+ }
+#endif
+ if (error == 0 && vec->zvec_allow_log) {
+ char *s = tsd_get(zfs_allow_log_key);
+ if (s != NULL)
+ strfree(s);
+ (void) tsd_set(zfs_allow_log_key, saved_poolname);
+ } else {
+ if (saved_poolname != NULL)
+ strfree(saved_poolname);
+ }
+
+
kmem_free(zc, sizeof (zfs_cmd_t)); + return (error); +} + +#ifdef illumos +static int +zfs_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + if (cmd != DDI_ATTACH) + return (DDI_FAILURE); + + if (ddi_create_minor_node(dip, "zfs", S_IFCHR, 0, + DDI_PSEUDO, 0) == DDI_FAILURE) + return (DDI_FAILURE); + + zfs_dip = dip; + + ddi_report_dev(dip); + + return (DDI_SUCCESS); +} + +static int +zfs_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + if (spa_busy() || zfs_busy() || zvol_busy()) + return (DDI_FAILURE); + + if (cmd != DDI_DETACH) + return (DDI_FAILURE); + + zfs_dip = NULL; + + ddi_prop_remove_all(dip); + ddi_remove_minor_node(dip, NULL); + + return (DDI_SUCCESS); +} + +/*ARGSUSED*/ +static int +zfs_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) +{ + switch (infocmd) { + case DDI_INFO_DEVT2DEVINFO: + *result = zfs_dip; + return (DDI_SUCCESS); + + case DDI_INFO_DEVT2INSTANCE: + *result = (void *)0; + return (DDI_SUCCESS); + } + + return (DDI_FAILURE); +} +#endif /* illumos */ + +/* + * OK, so this is a little weird. + * + * /dev/zfs is the control node, i.e. minor 0. + * /dev/zvol/[r]dsk/pool/dataset are the zvols, minor > 0. + * + * /dev/zfs has basically nothing to do except serve up ioctls, + * so most of the standard driver entry points are in zvol.c. + */ +#ifdef illumos +static struct cb_ops zfs_cb_ops = { + zfsdev_open, /* open */ + zfsdev_close, /* close */ + zvol_strategy, /* strategy */ + nodev, /* print */ + zvol_dump, /* dump */ + zvol_read, /* read */ + zvol_write, /* write */ + zfsdev_ioctl, /* ioctl */ + nodev, /* devmap */ + nodev, /* mmap */ + nodev, /* segmap */ + nochpoll, /* poll */ + ddi_prop_op, /* prop_op */ + NULL, /* streamtab */ + D_NEW | D_MP | D_64BIT, /* Driver compatibility flag */ + CB_REV, /* version */ + nodev, /* async read */ + nodev, /* async write */ +}; + +static struct dev_ops zfs_dev_ops = { + DEVO_REV, /* version */ + 0, /* refcnt */ + zfs_info, /* info */ + nulldev, /* identify */ + nulldev, /* probe */ + zfs_attach, /* attach */ + zfs_detach, /* detach */ + nodev, /* reset */ + &zfs_cb_ops, /* driver operations */ + NULL, /* no bus operations */ + NULL, /* power */ + ddi_quiesce_not_needed, /* quiesce */ +}; + +static struct modldrv zfs_modldrv = { + &mod_driverops, + "ZFS storage pool", + &zfs_dev_ops +}; + +static struct modlinkage modlinkage = { + MODREV_1, + (void *)&zfs_modlfs, + (void *)&zfs_modldrv, + NULL +}; +#endif /* illumos */ + +static struct cdevsw zfs_cdevsw = { + .d_version = D_VERSION, + .d_open = zfsdev_open, + .d_ioctl = zfsdev_ioctl, + .d_name = ZFS_DEV_NAME +}; + +static void +zfs_allow_log_destroy(void *arg) +{ + char *poolname = arg; + strfree(poolname); +} + +static void +zfsdev_init(void) +{ + zfsdev = make_dev(&zfs_cdevsw, 0x0, UID_ROOT, GID_OPERATOR, 0666, + ZFS_DEV_NAME); +} + +static void +zfsdev_fini(void) +{ + if (zfsdev != NULL) + destroy_dev(zfsdev); +} + +static struct root_hold_token *zfs_root_token; +struct proc *zfsproc; + +#ifdef illumos +int +_init(void) +{ + int error; + + spa_init(FREAD | FWRITE); + zfs_init(); + zvol_init(); + zfs_ioctl_init(); + + if ((error = mod_install(&modlinkage)) != 0) { + zvol_fini(); + zfs_fini(); + spa_fini(); + return (error); + } + + tsd_create(&zfs_fsyncer_key, NULL); + tsd_create(&rrw_tsd_key, rrw_tsd_destroy); + tsd_create(&zfs_allow_log_key, zfs_allow_log_destroy); + + error = ldi_ident_from_mod(&modlinkage, &zfs_li); + ASSERT(error == 0); + mutex_init(&zfs_share_lock, NULL, MUTEX_DEFAULT, NULL); + + return (0); +} + +int +_fini(void) +{ + int 
error; + + if (spa_busy() || zfs_busy() || zvol_busy() || zio_injection_enabled) + return (SET_ERROR(EBUSY)); + + if ((error = mod_remove(&modlinkage)) != 0) + return (error); + + zvol_fini(); + zfs_fini(); + spa_fini(); + if (zfs_nfsshare_inited) + (void) ddi_modclose(nfs_mod); + if (zfs_smbshare_inited) + (void) ddi_modclose(smbsrv_mod); + if (zfs_nfsshare_inited || zfs_smbshare_inited) + (void) ddi_modclose(sharefs_mod); + + tsd_destroy(&zfs_fsyncer_key); + ldi_ident_release(zfs_li); + zfs_li = NULL; + mutex_destroy(&zfs_share_lock); + + return (error); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} +#endif /* illumos */ + +static int zfs__init(void); +static int zfs__fini(void); +static void zfs_shutdown(void *, int); + +static eventhandler_tag zfs_shutdown_event_tag; + +#ifdef __FreeBSD__ +#define ZFS_MIN_KSTACK_PAGES 4 +#endif + +int +zfs__init(void) +{ + +#ifdef __FreeBSD__ +#if KSTACK_PAGES < ZFS_MIN_KSTACK_PAGES + printf("ZFS NOTICE: KSTACK_PAGES is %d which could result in stack " + "overflow panic!\nPlease consider adding " + "'options KSTACK_PAGES=%d' to your kernel config\n", KSTACK_PAGES, + ZFS_MIN_KSTACK_PAGES); +#endif +#endif + zfs_root_token = root_mount_hold("ZFS"); + + mutex_init(&zfs_share_lock, NULL, MUTEX_DEFAULT, NULL); + + spa_init(FREAD | FWRITE); + zfs_init(); + zvol_init(); + zfs_ioctl_init(); + + tsd_create(&zfs_fsyncer_key, NULL); + tsd_create(&rrw_tsd_key, rrw_tsd_destroy); + tsd_create(&zfs_allow_log_key, zfs_allow_log_destroy); + tsd_create(&zfs_geom_probe_vdev_key, NULL); + + printf("ZFS storage pool version: features support (" SPA_VERSION_STRING ")\n"); + root_mount_rel(zfs_root_token); + + zfsdev_init(); + + return (0); +} + +int +zfs__fini(void) +{ + if (spa_busy() || zfs_busy() || zvol_busy() || + zio_injection_enabled) { + return (EBUSY); + } + + zfsdev_fini(); + zvol_fini(); + zfs_fini(); + spa_fini(); + + tsd_destroy(&zfs_fsyncer_key); + tsd_destroy(&rrw_tsd_key); + tsd_destroy(&zfs_allow_log_key); + + mutex_destroy(&zfs_share_lock); + + return (0); +} + +static void +zfs_shutdown(void *arg __unused, int howto __unused) +{ + + /* + * ZFS fini routines can not properly work in a panic-ed system. 
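+ * (panicstr is non-NULL once the kernel has panicked, so the handler
+ * below degrades to a no-op instead of touching half-torn-down state
+ * on the post-panic shutdown path.)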
+ */ + if (panicstr == NULL) + (void)zfs__fini(); +} + + +static int +zfs_modevent(module_t mod, int type, void *unused __unused) +{ + int err; + + switch (type) { + case MOD_LOAD: + err = zfs__init(); + if (err == 0) + zfs_shutdown_event_tag = EVENTHANDLER_REGISTER( + shutdown_post_sync, zfs_shutdown, NULL, + SHUTDOWN_PRI_FIRST); + return (err); + case MOD_UNLOAD: + err = zfs__fini(); + if (err == 0 && zfs_shutdown_event_tag != NULL) + EVENTHANDLER_DEREGISTER(shutdown_post_sync, + zfs_shutdown_event_tag); + return (err); + case MOD_SHUTDOWN: + return (0); + default: + break; + } + return (EOPNOTSUPP); +} + +static moduledata_t zfs_mod = { + "zfsctrl", + zfs_modevent, + 0 +}; +DECLARE_MODULE(zfsctrl, zfs_mod, SI_SUB_VFS, SI_ORDER_ANY); +MODULE_VERSION(zfsctrl, 1); +MODULE_DEPEND(zfsctrl, opensolaris, 1, 1, 1); +MODULE_DEPEND(zfsctrl, krpc, 1, 1, 1); +MODULE_DEPEND(zfsctrl, acl_nfs4, 1, 1, 1); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c new file mode 100644 index 000000000000..b1f98071a42a --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c @@ -0,0 +1,681 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015 by Delphix. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysmacros.h> +#include <sys/cmn_err.h> +#include <sys/kmem.h> +#include <sys/file.h> +#include <sys/vfs.h> +#include <sys/zfs_znode.h> +#include <sys/zfs_dir.h> +#include <sys/zil.h> +#include <sys/zil_impl.h> +#include <sys/byteorder.h> +#include <sys/policy.h> +#include <sys/stat.h> +#include <sys/acl.h> +#include <sys/dmu.h> +#include <sys/spa.h> +#include <sys/zfs_fuid.h> +#include <sys/dsl_dataset.h> + +/* + * These zfs_log_* functions must be called within a dmu tx, in one + * of 2 contexts depending on zilog->z_replay: + * + * Non replay mode + * --------------- + * We need to record the transaction so that if it is committed to + * the Intent Log then it can be replayed. An intent log transaction + * structure (itx_t) is allocated and all the information necessary to + * possibly replay the transaction is saved in it. The itx is then assigned + * a sequence number and inserted in the in-memory list anchored in the zilog. + * + * Replay mode + * ----------- + * We need to mark the intent log record as replayed in the log header. + * This is done in the same transaction as the replay so that they + * commit atomically. 
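+ *
+ * A minimal sketch of the non-replay pattern that every zfs_log_*
+ * function below follows (illustrative only; "payload" stands for the
+ * variable-sized data that trails the fixed record):
+ *
+ * if (zil_replaying(zilog, tx))
+ * return;
+ * itx = zil_itx_create(txtype, sizeof (*lr) + payload);
+ * lr = (lr_create_t *)&itx->itx_lr;
+ * ... fill in lr and copy the payload after it ...
+ * zil_itx_assign(zilog, itx, tx);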
+ */
+
+int
+zfs_log_create_txtype(zil_create_t type, vsecattr_t *vsecp, vattr_t *vap)
+{
+ int isxvattr = (vap->va_mask & AT_XVATTR);
+ switch (type) {
+ case Z_FILE:
+ if (vsecp == NULL && !isxvattr)
+ return (TX_CREATE);
+ if (vsecp && isxvattr)
+#ifdef TODO
+ return (TX_CREATE_ACL_ATTR);
+#else
+ panic("%s:%u: unsupported condition", __func__, __LINE__);
+#endif
+ if (vsecp)
+ return (TX_CREATE_ACL);
+ else
+ return (TX_CREATE_ATTR);
+ /*NOTREACHED*/
+ case Z_DIR:
+ if (vsecp == NULL && !isxvattr)
+ return (TX_MKDIR);
+ if (vsecp && isxvattr)
+#ifdef TODO
+ return (TX_MKDIR_ACL_ATTR);
+#else
+ panic("%s:%u: unsupported condition", __func__, __LINE__);
+#endif
+ if (vsecp)
+ return (TX_MKDIR_ACL);
+ else
+ return (TX_MKDIR_ATTR);
+ case Z_XATTRDIR:
+ return (TX_MKXATTR);
+ }
+ ASSERT(0);
+ return (TX_MAX_TYPE);
+}
+
+/*
+ * Build up the log data necessary for logging an xvattr_t.
+ * First the lr_attr_t is initialized; following it are the mapsize and
+ * the attribute bitmap copied from the xvattr_t.
+ * Following the bitmap and bitmapsize, two 64-bit words are reserved
+ * for the create time, which may be set. Following the create time
+ * is a single 64-bit integer recording the bits to set on replay
+ * for the xvattr.
+ */
+static void
+zfs_log_xvattr(lr_attr_t *lrattr, xvattr_t *xvap)
+{
+ uint32_t *bitmap;
+ uint64_t *attrs;
+ uint64_t *crtime;
+ xoptattr_t *xoap;
+ void *scanstamp;
+ int i;
+
+ xoap = xva_getxoptattr(xvap);
+ ASSERT(xoap);
+
+ lrattr->lr_attr_masksize = xvap->xva_mapsize;
+ bitmap = &lrattr->lr_attr_bitmap;
+ for (i = 0; i != xvap->xva_mapsize; i++, bitmap++) {
+ *bitmap = xvap->xva_reqattrmap[i];
+ }
+
+ /* Now pack the attributes up in a single uint64_t */
+ attrs = (uint64_t *)bitmap;
+ crtime = attrs + 1;
+ scanstamp = (caddr_t)(crtime + 2);
+ *attrs = 0;
+ if (XVA_ISSET_REQ(xvap, XAT_READONLY))
+ *attrs |= (xoap->xoa_readonly == 0) ? 0 :
+ XAT0_READONLY;
+ if (XVA_ISSET_REQ(xvap, XAT_HIDDEN))
+ *attrs |= (xoap->xoa_hidden == 0) ? 0 :
+ XAT0_HIDDEN;
+ if (XVA_ISSET_REQ(xvap, XAT_SYSTEM))
+ *attrs |= (xoap->xoa_system == 0) ? 0 :
+ XAT0_SYSTEM;
+ if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE))
+ *attrs |= (xoap->xoa_archive == 0) ? 0 :
+ XAT0_ARCHIVE;
+ if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE))
+ *attrs |= (xoap->xoa_immutable == 0) ? 0 :
+ XAT0_IMMUTABLE;
+ if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK))
+ *attrs |= (xoap->xoa_nounlink == 0) ? 0 :
+ XAT0_NOUNLINK;
+ if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY))
+ *attrs |= (xoap->xoa_appendonly == 0) ? 0 :
+ XAT0_APPENDONLY;
+ if (XVA_ISSET_REQ(xvap, XAT_OPAQUE))
+ *attrs |= (xoap->xoa_opaque == 0) ? 0 :
+ XAT0_OPAQUE;
+ if (XVA_ISSET_REQ(xvap, XAT_NODUMP))
+ *attrs |= (xoap->xoa_nodump == 0) ? 0 :
+ XAT0_NODUMP;
+ if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED))
+ *attrs |= (xoap->xoa_av_quarantined == 0) ? 0 :
+ XAT0_AV_QUARANTINED;
+ if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED))
+ *attrs |= (xoap->xoa_av_modified == 0) ? 0 :
+ XAT0_AV_MODIFIED;
+ if (XVA_ISSET_REQ(xvap, XAT_CREATETIME))
+ ZFS_TIME_ENCODE(&xoap->xoa_createtime, crtime);
+ if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
+ bcopy(xoap->xoa_av_scanstamp, scanstamp, AV_SCANSTAMP_SZ);
+ if (XVA_ISSET_REQ(xvap, XAT_REPARSE))
+ *attrs |= (xoap->xoa_reparse == 0) ? 0 :
+ XAT0_REPARSE;
+ if (XVA_ISSET_REQ(xvap, XAT_OFFLINE))
+ *attrs |= (xoap->xoa_offline == 0) ? 0 :
+ XAT0_OFFLINE;
+ if (XVA_ISSET_REQ(xvap, XAT_SPARSE))
+ *attrs |= (xoap->xoa_sparse == 0) ?
0 :
+ XAT0_SPARSE;
+}
+
+static void *
+zfs_log_fuid_ids(zfs_fuid_info_t *fuidp, void *start)
+{
+ zfs_fuid_t *zfuid;
+ uint64_t *fuidloc = start;
+
+ /* First copy in the ACE FUIDs */
+ for (zfuid = list_head(&fuidp->z_fuids); zfuid;
+ zfuid = list_next(&fuidp->z_fuids, zfuid)) {
+ *fuidloc++ = zfuid->z_logfuid;
+ }
+ return (fuidloc);
+}
+
+
+static void *
+zfs_log_fuid_domains(zfs_fuid_info_t *fuidp, void *start)
+{
+ zfs_fuid_domain_t *zdomain;
+
+ /* now copy in the domain info, if any */
+ if (fuidp->z_domain_str_sz != 0) {
+ for (zdomain = list_head(&fuidp->z_domains); zdomain;
+ zdomain = list_next(&fuidp->z_domains, zdomain)) {
+ bcopy((void *)zdomain->z_domain, start,
+ strlen(zdomain->z_domain) + 1);
+ start = (caddr_t)start +
+ strlen(zdomain->z_domain) + 1;
+ }
+ }
+ return (start);
+}
+
+/*
+ * Handles TX_CREATE, TX_CREATE_ATTR, TX_MKDIR, TX_MKDIR_ATTR and
+ * TX_MKXATTR transactions.
+ *
+ * TX_CREATE and TX_MKDIR are standard creates, but they may have FUID
+ * domain information appended prior to the name. In this case the
+ * uid/gid in the log record will be a log centric FUID.
+ *
+ * TX_CREATE_ACL_ATTR and TX_MKDIR_ACL_ATTR handle special creates that
+ * may contain attributes, ACL and optional fuid information.
+ *
+ * TX_CREATE_ACL and TX_MKDIR_ACL handle special creates that specify
+ * an ACL and normal users/groups in the ACEs.
+ *
+ * There may also be optional xvattr attribute information, similar
+ * to that of zfs_log_setattr.
+ *
+ * Also, after the file name, "domain" strings may be appended.
+ */
+void
+zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
+ znode_t *dzp, znode_t *zp, char *name, vsecattr_t *vsecp,
+ zfs_fuid_info_t *fuidp, vattr_t *vap)
+{
+ itx_t *itx;
+ lr_create_t *lr;
+ lr_acl_create_t *lracl;
+ size_t aclsize = (vsecp != NULL) ? vsecp->vsa_aclentsz : 0;
+ size_t xvatsize = 0;
+ size_t txsize;
+ xvattr_t *xvap = (xvattr_t *)vap;
+ void *end;
+ size_t lrsize;
+ size_t namesize = strlen(name) + 1;
+ size_t fuidsz = 0;
+
+ if (zil_replaying(zilog, tx))
+ return;
+
+ /*
+ * If we have FUIDs present then add in space for
+ * domains and ACE fuid's if any.
+ */
+ if (fuidp) {
+ fuidsz += fuidp->z_domain_str_sz;
+ fuidsz += fuidp->z_fuid_cnt * sizeof (uint64_t);
+ }
+
+ if (vap->va_mask & AT_XVATTR)
+ xvatsize = ZIL_XVAT_SIZE(xvap->xva_mapsize);
+
+ if ((int)txtype == TX_CREATE_ATTR || (int)txtype == TX_MKDIR_ATTR ||
+ (int)txtype == TX_CREATE || (int)txtype == TX_MKDIR ||
+ (int)txtype == TX_MKXATTR) {
+ txsize = sizeof (*lr) + namesize + fuidsz + xvatsize;
+ lrsize = sizeof (*lr);
+ } else {
+ txsize =
+ sizeof (lr_acl_create_t) + namesize + fuidsz +
+ ZIL_ACE_LENGTH(aclsize) + xvatsize;
+ lrsize = sizeof (lr_acl_create_t);
+ }
+
+ itx = zil_itx_create(txtype, txsize);
+
+ lr = (lr_create_t *)&itx->itx_lr;
+ lr->lr_doid = dzp->z_id;
+ lr->lr_foid = zp->z_id;
+ /* Store dnode slot count in 8 bits above object id.
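+ * The slot count lands in the top byte and the object number below it,
+ * so the replay side can recover both; a sketch (the LR_FOID_* macros
+ * are authoritative):
+ *
+ * objid = LR_FOID_GET_OBJ(lr->lr_foid);
+ * dnodesize = LR_FOID_GET_SLOTS(lr->lr_foid) << DNODE_SHIFT;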
*/ + LR_FOID_SET_SLOTS(lr->lr_foid, zp->z_dnodesize >> DNODE_SHIFT); + lr->lr_mode = zp->z_mode; + if (!IS_EPHEMERAL(zp->z_uid)) { + lr->lr_uid = (uint64_t)zp->z_uid; + } else { + lr->lr_uid = fuidp->z_fuid_owner; + } + if (!IS_EPHEMERAL(zp->z_gid)) { + lr->lr_gid = (uint64_t)zp->z_gid; + } else { + lr->lr_gid = fuidp->z_fuid_group; + } + (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zp->z_zfsvfs), &lr->lr_gen, + sizeof (uint64_t)); + (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zp->z_zfsvfs), + lr->lr_crtime, sizeof (uint64_t) * 2); + + if (sa_lookup(zp->z_sa_hdl, SA_ZPL_RDEV(zp->z_zfsvfs), &lr->lr_rdev, + sizeof (lr->lr_rdev)) != 0) + lr->lr_rdev = 0; + + /* + * Fill in xvattr info if any + */ + if (vap->va_mask & AT_XVATTR) { + zfs_log_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), xvap); + end = (caddr_t)lr + lrsize + xvatsize; + } else { + end = (caddr_t)lr + lrsize; + } + + /* Now fill in any ACL info */ + + if (vsecp) { + lracl = (lr_acl_create_t *)&itx->itx_lr; + lracl->lr_aclcnt = vsecp->vsa_aclcnt; + lracl->lr_acl_bytes = aclsize; + lracl->lr_domcnt = fuidp ? fuidp->z_domain_cnt : 0; + lracl->lr_fuidcnt = fuidp ? fuidp->z_fuid_cnt : 0; + if (vsecp->vsa_aclflags & VSA_ACE_ACLFLAGS) + lracl->lr_acl_flags = (uint64_t)vsecp->vsa_aclflags; + else + lracl->lr_acl_flags = 0; + + bcopy(vsecp->vsa_aclentp, end, aclsize); + end = (caddr_t)end + ZIL_ACE_LENGTH(aclsize); + } + + /* drop in FUID info */ + if (fuidp) { + end = zfs_log_fuid_ids(fuidp, end); + end = zfs_log_fuid_domains(fuidp, end); + } + /* + * Now place file name in log record + */ + bcopy(name, end, namesize); + + zil_itx_assign(zilog, itx, tx); +} + +/* + * Handles both TX_REMOVE and TX_RMDIR transactions. + */ +void +zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, + znode_t *dzp, char *name, uint64_t foid) +{ + itx_t *itx; + lr_remove_t *lr; + size_t namesize = strlen(name) + 1; + + if (zil_replaying(zilog, tx)) + return; + + itx = zil_itx_create(txtype, sizeof (*lr) + namesize); + lr = (lr_remove_t *)&itx->itx_lr; + lr->lr_doid = dzp->z_id; + bcopy(name, (char *)(lr + 1), namesize); + + itx->itx_oid = foid; + + zil_itx_assign(zilog, itx, tx); +} + +/* + * Handles TX_LINK transactions. + */ +void +zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, + znode_t *dzp, znode_t *zp, char *name) +{ + itx_t *itx; + lr_link_t *lr; + size_t namesize = strlen(name) + 1; + + if (zil_replaying(zilog, tx)) + return; + + itx = zil_itx_create(txtype, sizeof (*lr) + namesize); + lr = (lr_link_t *)&itx->itx_lr; + lr->lr_doid = dzp->z_id; + lr->lr_link_obj = zp->z_id; + bcopy(name, (char *)(lr + 1), namesize); + + zil_itx_assign(zilog, itx, tx); +} + +/* + * Handles TX_SYMLINK transactions. 
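+ *
+ * The variable-length tail of the record is the link name followed by
+ * the link target, both NUL-terminated; a layout sketch:
+ *
+ * | lr_create_t | name\0 | target\0 |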
+ */ +void +zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, + znode_t *dzp, znode_t *zp, char *name, char *link) +{ + itx_t *itx; + lr_create_t *lr; + size_t namesize = strlen(name) + 1; + size_t linksize = strlen(link) + 1; + + if (zil_replaying(zilog, tx)) + return; + + itx = zil_itx_create(txtype, sizeof (*lr) + namesize + linksize); + lr = (lr_create_t *)&itx->itx_lr; + lr->lr_doid = dzp->z_id; + lr->lr_foid = zp->z_id; + lr->lr_uid = zp->z_uid; + lr->lr_gid = zp->z_gid; + lr->lr_mode = zp->z_mode; + (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zp->z_zfsvfs), &lr->lr_gen, + sizeof (uint64_t)); + (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zp->z_zfsvfs), + lr->lr_crtime, sizeof (uint64_t) * 2); + bcopy(name, (char *)(lr + 1), namesize); + bcopy(link, (char *)(lr + 1) + namesize, linksize); + + zil_itx_assign(zilog, itx, tx); +} + +/* + * Handles TX_RENAME transactions. + */ +void +zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, + znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp) +{ + itx_t *itx; + lr_rename_t *lr; + size_t snamesize = strlen(sname) + 1; + size_t dnamesize = strlen(dname) + 1; + + if (zil_replaying(zilog, tx)) + return; + + itx = zil_itx_create(txtype, sizeof (*lr) + snamesize + dnamesize); + lr = (lr_rename_t *)&itx->itx_lr; + lr->lr_sdoid = sdzp->z_id; + lr->lr_tdoid = tdzp->z_id; + bcopy(sname, (char *)(lr + 1), snamesize); + bcopy(dname, (char *)(lr + 1) + snamesize, dnamesize); + itx->itx_oid = szp->z_id; + + zil_itx_assign(zilog, itx, tx); +} + +/* + * Handles TX_WRITE transactions. + */ +ssize_t zfs_immediate_write_sz = 32768; +#ifdef _KERNEL +SYSCTL_DECL(_vfs_zfs); +SYSCTL_LONG(_vfs_zfs, OID_AUTO, immediate_write_sz, CTLFLAG_RWTUN, + &zfs_immediate_write_sz, 0, "Minimal size for indirect log write"); +#endif + +void +zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, + znode_t *zp, offset_t off, ssize_t resid, int ioflag) +{ + uint32_t blocksize = zp->z_blksz; + itx_wr_state_t write_state; + uintptr_t fsync_cnt; + + if (zil_replaying(zilog, tx) || zp->z_unlinked) + return; + + if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT) + write_state = WR_INDIRECT; + else if (!spa_has_slogs(zilog->zl_spa) && + resid >= zfs_immediate_write_sz) + write_state = WR_INDIRECT; + else if (ioflag & (FSYNC | FDSYNC)) + write_state = WR_COPIED; + else + write_state = WR_NEED_COPY; + + if ((fsync_cnt = (uintptr_t)tsd_get(zfs_fsyncer_key)) != 0) { + (void) tsd_set(zfs_fsyncer_key, (void *)(fsync_cnt - 1)); + } + + while (resid) { + itx_t *itx; + lr_write_t *lr; + itx_wr_state_t wr_state = write_state; + ssize_t len = resid; + + if (wr_state == WR_COPIED && resid > ZIL_MAX_COPIED_DATA) + wr_state = WR_NEED_COPY; + else if (wr_state == WR_INDIRECT) + len = MIN(blocksize - P2PHASE(off, blocksize), resid); + + itx = zil_itx_create(txtype, sizeof (*lr) + + (wr_state == WR_COPIED ? 
len : 0)); + lr = (lr_write_t *)&itx->itx_lr; + if (wr_state == WR_COPIED && dmu_read(zp->z_zfsvfs->z_os, + zp->z_id, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) { + zil_itx_destroy(itx); + itx = zil_itx_create(txtype, sizeof (*lr)); + lr = (lr_write_t *)&itx->itx_lr; + wr_state = WR_NEED_COPY; + } + + itx->itx_wr_state = wr_state; + lr->lr_foid = zp->z_id; + lr->lr_offset = off; + lr->lr_length = len; + lr->lr_blkoff = 0; + BP_ZERO(&lr->lr_blkptr); + + itx->itx_private = zp->z_zfsvfs; + + if (!(ioflag & (FSYNC | FDSYNC)) && (zp->z_sync_cnt == 0) && + (fsync_cnt == 0)) + itx->itx_sync = B_FALSE; + + zil_itx_assign(zilog, itx, tx); + + off += len; + resid -= len; + } +} + +/* + * Handles TX_TRUNCATE transactions. + */ +void +zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype, + znode_t *zp, uint64_t off, uint64_t len) +{ + itx_t *itx; + lr_truncate_t *lr; + + if (zil_replaying(zilog, tx) || zp->z_unlinked) + return; + + itx = zil_itx_create(txtype, sizeof (*lr)); + lr = (lr_truncate_t *)&itx->itx_lr; + lr->lr_foid = zp->z_id; + lr->lr_offset = off; + lr->lr_length = len; + + itx->itx_sync = (zp->z_sync_cnt != 0); + zil_itx_assign(zilog, itx, tx); +} + +/* + * Handles TX_SETATTR transactions. + */ +void +zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype, + znode_t *zp, vattr_t *vap, uint_t mask_applied, zfs_fuid_info_t *fuidp) +{ + itx_t *itx; + lr_setattr_t *lr; + xvattr_t *xvap = (xvattr_t *)vap; + size_t recsize = sizeof (lr_setattr_t); + void *start; + + if (zil_replaying(zilog, tx) || zp->z_unlinked) + return; + + /* + * If XVATTR set, then log record size needs to allow + * for lr_attr_t + xvattr mask, mapsize and create time + * plus actual attribute values + */ + if (vap->va_mask & AT_XVATTR) + recsize = sizeof (*lr) + ZIL_XVAT_SIZE(xvap->xva_mapsize); + + if (fuidp) + recsize += fuidp->z_domain_str_sz; + + itx = zil_itx_create(txtype, recsize); + lr = (lr_setattr_t *)&itx->itx_lr; + lr->lr_foid = zp->z_id; + lr->lr_mask = (uint64_t)mask_applied; + lr->lr_mode = (uint64_t)vap->va_mode; + if ((mask_applied & AT_UID) && IS_EPHEMERAL(vap->va_uid)) + lr->lr_uid = fuidp->z_fuid_owner; + else + lr->lr_uid = (uint64_t)vap->va_uid; + + if ((mask_applied & AT_GID) && IS_EPHEMERAL(vap->va_gid)) + lr->lr_gid = fuidp->z_fuid_group; + else + lr->lr_gid = (uint64_t)vap->va_gid; + + lr->lr_size = (uint64_t)vap->va_size; + ZFS_TIME_ENCODE(&vap->va_atime, lr->lr_atime); + ZFS_TIME_ENCODE(&vap->va_mtime, lr->lr_mtime); + start = (lr_setattr_t *)(lr + 1); + if (vap->va_mask & AT_XVATTR) { + zfs_log_xvattr((lr_attr_t *)start, xvap); + start = (caddr_t)start + ZIL_XVAT_SIZE(xvap->xva_mapsize); + } + + /* + * Now stick on domain information if any on end + */ + + if (fuidp) + (void) zfs_log_fuid_domains(fuidp, start); + + itx->itx_sync = (zp->z_sync_cnt != 0); + zil_itx_assign(zilog, itx, tx); +} + +/* + * Handles TX_ACL transactions. + */ +void +zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp, + vsecattr_t *vsecp, zfs_fuid_info_t *fuidp) +{ + itx_t *itx; + lr_acl_v0_t *lrv0; + lr_acl_t *lr; + int txtype; + int lrsize; + size_t txsize; + size_t aclbytes = vsecp->vsa_aclentsz; + + if (zil_replaying(zilog, tx) || zp->z_unlinked) + return; + + txtype = (zp->z_zfsvfs->z_version < ZPL_VERSION_FUID) ? + TX_ACL_V0 : TX_ACL; + + if (txtype == TX_ACL) + lrsize = sizeof (*lr); + else + lrsize = sizeof (*lrv0); + + txsize = lrsize + + ((txtype == TX_ACL) ? ZIL_ACE_LENGTH(aclbytes) : aclbytes) + + (fuidp ? fuidp->z_domain_str_sz : 0) + + sizeof (uint64_t) * (fuidp ? 
fuidp->z_fuid_cnt : 0); + + itx = zil_itx_create(txtype, txsize); + + lr = (lr_acl_t *)&itx->itx_lr; + lr->lr_foid = zp->z_id; + if (txtype == TX_ACL) { + lr->lr_acl_bytes = aclbytes; + lr->lr_domcnt = fuidp ? fuidp->z_domain_cnt : 0; + lr->lr_fuidcnt = fuidp ? fuidp->z_fuid_cnt : 0; + if (vsecp->vsa_mask & VSA_ACE_ACLFLAGS) + lr->lr_acl_flags = (uint64_t)vsecp->vsa_aclflags; + else + lr->lr_acl_flags = 0; + } + lr->lr_aclcnt = (uint64_t)vsecp->vsa_aclcnt; + + if (txtype == TX_ACL_V0) { + lrv0 = (lr_acl_v0_t *)lr; + bcopy(vsecp->vsa_aclentp, (ace_t *)(lrv0 + 1), aclbytes); + } else { + void *start = (ace_t *)(lr + 1); + + bcopy(vsecp->vsa_aclentp, start, aclbytes); + + start = (caddr_t)start + ZIL_ACE_LENGTH(aclbytes); + + if (fuidp) { + start = zfs_log_fuid_ids(fuidp, start); + (void) zfs_log_fuid_domains(fuidp, start); + } + } + + itx->itx_sync = (zp->z_sync_cnt != 0); + zil_itx_assign(zilog, itx, tx); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_onexit.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_onexit.c new file mode 100644 index 000000000000..edb9ca86caa8 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_onexit.c @@ -0,0 +1,254 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013 by Delphix. All rights reserved. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/errno.h> +#include <sys/kmem.h> +#include <sys/conf.h> +#include <sys/sunddi.h> +#include <sys/zfs_ioctl.h> +#include <sys/zfs_onexit.h> +#include <sys/zvol.h> + +/* + * ZFS kernel routines may add/delete callback routines to be invoked + * upon process exit (triggered via the close operation from the /dev/zfs + * driver). + * + * These cleanup callbacks are intended to allow for the accumulation + * of kernel state across multiple ioctls. User processes participate + * by opening ZFS_DEV with O_EXCL. This causes the ZFS driver to do a + * clone-open, generating a unique minor number. The process then passes + * along that file descriptor to each ioctl that might have a cleanup operation. + * + * Consumers of the onexit routines should call zfs_onexit_fd_hold() early + * on to validate the given fd and add a reference to its file table entry. + * This allows the consumer to do its work and then add a callback, knowing + * that zfs_onexit_add_cb() won't fail with EBADF. When finished, consumers + * should call zfs_onexit_fd_rele(). + * + * A simple example is zfs_ioc_recv(), where we might create an AVL tree + * with dataset/GUID mappings and then reuse that tree on subsequent + * zfs_ioc_recv() calls. 
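+ *
+ * In outline, such a consumer looks roughly like this (a sketch;
+ * cleanup_fd, my_cleanup_func and my_data are made-up names):
+ *
+ * minor_t minor;
+ * uint64_t handle;
+ *
+ * error = zfs_onexit_fd_hold(cleanup_fd, &minor);
+ * if (error == 0) {
+ * error = zfs_onexit_add_cb(minor, my_cleanup_func,
+ * my_data, &handle);
+ * zfs_onexit_fd_rele(cleanup_fd);
+ * }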
+ * + * On the first zfs_ioc_recv() call, dmu_recv_stream() will kmem_alloc() + * the AVL tree and pass it along with a callback function to + * zfs_onexit_add_cb(). The zfs_onexit_add_cb() routine will register the + * callback and return an action handle. + * + * The action handle is then passed from user space to subsequent + * zfs_ioc_recv() calls, so that dmu_recv_stream() can fetch its AVL tree + * by calling zfs_onexit_cb_data() with the device minor number and + * action handle. + * + * If the user process exits abnormally, the callback is invoked implicitly + * as part of the driver close operation. Once the user space process is + * finished with the accumulated kernel state, it can also just call close(2) + * on the cleanup fd to trigger the cleanup callback. + */ + +void +zfs_onexit_init(zfs_onexit_t **zop) +{ + zfs_onexit_t *zo; + + zo = *zop = kmem_zalloc(sizeof (zfs_onexit_t), KM_SLEEP); + mutex_init(&zo->zo_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&zo->zo_actions, sizeof (zfs_onexit_action_node_t), + offsetof(zfs_onexit_action_node_t, za_link)); +} + +void +zfs_onexit_destroy(zfs_onexit_t *zo) +{ + zfs_onexit_action_node_t *ap; + + mutex_enter(&zo->zo_lock); + while ((ap = list_head(&zo->zo_actions)) != NULL) { + list_remove(&zo->zo_actions, ap); + mutex_exit(&zo->zo_lock); + ap->za_func(ap->za_data); + kmem_free(ap, sizeof (zfs_onexit_action_node_t)); + mutex_enter(&zo->zo_lock); + } + mutex_exit(&zo->zo_lock); + + list_destroy(&zo->zo_actions); + mutex_destroy(&zo->zo_lock); + kmem_free(zo, sizeof (zfs_onexit_t)); +} + +static int +zfs_onexit_minor_to_state(minor_t minor, zfs_onexit_t **zo) +{ + *zo = zfsdev_get_soft_state(minor, ZSST_CTLDEV); + if (*zo == NULL) + return (SET_ERROR(EBADF)); + + return (0); +} + +/* + * Consumers might need to operate by minor number instead of fd, since + * they might be running in another thread (e.g. txg_sync_thread). Callers + * of this function must call zfs_onexit_fd_rele() when they're finished + * using the minor number. + */ +int +zfs_onexit_fd_hold(int fd, minor_t *minorp) +{ + file_t *fp, *tmpfp; + zfs_onexit_t *zo; + cap_rights_t rights; + void *data; + int error; + + fp = getf(fd, &cap_no_rights); + if (fp == NULL) + return (SET_ERROR(EBADF)); + + tmpfp = curthread->td_fpop; + curthread->td_fpop = fp; + error = devfs_get_cdevpriv(&data); + if (error == 0) + *minorp = (minor_t)(uintptr_t)data; + curthread->td_fpop = tmpfp; + if (error != 0) + return (SET_ERROR(EBADF)); + return (zfs_onexit_minor_to_state(*minorp, &zo)); +} + +void +zfs_onexit_fd_rele(int fd) +{ + releasef(fd); +} + +/* + * Add a callback to be invoked when the calling process exits. 
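+ * The handle written to *action_handle is simply the kernel address of
+ * the new action node, so it is only meaningful when passed back to
+ * zfs_onexit_cb_data() or zfs_onexit_del_cb() on the same minor.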
+ */ +int +zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data, + uint64_t *action_handle) +{ + zfs_onexit_t *zo; + zfs_onexit_action_node_t *ap; + int error; + + error = zfs_onexit_minor_to_state(minor, &zo); + if (error) + return (error); + + ap = kmem_alloc(sizeof (zfs_onexit_action_node_t), KM_SLEEP); + list_link_init(&ap->za_link); + ap->za_func = func; + ap->za_data = data; + + mutex_enter(&zo->zo_lock); + list_insert_tail(&zo->zo_actions, ap); + mutex_exit(&zo->zo_lock); + if (action_handle) + *action_handle = (uint64_t)(uintptr_t)ap; + + return (0); +} + +static zfs_onexit_action_node_t * +zfs_onexit_find_cb(zfs_onexit_t *zo, uint64_t action_handle) +{ + zfs_onexit_action_node_t *match; + zfs_onexit_action_node_t *ap; + list_t *l; + + ASSERT(MUTEX_HELD(&zo->zo_lock)); + + match = (zfs_onexit_action_node_t *)(uintptr_t)action_handle; + l = &zo->zo_actions; + for (ap = list_head(l); ap != NULL; ap = list_next(l, ap)) { + if (match == ap) + break; + } + return (ap); +} + +/* + * Delete the callback, triggering it first if 'fire' is set. + */ +int +zfs_onexit_del_cb(minor_t minor, uint64_t action_handle, boolean_t fire) +{ + zfs_onexit_t *zo; + zfs_onexit_action_node_t *ap; + int error; + + error = zfs_onexit_minor_to_state(minor, &zo); + if (error) + return (error); + + mutex_enter(&zo->zo_lock); + ap = zfs_onexit_find_cb(zo, action_handle); + if (ap != NULL) { + list_remove(&zo->zo_actions, ap); + mutex_exit(&zo->zo_lock); + if (fire) + ap->za_func(ap->za_data); + kmem_free(ap, sizeof (zfs_onexit_action_node_t)); + } else { + mutex_exit(&zo->zo_lock); + error = SET_ERROR(ENOENT); + } + + return (error); +} + +/* + * Return the data associated with this callback. This allows consumers + * of the cleanup-on-exit interfaces to stash kernel data across system + * calls, knowing that it will be cleaned up if the calling process exits. + */ +int +zfs_onexit_cb_data(minor_t minor, uint64_t action_handle, void **data) +{ + zfs_onexit_t *zo; + zfs_onexit_action_node_t *ap; + int error; + + *data = NULL; + + error = zfs_onexit_minor_to_state(minor, &zo); + if (error) + return (error); + + mutex_enter(&zo->zo_lock); + ap = zfs_onexit_find_cb(zo, action_handle); + if (ap != NULL) + *data = ap->za_data; + else + error = SET_ERROR(ENOENT); + mutex_exit(&zo->zo_lock); + + return (error); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c new file mode 100644 index 000000000000..9bf7643258c8 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c @@ -0,0 +1,1070 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysmacros.h> +#include <sys/cmn_err.h> +#include <sys/kmem.h> +#include <sys/file.h> +#include <sys/fcntl.h> +#include <sys/vfs.h> +#include <sys/fs/zfs.h> +#include <sys/zfs_znode.h> +#include <sys/zfs_dir.h> +#include <sys/zfs_acl.h> +#include <sys/zfs_fuid.h> +#include <sys/spa.h> +#include <sys/zil.h> +#include <sys/byteorder.h> +#include <sys/stat.h> +#include <sys/acl.h> +#include <sys/atomic.h> +#include <sys/cred.h> +#include <sys/namei.h> + +/* + * Functions to replay ZFS intent log (ZIL) records + * The functions are called through a function vector (zfs_replay_vector) + * which is indexed by the transaction type. + */ + +static void +zfs_init_vattr(vattr_t *vap, uint64_t mask, uint64_t mode, + uint64_t uid, uint64_t gid, uint64_t rdev, uint64_t nodeid) +{ + VATTR_NULL(vap); + vap->va_mask = (uint_t)mask; + if (mask & AT_TYPE) + vap->va_type = IFTOVT(mode); + if (mask & AT_MODE) + vap->va_mode = mode & MODEMASK; + if (mask & AT_UID) + vap->va_uid = (uid_t)(IS_EPHEMERAL(uid)) ? -1 : uid; + if (mask & AT_GID) + vap->va_gid = (gid_t)(IS_EPHEMERAL(gid)) ? -1 : gid; + vap->va_rdev = zfs_cmpldev(rdev); + vap->va_nodeid = nodeid; +} + +/* ARGSUSED */ +static int +zfs_replay_error(void *arg1, void *arg2, boolean_t byteswap) +{ + return (SET_ERROR(ENOTSUP)); +} + +static void +zfs_replay_xvattr(lr_attr_t *lrattr, xvattr_t *xvap) +{ + xoptattr_t *xoap = NULL; + uint64_t *attrs; + uint64_t *crtime; + uint32_t *bitmap; + void *scanstamp; + int i; + + xvap->xva_vattr.va_mask |= AT_XVATTR; + if ((xoap = xva_getxoptattr(xvap)) == NULL) { + xvap->xva_vattr.va_mask &= ~AT_XVATTR; /* shouldn't happen */ + return; + } + + ASSERT(lrattr->lr_attr_masksize == xvap->xva_mapsize); + + bitmap = &lrattr->lr_attr_bitmap; + for (i = 0; i != lrattr->lr_attr_masksize; i++, bitmap++) + xvap->xva_reqattrmap[i] = *bitmap; + + attrs = (uint64_t *)(lrattr + lrattr->lr_attr_masksize - 1); + crtime = attrs + 1; + scanstamp = (caddr_t)(crtime + 2); + + if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) + xoap->xoa_hidden = ((*attrs & XAT0_HIDDEN) != 0); + if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) + xoap->xoa_system = ((*attrs & XAT0_SYSTEM) != 0); + if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) + xoap->xoa_archive = ((*attrs & XAT0_ARCHIVE) != 0); + if (XVA_ISSET_REQ(xvap, XAT_READONLY)) + xoap->xoa_readonly = ((*attrs & XAT0_READONLY) != 0); + if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) + xoap->xoa_immutable = ((*attrs & XAT0_IMMUTABLE) != 0); + if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) + xoap->xoa_nounlink = ((*attrs & XAT0_NOUNLINK) != 0); + if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) + xoap->xoa_appendonly = ((*attrs & XAT0_APPENDONLY) != 0); + if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) + xoap->xoa_nodump = ((*attrs & XAT0_NODUMP) != 0); + if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) + xoap->xoa_opaque = ((*attrs & XAT0_OPAQUE) != 0); + if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) + xoap->xoa_av_modified = ((*attrs & XAT0_AV_MODIFIED) != 0); + if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) + xoap->xoa_av_quarantined = + ((*attrs & XAT0_AV_QUARANTINED) != 0); + if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) + ZFS_TIME_DECODE(&xoap->xoa_createtime, crtime); + if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) + bcopy(scanstamp, xoap->xoa_av_scanstamp, AV_SCANSTAMP_SZ); + if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) + xoap->xoa_reparse = ((*attrs & XAT0_REPARSE) != 0); + if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) + xoap->xoa_offline = ((*attrs 
& XAT0_OFFLINE) != 0); + if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) + xoap->xoa_sparse = ((*attrs & XAT0_SPARSE) != 0); +} + +static int +zfs_replay_domain_cnt(uint64_t uid, uint64_t gid) +{ + uint64_t uid_idx; + uint64_t gid_idx; + int domcnt = 0; + + uid_idx = FUID_INDEX(uid); + gid_idx = FUID_INDEX(gid); + if (uid_idx) + domcnt++; + if (gid_idx > 0 && gid_idx != uid_idx) + domcnt++; + + return (domcnt); +} + +static void * +zfs_replay_fuid_domain_common(zfs_fuid_info_t *fuid_infop, void *start, + int domcnt) +{ + int i; + + for (i = 0; i != domcnt; i++) { + fuid_infop->z_domain_table[i] = start; + start = (caddr_t)start + strlen(start) + 1; + } + + return (start); +} + +/* + * Set the uid/gid in the fuid_info structure. + */ +static void +zfs_replay_fuid_ugid(zfs_fuid_info_t *fuid_infop, uint64_t uid, uint64_t gid) +{ + /* + * If owner or group are log specific FUIDs then slurp up + * domain information and build zfs_fuid_info_t + */ + if (IS_EPHEMERAL(uid)) + fuid_infop->z_fuid_owner = uid; + + if (IS_EPHEMERAL(gid)) + fuid_infop->z_fuid_group = gid; +} + +/* + * Load fuid domains into fuid_info_t + */ +static zfs_fuid_info_t * +zfs_replay_fuid_domain(void *buf, void **end, uint64_t uid, uint64_t gid) +{ + int domcnt; + + zfs_fuid_info_t *fuid_infop; + + fuid_infop = zfs_fuid_info_alloc(); + + domcnt = zfs_replay_domain_cnt(uid, gid); + + if (domcnt == 0) + return (fuid_infop); + + fuid_infop->z_domain_table = + kmem_zalloc(domcnt * sizeof (char **), KM_SLEEP); + + zfs_replay_fuid_ugid(fuid_infop, uid, gid); + + fuid_infop->z_domain_cnt = domcnt; + *end = zfs_replay_fuid_domain_common(fuid_infop, buf, domcnt); + return (fuid_infop); +} + +/* + * load zfs_fuid_t's and fuid_domains into fuid_info_t + */ +static zfs_fuid_info_t * +zfs_replay_fuids(void *start, void **end, int idcnt, int domcnt, uint64_t uid, + uint64_t gid) +{ + uint64_t *log_fuid = (uint64_t *)start; + zfs_fuid_info_t *fuid_infop; + int i; + + fuid_infop = zfs_fuid_info_alloc(); + fuid_infop->z_domain_cnt = domcnt; + + fuid_infop->z_domain_table = + kmem_zalloc(domcnt * sizeof (char **), KM_SLEEP); + + for (i = 0; i != idcnt; i++) { + zfs_fuid_t *zfuid; + + zfuid = kmem_alloc(sizeof (zfs_fuid_t), KM_SLEEP); + zfuid->z_logfuid = *log_fuid; + zfuid->z_id = -1; + zfuid->z_domidx = 0; + list_insert_tail(&fuid_infop->z_fuids, zfuid); + log_fuid++; + } + + zfs_replay_fuid_ugid(fuid_infop, uid, gid); + + *end = zfs_replay_fuid_domain_common(fuid_infop, log_fuid, domcnt); + return (fuid_infop); +} + +static void +zfs_replay_swap_attrs(lr_attr_t *lrattr) +{ + /* swap the lr_attr structure */ + byteswap_uint32_array(lrattr, sizeof (*lrattr)); + /* swap the bitmap */ + byteswap_uint32_array(lrattr + 1, (lrattr->lr_attr_masksize - 1) * + sizeof (uint32_t)); + /* swap the attributes, create time + 64 bit word for attributes */ + byteswap_uint64_array((caddr_t)(lrattr + 1) + (sizeof (uint32_t) * + (lrattr->lr_attr_masksize - 1)), 3 * sizeof (uint64_t)); +} + +/* + * Replay file create with optional ACL, xvattr information as well + * as option FUID information. 
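+ *
+ * The tail of an ACL-create record is laid out in this order (a sketch;
+ * bracketed pieces are optional):
+ *
+ * | lr_acl_create_t | [xvattr] | ACEs | [FUIDs] | [domains] | name\0 |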
+ */ +static int +zfs_replay_create_acl(void *arg1, void *arg2, boolean_t byteswap) +{ + zfsvfs_t *zfsvfs = arg1; + lr_acl_create_t *lracl = arg2; + char *name = NULL; /* location determined later */ + lr_create_t *lr = (lr_create_t *)lracl; + znode_t *dzp; + vnode_t *vp = NULL; + xvattr_t xva; + int vflg = 0; + vsecattr_t vsec = { 0 }; + lr_attr_t *lrattr; + void *aclstart; + void *fuidstart; + size_t xvatlen = 0; + uint64_t txtype; + uint64_t objid; + uint64_t dnodesize; + int error; + + txtype = (lr->lr_common.lrc_txtype & ~TX_CI); + if (byteswap) { + byteswap_uint64_array(lracl, sizeof (*lracl)); + if (txtype == TX_CREATE_ACL_ATTR || + txtype == TX_MKDIR_ACL_ATTR) { + lrattr = (lr_attr_t *)(caddr_t)(lracl + 1); + zfs_replay_swap_attrs(lrattr); + xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); + } + + aclstart = (caddr_t)(lracl + 1) + xvatlen; + zfs_ace_byteswap(aclstart, lracl->lr_acl_bytes, B_FALSE); + /* swap fuids */ + if (lracl->lr_fuidcnt) { + byteswap_uint64_array((caddr_t)aclstart + + ZIL_ACE_LENGTH(lracl->lr_acl_bytes), + lracl->lr_fuidcnt * sizeof (uint64_t)); + } + } + + if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0) + return (error); + + objid = LR_FOID_GET_OBJ(lr->lr_foid); + dnodesize = LR_FOID_GET_SLOTS(lr->lr_foid) << DNODE_SHIFT; + + xva_init(&xva); + zfs_init_vattr(&xva.xva_vattr, AT_TYPE | AT_MODE | AT_UID | AT_GID, + lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, objid); + + /* + * All forms of zfs create (create, mkdir, mkxattrdir, symlink) + * eventually end up in zfs_mknode(), which assigns the object's + * creation time, generation number, and dnode size. The generic + * zfs_create() has no concept of these attributes, so we smuggle + * the values inside the vattr's otherwise unused va_ctime, + * va_nblocks, and va_fsid fields. 
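+ *
+ * (Replay only creates the object if it does not exist yet: the
+ * dmu_object_info() probe below must return ENOENT. An existing
+ * object means this record was already applied, so the replay is
+ * treated as a no-op rather than an error.)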
+ + */ + ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_crtime); + xva.xva_vattr.va_nblocks = lr->lr_gen; + xva.xva_vattr.va_fsid = dnodesize; + + error = dmu_object_info(zfsvfs->z_os, lr->lr_foid, NULL); + if (error != ENOENT) + goto bail; + + if (lr->lr_common.lrc_txtype & TX_CI) + vflg |= FIGNORECASE; + switch (txtype) { + case TX_CREATE_ACL: + aclstart = (caddr_t)(lracl + 1); + fuidstart = (caddr_t)aclstart + + ZIL_ACE_LENGTH(lracl->lr_acl_bytes); + zfsvfs->z_fuid_replay = zfs_replay_fuids(fuidstart, + (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt, + lr->lr_uid, lr->lr_gid); + /*FALLTHROUGH*/ + case TX_CREATE_ACL_ATTR: + if (name == NULL) { + lrattr = (lr_attr_t *)(caddr_t)(lracl + 1); + xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); + xva.xva_vattr.va_mask |= AT_XVATTR; + zfs_replay_xvattr(lrattr, &xva); + } + vsec.vsa_mask = VSA_ACE | VSA_ACE_ACLFLAGS; + vsec.vsa_aclentp = (caddr_t)(lracl + 1) + xvatlen; + vsec.vsa_aclcnt = lracl->lr_aclcnt; + vsec.vsa_aclentsz = lracl->lr_acl_bytes; + vsec.vsa_aclflags = lracl->lr_acl_flags; + if (zfsvfs->z_fuid_replay == NULL) { + fuidstart = (caddr_t)(lracl + 1) + xvatlen + + ZIL_ACE_LENGTH(lracl->lr_acl_bytes); + zfsvfs->z_fuid_replay = + zfs_replay_fuids(fuidstart, + (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt, + lr->lr_uid, lr->lr_gid); + } + +#ifdef TODO + error = VOP_CREATE(ZTOV(dzp), name, &xva.xva_vattr, + 0, 0, &vp, kcred, vflg, NULL, &vsec); +#else + panic("%s:%u: unsupported condition", __func__, __LINE__); +#endif + break; + case TX_MKDIR_ACL: + aclstart = (caddr_t)(lracl + 1); + fuidstart = (caddr_t)aclstart + + ZIL_ACE_LENGTH(lracl->lr_acl_bytes); + zfsvfs->z_fuid_replay = zfs_replay_fuids(fuidstart, + (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt, + lr->lr_uid, lr->lr_gid); + /*FALLTHROUGH*/ + case TX_MKDIR_ACL_ATTR: + if (name == NULL) { + lrattr = (lr_attr_t *)(caddr_t)(lracl + 1); + xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); + zfs_replay_xvattr(lrattr, &xva); + } + vsec.vsa_mask = VSA_ACE | VSA_ACE_ACLFLAGS; + vsec.vsa_aclentp = (caddr_t)(lracl + 1) + xvatlen; + vsec.vsa_aclcnt = lracl->lr_aclcnt; + vsec.vsa_aclentsz = lracl->lr_acl_bytes; + vsec.vsa_aclflags = lracl->lr_acl_flags; + if (zfsvfs->z_fuid_replay == NULL) { + fuidstart = (caddr_t)(lracl + 1) + xvatlen + + ZIL_ACE_LENGTH(lracl->lr_acl_bytes); + zfsvfs->z_fuid_replay = + zfs_replay_fuids(fuidstart, + (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt, + lr->lr_uid, lr->lr_gid); + } +#ifdef TODO + error = VOP_MKDIR(ZTOV(dzp), name, &xva.xva_vattr, + &vp, kcred, NULL, vflg, &vsec); +#else + panic("%s:%u: unsupported condition", __func__, __LINE__); +#endif + break; + default: + error = SET_ERROR(ENOTSUP); + } + +bail: + if (error == 0 && vp != NULL) + VN_RELE(vp); + + VN_RELE(ZTOV(dzp)); + + if (zfsvfs->z_fuid_replay) + zfs_fuid_info_free(zfsvfs->z_fuid_replay); + zfsvfs->z_fuid_replay = NULL; + + return (error); +} + +static int +zfs_replay_create(void *arg1, void *arg2, boolean_t byteswap) +{ + zfsvfs_t *zfsvfs = arg1; + lr_create_t *lr = arg2; + char *name = NULL; /* location determined later */ + char *link; /* symlink content follows name */ + znode_t *dzp; + vnode_t *vp = NULL; + xvattr_t xva; + int vflg = 0; + size_t lrsize = sizeof (lr_create_t); + lr_attr_t *lrattr; + void *start; + size_t xvatlen; + uint64_t txtype; + struct componentname cn; + int error; + + txtype = (lr->lr_common.lrc_txtype & ~TX_CI); + if (byteswap) { + byteswap_uint64_array(lr, sizeof (*lr)); + if (txtype == TX_CREATE_ATTR || txtype == TX_MKDIR_ATTR) + 
zfs_replay_swap_attrs((lr_attr_t *)(lr + 1));
+ }
+
+
+ if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0)
+ return (error);
+
+ uint64_t objid = LR_FOID_GET_OBJ(lr->lr_foid);
+ int dnodesize = LR_FOID_GET_SLOTS(lr->lr_foid) << DNODE_SHIFT;
+
+ xva_init(&xva);
+ zfs_init_vattr(&xva.xva_vattr, AT_TYPE | AT_MODE | AT_UID | AT_GID,
+ lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, objid);
+
+ /*
+ * All forms of zfs create (create, mkdir, mkxattrdir, symlink)
+ * eventually end up in zfs_mknode(), which assigns the object's
+ * creation time, generation number, and dnode slot count. The
+ * generic zfs_create() has no concept of these attributes, so
+ * we smuggle the values inside the vattr's otherwise unused
+ * va_ctime, va_nblocks, and va_fsid fields.
+ */
+ ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_crtime);
+ xva.xva_vattr.va_nblocks = lr->lr_gen;
+ xva.xva_vattr.va_fsid = dnodesize;
+
+ error = dmu_object_info(zfsvfs->z_os, objid, NULL);
+ if (error != ENOENT)
+ goto out;
+
+ if (lr->lr_common.lrc_txtype & TX_CI)
+ vflg |= FIGNORECASE;
+
+ /*
+ * Symlinks don't have fuid info, and CIFS never creates
+ * symlinks.
+ *
+ * The _ATTR versions will grab the fuid info in their subcases.
+ */
+ if ((int)lr->lr_common.lrc_txtype != TX_SYMLINK &&
+ (int)lr->lr_common.lrc_txtype != TX_MKDIR_ATTR &&
+ (int)lr->lr_common.lrc_txtype != TX_CREATE_ATTR) {
+ start = (lr + 1);
+ zfsvfs->z_fuid_replay =
+ zfs_replay_fuid_domain(start, &start,
+ lr->lr_uid, lr->lr_gid);
+ }
+
+ cn.cn_cred = kcred;
+ cn.cn_thread = curthread;
+ cn.cn_flags = SAVENAME;
+
+ vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY);
+ switch (txtype) {
+ case TX_CREATE_ATTR:
+ lrattr = (lr_attr_t *)(caddr_t)(lr + 1);
+ xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
+ zfs_replay_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), &xva);
+ start = (caddr_t)(lr + 1) + xvatlen;
+ zfsvfs->z_fuid_replay =
+ zfs_replay_fuid_domain(start, &start,
+ lr->lr_uid, lr->lr_gid);
+ name = (char *)start;
+
+ /*FALLTHROUGH*/
+ case TX_CREATE:
+ if (name == NULL)
+ name = (char *)start;
+
+ cn.cn_nameptr = name;
+ error = VOP_CREATE(ZTOV(dzp), &vp, &cn, &xva.xva_vattr /*,vflg*/);
+ break;
+ case TX_MKDIR_ATTR:
+ lrattr = (lr_attr_t *)(caddr_t)(lr + 1);
+ xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
+ zfs_replay_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), &xva);
+ start = (caddr_t)(lr + 1) + xvatlen;
+ zfsvfs->z_fuid_replay =
+ zfs_replay_fuid_domain(start, &start,
+ lr->lr_uid, lr->lr_gid);
+ name = (char *)start;
+
+ /*FALLTHROUGH*/
+ case TX_MKDIR:
+ if (name == NULL)
+ name = (char *)(lr + 1);
+
+ cn.cn_nameptr = name;
+ error = VOP_MKDIR(ZTOV(dzp), &vp, &cn, &xva.xva_vattr /*,vflg*/);
+ break;
+ case TX_MKXATTR:
+ error = zfs_make_xattrdir(dzp, &xva.xva_vattr, &vp, kcred);
+ break;
+ case TX_SYMLINK:
+ name = (char *)(lr + 1);
+ link = name + strlen(name) + 1;
+ cn.cn_nameptr = name;
+ error = VOP_SYMLINK(ZTOV(dzp), &vp, &cn, &xva.xva_vattr, link /*,vflg*/);
+ break;
+ default:
+ error = SET_ERROR(ENOTSUP);
+ }
+ VOP_UNLOCK(ZTOV(dzp), 0);
+
+out:
+ if (error == 0 && vp != NULL)
+ VN_URELE(vp);
+
+ VN_RELE(ZTOV(dzp));
+
+ if (zfsvfs->z_fuid_replay)
+ zfs_fuid_info_free(zfsvfs->z_fuid_replay);
+ zfsvfs->z_fuid_replay = NULL;
+ return (error);
+}
+
+static int
+zfs_replay_remove(void *arg1, void *arg2, boolean_t byteswap)
+{
+ zfsvfs_t *zfsvfs = arg1;
+ lr_remove_t *lr = arg2;
+ char *name = (char *)(lr + 1); /* name follows lr_remove_t */
+ znode_t *dzp;
+ struct componentname cn;
+ vnode_t *vp;
+ int error;
+ int vflg = 0;
+
+ if
(byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0) + return (error); + + if (lr->lr_common.lrc_txtype & TX_CI) + vflg |= FIGNORECASE; + cn.cn_nameptr = name; + cn.cn_namelen = strlen(name); + cn.cn_nameiop = DELETE; + cn.cn_flags = ISLASTCN | SAVENAME; + cn.cn_lkflags = LK_EXCLUSIVE | LK_RETRY; + cn.cn_cred = kcred; + cn.cn_thread = curthread; + vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY); + error = VOP_LOOKUP(ZTOV(dzp), &vp, &cn); + if (error != 0) { + VOP_UNLOCK(ZTOV(dzp), 0); + goto fail; + } + + switch ((int)lr->lr_common.lrc_txtype) { + case TX_REMOVE: + error = VOP_REMOVE(ZTOV(dzp), vp, &cn /*,vflg*/); + break; + case TX_RMDIR: + error = VOP_RMDIR(ZTOV(dzp), vp, &cn /*,vflg*/); + break; + default: + error = SET_ERROR(ENOTSUP); + } + vput(vp); + VOP_UNLOCK(ZTOV(dzp), 0); + +fail: + VN_RELE(ZTOV(dzp)); + + return (error); +} + +static int +zfs_replay_link(void *arg1, void *arg2, boolean_t byteswap) +{ + zfsvfs_t *zfsvfs = arg1; + lr_link_t *lr = arg2; + char *name = (char *)(lr + 1); /* name follows lr_link_t */ + znode_t *dzp, *zp; + struct componentname cn; + int error; + int vflg = 0; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0) + return (error); + + if ((error = zfs_zget(zfsvfs, lr->lr_link_obj, &zp)) != 0) { + VN_RELE(ZTOV(dzp)); + return (error); + } + + if (lr->lr_common.lrc_txtype & TX_CI) + vflg |= FIGNORECASE; + + cn.cn_nameptr = name; + cn.cn_cred = kcred; + cn.cn_thread = curthread; + cn.cn_flags = SAVENAME; + + vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY); + vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_RETRY); + error = VOP_LINK(ZTOV(dzp), ZTOV(zp), &cn /*,vflg*/); + VOP_UNLOCK(ZTOV(zp), 0); + VOP_UNLOCK(ZTOV(dzp), 0); + + VN_RELE(ZTOV(zp)); + VN_RELE(ZTOV(dzp)); + + return (error); +} + +static int +zfs_replay_rename(void *arg1, void *arg2, boolean_t byteswap) +{ + zfsvfs_t *zfsvfs = arg1; + lr_rename_t *lr = arg2; + char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */ + char *tname = sname + strlen(sname) + 1; + znode_t *sdzp, *tdzp; + struct componentname scn, tcn; + vnode_t *svp, *tvp; + kthread_t *td = curthread; + int error; + int vflg = 0; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + if ((error = zfs_zget(zfsvfs, lr->lr_sdoid, &sdzp)) != 0) + return (error); + + if ((error = zfs_zget(zfsvfs, lr->lr_tdoid, &tdzp)) != 0) { + VN_RELE(ZTOV(sdzp)); + return (error); + } + + if (lr->lr_common.lrc_txtype & TX_CI) + vflg |= FIGNORECASE; + svp = tvp = NULL; + + scn.cn_nameptr = sname; + scn.cn_namelen = strlen(sname); + scn.cn_nameiop = DELETE; + scn.cn_flags = ISLASTCN | SAVENAME; + scn.cn_lkflags = LK_EXCLUSIVE | LK_RETRY; + scn.cn_cred = kcred; + scn.cn_thread = td; + vn_lock(ZTOV(sdzp), LK_EXCLUSIVE | LK_RETRY); + error = VOP_LOOKUP(ZTOV(sdzp), &svp, &scn); + VOP_UNLOCK(ZTOV(sdzp), 0); + if (error != 0) + goto fail; + VOP_UNLOCK(svp, 0); + + tcn.cn_nameptr = tname; + tcn.cn_namelen = strlen(tname); + tcn.cn_nameiop = RENAME; + tcn.cn_flags = ISLASTCN | SAVENAME; + tcn.cn_lkflags = LK_EXCLUSIVE | LK_RETRY; + tcn.cn_cred = kcred; + tcn.cn_thread = td; + vn_lock(ZTOV(tdzp), LK_EXCLUSIVE | LK_RETRY); + error = VOP_LOOKUP(ZTOV(tdzp), &tvp, &tcn); + if (error == EJUSTRETURN) + tvp = NULL; + else if (error != 0) { + VOP_UNLOCK(ZTOV(tdzp), 0); + goto fail; + } + + error = VOP_RENAME(ZTOV(sdzp), svp, &scn, ZTOV(tdzp), tvp, &tcn /*,vflg*/); + return (error); +fail: + if (svp != NULL) + vrele(svp); + if (tvp != 
NULL) + vrele(tvp); + VN_RELE(ZTOV(tdzp)); + VN_RELE(ZTOV(sdzp)); + + return (error); +} + +static int +zfs_replay_write(void *arg1, void *arg2, boolean_t byteswap) +{ + zfsvfs_t *zfsvfs = arg1; + lr_write_t *lr = arg2; + char *data = (char *)(lr + 1); /* data follows lr_write_t */ + znode_t *zp; + int error; + ssize_t resid; + uint64_t eod, offset, length; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) { + /* + * As we can log writes out of order, it's possible the + * file has been removed. In this case just drop the write + * and return success. + */ + if (error == ENOENT) + error = 0; + return (error); + } + + offset = lr->lr_offset; + length = lr->lr_length; + eod = offset + length; /* end of data for this write */ + + /* + * This may be a write from a dmu_sync() for a whole block, + * and may extend beyond the current end of the file. + * We can't just replay what was written for this TX_WRITE as + * a future TX_WRITE2 may extend the eof and the data for that + * write needs to be there. So we write the whole block and + * reduce the eof. This needs to be done within the single dmu + * transaction created within vn_rdwr -> zfs_write. So a possible + * new end of file is passed through in zfsvfs->z_replay_eof + */ + + zfsvfs->z_replay_eof = 0; /* 0 means don't change end of file */ + + /* If it's a dmu_sync() block, write the whole block */ + if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { + uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr); + if (length < blocksize) { + offset -= offset % blocksize; + length = blocksize; + } + if (zp->z_size < eod) + zfsvfs->z_replay_eof = eod; + } + + error = vn_rdwr(UIO_WRITE, ZTOV(zp), data, length, offset, + UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid); + + VN_RELE(ZTOV(zp)); + zfsvfs->z_replay_eof = 0; /* safety */ + + return (error); +} + +/* + * TX_WRITE2 are only generated when dmu_sync() returns EALREADY + * meaning the pool block is already being synced. So now that we always write + * out full blocks, all we have to do is expand the eof if + * the file is grown. 
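+ *
+ * Replay therefore reduces to a sketch like:
+ *
+ * end = lr->lr_offset + lr->lr_length;
+ * if (end > zp->z_size)
+ * zp->z_size = end; (updated under its own dmu tx)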
+ */ +static int +zfs_replay_write2(void *arg1, void *arg2, boolean_t byteswap) +{ + zfsvfs_t *zfsvfs = arg1; + lr_write_t *lr = arg2; + znode_t *zp; + int error; + uint64_t end; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) + return (error); + +top: + end = lr->lr_offset + lr->lr_length; + if (end > zp->z_size) { + dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); + + zp->z_size = end; + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + VN_RELE(ZTOV(zp)); + if (error == ERESTART) { + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } + dmu_tx_abort(tx); + return (error); + } + (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), + (void *)&zp->z_size, sizeof (uint64_t), tx); + + /* Ensure the replayed seq is updated */ + (void) zil_replaying(zfsvfs->z_log, tx); + + dmu_tx_commit(tx); + } + + VN_RELE(ZTOV(zp)); + + return (error); +} + +static int +zfs_replay_truncate(void *arg1, void *arg2, boolean_t byteswap) +{ +#ifdef illumos + zfsvfs_t *zfsvfs = arg1; + lr_truncate_t *lr = arg2; + znode_t *zp; + flock64_t fl; + int error; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) + return (error); + + bzero(&fl, sizeof (fl)); + fl.l_type = F_WRLCK; + fl.l_whence = 0; + fl.l_start = lr->lr_offset; + fl.l_len = lr->lr_length; + + error = VOP_SPACE(ZTOV(zp), F_FREESP, &fl, FWRITE | FOFFMAX, + lr->lr_offset, kcred, NULL); + + VN_RELE(ZTOV(zp)); + + return (error); +#else + ZFS_LOG(0, "Unexpected code path, report to pjd@FreeBSD.org"); + return (EOPNOTSUPP); +#endif +} + +static int +zfs_replay_setattr(void *arg1, void *arg2, boolean_t byteswap) +{ + zfsvfs_t *zfsvfs = arg1; + lr_setattr_t *lr = arg2; + znode_t *zp; + xvattr_t xva; + vattr_t *vap = &xva.xva_vattr; + vnode_t *vp; + int error; + void *start; + + xva_init(&xva); + if (byteswap) { + byteswap_uint64_array(lr, sizeof (*lr)); + + if ((lr->lr_mask & AT_XVATTR) && + zfsvfs->z_version >= ZPL_VERSION_INITIAL) + zfs_replay_swap_attrs((lr_attr_t *)(lr + 1)); + } + + if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) + return (error); + + zfs_init_vattr(vap, lr->lr_mask, lr->lr_mode, + lr->lr_uid, lr->lr_gid, 0, lr->lr_foid); + + vap->va_size = lr->lr_size; + ZFS_TIME_DECODE(&vap->va_atime, lr->lr_atime); + ZFS_TIME_DECODE(&vap->va_mtime, lr->lr_mtime); + + /* + * Fill in xvattr_t portions if necessary. 
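+ * The optional pieces trail the fixed lr_setattr_t in log order; a
+ * sketch (either part may be absent):
+ *
+ * | lr_setattr_t | [lr_attr_t + xvattr data] | [FUID domain strings] |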
+ */ + + start = (lr_setattr_t *)(lr + 1); + if (vap->va_mask & AT_XVATTR) { + zfs_replay_xvattr((lr_attr_t *)start, &xva); + start = (caddr_t)start + + ZIL_XVAT_SIZE(((lr_attr_t *)start)->lr_attr_masksize); + } else + xva.xva_vattr.va_mask &= ~AT_XVATTR; + + zfsvfs->z_fuid_replay = zfs_replay_fuid_domain(start, &start, + lr->lr_uid, lr->lr_gid); + + vp = ZTOV(zp); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + error = VOP_SETATTR(vp, vap, kcred); + VOP_UNLOCK(vp, 0); + + zfs_fuid_info_free(zfsvfs->z_fuid_replay); + zfsvfs->z_fuid_replay = NULL; + VN_RELE(vp); + + return (error); +} + +extern int zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr, + caller_context_t *ct); + +static int +zfs_replay_acl_v0(void *arg1, void *arg2, boolean_t byteswap) +{ + zfsvfs_t *zfsvfs = arg1; + lr_acl_v0_t *lr = arg2; + ace_t *ace = (ace_t *)(lr + 1); /* ace array follows lr_acl_t */ + vsecattr_t vsa; + vnode_t *vp; + znode_t *zp; + int error; + + if (byteswap) { + byteswap_uint64_array(lr, sizeof (*lr)); + zfs_oldace_byteswap(ace, lr->lr_aclcnt); + } + + if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) + return (error); + + bzero(&vsa, sizeof (vsa)); + vsa.vsa_mask = VSA_ACE | VSA_ACECNT; + vsa.vsa_aclcnt = lr->lr_aclcnt; + vsa.vsa_aclentsz = sizeof (ace_t) * vsa.vsa_aclcnt; + vsa.vsa_aclflags = 0; + vsa.vsa_aclentp = ace; + + vp = ZTOV(zp); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + error = zfs_setsecattr(vp, &vsa, 0, kcred, NULL); + VOP_UNLOCK(vp, 0); + + VN_RELE(vp); + + return (error); +} + +/* + * Replaying ACLs is complicated by FUID support. + * The log record may contain some optional data + * to be used for replaying FUIDs. These pieces + * are the actual FUIDs that were created initially. + * The FUID table index may no longer be valid and + * during zfs_create() a new index may be assigned. + * Because of this, the log will contain the original + * domain+rid in order to create a new FUID. + * + * The individual ACEs may contain an ephemeral uid/gid which is no + * longer valid and will need to be replaced with an actual FUID.
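+ * + * For reference (an illustrative sketch, not part of the original comment), + * the TX_ACL record that zfs_replay_acl() below walks is laid out as: + * + * [lr_acl_t][ACEs, ZIL_ACE_LENGTH(lr_acl_bytes)][lr_fuidcnt FUIDs][domains] + * + * zfs_replay_fuids() consumes the FUID and domain sections to rebuild + * zfsvfs->z_fuid_replay before the ACL is applied.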
+ * + */ +static int +zfs_replay_acl(void *arg1, void *arg2, boolean_t byteswap) +{ + zfsvfs_t *zfsvfs = arg1; + lr_acl_t *lr = arg2; + ace_t *ace = (ace_t *)(lr + 1); + vsecattr_t vsa; + znode_t *zp; + vnode_t *vp; + int error; + + if (byteswap) { + byteswap_uint64_array(lr, sizeof (*lr)); + zfs_ace_byteswap(ace, lr->lr_acl_bytes, B_FALSE); + if (lr->lr_fuidcnt) { + byteswap_uint64_array((caddr_t)ace + + ZIL_ACE_LENGTH(lr->lr_acl_bytes), + lr->lr_fuidcnt * sizeof (uint64_t)); + } + } + + if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) + return (error); + + bzero(&vsa, sizeof (vsa)); + vsa.vsa_mask = VSA_ACE | VSA_ACECNT | VSA_ACE_ACLFLAGS; + vsa.vsa_aclcnt = lr->lr_aclcnt; + vsa.vsa_aclentp = ace; + vsa.vsa_aclentsz = lr->lr_acl_bytes; + vsa.vsa_aclflags = lr->lr_acl_flags; + + if (lr->lr_fuidcnt) { + void *fuidstart = (caddr_t)ace + + ZIL_ACE_LENGTH(lr->lr_acl_bytes); + + zfsvfs->z_fuid_replay = + zfs_replay_fuids(fuidstart, &fuidstart, + lr->lr_fuidcnt, lr->lr_domcnt, 0, 0); + } + + vp = ZTOV(zp); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + error = zfs_setsecattr(vp, &vsa, 0, kcred, NULL); + VOP_UNLOCK(vp, 0); + + if (zfsvfs->z_fuid_replay) + zfs_fuid_info_free(zfsvfs->z_fuid_replay); + + zfsvfs->z_fuid_replay = NULL; + VN_RELE(vp); + + return (error); +} + +/* + * Callback vectors for replaying records + */ +zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE] = { + zfs_replay_error, /* 0 no such transaction type */ + zfs_replay_create, /* TX_CREATE */ + zfs_replay_create, /* TX_MKDIR */ + zfs_replay_create, /* TX_MKXATTR */ + zfs_replay_create, /* TX_SYMLINK */ + zfs_replay_remove, /* TX_REMOVE */ + zfs_replay_remove, /* TX_RMDIR */ + zfs_replay_link, /* TX_LINK */ + zfs_replay_rename, /* TX_RENAME */ + zfs_replay_write, /* TX_WRITE */ + zfs_replay_truncate, /* TX_TRUNCATE */ + zfs_replay_setattr, /* TX_SETATTR */ + zfs_replay_acl_v0, /* TX_ACL_V0 */ + zfs_replay_acl, /* TX_ACL */ + zfs_replay_create_acl, /* TX_CREATE_ACL */ + zfs_replay_create, /* TX_CREATE_ATTR */ + zfs_replay_create_acl, /* TX_CREATE_ACL_ATTR */ + zfs_replay_create_acl, /* TX_MKDIR_ACL */ + zfs_replay_create, /* TX_MKDIR_ATTR */ + zfs_replay_create_acl, /* TX_MKDIR_ACL_ATTR */ + zfs_replay_write2, /* TX_WRITE2 */ +}; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c new file mode 100644 index 000000000000..7743e81dd5f1 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c @@ -0,0 +1,601 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright (c) 2012 by Delphix. All rights reserved. 
+ */ + +/* + * This file contains the code to implement file range locking in + * ZFS, although there isn't much specific to ZFS (all that comes to mind is + * support for growing the blocksize). + * + * Interface + * --------- + * Defined in zfs_rlock.h but essentially: + * rl = zfs_range_lock(zp, off, len, lock_type); + * zfs_range_unlock(rl); + * zfs_range_reduce(rl, off, len); + * + * AVL tree + * -------- + * An AVL tree is used to maintain the state of the existing ranges + * that are locked for exclusive (writer) or shared (reader) use. + * The starting range offset is used for searching and sorting the tree. + * + * Common case + * ----------- + * The (hopefully) usual case is of no overlaps or contention for + * locks. On entry to zfs_range_lock() a rl_t is allocated; the tree + * is searched, no overlap is found, and *this* rl_t is placed in the tree. + * + * Overlaps/Reference counting/Proxy locks + * --------------------------------------- + * The avl code only allows one node at a particular offset. Also it's very + * inefficient to search through all previous entries looking for overlaps + * (because the very 1st in the ordered list might be at offset 0 but + * cover the whole file). + * So this implementation uses reference counts and proxy range locks. + * Firstly, only reader locks use reference counts and proxy locks, + * because writer locks are exclusive. + * When a reader lock overlaps with another then a proxy lock is created + * for that range and replaces the original lock. If the overlap + * is exact then the reference count of the proxy is simply incremented. + * Otherwise, the proxy lock is split into smaller lock ranges and + * new proxy locks created for non-overlapping ranges. + * The reference counts are adjusted accordingly. + * Meanwhile, the original lock is kept around (this is the caller's handle) + * and its offset and length are used when releasing the lock. + * + * Thread coordination + * ------------------- + * In order to make wakeups efficient and to ensure multiple continuous + * readers on a range don't starve a writer for the same range lock, + * two condition variables are allocated in each rl_t. + * If a writer (or reader) can't get a range it initialises the writer + * (or reader) cv; sets a flag saying there's a writer (or reader) waiting; + * and waits on that cv. When a thread unlocks that range it wakes up all + * writers then all readers before destroying the lock. + * + * Append mode writes + * ------------------ + * Append mode writes need to lock a range at the end of a file. + * The offset of the end of the file is determined under the + * range locking mutex, and the lock type converted from RL_APPEND to + * RL_WRITER and the range locked. + * + * Grow block handling + * ------------------- + * ZFS supports multiple block sizes, currently up to 128K. The smallest + * block size is used for the file which is grown as needed. During this + * growth all other writers and readers must be excluded. + * So if the block size needs to be grown then the whole file is + * exclusively locked, then later the caller will reduce the lock + * range to just the range to be written using zfs_range_reduce(). + */ + +#include <sys/zfs_rlock.h> + +/* + * Check if a write lock can be grabbed, or wait and recheck until available.
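+ * + * A minimal sketch of how a caller drives this interface (hypothetical + * caller, not part of the original code): + * + * rl_t *rl = zfs_range_lock(zp, off, len, RL_WRITER); + * ... perform the write ... + * zfs_range_unlock(rl); + * + * An RL_APPEND request is converted to RL_WRITER below once the true end + * of file is known.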
+ */ +static void +zfs_range_lock_writer(znode_t *zp, rl_t *new) +{ + avl_tree_t *tree = &zp->z_range_avl; + rl_t *rl; + avl_index_t where; + uint64_t end_size; + uint64_t off = new->r_off; + uint64_t len = new->r_len; + + for (;;) { + /* + * Range locking is also used by zvol and uses a + * dummied up znode. However, for zvol, we don't need to + * append or grow blocksize, and besides we don't have + * a "sa" data or z_zfsvfs - so skip that processing. + * + * Yes, this is ugly, and would be solved by not handling + * grow or append in range lock code. If that was done then + * we could make the range locking code generically available + * to other non-zfs consumers. + */ + if (zp->z_vnode) { /* caller is ZPL */ + /* + * If in append mode pick up the current end of file. + * This is done under z_range_lock to avoid races. + */ + if (new->r_type == RL_APPEND) + new->r_off = zp->z_size; + + /* + * If we need to grow the block size then grab the whole + * file range. This is also done under z_range_lock to + * avoid races. + */ + end_size = MAX(zp->z_size, new->r_off + len); + if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) || + zp->z_blksz < zp->z_zfsvfs->z_max_blksz)) { + new->r_off = 0; + new->r_len = UINT64_MAX; + } + } + + /* + * First check for the usual case of no locks + */ + if (avl_numnodes(tree) == 0) { + new->r_type = RL_WRITER; /* convert to writer */ + avl_add(tree, new); + return; + } + + /* + * Look for any locks in the range. + */ + rl = avl_find(tree, new, &where); + if (rl) + goto wait; /* already locked at same offset */ + + rl = (rl_t *)avl_nearest(tree, where, AVL_AFTER); + if (rl && (rl->r_off < new->r_off + new->r_len)) + goto wait; + + rl = (rl_t *)avl_nearest(tree, where, AVL_BEFORE); + if (rl && rl->r_off + rl->r_len > new->r_off) + goto wait; + + new->r_type = RL_WRITER; /* convert possible RL_APPEND */ + avl_insert(tree, new, where); + return; +wait: + if (!rl->r_write_wanted) { + cv_init(&rl->r_wr_cv, NULL, CV_DEFAULT, NULL); + rl->r_write_wanted = B_TRUE; + } + cv_wait(&rl->r_wr_cv, &zp->z_range_lock); + + /* reset to original */ + new->r_off = off; + new->r_len = len; + } +} + +/* + * If this is an original (non-proxy) lock then replace it by + * a proxy and return the proxy. + */ +static rl_t * +zfs_range_proxify(avl_tree_t *tree, rl_t *rl) +{ + rl_t *proxy; + + if (rl->r_proxy) + return (rl); /* already a proxy */ + + ASSERT3U(rl->r_cnt, ==, 1); + ASSERT(rl->r_write_wanted == B_FALSE); + ASSERT(rl->r_read_wanted == B_FALSE); + avl_remove(tree, rl); + rl->r_cnt = 0; + + /* create a proxy range lock */ + proxy = kmem_alloc(sizeof (rl_t), KM_SLEEP); + proxy->r_off = rl->r_off; + proxy->r_len = rl->r_len; + proxy->r_cnt = 1; + proxy->r_type = RL_READER; + proxy->r_proxy = B_TRUE; + proxy->r_write_wanted = B_FALSE; + proxy->r_read_wanted = B_FALSE; + avl_add(tree, proxy); + + return (proxy); +} + +/* + * Split the range lock at the supplied offset + * returning the *front* proxy. 
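+ * + * For example (illustrative): splitting a proxy covering [100, 200) at + * offset 160 leaves a front proxy [100, 160) and a rear proxy [160, 200), + * both carrying the original reference count.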
+ */ +static rl_t * +zfs_range_split(avl_tree_t *tree, rl_t *rl, uint64_t off) +{ + rl_t *front, *rear; + + ASSERT3U(rl->r_len, >, 1); + ASSERT3U(off, >, rl->r_off); + ASSERT3U(off, <, rl->r_off + rl->r_len); + ASSERT(rl->r_write_wanted == B_FALSE); + ASSERT(rl->r_read_wanted == B_FALSE); + + /* create the rear proxy range lock */ + rear = kmem_alloc(sizeof (rl_t), KM_SLEEP); + rear->r_off = off; + rear->r_len = rl->r_off + rl->r_len - off; + rear->r_cnt = rl->r_cnt; + rear->r_type = RL_READER; + rear->r_proxy = B_TRUE; + rear->r_write_wanted = B_FALSE; + rear->r_read_wanted = B_FALSE; + + front = zfs_range_proxify(tree, rl); + front->r_len = off - rl->r_off; + + avl_insert_here(tree, rear, front, AVL_AFTER); + return (front); +} + +/* + * Create and add a new proxy range lock for the supplied range. + */ +static void +zfs_range_new_proxy(avl_tree_t *tree, uint64_t off, uint64_t len) +{ + rl_t *rl; + + ASSERT(len); + rl = kmem_alloc(sizeof (rl_t), KM_SLEEP); + rl->r_off = off; + rl->r_len = len; + rl->r_cnt = 1; + rl->r_type = RL_READER; + rl->r_proxy = B_TRUE; + rl->r_write_wanted = B_FALSE; + rl->r_read_wanted = B_FALSE; + avl_add(tree, rl); +} + +static void +zfs_range_add_reader(avl_tree_t *tree, rl_t *new, rl_t *prev, avl_index_t where) +{ + rl_t *next; + uint64_t off = new->r_off; + uint64_t len = new->r_len; + + /* + * prev arrives either: + * - pointing to an entry at the same offset + * - pointing to the entry with the closest previous offset whose + * range may overlap with the new range + * - null, if there were no ranges starting before the new one + */ + if (prev) { + if (prev->r_off + prev->r_len <= off) { + prev = NULL; + } else if (prev->r_off != off) { + /* + * convert to proxy if needed then + * split this entry and bump ref count + */ + prev = zfs_range_split(tree, prev, off); + prev = AVL_NEXT(tree, prev); /* move to rear range */ + } + } + ASSERT((prev == NULL) || (prev->r_off == off)); + + if (prev) + next = prev; + else + next = (rl_t *)avl_nearest(tree, where, AVL_AFTER); + + if (next == NULL || off + len <= next->r_off) { + /* no overlaps, use the original new rl_t in the tree */ + avl_insert(tree, new, where); + return; + } + + if (off < next->r_off) { + /* Add a proxy for initial range before the overlap */ + zfs_range_new_proxy(tree, off, next->r_off - off); + } + + new->r_cnt = 0; /* will use proxies in tree */ + /* + * We now search forward through the ranges, until we go past the end + * of the new range. For each entry we make it a proxy if it + * isn't already, then bump its reference count. If there's any + * gaps between the ranges then we create a new proxy range. + */ + for (prev = NULL; next; prev = next, next = AVL_NEXT(tree, next)) { + if (off + len <= next->r_off) + break; + if (prev && prev->r_off + prev->r_len < next->r_off) { + /* there's a gap */ + ASSERT3U(next->r_off, >, prev->r_off + prev->r_len); + zfs_range_new_proxy(tree, prev->r_off + prev->r_len, + next->r_off - (prev->r_off + prev->r_len)); + } + if (off + len == next->r_off + next->r_len) { + /* exact overlap with end */ + next = zfs_range_proxify(tree, next); + next->r_cnt++; + return; + } + if (off + len < next->r_off + next->r_len) { + /* new range ends in the middle of this block */ + next = zfs_range_split(tree, next, off + len); + next->r_cnt++; + return; + } + ASSERT3U(off + len, >, next->r_off + next->r_len); + next = zfs_range_proxify(tree, next); + next->r_cnt++; + } + + /* Add the remaining end range. 
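Here prev is the last overlapping proxy processed above, so the tail of the new range, from the end of prev up to off + len, still needs a proxy of its own.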
*/ + zfs_range_new_proxy(tree, prev->r_off + prev->r_len, + (off + len) - (prev->r_off + prev->r_len)); +} + +/* + * Check if a reader lock can be grabbed, or wait and recheck until available. + */ +static void +zfs_range_lock_reader(znode_t *zp, rl_t *new) +{ + avl_tree_t *tree = &zp->z_range_avl; + rl_t *prev, *next; + avl_index_t where; + uint64_t off = new->r_off; + uint64_t len = new->r_len; + + /* + * Look for any writer locks in the range. + */ +retry: + prev = avl_find(tree, new, &where); + if (prev == NULL) + prev = (rl_t *)avl_nearest(tree, where, AVL_BEFORE); + + /* + * Check the previous range for a writer lock overlap. + */ + if (prev && (off < prev->r_off + prev->r_len)) { + if ((prev->r_type == RL_WRITER) || (prev->r_write_wanted)) { + if (!prev->r_read_wanted) { + cv_init(&prev->r_rd_cv, NULL, CV_DEFAULT, NULL); + prev->r_read_wanted = B_TRUE; + } + cv_wait(&prev->r_rd_cv, &zp->z_range_lock); + goto retry; + } + if (off + len < prev->r_off + prev->r_len) + goto got_lock; + } + + /* + * Search through the following ranges to see if there's + * any write lock overlap. + */ + if (prev) + next = AVL_NEXT(tree, prev); + else + next = (rl_t *)avl_nearest(tree, where, AVL_AFTER); + for (; next; next = AVL_NEXT(tree, next)) { + if (off + len <= next->r_off) + goto got_lock; + if ((next->r_type == RL_WRITER) || (next->r_write_wanted)) { + if (!next->r_read_wanted) { + cv_init(&next->r_rd_cv, NULL, CV_DEFAULT, NULL); + next->r_read_wanted = B_TRUE; + } + cv_wait(&next->r_rd_cv, &zp->z_range_lock); + goto retry; + } + if (off + len <= next->r_off + next->r_len) + goto got_lock; + } + +got_lock: + /* + * Add the read lock, which may involve splitting existing + * locks and bumping ref counts (r_cnt). + */ + zfs_range_add_reader(tree, new, prev, where); +} + +/* + * Lock a range (offset, length) as either shared (RL_READER) + * or exclusive (RL_WRITER). Returns the range lock structure + * for later unlocking or range reduction (if the entire file + * was previously locked as RL_WRITER). + */ +rl_t * +zfs_range_lock(znode_t *zp, uint64_t off, uint64_t len, rl_type_t type) +{ + rl_t *new; + + ASSERT(type == RL_READER || type == RL_WRITER || type == RL_APPEND); + + new = kmem_alloc(sizeof (rl_t), KM_SLEEP); + new->r_zp = zp; + new->r_off = off; + if (len + off < off) /* overflow */ + len = UINT64_MAX - off; + new->r_len = len; + new->r_cnt = 1; /* assume it's going to be in the tree */ + new->r_type = type; + new->r_proxy = B_FALSE; + new->r_write_wanted = B_FALSE; + new->r_read_wanted = B_FALSE; + + mutex_enter(&zp->z_range_lock); + if (type == RL_READER) { + /* + * First check for the usual case of no locks + */ + if (avl_numnodes(&zp->z_range_avl) == 0) + avl_add(&zp->z_range_avl, new); + else + zfs_range_lock_reader(zp, new); + } else + zfs_range_lock_writer(zp, new); /* RL_WRITER or RL_APPEND */ + mutex_exit(&zp->z_range_lock); + return (new); +} + +/* + * Unlock a reader lock + */ +static void +zfs_range_unlock_reader(znode_t *zp, rl_t *remove) +{ + avl_tree_t *tree = &zp->z_range_avl; + rl_t *rl, *next = NULL; + uint64_t len; + + /* + * The common case is when the remove entry is in the tree + * (cnt == 1) meaning there have been no other reader locks overlapping + * with this one. Otherwise the remove entry will have been + * removed from the tree and replaced by proxies (one or + * more ranges mapping to the entire range).
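+ * + * For example (illustrative): a reader lock on [0, 300) that overlapped + * other readers may be represented by proxies [0, 100), [100, 200) and + * [200, 300); the loop below walks those proxies, decrementing each + * reference count and freeing any proxy that drops to zero.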
+ */ + if (remove->r_cnt == 1) { + avl_remove(tree, remove); + if (remove->r_write_wanted) { + cv_broadcast(&remove->r_wr_cv); + cv_destroy(&remove->r_wr_cv); + } + if (remove->r_read_wanted) { + cv_broadcast(&remove->r_rd_cv); + cv_destroy(&remove->r_rd_cv); + } + } else { + ASSERT0(remove->r_cnt); + ASSERT0(remove->r_write_wanted); + ASSERT0(remove->r_read_wanted); + /* + * Find start proxy representing this reader lock, + * then decrement ref count on all proxies + * that make up this range, freeing them as needed. + */ + rl = avl_find(tree, remove, NULL); + ASSERT(rl); + ASSERT(rl->r_cnt); + ASSERT(rl->r_type == RL_READER); + for (len = remove->r_len; len != 0; rl = next) { + len -= rl->r_len; + if (len) { + next = AVL_NEXT(tree, rl); + ASSERT(next); + ASSERT(rl->r_off + rl->r_len == next->r_off); + ASSERT(next->r_cnt); + ASSERT(next->r_type == RL_READER); + } + rl->r_cnt--; + if (rl->r_cnt == 0) { + avl_remove(tree, rl); + if (rl->r_write_wanted) { + cv_broadcast(&rl->r_wr_cv); + cv_destroy(&rl->r_wr_cv); + } + if (rl->r_read_wanted) { + cv_broadcast(&rl->r_rd_cv); + cv_destroy(&rl->r_rd_cv); + } + kmem_free(rl, sizeof (rl_t)); + } + } + } + kmem_free(remove, sizeof (rl_t)); +} + +/* + * Unlock range and destroy range lock structure. + */ +void +zfs_range_unlock(rl_t *rl) +{ + znode_t *zp = rl->r_zp; + + ASSERT(rl->r_type == RL_WRITER || rl->r_type == RL_READER); + ASSERT(rl->r_cnt == 1 || rl->r_cnt == 0); + ASSERT(!rl->r_proxy); + + mutex_enter(&zp->z_range_lock); + if (rl->r_type == RL_WRITER) { + /* writer locks can't be shared or split */ + avl_remove(&zp->z_range_avl, rl); + mutex_exit(&zp->z_range_lock); + if (rl->r_write_wanted) { + cv_broadcast(&rl->r_wr_cv); + cv_destroy(&rl->r_wr_cv); + } + if (rl->r_read_wanted) { + cv_broadcast(&rl->r_rd_cv); + cv_destroy(&rl->r_rd_cv); + } + kmem_free(rl, sizeof (rl_t)); + } else { + /* + * lock may be shared, let zfs_range_unlock_reader() + * release the lock and free the rl_t + */ + zfs_range_unlock_reader(zp, rl); + mutex_exit(&zp->z_range_lock); + } +} + +/* + * Reduce range locked as RL_WRITER from whole file to specified range. + * Asserts the whole file is exclusively locked and so there's only one + * entry in the tree. + */ +void +zfs_range_reduce(rl_t *rl, uint64_t off, uint64_t len) +{ + znode_t *zp = rl->r_zp; + + /* Ensure there are no other locks */ + ASSERT(avl_numnodes(&zp->z_range_avl) == 1); + ASSERT(rl->r_off == 0); + ASSERT(rl->r_type == RL_WRITER); + ASSERT(!rl->r_proxy); + ASSERT3U(rl->r_len, ==, UINT64_MAX); + ASSERT3U(rl->r_cnt, ==, 1); + + mutex_enter(&zp->z_range_lock); + rl->r_off = off; + rl->r_len = len; + mutex_exit(&zp->z_range_lock); + if (rl->r_write_wanted) + cv_broadcast(&rl->r_wr_cv); + if (rl->r_read_wanted) + cv_broadcast(&rl->r_rd_cv); +} + +/* + * AVL comparison function used to order range locks + * Locks are ordered on the start offset of the range. + */ +int +zfs_range_compare(const void *arg1, const void *arg2) +{ + const rl_t *rl1 = (const rl_t *)arg1; + const rl_t *rl2 = (const rl_t *)arg2; + + return (AVL_CMP(rl1->r_off, rl2->r_off)); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_sa.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_sa.c new file mode 100644 index 000000000000..c914db8d10c3 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_sa.c @@ -0,0 +1,326 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include <sys/zfs_context.h> +#include <sys/vnode.h> +#include <sys/sa.h> +#include <sys/zfs_acl.h> +#include <sys/zfs_sa.h> + +/* + * ZPL attribute registration table. + * Order of attributes doesn't matter; + * a unique value will be assigned for each + * attribute that is file system specific. + * + * This is just the set of ZPL attributes that this + * version of ZFS deals with natively. The file system + * could have other attributes stored in files, but they will be + * ignored. The SA framework will preserve them; it's just that + * this version of ZFS won't change or delete them. + */ + +sa_attr_reg_t zfs_attr_table[ZPL_END+1] = { + {"ZPL_ATIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 0}, + {"ZPL_MTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 1}, + {"ZPL_CTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 2}, + {"ZPL_CRTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 3}, + {"ZPL_GEN", sizeof (uint64_t), SA_UINT64_ARRAY, 4}, + {"ZPL_MODE", sizeof (uint64_t), SA_UINT64_ARRAY, 5}, + {"ZPL_SIZE", sizeof (uint64_t), SA_UINT64_ARRAY, 6}, + {"ZPL_PARENT", sizeof (uint64_t), SA_UINT64_ARRAY, 7}, + {"ZPL_LINKS", sizeof (uint64_t), SA_UINT64_ARRAY, 8}, + {"ZPL_XATTR", sizeof (uint64_t), SA_UINT64_ARRAY, 9}, + {"ZPL_RDEV", sizeof (uint64_t), SA_UINT64_ARRAY, 10}, + {"ZPL_FLAGS", sizeof (uint64_t), SA_UINT64_ARRAY, 11}, + {"ZPL_UID", sizeof (uint64_t), SA_UINT64_ARRAY, 12}, + {"ZPL_GID", sizeof (uint64_t), SA_UINT64_ARRAY, 13}, + {"ZPL_PAD", sizeof (uint64_t) * 4, SA_UINT64_ARRAY, 14}, + {"ZPL_ZNODE_ACL", 88, SA_UINT8_ARRAY, 15}, + {"ZPL_DACL_COUNT", sizeof (uint64_t), SA_UINT64_ARRAY, 0}, + {"ZPL_SYMLINK", 0, SA_UINT8_ARRAY, 0}, + {"ZPL_SCANSTAMP", 32, SA_UINT8_ARRAY, 0}, + {"ZPL_DACL_ACES", 0, SA_ACL, 0}, + {NULL, 0, 0, 0} +}; + +#ifdef _KERNEL + +int +zfs_sa_readlink(znode_t *zp, uio_t *uio) +{ + dmu_buf_t *db = sa_get_db(zp->z_sa_hdl); + size_t bufsz; + int error; + + bufsz = zp->z_size; + if (bufsz + ZFS_OLD_ZNODE_PHYS_SIZE <= db->db_size) { + error = uiomove((caddr_t)db->db_data + + ZFS_OLD_ZNODE_PHYS_SIZE, + MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio); + } else { + dmu_buf_t *dbp; + if ((error = dmu_buf_hold(zp->z_zfsvfs->z_os, zp->z_id, + 0, FTAG, &dbp, DMU_READ_NO_PREFETCH)) == 0) { + error = uiomove(dbp->db_data, + MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio); + dmu_buf_rele(dbp, FTAG); + } + } + return (error); +} + +void +zfs_sa_symlink(znode_t *zp, char *link, int len, dmu_tx_t *tx) +{ + dmu_buf_t *db = sa_get_db(zp->z_sa_hdl); + + if (ZFS_OLD_ZNODE_PHYS_SIZE + len <= dmu_bonus_max()) { + VERIFY0(dmu_set_bonus(db, len + ZFS_OLD_ZNODE_PHYS_SIZE, tx)); + if (len) { + bcopy(link, (caddr_t)db->db_data + + ZFS_OLD_ZNODE_PHYS_SIZE, len); + } + } else { + dmu_buf_t *dbp; + + zfs_grow_blocksize(zp, len, tx); + VERIFY(0 ==
dmu_buf_hold(zp->z_zfsvfs->z_os, + zp->z_id, 0, FTAG, &dbp, DMU_READ_NO_PREFETCH)); + + dmu_buf_will_dirty(dbp, tx); + + ASSERT3U(len, <=, dbp->db_size); + bcopy(link, dbp->db_data, len); + dmu_buf_rele(dbp, FTAG); + } +} + +void +zfs_sa_get_scanstamp(znode_t *zp, xvattr_t *xvap) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + xoptattr_t *xoap; + + ASSERT_VOP_LOCKED(ZTOV(zp), __func__); + VERIFY((xoap = xva_getxoptattr(xvap)) != NULL); + if (zp->z_is_sa) { + if (sa_lookup(zp->z_sa_hdl, SA_ZPL_SCANSTAMP(zfsvfs), + &xoap->xoa_av_scanstamp, + sizeof (xoap->xoa_av_scanstamp)) != 0) + return; + } else { + dmu_object_info_t doi; + dmu_buf_t *db = sa_get_db(zp->z_sa_hdl); + int len; + + if (!(zp->z_pflags & ZFS_BONUS_SCANSTAMP)) + return; + + sa_object_info(zp->z_sa_hdl, &doi); + len = sizeof (xoap->xoa_av_scanstamp) + + ZFS_OLD_ZNODE_PHYS_SIZE; + + if (len <= doi.doi_bonus_size) { + (void) memcpy(xoap->xoa_av_scanstamp, + (caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE, + sizeof (xoap->xoa_av_scanstamp)); + } + } + XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP); +} + +void +zfs_sa_set_scanstamp(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + xoptattr_t *xoap; + + ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); + VERIFY((xoap = xva_getxoptattr(xvap)) != NULL); + if (zp->z_is_sa) + VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SCANSTAMP(zfsvfs), + &xoap->xoa_av_scanstamp, + sizeof (xoap->xoa_av_scanstamp), tx)); + else { + dmu_object_info_t doi; + dmu_buf_t *db = sa_get_db(zp->z_sa_hdl); + int len; + + sa_object_info(zp->z_sa_hdl, &doi); + len = sizeof (xoap->xoa_av_scanstamp) + + ZFS_OLD_ZNODE_PHYS_SIZE; + if (len > doi.doi_bonus_size) + VERIFY(dmu_set_bonus(db, len, tx) == 0); + (void) memcpy((caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE, + xoap->xoa_av_scanstamp, sizeof (xoap->xoa_av_scanstamp)); + + zp->z_pflags |= ZFS_BONUS_SCANSTAMP; + VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), + &zp->z_pflags, sizeof (uint64_t), tx)); + } +} + +/* + * I'm not convinced we should do any of this upgrade, + * since the SA code can read both old and new znode formats + * with probably little to no performance difference. + * + * All new files will be created with the new format. + */ + +void +zfs_sa_upgrade(sa_handle_t *hdl, dmu_tx_t *tx) +{ + dmu_buf_t *db = sa_get_db(hdl); + znode_t *zp = sa_get_userdata(hdl); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + sa_bulk_attr_t bulk[20]; + int count = 0; + sa_bulk_attr_t sa_attrs[20] = { 0 }; + zfs_acl_locator_cb_t locate = { 0 }; + uint64_t uid, gid, mode, rdev, xattr, parent; + uint64_t crtime[2], mtime[2], ctime[2]; + zfs_acl_phys_t znode_acl; + char scanstamp[AV_SCANSTAMP_SZ]; + + /* + * No upgrade if ACL isn't cached + * since we won't know which locks are held + * and reading the ACL would require special "locked" + * interfaces that would be messy + */ + if (zp->z_acl_cached == NULL || ZTOV(zp)->v_type == VLNK) + return; + + /* + * If the vnode lock is held and we aren't the owner + * then just return since we don't want to deadlock + * trying to update the status of z_is_sa. This + * file can then be upgraded at a later time. + * + * Otherwise, we know we are doing the + * sa_update() that caused us to enter this function.
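+ * + * On FreeBSD this is a non-blocking attempt: the vn_lock() call below + * uses LK_NOWAIT, so if the vnode lock is contended the upgrade is + * simply skipped and will be retried by a later sa_update().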
+ */ + if (vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_NOWAIT) != 0) + return; + + /* First do a bulk query of the attributes that aren't cached */ + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_XATTR(zfsvfs), NULL, &xattr, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL, &rdev, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &uid, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &gid, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ZNODE_ACL(zfsvfs), NULL, + &znode_acl, 88); + + if (sa_bulk_lookup_locked(hdl, bulk, count) != 0) + goto done; + + + /* + * While the order here doesn't matter, it's best to try and organize + * it in such a way as to pick up an already existing layout number + */ + count = 0; + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_SIZE(zfsvfs), NULL, + &zp->z_size, 8); + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_GEN(zfsvfs), + NULL, &zp->z_gen, 8); + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_UID(zfsvfs), NULL, &uid, 8); + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_GID(zfsvfs), NULL, &gid, 8); + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_PARENT(zfsvfs), + NULL, &parent, 8); + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, 8); + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_ATIME(zfsvfs), NULL, + zp->z_atime, 16); + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_MTIME(zfsvfs), NULL, + &mtime, 16); + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_CTIME(zfsvfs), NULL, + &ctime, 16); + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_CRTIME(zfsvfs), NULL, + &crtime, 16); + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_LINKS(zfsvfs), NULL, + &zp->z_links, 8); + if (zp->z_vnode->v_type == VBLK || zp->z_vnode->v_type == VCHR) + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_RDEV(zfsvfs), NULL, + &rdev, 8); + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_DACL_COUNT(zfsvfs), NULL, + &zp->z_acl_cached->z_acl_count, 8); + + if (zp->z_acl_cached->z_version < ZFS_ACL_VERSION_FUID) + zfs_acl_xform(zp, zp->z_acl_cached, CRED()); + + locate.cb_aclp = zp->z_acl_cached; + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_DACL_ACES(zfsvfs), + zfs_acl_data_locator, &locate, zp->z_acl_cached->z_acl_bytes); + + if (xattr) + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_XATTR(zfsvfs), + NULL, &xattr, 8); + + /* if scanstamp then add scanstamp */ + + if (zp->z_pflags & ZFS_BONUS_SCANSTAMP) { + bcopy((caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE, + scanstamp, AV_SCANSTAMP_SZ); + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_SCANSTAMP(zfsvfs), + NULL, scanstamp, AV_SCANSTAMP_SZ); + zp->z_pflags &= ~ZFS_BONUS_SCANSTAMP; + } + + VERIFY(dmu_set_bonustype(db, DMU_OT_SA, tx) == 0); + VERIFY(sa_replace_all_by_template_locked(hdl, sa_attrs, + count, tx) == 0); + if (znode_acl.z_acl_extern_obj) + VERIFY(0 == dmu_object_free(zfsvfs->z_os, + znode_acl.z_acl_extern_obj, tx)); + + zp->z_is_sa = B_TRUE; +done: + VOP_UNLOCK(ZTOV(zp), 0); +} + +void +zfs_sa_upgrade_txholds(dmu_tx_t *tx, znode_t *zp) +{ + if (!zp->z_zfsvfs->z_use_sa || zp->z_is_sa) + return; + + + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + + if (zfs_external_acl(zp)) { + dmu_tx_hold_free(tx, zfs_external_acl(zp), 0, + DMU_OBJECT_END);
+ } +} + +#endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c new file mode 100644 index 000000000000..7255a09f5757 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c @@ -0,0 +1,2853 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>. + * All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + * Copyright 2016 Nexenta Systems, Inc. All rights reserved. + */ + +/* Portions Copyright 2010 Robert Milkowski */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/sysmacros.h> +#include <sys/kmem.h> +#include <sys/acl.h> +#include <sys/vnode.h> +#include <sys/vfs.h> +#include <sys/mntent.h> +#include <sys/mount.h> +#include <sys/cmn_err.h> +#include <sys/zfs_znode.h> +#include <sys/zfs_dir.h> +#include <sys/zil.h> +#include <sys/fs/zfs.h> +#include <sys/dmu.h> +#include <sys/dsl_prop.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_deleg.h> +#include <sys/spa.h> +#include <sys/zap.h> +#include <sys/sa.h> +#include <sys/sa_impl.h> +#include <sys/varargs.h> +#include <sys/policy.h> +#include <sys/atomic.h> +#include <sys/zfs_ioctl.h> +#include <sys/zfs_ctldir.h> +#include <sys/zfs_fuid.h> +#include <sys/sunddi.h> +#include <sys/dnlc.h> +#include <sys/dmu_objset.h> +#include <sys/spa_boot.h> +#include <sys/jail.h> +#include <ufs/ufs/quota.h> +#include <sys/rmlock.h> + +#include "zfs_comutil.h" + +struct mtx zfs_debug_mtx; +MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF); + +SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system"); + +int zfs_super_owner; +SYSCTL_INT(_vfs_zfs, OID_AUTO, super_owner, CTLFLAG_RW, &zfs_super_owner, 0, + "File system owner can perform privileged operation on his file systems"); + +int zfs_debug_level; +SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RWTUN, &zfs_debug_level, 0, + "Debug level"); + +SYSCTL_NODE(_vfs_zfs, OID_AUTO, version, CTLFLAG_RD, 0, "ZFS versions"); +static int zfs_version_acl = ZFS_ACL_VERSION; +SYSCTL_INT(_vfs_zfs_version, OID_AUTO, acl, CTLFLAG_RD, &zfs_version_acl, 0, + "ZFS_ACL_VERSION"); +static int zfs_version_spa = SPA_VERSION; +SYSCTL_INT(_vfs_zfs_version, OID_AUTO, spa, CTLFLAG_RD, &zfs_version_spa, 0, + "SPA_VERSION"); +static int zfs_version_zpl = ZPL_VERSION; +SYSCTL_INT(_vfs_zfs_version, OID_AUTO, zpl, CTLFLAG_RD, &zfs_version_zpl, 0, + "ZPL_VERSION"); + +static int zfs_root_setvnode(zfsvfs_t *zfsvfs); +static void 
zfs_root_dropvnode(zfsvfs_t *zfsvfs); + +static int zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg); +static int zfs_mount(vfs_t *vfsp); +static int zfs_umount(vfs_t *vfsp, int fflag); +static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp); +static int zfs_statfs(vfs_t *vfsp, struct statfs *statp); +static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp); +static int zfs_sync(vfs_t *vfsp, int waitfor); +static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp, + struct ucred **credanonp, int *numsecflavors, int **secflavors); +static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp); +static void zfs_objset_close(zfsvfs_t *zfsvfs); +static void zfs_freevfs(vfs_t *vfsp); + +struct vfsops zfs_vfsops = { + .vfs_mount = zfs_mount, + .vfs_unmount = zfs_umount, + .vfs_root = zfs_root, + .vfs_statfs = zfs_statfs, + .vfs_vget = zfs_vget, + .vfs_sync = zfs_sync, + .vfs_checkexp = zfs_checkexp, + .vfs_fhtovp = zfs_fhtovp, + .vfs_quotactl = zfs_quotactl, +}; + +VFS_SET(zfs_vfsops, zfs, VFCF_JAIL | VFCF_DELEGADMIN); + +/* + * We need to keep a count of active fs's. + * This is necessary to prevent our module + * from being unloaded after a umount -f + */ +static uint32_t zfs_active_fs_count = 0; + +static int +zfs_getquota(zfsvfs_t *zfsvfs, uid_t id, int isgroup, struct dqblk64 *dqp) +{ + int error = 0; + char buf[32]; + int err; + uint64_t usedobj, quotaobj; + uint64_t quota, used = 0; + timespec_t now; + + usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT; + quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj; + + if (quotaobj == 0 || zfsvfs->z_replay) { + error = EINVAL; + goto done; + } + (void)sprintf(buf, "%llx", (longlong_t)id); + if ((error = zap_lookup(zfsvfs->z_os, quotaobj, + buf, sizeof(quota), 1, &quota)) != 0) { + dprintf("%s(%d): quotaobj lookup failed\n", __FUNCTION__, __LINE__); + goto done; + } + /* + * quota(8) uses bsoftlimit as "quota", and hardlimit as "limit". + * So we set them to be the same. + */ + dqp->dqb_bsoftlimit = dqp->dqb_bhardlimit = btodb(quota); + error = zap_lookup(zfsvfs->z_os, usedobj, buf, sizeof(used), 1, &used); + if (error && error != ENOENT) { + dprintf("%s(%d): usedobj failed; %d\n", __FUNCTION__, __LINE__, error); + goto done; + } + dqp->dqb_curblocks = btodb(used); + dqp->dqb_ihardlimit = dqp->dqb_isoftlimit = 0; + vfs_timestamp(&now); + /* + * Setting this to 0 causes FreeBSD quota(8) to print + * the number of days since the epoch, which isn't + * particularly useful.
+ */ + dqp->dqb_btime = dqp->dqb_itime = now.tv_sec; +done: + return (error); +} + +static int +zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg) +{ + zfsvfs_t *zfsvfs = vfsp->vfs_data; + struct thread *td; + int cmd, type, error = 0; + int bitsize; + uint64_t fuid; + zfs_userquota_prop_t quota_type; + struct dqblk64 dqblk = { 0 }; + + td = curthread; + cmd = cmds >> SUBCMDSHIFT; + type = cmds & SUBCMDMASK; + + ZFS_ENTER(zfsvfs); + if (id == -1) { + switch (type) { + case USRQUOTA: + id = td->td_ucred->cr_ruid; + break; + case GRPQUOTA: + id = td->td_ucred->cr_rgid; + break; + default: + error = EINVAL; + if (cmd == Q_QUOTAON || cmd == Q_QUOTAOFF) + vfs_unbusy(vfsp); + goto done; + } + } + /* + * Map BSD type to: + * ZFS_PROP_USERUSED, + * ZFS_PROP_USERQUOTA, + * ZFS_PROP_GROUPUSED, + * ZFS_PROP_GROUPQUOTA + */ + switch (cmd) { + case Q_SETQUOTA: + case Q_SETQUOTA32: + if (type == USRQUOTA) + quota_type = ZFS_PROP_USERQUOTA; + else if (type == GRPQUOTA) + quota_type = ZFS_PROP_GROUPQUOTA; + else + error = EINVAL; + break; + case Q_GETQUOTA: + case Q_GETQUOTA32: + if (type == USRQUOTA) + quota_type = ZFS_PROP_USERUSED; + else if (type == GRPQUOTA) + quota_type = ZFS_PROP_GROUPUSED; + else + error = EINVAL; + break; + } + + /* + * Depending on the cmd, we may need to get + * the ruid and domain (see fuidstr_to_sid?), + * the fuid (how?), or other information. + * Create fuid using zfs_fuid_create(zfsvfs, id, + * ZFS_OWNER or ZFS_GROUP, cr, &fuidp)? + * I think I can use just the id? + * + * Look at zfs_fuid_overquota() to look up a quota. + * zap_lookup(something, quotaobj, fuidstring, sizeof(long long), 1, &quota) + * + * See zfs_set_userquota() to set a quota. + */ + if ((u_int)type >= MAXQUOTAS) { + error = EINVAL; + goto done; + } + + switch (cmd) { + case Q_GETQUOTASIZE: + bitsize = 64; + error = copyout(&bitsize, arg, sizeof(int)); + break; + case Q_QUOTAON: + // As far as I can tell, you can't turn quotas on or off on zfs + error = 0; + vfs_unbusy(vfsp); + break; + case Q_QUOTAOFF: + error = ENOTSUP; + vfs_unbusy(vfsp); + break; + case Q_SETQUOTA: + error = copyin(arg, &dqblk, sizeof(dqblk)); + if (error == 0) + error = zfs_set_userquota(zfsvfs, quota_type, + "", id, dbtob(dqblk.dqb_bhardlimit)); + break; + case Q_GETQUOTA: + error = zfs_getquota(zfsvfs, id, type == GRPQUOTA, &dqblk); + if (error == 0) + error = copyout(&dqblk, arg, sizeof(dqblk)); + break; + default: + error = EINVAL; + break; + } +done: + ZFS_EXIT(zfsvfs); + return (error); +} + +/*ARGSUSED*/ +static int +zfs_sync(vfs_t *vfsp, int waitfor) +{ + + /* + * Data integrity is job one. We don't want a compromised kernel + * writing to the storage pool, so we never sync during panic. + */ + if (panicstr) + return (0); + + /* + * Ignore the system syncer. ZFS already commits async data + * at zfs_txg_timeout intervals. + */ + if (waitfor == MNT_LAZY) + return (0); + + if (vfsp != NULL) { + /* + * Sync a specific filesystem. + */ + zfsvfs_t *zfsvfs = vfsp->vfs_data; + dsl_pool_t *dp; + int error; + + error = vfs_stdsync(vfsp, waitfor); + if (error != 0) + return (error); + + ZFS_ENTER(zfsvfs); + dp = dmu_objset_pool(zfsvfs->z_os); + + /* + * If the system is shutting down, then skip any + * filesystems which may exist on a suspended pool. + */ + if (sys_shutdown && spa_suspended(dp->dp_spa)) { + ZFS_EXIT(zfsvfs); + return (0); + } + + if (zfsvfs->z_log != NULL) + zil_commit(zfsvfs->z_log, 0); + + ZFS_EXIT(zfsvfs); + } else { + /* + * Sync all ZFS filesystems. This is what happens when you + * run sync(1M).
Unlike other filesystems, ZFS honors the + * request by waiting for all pools to commit all dirty data. + */ + spa_sync_allpools(); + } + + return (0); +} + +#ifndef __FreeBSD_kernel__ +static int +zfs_create_unique_device(dev_t *dev) +{ + major_t new_major; + + do { + ASSERT3U(zfs_minor, <=, MAXMIN32); + minor_t start = zfs_minor; + do { + mutex_enter(&zfs_dev_mtx); + if (zfs_minor >= MAXMIN32) { + /* + * If we're still using the real major + * keep out of /dev/zfs and /dev/zvol minor + * number space. If we're using a getudev()'ed + * major number, we can use all of its minors. + */ + if (zfs_major == ddi_name_to_major(ZFS_DRIVER)) + zfs_minor = ZFS_MIN_MINOR; + else + zfs_minor = 0; + } else { + zfs_minor++; + } + *dev = makedevice(zfs_major, zfs_minor); + mutex_exit(&zfs_dev_mtx); + } while (vfs_devismounted(*dev) && zfs_minor != start); + if (zfs_minor == start) { + /* + * We are using all ~262,000 minor numbers for the + * current major number. Create a new major number. + */ + if ((new_major = getudev()) == (major_t)-1) { + cmn_err(CE_WARN, + "zfs_mount: Can't get unique major " + "device number."); + return (-1); + } + mutex_enter(&zfs_dev_mtx); + zfs_major = new_major; + zfs_minor = 0; + + mutex_exit(&zfs_dev_mtx); + } else { + break; + } + /* CONSTANTCONDITION */ + } while (1); + + return (0); +} +#endif /* !__FreeBSD_kernel__ */ + +static void +atime_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + if (newval == TRUE) { + zfsvfs->z_atime = TRUE; + zfsvfs->z_vfs->vfs_flag &= ~MNT_NOATIME; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0); + } else { + zfsvfs->z_atime = FALSE; + zfsvfs->z_vfs->vfs_flag |= MNT_NOATIME; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0); + } +} + +static void +xattr_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + if (newval == TRUE) { + /* XXX locking on vfs_flag? */ +#ifdef TODO + zfsvfs->z_vfs->vfs_flag |= VFS_XATTR; +#endif + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_XATTR, NULL, 0); + } else { + /* XXX locking on vfs_flag? */ +#ifdef TODO + zfsvfs->z_vfs->vfs_flag &= ~VFS_XATTR; +#endif + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_XATTR); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR, NULL, 0); + } +} + +static void +blksz_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os))); + ASSERT3U(newval, >=, SPA_MINBLOCKSIZE); + ASSERT(ISP2(newval)); + + zfsvfs->z_max_blksz = newval; + zfsvfs->z_vfs->mnt_stat.f_iosize = newval; +} + +static void +readonly_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + if (newval) { + /* XXX locking on vfs_flag? */ + zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0); + } else { + /* XXX locking on vfs_flag? 
*/ + zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0); + } +} + +static void +setuid_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + if (newval == FALSE) { + zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0); + } else { + zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0); + } +} + +static void +exec_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + if (newval == FALSE) { + zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0); + } else { + zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0); + } +} + +/* + * The nbmand mount option can be changed at mount time. + * We can't allow it to be toggled on live file systems or incorrect + * behavior may be seen from cifs clients + * + * This property isn't registered via dsl_prop_register(), but this callback + * will be called when a file system is first mounted + */ +static void +nbmand_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + if (newval == FALSE) { + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0); + } else { + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0); + } +} + +static void +snapdir_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + zfsvfs->z_show_ctldir = newval; +} + +static void +vscan_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + zfsvfs->z_vscan = newval; +} + +static void +acl_mode_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + zfsvfs->z_acl_mode = newval; +} + +static void +acl_inherit_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + zfsvfs->z_acl_inherit = newval; +} + +static int +zfs_register_callbacks(vfs_t *vfsp) +{ + struct dsl_dataset *ds = NULL; + objset_t *os = NULL; + zfsvfs_t *zfsvfs = NULL; + uint64_t nbmand; + boolean_t readonly = B_FALSE; + boolean_t do_readonly = B_FALSE; + boolean_t setuid = B_FALSE; + boolean_t do_setuid = B_FALSE; + boolean_t exec = B_FALSE; + boolean_t do_exec = B_FALSE; +#ifdef illumos + boolean_t devices = B_FALSE; + boolean_t do_devices = B_FALSE; +#endif + boolean_t xattr = B_FALSE; + boolean_t do_xattr = B_FALSE; + boolean_t atime = B_FALSE; + boolean_t do_atime = B_FALSE; + int error = 0; + + ASSERT(vfsp); + zfsvfs = vfsp->vfs_data; + ASSERT(zfsvfs); + os = zfsvfs->z_os; + + /* + * This function can be called for a snapshot when we update snapshot's + * mount point, which isn't really supported. + */ + if (dmu_objset_is_snapshot(os)) + return (EOPNOTSUPP); + + /* + * The act of registering our callbacks will destroy any mount + * options we may have. In order to enable temporary overrides + * of mount options, we stash away the current values and + * restore them after we register the callbacks. 
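+ * + * For example (illustrative): mounting with "-o ro" a dataset whose + * readonly property is off will register the callbacks (which apply the + * property value) and then re-invoke readonly_changed_cb() with the + * stashed override, so the temporary read-only request wins for this + * mount.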
+ */ + if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) || + !spa_writeable(dmu_objset_spa(os))) { + readonly = B_TRUE; + do_readonly = B_TRUE; + } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) { + readonly = B_FALSE; + do_readonly = B_TRUE; + } + if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) { + setuid = B_FALSE; + do_setuid = B_TRUE; + } else { + if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) { + setuid = B_FALSE; + do_setuid = B_TRUE; + } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) { + setuid = B_TRUE; + do_setuid = B_TRUE; + } + } + if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) { + exec = B_FALSE; + do_exec = B_TRUE; + } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) { + exec = B_TRUE; + do_exec = B_TRUE; + } + if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) { + xattr = B_FALSE; + do_xattr = B_TRUE; + } else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) { + xattr = B_TRUE; + do_xattr = B_TRUE; + } + if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) { + atime = B_FALSE; + do_atime = B_TRUE; + } else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) { + atime = B_TRUE; + do_atime = B_TRUE; + } + + /* + * We need to enter pool configuration here, so that we can use + * dsl_prop_get_int_ds() to handle the special nbmand property below. + * dsl_prop_get_integer() cannot be used, because it has to acquire + * spa_namespace_lock and we cannot do that because we already hold + * z_teardown_lock. The problem is that spa_write_cachefile() is called + * with spa_namespace_lock held and the function calls ZFS vnode + * operations to write the cache file and thus z_teardown_lock is + * acquired after spa_namespace_lock. + */ + ds = dmu_objset_ds(os); + dsl_pool_config_enter(dmu_objset_pool(os), FTAG); + + /* + * nbmand is a special property. It can only be changed at + * mount time. + * + * This is weird, but it is documented to only be changeable + * at mount time. + */ + if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) { + nbmand = B_FALSE; + } else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) { + nbmand = B_TRUE; + } else if ((error = dsl_prop_get_int_ds(ds, "nbmand", &nbmand)) != 0) { + dsl_pool_config_exit(dmu_objset_pool(os), FTAG); + return (error); + } + + /* + * Register property callbacks. + * + * It would probably be fine to just check for i/o error from + * the first prop_register(), but I guess I like to go + * overboard... + */ + error = dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs); +#ifdef illumos + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_DEVICES), devices_changed_cb, zfsvfs); +#endif + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs); + error = error ?
error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb, + zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zfsvfs); + dsl_pool_config_exit(dmu_objset_pool(os), FTAG); + if (error) + goto unregister; + + /* + * Invoke our callbacks to restore temporary mount options. + */ + if (do_readonly) + readonly_changed_cb(zfsvfs, readonly); + if (do_setuid) + setuid_changed_cb(zfsvfs, setuid); + if (do_exec) + exec_changed_cb(zfsvfs, exec); + if (do_xattr) + xattr_changed_cb(zfsvfs, xattr); + if (do_atime) + atime_changed_cb(zfsvfs, atime); + + nbmand_changed_cb(zfsvfs, nbmand); + + return (0); + +unregister: + dsl_prop_unregister_all(ds, zfsvfs); + return (error); +} + +static int +zfs_space_delta_cb(dmu_object_type_t bonustype, void *data, + uint64_t *userp, uint64_t *groupp) +{ + /* + * Is it a valid type of object to track? + */ + if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA) + return (SET_ERROR(ENOENT)); + + /* + * If we have a NULL data pointer + * then assume the id's aren't changing and + * return EEXIST to the dmu to let it know to + * use the same ids + */ + if (data == NULL) + return (SET_ERROR(EEXIST)); + + if (bonustype == DMU_OT_ZNODE) { + znode_phys_t *znp = data; + *userp = znp->zp_uid; + *groupp = znp->zp_gid; + } else { + int hdrsize; + sa_hdr_phys_t *sap = data; + sa_hdr_phys_t sa = *sap; + boolean_t swap = B_FALSE; + + ASSERT(bonustype == DMU_OT_SA); + + if (sa.sa_magic == 0) { + /* + * This should only happen for newly created + * files that haven't had the znode data filled + * in yet. + */ + *userp = 0; + *groupp = 0; + return (0); + } + if (sa.sa_magic == BSWAP_32(SA_MAGIC)) { + sa.sa_magic = SA_MAGIC; + sa.sa_layout_info = BSWAP_16(sa.sa_layout_info); + swap = B_TRUE; + } else { + VERIFY3U(sa.sa_magic, ==, SA_MAGIC); + } + + hdrsize = sa_hdrsize(&sa); + VERIFY3U(hdrsize, >=, sizeof (sa_hdr_phys_t)); + *userp = *((uint64_t *)((uintptr_t)data + hdrsize + + SA_UID_OFFSET)); + *groupp = *((uint64_t *)((uintptr_t)data + hdrsize + + SA_GID_OFFSET)); + if (swap) { + *userp = BSWAP_64(*userp); + *groupp = BSWAP_64(*groupp); + } + } + return (0); +} + +static void +fuidstr_to_sid(zfsvfs_t *zfsvfs, const char *fuidstr, + char *domainbuf, int buflen, uid_t *ridp) +{ + uint64_t fuid; + const char *domain; + + fuid = zfs_strtonum(fuidstr, NULL); + + domain = zfs_fuid_find_by_idx(zfsvfs, FUID_INDEX(fuid)); + if (domain) + (void) strlcpy(domainbuf, domain, buflen); + else + domainbuf[0] = '\0'; + *ridp = FUID_RID(fuid); +} + +static uint64_t +zfs_userquota_prop_to_obj(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type) +{ + switch (type) { + case ZFS_PROP_USERUSED: + return (DMU_USERUSED_OBJECT); + case ZFS_PROP_GROUPUSED: + return (DMU_GROUPUSED_OBJECT); + case ZFS_PROP_USERQUOTA: + return (zfsvfs->z_userquota_obj); + case ZFS_PROP_GROUPQUOTA: + return (zfsvfs->z_groupquota_obj); + } + return (0); +} + +int +zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, + uint64_t *cookiep, void *vbuf, uint64_t *bufsizep) +{ + int error; + zap_cursor_t zc; + zap_attribute_t za; + zfs_useracct_t *buf = vbuf; + uint64_t obj; + + if (!dmu_objset_userspace_present(zfsvfs->z_os)) + return (SET_ERROR(ENOTSUP)); + + obj = zfs_userquota_prop_to_obj(zfsvfs, type); + if (obj == 0) { + *bufsizep = 0; + return (0); + } + + for (zap_cursor_init_serialized(&zc, zfsvfs->z_os, obj, *cookiep); + (error = zap_cursor_retrieve(&zc, &za)) == 0; + zap_cursor_advance(&zc)) { + if ((uintptr_t)buf - 
(uintptr_t)vbuf + sizeof (zfs_useracct_t) > + *bufsizep) + break; + + fuidstr_to_sid(zfsvfs, za.za_name, + buf->zu_domain, sizeof (buf->zu_domain), &buf->zu_rid); + + buf->zu_space = za.za_first_integer; + buf++; + } + if (error == ENOENT) + error = 0; + + ASSERT3U((uintptr_t)buf - (uintptr_t)vbuf, <=, *bufsizep); + *bufsizep = (uintptr_t)buf - (uintptr_t)vbuf; + *cookiep = zap_cursor_serialize(&zc); + zap_cursor_fini(&zc); + return (error); +} + +/* + * buf must be big enough (eg, 32 bytes) + */ +static int +id_to_fuidstr(zfsvfs_t *zfsvfs, const char *domain, uid_t rid, + char *buf, boolean_t addok) +{ + uint64_t fuid; + int domainid = 0; + + if (domain && domain[0]) { + domainid = zfs_fuid_find_by_domain(zfsvfs, domain, NULL, addok); + if (domainid == -1) + return (SET_ERROR(ENOENT)); + } + fuid = FUID_ENCODE(domainid, rid); + (void) sprintf(buf, "%llx", (longlong_t)fuid); + return (0); +} + +int +zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, + const char *domain, uint64_t rid, uint64_t *valp) +{ + char buf[32]; + int err; + uint64_t obj; + + *valp = 0; + + if (!dmu_objset_userspace_present(zfsvfs->z_os)) + return (SET_ERROR(ENOTSUP)); + + obj = zfs_userquota_prop_to_obj(zfsvfs, type); + if (obj == 0) + return (0); + + err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_FALSE); + if (err) + return (err); + + err = zap_lookup(zfsvfs->z_os, obj, buf, 8, 1, valp); + if (err == ENOENT) + err = 0; + return (err); +} + +int +zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, + const char *domain, uint64_t rid, uint64_t quota) +{ + char buf[32]; + int err; + dmu_tx_t *tx; + uint64_t *objp; + boolean_t fuid_dirtied; + + if (type != ZFS_PROP_USERQUOTA && type != ZFS_PROP_GROUPQUOTA) + return (SET_ERROR(EINVAL)); + + if (zfsvfs->z_version < ZPL_VERSION_USERSPACE) + return (SET_ERROR(ENOTSUP)); + + objp = (type == ZFS_PROP_USERQUOTA) ? &zfsvfs->z_userquota_obj : + &zfsvfs->z_groupquota_obj; + + err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_TRUE); + if (err) + return (err); + fuid_dirtied = zfsvfs->z_fuid_dirty; + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_zap(tx, *objp ? *objp : DMU_NEW_OBJECT, B_TRUE, NULL); + if (*objp == 0) { + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE, + zfs_userquota_prop_prefixes[type]); + } + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) { + dmu_tx_abort(tx); + return (err); + } + + mutex_enter(&zfsvfs->z_lock); + if (*objp == 0) { + *objp = zap_create(zfsvfs->z_os, DMU_OT_USERGROUP_QUOTA, + DMU_OT_NONE, 0, tx); + VERIFY(0 == zap_add(zfsvfs->z_os, MASTER_NODE_OBJ, + zfs_userquota_prop_prefixes[type], 8, 1, objp, tx)); + } + mutex_exit(&zfsvfs->z_lock); + + if (quota == 0) { + err = zap_remove(zfsvfs->z_os, *objp, buf, tx); + if (err == ENOENT) + err = 0; + } else { + err = zap_update(zfsvfs->z_os, *objp, buf, 8, 1, &quota, tx); + } + ASSERT(err == 0); + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + dmu_tx_commit(tx); + return (err); +} + +boolean_t +zfs_fuid_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid) +{ + char buf[32]; + uint64_t used, quota, usedobj, quotaobj; + int err; + + usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT; + quotaobj = isgroup ?
zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj; + + if (quotaobj == 0 || zfsvfs->z_replay) + return (B_FALSE); + + (void) sprintf(buf, "%llx", (longlong_t)fuid); + err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, &quota); + if (err != 0) + return (B_FALSE); + + err = zap_lookup(zfsvfs->z_os, usedobj, buf, 8, 1, &used); + if (err != 0) + return (B_FALSE); + return (used >= quota); +} + +boolean_t +zfs_owner_overquota(zfsvfs_t *zfsvfs, znode_t *zp, boolean_t isgroup) +{ + uint64_t fuid; + uint64_t quotaobj; + + quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj; + + fuid = isgroup ? zp->z_gid : zp->z_uid; + + if (quotaobj == 0 || zfsvfs->z_replay) + return (B_FALSE); + + return (zfs_fuid_overquota(zfsvfs, isgroup, fuid)); +} + +/* + * Associate this zfsvfs with the given objset, which must be owned. + * This will cache a bunch of on-disk state from the objset in the + * zfsvfs. + */ +static int +zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os) +{ + int error; + uint64_t val; + + zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE; + zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE; + zfsvfs->z_os = os; + + error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version); + if (error != 0) + return (error); + if (zfsvfs->z_version > + zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) { + (void) printf("Can't mount a version %lld file system " + "on a version %lld pool. Pool must be upgraded to mount " + "this file system.\n", (u_longlong_t)zfsvfs->z_version, + (u_longlong_t)spa_version(dmu_objset_spa(os))); + return (SET_ERROR(ENOTSUP)); + } + error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val); + if (error != 0) + return (error); + zfsvfs->z_norm = (int)val; + + error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val); + if (error != 0) + return (error); + zfsvfs->z_utf8 = (val != 0); + + error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val); + if (error != 0) + return (error); + zfsvfs->z_case = (uint_t)val; + + /* + * Fold case on file systems that are always or sometimes case + * insensitive. 
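+ * As an informal sketch of the effect (the file names here are
+ * purely illustrative), once the code below ORs U8_TEXTPREP_TOUPPER
+ * into z_norm for case=insensitive or case=mixed:
+ *
+ *	lookup("README") and lookup("readme")
+ *	    -> both normalize to the same key
+ *	    -> both resolve to the same directory entry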
+ */ + if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE || + zfsvfs->z_case == ZFS_CASE_MIXED) + zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; + + zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); + zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); + + uint64_t sa_obj = 0; + if (zfsvfs->z_use_sa) { + /* should either have both of these objects or none */ + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, + &sa_obj); + if (error != 0) + return (error); + } + + error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, + &zfsvfs->z_attr_table); + if (error != 0) + return (error); + + if (zfsvfs->z_version >= ZPL_VERSION_SA) + sa_register_update_callback(os, zfs_sa_upgrade); + + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, + &zfsvfs->z_root); + if (error != 0) + return (error); + ASSERT(zfsvfs->z_root != 0); + + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1, + &zfsvfs->z_unlinkedobj); + if (error != 0) + return (error); + + error = zap_lookup(os, MASTER_NODE_OBJ, + zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA], + 8, 1, &zfsvfs->z_userquota_obj); + if (error == ENOENT) + zfsvfs->z_userquota_obj = 0; + else if (error != 0) + return (error); + + error = zap_lookup(os, MASTER_NODE_OBJ, + zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA], + 8, 1, &zfsvfs->z_groupquota_obj); + if (error == ENOENT) + zfsvfs->z_groupquota_obj = 0; + else if (error != 0) + return (error); + + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1, + &zfsvfs->z_fuid_obj); + if (error == ENOENT) + zfsvfs->z_fuid_obj = 0; + else if (error != 0) + return (error); + + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1, + &zfsvfs->z_shares_dir); + if (error == ENOENT) + zfsvfs->z_shares_dir = 0; + else if (error != 0) + return (error); + + /* + * Only use the name cache if we are looking for a + * name on a file system that does not require normalization + * or case folding. We can also look there if we happen to be + * on a non-normalizing, mixed sensitivity file system IF we + * are looking for the exact name (which is always the case on + * FreeBSD). + */ + zfsvfs->z_use_namecache = !zfsvfs->z_norm || + ((zfsvfs->z_case == ZFS_CASE_MIXED) && + !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER)); + + return (0); +} + +#if defined(__FreeBSD__) +taskq_t *zfsvfs_taskq; + +static void +zfsvfs_task_unlinked_drain(void *context, int pending __unused) +{ + + zfs_unlinked_drain((zfsvfs_t *)context); +} +#endif + +int +zfsvfs_create(const char *osname, zfsvfs_t **zfvp) +{ + objset_t *os; + zfsvfs_t *zfsvfs; + int error; + + /* + * XXX: Fix struct statfs so this isn't necessary! + * + * The 'osname' is used as the filesystem's special node, which means + * it must fit in statfs.f_mntfromname, or else it can't be + * enumerated, so libzfs_mnttab_find() returns NULL, which causes + * 'zfs unmount' to think it's not mounted when it is. + */ + if (strlen(osname) >= MNAMELEN) + return (SET_ERROR(ENAMETOOLONG)); + + zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); + + /* + * We claim to always be readonly so we can open snapshots; + * other ZPL code will prevent us from writing to snapshots. 
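+ * (The B_TRUE readonly argument passed to dmu_objset_own() below is
+ * that claim; write access is policed separately at the ZPL layer.)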
+ */ + + error = dmu_objset_own(osname, DMU_OST_ZFS, B_TRUE, zfsvfs, &os); + if (error != 0) { + kmem_free(zfsvfs, sizeof (zfsvfs_t)); + return (error); + } + + error = zfsvfs_create_impl(zfvp, zfsvfs, os); + if (error != 0) { + dmu_objset_disown(os, zfsvfs); + } + return (error); +} + + +int +zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os) +{ + int error; + + zfsvfs->z_vfs = NULL; + zfsvfs->z_parent = zfsvfs; + + mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), + offsetof(znode_t, z_link_node)); +#if defined(__FreeBSD__) + TASK_INIT(&zfsvfs->z_unlinked_drain_task, 0, + zfsvfs_task_unlinked_drain, zfsvfs); +#endif +#ifdef DIAGNOSTIC + rrm_init(&zfsvfs->z_teardown_lock, B_TRUE); +#else + rrm_init(&zfsvfs->z_teardown_lock, B_FALSE); +#endif + rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL); + rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL); + for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++) + mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); + + rm_init(&zfsvfs->z_rootvnodelock, "zfs root vnode lock"); + + error = zfsvfs_init(zfsvfs, os); + if (error != 0) { + *zfvp = NULL; + kmem_free(zfsvfs, sizeof (zfsvfs_t)); + return (error); + } + + *zfvp = zfsvfs; + return (0); +} + +static int +zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) +{ + int error; + + error = zfs_register_callbacks(zfsvfs->z_vfs); + if (error) + return (error); + + zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data); + + /* + * If we are not mounting (ie: online recv), then we don't + * have to worry about replaying the log as we blocked all + * operations out since we closed the ZIL. + */ + if (mounting) { + boolean_t readonly; + + /* + * During replay we remove the read only flag to + * allow replays to succeed. + */ + readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY; + if (readonly != 0) + zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; + else + zfs_unlinked_drain(zfsvfs); + + /* + * Parse and replay the intent log. + * + * Because of ziltest, this must be done after + * zfs_unlinked_drain(). (Further note: ziltest + * doesn't use readonly mounts, where + * zfs_unlinked_drain() isn't called.) This is because + * ziltest causes spa_sync() to think it's committed, + * but actually it is not, so the intent log contains + * many txg's worth of changes. + * + * In particular, if object N is in the unlinked set in + * the last txg to actually sync, then it could be + * actually freed in a later txg and then reallocated + * in a yet later txg. This would write a "create + * object N" record to the intent log. Normally, this + * would be fine because the spa_sync() would have + * written out the fact that object N is free, before + * we could write the "create object N" intent log + * record. + * + * But when we are in ziltest mode, we advance the "open + * txg" without actually spa_sync()-ing the changes to + * disk. So we would see that object N is still + * allocated and in the unlinked set, and there is an + * intent log record saying to allocate it. + */ + if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) { + if (zil_replay_disable) { + zil_destroy(zfsvfs->z_log, B_FALSE); + } else { + zfsvfs->z_replay = B_TRUE; + zil_replay(zfsvfs->z_os, zfsvfs, + zfs_replay_vector); + zfsvfs->z_replay = B_FALSE; + } + } + zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */ + } + + /* + * Set the objset user_ptr to track its zfsvfs. 
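+ * (This back-pointer is what zfs_get_vfs_flag_unmounted() and
+ * zfs_umount() later retrieve with dmu_objset_get_user(); it is
+ * cleared again under the same os_user_ptr_lock at unmount time.)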
+ */ + mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); + dmu_objset_set_user(zfsvfs->z_os, zfsvfs); + mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); + + return (0); +} + +extern krwlock_t zfsvfs_lock; /* in zfs_znode.c */ + +void +zfsvfs_free(zfsvfs_t *zfsvfs) +{ + int i; + + /* + * This is a barrier to prevent the filesystem from going away in + * zfs_znode_move() until we can safely ensure that the filesystem is + * not unmounted. We consider the filesystem valid before the barrier + * and invalid after the barrier. + */ + rw_enter(&zfsvfs_lock, RW_READER); + rw_exit(&zfsvfs_lock); + + rm_destroy(&zfsvfs->z_rootvnodelock); + + zfs_fuid_destroy(zfsvfs); + + mutex_destroy(&zfsvfs->z_znodes_lock); + mutex_destroy(&zfsvfs->z_lock); + list_destroy(&zfsvfs->z_all_znodes); + rrm_destroy(&zfsvfs->z_teardown_lock); + rw_destroy(&zfsvfs->z_teardown_inactive_lock); + rw_destroy(&zfsvfs->z_fuid_lock); + for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) + mutex_destroy(&zfsvfs->z_hold_mtx[i]); + kmem_free(zfsvfs, sizeof (zfsvfs_t)); +} + +static void +zfs_set_fuid_feature(zfsvfs_t *zfsvfs) +{ + zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); + if (zfsvfs->z_vfs) { + if (zfsvfs->z_use_fuids) { + vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE); + } else { + vfs_clear_feature(zfsvfs->z_vfs, VFSFT_XVATTR); + vfs_clear_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS); + vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS); + vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE); + vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER); + vfs_clear_feature(zfsvfs->z_vfs, VFSFT_REPARSE); + } + } + zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); +} + +static int +zfs_domount(vfs_t *vfsp, char *osname) +{ + uint64_t recordsize, fsid_guid; + int error = 0; + zfsvfs_t *zfsvfs; + vnode_t *vp; + + ASSERT(vfsp); + ASSERT(osname); + + error = zfsvfs_create(osname, &zfsvfs); + if (error) + return (error); + zfsvfs->z_vfs = vfsp; + +#ifdef illumos + /* Initialize the generic filesystem structure. */ + vfsp->vfs_bcount = 0; + vfsp->vfs_data = NULL; + + if (zfs_create_unique_device(&mount_dev) == -1) { + error = SET_ERROR(ENODEV); + goto out; + } + ASSERT(vfs_devismounted(mount_dev) == 0); +#endif + + if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize, + NULL)) + goto out; + zfsvfs->z_vfs->vfs_bsize = SPA_MINBLOCKSIZE; + zfsvfs->z_vfs->mnt_stat.f_iosize = recordsize; + + vfsp->vfs_data = zfsvfs; + vfsp->mnt_flag |= MNT_LOCAL; + vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED; + vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES; + vfsp->mnt_kern_flag |= MNTK_EXTENDED_SHARED; + vfsp->mnt_kern_flag |= MNTK_NO_IOPF; /* vn_io_fault can be used */ + + /* + * The fsid is 64 bits, composed of an 8-bit fs type, which + * separates our fsid from any other filesystem types, and a + * 56-bit objset unique ID. The objset unique ID is unique to + * all objsets open on this system, provided by unique_create(). + * The 8-bit fs type must be put in the low bits of fsid[1] + * because that's where other Solaris filesystems put it. 
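+ * A rough sketch of the assembly performed below:
+ *
+ *	val[0] = (uint32_t)fsid_guid;
+ *	val[1] = ((fsid_guid >> 32) << 8) | (vfc_typenum & 0xFF);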
+ */ + fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os); + ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0); + vfsp->vfs_fsid.val[0] = fsid_guid; + vfsp->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) | + vfsp->mnt_vfc->vfc_typenum & 0xFF; + + /* + * Set features for file system. + */ + zfs_set_fuid_feature(zfsvfs); + if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { + vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS); + vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE); + vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE); + } else if (zfsvfs->z_case == ZFS_CASE_MIXED) { + vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS); + vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE); + } + vfs_set_feature(vfsp, VFSFT_ZEROCOPY_SUPPORTED); + + if (dmu_objset_is_snapshot(zfsvfs->z_os)) { + uint64_t pval; + + atime_changed_cb(zfsvfs, B_FALSE); + readonly_changed_cb(zfsvfs, B_TRUE); + if (error = dsl_prop_get_integer(osname, "xattr", &pval, NULL)) + goto out; + xattr_changed_cb(zfsvfs, pval); + zfsvfs->z_issnap = B_TRUE; + zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED; + + mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); + dmu_objset_set_user(zfsvfs->z_os, zfsvfs); + mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); + } else { + error = zfsvfs_setup(zfsvfs, B_TRUE); + } + + vfs_mountedfrom(vfsp, osname); + + if (!zfsvfs->z_issnap) + zfsctl_create(zfsvfs); +out: + if (error) { + dmu_objset_disown(zfsvfs->z_os, zfsvfs); + zfsvfs_free(zfsvfs); + } else { + atomic_inc_32(&zfs_active_fs_count); + } + + return (error); +} + +void +zfs_unregister_callbacks(zfsvfs_t *zfsvfs) +{ + objset_t *os = zfsvfs->z_os; + + if (!dmu_objset_is_snapshot(os)) + dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs); +} + +#ifdef SECLABEL +/* + * Convert a decimal digit string to a uint64_t integer. + */ +static int +str_to_uint64(char *str, uint64_t *objnum) +{ + uint64_t num = 0; + + while (*str) { + if (*str < '0' || *str > '9') + return (SET_ERROR(EINVAL)); + + num = num*10 + *str++ - '0'; + } + + *objnum = num; + return (0); +} + +/* + * The boot path passed from the boot loader is in the form of + * "rootpool-name/root-filesystem-object-number'. Convert this + * string to a dataset name: "rootpool-name/root-filesystem-name". + */ +static int +zfs_parse_bootfs(char *bpath, char *outpath) +{ + char *slashp; + uint64_t objnum; + int error; + + if (*bpath == 0 || *bpath == '/') + return (SET_ERROR(EINVAL)); + + (void) strcpy(outpath, bpath); + + slashp = strchr(bpath, '/'); + + /* if no '/', just return the pool name */ + if (slashp == NULL) { + return (0); + } + + /* if not a number, just return the root dataset name */ + if (str_to_uint64(slashp+1, &objnum)) { + return (0); + } + + *slashp = '\0'; + error = dsl_dsobj_to_dsname(bpath, objnum, outpath); + *slashp = '/'; + + return (error); +} + +/* + * Check that the hex label string is appropriate for the dataset being + * mounted into the global_zone proper. + * + * Return an error if the hex label string is not default or + * admin_low/admin_high. For admin_low labels, the corresponding + * dataset must be readonly. + */ +int +zfs_check_global_label(const char *dsname, const char *hexsl) +{ + if (strcasecmp(hexsl, ZFS_MLSLABEL_DEFAULT) == 0) + return (0); + if (strcasecmp(hexsl, ADMIN_HIGH) == 0) + return (0); + if (strcasecmp(hexsl, ADMIN_LOW) == 0) { + /* must be readonly */ + uint64_t rdonly; + + if (dsl_prop_get_integer(dsname, + zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL)) + return (SET_ERROR(EACCES)); + return (rdonly ? 
0 : EACCES); + } + return (SET_ERROR(EACCES)); +} + +/* + * Determine whether the mount is allowed according to MAC check. + * by comparing (where appropriate) label of the dataset against + * the label of the zone being mounted into. If the dataset has + * no label, create one. + * + * Returns 0 if access allowed, error otherwise (e.g. EACCES) + */ +static int +zfs_mount_label_policy(vfs_t *vfsp, char *osname) +{ + int error, retv; + zone_t *mntzone = NULL; + ts_label_t *mnt_tsl; + bslabel_t *mnt_sl; + bslabel_t ds_sl; + char ds_hexsl[MAXNAMELEN]; + + retv = EACCES; /* assume the worst */ + + /* + * Start by getting the dataset label if it exists. + */ + error = dsl_prop_get(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL), + 1, sizeof (ds_hexsl), &ds_hexsl, NULL); + if (error) + return (SET_ERROR(EACCES)); + + /* + * If labeling is NOT enabled, then disallow the mount of datasets + * which have a non-default label already. No other label checks + * are needed. + */ + if (!is_system_labeled()) { + if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) + return (0); + return (SET_ERROR(EACCES)); + } + + /* + * Get the label of the mountpoint. If mounting into the global + * zone (i.e. mountpoint is not within an active zone and the + * zoned property is off), the label must be default or + * admin_low/admin_high only; no other checks are needed. + */ + mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE); + if (mntzone->zone_id == GLOBAL_ZONEID) { + uint64_t zoned; + + zone_rele(mntzone); + + if (dsl_prop_get_integer(osname, + zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL)) + return (SET_ERROR(EACCES)); + if (!zoned) + return (zfs_check_global_label(osname, ds_hexsl)); + else + /* + * This is the case of a zone dataset being mounted + * initially, before the zone has been fully created; + * allow this mount into global zone. + */ + return (0); + } + + mnt_tsl = mntzone->zone_slabel; + ASSERT(mnt_tsl != NULL); + label_hold(mnt_tsl); + mnt_sl = label2bslabel(mnt_tsl); + + if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) { + /* + * The dataset doesn't have a real label, so fabricate one. + */ + char *str = NULL; + + if (l_to_str_internal(mnt_sl, &str) == 0 && + dsl_prop_set_string(osname, + zfs_prop_to_name(ZFS_PROP_MLSLABEL), + ZPROP_SRC_LOCAL, str) == 0) + retv = 0; + if (str != NULL) + kmem_free(str, strlen(str) + 1); + } else if (hexstr_to_label(ds_hexsl, &ds_sl) == 0) { + /* + * Now compare labels to complete the MAC check. If the + * labels are equal then allow access. If the mountpoint + * label dominates the dataset label, allow readonly access. + * Otherwise, access is denied. + */ + if (blequal(mnt_sl, &ds_sl)) + retv = 0; + else if (bldominates(mnt_sl, &ds_sl)) { + vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0); + retv = 0; + } + } + + label_rele(mnt_tsl); + zone_rele(mntzone); + return (retv); +} +#endif /* SECLABEL */ + +#ifdef OPENSOLARIS_MOUNTROOT +static int +zfs_mountroot(vfs_t *vfsp, enum whymountroot why) +{ + int error = 0; + static int zfsrootdone = 0; + zfsvfs_t *zfsvfs = NULL; + znode_t *zp = NULL; + vnode_t *vp = NULL; + char *zfs_bootfs; + char *zfs_devid; + + ASSERT(vfsp); + + /* + * The filesystem that we mount as root is defined in the + * boot property "zfs-bootfs" with a format of + * "poolname/root-dataset-objnum". 
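+ * For example (pool name and object number are hypothetical), a
+ * boot property of "rpool/85" is resolved by zfs_parse_bootfs()
+ * below, via dsl_dsobj_to_dsname(), to the name of dataset object
+ * 85, e.g. "rpool/ROOT/zfsroot".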
+ */ + if (why == ROOT_INIT) { + if (zfsrootdone++) + return (SET_ERROR(EBUSY)); + /* + * the process of doing a spa_load will require the + * clock to be set before we could (for example) do + * something better by looking at the timestamp on + * an uberblock, so just set it to -1. + */ + clkset(-1); + + if ((zfs_bootfs = spa_get_bootprop("zfs-bootfs")) == NULL) { + cmn_err(CE_NOTE, "spa_get_bootfs: can not get " + "bootfs name"); + return (SET_ERROR(EINVAL)); + } + zfs_devid = spa_get_bootprop("diskdevid"); + error = spa_import_rootpool(rootfs.bo_name, zfs_devid); + if (zfs_devid) + spa_free_bootprop(zfs_devid); + if (error) { + spa_free_bootprop(zfs_bootfs); + cmn_err(CE_NOTE, "spa_import_rootpool: error %d", + error); + return (error); + } + if (error = zfs_parse_bootfs(zfs_bootfs, rootfs.bo_name)) { + spa_free_bootprop(zfs_bootfs); + cmn_err(CE_NOTE, "zfs_parse_bootfs: error %d", + error); + return (error); + } + + spa_free_bootprop(zfs_bootfs); + + if (error = vfs_lock(vfsp)) + return (error); + + if (error = zfs_domount(vfsp, rootfs.bo_name)) { + cmn_err(CE_NOTE, "zfs_domount: error %d", error); + goto out; + } + + zfsvfs = (zfsvfs_t *)vfsp->vfs_data; + ASSERT(zfsvfs); + if (error = zfs_zget(zfsvfs, zfsvfs->z_root, &zp)) { + cmn_err(CE_NOTE, "zfs_zget: error %d", error); + goto out; + } + + vp = ZTOV(zp); + mutex_enter(&vp->v_lock); + vp->v_flag |= VROOT; + mutex_exit(&vp->v_lock); + rootvp = vp; + + /* + * Leave rootvp held. The root file system is never unmounted. + */ + + vfs_add((struct vnode *)0, vfsp, + (vfsp->vfs_flag & VFS_RDONLY) ? MS_RDONLY : 0); +out: + vfs_unlock(vfsp); + return (error); + } else if (why == ROOT_REMOUNT) { + readonly_changed_cb(vfsp->vfs_data, B_FALSE); + vfsp->vfs_flag |= VFS_REMOUNT; + + /* refresh mount options */ + zfs_unregister_callbacks(vfsp->vfs_data); + return (zfs_register_callbacks(vfsp)); + + } else if (why == ROOT_UNMOUNT) { + zfs_unregister_callbacks((zfsvfs_t *)vfsp->vfs_data); + (void) zfs_sync(vfsp, 0, 0); + return (0); + } + + /* + * if "why" is equal to anything else other than ROOT_INIT, + * ROOT_REMOUNT, or ROOT_UNMOUNT, we do not support it. + */ + return (SET_ERROR(ENOTSUP)); +} +#endif /* OPENSOLARIS_MOUNTROOT */ + +static int +getpoolname(const char *osname, char *poolname) +{ + char *p; + + p = strchr(osname, '/'); + if (p == NULL) { + if (strlen(osname) >= MAXNAMELEN) + return (ENAMETOOLONG); + (void) strcpy(poolname, osname); + } else { + if (p - osname >= MAXNAMELEN) + return (ENAMETOOLONG); + (void) strncpy(poolname, osname, p - osname); + poolname[p - osname] = '\0'; + } + return (0); +} + +/*ARGSUSED*/ +static int +zfs_mount(vfs_t *vfsp) +{ + kthread_t *td = curthread; + vnode_t *mvp = vfsp->mnt_vnodecovered; + cred_t *cr = td->td_ucred; + char *osname; + int error = 0; + int canwrite; + +#ifdef illumos + if (mvp->v_type != VDIR) + return (SET_ERROR(ENOTDIR)); + + mutex_enter(&mvp->v_lock); + if ((uap->flags & MS_REMOUNT) == 0 && + (uap->flags & MS_OVERLAY) == 0 && + (mvp->v_count != 1 || (mvp->v_flag & VROOT))) { + mutex_exit(&mvp->v_lock); + return (SET_ERROR(EBUSY)); + } + mutex_exit(&mvp->v_lock); + + /* + * ZFS does not support passing unparsed data in via MS_DATA. + * Users should use the MS_OPTIONSTR interface; this means + * that all option parsing is already done and the options struct + * can be interrogated. + */ + if ((uap->flags & MS_DATA) && uap->datalen > 0) + return (SET_ERROR(EINVAL)); + + /* + * Get the objset name (the "special" mount argument). 
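+ * (On the FreeBSD side of the #else below, the same name arrives
+ * as the "from" mount option and is fetched with vfs_getopt().)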
+ */ + if (error = pn_get(uap->spec, fromspace, &spn)) + return (error); + + osname = spn.pn_path; +#else /* !illumos */ + if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&osname, NULL)) + return (SET_ERROR(EINVAL)); + + /* + * If full-owner-access is enabled and delegated administration is + * turned on, we must set nosuid. + */ + if (zfs_super_owner && + dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != ECANCELED) { + secpolicy_fs_mount_clearopts(cr, vfsp); + } +#endif /* illumos */ + + /* + * Check for mount privilege? + * + * If we don't have privilege then see if + * we have local permission to allow it + */ + error = secpolicy_fs_mount(cr, mvp, vfsp); + if (error) { + if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != 0) + goto out; + + if (!(vfsp->vfs_flag & MS_REMOUNT)) { + vattr_t vattr; + + /* + * Make sure user is the owner of the mount point + * or has sufficient privileges. + */ + + vattr.va_mask = AT_UID; + + vn_lock(mvp, LK_SHARED | LK_RETRY); + if (VOP_GETATTR(mvp, &vattr, cr)) { + VOP_UNLOCK(mvp, 0); + goto out; + } + + if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 && + VOP_ACCESS(mvp, VWRITE, cr, td) != 0) { + VOP_UNLOCK(mvp, 0); + goto out; + } + VOP_UNLOCK(mvp, 0); + } + + secpolicy_fs_mount_clearopts(cr, vfsp); + } + + /* + * Refuse to mount a filesystem if we are in a local zone and the + * dataset is not visible. + */ + if (!INGLOBALZONE(curthread) && + (!zone_dataset_visible(osname, &canwrite) || !canwrite)) { + error = SET_ERROR(EPERM); + goto out; + } + +#ifdef SECLABEL + error = zfs_mount_label_policy(vfsp, osname); + if (error) + goto out; +#endif + + vfsp->vfs_flag |= MNT_NFS4ACLS; + + /* + * When doing a remount, we simply refresh our temporary properties + * according to those options set in the current VFS options. + */ + if (vfsp->vfs_flag & MS_REMOUNT) { + zfsvfs_t *zfsvfs = vfsp->vfs_data; + + /* + * Refresh mount options with z_teardown_lock blocking I/O while + * the filesystem is in an inconsistent state. + * The lock also serializes this code with filesystem + * manipulations between entry to zfs_suspend_fs() and return + * from zfs_resume_fs(). + */ + rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); + zfs_unregister_callbacks(zfsvfs); + error = zfs_register_callbacks(vfsp); + rrm_exit(&zfsvfs->z_teardown_lock, FTAG); + goto out; + } + + /* Initial root mount: try hard to import the requested root pool. */ + if ((vfsp->vfs_flag & MNT_ROOTFS) != 0 && + (vfsp->vfs_flag & MNT_UPDATE) == 0) { + char pname[MAXNAMELEN]; + + error = getpoolname(osname, pname); + if (error == 0) + error = spa_import_rootpool(pname); + if (error) + goto out; + } + DROP_GIANT(); + error = zfs_domount(vfsp, osname); + PICKUP_GIANT(); + + if (error == 0) + zfs_root_setvnode((zfsvfs_t *)vfsp->vfs_data); + +#ifdef illumos + /* + * Add an extra VFS_HOLD on our parent vfs so that it can't + * disappear due to a forced unmount. + */ + if (error == 0 && ((zfsvfs_t *)vfsp->vfs_data)->z_issnap) + VFS_HOLD(mvp->v_vfsp); +#endif + +out: + return (error); +} + +static int +zfs_statfs(vfs_t *vfsp, struct statfs *statp) +{ + zfsvfs_t *zfsvfs = vfsp->vfs_data; + uint64_t refdbytes, availbytes, usedobjs, availobjs; + + statp->f_version = STATFS_VERSION; + + ZFS_ENTER(zfsvfs); + + dmu_objset_space(zfsvfs->z_os, + &refdbytes, &availbytes, &usedobjs, &availobjs); + + /* + * The underlying storage pool actually uses multiple block sizes. + * We report the fragsize as the smallest block size we support, + * and we report our blocksize as the filesystem's maximum blocksize. 
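+ * As a concrete sketch of what the code below reports
+ * (SPA_MINBLOCKSIZE is 512 bytes, i.e. SPA_MINBLOCKSHIFT is 9):
+ *
+ *	f_bsize  = 512
+ *	f_blocks = (refdbytes + availbytes) >> 9
+ *
+ * i.e. totals counted in 512-byte fragments, while f_iosize carries
+ * the recordsize cached at mount time.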
+ */ + statp->f_bsize = SPA_MINBLOCKSIZE; + statp->f_iosize = zfsvfs->z_vfs->mnt_stat.f_iosize; + + /* + * The following report "total" blocks of various kinds in the + * file system, but reported in terms of f_frsize - the + * "fragment" size. + */ + + statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT; + statp->f_bfree = availbytes / statp->f_bsize; + statp->f_bavail = statp->f_bfree; /* no root reservation */ + + /* + * statvfs() should really be called statufs(), because it assumes + * static metadata. ZFS doesn't preallocate files, so the best + * we can do is report the max that could possibly fit in f_files, + * and that minus the number actually used in f_ffree. + * For f_ffree, report the smaller of the number of object available + * and the number of blocks (each object will take at least a block). + */ + statp->f_ffree = MIN(availobjs, statp->f_bfree); + statp->f_files = statp->f_ffree + usedobjs; + + /* + * We're a zfs filesystem. + */ + (void) strlcpy(statp->f_fstypename, "zfs", sizeof(statp->f_fstypename)); + + strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname, + sizeof(statp->f_mntfromname)); + strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname, + sizeof(statp->f_mntonname)); + + statp->f_namemax = MAXNAMELEN - 1; + + ZFS_EXIT(zfsvfs); + return (0); +} + +static int +zfs_root_setvnode(zfsvfs_t *zfsvfs) +{ + znode_t *rootzp; + int error; + + ZFS_ENTER(zfsvfs); + error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp); + if (error != 0) + panic("could not zfs_zget for root vnode"); + ZFS_EXIT(zfsvfs); + + rm_wlock(&zfsvfs->z_rootvnodelock); + if (zfsvfs->z_rootvnode != NULL) + panic("zfs mount point already has a root vnode: %p\n", + zfsvfs->z_rootvnode); + zfsvfs->z_rootvnode = ZTOV(rootzp); + rm_wunlock(&zfsvfs->z_rootvnodelock); + return (0); +} + +static void +zfs_root_putvnode(zfsvfs_t *zfsvfs) +{ + struct vnode *vp; + + rm_wlock(&zfsvfs->z_rootvnodelock); + vp = zfsvfs->z_rootvnode; + zfsvfs->z_rootvnode = NULL; + rm_wunlock(&zfsvfs->z_rootvnodelock); + if (vp != NULL) + vrele(vp); +} + +static int +zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp) +{ + struct rm_priotracker tracker; + zfsvfs_t *zfsvfs = vfsp->vfs_data; + znode_t *rootzp; + int error; + + rm_rlock(&zfsvfs->z_rootvnodelock, &tracker); + *vpp = zfsvfs->z_rootvnode; + if (*vpp != NULL && (((*vpp)->v_iflag & VI_DOOMED) == 0)) { + vrefact(*vpp); + rm_runlock(&zfsvfs->z_rootvnodelock, &tracker); + goto lock; + } + rm_runlock(&zfsvfs->z_rootvnodelock, &tracker); + + /* + * We found the vnode but did not like it. + */ + if (*vpp != NULL) { + *vpp = NULL; + zfs_root_putvnode(zfsvfs); + } + + ZFS_ENTER(zfsvfs); + error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp); + if (error == 0) + *vpp = ZTOV(rootzp); + + ZFS_EXIT(zfsvfs); + + if (error == 0) { +lock: + error = vn_lock(*vpp, flags); + if (error != 0) { + VN_RELE(*vpp); + *vpp = NULL; + } + } + return (error); +} + +/* + * Teardown the zfsvfs::z_os. + * + * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock' + * and 'z_teardown_inactive_lock' held. + */ +static int +zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) +{ + znode_t *zp; + + rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); + + if (!unmounting) { + /* + * We purge the parent filesystem's vfsp as the parent + * filesystem and all of its snapshots have their vnode's + * v_vfsp set to the parent's filesystem's vfsp. Note, + * 'z_parent' is self referential for non-snapshots. 
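+ * (On FreeBSD the namecache is flushed as well, via the
+ * cache_purgevfs() call under FREEBSD_NAMECACHE below.)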
+ */ + (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0); +#ifdef FREEBSD_NAMECACHE + cache_purgevfs(zfsvfs->z_parent->z_vfs, true); +#endif + } + + /* + * Close the zil. NB: Can't close the zil while zfs_inactive + * threads are blocked as zil_close can call zfs_inactive. + */ + if (zfsvfs->z_log) { + zil_close(zfsvfs->z_log); + zfsvfs->z_log = NULL; + } + + rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER); + + /* + * If we are not unmounting (ie: online recv) and someone already + * unmounted this file system while we were doing the switcheroo, + * or a reopen of z_os failed then just bail out now. + */ + if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) { + rw_exit(&zfsvfs->z_teardown_inactive_lock); + rrm_exit(&zfsvfs->z_teardown_lock, FTAG); + return (SET_ERROR(EIO)); + } + + /* + * At this point there are no vops active, and any new vops will + * fail with EIO since we have z_teardown_lock for writer (only + * relavent for forced unmount). + * + * Release all holds on dbufs. + */ + mutex_enter(&zfsvfs->z_znodes_lock); + for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL; + zp = list_next(&zfsvfs->z_all_znodes, zp)) + if (zp->z_sa_hdl) { + ASSERT(ZTOV(zp)->v_count >= 0); + zfs_znode_dmu_fini(zp); + } + mutex_exit(&zfsvfs->z_znodes_lock); + + /* + * If we are unmounting, set the unmounted flag and let new vops + * unblock. zfs_inactive will have the unmounted behavior, and all + * other vops will fail with EIO. + */ + if (unmounting) { + zfsvfs->z_unmounted = B_TRUE; + rw_exit(&zfsvfs->z_teardown_inactive_lock); + rrm_exit(&zfsvfs->z_teardown_lock, FTAG); + } + + /* + * z_os will be NULL if there was an error in attempting to reopen + * zfsvfs, so just return as the properties had already been + * unregistered and cached data had been evicted before. + */ + if (zfsvfs->z_os == NULL) + return (0); + + /* + * Unregister properties. + */ + zfs_unregister_callbacks(zfsvfs); + + /* + * Evict cached data + */ + if (dsl_dataset_is_dirty(dmu_objset_ds(zfsvfs->z_os)) && + !(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY)) + txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); + dmu_objset_evict_dbufs(zfsvfs->z_os); + + return (0); +} + +/*ARGSUSED*/ +static int +zfs_umount(vfs_t *vfsp, int fflag) +{ + kthread_t *td = curthread; + zfsvfs_t *zfsvfs = vfsp->vfs_data; + objset_t *os; + cred_t *cr = td->td_ucred; + int ret; + + zfs_root_putvnode(zfsvfs); + + ret = secpolicy_fs_unmount(cr, vfsp); + if (ret) { + if (dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource), + ZFS_DELEG_PERM_MOUNT, cr)) + return (ret); + } + + /* + * We purge the parent filesystem's vfsp as the parent filesystem + * and all of its snapshots have their vnode's v_vfsp set to the + * parent's filesystem's vfsp. Note, 'z_parent' is self + * referential for non-snapshots. + */ + (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0); + + /* + * Unmount any snapshots mounted under .zfs before unmounting the + * dataset itself. + */ + if (zfsvfs->z_ctldir != NULL) { + if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0) + return (ret); + } + + if (fflag & MS_FORCE) { + /* + * Mark file system as unmounted before calling + * vflush(FORCECLOSE). This way we ensure no future vnops + * will be called and risk operating on DOOMED vnodes. + */ + rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); + zfsvfs->z_unmounted = B_TRUE; + rrm_exit(&zfsvfs->z_teardown_lock, FTAG); + } + + /* + * Flush all the files. + */ + ret = vflush(vfsp, 0, (fflag & MS_FORCE) ? 
FORCECLOSE : 0, td); + if (ret != 0) + return (ret); + +#ifdef illumos + if (!(fflag & MS_FORCE)) { + /* + * Check the number of active vnodes in the file system. + * Our count is maintained in the vfs structure, but the + * number is off by 1 to indicate a hold on the vfs + * structure itself. + * + * The '.zfs' directory maintains a reference of its + * own, and any active references underneath are + * reflected in the vnode count. + */ + if (zfsvfs->z_ctldir == NULL) { + if (vfsp->vfs_count > 1) + return (SET_ERROR(EBUSY)); + } else { + if (vfsp->vfs_count > 2 || + zfsvfs->z_ctldir->v_count > 1) + return (SET_ERROR(EBUSY)); + } + } +#endif + + while (taskqueue_cancel(zfsvfs_taskq->tq_queue, + &zfsvfs->z_unlinked_drain_task, NULL) != 0) + taskqueue_drain(zfsvfs_taskq->tq_queue, + &zfsvfs->z_unlinked_drain_task); + + VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0); + os = zfsvfs->z_os; + + /* + * z_os will be NULL if there was an error in + * attempting to reopen zfsvfs. + */ + if (os != NULL) { + /* + * Unset the objset user_ptr. + */ + mutex_enter(&os->os_user_ptr_lock); + dmu_objset_set_user(os, NULL); + mutex_exit(&os->os_user_ptr_lock); + + /* + * Finally release the objset + */ + dmu_objset_disown(os, zfsvfs); + } + + /* + * We can now safely destroy the '.zfs' directory node. + */ + if (zfsvfs->z_ctldir != NULL) + zfsctl_destroy(zfsvfs); + zfs_freevfs(vfsp); + + return (0); +} + +static int +zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp) +{ + zfsvfs_t *zfsvfs = vfsp->vfs_data; + znode_t *zp; + int err; + + /* + * zfs_zget() can't operate on virtual entries like .zfs/ or + * .zfs/snapshot/ directories, that's why we return EOPNOTSUPP. + * This will make NFS to switch to LOOKUP instead of using VGET. + */ + if (ino == ZFSCTL_INO_ROOT || ino == ZFSCTL_INO_SNAPDIR || + (zfsvfs->z_shares_dir != 0 && ino == zfsvfs->z_shares_dir)) + return (EOPNOTSUPP); + + ZFS_ENTER(zfsvfs); + err = zfs_zget(zfsvfs, ino, &zp); + if (err == 0 && zp->z_unlinked) { + vrele(ZTOV(zp)); + err = EINVAL; + } + if (err == 0) + *vpp = ZTOV(zp); + ZFS_EXIT(zfsvfs); + if (err == 0) + err = vn_lock(*vpp, flags); + if (err != 0) + *vpp = NULL; + return (err); +} + +static int +zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp, + struct ucred **credanonp, int *numsecflavors, int **secflavors) +{ + zfsvfs_t *zfsvfs = vfsp->vfs_data; + + /* + * If this is regular file system vfsp is the same as + * zfsvfs->z_parent->z_vfs, but if it is snapshot, + * zfsvfs->z_parent->z_vfs represents parent file system + * which we have to use here, because only this file system + * has mnt_export configured. + */ + return (vfs_stdcheckexp(zfsvfs->z_parent->z_vfs, nam, extflagsp, + credanonp, numsecflavors, secflavors)); +} + +CTASSERT(SHORT_FID_LEN <= sizeof(struct fid)); +CTASSERT(LONG_FID_LEN <= sizeof(struct fid)); + +static int +zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp) +{ + struct componentname cn; + zfsvfs_t *zfsvfs = vfsp->vfs_data; + znode_t *zp; + vnode_t *dvp; + uint64_t object = 0; + uint64_t fid_gen = 0; + uint64_t gen_mask; + uint64_t zp_gen; + int i, err; + + *vpp = NULL; + + ZFS_ENTER(zfsvfs); + + /* + * On FreeBSD we can get snapshot's mount point or its parent file + * system mount point depending if snapshot is already mounted or not. 
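+ * A sketch of the long-fid decode that follows: the objset id and
+ * its generation travel as little-endian byte arrays inside the
+ * fid and are reassembled one byte at a time, e.g.
+ *
+ *	objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);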
+ */ + if (zfsvfs->z_parent == zfsvfs && fidp->fid_len == LONG_FID_LEN) { + zfid_long_t *zlfid = (zfid_long_t *)fidp; + uint64_t objsetid = 0; + uint64_t setgen = 0; + + for (i = 0; i < sizeof (zlfid->zf_setid); i++) + objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i); + + for (i = 0; i < sizeof (zlfid->zf_setgen); i++) + setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i); + + ZFS_EXIT(zfsvfs); + + err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs); + if (err) + return (SET_ERROR(EINVAL)); + ZFS_ENTER(zfsvfs); + } + + if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) { + zfid_short_t *zfid = (zfid_short_t *)fidp; + + for (i = 0; i < sizeof (zfid->zf_object); i++) + object |= ((uint64_t)zfid->zf_object[i]) << (8 * i); + + for (i = 0; i < sizeof (zfid->zf_gen); i++) + fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i); + } else { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + /* + * A zero fid_gen means we are in .zfs or the .zfs/snapshot + * directory tree. If the object == zfsvfs->z_shares_dir, then + * we are in the .zfs/shares directory tree. + */ + if ((fid_gen == 0 && + (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) || + (zfsvfs->z_shares_dir != 0 && object == zfsvfs->z_shares_dir)) { + ZFS_EXIT(zfsvfs); + VERIFY0(zfsctl_root(zfsvfs, LK_SHARED, &dvp)); + if (object == ZFSCTL_INO_SNAPDIR) { + cn.cn_nameptr = "snapshot"; + cn.cn_namelen = strlen(cn.cn_nameptr); + cn.cn_nameiop = LOOKUP; + cn.cn_flags = ISLASTCN | LOCKLEAF; + cn.cn_lkflags = flags; + VERIFY0(VOP_LOOKUP(dvp, vpp, &cn)); + vput(dvp); + } else if (object == zfsvfs->z_shares_dir) { + /* + * XXX This branch must not be taken, + * if it is, then the lookup below will + * explode. + */ + cn.cn_nameptr = "shares"; + cn.cn_namelen = strlen(cn.cn_nameptr); + cn.cn_nameiop = LOOKUP; + cn.cn_flags = ISLASTCN; + cn.cn_lkflags = flags; + VERIFY0(VOP_LOOKUP(dvp, vpp, &cn)); + vput(dvp); + } else { + *vpp = dvp; + } + return (err); + } + + gen_mask = -1ULL >> (64 - 8 * i); + + dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask); + if (err = zfs_zget(zfsvfs, object, &zp)) { + ZFS_EXIT(zfsvfs); + return (err); + } + (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen, + sizeof (uint64_t)); + zp_gen = zp_gen & gen_mask; + if (zp_gen == 0) + zp_gen = 1; + if (zp->z_unlinked || zp_gen != fid_gen) { + dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen); + vrele(ZTOV(zp)); + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + *vpp = ZTOV(zp); + ZFS_EXIT(zfsvfs); + err = vn_lock(*vpp, flags); + if (err == 0) + vnode_create_vobject(*vpp, zp->z_size, curthread); + else + *vpp = NULL; + return (err); +} + +/* + * Block out VOPs and close zfsvfs_t::z_os + * + * Note, if successful, then we return with the 'z_teardown_lock' and + * 'z_teardown_inactive_lock' write held. We leave ownership of the underlying + * dataset and objset intact so that they can be atomically handed off during + * a subsequent rollback or recv operation and the resume thereafter. + */ +int +zfs_suspend_fs(zfsvfs_t *zfsvfs) +{ + int error; + + if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0) + return (error); + + return (0); +} + +/* + * Rebuild SA and release VOPs. Note that ownership of the underlying dataset + * is an invariant across any of the operations that can be performed while the + * filesystem was suspended. Whether it succeeded or failed, the preconditions + * are the same: the relevant objset and associated dataset are owned by + * zfsvfs, held, and long held on entry. 
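+ * (The VERIFY3P(ds->ds_owner, ==, zfsvfs) and
+ * VERIFY(dsl_dataset_long_held(ds)) calls at the top of the
+ * function assert exactly these preconditions.)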
+ */ +int +zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) +{ + int err; + znode_t *zp; + + ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock)); + ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)); + + /* + * We already own this, so just update the objset_t, as the one we + * had before may have been evicted. + */ + objset_t *os; + VERIFY3P(ds->ds_owner, ==, zfsvfs); + VERIFY(dsl_dataset_long_held(ds)); + VERIFY0(dmu_objset_from_ds(ds, &os)); + + err = zfsvfs_init(zfsvfs, os); + if (err != 0) + goto bail; + + VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0); + + zfs_set_fuid_feature(zfsvfs); + + /* + * Attempt to re-establish all the active znodes with + * their dbufs. If a zfs_rezget() fails, then we'll let + * any potential callers discover that via ZFS_ENTER_VERIFY_VP + * when they try to use their znode. + */ + mutex_enter(&zfsvfs->z_znodes_lock); + for (zp = list_head(&zfsvfs->z_all_znodes); zp; + zp = list_next(&zfsvfs->z_all_znodes, zp)) { + (void) zfs_rezget(zp); + } + mutex_exit(&zfsvfs->z_znodes_lock); + +bail: + /* release the VOPs */ + rw_exit(&zfsvfs->z_teardown_inactive_lock); + rrm_exit(&zfsvfs->z_teardown_lock, FTAG); + + if (err) { + /* + * Since we couldn't setup the sa framework, try to force + * unmount this file system. + */ + if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0) { + vfs_ref(zfsvfs->z_vfs); + (void) dounmount(zfsvfs->z_vfs, MS_FORCE, curthread); + } + } + return (err); +} + +static void +zfs_freevfs(vfs_t *vfsp) +{ + zfsvfs_t *zfsvfs = vfsp->vfs_data; + +#ifdef illumos + /* + * If this is a snapshot, we have an extra VFS_HOLD on our parent + * from zfs_mount(). Release it here. If we came through + * zfs_mountroot() instead, we didn't grab an extra hold, so + * skip the VFS_RELE for rootvfs. + */ + if (zfsvfs->z_issnap && (vfsp != rootvfs)) + VFS_RELE(zfsvfs->z_parent->z_vfs); +#endif + + zfsvfs_free(zfsvfs); + + atomic_dec_32(&zfs_active_fs_count); +} + +#ifdef __i386__ +static int desiredvnodes_backup; +#endif + +static void +zfs_vnodes_adjust(void) +{ +#ifdef __i386__ + int newdesiredvnodes; + + desiredvnodes_backup = desiredvnodes; + + /* + * We calculate newdesiredvnodes the same way it is done in + * vntblinit(). If it is equal to desiredvnodes, it means that + * it wasn't tuned by the administrator and we can tune it down. + */ + newdesiredvnodes = min(maxproc + vm_cnt.v_page_count / 4, 2 * + vm_kmem_size / (5 * (sizeof(struct vm_object) + + sizeof(struct vnode)))); + if (newdesiredvnodes == desiredvnodes) + desiredvnodes = (3 * newdesiredvnodes) / 4; +#endif +} + +static void +zfs_vnodes_adjust_back(void) +{ + +#ifdef __i386__ + desiredvnodes = desiredvnodes_backup; +#endif +} + +void +zfs_init(void) +{ + + printf("ZFS filesystem version: " ZPL_VERSION_STRING "\n"); + + /* + * Initialize .zfs directory structures + */ + zfsctl_init(); + + /* + * Initialize znode cache, vnode ops, etc... + */ + zfs_znode_init(); + + /* + * Reduce number of vnodes. Originally number of vnodes is calculated + * with UFS inode in mind. We reduce it here, because it's too big for + * ZFS/i386. 
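+ * (See zfs_vnodes_adjust() above: on i386, if desiredvnodes still
+ * matches the vntblinit() autotune, it is scaled down to 3/4 of
+ * that value; zfs_vnodes_adjust_back() restores it in zfs_fini().)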
+ */ + zfs_vnodes_adjust(); + + dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb); +#if defined(__FreeBSD__) + zfsvfs_taskq = taskq_create("zfsvfs", 1, minclsyspri, 0, 0, 0); +#endif +} + +void +zfs_fini(void) +{ +#if defined(__FreeBSD__) + taskq_destroy(zfsvfs_taskq); +#endif + zfsctl_fini(); + zfs_znode_fini(); + zfs_vnodes_adjust_back(); +} + +int +zfs_busy(void) +{ + return (zfs_active_fs_count != 0); +} + +int +zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers) +{ + int error; + objset_t *os = zfsvfs->z_os; + dmu_tx_t *tx; + + if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION) + return (SET_ERROR(EINVAL)); + + if (newvers < zfsvfs->z_version) + return (SET_ERROR(EINVAL)); + + if (zfs_spa_version_map(newvers) > + spa_version(dmu_objset_spa(zfsvfs->z_os))) + return (SET_ERROR(ENOTSUP)); + + tx = dmu_tx_create(os); + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR); + if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE, + ZFS_SA_ATTRS); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); + } + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + return (error); + } + + error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, + 8, 1, &newvers, tx); + + if (error) { + dmu_tx_commit(tx); + return (error); + } + + if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { + uint64_t sa_obj; + + ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=, + SPA_VERSION_SA); + sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE, + DMU_OT_NONE, 0, tx); + + error = zap_add(os, MASTER_NODE_OBJ, + ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); + ASSERT0(error); + + VERIFY(0 == sa_set_sa_object(os, sa_obj)); + sa_register_update_callback(os, zfs_sa_upgrade); + } + + spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx, + "from %llu to %llu", zfsvfs->z_version, newvers); + + dmu_tx_commit(tx); + + zfsvfs->z_version = newvers; + os->os_version = newvers; + + zfs_set_fuid_feature(zfsvfs); + + return (0); +} + +/* + * Read a property stored within the master node. + */ +int +zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) +{ + uint64_t *cached_copy = NULL; + + /* + * Figure out where in the objset_t the cached copy would live, if it + * is available for the requested property. + */ + if (os != NULL) { + switch (prop) { + case ZFS_PROP_VERSION: + cached_copy = &os->os_version; + break; + case ZFS_PROP_NORMALIZE: + cached_copy = &os->os_normalization; + break; + case ZFS_PROP_UTF8ONLY: + cached_copy = &os->os_utf8only; + break; + case ZFS_PROP_CASE: + cached_copy = &os->os_casesensitivity; + break; + default: + break; + } + } + if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) { + *value = *cached_copy; + return (0); + } + + /* + * If the property wasn't cached, look up the file system's value for + * the property. For the version property, we look up a slightly + * different string. 
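+ * (The version property lives under the ZPL_VERSION_STR key rather
+ * than the generic zfs_prop_to_name() string; if the ZAP lookup
+ * below returns ENOENT, a compiled-in default such as ZPL_VERSION
+ * or ZFS_CASE_SENSITIVE is substituted instead.)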
+ */ + const char *pname; + int error = ENOENT; + if (prop == ZFS_PROP_VERSION) { + pname = ZPL_VERSION_STR; + } else { + pname = zfs_prop_to_name(prop); + } + + if (os != NULL) { + ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS); + error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value); + } + + if (error == ENOENT) { + /* No value set, use the default value */ + switch (prop) { + case ZFS_PROP_VERSION: + *value = ZPL_VERSION; + break; + case ZFS_PROP_NORMALIZE: + case ZFS_PROP_UTF8ONLY: + *value = 0; + break; + case ZFS_PROP_CASE: + *value = ZFS_CASE_SENSITIVE; + break; + default: + return (error); + } + error = 0; + } + + /* + * If one of the methods for getting the property value above worked, + * copy it into the objset_t's cache. + */ + if (error == 0 && cached_copy != NULL) { + *cached_copy = *value; + } + + return (error); +} + +/* + * Return true if the coresponding vfs's unmounted flag is set. + * Otherwise return false. + * If this function returns true we know VFS unmount has been initiated. + */ +boolean_t +zfs_get_vfs_flag_unmounted(objset_t *os) +{ + zfsvfs_t *zfvp; + boolean_t unmounted = B_FALSE; + + ASSERT(dmu_objset_type(os) == DMU_OST_ZFS); + + mutex_enter(&os->os_user_ptr_lock); + zfvp = dmu_objset_get_user(os); + if (zfvp != NULL && zfvp->z_vfs != NULL && + (zfvp->z_vfs->mnt_kern_flag & MNTK_UNMOUNT)) + unmounted = B_TRUE; + mutex_exit(&os->os_user_ptr_lock); + + return (unmounted); +} + +#ifdef _KERNEL +void +zfsvfs_update_fromname(const char *oldname, const char *newname) +{ + char tmpbuf[MAXPATHLEN]; + struct mount *mp; + char *fromname; + size_t oldlen; + + oldlen = strlen(oldname); + + mtx_lock(&mountlist_mtx); + TAILQ_FOREACH(mp, &mountlist, mnt_list) { + fromname = mp->mnt_stat.f_mntfromname; + if (strcmp(fromname, oldname) == 0) { + (void)strlcpy(fromname, newname, + sizeof(mp->mnt_stat.f_mntfromname)); + continue; + } + if (strncmp(fromname, oldname, oldlen) == 0 && + (fromname[oldlen] == '/' || fromname[oldlen] == '@')) { + (void)snprintf(tmpbuf, sizeof(tmpbuf), "%s%s", + newname, fromname + oldlen); + (void)strlcpy(fromname, tmpbuf, + sizeof(mp->mnt_stat.f_mntfromname)); + continue; + } + } + mtx_unlock(&mountlist_mtx); +} +#endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c new file mode 100644 index 000000000000..7de3c8d1f98f --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c @@ -0,0 +1,6076 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. 
+ * Copyright (c) 2014 Integros [integros.com] + * Copyright 2017 Nexenta Systems, Inc. + */ + +/* Portions Copyright 2007 Jeremy Teo */ +/* Portions Copyright 2010 Robert Milkowski */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/time.h> +#include <sys/systm.h> +#include <sys/sysmacros.h> +#include <sys/resource.h> +#include <sys/vfs.h> +#include <sys/vm.h> +#include <sys/vnode.h> +#include <sys/file.h> +#include <sys/stat.h> +#include <sys/kmem.h> +#include <sys/taskq.h> +#include <sys/uio.h> +#include <sys/atomic.h> +#include <sys/namei.h> +#include <sys/mman.h> +#include <sys/cmn_err.h> +#include <sys/errno.h> +#include <sys/unistd.h> +#include <sys/zfs_dir.h> +#include <sys/zfs_ioctl.h> +#include <sys/fs/zfs.h> +#include <sys/dmu.h> +#include <sys/dmu_objset.h> +#include <sys/spa.h> +#include <sys/txg.h> +#include <sys/dbuf.h> +#include <sys/zap.h> +#include <sys/sa.h> +#include <sys/dirent.h> +#include <sys/policy.h> +#include <sys/sunddi.h> +#include <sys/filio.h> +#include <sys/sid.h> +#include <sys/zfs_ctldir.h> +#include <sys/zfs_fuid.h> +#include <sys/zfs_sa.h> +#include <sys/zfs_rlock.h> +#include <sys/extdirent.h> +#include <sys/kidmap.h> +#include <sys/bio.h> +#include <sys/buf.h> +#include <sys/sched.h> +#include <sys/acl.h> +#include <sys/vmmeter.h> +#include <vm/vm_param.h> +#include <sys/zil.h> + +/* + * Programming rules. + * + * Each vnode op performs some logical unit of work. To do this, the ZPL must + * properly lock its in-core state, create a DMU transaction, do the work, + * record this work in the intent log (ZIL), commit the DMU transaction, + * and wait for the intent log to commit if it is a synchronous operation. + * Moreover, the vnode ops must work in both normal and log replay context. + * The ordering of events is important to avoid deadlocks and references + * to freed memory. The example below illustrates the following Big Rules: + * + * (1) A check must be made in each zfs thread for a mounted file system. + * This is done avoiding races using ZFS_ENTER(zfsvfs). + * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes + * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros + * can return EIO from the calling function. + * + * (2) VN_RELE() should always be the last thing except for zil_commit() + * (if necessary) and ZFS_EXIT(). This is for 3 reasons: + * First, if it's the last reference, the vnode/znode + * can be freed, so the zp may point to freed memory. Second, the last + * reference will call zfs_zinactive(), which may induce a lot of work -- + * pushing cached pages (which acquires range locks) and syncing out + * cached atime changes. Third, zfs_zinactive() may require a new tx, + * which could deadlock the system if you were already holding one. + * If you must call VN_RELE() within a tx then use VN_RELE_ASYNC(). + * + * (3) All range locks must be grabbed before calling dmu_tx_assign(), + * as they can span dmu_tx_assign() calls. + * + * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to + * dmu_tx_assign(). This is critical because we don't want to block + * while holding locks. + * + * If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. This + * reduces lock contention and CPU usage when we must wait (note that if + * throughput is constrained by the storage, nearly every transaction + * must wait). + * + * Note, in particular, that if a lock is sometimes acquired before + * the tx assigns, and sometimes after (e.g. 
z_lock), then failing + * to use a non-blocking assign can deadlock the system. The scenario: + * + * Thread A has grabbed a lock before calling dmu_tx_assign(). + * Thread B is in an already-assigned tx, and blocks for this lock. + * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() + * forever, because the previous txg can't quiesce until B's tx commits. + * + * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, + * then drop all locks, call dmu_tx_wait(), and try again. On subsequent + * calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT, + * to indicate that this operation has already called dmu_tx_wait(). + * This will ensure that we don't retry forever, waiting a short bit + * each time. + * + * (5) If the operation succeeded, generate the intent log entry for it + * before dropping locks. This ensures that the ordering of events + * in the intent log matches the order in which they actually occurred. + * During ZIL replay the zfs_log_* functions will update the sequence + * number to indicate the zil transaction has replayed. + * + * (6) At the end of each vnode op, the DMU tx must always commit, + * regardless of whether there were any errors. + * + * (7) After dropping all locks, invoke zil_commit(zilog, foid) + * to ensure that synchronous semantics are provided when necessary. + * + * In general, this is how things should be ordered in each vnode op: + * + * ZFS_ENTER(zfsvfs); // exit if unmounted + * top: + * zfs_dirent_lookup(&dl, ...) // lock directory entry (may VN_HOLD()) + * rw_enter(...); // grab any other locks you need + * tx = dmu_tx_create(...); // get DMU tx + * dmu_tx_hold_*(); // hold each object you might modify + * error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); + * if (error) { + * rw_exit(...); // drop locks + * zfs_dirent_unlock(dl); // unlock directory entry + * VN_RELE(...); // release held vnodes + * if (error == ERESTART) { + * waited = B_TRUE; + * dmu_tx_wait(tx); + * dmu_tx_abort(tx); + * goto top; + * } + * dmu_tx_abort(tx); // abort DMU tx + * ZFS_EXIT(zfsvfs); // finished in zfs + * return (error); // really out of space + * } + * error = do_real_work(); // do whatever this VOP does + * if (error == 0) + * zfs_log_*(...); // on success, make ZIL entry + * dmu_tx_commit(tx); // commit DMU tx -- error or not + * rw_exit(...); // drop locks + * zfs_dirent_unlock(dl); // unlock directory entry + * VN_RELE(...); // release held vnodes + * zil_commit(zilog, foid); // synchronous when necessary + * ZFS_EXIT(zfsvfs); // finished in zfs + * return (error); // done, report error + */ + +/* ARGSUSED */ +static int +zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) +{ + znode_t *zp = VTOZ(*vpp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) && + ((flag & FAPPEND) == 0)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EPERM)); + } + + if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && + ZTOV(zp)->v_type == VREG && + !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) { + if (fs_vscan(*vpp, cr, 0) != 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EACCES)); + } + } + + /* Keep a count of the synchronous opens in the znode */ + if (flag & (FSYNC | FDSYNC)) + atomic_inc_32(&zp->z_sync_cnt); + + ZFS_EXIT(zfsvfs); + return (0); +} + +/* ARGSUSED */ +static int +zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, + caller_context_t *ct) +{ + znode_t *zp = 
VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + + /* + * Clean up any locks held by this process on the vp. + */ + cleanlocks(vp, ddi_get_pid(), 0); + cleanshares(vp, ddi_get_pid()); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + /* Decrement the synchronous opens in the znode */ + if ((flag & (FSYNC | FDSYNC)) && (count == 1)) + atomic_dec_32(&zp->z_sync_cnt); + + if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && + ZTOV(zp)->v_type == VREG && + !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) + VERIFY(fs_vscan(vp, cr, 1) == 0); + + ZFS_EXIT(zfsvfs); + return (0); +} + +/* + * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and + * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter. + */ +static int +zfs_holey(vnode_t *vp, u_long cmd, offset_t *off) +{ + znode_t *zp = VTOZ(vp); + uint64_t noff = (uint64_t)*off; /* new offset */ + uint64_t file_sz; + int error; + boolean_t hole; + + file_sz = zp->z_size; + if (noff >= file_sz) { + return (SET_ERROR(ENXIO)); + } + + if (cmd == _FIO_SEEK_HOLE) + hole = B_TRUE; + else + hole = B_FALSE; + + error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff); + + if (error == ESRCH) + return (SET_ERROR(ENXIO)); + + /* + * We could find a hole that begins after the logical end-of-file, + * because dmu_offset_next() only works on whole blocks. If the + * EOF falls mid-block, then indicate that the "virtual hole" + * at the end of the file begins at the logical EOF, rather than + * at the end of the last block. + */ + if (noff > file_sz) { + ASSERT(hole); + noff = file_sz; + } + + if (noff < *off) + return (error); + *off = noff; + return (error); +} + +/* ARGSUSED */ +static int +zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred, + int *rvalp, caller_context_t *ct) +{ + offset_t off; + offset_t ndata; + dmu_object_info_t doi; + int error; + zfsvfs_t *zfsvfs; + znode_t *zp; + + switch (com) { + case _FIOFFS: + { + return (0); + + /* + * The following two ioctls are used by bfu. Faking out, + * necessary to avoid bfu errors. + */ + } + case _FIOGDIO: + case _FIOSDIO: + { + return (0); + } + + case _FIO_SEEK_DATA: + case _FIO_SEEK_HOLE: + { +#ifdef illumos + if (ddi_copyin((void *)data, &off, sizeof (off), flag)) + return (SET_ERROR(EFAULT)); +#else + off = *(offset_t *)data; +#endif + zp = VTOZ(vp); + zfsvfs = zp->z_zfsvfs; + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + /* offset parameter is in/out */ + error = zfs_holey(vp, com, &off); + ZFS_EXIT(zfsvfs); + if (error) + return (error); +#ifdef illumos + if (ddi_copyout(&off, (void *)data, sizeof (off), flag)) + return (SET_ERROR(EFAULT)); +#else + *(offset_t *)data = off; +#endif + return (0); + } +#ifdef illumos + case _FIO_COUNT_FILLED: + { + /* + * _FIO_COUNT_FILLED adds a new ioctl command which + * exposes the number of filled blocks in a + * ZFS object. + */ + zp = VTOZ(vp); + zfsvfs = zp->z_zfsvfs; + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + /* + * Wait for all dirty blocks for this object + * to get synced out to disk, and the DMU info + * updated. + */ + error = dmu_object_wait_synced(zfsvfs->z_os, zp->z_id); + if (error) { + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * Retrieve fill count from DMU object. 
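+ * (dmu_object_info() below fills in a dmu_object_info_t; its
+ * doi_fill_count field is what is copied out to the caller.)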
+		 */
+		error = dmu_object_info(zfsvfs->z_os, zp->z_id, &doi);
+		if (error) {
+			ZFS_EXIT(zfsvfs);
+			return (error);
+		}
+
+		ndata = doi.doi_fill_count;
+
+		ZFS_EXIT(zfsvfs);
+		if (ddi_copyout(&ndata, (void *)data, sizeof (ndata), flag))
+			return (SET_ERROR(EFAULT));
+		return (0);
+	}
+#endif
+	}
+	return (SET_ERROR(ENOTTY));
+}
+
+static vm_page_t
+page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes)
+{
+	vm_object_t obj;
+	vm_page_t pp;
+	int64_t end;
+
+	/*
+	 * At present vm_page_clear_dirty extends the cleared range to DEV_BSIZE
+	 * aligned boundaries, if the range is not aligned. As a result a
+	 * DEV_BSIZE subrange with partially dirty data may get marked as clean.
+	 * It may happen that all DEV_BSIZE subranges are marked clean and thus
+	 * the whole page would be considered clean despite having some dirty
+	 * data. For this reason we should shrink the range to DEV_BSIZE aligned
+	 * boundaries before calling vm_page_clear_dirty.
+	 */
+	end = rounddown2(off + nbytes, DEV_BSIZE);
+	off = roundup2(off, DEV_BSIZE);
+	nbytes = end - off;
+
+	obj = vp->v_object;
+	zfs_vmobject_assert_wlocked(obj);
+
+	for (;;) {
+		if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
+		    pp->valid) {
+			if (vm_page_xbusied(pp)) {
+				/*
+				 * Reference the page before unlocking and
+				 * sleeping so that the page daemon is less
+				 * likely to reclaim it.
+				 */
+				vm_page_reference(pp);
+				vm_page_lock(pp);
+				zfs_vmobject_wunlock(obj);
+				vm_page_busy_sleep(pp, "zfsmwb", true);
+				zfs_vmobject_wlock(obj);
+				continue;
+			}
+			vm_page_sbusy(pp);
+		} else if (pp != NULL) {
+			ASSERT(!pp->valid);
+			pp = NULL;
+		}
+
+		if (pp != NULL) {
+			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
+			vm_object_pip_add(obj, 1);
+			pmap_remove_write(pp);
+			if (nbytes != 0)
+				vm_page_clear_dirty(pp, off, nbytes);
+		}
+		break;
+	}
+	return (pp);
+}
+
+static void
+page_unbusy(vm_page_t pp)
+{
+
+	vm_page_sunbusy(pp);
+	vm_object_pip_subtract(pp->object, 1);
+}
+
+static vm_page_t
+page_hold(vnode_t *vp, int64_t start)
+{
+	vm_object_t obj;
+	vm_page_t pp;
+
+	obj = vp->v_object;
+	zfs_vmobject_assert_wlocked(obj);
+
+	for (;;) {
+		if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
+		    pp->valid) {
+			if (vm_page_xbusied(pp)) {
+				/*
+				 * Reference the page before unlocking and
+				 * sleeping so that the page daemon is less
+				 * likely to reclaim it.
+				 */
+				vm_page_reference(pp);
+				vm_page_lock(pp);
+				zfs_vmobject_wunlock(obj);
+				vm_page_busy_sleep(pp, "zfsmwb", true);
+				zfs_vmobject_wlock(obj);
+				continue;
+			}
+
+			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
+			vm_page_lock(pp);
+			vm_page_hold(pp);
+			vm_page_unlock(pp);
+
+		} else
+			pp = NULL;
+		break;
+	}
+	return (pp);
+}
+
+static void
+page_unhold(vm_page_t pp)
+{
+
+	vm_page_lock(pp);
+	vm_page_unhold(pp);
+	vm_page_unlock(pp);
+}
+
+/*
+ * When a file is memory mapped, we must keep the IO data synchronized
+ * between the DMU cache and the memory mapped pages. What this means:
+ *
+ * On Write:	If we find a memory mapped page, we write to *both*
+ *		the page and the dmu buffer.
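+ *
+ * Illustrative example (not part of the original comment): a write
+ * that lands on a resident mapped page first goes to the DMU via
+ * dmu_write_uio_dbuf(), and update_pages() below then copies the
+ * committed bytes back into the page, so loads through the mapping
+ * observe the new data.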
+ */
+static void
+update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid,
+    int segflg, dmu_tx_t *tx)
+{
+	vm_object_t obj;
+	struct sf_buf *sf;
+	caddr_t va;
+	int off;
+
+	ASSERT(segflg != UIO_NOCOPY);
+	ASSERT(vp->v_mount != NULL);
+	obj = vp->v_object;
+	ASSERT(obj != NULL);
+
+	off = start & PAGEOFFSET;
+	zfs_vmobject_wlock(obj);
+	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
+		vm_page_t pp;
+		int nbytes = imin(PAGESIZE - off, len);
+
+		if ((pp = page_busy(vp, start, off, nbytes)) != NULL) {
+			zfs_vmobject_wunlock(obj);
+
+			va = zfs_map_page(pp, &sf);
+			(void) dmu_read(os, oid, start + off, nbytes,
+			    va + off, DMU_READ_PREFETCH);
+			zfs_unmap_page(sf);
+
+			zfs_vmobject_wlock(obj);
+			page_unbusy(pp);
+		}
+		len -= nbytes;
+		off = 0;
+	}
+	vm_object_pip_wakeupn(obj, 0);
+	zfs_vmobject_wunlock(obj);
+}
+
+/*
+ * Read with UIO_NOCOPY flag means that sendfile(2) requests
+ * ZFS to populate a range of page cache pages with data.
+ *
+ * NOTE: this function could be optimized to pre-allocate
+ * all pages in advance, drain exclusive busy on all of them,
+ * map them into contiguous KVA region and populate them
+ * in one single dmu_read() call.
+ */
+static int
+mappedread_sf(vnode_t *vp, int nbytes, uio_t *uio)
+{
+	znode_t *zp = VTOZ(vp);
+	objset_t *os = zp->z_zfsvfs->z_os;
+	struct sf_buf *sf;
+	vm_object_t obj;
+	vm_page_t pp;
+	int64_t start;
+	caddr_t va;
+	int len = nbytes;
+	int off;
+	int error = 0;
+
+	ASSERT(uio->uio_segflg == UIO_NOCOPY);
+	ASSERT(vp->v_mount != NULL);
+	obj = vp->v_object;
+	ASSERT(obj != NULL);
+	ASSERT((uio->uio_loffset & PAGEOFFSET) == 0);
+
+	zfs_vmobject_wlock(obj);
+	for (start = uio->uio_loffset; len > 0; start += PAGESIZE) {
+		int bytes = MIN(PAGESIZE, len);
+
+		pp = vm_page_grab(obj, OFF_TO_IDX(start), VM_ALLOC_SBUSY |
+		    VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY);
+		if (pp->valid == 0) {
+			zfs_vmobject_wunlock(obj);
+			va = zfs_map_page(pp, &sf);
+			error = dmu_read(os, zp->z_id, start, bytes, va,
+			    DMU_READ_PREFETCH);
+			if (bytes != PAGESIZE && error == 0)
+				bzero(va + bytes, PAGESIZE - bytes);
+			zfs_unmap_page(sf);
+			zfs_vmobject_wlock(obj);
+			vm_page_sunbusy(pp);
+			vm_page_lock(pp);
+			if (error) {
+				if (pp->wire_count == 0 && pp->valid == 0 &&
+				    !vm_page_busied(pp))
+					vm_page_free(pp);
+			} else {
+				pp->valid = VM_PAGE_BITS_ALL;
+				vm_page_activate(pp);
+			}
+			vm_page_unlock(pp);
+		} else {
+			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
+			vm_page_sunbusy(pp);
+		}
+		if (error)
+			break;
+		uio->uio_resid -= bytes;
+		uio->uio_offset += bytes;
+		len -= bytes;
+	}
+	zfs_vmobject_wunlock(obj);
+	return (error);
+}
+
+/*
+ * When a file is memory mapped, we must keep the IO data synchronized
+ * between the DMU cache and the memory mapped pages. What this means:
+ *
+ * On Read:	We "read" preferentially from memory mapped pages,
+ *		else we fall back to the dmu buffer.
+ *
+ * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
+ *	 the file is memory mapped.
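+ *
+ * Illustrative example: on a machine with 4K pages, a 64K read of a
+ * mapped file is carried out as sixteen page-sized segments; each
+ * segment is copied out of the page cache via page_hold()/uiomove()
+ * when the page is resident, and read with dmu_read_uio_dbuf()
+ * otherwise.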
+ */ +static int +mappedread(vnode_t *vp, int nbytes, uio_t *uio) +{ + znode_t *zp = VTOZ(vp); + vm_object_t obj; + int64_t start; + caddr_t va; + int len = nbytes; + int off; + int error = 0; + + ASSERT(vp->v_mount != NULL); + obj = vp->v_object; + ASSERT(obj != NULL); + + start = uio->uio_loffset; + off = start & PAGEOFFSET; + zfs_vmobject_wlock(obj); + for (start &= PAGEMASK; len > 0; start += PAGESIZE) { + vm_page_t pp; + uint64_t bytes = MIN(PAGESIZE - off, len); + + if (pp = page_hold(vp, start)) { + struct sf_buf *sf; + caddr_t va; + + zfs_vmobject_wunlock(obj); + va = zfs_map_page(pp, &sf); +#ifdef illumos + error = uiomove(va + off, bytes, UIO_READ, uio); +#else + error = vn_io_fault_uiomove(va + off, bytes, uio); +#endif + zfs_unmap_page(sf); + zfs_vmobject_wlock(obj); + page_unhold(pp); + } else { + zfs_vmobject_wunlock(obj); + error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), + uio, bytes); + zfs_vmobject_wlock(obj); + } + len -= bytes; + off = 0; + if (error) + break; + } + zfs_vmobject_wunlock(obj); + return (error); +} + +offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */ + +/* + * Read bytes from specified file into supplied buffer. + * + * IN: vp - vnode of file to be read from. + * uio - structure supplying read location, range info, + * and return buffer. + * ioflag - SYNC flags; used to provide FRSYNC semantics. + * cr - credentials of caller. + * ct - caller context + * + * OUT: uio - updated offset and range, buffer filled. + * + * RETURN: 0 on success, error code on failure. + * + * Side Effects: + * vp - atime updated if byte count > 0 + */ +/* ARGSUSED */ +static int +zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + ssize_t n, nbytes; + int error = 0; + rl_t *rl; + xuio_t *xuio = NULL; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + if (zp->z_pflags & ZFS_AV_QUARANTINED) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EACCES)); + } + + /* + * Validate file offset + */ + if (uio->uio_loffset < (offset_t)0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + /* + * Fasttrack empty reads + */ + if (uio->uio_resid == 0) { + ZFS_EXIT(zfsvfs); + return (0); + } + + /* + * Check for mandatory locks + */ + if (MANDMODE(zp->z_mode)) { + if (error = chklock(vp, FREAD, + uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) { + ZFS_EXIT(zfsvfs); + return (error); + } + } + + /* + * If we're in FRSYNC mode, sync out this znode before reading it. + */ + if (zfsvfs->z_log && + (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)) + zil_commit(zfsvfs->z_log, zp->z_id); + + /* + * Lock the range against changes. + */ + rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER); + + /* + * If we are reading past end-of-file we can skip + * to the end; but we might still need to set atime. 
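+	 * For example, a read(2) at an offset at or beyond z_size returns
+	 * 0 bytes (EOF) rather than an error, but still reaches the
+	 * ZFS_ACCESSTIME_STAMP() call below.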
+ */ + if (uio->uio_loffset >= zp->z_size) { + error = 0; + goto out; + } + + ASSERT(uio->uio_loffset < zp->z_size); + n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset); + +#ifdef illumos + if ((uio->uio_extflg == UIO_XUIO) && + (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) { + int nblk; + int blksz = zp->z_blksz; + uint64_t offset = uio->uio_loffset; + + xuio = (xuio_t *)uio; + if ((ISP2(blksz))) { + nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset, + blksz)) / blksz; + } else { + ASSERT(offset + n <= blksz); + nblk = 1; + } + (void) dmu_xuio_init(xuio, nblk); + + if (vn_has_cached_data(vp)) { + /* + * For simplicity, we always allocate a full buffer + * even if we only expect to read a portion of a block. + */ + while (--nblk >= 0) { + (void) dmu_xuio_add(xuio, + dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), + blksz), 0, blksz); + } + } + } +#endif /* illumos */ + + while (n > 0) { + nbytes = MIN(n, zfs_read_chunk_size - + P2PHASE(uio->uio_loffset, zfs_read_chunk_size)); + +#ifdef __FreeBSD__ + if (uio->uio_segflg == UIO_NOCOPY) + error = mappedread_sf(vp, nbytes, uio); + else +#endif /* __FreeBSD__ */ + if (vn_has_cached_data(vp)) { + error = mappedread(vp, nbytes, uio); + } else { + error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), + uio, nbytes); + } + if (error) { + /* convert checksum errors into IO errors */ + if (error == ECKSUM) + error = SET_ERROR(EIO); + break; + } + + n -= nbytes; + } +out: + zfs_range_unlock(rl); + + ZFS_ACCESSTIME_STAMP(zfsvfs, zp); + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Write the bytes to a file. + * + * IN: vp - vnode of file to be written to. + * uio - structure supplying write location, range info, + * and data buffer. + * ioflag - FAPPEND, FSYNC, and/or FDSYNC. FAPPEND is + * set if in append mode. + * cr - credentials of caller. + * ct - caller context (NFS/CIFS fem monitor only) + * + * OUT: uio - updated offset and range. + * + * RETURN: 0 on success, error code on failure. + * + * Timestamps: + * vp - ctime|mtime updated if byte count > 0 + */ + +/* ARGSUSED */ +static int +zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) +{ + znode_t *zp = VTOZ(vp); + rlim64_t limit = MAXOFFSET_T; + ssize_t start_resid = uio->uio_resid; + ssize_t tx_bytes; + uint64_t end_size; + dmu_tx_t *tx; + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + zilog_t *zilog; + offset_t woff; + ssize_t n, nbytes; + rl_t *rl; + int max_blksz = zfsvfs->z_max_blksz; + int error = 0; + arc_buf_t *abuf; + iovec_t *aiov = NULL; + xuio_t *xuio = NULL; + int i_iov = 0; + int iovcnt = uio->uio_iovcnt; + iovec_t *iovp = uio->uio_iov; + int write_eof; + int count = 0; + sa_bulk_attr_t bulk[4]; + uint64_t mtime[2], ctime[2]; + + /* + * Fasttrack empty write + */ + n = start_resid; + if (n == 0) + return (0); + + if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) + limit = MAXOFFSET_T; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, + &zp->z_size, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, 8); + + /* + * In a case vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. snapshots) our + * callers might not be able to detect properly that we are read-only, + * so check it explicitly here. 
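+	 * Illustrative example: a write attempted through a vnode under
+	 * .zfs/snapshot reaches this point with a read-only zfsvfs and is
+	 * rejected with EROFS just below.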
+ */ + if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EROFS)); + } + + /* + * If immutable or not appending then return EPERM. + * Intentionally allow ZFS_READONLY through here. + * See zfs_zaccess_common() + */ + if ((zp->z_pflags & ZFS_IMMUTABLE) || + ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) && + (uio->uio_loffset < zp->z_size))) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EPERM)); + } + + zilog = zfsvfs->z_log; + + /* + * Validate file offset + */ + woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset; + if (woff < 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + /* + * Check for mandatory locks before calling zfs_range_lock() + * in order to prevent a deadlock with locks set via fcntl(). + */ + if (MANDMODE((mode_t)zp->z_mode) && + (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + +#ifdef illumos + /* + * Pre-fault the pages to ensure slow (eg NFS) pages + * don't hold up txg. + * Skip this if uio contains loaned arc_buf. + */ + if ((uio->uio_extflg == UIO_XUIO) && + (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) + xuio = (xuio_t *)uio; + else + uio_prefaultpages(MIN(n, max_blksz), uio); +#endif + + /* + * If in append mode, set the io offset pointer to eof. + */ + if (ioflag & FAPPEND) { + /* + * Obtain an appending range lock to guarantee file append + * semantics. We reset the write offset once we have the lock. + */ + rl = zfs_range_lock(zp, 0, n, RL_APPEND); + woff = rl->r_off; + if (rl->r_len == UINT64_MAX) { + /* + * We overlocked the file because this write will cause + * the file block size to increase. + * Note that zp_size cannot change with this lock held. + */ + woff = zp->z_size; + } + uio->uio_loffset = woff; + } else { + /* + * Note that if the file block size will change as a result of + * this write, then this range lock will lock the entire file + * so that we can re-write the block safely. + */ + rl = zfs_range_lock(zp, woff, n, RL_WRITER); + } + + if (vn_rlimit_fsize(vp, uio, uio->uio_td)) { + zfs_range_unlock(rl); + ZFS_EXIT(zfsvfs); + return (EFBIG); + } + + if (woff >= limit) { + zfs_range_unlock(rl); + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EFBIG)); + } + + if ((woff + n) > limit || woff > (limit - n)) + n = limit - woff; + + /* Will this write extend the file length? */ + write_eof = (woff + n > zp->z_size); + + end_size = MAX(zp->z_size, woff + n); + + /* + * Write the file in reasonable size chunks. Each chunk is written + * in a separate transaction; this keeps the intent log records small + * and allows us to do more fine-grained space accounting. + */ + while (n > 0) { + abuf = NULL; + woff = uio->uio_loffset; + if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) || + zfs_owner_overquota(zfsvfs, zp, B_TRUE)) { + if (abuf != NULL) + dmu_return_arcbuf(abuf); + error = SET_ERROR(EDQUOT); + break; + } + + if (xuio && abuf == NULL) { + ASSERT(i_iov < iovcnt); + aiov = &iovp[i_iov]; + abuf = dmu_xuio_arcbuf(xuio, i_iov); + dmu_xuio_clear(xuio, i_iov); + DTRACE_PROBE3(zfs_cp_write, int, i_iov, + iovec_t *, aiov, arc_buf_t *, abuf); + ASSERT((aiov->iov_base == abuf->b_data) || + ((char *)aiov->iov_base - (char *)abuf->b_data + + aiov->iov_len == arc_buf_size(abuf))); + i_iov++; + } else if (abuf == NULL && n >= max_blksz && + woff >= zp->z_size && + P2PHASE(woff, max_blksz) == 0 && + zp->z_blksz == max_blksz) { + /* + * This write covers a full block. "Borrow" a buffer + * from the dmu so that we can fill it before we enter + * a transaction. 
This avoids the possibility of
+			 * holding up the transaction if the data copy hangs
+			 * up on a pagefault (e.g., from an NFS server
+			 * mapping).
+			 */
+			size_t cbytes;
+
+			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
+			    max_blksz);
+			ASSERT(abuf != NULL);
+			ASSERT(arc_buf_size(abuf) == max_blksz);
+			if (error = uiocopy(abuf->b_data, max_blksz,
+			    UIO_WRITE, uio, &cbytes)) {
+				dmu_return_arcbuf(abuf);
+				break;
+			}
+			ASSERT(cbytes == max_blksz);
+		}
+
+		/*
+		 * Start a transaction.
+		 */
+		tx = dmu_tx_create(zfsvfs->z_os);
+		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+		dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
+		zfs_sa_upgrade_txholds(tx, zp);
+		error = dmu_tx_assign(tx, TXG_WAIT);
+		if (error) {
+			dmu_tx_abort(tx);
+			if (abuf != NULL)
+				dmu_return_arcbuf(abuf);
+			break;
+		}
+
+		/*
+		 * If zfs_range_lock() over-locked we grow the blocksize
+		 * and then reduce the lock range. This will only happen
+		 * on the first iteration since zfs_range_reduce() will
+		 * shrink down r_len to the appropriate size.
+		 */
+		if (rl->r_len == UINT64_MAX) {
+			uint64_t new_blksz;
+
+			if (zp->z_blksz > max_blksz) {
+				/*
+				 * File's blocksize is already larger than the
+				 * "recordsize" property. Only let it grow to
+				 * the next power of 2.
+				 */
+				ASSERT(!ISP2(zp->z_blksz));
+				new_blksz = MIN(end_size,
+				    1 << highbit64(zp->z_blksz));
+			} else {
+				new_blksz = MIN(end_size, max_blksz);
+			}
+			zfs_grow_blocksize(zp, new_blksz, tx);
+			zfs_range_reduce(rl, woff, n);
+		}
+
+		/*
+		 * XXX - should we really limit each write to z_max_blksz?
+		 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
+		 */
+		nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
+
+		if (woff + nbytes > zp->z_size)
+			vnode_pager_setsize(vp, woff + nbytes);
+
+		if (abuf == NULL) {
+			tx_bytes = uio->uio_resid;
+			error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
+			    uio, nbytes, tx);
+			tx_bytes -= uio->uio_resid;
+		} else {
+			tx_bytes = nbytes;
+			ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
+			/*
+			 * If this is not a full block write, but we are
+			 * extending the file past EOF and this data starts
+			 * block-aligned, use assign_arcbuf(). Otherwise,
+			 * write via dmu_write().
+			 */
+			if (tx_bytes < max_blksz && (!write_eof ||
+			    aiov->iov_base != abuf->b_data)) {
+				ASSERT(xuio);
+				dmu_write(zfsvfs->z_os, zp->z_id, woff,
+				    aiov->iov_len, aiov->iov_base, tx);
+				dmu_return_arcbuf(abuf);
+				xuio_stat_wbuf_copied();
+			} else {
+				ASSERT(xuio || tx_bytes == max_blksz);
+				dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl),
+				    woff, abuf, tx);
+			}
+			ASSERT(tx_bytes <= uio->uio_resid);
+			uioskip(uio, tx_bytes);
+		}
+		if (tx_bytes && vn_has_cached_data(vp)) {
+			update_pages(vp, woff, tx_bytes, zfsvfs->z_os,
+			    zp->z_id, uio->uio_segflg, tx);
+		}
+
+		/*
+		 * If we made no progress, we're done. If we made even
+		 * partial progress, update the znode and ZIL accordingly.
+		 */
+		if (tx_bytes == 0) {
+			(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
+			    (void *)&zp->z_size, sizeof (uint64_t), tx);
+			dmu_tx_commit(tx);
+			ASSERT(error != 0);
+			break;
+		}
+
+		/*
+		 * Clear Set-UID/Set-GID bits on successful write if not
+		 * privileged and at least one of the execute bits is set.
+		 *
+		 * It would be nice to do this after all writes have
+		 * been done, but that would still expose the ISUID/ISGID
+		 * to another app after the partial write is committed.
+		 *
+		 * Note: we don't call zfs_fuid_map_id() here because
+		 * user 0 is not an ephemeral uid.
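+		 *
+		 * For example, an unprivileged write to a file with mode
+		 * 04755 (setuid and executable) leaves the file with mode
+		 * 0755.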
+ */ + mutex_enter(&zp->z_acl_lock); + if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) | + (S_IXUSR >> 6))) != 0 && + (zp->z_mode & (S_ISUID | S_ISGID)) != 0 && + secpolicy_vnode_setid_retain(vp, cr, + (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) { + uint64_t newmode; + zp->z_mode &= ~(S_ISUID | S_ISGID); + newmode = zp->z_mode; + (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), + (void *)&newmode, sizeof (uint64_t), tx); + } + mutex_exit(&zp->z_acl_lock); + + zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, + B_TRUE); + + /* + * Update the file size (zp_size) if it has changed; + * account for possible concurrent updates. + */ + while ((end_size = zp->z_size) < uio->uio_loffset) { + (void) atomic_cas_64(&zp->z_size, end_size, + uio->uio_loffset); +#ifdef illumos + ASSERT(error == 0); +#else + ASSERT(error == 0 || error == EFAULT); +#endif + } + /* + * If we are replaying and eof is non zero then force + * the file size to the specified eof. Note, there's no + * concurrency during replay. + */ + if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0) + zp->z_size = zfsvfs->z_replay_eof; + + if (error == 0) + error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + else + (void) sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + + zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag); + dmu_tx_commit(tx); + + if (error != 0) + break; + ASSERT(tx_bytes == nbytes); + n -= nbytes; + +#ifdef illumos + if (!xuio && n > 0) + uio_prefaultpages(MIN(n, max_blksz), uio); +#endif + } + + zfs_range_unlock(rl); + + /* + * If we're in replay mode, or we made no progress, return error. + * Otherwise, it's at least a partial write, so it's successful. + */ + if (zfsvfs->z_replay || uio->uio_resid == start_resid) { + ZFS_EXIT(zfsvfs); + return (error); + } + +#ifdef __FreeBSD__ + /* + * EFAULT means that at least one page of the source buffer was not + * available. VFS will re-try remaining I/O upon this error. + */ + if (error == EFAULT) { + ZFS_EXIT(zfsvfs); + return (error); + } +#endif + + if (ioflag & (FSYNC | FDSYNC) || + zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, zp->z_id); + + ZFS_EXIT(zfsvfs); + return (0); +} + +void +zfs_get_done(zgd_t *zgd, int error) +{ + znode_t *zp = zgd->zgd_private; + objset_t *os = zp->z_zfsvfs->z_os; + + if (zgd->zgd_db) + dmu_buf_rele(zgd->zgd_db, zgd); + + zfs_range_unlock(zgd->zgd_rl); + + /* + * Release the vnode asynchronously as we currently have the + * txg stopped from syncing. + */ + VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os))); + + if (error == 0 && zgd->zgd_bp) + zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp); + + kmem_free(zgd, sizeof (zgd_t)); +} + +#ifdef DEBUG +static int zil_fault_io = 0; +#endif + +/* + * Get data to generate a TX_WRITE intent log record. + */ +int +zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) +{ + zfsvfs_t *zfsvfs = arg; + objset_t *os = zfsvfs->z_os; + znode_t *zp; + uint64_t object = lr->lr_foid; + uint64_t offset = lr->lr_offset; + uint64_t size = lr->lr_length; + dmu_buf_t *db; + zgd_t *zgd; + int error = 0; + + ASSERT3P(lwb, !=, NULL); + ASSERT3P(zio, !=, NULL); + ASSERT3U(size, !=, 0); + + /* + * Nothing to do if the file has been removed + */ + if (zfs_zget(zfsvfs, object, &zp) != 0) + return (SET_ERROR(ENOENT)); + if (zp->z_unlinked) { + /* + * Release the vnode asynchronously as we currently have the + * txg stopped from syncing. 
+		 */
+		VN_RELE_ASYNC(ZTOV(zp),
+		    dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
+		return (SET_ERROR(ENOENT));
+	}
+
+	zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
+	zgd->zgd_lwb = lwb;
+	zgd->zgd_private = zp;
+
+	/*
+	 * Write records come in two flavors: immediate and indirect.
+	 * For small writes it's cheaper to store the data with the
+	 * log record (immediate); for large writes it's cheaper to
+	 * sync the data and get a pointer to it (indirect) so that
+	 * we don't have to write the data twice.
+	 */
+	if (buf != NULL) {	/* immediate write */
+		zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER);
+		/* test for truncation needs to be done while range locked */
+		if (offset >= zp->z_size) {
+			error = SET_ERROR(ENOENT);
+		} else {
+			error = dmu_read(os, object, offset, size, buf,
+			    DMU_READ_NO_PREFETCH);
+		}
+		ASSERT(error == 0 || error == ENOENT);
+	} else {	/* indirect write */
+		/*
+		 * Have to lock the whole block to ensure when it's
+		 * written out and its checksum is being calculated
+		 * that no one can change the data. We need to re-check
+		 * blocksize after we get the lock in case it's changed!
+		 */
+		for (;;) {
+			uint64_t blkoff;
+			size = zp->z_blksz;
+			blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
+			offset -= blkoff;
+			zgd->zgd_rl = zfs_range_lock(zp, offset, size,
+			    RL_READER);
+			if (zp->z_blksz == size)
+				break;
+			offset += blkoff;
+			zfs_range_unlock(zgd->zgd_rl);
+		}
+		/* test for truncation needs to be done while range locked */
+		if (lr->lr_offset >= zp->z_size)
+			error = SET_ERROR(ENOENT);
+#ifdef DEBUG
+		if (zil_fault_io) {
+			error = SET_ERROR(EIO);
+			zil_fault_io = 0;
+		}
+#endif
+		if (error == 0)
+			error = dmu_buf_hold(os, object, offset, zgd, &db,
+			    DMU_READ_NO_PREFETCH);
+
+		if (error == 0) {
+			blkptr_t *bp = &lr->lr_blkptr;
+
+			zgd->zgd_db = db;
+			zgd->zgd_bp = bp;
+
+			ASSERT(db->db_offset == offset);
+			ASSERT(db->db_size == size);
+
+			error = dmu_sync(zio, lr->lr_common.lrc_txg,
+			    zfs_get_done, zgd);
+			ASSERT(error || lr->lr_length <= size);
+
+			/*
+			 * On success, we need to wait for the write I/O
+			 * initiated by dmu_sync() to complete before we can
+			 * release this dbuf. We will finish everything up
+			 * in the zfs_get_done() callback.
+			 */
+			if (error == 0)
+				return (0);
+
+			if (error == EALREADY) {
+				lr->lr_common.lrc_txtype = TX_WRITE2;
+				/*
+				 * TX_WRITE2 relies on the data previously
+				 * written by the TX_WRITE that caused
+				 * EALREADY. We zero out the BP because
+				 * it is the old, currently-on-disk BP,
+				 * so there's no need to zio_flush() its
+				 * vdevs (flushing would needlessly hurt
+				 * performance, and doesn't work on
+				 * indirect vdevs).
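+				 *
+				 * Illustrative example: a synchronous
+				 * rewrite of a block whose dirty data
+				 * already reached disk via normal TXG sync
+				 * gets EALREADY from dmu_sync(); the record
+				 * becomes TX_WRITE2 and replay reuses the
+				 * block written by the original TX_WRITE.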
+ */ + zgd->zgd_bp = NULL; + BP_ZERO(bp); + error = 0; + } + } + } + + zfs_get_done(zgd, error); + + return (error); +} + +/*ARGSUSED*/ +static int +zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr, + caller_context_t *ct) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int error; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + if (flag & V_ACE_MASK) + error = zfs_zaccess(zp, mode, flag, B_FALSE, cr); + else + error = zfs_zaccess_rwx(zp, mode, flag, cr); + + ZFS_EXIT(zfsvfs); + return (error); +} + +static int +zfs_dd_callback(struct mount *mp, void *arg, int lkflags, struct vnode **vpp) +{ + int error; + + *vpp = arg; + error = vn_lock(*vpp, lkflags); + if (error != 0) + vrele(*vpp); + return (error); +} + +static int +zfs_lookup_lock(vnode_t *dvp, vnode_t *vp, const char *name, int lkflags) +{ + znode_t *zdp = VTOZ(dvp); + zfsvfs_t *zfsvfs = zdp->z_zfsvfs; + int error; + int ltype; + + ASSERT_VOP_LOCKED(dvp, __func__); +#ifdef DIAGNOSTIC + if ((zdp->z_pflags & ZFS_XATTR) == 0) + VERIFY(!RRM_LOCK_HELD(&zfsvfs->z_teardown_lock)); +#endif + + if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) { + ASSERT3P(dvp, ==, vp); + vref(dvp); + ltype = lkflags & LK_TYPE_MASK; + if (ltype != VOP_ISLOCKED(dvp)) { + if (ltype == LK_EXCLUSIVE) + vn_lock(dvp, LK_UPGRADE | LK_RETRY); + else /* if (ltype == LK_SHARED) */ + vn_lock(dvp, LK_DOWNGRADE | LK_RETRY); + + /* + * Relock for the "." case could leave us with + * reclaimed vnode. + */ + if (dvp->v_iflag & VI_DOOMED) { + vrele(dvp); + return (SET_ERROR(ENOENT)); + } + } + return (0); + } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) { + /* + * Note that in this case, dvp is the child vnode, and we + * are looking up the parent vnode - exactly reverse from + * normal operation. Unlocking dvp requires some rather + * tricky unlock/relock dance to prevent mp from being freed; + * use vn_vget_ino_gen() which takes care of all that. + * + * XXX Note that there is a time window when both vnodes are + * unlocked. It is possible, although highly unlikely, that + * during that window the parent-child relationship between + * the vnodes may change, for example, get reversed. + * In that case we would have a wrong lock order for the vnodes. + * All other filesystems seem to ignore this problem, so we + * do the same here. + * A potential solution could be implemented as follows: + * - using LK_NOWAIT when locking the second vnode and retrying + * if necessary + * - checking that the parent-child relationship still holds + * after locking both vnodes and retrying if it doesn't + */ + error = vn_vget_ino_gen(dvp, zfs_dd_callback, vp, lkflags, &vp); + return (error); + } else { + error = vn_lock(vp, lkflags); + if (error != 0) + vrele(vp); + return (error); + } +} + +/* + * Lookup an entry in a directory, or an extended attribute directory. + * If it exists, return a held vnode reference for it. + * + * IN: dvp - vnode of directory to search. + * nm - name of entry to lookup. + * pnp - full pathname to lookup [UNUSED]. + * flags - LOOKUP_XATTR set if looking for an attribute. + * rdir - root directory vnode [UNUSED]. + * cr - credentials of caller. + * ct - caller context + * + * OUT: vpp - vnode of located entry, NULL if not found. + * + * RETURN: 0 on success, error code on failure. 
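+ *
+ * Illustrative example: a lookup of "foo" that succeeds returns with
+ * *vpp held and locked according to cnp->cn_lkflags, while an ENOENT
+ * result for the last component of a CREATE or RENAME is translated
+ * to EJUSTRETURN further below.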
+ *
+ * Timestamps:
+ *	NA
+ */
+/* ARGSUSED */
+static int
+zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
+    int nameiop, cred_t *cr, kthread_t *td, int flags)
+{
+	znode_t *zdp = VTOZ(dvp);
+	znode_t *zp;
+	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
+	int	error = 0;
+
+	/*
+	 * Fast path lookup; however, we must skip DNLC lookup
+	 * for case folding or normalizing lookups because the
+	 * DNLC code only stores the passed in name. This means
+	 * creating 'a' and removing 'A' on a case insensitive
+	 * file system would work, but DNLC still thinks 'a'
+	 * exists and won't let you create it again on the next
+	 * pass through fast path.
+	 */
+	if (!(flags & LOOKUP_XATTR)) {
+		if (dvp->v_type != VDIR) {
+			return (SET_ERROR(ENOTDIR));
+		} else if (zdp->z_sa_hdl == NULL) {
+			return (SET_ERROR(EIO));
+		}
+	}
+
+	DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm);
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zdp);
+
+	*vpp = NULL;
+
+	if (flags & LOOKUP_XATTR) {
+#ifdef TODO
+		/*
+		 * If the xattr property is off, refuse the lookup request.
+		 */
+		if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
+			ZFS_EXIT(zfsvfs);
+			return (SET_ERROR(EINVAL));
+		}
+#endif
+
+		/*
+		 * We don't allow recursive attributes.
+		 * Maybe someday we will.
+		 */
+		if (zdp->z_pflags & ZFS_XATTR) {
+			ZFS_EXIT(zfsvfs);
+			return (SET_ERROR(EINVAL));
+		}
+
+		if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
+			ZFS_EXIT(zfsvfs);
+			return (error);
+		}
+
+		/*
+		 * Do we have permission to get into attribute directory?
+		 */
+		if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
+		    B_FALSE, cr)) {
+			vrele(*vpp);
+			*vpp = NULL;
+		}
+
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	/*
+	 * Check accessibility of directory.
+	 */
+	if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
+	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EILSEQ));
+	}
+
+	/*
+	 * First handle the special cases.
+	 */
+	if ((cnp->cn_flags & ISDOTDOT) != 0) {
+		/*
+		 * If we are a snapshot mounted under .zfs, return
+		 * the vp for the snapshot directory.
+		 */
+		if (zdp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) {
+			struct componentname cn;
+			vnode_t *zfsctl_vp;
+			int ltype;
+
+			ZFS_EXIT(zfsvfs);
+			ltype = VOP_ISLOCKED(dvp);
+			VOP_UNLOCK(dvp, 0);
+			error = zfsctl_root(zfsvfs->z_parent, LK_SHARED,
+			    &zfsctl_vp);
+			if (error == 0) {
+				cn.cn_nameptr = "snapshot";
+				cn.cn_namelen = strlen(cn.cn_nameptr);
+				cn.cn_nameiop = cnp->cn_nameiop;
+				cn.cn_flags = cnp->cn_flags & ~ISDOTDOT;
+				cn.cn_lkflags = cnp->cn_lkflags;
+				error = VOP_LOOKUP(zfsctl_vp, vpp, &cn);
+				vput(zfsctl_vp);
+			}
+			vn_lock(dvp, ltype | LK_RETRY);
+			return (error);
+		}
+	}
+	if (zfs_has_ctldir(zdp) && strcmp(nm, ZFS_CTLDIR_NAME) == 0) {
+		ZFS_EXIT(zfsvfs);
+		if ((cnp->cn_flags & ISLASTCN) != 0 && nameiop != LOOKUP)
+			return (SET_ERROR(ENOTSUP));
+		error = zfsctl_root(zfsvfs, cnp->cn_lkflags, vpp);
+		return (error);
+	}
+
+	/*
+	 * The loop retries the lookup if the parent-child relationship
+	 * changes during the dot-dot locking complexities.
+	 */
+	for (;;) {
+		uint64_t parent;
+
+		error = zfs_dirlook(zdp, nm, &zp);
+		if (error == 0)
+			*vpp = ZTOV(zp);
+
+		ZFS_EXIT(zfsvfs);
+		if (error != 0)
+			break;
+
+		error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags);
+		if (error != 0) {
+			/*
+			 * If we've got a locking error, then the vnode
+			 * got reclaimed because of a force unmount.
+ * We never enter doomed vnodes into the name cache. + */ + *vpp = NULL; + return (error); + } + + if ((cnp->cn_flags & ISDOTDOT) == 0) + break; + + ZFS_ENTER(zfsvfs); + if (zdp->z_sa_hdl == NULL) { + error = SET_ERROR(EIO); + } else { + error = sa_lookup(zdp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), + &parent, sizeof (parent)); + } + if (error != 0) { + ZFS_EXIT(zfsvfs); + vput(ZTOV(zp)); + break; + } + if (zp->z_id == parent) { + ZFS_EXIT(zfsvfs); + break; + } + vput(ZTOV(zp)); + } + +out: + if (error != 0) + *vpp = NULL; + + /* Translate errors and add SAVENAME when needed. */ + if (cnp->cn_flags & ISLASTCN) { + switch (nameiop) { + case CREATE: + case RENAME: + if (error == ENOENT) { + error = EJUSTRETURN; + cnp->cn_flags |= SAVENAME; + break; + } + /* FALLTHROUGH */ + case DELETE: + if (error == 0) + cnp->cn_flags |= SAVENAME; + break; + } + } + + /* Insert name into cache (as non-existent) if appropriate. */ + if (zfsvfs->z_use_namecache && + error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0) + cache_enter(dvp, NULL, cnp); + + /* Insert name into cache if appropriate. */ + if (zfsvfs->z_use_namecache && + error == 0 && (cnp->cn_flags & MAKEENTRY)) { + if (!(cnp->cn_flags & ISLASTCN) || + (nameiop != DELETE && nameiop != RENAME)) { + cache_enter(dvp, *vpp, cnp); + } + } + + return (error); +} + +/* + * Attempt to create a new entry in a directory. If the entry + * already exists, truncate the file if permissible, else return + * an error. Return the vp of the created or trunc'd file. + * + * IN: dvp - vnode of directory to put new file entry in. + * name - name of new file entry. + * vap - attributes of new file. + * excl - flag indicating exclusive or non-exclusive mode. + * mode - mode to open file with. + * cr - credentials of caller. + * flag - large file flag [UNUSED]. + * ct - caller context + * vsecp - ACL to be set + * + * OUT: vpp - vnode of created or trunc'd entry. + * + * RETURN: 0 on success, error code on failure. 
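+ *
+ * Illustrative example: creating a name that already exists is
+ * rejected with EEXIST by the zfs_dirent_lookup(..., ZNEW) call
+ * below; existence is tested before the ACL check, so callers see
+ * EEXIST rather than EACCES.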
+ * + * Timestamps: + * dvp - ctime|mtime updated if new entry created + * vp - ctime|mtime always, atime if new + */ + +/* ARGSUSED */ +static int +zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode, + vnode_t **vpp, cred_t *cr, kthread_t *td) +{ + znode_t *zp, *dzp = VTOZ(dvp); + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + zilog_t *zilog; + objset_t *os; + dmu_tx_t *tx; + int error; + ksid_t *ksid; + uid_t uid; + gid_t gid = crgetgid(cr); + zfs_acl_ids_t acl_ids; + boolean_t fuid_dirtied; + void *vsecp = NULL; + int flag = 0; + uint64_t txtype; + + /* + * If we have an ephemeral id, ACL, or XVATTR then + * make sure file system is at proper version + */ + + ksid = crgetsid(cr, KSID_OWNER); + if (ksid) + uid = ksid_getid(ksid); + else + uid = crgetuid(cr); + + if (zfsvfs->z_use_fuids == B_FALSE && + (vsecp || (vap->va_mask & AT_XVATTR) || + IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) + return (SET_ERROR(EINVAL)); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + os = zfsvfs->z_os; + zilog = zfsvfs->z_log; + + if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), + NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EILSEQ)); + } + + if (vap->va_mask & AT_XVATTR) { + if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap, + crgetuid(cr), cr, vap->va_type)) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + } + + *vpp = NULL; + + if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr)) + vap->va_mode &= ~S_ISVTX; + + error = zfs_dirent_lookup(dzp, name, &zp, ZNEW); + if (error) { + ZFS_EXIT(zfsvfs); + return (error); + } + ASSERT3P(zp, ==, NULL); + + /* + * Create a new file object and update the directory + * to reference it. + */ + if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { + goto out; + } + + /* + * We only support the creation of regular files in + * extended attribute directories. + */ + + if ((dzp->z_pflags & ZFS_XATTR) && + (vap->va_type != VREG)) { + error = SET_ERROR(EINVAL); + goto out; + } + + if ((error = zfs_acl_ids_create(dzp, 0, vap, + cr, vsecp, &acl_ids)) != 0) + goto out; + + if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { + zfs_acl_ids_free(&acl_ids); + error = SET_ERROR(EDQUOT); + goto out; + } + + getnewvnode_reserve(1); + + tx = dmu_tx_create(os); + + dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + + ZFS_SA_BASE_ATTR_SIZE); + + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); + dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); + if (!zfsvfs->z_use_sa && + acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, + 0, acl_ids.z_aclp->z_acl_bytes); + } + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + zfs_acl_ids_free(&acl_ids); + dmu_tx_abort(tx); + getnewvnode_drop_reserve(); + ZFS_EXIT(zfsvfs); + return (error); + } + zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + + (void) zfs_link_create(dzp, name, zp, tx, ZNEW); + txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap); + zfs_log_create(zilog, tx, txtype, dzp, zp, name, + vsecp, acl_ids.z_fuidp, vap); + zfs_acl_ids_free(&acl_ids); + dmu_tx_commit(tx); + + getnewvnode_drop_reserve(); + +out: + if (error == 0) { + *vpp = ZTOV(zp); + } + + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Remove an entry from a directory. + * + * IN: dvp - vnode of directory to remove entry from. + * name - name of entry to remove. 
+ * cr - credentials of caller. + * ct - caller context + * flags - case flags + * + * RETURN: 0 on success, error code on failure. + * + * Timestamps: + * dvp - ctime|mtime + * vp - ctime (if nlink > 0) + */ + +/*ARGSUSED*/ +static int +zfs_remove(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr) +{ + znode_t *dzp = VTOZ(dvp); + znode_t *zp = VTOZ(vp); + znode_t *xzp; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + zilog_t *zilog; + uint64_t acl_obj, xattr_obj; + uint64_t obj = 0; + dmu_tx_t *tx; + boolean_t unlinked, toobig = FALSE; + uint64_t txtype; + int error; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + ZFS_VERIFY_ZP(zp); + zilog = zfsvfs->z_log; + zp = VTOZ(vp); + + xattr_obj = 0; + xzp = NULL; + + if (error = zfs_zaccess_delete(dzp, zp, cr)) { + goto out; + } + + /* + * Need to use rmdir for removing directories. + */ + if (vp->v_type == VDIR) { + error = SET_ERROR(EPERM); + goto out; + } + + vnevent_remove(vp, dvp, name, ct); + + obj = zp->z_id; + + /* are there any extended attributes? */ + error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), + &xattr_obj, sizeof (xattr_obj)); + if (error == 0 && xattr_obj) { + error = zfs_zget(zfsvfs, xattr_obj, &xzp); + ASSERT0(error); + } + + /* + * We may delete the znode now, or we may put it in the unlinked set; + * it depends on whether we're the last link, and on whether there are + * other holds on the vnode. So we dmu_tx_hold() the right things to + * allow for either case. + */ + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, zp); + zfs_sa_upgrade_txholds(tx, dzp); + + if (xzp) { + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); + } + + /* charge as an update -- would be nice not to charge at all */ + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); + + /* + * Mark this transaction as typically resulting in a net free of space + */ + dmu_tx_mark_netfree(tx); + + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * Remove the directory entry. + */ + error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, &unlinked); + + if (error) { + dmu_tx_commit(tx); + goto out; + } + + if (unlinked) { + zfs_unlinked_add(zp, tx); + vp->v_vflag |= VV_NOSYNC; + } + + txtype = TX_REMOVE; + zfs_log_remove(zilog, tx, txtype, dzp, name, obj); + + dmu_tx_commit(tx); +out: + + if (xzp) + vrele(ZTOV(xzp)); + + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Create a new directory and insert it into dvp using the name + * provided. Return a pointer to the inserted directory. + * + * IN: dvp - vnode of directory to add subdir to. + * dirname - name of new directory. + * vap - attributes of new directory. + * cr - credentials of caller. + * ct - caller context + * flags - case flags + * vsecp - ACL to be set + * + * OUT: vpp - vnode of created directory. + * + * RETURN: 0 on success, error code on failure. 
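+ *
+ * For example, mkdir(2) of an existing name returns EEXIST from the
+ * ZNEW dirent lookup below; as the comment there notes, existence is
+ * checked before permissions so applications see EEXIST rather than
+ * EACCES.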
+ * + * Timestamps: + * dvp - ctime|mtime updated + * vp - ctime|mtime|atime updated + */ +/*ARGSUSED*/ +static int +zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr) +{ + znode_t *zp, *dzp = VTOZ(dvp); + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + zilog_t *zilog; + uint64_t txtype; + dmu_tx_t *tx; + int error; + ksid_t *ksid; + uid_t uid; + gid_t gid = crgetgid(cr); + zfs_acl_ids_t acl_ids; + boolean_t fuid_dirtied; + + ASSERT(vap->va_type == VDIR); + + /* + * If we have an ephemeral id, ACL, or XVATTR then + * make sure file system is at proper version + */ + + ksid = crgetsid(cr, KSID_OWNER); + if (ksid) + uid = ksid_getid(ksid); + else + uid = crgetuid(cr); + if (zfsvfs->z_use_fuids == B_FALSE && + ((vap->va_mask & AT_XVATTR) || + IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) + return (SET_ERROR(EINVAL)); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + zilog = zfsvfs->z_log; + + if (dzp->z_pflags & ZFS_XATTR) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + if (zfsvfs->z_utf8 && u8_validate(dirname, + strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EILSEQ)); + } + + if (vap->va_mask & AT_XVATTR) { + if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap, + crgetuid(cr), cr, vap->va_type)) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + } + + if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, + NULL, &acl_ids)) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * First make sure the new directory doesn't exist. + * + * Existence is checked first to make sure we don't return + * EACCES instead of EEXIST which can cause some applications + * to fail. + */ + *vpp = NULL; + + if (error = zfs_dirent_lookup(dzp, dirname, &zp, ZNEW)) { + zfs_acl_ids_free(&acl_ids); + ZFS_EXIT(zfsvfs); + return (error); + } + ASSERT3P(zp, ==, NULL); + + if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) { + zfs_acl_ids_free(&acl_ids); + ZFS_EXIT(zfsvfs); + return (error); + } + + if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { + zfs_acl_ids_free(&acl_ids); + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EDQUOT)); + } + + /* + * Add a new entry to the directory. + */ + getnewvnode_reserve(1); + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, + acl_ids.z_aclp->z_acl_bytes); + } + + dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + + ZFS_SA_BASE_ATTR_SIZE); + + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + zfs_acl_ids_free(&acl_ids); + dmu_tx_abort(tx); + getnewvnode_drop_reserve(); + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * Create new node. + */ + zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + + /* + * Now put new name in parent dir. + */ + (void) zfs_link_create(dzp, dirname, zp, tx, ZNEW); + + *vpp = ZTOV(zp); + + txtype = zfs_log_create_txtype(Z_DIR, NULL, vap); + zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, NULL, + acl_ids.z_fuidp, vap); + + zfs_acl_ids_free(&acl_ids); + + dmu_tx_commit(tx); + + getnewvnode_drop_reserve(); + + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + ZFS_EXIT(zfsvfs); + return (0); +} + +/* + * Remove a directory subdir entry. 
If the current working + * directory is the same as the subdir to be removed, the + * remove will fail. + * + * IN: dvp - vnode of directory to remove from. + * name - name of directory to be removed. + * cwd - vnode of current working directory. + * cr - credentials of caller. + * ct - caller context + * flags - case flags + * + * RETURN: 0 on success, error code on failure. + * + * Timestamps: + * dvp - ctime|mtime updated + */ +/*ARGSUSED*/ +static int +zfs_rmdir(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr) +{ + znode_t *dzp = VTOZ(dvp); + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + zilog_t *zilog; + dmu_tx_t *tx; + int error; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + ZFS_VERIFY_ZP(zp); + zilog = zfsvfs->z_log; + + + if (error = zfs_zaccess_delete(dzp, zp, cr)) { + goto out; + } + + if (vp->v_type != VDIR) { + error = SET_ERROR(ENOTDIR); + goto out; + } + + vnevent_rmdir(vp, dvp, name, ct); + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); + zfs_sa_upgrade_txholds(tx, zp); + zfs_sa_upgrade_txholds(tx, dzp); + dmu_tx_mark_netfree(tx); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + ZFS_EXIT(zfsvfs); + return (error); + } + + cache_purge(dvp); + + error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, NULL); + + if (error == 0) { + uint64_t txtype = TX_RMDIR; + zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT); + } + + dmu_tx_commit(tx); + + cache_purge(vp); +out: + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Read as many directory entries as will fit into the provided + * buffer from the given directory cursor position (specified in + * the uio structure). + * + * IN: vp - vnode of directory to read. + * uio - structure supplying read location, range info, + * and return buffer. + * cr - credentials of caller. + * ct - caller context + * flags - case flags + * + * OUT: uio - updated offset and range, buffer filled. + * eofp - set to true if end-of-file detected. + * + * RETURN: 0 on success, error code on failure. + * + * Timestamps: + * vp - atime updated + * + * Note that the low 4 bits of the cookie returned by zap is always zero. + * This allows us to use the low range for "special" directory entries: + * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem, + * we use the offset 2 for the '.zfs' directory. + */ +/* ARGSUSED */ +static int +zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_long **cookies) +{ + znode_t *zp = VTOZ(vp); + iovec_t *iovp; + edirent_t *eodp; + dirent64_t *odp; + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + objset_t *os; + caddr_t outbuf; + size_t bufsize; + zap_cursor_t zc; + zap_attribute_t zap; + uint_t bytes_wanted; + uint64_t offset; /* must be unsigned; checks for < 1 */ + uint64_t parent; + int local_eof; + int outcount; + int error; + uint8_t prefetch; + boolean_t check_sysattrs; + uint8_t type; + int ncooks; + u_long *cooks = NULL; + int flags = 0; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), + &parent, sizeof (parent))) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * If we are not given an eof variable, + * use a local one. + */ + if (eofp == NULL) + eofp = &local_eof; + + /* + * Check for valid iov_len. 
+ */ + if (uio->uio_iov->iov_len <= 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + /* + * Quit if directory has been removed (posix) + */ + if ((*eofp = zp->z_unlinked) != 0) { + ZFS_EXIT(zfsvfs); + return (0); + } + + error = 0; + os = zfsvfs->z_os; + offset = uio->uio_loffset; + prefetch = zp->z_zn_prefetch; + + /* + * Initialize the iterator cursor. + */ + if (offset <= 3) { + /* + * Start iteration from the beginning of the directory. + */ + zap_cursor_init(&zc, os, zp->z_id); + } else { + /* + * The offset is a serialized cursor. + */ + zap_cursor_init_serialized(&zc, os, zp->z_id, offset); + } + + /* + * Get space to change directory entries into fs independent format. + */ + iovp = uio->uio_iov; + bytes_wanted = iovp->iov_len; + if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) { + bufsize = bytes_wanted; + outbuf = kmem_alloc(bufsize, KM_SLEEP); + odp = (struct dirent64 *)outbuf; + } else { + bufsize = bytes_wanted; + outbuf = NULL; + odp = (struct dirent64 *)iovp->iov_base; + } + eodp = (struct edirent *)odp; + + if (ncookies != NULL) { + /* + * Minimum entry size is dirent size and 1 byte for a file name. + */ + ncooks = uio->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1); + cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK); + *cookies = cooks; + *ncookies = ncooks; + } + /* + * If this VFS supports the system attribute view interface; and + * we're looking at an extended attribute directory; and we care + * about normalization conflicts on this vfs; then we must check + * for normalization conflicts with the sysattr name space. + */ +#ifdef TODO + check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) && + (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm && + (flags & V_RDDIR_ENTFLAGS); +#else + check_sysattrs = 0; +#endif + + /* + * Transform to file-system independent format + */ + outcount = 0; + while (outcount < bytes_wanted) { + ino64_t objnum; + ushort_t reclen; + off64_t *next = NULL; + + /* + * Special case `.', `..', and `.zfs'. + */ + if (offset == 0) { + (void) strcpy(zap.za_name, "."); + zap.za_normalization_conflict = 0; + objnum = zp->z_id; + type = DT_DIR; + } else if (offset == 1) { + (void) strcpy(zap.za_name, ".."); + zap.za_normalization_conflict = 0; + objnum = parent; + type = DT_DIR; + } else if (offset == 2 && zfs_show_ctldir(zp)) { + (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME); + zap.za_normalization_conflict = 0; + objnum = ZFSCTL_INO_ROOT; + type = DT_DIR; + } else { + /* + * Grab next entry. 
+ */ + if (error = zap_cursor_retrieve(&zc, &zap)) { + if ((*eofp = (error == ENOENT)) != 0) + break; + else + goto update; + } + + if (zap.za_integer_length != 8 || + zap.za_num_integers != 1) { + cmn_err(CE_WARN, "zap_readdir: bad directory " + "entry, obj = %lld, offset = %lld\n", + (u_longlong_t)zp->z_id, + (u_longlong_t)offset); + error = SET_ERROR(ENXIO); + goto update; + } + + objnum = ZFS_DIRENT_OBJ(zap.za_first_integer); + /* + * MacOS X can extract the object type here such as: + * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer); + */ + type = ZFS_DIRENT_TYPE(zap.za_first_integer); + + if (check_sysattrs && !zap.za_normalization_conflict) { +#ifdef TODO + zap.za_normalization_conflict = + xattr_sysattr_casechk(zap.za_name); +#else + panic("%s:%u: TODO", __func__, __LINE__); +#endif + } + } + + if (flags & V_RDDIR_ACCFILTER) { + /* + * If we have no access at all, don't include + * this entry in the returned information + */ + znode_t *ezp; + if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0) + goto skip_entry; + if (!zfs_has_access(ezp, cr)) { + vrele(ZTOV(ezp)); + goto skip_entry; + } + vrele(ZTOV(ezp)); + } + + if (flags & V_RDDIR_ENTFLAGS) + reclen = EDIRENT_RECLEN(strlen(zap.za_name)); + else + reclen = DIRENT64_RECLEN(strlen(zap.za_name)); + + /* + * Will this entry fit in the buffer? + */ + if (outcount + reclen > bufsize) { + /* + * Did we manage to fit anything in the buffer? + */ + if (!outcount) { + error = SET_ERROR(EINVAL); + goto update; + } + break; + } + if (flags & V_RDDIR_ENTFLAGS) { + /* + * Add extended flag entry: + */ + eodp->ed_ino = objnum; + eodp->ed_reclen = reclen; + /* NOTE: ed_off is the offset for the *next* entry. */ + next = &eodp->ed_off; + eodp->ed_eflags = zap.za_normalization_conflict ? + ED_CASE_CONFLICT : 0; + (void) strncpy(eodp->ed_name, zap.za_name, + EDIRENT_NAMELEN(reclen)); + eodp = (edirent_t *)((intptr_t)eodp + reclen); + } else { + /* + * Add normal entry: + */ + odp->d_ino = objnum; + odp->d_reclen = reclen; + odp->d_namlen = strlen(zap.za_name); + /* NOTE: d_off is the offset for the *next* entry. */ + next = &odp->d_off; + (void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1); + odp->d_type = type; + dirent_terminate(odp); + odp = (dirent64_t *)((intptr_t)odp + reclen); + } + outcount += reclen; + + ASSERT(outcount <= bufsize); + + /* Prefetch znode */ + if (prefetch) + dmu_prefetch(os, objnum, 0, 0, 0, + ZIO_PRIORITY_SYNC_READ); + + skip_entry: + /* + * Move to the next entry, fill in the previous offset. + */ + if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) { + zap_cursor_advance(&zc); + offset = zap_cursor_serialize(&zc); + } else { + offset += 1; + } + + /* Fill the offset right after advancing the cursor. */ + if (next != NULL) + *next = offset; + if (cooks != NULL) { + *cooks++ = offset; + ncooks--; + KASSERT(ncooks >= 0, ("ncookies=%d", ncooks)); + } + } + zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */ + + /* Subtract unused cookies */ + if (ncookies != NULL) + *ncookies -= ncooks; + + if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) { + iovp->iov_base += outcount; + iovp->iov_len -= outcount; + uio->uio_resid -= outcount; + } else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) { + /* + * Reset the pointer. 
+ */ + offset = uio->uio_loffset; + } + +update: + zap_cursor_fini(&zc); + if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) + kmem_free(outbuf, bufsize); + + if (error == ENOENT) + error = 0; + + ZFS_ACCESSTIME_STAMP(zfsvfs, zp); + + uio->uio_loffset = offset; + ZFS_EXIT(zfsvfs); + if (error != 0 && cookies != NULL) { + free(*cookies, M_TEMP); + *cookies = NULL; + *ncookies = 0; + } + return (error); +} + +ulong_t zfs_fsync_sync_cnt = 4; + +static int +zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + + (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt); + + if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) { + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + zil_commit(zfsvfs->z_log, zp->z_id); + ZFS_EXIT(zfsvfs); + } + return (0); +} + + +/* + * Get the requested file attributes and place them in the provided + * vattr structure. + * + * IN: vp - vnode of file. + * vap - va_mask identifies requested attributes. + * If AT_XVATTR set, then optional attrs are requested + * flags - ATTR_NOACLCHECK (CIFS server context) + * cr - credentials of caller. + * ct - caller context + * + * OUT: vap - attribute values. + * + * RETURN: 0 (always succeeds). + */ +/* ARGSUSED */ +static int +zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, + caller_context_t *ct) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int error = 0; + uint32_t blksize; + u_longlong_t nblocks; + uint64_t mtime[2], ctime[2], crtime[2], rdev; + xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ + xoptattr_t *xoap = NULL; + boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; + sa_bulk_attr_t bulk[4]; + int count = 0; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid); + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16); + if (vp->v_type == VBLK || vp->v_type == VCHR) + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL, + &rdev, 8); + + if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES. + * Also, if we are the owner don't bother, since owner should + * always be allowed to read basic attributes of file. + */ + if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) && + (vap->va_uid != crgetuid(cr))) { + if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0, + skipaclchk, cr)) { + ZFS_EXIT(zfsvfs); + return (error); + } + } + + /* + * Return all attributes. It's cheaper to provide the answer + * than to determine whether we were asked the question. + */ + + vap->va_type = IFTOVT(zp->z_mode); + vap->va_mode = zp->z_mode & ~S_IFMT; +#ifdef illumos + vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev; +#else + vn_fsid(vp, vap); +#endif + vap->va_nodeid = zp->z_id; + vap->va_nlink = zp->z_links; + if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp) && + zp->z_links < ZFS_LINK_MAX) + vap->va_nlink++; + vap->va_size = zp->z_size; +#ifdef illumos + vap->va_rdev = vp->v_rdev; +#else + if (vp->v_type == VBLK || vp->v_type == VCHR) + vap->va_rdev = zfs_cmpldev(rdev); +#endif + vap->va_seq = zp->z_seq; + vap->va_flags = 0; /* FreeBSD: Reset chflags(2) flags. */ + vap->va_filerev = zp->z_seq; + + /* + * Add in any requested optional attributes and the create time. 
+ * Also set the corresponding bits in the returned attribute bitmap. + */ + if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) { + if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { + xoap->xoa_archive = + ((zp->z_pflags & ZFS_ARCHIVE) != 0); + XVA_SET_RTN(xvap, XAT_ARCHIVE); + } + + if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { + xoap->xoa_readonly = + ((zp->z_pflags & ZFS_READONLY) != 0); + XVA_SET_RTN(xvap, XAT_READONLY); + } + + if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { + xoap->xoa_system = + ((zp->z_pflags & ZFS_SYSTEM) != 0); + XVA_SET_RTN(xvap, XAT_SYSTEM); + } + + if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { + xoap->xoa_hidden = + ((zp->z_pflags & ZFS_HIDDEN) != 0); + XVA_SET_RTN(xvap, XAT_HIDDEN); + } + + if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { + xoap->xoa_nounlink = + ((zp->z_pflags & ZFS_NOUNLINK) != 0); + XVA_SET_RTN(xvap, XAT_NOUNLINK); + } + + if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { + xoap->xoa_immutable = + ((zp->z_pflags & ZFS_IMMUTABLE) != 0); + XVA_SET_RTN(xvap, XAT_IMMUTABLE); + } + + if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { + xoap->xoa_appendonly = + ((zp->z_pflags & ZFS_APPENDONLY) != 0); + XVA_SET_RTN(xvap, XAT_APPENDONLY); + } + + if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { + xoap->xoa_nodump = + ((zp->z_pflags & ZFS_NODUMP) != 0); + XVA_SET_RTN(xvap, XAT_NODUMP); + } + + if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { + xoap->xoa_opaque = + ((zp->z_pflags & ZFS_OPAQUE) != 0); + XVA_SET_RTN(xvap, XAT_OPAQUE); + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { + xoap->xoa_av_quarantined = + ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0); + XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { + xoap->xoa_av_modified = + ((zp->z_pflags & ZFS_AV_MODIFIED) != 0); + XVA_SET_RTN(xvap, XAT_AV_MODIFIED); + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) && + vp->v_type == VREG) { + zfs_sa_get_scanstamp(zp, xvap); + } + + if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { + xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0); + XVA_SET_RTN(xvap, XAT_REPARSE); + } + if (XVA_ISSET_REQ(xvap, XAT_GEN)) { + xoap->xoa_generation = zp->z_gen; + XVA_SET_RTN(xvap, XAT_GEN); + } + + if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) { + xoap->xoa_offline = + ((zp->z_pflags & ZFS_OFFLINE) != 0); + XVA_SET_RTN(xvap, XAT_OFFLINE); + } + + if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) { + xoap->xoa_sparse = + ((zp->z_pflags & ZFS_SPARSE) != 0); + XVA_SET_RTN(xvap, XAT_SPARSE); + } + } + + ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime); + ZFS_TIME_DECODE(&vap->va_mtime, mtime); + ZFS_TIME_DECODE(&vap->va_ctime, ctime); + ZFS_TIME_DECODE(&vap->va_birthtime, crtime); + + + sa_object_size(zp->z_sa_hdl, &blksize, &nblocks); + vap->va_blksize = blksize; + vap->va_bytes = nblocks << 9; /* nblocks * 512 */ + + if (zp->z_blksz == 0) { + /* + * Block size hasn't been set; suggest maximal I/O transfers. + */ + vap->va_blksize = zfsvfs->z_max_blksz; + } + + ZFS_EXIT(zfsvfs); + return (0); +} + +/* + * Set the file attributes to the values contained in the + * vattr structure. + * + * IN: vp - vnode of file to be modified. + * vap - new attribute values. + * If AT_XVATTR set, then optional attrs are being set + * flags - ATTR_UTIME set if non-default time values provided. + * - ATTR_NOACLCHECK (CIFS context only). + * cr - credentials of caller. + * ct - caller context + * + * RETURN: 0 on success, error code on failure. + * + * Timestamps: + * vp - ctime updated, mtime updated if size changed. 
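+ *
+ * As a rough usage sketch (a hypothetical caller, not code from
+ * this file), a chmod to 0644 reduces to:
+ *
+ *     vattr_t va = { 0 };
+ *
+ *     va.va_mask = AT_MODE;
+ *     va.va_mode = 0644;
+ *     error = zfs_setattr(vp, &va, 0, cr, NULL);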
+ */
+/* ARGSUSED */
+static int
+zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
+ caller_context_t *ct)
+{
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ zilog_t *zilog;
+ dmu_tx_t *tx;
+ vattr_t oldva;
+ xvattr_t tmpxvattr;
+ uint_t mask = vap->va_mask;
+ uint_t saved_mask = 0;
+ uint64_t saved_mode;
+ int trim_mask = 0;
+ uint64_t new_mode;
+ uint64_t new_uid, new_gid;
+ uint64_t xattr_obj;
+ uint64_t mtime[2], ctime[2];
+ znode_t *attrzp;
+ int need_policy = FALSE;
+ int err, err2;
+ zfs_fuid_info_t *fuidp = NULL;
+ xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */
+ xoptattr_t *xoap;
+ zfs_acl_t *aclp;
+ boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
+ boolean_t fuid_dirtied = B_FALSE;
+ sa_bulk_attr_t bulk[7], xattr_bulk[7];
+ int count = 0, xattr_count = 0;
+
+ if (mask == 0)
+ return (0);
+
+ if (mask & AT_NOSET)
+ return (SET_ERROR(EINVAL));
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ zilog = zfsvfs->z_log;
+
+ /*
+ * Make sure that if an ephemeral uid/gid or an xvattr is specified,
+ * the file system is at the proper version level.
+ */
+
+ if (zfsvfs->z_use_fuids == B_FALSE &&
+ (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
+ ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
+ (mask & AT_XVATTR))) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ if (mask & AT_SIZE && vp->v_type == VDIR) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EISDIR));
+ }
+
+ if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * If this is an xvattr_t, then get a pointer to the structure of
+ * optional attributes. If this is NULL, then we have a vattr_t.
+ */
+ xoap = xva_getxoptattr(xvap);
+
+ xva_init(&tmpxvattr);
+
+ /*
+ * For immutable files, only the immutable bit and atime may be altered.
+ */
+ if ((zp->z_pflags & ZFS_IMMUTABLE) &&
+ ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
+ ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EPERM));
+ }
+
+ /*
+ * Note: ZFS_READONLY is handled in zfs_zaccess_common.
+ */
+
+ /*
+ * Verify that the timestamps don't overflow 32 bits.
+ * ZFS can handle large timestamps, but 32-bit syscalls can't
+ * handle times greater than 2039. This check should be removed
+ * once large timestamps are fully supported.
+ */
+ if (mask & (AT_ATIME | AT_MTIME)) {
+ if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
+ ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EOVERFLOW));
+ }
+ }
+ if (xoap && (mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME) &&
+ TIMESPEC_OVERFLOW(&vap->va_birthtime)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EOVERFLOW));
+ }
+
+ attrzp = NULL;
+ aclp = NULL;
+
+ /* Can this be moved to before the top label? */
+ if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EROFS));
+ }
+
+ /*
+ * First validate permissions
+ */
+
+ if (mask & AT_SIZE) {
+ /*
+ * XXX - Note, we are not providing any open
+ * mode flags here (like FNDELAY), so we may
+ * block if there are locks present... this
+ * should be addressed in openat().
+ */
+ /* XXX - would it be OK to generate a log record here?
*/ + err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); + if (err) { + ZFS_EXIT(zfsvfs); + return (err); + } + } + + if (mask & (AT_ATIME|AT_MTIME) || + ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) || + XVA_ISSET_REQ(xvap, XAT_READONLY) || + XVA_ISSET_REQ(xvap, XAT_ARCHIVE) || + XVA_ISSET_REQ(xvap, XAT_OFFLINE) || + XVA_ISSET_REQ(xvap, XAT_SPARSE) || + XVA_ISSET_REQ(xvap, XAT_CREATETIME) || + XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) { + need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, + skipaclchk, cr); + } + + if (mask & (AT_UID|AT_GID)) { + int idmask = (mask & (AT_UID|AT_GID)); + int take_owner; + int take_group; + + /* + * NOTE: even if a new mode is being set, + * we may clear S_ISUID/S_ISGID bits. + */ + + if (!(mask & AT_MODE)) + vap->va_mode = zp->z_mode; + + /* + * Take ownership or chgrp to group we are a member of + */ + + take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr)); + take_group = (mask & AT_GID) && + zfs_groupmember(zfsvfs, vap->va_gid, cr); + + /* + * If both AT_UID and AT_GID are set then take_owner and + * take_group must both be set in order to allow taking + * ownership. + * + * Otherwise, send the check through secpolicy_vnode_setattr() + * + */ + + if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) || + ((idmask == AT_UID) && take_owner) || + ((idmask == AT_GID) && take_group)) { + if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0, + skipaclchk, cr) == 0) { + /* + * Remove setuid/setgid for non-privileged users + */ + secpolicy_setid_clear(vap, vp, cr); + trim_mask = (mask & (AT_UID|AT_GID)); + } else { + need_policy = TRUE; + } + } else { + need_policy = TRUE; + } + } + + oldva.va_mode = zp->z_mode; + zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); + if (mask & AT_XVATTR) { + /* + * Update xvattr mask to include only those attributes + * that are actually changing. + * + * the bits will be restored prior to actually setting + * the attributes so the caller thinks they were set. 
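+ *
+ * Every flag below is checked with the same pattern: a genuine
+ * change forces the policy check, while a no-op is cleared from
+ * the request and remembered in tmpxvattr. Roughly, with
+ * XAT_FOO/ZFS_FOO standing in for each pair (a sketch):
+ *
+ *     if (XVA_ISSET_REQ(xvap, XAT_FOO)) {
+ *             if (xoap->xoa_foo != ((zp->z_pflags & ZFS_FOO) != 0))
+ *                     need_policy = TRUE;
+ *             else {
+ *                     XVA_CLR_REQ(xvap, XAT_FOO);
+ *                     XVA_SET_REQ(&tmpxvattr, XAT_FOO);
+ *             }
+ *     }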
+ */ + if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { + if (xoap->xoa_appendonly != + ((zp->z_pflags & ZFS_APPENDONLY) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_APPENDONLY); + XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { + if (xoap->xoa_nounlink != + ((zp->z_pflags & ZFS_NOUNLINK) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_NOUNLINK); + XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { + if (xoap->xoa_immutable != + ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_IMMUTABLE); + XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { + if (xoap->xoa_nodump != + ((zp->z_pflags & ZFS_NODUMP) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_NODUMP); + XVA_SET_REQ(&tmpxvattr, XAT_NODUMP); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { + if (xoap->xoa_av_modified != + ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_AV_MODIFIED); + XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { + if ((vp->v_type != VREG && + xoap->xoa_av_quarantined) || + xoap->xoa_av_quarantined != + ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED); + XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EPERM)); + } + + if (need_policy == FALSE && + (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) || + XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { + need_policy = TRUE; + } + } + + if (mask & AT_MODE) { + if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) { + err = secpolicy_setid_setsticky_clear(vp, vap, + &oldva, cr); + if (err) { + ZFS_EXIT(zfsvfs); + return (err); + } + trim_mask |= AT_MODE; + } else { + need_policy = TRUE; + } + } + + if (need_policy) { + /* + * If trim_mask is set then take ownership + * has been granted or write_acl is present and user + * has the ability to modify mode. In that case remove + * UID|GID and or MODE from mask so that + * secpolicy_vnode_setattr() doesn't revoke it. + */ + + if (trim_mask) { + saved_mask = vap->va_mask; + vap->va_mask &= ~trim_mask; + if (trim_mask & AT_MODE) { + /* + * Save the mode, as secpolicy_vnode_setattr() + * will overwrite it with ova.va_mode. + */ + saved_mode = vap->va_mode; + } + } + err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags, + (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp); + if (err) { + ZFS_EXIT(zfsvfs); + return (err); + } + + if (trim_mask) { + vap->va_mask |= saved_mask; + if (trim_mask & AT_MODE) { + /* + * Recover the mode after + * secpolicy_vnode_setattr(). 
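+ * (secpolicy_vnode_setattr() rewrites va_mode from the old
+ * attributes whenever AT_MODE was trimmed out of the mask, which
+ * is why the requested mode was stashed in saved_mode above.)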
+ */ + vap->va_mode = saved_mode; + } + } + } + + /* + * secpolicy_vnode_setattr, or take ownership may have + * changed va_mask + */ + mask = vap->va_mask; + + if ((mask & (AT_UID | AT_GID))) { + err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), + &xattr_obj, sizeof (xattr_obj)); + + if (err == 0 && xattr_obj) { + err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp); + if (err == 0) { + err = vn_lock(ZTOV(attrzp), LK_EXCLUSIVE); + if (err != 0) + vrele(ZTOV(attrzp)); + } + if (err) + goto out2; + } + if (mask & AT_UID) { + new_uid = zfs_fuid_create(zfsvfs, + (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp); + if (new_uid != zp->z_uid && + zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) { + if (attrzp) + vput(ZTOV(attrzp)); + err = SET_ERROR(EDQUOT); + goto out2; + } + } + + if (mask & AT_GID) { + new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid, + cr, ZFS_GROUP, &fuidp); + if (new_gid != zp->z_gid && + zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) { + if (attrzp) + vput(ZTOV(attrzp)); + err = SET_ERROR(EDQUOT); + goto out2; + } + } + } + tx = dmu_tx_create(zfsvfs->z_os); + + if (mask & AT_MODE) { + uint64_t pmode = zp->z_mode; + uint64_t acl_obj; + new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); + + if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED && + !(zp->z_pflags & ZFS_ACL_TRIVIAL)) { + err = SET_ERROR(EPERM); + goto out; + } + + if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)) + goto out; + + if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) { + /* + * Are we upgrading ACL from old V0 format + * to V1 format? + */ + if (zfsvfs->z_version >= ZPL_VERSION_FUID && + zfs_znode_acl_version(zp) == + ZFS_ACL_VERSION_INITIAL) { + dmu_tx_hold_free(tx, acl_obj, 0, + DMU_OBJECT_END); + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, + 0, aclp->z_acl_bytes); + } else { + dmu_tx_hold_write(tx, acl_obj, 0, + aclp->z_acl_bytes); + } + } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, + 0, aclp->z_acl_bytes); + } + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + } else { + if ((mask & AT_XVATTR) && + XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + else + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + } + + if (attrzp) { + dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE); + } + + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + + zfs_sa_upgrade_txholds(tx, zp); + + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) + goto out; + + count = 0; + /* + * Set each attribute requested. + * We group settings according to the locks they need to acquire. + * + * Note: you cannot set ctime directly, although it will be + * updated as a side-effect of calling this function. 
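+ *
+ * The stores below are staged with SA_ADD_BULK_ATTR(), which only
+ * records (attribute, address, length) tuples in bulk[]; nothing
+ * is written until sa_bulk_update() runs against the already
+ * assigned tx near the end. Roughly:
+ *
+ *     SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
+ *         &new_uid, sizeof (new_uid));
+ *     ...
+ *     err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);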
+ */ + + if (mask & (AT_UID|AT_GID|AT_MODE)) + mutex_enter(&zp->z_acl_lock); + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, sizeof (zp->z_pflags)); + + if (attrzp) { + if (mask & (AT_UID|AT_GID|AT_MODE)) + mutex_enter(&attrzp->z_acl_lock); + SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, + SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags, + sizeof (attrzp->z_pflags)); + } + + if (mask & (AT_UID|AT_GID)) { + + if (mask & AT_UID) { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, + &new_uid, sizeof (new_uid)); + zp->z_uid = new_uid; + if (attrzp) { + SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, + SA_ZPL_UID(zfsvfs), NULL, &new_uid, + sizeof (new_uid)); + attrzp->z_uid = new_uid; + } + } + + if (mask & AT_GID) { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), + NULL, &new_gid, sizeof (new_gid)); + zp->z_gid = new_gid; + if (attrzp) { + SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, + SA_ZPL_GID(zfsvfs), NULL, &new_gid, + sizeof (new_gid)); + attrzp->z_gid = new_gid; + } + } + if (!(mask & AT_MODE)) { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), + NULL, &new_mode, sizeof (new_mode)); + new_mode = zp->z_mode; + } + err = zfs_acl_chown_setattr(zp); + ASSERT(err == 0); + if (attrzp) { + err = zfs_acl_chown_setattr(attrzp); + ASSERT(err == 0); + } + } + + if (mask & AT_MODE) { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, + &new_mode, sizeof (new_mode)); + zp->z_mode = new_mode; + ASSERT3U((uintptr_t)aclp, !=, 0); + err = zfs_aclset_common(zp, aclp, cr, tx); + ASSERT0(err); + if (zp->z_acl_cached) + zfs_acl_free(zp->z_acl_cached); + zp->z_acl_cached = aclp; + aclp = NULL; + } + + + if (mask & AT_ATIME) { + ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, + &zp->z_atime, sizeof (zp->z_atime)); + } + + if (mask & AT_MTIME) { + ZFS_TIME_ENCODE(&vap->va_mtime, mtime); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, + mtime, sizeof (mtime)); + } + + /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */ + if (mask & AT_SIZE && !(mask & AT_MTIME)) { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), + NULL, mtime, sizeof (mtime)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + &ctime, sizeof (ctime)); + zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, + B_TRUE); + } else if (mask != 0) { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + &ctime, sizeof (ctime)); + zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime, + B_TRUE); + if (attrzp) { + SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, + SA_ZPL_CTIME(zfsvfs), NULL, + &ctime, sizeof (ctime)); + zfs_tstamp_update_setup(attrzp, STATE_CHANGED, + mtime, ctime, B_TRUE); + } + } + /* + * Do this after setting timestamps to prevent timestamp + * update from toggling bit + */ + + if (xoap && (mask & AT_XVATTR)) { + + if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) + xoap->xoa_createtime = vap->va_birthtime; + /* + * restore trimmed off masks + * so that return masks can be set for caller. 
+ */ + + if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) { + XVA_SET_REQ(xvap, XAT_APPENDONLY); + } + if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) { + XVA_SET_REQ(xvap, XAT_NOUNLINK); + } + if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) { + XVA_SET_REQ(xvap, XAT_IMMUTABLE); + } + if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) { + XVA_SET_REQ(xvap, XAT_NODUMP); + } + if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) { + XVA_SET_REQ(xvap, XAT_AV_MODIFIED); + } + if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) { + XVA_SET_REQ(xvap, XAT_AV_QUARANTINED); + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) + ASSERT(vp->v_type == VREG); + + zfs_xvattr_set(zp, xvap, tx); + } + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + + if (mask != 0) + zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); + + if (mask & (AT_UID|AT_GID|AT_MODE)) + mutex_exit(&zp->z_acl_lock); + + if (attrzp) { + if (mask & (AT_UID|AT_GID|AT_MODE)) + mutex_exit(&attrzp->z_acl_lock); + } +out: + if (err == 0 && attrzp) { + err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk, + xattr_count, tx); + ASSERT(err2 == 0); + } + + if (attrzp) + vput(ZTOV(attrzp)); + + if (aclp) + zfs_acl_free(aclp); + + if (fuidp) { + zfs_fuid_info_free(fuidp); + fuidp = NULL; + } + + if (err) { + dmu_tx_abort(tx); + } else { + err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + dmu_tx_commit(tx); + } + +out2: + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + ZFS_EXIT(zfsvfs); + return (err); +} + +/* + * We acquire all but fdvp locks using non-blocking acquisitions. If we + * fail to acquire any lock in the path we will drop all held locks, + * acquire the new lock in a blocking fashion, and then release it and + * restart the rename. This acquire/release step ensures that we do not + * spin on a lock waiting for release. On error release all vnode locks + * and decrement references the way tmpfs_rename() would do. + */ +static int +zfs_rename_relock(struct vnode *sdvp, struct vnode **svpp, + struct vnode *tdvp, struct vnode **tvpp, + const struct componentname *scnp, const struct componentname *tcnp) +{ + zfsvfs_t *zfsvfs; + struct vnode *nvp, *svp, *tvp; + znode_t *sdzp, *tdzp, *szp, *tzp; + const char *snm = scnp->cn_nameptr; + const char *tnm = tcnp->cn_nameptr; + int error; + + VOP_UNLOCK(tdvp, 0); + if (*tvpp != NULL && *tvpp != tdvp) + VOP_UNLOCK(*tvpp, 0); + +relock: + error = vn_lock(sdvp, LK_EXCLUSIVE); + if (error) + goto out; + sdzp = VTOZ(sdvp); + + error = vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT); + if (error != 0) { + VOP_UNLOCK(sdvp, 0); + if (error != EBUSY) + goto out; + error = vn_lock(tdvp, LK_EXCLUSIVE); + if (error) + goto out; + VOP_UNLOCK(tdvp, 0); + goto relock; + } + tdzp = VTOZ(tdvp); + + /* + * Before using sdzp and tdzp we must ensure that they are live. + * As a porting legacy from illumos we have two things to worry + * about. One is typical for FreeBSD and it is that the vnode is + * not reclaimed (doomed). The other is that the znode is live. + * The current code can invalidate the znode without acquiring the + * corresponding vnode lock if the object represented by the znode + * and vnode is no longer valid after a rollback or receive operation. + * z_teardown_lock hidden behind ZFS_ENTER and ZFS_EXIT is the lock + * that protects the znodes from the invalidation. 
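+ * Hence the explicit checks that follow: after ZFS_ENTER(zfsvfs)
+ * has taken z_teardown_lock as reader, a znode that is still live
+ * is recognized by its non-NULL SA handle; a reclaimed object is
+ * reported as EIO. Roughly:
+ *
+ *     if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL)
+ *             error = SET_ERROR(EIO);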
+ */ + zfsvfs = sdzp->z_zfsvfs; + ASSERT3P(zfsvfs, ==, tdzp->z_zfsvfs); + ZFS_ENTER(zfsvfs); + + /* + * We can not use ZFS_VERIFY_ZP() here because it could directly return + * bypassing the cleanup code in the case of an error. + */ + if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) { + ZFS_EXIT(zfsvfs); + VOP_UNLOCK(sdvp, 0); + VOP_UNLOCK(tdvp, 0); + error = SET_ERROR(EIO); + goto out; + } + + /* + * Re-resolve svp to be certain it still exists and fetch the + * correct vnode. + */ + error = zfs_dirent_lookup(sdzp, snm, &szp, ZEXISTS); + if (error != 0) { + /* Source entry invalid or not there. */ + ZFS_EXIT(zfsvfs); + VOP_UNLOCK(sdvp, 0); + VOP_UNLOCK(tdvp, 0); + if ((scnp->cn_flags & ISDOTDOT) != 0 || + (scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.')) + error = SET_ERROR(EINVAL); + goto out; + } + svp = ZTOV(szp); + + /* + * Re-resolve tvp, if it disappeared we just carry on. + */ + error = zfs_dirent_lookup(tdzp, tnm, &tzp, 0); + if (error != 0) { + ZFS_EXIT(zfsvfs); + VOP_UNLOCK(sdvp, 0); + VOP_UNLOCK(tdvp, 0); + vrele(svp); + if ((tcnp->cn_flags & ISDOTDOT) != 0) + error = SET_ERROR(EINVAL); + goto out; + } + if (tzp != NULL) + tvp = ZTOV(tzp); + else + tvp = NULL; + + /* + * At present the vnode locks must be acquired before z_teardown_lock, + * although it would be more logical to use the opposite order. + */ + ZFS_EXIT(zfsvfs); + + /* + * Now try acquire locks on svp and tvp. + */ + nvp = svp; + error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT); + if (error != 0) { + VOP_UNLOCK(sdvp, 0); + VOP_UNLOCK(tdvp, 0); + if (tvp != NULL) + vrele(tvp); + if (error != EBUSY) { + vrele(nvp); + goto out; + } + error = vn_lock(nvp, LK_EXCLUSIVE); + if (error != 0) { + vrele(nvp); + goto out; + } + VOP_UNLOCK(nvp, 0); + /* + * Concurrent rename race. + * XXX ? + */ + if (nvp == tdvp) { + vrele(nvp); + error = SET_ERROR(EINVAL); + goto out; + } + vrele(*svpp); + *svpp = nvp; + goto relock; + } + vrele(*svpp); + *svpp = nvp; + + if (*tvpp != NULL) + vrele(*tvpp); + *tvpp = NULL; + if (tvp != NULL) { + nvp = tvp; + error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT); + if (error != 0) { + VOP_UNLOCK(sdvp, 0); + VOP_UNLOCK(tdvp, 0); + VOP_UNLOCK(*svpp, 0); + if (error != EBUSY) { + vrele(nvp); + goto out; + } + error = vn_lock(nvp, LK_EXCLUSIVE); + if (error != 0) { + vrele(nvp); + goto out; + } + vput(nvp); + goto relock; + } + *tvpp = nvp; + } + + return (0); + +out: + return (error); +} + +/* + * Note that we must use VRELE_ASYNC in this function as it walks + * up the directory tree and vrele may need to acquire an exclusive + * lock if a last reference to a vnode is dropped. + */ +static int +zfs_rename_check(znode_t *szp, znode_t *sdzp, znode_t *tdzp) +{ + zfsvfs_t *zfsvfs; + znode_t *zp, *zp1; + uint64_t parent; + int error; + + zfsvfs = tdzp->z_zfsvfs; + if (tdzp == szp) + return (SET_ERROR(EINVAL)); + if (tdzp == sdzp) + return (0); + if (tdzp->z_id == zfsvfs->z_root) + return (0); + zp = tdzp; + for (;;) { + ASSERT(!zp->z_unlinked); + if ((error = sa_lookup(zp->z_sa_hdl, + SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) + break; + + if (parent == szp->z_id) { + error = SET_ERROR(EINVAL); + break; + } + if (parent == zfsvfs->z_root) + break; + if (parent == sdzp->z_id) + break; + + error = zfs_zget(zfsvfs, parent, &zp1); + if (error != 0) + break; + + if (zp != tdzp) + VN_RELE_ASYNC(ZTOV(zp), + dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os))); + zp = zp1; + } + + if (error == ENOTDIR) + panic("checkpath: .. 
not a directory\n"); + if (zp != tdzp) + VN_RELE_ASYNC(ZTOV(zp), + dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os))); + return (error); +} + +/* + * Move an entry from the provided source directory to the target + * directory. Change the entry name as indicated. + * + * IN: sdvp - Source directory containing the "old entry". + * snm - Old entry name. + * tdvp - Target directory to contain the "new entry". + * tnm - New entry name. + * cr - credentials of caller. + * ct - caller context + * flags - case flags + * + * RETURN: 0 on success, error code on failure. + * + * Timestamps: + * sdvp,tdvp - ctime|mtime updated + */ +/*ARGSUSED*/ +static int +zfs_rename(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp, + vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp, + cred_t *cr) +{ + zfsvfs_t *zfsvfs; + znode_t *sdzp, *tdzp, *szp, *tzp; + zilog_t *zilog = NULL; + dmu_tx_t *tx; + char *snm = scnp->cn_nameptr; + char *tnm = tcnp->cn_nameptr; + int error = 0; + + /* Reject renames across filesystems. */ + if ((*svpp)->v_mount != tdvp->v_mount || + ((*tvpp) != NULL && (*svpp)->v_mount != (*tvpp)->v_mount)) { + error = SET_ERROR(EXDEV); + goto out; + } + + if (zfsctl_is_node(tdvp)) { + error = SET_ERROR(EXDEV); + goto out; + } + + /* + * Lock all four vnodes to ensure safety and semantics of renaming. + */ + error = zfs_rename_relock(sdvp, svpp, tdvp, tvpp, scnp, tcnp); + if (error != 0) { + /* no vnodes are locked in the case of error here */ + return (error); + } + + tdzp = VTOZ(tdvp); + sdzp = VTOZ(sdvp); + zfsvfs = tdzp->z_zfsvfs; + zilog = zfsvfs->z_log; + + /* + * After we re-enter ZFS_ENTER() we will have to revalidate all + * znodes involved. + */ + ZFS_ENTER(zfsvfs); + + if (zfsvfs->z_utf8 && u8_validate(tnm, + strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + error = SET_ERROR(EILSEQ); + goto unlockout; + } + + /* If source and target are the same file, there is nothing to do. */ + if ((*svpp) == (*tvpp)) { + error = 0; + goto unlockout; + } + + if (((*svpp)->v_type == VDIR && (*svpp)->v_mountedhere != NULL) || + ((*tvpp) != NULL && (*tvpp)->v_type == VDIR && + (*tvpp)->v_mountedhere != NULL)) { + error = SET_ERROR(EXDEV); + goto unlockout; + } + + /* + * We can not use ZFS_VERIFY_ZP() here because it could directly return + * bypassing the cleanup code in the case of an error. + */ + if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) { + error = SET_ERROR(EIO); + goto unlockout; + } + + szp = VTOZ(*svpp); + tzp = *tvpp == NULL ? NULL : VTOZ(*tvpp); + if (szp->z_sa_hdl == NULL || (tzp != NULL && tzp->z_sa_hdl == NULL)) { + error = SET_ERROR(EIO); + goto unlockout; + } + + /* + * This is to prevent the creation of links into attribute space + * by renaming a linked file into/outof an attribute directory. + * See the comment in zfs_link() for why this is considered bad. + */ + if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) { + error = SET_ERROR(EINVAL); + goto unlockout; + } + + /* + * Must have write access at the source to remove the old entry + * and write access at the target to create the new entry. + * Note that if target and source are the same, this can be + * done in a single check. + */ + if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr)) + goto unlockout; + + if ((*svpp)->v_type == VDIR) { + /* + * Avoid ".", "..", and aliases of "." for obvious reasons. 
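+ * (For example, rename("d/.", "e") and rename("d/..", "e") both
+ * fail here with EINVAL, as does a dot-dot target.)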
+ */ + if ((scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.') || + sdzp == szp || + (scnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) { + error = EINVAL; + goto unlockout; + } + + /* + * Check to make sure rename is valid. + * Can't do a move like this: /usr/a/b to /usr/a/b/c/d + */ + if (error = zfs_rename_check(szp, sdzp, tdzp)) + goto unlockout; + } + + /* + * Does target exist? + */ + if (tzp) { + /* + * Source and target must be the same type. + */ + if ((*svpp)->v_type == VDIR) { + if ((*tvpp)->v_type != VDIR) { + error = SET_ERROR(ENOTDIR); + goto unlockout; + } else { + cache_purge(tdvp); + if (sdvp != tdvp) + cache_purge(sdvp); + } + } else { + if ((*tvpp)->v_type == VDIR) { + error = SET_ERROR(EISDIR); + goto unlockout; + } + } + } + + vnevent_rename_src(*svpp, sdvp, scnp->cn_nameptr, ct); + if (tzp) + vnevent_rename_dest(*tvpp, tdvp, tnm, ct); + + /* + * notify the target directory if it is not the same + * as source directory. + */ + if (tdvp != sdvp) { + vnevent_rename_dest_dir(tdvp, ct); + } + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); + dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); + dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm); + dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); + if (sdzp != tdzp) { + dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, tdzp); + } + if (tzp) { + dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, tzp); + } + + zfs_sa_upgrade_txholds(tx, szp); + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + goto unlockout; + } + + + if (tzp) /* Attempt to remove the existing target */ + error = zfs_link_destroy(tdzp, tnm, tzp, tx, 0, NULL); + + if (error == 0) { + error = zfs_link_create(tdzp, tnm, szp, tx, ZRENAMING); + if (error == 0) { + szp->z_pflags |= ZFS_AV_MODIFIED; + + error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), + (void *)&szp->z_pflags, sizeof (uint64_t), tx); + ASSERT0(error); + + error = zfs_link_destroy(sdzp, snm, szp, tx, ZRENAMING, + NULL); + if (error == 0) { + zfs_log_rename(zilog, tx, TX_RENAME, sdzp, + snm, tdzp, tnm, szp); + + /* + * Update path information for the target vnode + */ + vn_renamepath(tdvp, *svpp, tnm, strlen(tnm)); + } else { + /* + * At this point, we have successfully created + * the target name, but have failed to remove + * the source name. Since the create was done + * with the ZRENAMING flag, there are + * complications; for one, the link count is + * wrong. The easiest way to deal with this + * is to remove the newly created target, and + * return the original error. This must + * succeed; fortunately, it is very unlikely to + * fail, since we just created it. + */ + VERIFY3U(zfs_link_destroy(tdzp, tnm, szp, tx, + ZRENAMING, NULL), ==, 0); + } + } + if (error == 0) { + cache_purge(*svpp); + if (*tvpp != NULL) + cache_purge(*tvpp); + cache_purge_negative(tdvp); + } + } + + dmu_tx_commit(tx); + +unlockout: /* all 4 vnodes are locked, ZFS_ENTER called */ + ZFS_EXIT(zfsvfs); + VOP_UNLOCK(*svpp, 0); + VOP_UNLOCK(sdvp, 0); + +out: /* original two vnodes are locked */ + if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + if (*tvpp != NULL) + VOP_UNLOCK(*tvpp, 0); + if (tdvp != *tvpp) + VOP_UNLOCK(tdvp, 0); + return (error); +} + +/* + * Insert the indicated symbolic reference entry into the directory. + * + * IN: dvp - Directory to contain new symbolic link. + * link - Name for new symlink entry. 
+ * vap - Attributes of new entry.
+ * cr - credentials of caller.
+ * ct - caller context
+ * flags - case flags
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Timestamps:
+ * dvp - ctime|mtime updated
+ */
+/*ARGSUSED*/
+static int
+zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link,
+ cred_t *cr, kthread_t *td)
+{
+ znode_t *zp, *dzp = VTOZ(dvp);
+ dmu_tx_t *tx;
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ zilog_t *zilog;
+ uint64_t len = strlen(link);
+ int error;
+ zfs_acl_ids_t acl_ids;
+ boolean_t fuid_dirtied;
+ uint64_t txtype = TX_SYMLINK;
+ int flags = 0;
+
+ ASSERT(vap->va_type == VLNK);
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(dzp);
+ zilog = zfsvfs->z_log;
+
+ if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
+ NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EILSEQ));
+ }
+
+ if (len > MAXPATHLEN) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(ENAMETOOLONG));
+ }
+
+ if ((error = zfs_acl_ids_create(dzp, 0,
+ vap, cr, NULL, &acl_ids)) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ /*
+ * Attempt to lock directory; fail if entry already exists.
+ */
+ error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
+ if (error) {
+ zfs_acl_ids_free(&acl_ids);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
+ zfs_acl_ids_free(&acl_ids);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
+ zfs_acl_ids_free(&acl_ids);
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EDQUOT));
+ }
+
+ getnewvnode_reserve(1);
+ tx = dmu_tx_create(zfsvfs->z_os);
+ fuid_dirtied = zfsvfs->z_fuid_dirty;
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
+ dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
+ dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+ ZFS_SA_BASE_ATTR_SIZE + len);
+ dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
+ if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
+ acl_ids.z_aclp->z_acl_bytes);
+ }
+ if (fuid_dirtied)
+ zfs_fuid_txhold(zfsvfs, tx);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ zfs_acl_ids_free(&acl_ids);
+ dmu_tx_abort(tx);
+ getnewvnode_drop_reserve();
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ /*
+ * Create a new object for the symlink.
+ * For version 4 ZPL datasets the symlink will be an SA attribute.
+ */
+ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
+
+ if (fuid_dirtied)
+ zfs_fuid_sync(zfsvfs, tx);
+
+ if (zp->z_is_sa)
+ error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
+ link, len, tx);
+ else
+ zfs_sa_symlink(zp, link, len, tx);
+
+ zp->z_size = len;
+ (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
+ &zp->z_size, sizeof (zp->z_size), tx);
+ /*
+ * Insert the new object into the directory.
+ */
+ (void) zfs_link_create(dzp, name, zp, tx, ZNEW);
+
+ zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
+ *vpp = ZTOV(zp);
+
+ zfs_acl_ids_free(&acl_ids);
+
+ dmu_tx_commit(tx);
+
+ getnewvnode_drop_reserve();
+
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Return, in the buffer contained in the provided uio structure,
+ * the symbolic path referred to by vp.
+ *
+ * IN: vp - vnode of symbolic link.
+ * uio - structure to contain the link path.
+ * cr - credentials of caller.
+ * ct - caller context
+ *
+ * OUT: uio - structure containing the link path.
+ *
+ * RETURN: 0 on success, error code on failure.
+ * + * Timestamps: + * vp - atime updated + */ +/* ARGSUSED */ +static int +zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int error; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + if (zp->z_is_sa) + error = sa_lookup_uio(zp->z_sa_hdl, + SA_ZPL_SYMLINK(zfsvfs), uio); + else + error = zfs_sa_readlink(zp, uio); + + ZFS_ACCESSTIME_STAMP(zfsvfs, zp); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Insert a new entry into directory tdvp referencing svp. + * + * IN: tdvp - Directory to contain new entry. + * svp - vnode of new entry. + * name - name of new entry. + * cr - credentials of caller. + * ct - caller context + * + * RETURN: 0 on success, error code on failure. + * + * Timestamps: + * tdvp - ctime|mtime updated + * svp - ctime updated + */ +/* ARGSUSED */ +static int +zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr, + caller_context_t *ct, int flags) +{ + znode_t *dzp = VTOZ(tdvp); + znode_t *tzp, *szp; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + zilog_t *zilog; + dmu_tx_t *tx; + int error; + uint64_t parent; + uid_t owner; + + ASSERT(tdvp->v_type == VDIR); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + zilog = zfsvfs->z_log; + + /* + * POSIX dictates that we return EPERM here. + * Better choices include ENOTSUP or EISDIR. + */ + if (svp->v_type == VDIR) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EPERM)); + } + + szp = VTOZ(svp); + ZFS_VERIFY_ZP(szp); + + if (szp->z_pflags & (ZFS_APPENDONLY | ZFS_IMMUTABLE | ZFS_READONLY)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EPERM)); + } + + /* Prevent links to .zfs/shares files */ + + if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), + &parent, sizeof (uint64_t))) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + if (parent == zfsvfs->z_shares_dir) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EPERM)); + } + + if (zfsvfs->z_utf8 && u8_validate(name, + strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EILSEQ)); + } + + /* + * We do not support links between attributes and non-attributes + * because of the potential security risk of creating links + * into "normal" file space in order to circumvent restrictions + * imposed in attribute space. + */ + if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + + owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER); + if (owner != crgetuid(cr) && secpolicy_basic_link(svp, cr) != 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EPERM)); + } + + if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * Attempt to lock directory; fail if entry already exists. 
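+ * (ZNEW asks zfs_dirent_lookup() for a name that must not exist
+ * yet, so a preexisting target entry surfaces as EEXIST here,
+ * which is what link(2) is expected to return.)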
+ */ + error = zfs_dirent_lookup(dzp, name, &tzp, ZNEW); + if (error) { + ZFS_EXIT(zfsvfs); + return (error); + } + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); + dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); + zfs_sa_upgrade_txholds(tx, szp); + zfs_sa_upgrade_txholds(tx, dzp); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + ZFS_EXIT(zfsvfs); + return (error); + } + + error = zfs_link_create(dzp, name, szp, tx, 0); + + if (error == 0) { + uint64_t txtype = TX_LINK; + zfs_log_link(zilog, tx, txtype, dzp, szp, name); + } + + dmu_tx_commit(tx); + + if (error == 0) { + vnevent_link(svp, ct); + } + + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + ZFS_EXIT(zfsvfs); + return (error); +} + + +/*ARGSUSED*/ +void +zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int error; + + rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); + if (zp->z_sa_hdl == NULL) { + /* + * The fs has been unmounted, or we did a + * suspend/resume and this file no longer exists. + */ + rw_exit(&zfsvfs->z_teardown_inactive_lock); + vrecycle(vp); + return; + } + + if (zp->z_unlinked) { + /* + * Fast path to recycle a vnode of a removed file. + */ + rw_exit(&zfsvfs->z_teardown_inactive_lock); + vrecycle(vp); + return; + } + + if (zp->z_atime_dirty && zp->z_unlinked == 0) { + dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); + + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, zp); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + } else { + (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs), + (void *)&zp->z_atime, sizeof (zp->z_atime), tx); + zp->z_atime_dirty = 0; + dmu_tx_commit(tx); + } + } + rw_exit(&zfsvfs->z_teardown_inactive_lock); +} + + +CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid)); +CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid)); + +/*ARGSUSED*/ +static int +zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + uint32_t gen; + uint64_t gen64; + uint64_t object = zp->z_id; + zfid_short_t *zfid; + int size, i, error; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), + &gen64, sizeof (uint64_t))) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + + gen = (uint32_t)gen64; + + size = (zfsvfs->z_parent != zfsvfs) ? 
LONG_FID_LEN : SHORT_FID_LEN; + +#ifdef illumos + if (fidp->fid_len < size) { + fidp->fid_len = size; + ZFS_EXIT(zfsvfs); + return (SET_ERROR(ENOSPC)); + } +#else + fidp->fid_len = size; +#endif + + zfid = (zfid_short_t *)fidp; + + zfid->zf_len = size; + + for (i = 0; i < sizeof (zfid->zf_object); i++) + zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); + + /* Must have a non-zero generation number to distinguish from .zfs */ + if (gen == 0) + gen = 1; + for (i = 0; i < sizeof (zfid->zf_gen); i++) + zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); + + if (size == LONG_FID_LEN) { + uint64_t objsetid = dmu_objset_id(zfsvfs->z_os); + zfid_long_t *zlfid; + + zlfid = (zfid_long_t *)fidp; + + for (i = 0; i < sizeof (zlfid->zf_setid); i++) + zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i)); + + /* XXX - this should be the generation number for the objset */ + for (i = 0; i < sizeof (zlfid->zf_setgen); i++) + zlfid->zf_setgen[i] = 0; + } + + ZFS_EXIT(zfsvfs); + return (0); +} + +static int +zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, + caller_context_t *ct) +{ + znode_t *zp, *xzp; + zfsvfs_t *zfsvfs; + int error; + + switch (cmd) { + case _PC_LINK_MAX: + *valp = MIN(LONG_MAX, ZFS_LINK_MAX); + return (0); + + case _PC_FILESIZEBITS: + *valp = 64; + return (0); +#ifdef illumos + case _PC_XATTR_EXISTS: + zp = VTOZ(vp); + zfsvfs = zp->z_zfsvfs; + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + *valp = 0; + error = zfs_dirent_lookup(zp, "", &xzp, + ZXATTR | ZEXISTS | ZSHARED); + if (error == 0) { + if (!zfs_dirempty(xzp)) + *valp = 1; + vrele(ZTOV(xzp)); + } else if (error == ENOENT) { + /* + * If there aren't extended attributes, it's the + * same as having zero of them. + */ + error = 0; + } + ZFS_EXIT(zfsvfs); + return (error); + + case _PC_SATTR_ENABLED: + case _PC_SATTR_EXISTS: + *valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) && + (vp->v_type == VREG || vp->v_type == VDIR); + return (0); + + case _PC_ACCESS_FILTERING: + *valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) && + vp->v_type == VDIR; + return (0); + + case _PC_ACL_ENABLED: + *valp = _ACL_ACE_ENABLED; + return (0); +#endif /* illumos */ + case _PC_MIN_HOLE_SIZE: + *valp = (int)SPA_MINBLOCKSIZE; + return (0); +#ifdef illumos + case _PC_TIMESTAMP_RESOLUTION: + /* nanosecond timestamp resolution */ + *valp = 1L; + return (0); +#endif + case _PC_ACL_EXTENDED: + *valp = 0; + return (0); + + case _PC_ACL_NFS4: + *valp = 1; + return (0); + + case _PC_ACL_PATH_MAX: + *valp = ACL_MAX_ENTRIES; + return (0); + + default: + return (EOPNOTSUPP); + } +} + +/*ARGSUSED*/ +static int +zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr, + caller_context_t *ct) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int error; + boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + error = zfs_getacl(zp, vsecp, skipaclchk, cr); + ZFS_EXIT(zfsvfs); + + return (error); +} + +/*ARGSUSED*/ +int +zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr, + caller_context_t *ct) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int error; + boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? 
B_TRUE : B_FALSE; + zilog_t *zilog = zfsvfs->z_log; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + error = zfs_setacl(zp, vsecp, skipaclchk, cr); + + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + ZFS_EXIT(zfsvfs); + return (error); +} + +static int +zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind, + int *rahead) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + objset_t *os = zp->z_zfsvfs->z_os; + rl_t *rl; + vm_object_t object; + off_t start, end, obj_size; + uint_t blksz; + int pgsin_b, pgsin_a; + int error; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + start = IDX_TO_OFF(ma[0]->pindex); + end = IDX_TO_OFF(ma[count - 1]->pindex + 1); + + /* + * Lock a range covering all required and optional pages. + * Note that we need to handle the case of the block size growing. + */ + for (;;) { + blksz = zp->z_blksz; + rl = zfs_range_lock(zp, rounddown(start, blksz), + roundup(end, blksz) - rounddown(start, blksz), RL_READER); + if (blksz == zp->z_blksz) + break; + zfs_range_unlock(rl); + } + + object = ma[0]->object; + zfs_vmobject_wlock(object); + obj_size = object->un_pager.vnp.vnp_size; + zfs_vmobject_wunlock(object); + if (IDX_TO_OFF(ma[count - 1]->pindex) >= obj_size) { + zfs_range_unlock(rl); + ZFS_EXIT(zfsvfs); + return (zfs_vm_pagerret_bad); + } + + pgsin_b = 0; + if (rbehind != NULL) { + pgsin_b = OFF_TO_IDX(start - rounddown(start, blksz)); + pgsin_b = MIN(*rbehind, pgsin_b); + } + + pgsin_a = 0; + if (rahead != NULL) { + pgsin_a = OFF_TO_IDX(roundup(end, blksz) - end); + if (end + IDX_TO_OFF(pgsin_a) >= obj_size) + pgsin_a = OFF_TO_IDX(round_page(obj_size) - end); + pgsin_a = MIN(*rahead, pgsin_a); + } + + /* + * NB: we need to pass the exact byte size of the data that we expect + * to read after accounting for the file size. This is required because + * ZFS will panic if we request DMU to read beyond the end of the last + * allocated block. 
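+ * The count passed below is the number of bytes of the last
+ * requested page that actually exist in the file. As a worked
+ * example (illustrative numbers): with 4 KB pages, obj_size = 9000
+ * and end = 12288, MIN(end, obj_size) - (end - PAGE_SIZE) =
+ * 9000 - 8192 = 808, so only 808 bytes are asked of the DMU for
+ * the final page.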
+ */ + error = dmu_read_pages(os, zp->z_id, ma, count, &pgsin_b, &pgsin_a, + MIN(end, obj_size) - (end - PAGE_SIZE)); + + zfs_range_unlock(rl); + ZFS_ACCESSTIME_STAMP(zfsvfs, zp); + ZFS_EXIT(zfsvfs); + + if (error != 0) + return (zfs_vm_pagerret_error); + + VM_CNT_INC(v_vnodein); + VM_CNT_ADD(v_vnodepgsin, count + pgsin_b + pgsin_a); + if (rbehind != NULL) + *rbehind = pgsin_b; + if (rahead != NULL) + *rahead = pgsin_a; + return (zfs_vm_pagerret_ok); +} + +static int +zfs_freebsd_getpages(ap) + struct vop_getpages_args /* { + struct vnode *a_vp; + vm_page_t *a_m; + int a_count; + int *a_rbehind; + int *a_rahead; + } */ *ap; +{ + + return (zfs_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind, + ap->a_rahead)); +} + +static int +zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags, + int *rtvals) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + rl_t *rl; + dmu_tx_t *tx; + struct sf_buf *sf; + vm_object_t object; + vm_page_t m; + caddr_t va; + size_t tocopy; + size_t lo_len; + vm_ooffset_t lo_off; + vm_ooffset_t off; + uint_t blksz; + int ncount; + int pcount; + int err; + int i; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + object = vp->v_object; + pcount = btoc(len); + ncount = pcount; + + KASSERT(ma[0]->object == object, ("mismatching object")); + KASSERT(len > 0 && (len & PAGE_MASK) == 0, ("unexpected length")); + + for (i = 0; i < pcount; i++) + rtvals[i] = zfs_vm_pagerret_error; + + off = IDX_TO_OFF(ma[0]->pindex); + blksz = zp->z_blksz; + lo_off = rounddown(off, blksz); + lo_len = roundup(len + (off - lo_off), blksz); + rl = zfs_range_lock(zp, lo_off, lo_len, RL_WRITER); + + zfs_vmobject_wlock(object); + if (len + off > object->un_pager.vnp.vnp_size) { + if (object->un_pager.vnp.vnp_size > off) { + int pgoff; + + len = object->un_pager.vnp.vnp_size - off; + ncount = btoc(len); + if ((pgoff = (int)len & PAGE_MASK) != 0) { + /* + * If the object is locked and the following + * conditions hold, then the page's dirty + * field cannot be concurrently changed by a + * pmap operation. + */ + m = ma[ncount - 1]; + vm_page_assert_sbusied(m); + KASSERT(!pmap_page_is_write_mapped(m), + ("zfs_putpages: page %p is not read-only", m)); + vm_page_clear_dirty(m, pgoff, PAGE_SIZE - + pgoff); + } + } else { + len = 0; + ncount = 0; + } + if (ncount < pcount) { + for (i = ncount; i < pcount; i++) { + rtvals[i] = zfs_vm_pagerret_bad; + } + } + } + zfs_vmobject_wunlock(object); + + if (ncount == 0) + goto out; + + if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) || + zfs_owner_overquota(zfsvfs, zp, B_TRUE)) { + goto out; + } + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_write(tx, zp->z_id, off, len); + + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, zp); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err != 0) { + dmu_tx_abort(tx); + goto out; + } + + if (zp->z_blksz < PAGE_SIZE) { + for (i = 0; len > 0; off += tocopy, len -= tocopy, i++) { + tocopy = len > PAGE_SIZE ? 
PAGE_SIZE : len; + va = zfs_map_page(ma[i], &sf); + dmu_write(zfsvfs->z_os, zp->z_id, off, tocopy, va, tx); + zfs_unmap_page(sf); + } + } else { + err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, ma, tx); + } + + if (err == 0) { + uint64_t mtime[2], ctime[2]; + sa_bulk_attr_t bulk[3]; + int count = 0; + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, + &mtime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + &ctime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, 8); + zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, + B_TRUE); + err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + ASSERT0(err); + zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0); + + zfs_vmobject_wlock(object); + for (i = 0; i < ncount; i++) { + rtvals[i] = zfs_vm_pagerret_ok; + vm_page_undirty(ma[i]); + } + zfs_vmobject_wunlock(object); + VM_CNT_INC(v_vnodeout); + VM_CNT_ADD(v_vnodepgsout, ncount); + } + dmu_tx_commit(tx); + +out: + zfs_range_unlock(rl); + if ((flags & (zfs_vm_pagerput_sync | zfs_vm_pagerput_inval)) != 0 || + zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zfsvfs->z_log, zp->z_id); + ZFS_EXIT(zfsvfs); + return (rtvals[0]); +} + +int +zfs_freebsd_putpages(ap) + struct vop_putpages_args /* { + struct vnode *a_vp; + vm_page_t *a_m; + int a_count; + int a_sync; + int *a_rtvals; + } */ *ap; +{ + + return (zfs_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync, + ap->a_rtvals)); +} + +static int +zfs_freebsd_bmap(ap) + struct vop_bmap_args /* { + struct vnode *a_vp; + daddr_t a_bn; + struct bufobj **a_bop; + daddr_t *a_bnp; + int *a_runp; + int *a_runb; + } */ *ap; +{ + + if (ap->a_bop != NULL) + *ap->a_bop = &ap->a_vp->v_bufobj; + if (ap->a_bnp != NULL) + *ap->a_bnp = ap->a_bn; + if (ap->a_runp != NULL) + *ap->a_runp = 0; + if (ap->a_runb != NULL) + *ap->a_runb = 0; + + return (0); +} + +static int +zfs_freebsd_open(ap) + struct vop_open_args /* { + struct vnode *a_vp; + int a_mode; + struct ucred *a_cred; + struct thread *a_td; + } */ *ap; +{ + vnode_t *vp = ap->a_vp; + znode_t *zp = VTOZ(vp); + int error; + + error = zfs_open(&vp, ap->a_mode, ap->a_cred, NULL); + if (error == 0) + vnode_create_vobject(vp, zp->z_size, ap->a_td); + return (error); +} + +static int +zfs_freebsd_close(ap) + struct vop_close_args /* { + struct vnode *a_vp; + int a_fflag; + struct ucred *a_cred; + struct thread *a_td; + } */ *ap; +{ + + return (zfs_close(ap->a_vp, ap->a_fflag, 1, 0, ap->a_cred, NULL)); +} + +static int +zfs_freebsd_ioctl(ap) + struct vop_ioctl_args /* { + struct vnode *a_vp; + u_long a_command; + caddr_t a_data; + int a_fflag; + struct ucred *cred; + struct thread *td; + } */ *ap; +{ + + return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data, + ap->a_fflag, ap->a_cred, NULL, NULL)); +} + +static int +ioflags(int ioflags) +{ + int flags = 0; + + if (ioflags & IO_APPEND) + flags |= FAPPEND; + if (ioflags & IO_NDELAY) + flags |= FNONBLOCK; + if (ioflags & IO_SYNC) + flags |= (FSYNC | FDSYNC | FRSYNC); + + return (flags); +} + +static int +zfs_freebsd_read(ap) + struct vop_read_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + struct ucred *a_cred; + } */ *ap; +{ + + return (zfs_read(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag), + ap->a_cred, NULL)); +} + +static int +zfs_freebsd_write(ap) + struct vop_write_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + struct ucred *a_cred; + } */ *ap; +{ + + return (zfs_write(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag), + 
ap->a_cred, NULL));
+}
+
+static int
+zfs_freebsd_access(ap)
+ struct vop_access_args /* {
+ struct vnode *a_vp;
+ accmode_t a_accmode;
+ struct ucred *a_cred;
+ struct thread *a_td;
+ } */ *ap;
+{
+ vnode_t *vp = ap->a_vp;
+ znode_t *zp = VTOZ(vp);
+ accmode_t accmode;
+ int error = 0;
+
+ /*
+ * ZFS itself only knows about VREAD, VWRITE, VEXEC and VAPPEND.
+ */
+ accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND);
+ if (accmode != 0)
+ error = zfs_access(ap->a_vp, accmode, 0, ap->a_cred, NULL);
+
+ /*
+ * VADMIN has to be handled by vaccess().
+ */
+ if (error == 0) {
+ accmode = ap->a_accmode & ~(VREAD|VWRITE|VEXEC|VAPPEND);
+ if (accmode != 0) {
+ error = vaccess(vp->v_type, zp->z_mode, zp->z_uid,
+ zp->z_gid, accmode, ap->a_cred, NULL);
+ }
+ }
+
+ /*
+ * For VEXEC, ensure that at least one execute bit is set for
+ * non-directories.
+ */
+ if (error == 0 && (ap->a_accmode & VEXEC) != 0 && vp->v_type != VDIR &&
+ (zp->z_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) {
+ error = EACCES;
+ }
+
+ return (error);
+}
+
+static int
+zfs_freebsd_lookup(ap)
+ struct vop_lookup_args /* {
+ struct vnode *a_dvp;
+ struct vnode **a_vpp;
+ struct componentname *a_cnp;
+ } */ *ap;
+{
+ struct componentname *cnp = ap->a_cnp;
+ char nm[NAME_MAX + 1];
+
+ ASSERT(cnp->cn_namelen < sizeof(nm));
+ strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm)));
+
+ return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop,
+ cnp->cn_cred, cnp->cn_thread, 0));
+}
+
+static int
+zfs_cache_lookup(ap)
+ struct vop_lookup_args /* {
+ struct vnode *a_dvp;
+ struct vnode **a_vpp;
+ struct componentname *a_cnp;
+ } */ *ap;
+{
+ zfsvfs_t *zfsvfs;
+
+ zfsvfs = ap->a_dvp->v_mount->mnt_data;
+ if (zfsvfs->z_use_namecache)
+ return (vfs_cache_lookup(ap));
+ else
+ return (zfs_freebsd_lookup(ap));
+}
+
+static int
+zfs_freebsd_create(ap)
+ struct vop_create_args /* {
+ struct vnode *a_dvp;
+ struct vnode **a_vpp;
+ struct componentname *a_cnp;
+ struct vattr *a_vap;
+ } */ *ap;
+{
+ zfsvfs_t *zfsvfs;
+ struct componentname *cnp = ap->a_cnp;
+ vattr_t *vap = ap->a_vap;
+ int error, mode;
+
+ ASSERT(cnp->cn_flags & SAVENAME);
+
+ vattr_init_mask(vap);
+ mode = vap->va_mode & ALLPERMS;
+ zfsvfs = ap->a_dvp->v_mount->mnt_data;
+
+ error = zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode,
+ ap->a_vpp, cnp->cn_cred, cnp->cn_thread);
+ if (zfsvfs->z_use_namecache &&
+ error == 0 && (cnp->cn_flags & MAKEENTRY) != 0)
+ cache_enter(ap->a_dvp, *ap->a_vpp, cnp);
+ return (error);
+}
+
+static int
+zfs_freebsd_remove(ap)
+ struct vop_remove_args /* {
+ struct vnode *a_dvp;
+ struct vnode *a_vp;
+ struct componentname *a_cnp;
+ } */ *ap;
+{
+
+ ASSERT(ap->a_cnp->cn_flags & SAVENAME);
+
+ return (zfs_remove(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr,
+ ap->a_cnp->cn_cred));
+}
+
+static int
+zfs_freebsd_mkdir(ap)
+ struct vop_mkdir_args /* {
+ struct vnode *a_dvp;
+ struct vnode **a_vpp;
+ struct componentname *a_cnp;
+ struct vattr *a_vap;
+ } */ *ap;
+{
+ vattr_t *vap = ap->a_vap;
+
+ ASSERT(ap->a_cnp->cn_flags & SAVENAME);
+
+ vattr_init_mask(vap);
+
+ return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp,
+ ap->a_cnp->cn_cred));
+}
+
+static int
+zfs_freebsd_rmdir(ap)
+ struct vop_rmdir_args /* {
+ struct vnode *a_dvp;
+ struct vnode *a_vp;
+ struct componentname *a_cnp;
+ } */ *ap;
+{
+ struct componentname *cnp = ap->a_cnp;
+
+ ASSERT(cnp->cn_flags & SAVENAME);
+
+ return (zfs_rmdir(ap->a_dvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred));
+}
+
+static int
+zfs_freebsd_readdir(ap)
+ struct
vop_readdir_args /* { + struct vnode *a_vp; + struct uio *a_uio; + struct ucred *a_cred; + int *a_eofflag; + int *a_ncookies; + u_long **a_cookies; + } */ *ap; +{ + + return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag, + ap->a_ncookies, ap->a_cookies)); +} + +static int +zfs_freebsd_fsync(ap) + struct vop_fsync_args /* { + struct vnode *a_vp; + int a_waitfor; + struct thread *a_td; + } */ *ap; +{ + + vop_stdfsync(ap); + return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred, NULL)); +} + +static int +zfs_freebsd_getattr(ap) + struct vop_getattr_args /* { + struct vnode *a_vp; + struct vattr *a_vap; + struct ucred *a_cred; + } */ *ap; +{ + vattr_t *vap = ap->a_vap; + xvattr_t xvap; + u_long fflags = 0; + int error; + + xva_init(&xvap); + xvap.xva_vattr = *vap; + xvap.xva_vattr.va_mask |= AT_XVATTR; + + /* Convert chflags into ZFS-type flags. */ + /* XXX: what about SF_SETTABLE?. */ + XVA_SET_REQ(&xvap, XAT_IMMUTABLE); + XVA_SET_REQ(&xvap, XAT_APPENDONLY); + XVA_SET_REQ(&xvap, XAT_NOUNLINK); + XVA_SET_REQ(&xvap, XAT_NODUMP); + XVA_SET_REQ(&xvap, XAT_READONLY); + XVA_SET_REQ(&xvap, XAT_ARCHIVE); + XVA_SET_REQ(&xvap, XAT_SYSTEM); + XVA_SET_REQ(&xvap, XAT_HIDDEN); + XVA_SET_REQ(&xvap, XAT_REPARSE); + XVA_SET_REQ(&xvap, XAT_OFFLINE); + XVA_SET_REQ(&xvap, XAT_SPARSE); + + error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred, NULL); + if (error != 0) + return (error); + + /* Convert ZFS xattr into chflags. */ +#define FLAG_CHECK(fflag, xflag, xfield) do { \ + if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0) \ + fflags |= (fflag); \ +} while (0) + FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE, + xvap.xva_xoptattrs.xoa_immutable); + FLAG_CHECK(SF_APPEND, XAT_APPENDONLY, + xvap.xva_xoptattrs.xoa_appendonly); + FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK, + xvap.xva_xoptattrs.xoa_nounlink); + FLAG_CHECK(UF_ARCHIVE, XAT_ARCHIVE, + xvap.xva_xoptattrs.xoa_archive); + FLAG_CHECK(UF_NODUMP, XAT_NODUMP, + xvap.xva_xoptattrs.xoa_nodump); + FLAG_CHECK(UF_READONLY, XAT_READONLY, + xvap.xva_xoptattrs.xoa_readonly); + FLAG_CHECK(UF_SYSTEM, XAT_SYSTEM, + xvap.xva_xoptattrs.xoa_system); + FLAG_CHECK(UF_HIDDEN, XAT_HIDDEN, + xvap.xva_xoptattrs.xoa_hidden); + FLAG_CHECK(UF_REPARSE, XAT_REPARSE, + xvap.xva_xoptattrs.xoa_reparse); + FLAG_CHECK(UF_OFFLINE, XAT_OFFLINE, + xvap.xva_xoptattrs.xoa_offline); + FLAG_CHECK(UF_SPARSE, XAT_SPARSE, + xvap.xva_xoptattrs.xoa_sparse); + +#undef FLAG_CHECK + *vap = xvap.xva_vattr; + vap->va_flags = fflags; + return (0); +} + +static int +zfs_freebsd_setattr(ap) + struct vop_setattr_args /* { + struct vnode *a_vp; + struct vattr *a_vap; + struct ucred *a_cred; + } */ *ap; +{ + vnode_t *vp = ap->a_vp; + vattr_t *vap = ap->a_vap; + cred_t *cred = ap->a_cred; + xvattr_t xvap; + u_long fflags; + uint64_t zflags; + + vattr_init_mask(vap); + vap->va_mask &= ~AT_NOSET; + + xva_init(&xvap); + xvap.xva_vattr = *vap; + + zflags = VTOZ(vp)->z_pflags; + + if (vap->va_flags != VNOVAL) { + zfsvfs_t *zfsvfs = VTOZ(vp)->z_zfsvfs; + int error; + + if (zfsvfs->z_use_fuids == B_FALSE) + return (EOPNOTSUPP); + + fflags = vap->va_flags; + /* + * XXX KDM + * We need to figure out whether it makes sense to allow + * UF_REPARSE through, since we don't really have other + * facilities to handle reparse points and zfs_setattr() + * doesn't currently allow setting that attribute anyway. 
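+ *
+ * Below, FLAG_CHANGE(fflag, zflag, xflag, xfield) requests an
+ * update only when the FreeBSD flag and the on-disk ZFS flag
+ * disagree; expanded for UF_NODUMP it reads, roughly (xfield
+ * abbreviated to its field name):
+ *
+ *     if (((fflags & UF_NODUMP) && !(zflags & ZFS_NODUMP)) ||
+ *         ((zflags & ZFS_NODUMP) && !(fflags & UF_NODUMP))) {
+ *             XVA_SET_REQ(&xvap, XAT_NODUMP);
+ *             xoa_nodump = ((fflags & UF_NODUMP) != 0);
+ *     }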
+ */ + if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_ARCHIVE| + UF_NODUMP|UF_SYSTEM|UF_HIDDEN|UF_READONLY|UF_REPARSE| + UF_OFFLINE|UF_SPARSE)) != 0) + return (EOPNOTSUPP); + /* + * Unprivileged processes are not permitted to unset system + * flags, or modify flags if any system flags are set. + * Privileged non-jail processes may not modify system flags + * if securelevel > 0 and any existing system flags are set. + * Privileged jail processes behave like privileged non-jail + * processes if the PR_ALLOW_CHFLAGS permission bit is set; + * otherwise, they behave like unprivileged processes. + */ + if (secpolicy_fs_owner(vp->v_mount, cred) == 0 || + priv_check_cred(cred, PRIV_VFS_SYSFLAGS) == 0) { + if (zflags & + (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) { + error = securelevel_gt(cred, 0); + if (error != 0) + return (error); + } + } else { + /* + * Callers may only modify the file flags on objects they + * have VADMIN rights for. + */ + if ((error = VOP_ACCESS(vp, VADMIN, cred, curthread)) != 0) + return (error); + if (zflags & + (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) { + return (EPERM); + } + if (fflags & + (SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) { + return (EPERM); + } + } + +#define FLAG_CHANGE(fflag, zflag, xflag, xfield) do { \ + if (((fflags & (fflag)) && !(zflags & (zflag))) || \ + ((zflags & (zflag)) && !(fflags & (fflag)))) { \ + XVA_SET_REQ(&xvap, (xflag)); \ + (xfield) = ((fflags & (fflag)) != 0); \ + } \ +} while (0) + /* Convert chflags into ZFS-type flags. */ + /* XXX: what about SF_SETTABLE? */ + FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE, + xvap.xva_xoptattrs.xoa_immutable); + FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY, + xvap.xva_xoptattrs.xoa_appendonly); + FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK, + xvap.xva_xoptattrs.xoa_nounlink); + FLAG_CHANGE(UF_ARCHIVE, ZFS_ARCHIVE, XAT_ARCHIVE, + xvap.xva_xoptattrs.xoa_archive); + FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP, + xvap.xva_xoptattrs.xoa_nodump); + FLAG_CHANGE(UF_READONLY, ZFS_READONLY, XAT_READONLY, + xvap.xva_xoptattrs.xoa_readonly); + FLAG_CHANGE(UF_SYSTEM, ZFS_SYSTEM, XAT_SYSTEM, + xvap.xva_xoptattrs.xoa_system); + FLAG_CHANGE(UF_HIDDEN, ZFS_HIDDEN, XAT_HIDDEN, + xvap.xva_xoptattrs.xoa_hidden); + FLAG_CHANGE(UF_REPARSE, ZFS_REPARSE, XAT_REPARSE, + xvap.xva_xoptattrs.xoa_reparse); + FLAG_CHANGE(UF_OFFLINE, ZFS_OFFLINE, XAT_OFFLINE, + xvap.xva_xoptattrs.xoa_offline); + FLAG_CHANGE(UF_SPARSE, ZFS_SPARSE, XAT_SPARSE, + xvap.xva_xoptattrs.xoa_sparse); +#undef FLAG_CHANGE + } + if (vap->va_birthtime.tv_sec != VNOVAL) { + xvap.xva_vattr.va_mask |= AT_XVATTR; + XVA_SET_REQ(&xvap, XAT_CREATETIME); + } + return (zfs_setattr(vp, (vattr_t *)&xvap, 0, cred, NULL)); +} + +static int +zfs_freebsd_rename(ap) + struct vop_rename_args /* { + struct vnode *a_fdvp; + struct vnode *a_fvp; + struct componentname *a_fcnp; + struct vnode *a_tdvp; + struct vnode *a_tvp; + struct componentname *a_tcnp; + } */ *ap; +{ + vnode_t *fdvp = ap->a_fdvp; + vnode_t *fvp = ap->a_fvp; + vnode_t *tdvp = ap->a_tdvp; + vnode_t *tvp = ap->a_tvp; + int error; + + ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART)); + ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART)); + + error = zfs_rename(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp, + ap->a_tcnp, ap->a_fcnp->cn_cred); + + vrele(fdvp); + vrele(fvp); + vrele(tdvp); + if (tvp != NULL) + vrele(tvp); + + return (error); +} + +static int +zfs_freebsd_symlink(ap) + struct vop_symlink_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct 
componentname *a_cnp; + struct vattr *a_vap; + char *a_target; + } */ *ap; +{ + struct componentname *cnp = ap->a_cnp; + vattr_t *vap = ap->a_vap; + + ASSERT(cnp->cn_flags & SAVENAME); + + vap->va_type = VLNK; /* FreeBSD: Syscall only sets va_mode. */ + vattr_init_mask(vap); + + return (zfs_symlink(ap->a_dvp, ap->a_vpp, cnp->cn_nameptr, vap, + __DECONST(char *, ap->a_target), cnp->cn_cred, cnp->cn_thread)); +} + +static int +zfs_freebsd_readlink(ap) + struct vop_readlink_args /* { + struct vnode *a_vp; + struct uio *a_uio; + struct ucred *a_cred; + } */ *ap; +{ + + return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred, NULL)); +} + +static int +zfs_freebsd_link(ap) + struct vop_link_args /* { + struct vnode *a_tdvp; + struct vnode *a_vp; + struct componentname *a_cnp; + } */ *ap; +{ + struct componentname *cnp = ap->a_cnp; + vnode_t *vp = ap->a_vp; + vnode_t *tdvp = ap->a_tdvp; + + if (tdvp->v_mount != vp->v_mount) + return (EXDEV); + + ASSERT(cnp->cn_flags & SAVENAME); + + return (zfs_link(tdvp, vp, cnp->cn_nameptr, cnp->cn_cred, NULL, 0)); +} + +static int +zfs_freebsd_inactive(ap) + struct vop_inactive_args /* { + struct vnode *a_vp; + struct thread *a_td; + } */ *ap; +{ + vnode_t *vp = ap->a_vp; + + zfs_inactive(vp, ap->a_td->td_ucred, NULL); + return (0); +} + +static int +zfs_freebsd_reclaim(ap) + struct vop_reclaim_args /* { + struct vnode *a_vp; + struct thread *a_td; + } */ *ap; +{ + vnode_t *vp = ap->a_vp; + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + + ASSERT(zp != NULL); + + /* Destroy the vm object and flush associated pages. */ + vnode_destroy_vobject(vp); + + /* + * z_teardown_inactive_lock protects from a race with + * zfs_znode_dmu_fini in zfsvfs_teardown during + * force unmount. + */ + rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); + if (zp->z_sa_hdl == NULL) + zfs_znode_free(zp); + else + zfs_zinactive(zp); + rw_exit(&zfsvfs->z_teardown_inactive_lock); + + vp->v_data = NULL; + return (0); +} + +static int +zfs_freebsd_fid(ap) + struct vop_fid_args /* { + struct vnode *a_vp; + struct fid *a_fid; + } */ *ap; +{ + + return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL)); +} + +static int +zfs_freebsd_pathconf(ap) + struct vop_pathconf_args /* { + struct vnode *a_vp; + int a_name; + register_t *a_retval; + } */ *ap; +{ + ulong_t val; + int error; + + error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred, NULL); + if (error == 0) { + *ap->a_retval = val; + return (error); + } + if (error != EOPNOTSUPP) + return (error); + + switch (ap->a_name) { + case _PC_NAME_MAX: + *ap->a_retval = NAME_MAX; + return (0); + case _PC_PIPE_BUF: + if (ap->a_vp->v_type == VDIR || ap->a_vp->v_type == VFIFO) { + *ap->a_retval = PIPE_BUF; + return (0); + } + return (EINVAL); + default: + return (vop_stdpathconf(ap)); + } +} + +/* + * FreeBSD's extended attributes namespace defines file name prefix for ZFS' + * extended attribute name: + * + * NAMESPACE PREFIX + * system freebsd:system: + * user (none, can be used to access ZFS fsattr(5) attributes + * created on Solaris) + */ +static int +zfs_create_attrname(int attrnamespace, const char *name, char *attrname, + size_t size) +{ + const char *namespace, *prefix, *suffix; + + /* We don't allow '/' character in attribute name. */ + if (strchr(name, '/') != NULL) + return (EINVAL); + /* We don't allow attribute names that start with "freebsd:" string. 
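+ * Otherwise a name in the user namespace (which maps to an empty + * prefix below) could alias a system-namespace name: e.g. a user + * attribute named "freebsd:system:foo" (hypothetical name, for + * illustration) would map to the same on-disk name as the system + * attribute "foo".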
*/ + if (strncmp(name, "freebsd:", 8) == 0) + return (EINVAL); + + bzero(attrname, size); + + switch (attrnamespace) { + case EXTATTR_NAMESPACE_USER: +#if 0 + prefix = "freebsd:"; + namespace = EXTATTR_NAMESPACE_USER_STRING; + suffix = ":"; +#else + /* + * This is the default namespace by which we can access all + * attributes created on Solaris. + */ + prefix = namespace = suffix = ""; +#endif + break; + case EXTATTR_NAMESPACE_SYSTEM: + prefix = "freebsd:"; + namespace = EXTATTR_NAMESPACE_SYSTEM_STRING; + suffix = ":"; + break; + case EXTATTR_NAMESPACE_EMPTY: + default: + return (EINVAL); + } + if (snprintf(attrname, size, "%s%s%s%s", prefix, namespace, suffix, + name) >= size) { + return (ENAMETOOLONG); + } + return (0); +} + +/* + * Vnode operating to retrieve a named extended attribute. + */ +static int +zfs_getextattr(struct vop_getextattr_args *ap) +/* +vop_getextattr { + IN struct vnode *a_vp; + IN int a_attrnamespace; + IN const char *a_name; + INOUT struct uio *a_uio; + OUT size_t *a_size; + IN struct ucred *a_cred; + IN struct thread *a_td; +}; +*/ +{ + zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; + struct thread *td = ap->a_td; + struct nameidata nd; + char attrname[255]; + struct vattr va; + vnode_t *xvp = NULL, *vp; + int error, flags; + + error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, + ap->a_cred, ap->a_td, VREAD); + if (error != 0) + return (error); + + error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, + sizeof(attrname)); + if (error != 0) + return (error); + + ZFS_ENTER(zfsvfs); + + error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, + LOOKUP_XATTR); + if (error != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + + flags = FREAD; + NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname, + xvp, td); + error = vn_open_cred(&nd, &flags, 0, 0, ap->a_cred, NULL); + vp = nd.ni_vp; + NDFREE(&nd, NDF_ONLY_PNBUF); + if (error != 0) { + ZFS_EXIT(zfsvfs); + if (error == ENOENT) + error = ENOATTR; + return (error); + } + + if (ap->a_size != NULL) { + error = VOP_GETATTR(vp, &va, ap->a_cred); + if (error == 0) + *ap->a_size = (size_t)va.va_size; + } else if (ap->a_uio != NULL) + error = VOP_READ(vp, ap->a_uio, IO_UNIT, ap->a_cred); + + VOP_UNLOCK(vp, 0); + vn_close(vp, flags, ap->a_cred, td); + ZFS_EXIT(zfsvfs); + + return (error); +} + +/* + * Vnode operation to remove a named attribute. 
+ */ +int +zfs_deleteextattr(struct vop_deleteextattr_args *ap) +/* +vop_deleteextattr { + IN struct vnode *a_vp; + IN int a_attrnamespace; + IN const char *a_name; + IN struct ucred *a_cred; + IN struct thread *a_td; +}; +*/ +{ + zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; + struct thread *td = ap->a_td; + struct nameidata nd; + char attrname[255]; + struct vattr va; + vnode_t *xvp = NULL, *vp; + int error, flags; + + error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, + ap->a_cred, ap->a_td, VWRITE); + if (error != 0) + return (error); + + error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, + sizeof(attrname)); + if (error != 0) + return (error); + + ZFS_ENTER(zfsvfs); + + error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, + LOOKUP_XATTR); + if (error != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + + NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF, + UIO_SYSSPACE, attrname, xvp, td); + error = namei(&nd); + vp = nd.ni_vp; + if (error != 0) { + ZFS_EXIT(zfsvfs); + NDFREE(&nd, NDF_ONLY_PNBUF); + if (error == ENOENT) + error = ENOATTR; + return (error); + } + + error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd); + NDFREE(&nd, NDF_ONLY_PNBUF); + + vput(nd.ni_dvp); + if (vp == nd.ni_dvp) + vrele(vp); + else + vput(vp); + ZFS_EXIT(zfsvfs); + + return (error); +} + +/* + * Vnode operation to set a named attribute. + */ +static int +zfs_setextattr(struct vop_setextattr_args *ap) +/* +vop_setextattr { + IN struct vnode *a_vp; + IN int a_attrnamespace; + IN const char *a_name; + INOUT struct uio *a_uio; + IN struct ucred *a_cred; + IN struct thread *a_td; +}; +*/ +{ + zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; + struct thread *td = ap->a_td; + struct nameidata nd; + char attrname[255]; + struct vattr va; + vnode_t *xvp = NULL, *vp; + int error, flags; + + error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, + ap->a_cred, ap->a_td, VWRITE); + if (error != 0) + return (error); + + error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, + sizeof(attrname)); + if (error != 0) + return (error); + + ZFS_ENTER(zfsvfs); + + error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, + LOOKUP_XATTR | CREATE_XATTR_DIR); + if (error != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + + flags = FFLAGS(O_WRONLY | O_CREAT); + NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname, + xvp, td); + error = vn_open_cred(&nd, &flags, 0600, 0, ap->a_cred, NULL); + vp = nd.ni_vp; + NDFREE(&nd, NDF_ONLY_PNBUF); + if (error != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + + VATTR_NULL(&va); + va.va_size = 0; + error = VOP_SETATTR(vp, &va, ap->a_cred); + if (error == 0) + VOP_WRITE(vp, ap->a_uio, IO_UNIT, ap->a_cred); + + VOP_UNLOCK(vp, 0); + vn_close(vp, flags, ap->a_cred, td); + ZFS_EXIT(zfsvfs); + + return (error); +} + +/* + * Vnode operation to retrieve extended attributes on a vnode. 
+ */ +static int +zfs_listextattr(struct vop_listextattr_args *ap) +/* +vop_listextattr { + IN struct vnode *a_vp; + IN int a_attrnamespace; + INOUT struct uio *a_uio; + OUT size_t *a_size; + IN struct ucred *a_cred; + IN struct thread *a_td; +}; +*/ +{ + zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; + struct thread *td = ap->a_td; + struct nameidata nd; + char attrprefix[16]; + u_char dirbuf[sizeof(struct dirent)]; + struct dirent *dp; + struct iovec aiov; + struct uio auio, *uio = ap->a_uio; + size_t *sizep = ap->a_size; + size_t plen; + vnode_t *xvp = NULL, *vp; + int done, error, eof, pos; + + error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, + ap->a_cred, ap->a_td, VREAD); + if (error != 0) + return (error); + + error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix, + sizeof(attrprefix)); + if (error != 0) + return (error); + plen = strlen(attrprefix); + + ZFS_ENTER(zfsvfs); + + if (sizep != NULL) + *sizep = 0; + + error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, + LOOKUP_XATTR); + if (error != 0) { + ZFS_EXIT(zfsvfs); + /* + * ENOATTR means that the EA directory does not yet exist, + * i.e. there are no extended attributes there. + */ + if (error == ENOATTR) + error = 0; + return (error); + } + + NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED, + UIO_SYSSPACE, ".", xvp, td); + error = namei(&nd); + vp = nd.ni_vp; + NDFREE(&nd, NDF_ONLY_PNBUF); + if (error != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_td = td; + auio.uio_rw = UIO_READ; + auio.uio_offset = 0; + + do { + u_char nlen; + + aiov.iov_base = (void *)dirbuf; + aiov.iov_len = sizeof(dirbuf); + auio.uio_resid = sizeof(dirbuf); + error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL); + done = sizeof(dirbuf) - auio.uio_resid; + if (error != 0) + break; + for (pos = 0; pos < done;) { + dp = (struct dirent *)(dirbuf + pos); + pos += dp->d_reclen; + /* + * XXX: Temporarily we also accept DT_UNKNOWN, as this + * is what we get when attribute was created on Solaris. + */ + if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN) + continue; + if (plen == 0 && strncmp(dp->d_name, "freebsd:", 8) == 0) + continue; + else if (strncmp(dp->d_name, attrprefix, plen) != 0) + continue; + nlen = dp->d_namlen - plen; + if (sizep != NULL) + *sizep += 1 + nlen; + else if (uio != NULL) { + /* + * Format of extattr name entry is one byte for + * length and the rest for name. 
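+ * E.g. an attribute named "comment" (hypothetical) is emitted as the + * single length byte 0x07 followed by the seven bytes of the name.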
+ */ + error = uiomove(&nlen, 1, uio->uio_rw, uio); + if (error == 0) { + error = uiomove(dp->d_name + plen, nlen, + uio->uio_rw, uio); + } + if (error != 0) + break; + } + } + } while (!eof && error == 0); + + vput(vp); + ZFS_EXIT(zfsvfs); + + return (error); +} + +int +zfs_freebsd_getacl(ap) + struct vop_getacl_args /* { + struct vnode *vp; + acl_type_t type; + struct acl *aclp; + struct ucred *cred; + struct thread *td; + } */ *ap; +{ + int error; + vsecattr_t vsecattr; + + if (ap->a_type != ACL_TYPE_NFS4) + return (EINVAL); + + vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT; + if ((error = zfs_getsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL)) != 0) + return (error); + + error = acl_from_aces(ap->a_aclp, vsecattr.vsa_aclentp, vsecattr.vsa_aclcnt); + if (vsecattr.vsa_aclentp != NULL) + kmem_free(vsecattr.vsa_aclentp, vsecattr.vsa_aclentsz); + + return (error); +} + +int +zfs_freebsd_setacl(ap) + struct vop_setacl_args /* { + struct vnode *vp; + acl_type_t type; + struct acl *aclp; + struct ucred *cred; + struct thread *td; + } */ *ap; +{ + int error; + vsecattr_t vsecattr; + int aclbsize; /* size of acl list in bytes */ + aclent_t *aaclp; + + if (ap->a_type != ACL_TYPE_NFS4) + return (EINVAL); + + if (ap->a_aclp == NULL) + return (EINVAL); + + if (ap->a_aclp->acl_cnt < 1 || ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES) + return (EINVAL); + + /* + * With NFSv4 ACLs, chmod(2) may need to add additional entries, + * splitting every entry into two and appending "canonical six" + * entries at the end. Don't allow for setting an ACL that would + * cause chmod(2) to run out of ACL entries. + */ + if (ap->a_aclp->acl_cnt * 2 + 6 > ACL_MAX_ENTRIES) + return (ENOSPC); + + error = acl_nfs4_check(ap->a_aclp, ap->a_vp->v_type == VDIR); + if (error != 0) + return (error); + + vsecattr.vsa_mask = VSA_ACE; + aclbsize = ap->a_aclp->acl_cnt * sizeof(ace_t); + vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP); + aaclp = vsecattr.vsa_aclentp; + vsecattr.vsa_aclentsz = aclbsize; + + aces_from_acl(vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, ap->a_aclp); + error = zfs_setsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL); + kmem_free(aaclp, aclbsize); + + return (error); +} + +int +zfs_freebsd_aclcheck(ap) + struct vop_aclcheck_args /* { + struct vnode *vp; + acl_type_t type; + struct acl *aclp; + struct ucred *cred; + struct thread *td; + } */ *ap; +{ + + return (EOPNOTSUPP); +} + +static int +zfs_vptocnp(struct vop_vptocnp_args *ap) +{ + vnode_t *covered_vp; + vnode_t *vp = ap->a_vp; + zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; + znode_t *zp = VTOZ(vp); + int ltype; + int error; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + /* + * If we are a snapshot mounted under .zfs, run the operation + * on the covered vnode. 
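+ * For any other znode, reconstruct the component name from the parent + * directory (zfs_znode_parent_and_name()) and return the parent vnode + * instead.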
+ */ + if (zp->z_id != zfsvfs->z_root || zfsvfs->z_parent == zfsvfs) { + char name[MAXNAMLEN + 1]; + znode_t *dzp; + size_t len; + + error = zfs_znode_parent_and_name(zp, &dzp, name); + if (error == 0) { + len = strlen(name); + if (*ap->a_buflen < len) + error = SET_ERROR(ENOMEM); + } + if (error == 0) { + *ap->a_buflen -= len; + bcopy(name, ap->a_buf + *ap->a_buflen, len); + *ap->a_vpp = ZTOV(dzp); + } + ZFS_EXIT(zfsvfs); + return (error); + } + ZFS_EXIT(zfsvfs); + + covered_vp = vp->v_mount->mnt_vnodecovered; + vhold(covered_vp); + ltype = VOP_ISLOCKED(vp); + VOP_UNLOCK(vp, 0); + error = vget(covered_vp, LK_SHARED | LK_VNHELD, curthread); + if (error == 0) { + error = VOP_VPTOCNP(covered_vp, ap->a_vpp, ap->a_cred, + ap->a_buf, ap->a_buflen); + vput(covered_vp); + } + vn_lock(vp, ltype | LK_RETRY); + if ((vp->v_iflag & VI_DOOMED) != 0) + error = SET_ERROR(ENOENT); + return (error); +} + +#ifdef DIAGNOSTIC +static int +zfs_lock(ap) + struct vop_lock1_args /* { + struct vnode *a_vp; + int a_flags; + char *file; + int line; + } */ *ap; +{ + vnode_t *vp; + znode_t *zp; + int err; + + err = vop_stdlock(ap); + if (err == 0 && (ap->a_flags & LK_NOWAIT) == 0) { + vp = ap->a_vp; + zp = vp->v_data; + if (vp->v_mount != NULL && (vp->v_iflag & VI_DOOMED) == 0 && + zp != NULL && (zp->z_pflags & ZFS_XATTR) == 0) + VERIFY(!RRM_LOCK_HELD(&zp->z_zfsvfs->z_teardown_lock)); + } + return (err); +} +#endif + +struct vop_vector zfs_vnodeops; +struct vop_vector zfs_fifoops; +struct vop_vector zfs_shareops; + +struct vop_vector zfs_vnodeops = { + .vop_default = &default_vnodeops, + .vop_inactive = zfs_freebsd_inactive, + .vop_reclaim = zfs_freebsd_reclaim, + .vop_access = zfs_freebsd_access, + .vop_allocate = VOP_EINVAL, + .vop_lookup = zfs_cache_lookup, + .vop_cachedlookup = zfs_freebsd_lookup, + .vop_getattr = zfs_freebsd_getattr, + .vop_setattr = zfs_freebsd_setattr, + .vop_create = zfs_freebsd_create, + .vop_mknod = zfs_freebsd_create, + .vop_mkdir = zfs_freebsd_mkdir, + .vop_readdir = zfs_freebsd_readdir, + .vop_fsync = zfs_freebsd_fsync, + .vop_open = zfs_freebsd_open, + .vop_close = zfs_freebsd_close, + .vop_rmdir = zfs_freebsd_rmdir, + .vop_ioctl = zfs_freebsd_ioctl, + .vop_link = zfs_freebsd_link, + .vop_symlink = zfs_freebsd_symlink, + .vop_readlink = zfs_freebsd_readlink, + .vop_read = zfs_freebsd_read, + .vop_write = zfs_freebsd_write, + .vop_remove = zfs_freebsd_remove, + .vop_rename = zfs_freebsd_rename, + .vop_pathconf = zfs_freebsd_pathconf, + .vop_bmap = zfs_freebsd_bmap, + .vop_fid = zfs_freebsd_fid, + .vop_getextattr = zfs_getextattr, + .vop_deleteextattr = zfs_deleteextattr, + .vop_setextattr = zfs_setextattr, + .vop_listextattr = zfs_listextattr, + .vop_getacl = zfs_freebsd_getacl, + .vop_setacl = zfs_freebsd_setacl, + .vop_aclcheck = zfs_freebsd_aclcheck, + .vop_getpages = zfs_freebsd_getpages, + .vop_putpages = zfs_freebsd_putpages, + .vop_vptocnp = zfs_vptocnp, +#ifdef DIAGNOSTIC + .vop_lock1 = zfs_lock, +#endif +}; + +struct vop_vector zfs_fifoops = { + .vop_default = &fifo_specops, + .vop_fsync = zfs_freebsd_fsync, + .vop_access = zfs_freebsd_access, + .vop_getattr = zfs_freebsd_getattr, + .vop_inactive = zfs_freebsd_inactive, + .vop_read = VOP_PANIC, + .vop_reclaim = zfs_freebsd_reclaim, + .vop_setattr = zfs_freebsd_setattr, + .vop_write = VOP_PANIC, + .vop_pathconf = zfs_freebsd_pathconf, + .vop_fid = zfs_freebsd_fid, + .vop_getacl = zfs_freebsd_getacl, + .vop_setacl = zfs_freebsd_setacl, + .vop_aclcheck = zfs_freebsd_aclcheck, +}; + +/* + * special share hidden files vnode operations 
template + */ +struct vop_vector zfs_shareops = { + .vop_default = &default_vnodeops, + .vop_access = zfs_freebsd_access, + .vop_inactive = zfs_freebsd_inactive, + .vop_reclaim = zfs_freebsd_reclaim, + .vop_fid = zfs_freebsd_fid, + .vop_pathconf = zfs_freebsd_pathconf, +}; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c new file mode 100644 index 000000000000..c94cef7d456a --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c @@ -0,0 +1,2289 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + */ + +/* Portions Copyright 2007 Jeremy Teo */ +/* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */ + +#ifdef _KERNEL +#include <sys/types.h> +#include <sys/param.h> +#include <sys/time.h> +#include <sys/systm.h> +#include <sys/sysmacros.h> +#include <sys/resource.h> +#include <sys/mntent.h> +#include <sys/u8_textprep.h> +#include <sys/dsl_dataset.h> +#include <sys/vfs.h> +#include <sys/vnode.h> +#include <sys/file.h> +#include <sys/kmem.h> +#include <sys/errno.h> +#include <sys/unistd.h> +#include <sys/atomic.h> +#include <sys/zfs_dir.h> +#include <sys/zfs_acl.h> +#include <sys/zfs_ioctl.h> +#include <sys/zfs_rlock.h> +#include <sys/zfs_fuid.h> +#include <sys/dnode.h> +#include <sys/fs/zfs.h> +#include <sys/kidmap.h> +#endif /* _KERNEL */ + +#include <sys/dmu.h> +#include <sys/dmu_objset.h> +#include <sys/dmu_tx.h> +#include <sys/refcount.h> +#include <sys/stat.h> +#include <sys/zap.h> +#include <sys/zfs_znode.h> +#include <sys/sa.h> +#include <sys/zfs_sa.h> +#include <sys/zfs_stat.h> +#include <sys/refcount.h> + +#include "zfs_prop.h" +#include "zfs_comutil.h" + +/* Used by fstat(1). */ +SYSCTL_INT(_debug_sizeof, OID_AUTO, znode, CTLFLAG_RD, + SYSCTL_NULL_INT_PTR, sizeof(znode_t), "sizeof(znode_t)"); + +/* + * Define ZNODE_STATS to turn on statistic gathering. By default, it is only + * turned on when DEBUG is also defined. + */ +#ifdef DEBUG +#define ZNODE_STATS +#endif /* DEBUG */ + +#ifdef ZNODE_STATS +#define ZNODE_STAT_ADD(stat) ((stat)++) +#else +#define ZNODE_STAT_ADD(stat) /* nothing */ +#endif /* ZNODE_STATS */ + +/* + * Functions needed for userland (ie: libzpool) are not put under + * #ifdef_KERNEL; the rest of the functions have dependencies + * (such as VFS logic) that will not compile easily in userland. + */ +#ifdef _KERNEL +/* + * Needed to close a small window in zfs_znode_move() that allows the zfsvfs to + * be freed before it can be safely accessed. 
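+ * (zfs_znode_move() takes this lock before dereferencing z_zfsvfs, so + * the filesystem cannot be freed while the callback is still + * validating the pointer.)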
+ */ +krwlock_t zfsvfs_lock; + +static kmem_cache_t *znode_cache = NULL; + +/*ARGSUSED*/ +static void +znode_evict_error(dmu_buf_t *dbuf, void *user_ptr) +{ + /* + * We should never drop all dbuf refs without first clearing + * the eviction callback. + */ + panic("evicting znode %p\n", user_ptr); +} + +extern struct vop_vector zfs_vnodeops; +extern struct vop_vector zfs_fifoops; +extern struct vop_vector zfs_shareops; + +static int +zfs_znode_cache_constructor(void *buf, void *arg, int kmflags) +{ + znode_t *zp = buf; + + POINTER_INVALIDATE(&zp->z_zfsvfs); + + list_link_init(&zp->z_link_node); + + mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL); + + mutex_init(&zp->z_range_lock, NULL, MUTEX_DEFAULT, NULL); + avl_create(&zp->z_range_avl, zfs_range_compare, + sizeof (rl_t), offsetof(rl_t, r_node)); + + zp->z_acl_cached = NULL; + zp->z_vnode = NULL; + zp->z_moved = 0; + return (0); +} + +/*ARGSUSED*/ +static void +zfs_znode_cache_destructor(void *buf, void *arg) +{ + znode_t *zp = buf; + + ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs)); + ASSERT3P(zp->z_vnode, ==, NULL); + ASSERT(!list_link_active(&zp->z_link_node)); + mutex_destroy(&zp->z_acl_lock); + avl_destroy(&zp->z_range_avl); + mutex_destroy(&zp->z_range_lock); + + ASSERT(zp->z_acl_cached == NULL); +} + +#ifdef ZNODE_STATS +static struct { + uint64_t zms_zfsvfs_invalid; + uint64_t zms_zfsvfs_recheck1; + uint64_t zms_zfsvfs_unmounted; + uint64_t zms_zfsvfs_recheck2; + uint64_t zms_obj_held; + uint64_t zms_vnode_locked; + uint64_t zms_not_only_dnlc; +} znode_move_stats; +#endif /* ZNODE_STATS */ + +#ifdef illumos +static void +zfs_znode_move_impl(znode_t *ozp, znode_t *nzp) +{ + vnode_t *vp; + + /* Copy fields. */ + nzp->z_zfsvfs = ozp->z_zfsvfs; + + /* Swap vnodes. */ + vp = nzp->z_vnode; + nzp->z_vnode = ozp->z_vnode; + ozp->z_vnode = vp; /* let destructor free the overwritten vnode */ + ZTOV(ozp)->v_data = ozp; + ZTOV(nzp)->v_data = nzp; + + nzp->z_id = ozp->z_id; + ASSERT(ozp->z_dirlocks == NULL); /* znode not in use */ + ASSERT(avl_numnodes(&ozp->z_range_avl) == 0); + nzp->z_unlinked = ozp->z_unlinked; + nzp->z_atime_dirty = ozp->z_atime_dirty; + nzp->z_zn_prefetch = ozp->z_zn_prefetch; + nzp->z_blksz = ozp->z_blksz; + nzp->z_seq = ozp->z_seq; + nzp->z_mapcnt = ozp->z_mapcnt; + nzp->z_gen = ozp->z_gen; + nzp->z_sync_cnt = ozp->z_sync_cnt; + nzp->z_is_sa = ozp->z_is_sa; + nzp->z_sa_hdl = ozp->z_sa_hdl; + bcopy(ozp->z_atime, nzp->z_atime, sizeof (uint64_t) * 2); + nzp->z_links = ozp->z_links; + nzp->z_size = ozp->z_size; + nzp->z_pflags = ozp->z_pflags; + nzp->z_uid = ozp->z_uid; + nzp->z_gid = ozp->z_gid; + nzp->z_mode = ozp->z_mode; + + /* + * Since this is just an idle znode and kmem is already dealing with + * memory pressure, release any cached ACL. + */ + if (ozp->z_acl_cached) { + zfs_acl_free(ozp->z_acl_cached); + ozp->z_acl_cached = NULL; + } + + sa_set_userp(nzp->z_sa_hdl, nzp); + + /* + * Invalidate the original znode by clearing fields that provide a + * pointer back to the znode. Set the low bit of the vfs pointer to + * ensure that zfs_znode_move() recognizes the znode as invalid in any + * subsequent callback. + */ + ozp->z_sa_hdl = NULL; + POINTER_INVALIDATE(&ozp->z_zfsvfs); + + /* + * Mark the znode. 
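+ * nzp->z_moved marks the new copy as relocated; ozp->z_moved is set + * to an out-of-range value so that any later use of the stale copy + * can be detected.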
+ */ + nzp->z_moved = 1; + ozp->z_moved = (uint8_t)-1; +} + +/*ARGSUSED*/ +static kmem_cbrc_t +zfs_znode_move(void *buf, void *newbuf, size_t size, void *arg) +{ + znode_t *ozp = buf, *nzp = newbuf; + zfsvfs_t *zfsvfs; + vnode_t *vp; + + /* + * The znode is on the file system's list of known znodes if the vfs + * pointer is valid. We set the low bit of the vfs pointer when freeing + * the znode to invalidate it, and the memory patterns written by kmem + * (baddcafe and deadbeef) set at least one of the two low bits. A newly + * created znode sets the vfs pointer last of all to indicate that the + * znode is known and in a valid state to be moved by this function. + */ + zfsvfs = ozp->z_zfsvfs; + if (!POINTER_IS_VALID(zfsvfs)) { + ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_invalid); + return (KMEM_CBRC_DONT_KNOW); + } + + /* + * Close a small window in which it's possible that the filesystem could + * be unmounted and freed, and zfsvfs, though valid in the previous + * statement, could point to unrelated memory by the time we try to + * prevent the filesystem from being unmounted. + */ + rw_enter(&zfsvfs_lock, RW_WRITER); + if (zfsvfs != ozp->z_zfsvfs) { + rw_exit(&zfsvfs_lock); + ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck1); + return (KMEM_CBRC_DONT_KNOW); + } + + /* + * If the znode is still valid, then so is the file system. We know that + * no valid file system can be freed while we hold zfsvfs_lock, so we + * can safely ensure that the filesystem is not and will not be + * unmounted. The next statement is equivalent to ZFS_ENTER(). + */ + rrm_enter(&zfsvfs->z_teardown_lock, RW_READER, FTAG); + if (zfsvfs->z_unmounted) { + ZFS_EXIT(zfsvfs); + rw_exit(&zfsvfs_lock); + ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_unmounted); + return (KMEM_CBRC_DONT_KNOW); + } + rw_exit(&zfsvfs_lock); + + mutex_enter(&zfsvfs->z_znodes_lock); + /* + * Recheck the vfs pointer in case the znode was removed just before + * acquiring the lock. + */ + if (zfsvfs != ozp->z_zfsvfs) { + mutex_exit(&zfsvfs->z_znodes_lock); + ZFS_EXIT(zfsvfs); + ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck2); + return (KMEM_CBRC_DONT_KNOW); + } + + /* + * At this point we know that as long as we hold z_znodes_lock, the + * znode cannot be freed and fields within the znode can be safely + * accessed. Now, prevent a race with zfs_zget(). + */ + if (ZFS_OBJ_HOLD_TRYENTER(zfsvfs, ozp->z_id) == 0) { + mutex_exit(&zfsvfs->z_znodes_lock); + ZFS_EXIT(zfsvfs); + ZNODE_STAT_ADD(znode_move_stats.zms_obj_held); + return (KMEM_CBRC_LATER); + } + + vp = ZTOV(ozp); + if (mutex_tryenter(&vp->v_lock) == 0) { + ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id); + mutex_exit(&zfsvfs->z_znodes_lock); + ZFS_EXIT(zfsvfs); + ZNODE_STAT_ADD(znode_move_stats.zms_vnode_locked); + return (KMEM_CBRC_LATER); + } + + /* Only move znodes that are referenced _only_ by the DNLC. */ + if (vp->v_count != 1 || !vn_in_dnlc(vp)) { + mutex_exit(&vp->v_lock); + ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id); + mutex_exit(&zfsvfs->z_znodes_lock); + ZFS_EXIT(zfsvfs); + ZNODE_STAT_ADD(znode_move_stats.zms_not_only_dnlc); + return (KMEM_CBRC_LATER); + } + + /* + * The znode is known and in a valid state to move. We're holding the + * locks needed to execute the critical section. 
+ */ + zfs_znode_move_impl(ozp, nzp); + mutex_exit(&vp->v_lock); + ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id); + + list_link_replace(&ozp->z_link_node, &nzp->z_link_node); + mutex_exit(&zfsvfs->z_znodes_lock); + ZFS_EXIT(zfsvfs); + + return (KMEM_CBRC_YES); +} +#endif /* illumos */ + +void +zfs_znode_init(void) +{ + /* + * Initialize zcache + */ + rw_init(&zfsvfs_lock, NULL, RW_DEFAULT, NULL); + ASSERT(znode_cache == NULL); + znode_cache = kmem_cache_create("zfs_znode_cache", + sizeof (znode_t), 0, zfs_znode_cache_constructor, + zfs_znode_cache_destructor, NULL, NULL, NULL, 0); + kmem_cache_set_move(znode_cache, zfs_znode_move); +} + +void +zfs_znode_fini(void) +{ +#ifdef illumos + /* + * Cleanup vfs & vnode ops + */ + zfs_remove_op_tables(); +#endif + + /* + * Cleanup zcache + */ + if (znode_cache) + kmem_cache_destroy(znode_cache); + znode_cache = NULL; + rw_destroy(&zfsvfs_lock); +} + +#ifdef illumos +struct vnodeops *zfs_dvnodeops; +struct vnodeops *zfs_fvnodeops; +struct vnodeops *zfs_symvnodeops; +struct vnodeops *zfs_xdvnodeops; +struct vnodeops *zfs_evnodeops; +struct vnodeops *zfs_sharevnodeops; + +void +zfs_remove_op_tables() +{ + /* + * Remove vfs ops + */ + ASSERT(zfsfstype); + (void) vfs_freevfsops_by_type(zfsfstype); + zfsfstype = 0; + + /* + * Remove vnode ops + */ + if (zfs_dvnodeops) + vn_freevnodeops(zfs_dvnodeops); + if (zfs_fvnodeops) + vn_freevnodeops(zfs_fvnodeops); + if (zfs_symvnodeops) + vn_freevnodeops(zfs_symvnodeops); + if (zfs_xdvnodeops) + vn_freevnodeops(zfs_xdvnodeops); + if (zfs_evnodeops) + vn_freevnodeops(zfs_evnodeops); + if (zfs_sharevnodeops) + vn_freevnodeops(zfs_sharevnodeops); + + zfs_dvnodeops = NULL; + zfs_fvnodeops = NULL; + zfs_symvnodeops = NULL; + zfs_xdvnodeops = NULL; + zfs_evnodeops = NULL; + zfs_sharevnodeops = NULL; +} + +extern const fs_operation_def_t zfs_dvnodeops_template[]; +extern const fs_operation_def_t zfs_fvnodeops_template[]; +extern const fs_operation_def_t zfs_xdvnodeops_template[]; +extern const fs_operation_def_t zfs_symvnodeops_template[]; +extern const fs_operation_def_t zfs_evnodeops_template[]; +extern const fs_operation_def_t zfs_sharevnodeops_template[]; + +int +zfs_create_op_tables() +{ + int error; + + /* + * zfs_dvnodeops can be set if mod_remove() calls mod_installfs() + * due to a failure to remove the the 2nd modlinkage (zfs_modldrv). + * In this case we just return as the ops vectors are already set up. 
+ */ + if (zfs_dvnodeops) + return (0); + + error = vn_make_ops(MNTTYPE_ZFS, zfs_dvnodeops_template, + &zfs_dvnodeops); + if (error) + return (error); + + error = vn_make_ops(MNTTYPE_ZFS, zfs_fvnodeops_template, + &zfs_fvnodeops); + if (error) + return (error); + + error = vn_make_ops(MNTTYPE_ZFS, zfs_symvnodeops_template, + &zfs_symvnodeops); + if (error) + return (error); + + error = vn_make_ops(MNTTYPE_ZFS, zfs_xdvnodeops_template, + &zfs_xdvnodeops); + if (error) + return (error); + + error = vn_make_ops(MNTTYPE_ZFS, zfs_evnodeops_template, + &zfs_evnodeops); + if (error) + return (error); + + error = vn_make_ops(MNTTYPE_ZFS, zfs_sharevnodeops_template, + &zfs_sharevnodeops); + + return (error); +} +#endif /* illumos */ + +int +zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx) +{ + zfs_acl_ids_t acl_ids; + vattr_t vattr; + znode_t *sharezp; + znode_t *zp; + int error; + + vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE; + vattr.va_type = VDIR; + vattr.va_mode = S_IFDIR|0555; + vattr.va_uid = crgetuid(kcred); + vattr.va_gid = crgetgid(kcred); + + sharezp = kmem_cache_alloc(znode_cache, KM_SLEEP); + ASSERT(!POINTER_IS_VALID(sharezp->z_zfsvfs)); + sharezp->z_moved = 0; + sharezp->z_unlinked = 0; + sharezp->z_atime_dirty = 0; + sharezp->z_zfsvfs = zfsvfs; + sharezp->z_is_sa = zfsvfs->z_use_sa; + + VERIFY(0 == zfs_acl_ids_create(sharezp, IS_ROOT_NODE, &vattr, + kcred, NULL, &acl_ids)); + zfs_mknode(sharezp, &vattr, tx, kcred, IS_ROOT_NODE, &zp, &acl_ids); + ASSERT3P(zp, ==, sharezp); + POINTER_INVALIDATE(&sharezp->z_zfsvfs); + error = zap_add(zfsvfs->z_os, MASTER_NODE_OBJ, + ZFS_SHARES_DIR, 8, 1, &sharezp->z_id, tx); + zfsvfs->z_shares_dir = sharezp->z_id; + + zfs_acl_ids_free(&acl_ids); + sa_handle_destroy(sharezp->z_sa_hdl); + kmem_cache_free(znode_cache, sharezp); + + return (error); +} + +/* + * define a couple of values we need available + * for both 64 and 32 bit environments. + */ +#ifndef NBITSMINOR64 +#define NBITSMINOR64 32 +#endif +#ifndef MAXMAJ64 +#define MAXMAJ64 0xffffffffUL +#endif +#ifndef MAXMIN64 +#define MAXMIN64 0xffffffffUL +#endif + +/* + * Create special expldev for ZFS private use. + * Can't use standard expldev since it doesn't do + * what we want. The standard expldev() takes a + * dev32_t in LP64 and expands it to a long dev_t. + * We need an interface that takes a dev32_t in ILP32 + * and expands it to a long dev_t. + */ +static uint64_t +zfs_expldev(dev_t dev) +{ + return (((uint64_t)major(dev) << NBITSMINOR64) | minor(dev)); +} +/* + * Special cmpldev for ZFS private use. + * Can't use standard cmpldev since it takes + * a long dev_t and compresses it to dev32_t in + * LP64. We need to do a compaction of a long dev_t + * to a dev32_t in ILP32. + */ +dev_t +zfs_cmpldev(uint64_t dev) +{ + return (makedev((dev >> NBITSMINOR64), (dev & MAXMIN64))); +} + +static void +zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp, + dmu_buf_t *db, dmu_object_type_t obj_type, sa_handle_t *sa_hdl) +{ + ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs) || (zfsvfs == zp->z_zfsvfs)); + ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id))); + + ASSERT(zp->z_sa_hdl == NULL); + ASSERT(zp->z_acl_cached == NULL); + if (sa_hdl == NULL) { + VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, zp, + SA_HDL_SHARED, &zp->z_sa_hdl)); + } else { + zp->z_sa_hdl = sa_hdl; + sa_set_userp(sa_hdl, zp); + } + + zp->z_is_sa = (obj_type == DMU_OT_SA) ? B_TRUE : B_FALSE; + + /* + * Slap on VROOT if we are the root znode unless we are the root + * node of a snapshot mounted under .zfs. 
+ */ + if (zp->z_id == zfsvfs->z_root && zfsvfs->z_parent == zfsvfs) + ZTOV(zp)->v_flag |= VROOT; + + vn_exists(ZTOV(zp)); +} + +void +zfs_znode_dmu_fini(znode_t *zp) +{ + ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zp->z_zfsvfs, zp->z_id)) || + zp->z_unlinked || + RW_WRITE_HELD(&zp->z_zfsvfs->z_teardown_inactive_lock)); + + sa_handle_destroy(zp->z_sa_hdl); + zp->z_sa_hdl = NULL; +} + +static void +zfs_vnode_forget(vnode_t *vp) +{ + + /* copied from insmntque_stddtr */ + vp->v_data = NULL; + vp->v_op = &dead_vnodeops; + vgone(vp); + vput(vp); +} + +/* + * Construct a new znode/vnode and intialize. + * + * This does not do a call to dmu_set_user() that is + * up to the caller to do, in case you don't want to + * return the znode + */ +static znode_t * +zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, + dmu_object_type_t obj_type, sa_handle_t *hdl) +{ + znode_t *zp; + vnode_t *vp; + uint64_t mode; + uint64_t parent; + sa_bulk_attr_t bulk[9]; + int count = 0; + int error; + + zp = kmem_cache_alloc(znode_cache, KM_SLEEP); + + KASSERT(curthread->td_vp_reserv > 0, + ("zfs_znode_alloc: getnewvnode without any vnodes reserved")); + error = getnewvnode("zfs", zfsvfs->z_parent->z_vfs, &zfs_vnodeops, &vp); + if (error != 0) { + kmem_cache_free(znode_cache, zp); + return (NULL); + } + zp->z_vnode = vp; + vp->v_data = zp; + + ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs)); + zp->z_moved = 0; + + /* + * Defer setting z_zfsvfs until the znode is ready to be a candidate for + * the zfs_znode_move() callback. + */ + zp->z_sa_hdl = NULL; + zp->z_unlinked = 0; + zp->z_atime_dirty = 0; + zp->z_mapcnt = 0; + zp->z_id = db->db_object; + zp->z_blksz = blksz; + zp->z_seq = 0x7A4653; + zp->z_sync_cnt = 0; + + vp = ZTOV(zp); + + zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl); + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &zp->z_gen, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, + &zp->z_size, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, + &zp->z_links, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, + &zp->z_atime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, + &zp->z_uid, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, + &zp->z_gid, 8); + + if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || zp->z_gen == 0) { + if (hdl == NULL) + sa_handle_destroy(zp->z_sa_hdl); + zfs_vnode_forget(vp); + zp->z_vnode = NULL; + kmem_cache_free(znode_cache, zp); + return (NULL); + } + + zp->z_mode = mode; + + vp->v_type = IFTOVT((mode_t)mode); + + switch (vp->v_type) { + case VDIR: + zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */ + break; +#ifdef illumos + case VBLK: + case VCHR: + { + uint64_t rdev; + VERIFY(sa_lookup(zp->z_sa_hdl, SA_ZPL_RDEV(zfsvfs), + &rdev, sizeof (rdev)) == 0); + + vp->v_rdev = zfs_cmpldev(rdev); + } + break; +#endif + case VFIFO: +#ifdef illumos + case VSOCK: + case VDOOR: +#endif + vp->v_op = &zfs_fifoops; + break; + case VREG: + if (parent == zfsvfs->z_shares_dir) { + ASSERT(zp->z_uid == 0 && zp->z_gid == 0); + vp->v_op = &zfs_shareops; + } + break; +#ifdef illumos + case VLNK: + vn_setops(vp, zfs_symvnodeops); + break; + default: + vn_setops(vp, zfs_evnodeops); + break; +#endif + } + + mutex_enter(&zfsvfs->z_znodes_lock); + list_insert_tail(&zfsvfs->z_all_znodes, zp); + 
membar_producer(); + /* + * Everything else must be valid before assigning z_zfsvfs makes the + * znode eligible for zfs_znode_move(). + */ + zp->z_zfsvfs = zfsvfs; + mutex_exit(&zfsvfs->z_znodes_lock); + + /* + * Acquire vnode lock before making it available to the world. + */ + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + VN_LOCK_AREC(vp); + if (vp->v_type != VFIFO) + VN_LOCK_ASHARE(vp); + +#ifdef illumos + VFS_HOLD(zfsvfs->z_vfs); +#endif + return (zp); +} + +static uint64_t empty_xattr; +static uint64_t pad[4]; +static zfs_acl_phys_t acl_phys; +/* + * Create a new DMU object to hold a zfs znode. + * + * IN: dzp - parent directory for new znode + * vap - file attributes for new znode + * tx - dmu transaction id for zap operations + * cr - credentials of caller + * flag - flags: + * IS_ROOT_NODE - new object will be root + * IS_XATTR - new object is an attribute + * bonuslen - length of bonus buffer + * setaclp - File/Dir initial ACL + * fuidp - Tracks fuid allocation. + * + * OUT: zpp - allocated znode + * + */ +void +zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, + uint_t flag, znode_t **zpp, zfs_acl_ids_t *acl_ids) +{ + uint64_t crtime[2], atime[2], mtime[2], ctime[2]; + uint64_t mode, size, links, parent, pflags; + uint64_t dzp_pflags = 0; + uint64_t rdev = 0; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + dmu_buf_t *db; + timestruc_t now; + uint64_t gen, obj; + int err; + int bonuslen; + int dnodesize; + sa_handle_t *sa_hdl; + dmu_object_type_t obj_type; + sa_bulk_attr_t *sa_attrs; + int cnt = 0; + zfs_acl_locator_cb_t locate = { 0 }; + + ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE)); + + if (zfsvfs->z_replay) { + obj = vap->va_nodeid; + now = vap->va_ctime; /* see zfs_replay_create() */ + gen = vap->va_nblocks; /* ditto */ + dnodesize = vap->va_fsid; /* ditto */ + } else { + obj = 0; + vfs_timestamp(&now); + gen = dmu_tx_get_txg(tx); + dnodesize = dmu_objset_dnodesize(zfsvfs->z_os); + } + + if (dnodesize == 0) + dnodesize = DNODE_MIN_SIZE; + + obj_type = zfsvfs->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE; + bonuslen = (obj_type == DMU_OT_SA) ? + DN_BONUS_SIZE(dnodesize) : ZFS_OLD_ZNODE_PHYS_SIZE; + + /* + * Create a new DMU object. + */ + /* + * There's currently no mechanism for pre-reading the blocks that will + * be needed to allocate a new object, so we accept the small chance + * that there will be an i/o error and we will fail one of the + * assertions below. + */ + if (vap->va_type == VDIR) { + if (zfsvfs->z_replay) { + VERIFY0(zap_create_claim_norm_dnsize(zfsvfs->z_os, obj, + zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, + obj_type, bonuslen, dnodesize, tx)); + } else { + obj = zap_create_norm_dnsize(zfsvfs->z_os, + zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, + obj_type, bonuslen, dnodesize, tx); + } + } else { + if (zfsvfs->z_replay) { + VERIFY0(dmu_object_claim_dnsize(zfsvfs->z_os, obj, + DMU_OT_PLAIN_FILE_CONTENTS, 0, + obj_type, bonuslen, dnodesize, tx)); + } else { + obj = dmu_object_alloc_dnsize(zfsvfs->z_os, + DMU_OT_PLAIN_FILE_CONTENTS, 0, + obj_type, bonuslen, dnodesize, tx); + } + } + + ZFS_OBJ_HOLD_ENTER(zfsvfs, obj); + VERIFY(0 == sa_buf_hold(zfsvfs->z_os, obj, NULL, &db)); + + /* + * If this is the root, fix up the half-initialized parent pointer + * to reference the just-allocated physical data area. + */ + if (flag & IS_ROOT_NODE) { + dzp->z_id = obj; + } else { + dzp_pflags = dzp->z_pflags; + } + + /* + * If parent is an xattr, so am I. 
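+ * (i.e. objects created inside an extended attribute directory are + * themselves tagged ZFS_XATTR below.)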
+ */ + if (dzp_pflags & ZFS_XATTR) { + flag |= IS_XATTR; + } + + if (zfsvfs->z_use_fuids) + pflags = ZFS_ARCHIVE | ZFS_AV_MODIFIED; + else + pflags = 0; + + if (vap->va_type == VDIR) { + size = 2; /* contents ("." and "..") */ + links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1; + } else { + size = links = 0; + } + + if (vap->va_type == VBLK || vap->va_type == VCHR) { + rdev = zfs_expldev(vap->va_rdev); + } + + parent = dzp->z_id; + mode = acl_ids->z_mode; + if (flag & IS_XATTR) + pflags |= ZFS_XATTR; + + /* + * No execs denied will be deterimed when zfs_mode_compute() is called. + */ + pflags |= acl_ids->z_aclp->z_hints & + (ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|ZFS_ACL_AUTO_INHERIT| + ZFS_ACL_DEFAULTED|ZFS_ACL_PROTECTED); + + ZFS_TIME_ENCODE(&now, crtime); + ZFS_TIME_ENCODE(&now, ctime); + + if (vap->va_mask & AT_ATIME) { + ZFS_TIME_ENCODE(&vap->va_atime, atime); + } else { + ZFS_TIME_ENCODE(&now, atime); + } + + if (vap->va_mask & AT_MTIME) { + ZFS_TIME_ENCODE(&vap->va_mtime, mtime); + } else { + ZFS_TIME_ENCODE(&now, mtime); + } + + /* Now add in all of the "SA" attributes */ + VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED, + &sa_hdl)); + + /* + * Setup the array of attributes to be replaced/set on the new file + * + * order for DMU_OT_ZNODE is critical since it needs to be constructed + * in the old znode_phys_t format. Don't change this ordering + */ + sa_attrs = kmem_alloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP); + + if (obj_type == DMU_OT_ZNODE) { + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs), + NULL, &atime, 16); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs), + NULL, &mtime, 16); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs), + NULL, &ctime, 16); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs), + NULL, &crtime, 16); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs), + NULL, &gen, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs), + NULL, &mode, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs), + NULL, &size, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs), + NULL, &parent, 8); + } else { + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs), + NULL, &mode, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs), + NULL, &size, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs), + NULL, &gen, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), + NULL, &acl_ids->z_fuid, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), + NULL, &acl_ids->z_fgid, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs), + NULL, &parent, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs), + NULL, &pflags, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs), + NULL, &atime, 16); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs), + NULL, &mtime, 16); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs), + NULL, &ctime, 16); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs), + NULL, &crtime, 16); + } + + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8); + + if (obj_type == DMU_OT_ZNODE) { + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_XATTR(zfsvfs), NULL, + &empty_xattr, 8); + } + if (obj_type == DMU_OT_ZNODE || + (vap->va_type == VBLK || vap->va_type == VCHR)) { + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_RDEV(zfsvfs), + NULL, &rdev, 8); + + } + if (obj_type == DMU_OT_ZNODE) { + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs), + NULL, &pflags, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL, + &acl_ids->z_fuid, 8); + 
SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL, + &acl_ids->z_fgid, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PAD(zfsvfs), NULL, pad, + sizeof (uint64_t) * 4); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ZNODE_ACL(zfsvfs), NULL, + &acl_phys, sizeof (zfs_acl_phys_t)); + } else if (acl_ids->z_aclp->z_version >= ZFS_ACL_VERSION_FUID) { + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_COUNT(zfsvfs), NULL, + &acl_ids->z_aclp->z_acl_count, 8); + locate.cb_aclp = acl_ids->z_aclp; + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_ACES(zfsvfs), + zfs_acl_data_locator, &locate, + acl_ids->z_aclp->z_acl_bytes); + mode = zfs_mode_compute(mode, acl_ids->z_aclp, &pflags, + acl_ids->z_fuid, acl_ids->z_fgid); + } + + VERIFY(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx) == 0); + + if (!(flag & IS_ROOT_NODE)) { + *zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, sa_hdl); + ASSERT(*zpp != NULL); + } else { + /* + * If we are creating the root node, the "parent" we + * passed in is the znode for the root. + */ + *zpp = dzp; + + (*zpp)->z_sa_hdl = sa_hdl; + } + + (*zpp)->z_pflags = pflags; + (*zpp)->z_mode = mode; + (*zpp)->z_dnodesize = dnodesize; + + if (vap->va_mask & AT_XVATTR) + zfs_xvattr_set(*zpp, (xvattr_t *)vap, tx); + + if (obj_type == DMU_OT_ZNODE || + acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) { + VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx)); + } + if (!(flag & IS_ROOT_NODE)) { + vnode_t *vp; + + vp = ZTOV(*zpp); + vp->v_vflag |= VV_FORCEINSMQ; + err = insmntque(vp, zfsvfs->z_vfs); + vp->v_vflag &= ~VV_FORCEINSMQ; + KASSERT(err == 0, ("insmntque() failed: error %d", err)); + } + kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * ZPL_END); + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj); +} + +/* + * Update in-core attributes. It is assumed the caller will be doing an + * sa_bulk_update to push the changes out. 
+ */ +void +zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx) +{ + xoptattr_t *xoap; + + xoap = xva_getxoptattr(xvap); + ASSERT(xoap); + + if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) { + uint64_t times[2]; + ZFS_TIME_ENCODE(&xoap->xoa_createtime, times); + (void) sa_update(zp->z_sa_hdl, SA_ZPL_CRTIME(zp->z_zfsvfs), + &times, sizeof (times), tx); + XVA_SET_RTN(xvap, XAT_CREATETIME); + } + if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { + ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_READONLY); + } + if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { + ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_HIDDEN); + } + if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { + ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_SYSTEM); + } + if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { + ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_ARCHIVE); + } + if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { + ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_IMMUTABLE); + } + if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { + ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_NOUNLINK); + } + if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { + ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_APPENDONLY); + } + if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { + ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_NODUMP); + } + if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { + ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_OPAQUE); + } + if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { + ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED, + xoap->xoa_av_quarantined, zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); + } + if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { + ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_AV_MODIFIED); + } + if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) { + zfs_sa_set_scanstamp(zp, xvap, tx); + XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP); + } + if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { + ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_REPARSE); + } + if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) { + ZFS_ATTR_SET(zp, ZFS_OFFLINE, xoap->xoa_offline, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_OFFLINE); + } + if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) { + ZFS_ATTR_SET(zp, ZFS_SPARSE, xoap->xoa_sparse, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_SPARSE); + } +} + +int +zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp) +{ + dmu_object_info_t doi; + dmu_buf_t *db; + znode_t *zp; + vnode_t *vp; + sa_handle_t *hdl; + struct thread *td; + int locked; + int err; + + td = curthread; + getnewvnode_reserve(1); +again: + *zpp = NULL; + ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num); + + err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db); + if (err) { + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + getnewvnode_drop_reserve(); + return (err); + } + + dmu_object_info_from_db(db, &doi); + if (doi.doi_bonus_type != DMU_OT_SA && + (doi.doi_bonus_type != DMU_OT_ZNODE || + (doi.doi_bonus_type == DMU_OT_ZNODE && + doi.doi_bonus_size < sizeof (znode_phys_t)))) { + sa_buf_rele(db, NULL); + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); +#ifdef __FreeBSD__ + getnewvnode_drop_reserve(); +#endif + return (SET_ERROR(EINVAL)); + } + + hdl = 
dmu_buf_get_user(db); + if (hdl != NULL) { + zp = sa_get_userdata(hdl); + + /* + * Since "SA" does immediate eviction we + * should never find a sa handle that doesn't + * know about the znode. + */ + ASSERT3P(zp, !=, NULL); + ASSERT3U(zp->z_id, ==, obj_num); + if (zp->z_unlinked) { + err = SET_ERROR(ENOENT); + } else { + vp = ZTOV(zp); + /* + * Don't let the vnode disappear after + * ZFS_OBJ_HOLD_EXIT. + */ + VN_HOLD(vp); + *zpp = zp; + err = 0; + } + + sa_buf_rele(db, NULL); + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + + if (err) { + getnewvnode_drop_reserve(); + return (err); + } + + locked = VOP_ISLOCKED(vp); + VI_LOCK(vp); + if ((vp->v_iflag & VI_DOOMED) != 0 && + locked != LK_EXCLUSIVE) { + /* + * The vnode is doomed and this thread doesn't + * hold the exclusive lock on it, so the vnode + * must be being reclaimed by another thread. + * Otherwise the doomed vnode is being reclaimed + * by this thread and zfs_zget is called from + * ZIL internals. + */ + VI_UNLOCK(vp); + + /* + * XXX vrele() locks the vnode when the last reference + * is dropped. Although in this case the vnode is + * doomed / dead and so no inactivation is required, + * the vnode lock is still acquired. That could result + * in a LOR with z_teardown_lock if another thread holds + * the vnode's lock and tries to take z_teardown_lock. + * But that is only possible if the other thread peforms + * a ZFS vnode operation on the vnode. That either + * should not happen if the vnode is dead or the thread + * should also have a refrence to the vnode and thus + * our reference is not last. + */ + VN_RELE(vp); + goto again; + } + VI_UNLOCK(vp); + getnewvnode_drop_reserve(); + return (err); + } + + /* + * Not found create new znode/vnode + * but only if file exists. + * + * There is a small window where zfs_vget() could + * find this object while a file create is still in + * progress. This is checked for in zfs_znode_alloc() + * + * if zfs_znode_alloc() fails it will drop the hold on the + * bonus buffer. + */ + zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size, + doi.doi_bonus_type, NULL); + if (zp == NULL) { + err = SET_ERROR(ENOENT); + } else { + *zpp = zp; + } + if (err == 0) { + vnode_t *vp = ZTOV(zp); + + err = insmntque(vp, zfsvfs->z_vfs); + if (err == 0) { + vp->v_hash = obj_num; + VOP_UNLOCK(vp, 0); + } else { + zp->z_vnode = NULL; + zfs_znode_dmu_fini(zp); + zfs_znode_free(zp); + *zpp = NULL; + } + } + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + getnewvnode_drop_reserve(); + return (err); +} + +int +zfs_rezget(znode_t *zp) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + dmu_object_info_t doi; + dmu_buf_t *db; + vnode_t *vp; + uint64_t obj_num = zp->z_id; + uint64_t mode, size; + sa_bulk_attr_t bulk[8]; + int err; + int count = 0; + uint64_t gen; + + /* + * Remove cached pages before reloading the znode, so that they are not + * lingering after we run into any error. Ideally, we should vgone() + * the vnode in case of error, but currently we cannot do that + * because of the LOR between the vnode lock and z_teardown_lock. + * So, instead, we have to "doom" the znode in the illumos style. 
+ */ + vp = ZTOV(zp); + vn_pages_remove(vp, 0, 0); + + ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num); + + mutex_enter(&zp->z_acl_lock); + if (zp->z_acl_cached) { + zfs_acl_free(zp->z_acl_cached); + zp->z_acl_cached = NULL; + } + + mutex_exit(&zp->z_acl_lock); + ASSERT(zp->z_sa_hdl == NULL); + err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db); + if (err) { + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + return (err); + } + + dmu_object_info_from_db(db, &doi); + if (doi.doi_bonus_type != DMU_OT_SA && + (doi.doi_bonus_type != DMU_OT_ZNODE || + (doi.doi_bonus_type == DMU_OT_ZNODE && + doi.doi_bonus_size < sizeof (znode_phys_t)))) { + sa_buf_rele(db, NULL); + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + return (SET_ERROR(EINVAL)); + } + + zfs_znode_sa_init(zfsvfs, zp, db, doi.doi_bonus_type, NULL); + size = zp->z_size; + + /* reload cached values */ + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, + &gen, sizeof (gen)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, + &zp->z_size, sizeof (zp->z_size)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, + &zp->z_links, sizeof (zp->z_links)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, sizeof (zp->z_pflags)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, + &zp->z_atime, sizeof (zp->z_atime)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, + &zp->z_uid, sizeof (zp->z_uid)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, + &zp->z_gid, sizeof (zp->z_gid)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, + &mode, sizeof (mode)); + + if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) { + zfs_znode_dmu_fini(zp); + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + return (SET_ERROR(EIO)); + } + + zp->z_mode = mode; + + if (gen != zp->z_gen) { + zfs_znode_dmu_fini(zp); + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + return (SET_ERROR(EIO)); + } + + /* + * It is highly improbable but still quite possible that two + * objects in different datasets are created with the same + * object numbers and in transaction groups with the same + * numbers. znodes corresponding to those objects would + * have the same z_id and z_gen, but their other attributes + * may be different. + * zfs recv -F may replace one of such objects with the other. + * As a result file properties recorded in the replaced + * object's vnode may no longer match the received object's + * properties. At present the only cached property is the + * files type recorded in v_type. + * So, handle this case by leaving the old vnode and znode + * disassociated from the actual object. A new vnode and a + * znode will be created if the object is accessed + * (e.g. via a look-up). The old vnode and znode will be + * recycled when the last vnode reference is dropped. + */ + if (vp->v_type != IFTOVT((mode_t)zp->z_mode)) { + zfs_znode_dmu_fini(zp); + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + return (SET_ERROR(EIO)); + } + + /* + * If the file has zero links, then it has been unlinked on the send + * side and it must be in the received unlinked set. + * We call zfs_znode_dmu_fini() now to prevent any accesses to the + * stale data and to prevent automatical removal of the file in + * zfs_zinactive(). The file will be removed either when it is removed + * on the send side and the next incremental stream is received or + * when the unlinked set gets processed. 
+ */ + zp->z_unlinked = (zp->z_links == 0); + if (zp->z_unlinked) { + zfs_znode_dmu_fini(zp); + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + return (0); + } + + zp->z_blksz = doi.doi_data_block_size; + if (zp->z_size != size) + vnode_pager_setsize(vp, zp->z_size); + + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + + return (0); +} + +void +zfs_znode_delete(znode_t *zp, dmu_tx_t *tx) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + objset_t *os = zfsvfs->z_os; + uint64_t obj = zp->z_id; + uint64_t acl_obj = zfs_external_acl(zp); + + ZFS_OBJ_HOLD_ENTER(zfsvfs, obj); + if (acl_obj) { + VERIFY(!zp->z_is_sa); + VERIFY(0 == dmu_object_free(os, acl_obj, tx)); + } + VERIFY(0 == dmu_object_free(os, obj, tx)); + zfs_znode_dmu_fini(zp); + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj); + zfs_znode_free(zp); +} + +void +zfs_zinactive(znode_t *zp) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + uint64_t z_id = zp->z_id; + + ASSERT(zp->z_sa_hdl); + + /* + * Don't allow a zfs_zget() while we're trying to release this znode + */ + ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id); + + /* + * If this was the last reference to a file with no links, remove + * the file from the file system unless the file system is mounted + * read-only. That can happen, for example, if the file system was + * originally read-write, the file was opened, then unlinked and + * the file system was made read-only before the file was finally + * closed. The file will remain in the unlinked set. + */ + if (zp->z_unlinked) { + ASSERT(!zfsvfs->z_issnap); + if ((zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) == 0) { + ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); + zfs_rmnode(zp); + return; + } + } + + zfs_znode_dmu_fini(zp); + ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); + zfs_znode_free(zp); +} + +void +zfs_znode_free(znode_t *zp) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + + ASSERT(zp->z_sa_hdl == NULL); + zp->z_vnode = NULL; + mutex_enter(&zfsvfs->z_znodes_lock); + POINTER_INVALIDATE(&zp->z_zfsvfs); + list_remove(&zfsvfs->z_all_znodes, zp); + mutex_exit(&zfsvfs->z_znodes_lock); + + if (zp->z_acl_cached) { + zfs_acl_free(zp->z_acl_cached); + zp->z_acl_cached = NULL; + } + + kmem_cache_free(znode_cache, zp); + +#ifdef illumos + VFS_RELE(zfsvfs->z_vfs); +#endif +} + +void +zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2], + uint64_t ctime[2], boolean_t have_tx) +{ + timestruc_t now; + + vfs_timestamp(&now); + + if (have_tx) { /* will sa_bulk_update happen really soon? */ + zp->z_atime_dirty = 0; + zp->z_seq++; + } else { + zp->z_atime_dirty = 1; + } + + if (flag & AT_ATIME) { + ZFS_TIME_ENCODE(&now, zp->z_atime); + } + + if (flag & AT_MTIME) { + ZFS_TIME_ENCODE(&now, mtime); + if (zp->z_zfsvfs->z_use_fuids) { + zp->z_pflags |= (ZFS_ARCHIVE | + ZFS_AV_MODIFIED); + } + } + + if (flag & AT_CTIME) { + ZFS_TIME_ENCODE(&now, ctime); + if (zp->z_zfsvfs->z_use_fuids) + zp->z_pflags |= ZFS_ARCHIVE; + } +} + +/* + * Grow the block size for a file. + * + * IN: zp - znode of the file whose block size is to be grown. + * size - requested block size + * tx - open transaction. + * + * NOTE: this function assumes that the znode is write locked. + */ +void +zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx) +{ + int error; + u_longlong_t dummy; + + if (size <= zp->z_blksz) + return; + /* + * If the file size is already greater than the current blocksize, + * we will not grow. If there is more than one block in a file, + * the blocksize cannot change.
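+ * + * As an illustrative example (sizes are hypothetical): a 10K file held + * in a single 16K block may have that block grown below, e.g. to 32K; + * but once z_size exceeds z_blksz the file spans multiple equal-sized + * blocks and the check below leaves the block size unchanged.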
+ */ + if (zp->z_blksz && zp->z_size > zp->z_blksz) + return; + + error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id, + size, 0, tx); + + if (error == ENOTSUP) + return; + ASSERT0(error); + + /* What blocksize did we actually get? */ + dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &zp->z_blksz, &dummy); +} + +#ifdef illumos +/* + * This is a dummy interface used when pvn_vplist_dirty() should *not* + * be calling back into the fs for a putpage(). E.g.: when truncating + * a file, the pages being "thrown away" don't need to be written out. + */ +/* ARGSUSED */ +static int +zfs_no_putpage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp, + int flags, cred_t *cr) +{ + ASSERT(0); + return (0); +} +#endif + +/* + * Increase the file length + * + * IN: zp - znode of the file to extend. + * end - new end-of-file + * + * RETURN: 0 on success, error code on failure + */ +static int +zfs_extend(znode_t *zp, uint64_t end) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + dmu_tx_t *tx; + rl_t *rl; + uint64_t newblksz; + int error; + + /* + * We will change zp_size, lock the whole file. + */ + rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER); + + /* + * Nothing to do if file already at desired length. + */ + if (end <= zp->z_size) { + zfs_range_unlock(rl); + return (0); + } + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, zp); + if (end > zp->z_blksz && + (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) { + /* + * We are growing the file past the current block size. + */ + if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) { + /* + * File's blocksize is already larger than the + * "recordsize" property. Only let it grow to + * the next power of 2. + */ + ASSERT(!ISP2(zp->z_blksz)); + newblksz = MIN(end, 1 << highbit64(zp->z_blksz)); + } else { + newblksz = MIN(end, zp->z_zfsvfs->z_max_blksz); + } + dmu_tx_hold_write(tx, zp->z_id, 0, newblksz); + } else { + newblksz = 0; + } + + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + zfs_range_unlock(rl); + return (error); + } + + if (newblksz) + zfs_grow_blocksize(zp, newblksz, tx); + + zp->z_size = end; + + VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zp->z_zfsvfs), + &zp->z_size, sizeof (zp->z_size), tx)); + + vnode_pager_setsize(ZTOV(zp), end); + + zfs_range_unlock(rl); + + dmu_tx_commit(tx); + + return (0); +} + +/* + * Free space in a file. + * + * IN: zp - znode of file to free data in. + * off - start of section to free. + * len - length of section to free. + * + * RETURN: 0 on success, error code on failure + */ +static int +zfs_free_range(znode_t *zp, uint64_t off, uint64_t len) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + rl_t *rl; + int error; + + /* + * Lock the range being freed. + */ + rl = zfs_range_lock(zp, off, len, RL_WRITER); + + /* + * Nothing to do if file already at desired length. + */ + if (off >= zp->z_size) { + zfs_range_unlock(rl); + return (0); + } + + if (off + len > zp->z_size) + len = zp->z_size - off; + + error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len); + + if (error == 0) { + /* + * In FreeBSD we cannot free a block in the middle of a file, + * but only at the end of a file, so this code path should + * never happen. + */ + vnode_pager_setsize(ZTOV(zp), off); + } + + zfs_range_unlock(rl); + + return (error); +} + +/* + * Truncate a file + * + * IN: zp - znode of file to free data in. + * end - new end-of-file.
+ * + * RETURN: 0 on success, error code on failure + */ +static int +zfs_trunc(znode_t *zp, uint64_t end) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + vnode_t *vp = ZTOV(zp); + dmu_tx_t *tx; + rl_t *rl; + int error; + sa_bulk_attr_t bulk[2]; + int count = 0; + + /* + * We will change zp_size, lock the whole file. + */ + rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER); + + /* + * Nothing to do if file already at desired length. + */ + if (end >= zp->z_size) { + zfs_range_unlock(rl); + return (0); + } + + error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end, + DMU_OBJECT_END); + if (error) { + zfs_range_unlock(rl); + return (error); + } + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, zp); + dmu_tx_mark_netfree(tx); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + zfs_range_unlock(rl); + return (error); + } + + zp->z_size = end; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), + NULL, &zp->z_size, sizeof (zp->z_size)); + + if (end == 0) { + zp->z_pflags &= ~ZFS_SPARSE; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), + NULL, &zp->z_pflags, 8); + } + VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0); + + dmu_tx_commit(tx); + + /* + * Clear any mapped pages in the truncated region. This has to + * happen outside of the transaction to avoid the possibility of + * a deadlock with someone trying to push a page that we are + * about to invalidate. + */ + vnode_pager_setsize(vp, end); + + zfs_range_unlock(rl); + + return (0); +} + +/* + * Free space in a file + * + * IN: zp - znode of file to free data in. + * off - start of range + * len - end of range (0 => EOF) + * flag - current file open mode flags. + * log - TRUE if this action should be logged + * + * RETURN: 0 on success, error code on failure + */ +int +zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log) +{ + vnode_t *vp = ZTOV(zp); + dmu_tx_t *tx; + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + zilog_t *zilog = zfsvfs->z_log; + uint64_t mode; + uint64_t mtime[2], ctime[2]; + sa_bulk_attr_t bulk[3]; + int count = 0; + int error; + + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), &mode, + sizeof (mode))) != 0) + return (error); + + if (off > zp->z_size) { + error = zfs_extend(zp, off+len); + if (error == 0 && log) + goto log; + else + return (error); + } + + /* + * Check for any locks in the region to be freed. + */ + + if (MANDLOCK(vp, (mode_t)mode)) { + uint64_t length = (len ? 
len : zp->z_size - off); + if (error = chklock(vp, FWRITE, off, length, flag, NULL)) + return (error); + } + + if (len == 0) { + error = zfs_trunc(zp, off); + } else { + if ((error = zfs_free_range(zp, off, len)) == 0 && + off + len > zp->z_size) + error = zfs_extend(zp, off+len); + } + if (error || !log) + return (error); +log: + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, zp); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + return (error); + } + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), + NULL, &zp->z_pflags, 8); + zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, B_TRUE); + error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + ASSERT(error == 0); + + zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len); + + dmu_tx_commit(tx); + return (0); +} + +void +zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) +{ + uint64_t moid, obj, sa_obj, version; + uint64_t sense = ZFS_CASE_SENSITIVE; + uint64_t norm = 0; + nvpair_t *elem; + int error; + int i; + znode_t *rootzp = NULL; + zfsvfs_t *zfsvfs; + vattr_t vattr; + znode_t *zp; + zfs_acl_ids_t acl_ids; + + /* + * First attempt to create master node. + */ + /* + * In an empty objset, there are no blocks to read and thus + * there can be no i/o errors (which we assert below). + */ + moid = MASTER_NODE_OBJ; + error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE, + DMU_OT_NONE, 0, tx); + ASSERT(error == 0); + + /* + * Give dmu_object_alloc() a hint about where to start + * allocating new objects. Otherwise, since the metadnode's + * dnode_phys_t structure isn't initialized yet, dmu_object_next() + * would fail and we'd have to skip to the next dnode block. + */ + os->os_obj_next = moid + 1; + + /* + * Set starting attributes. + */ + version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os))); + elem = NULL; + while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) { + /* For the moment we expect all zpl props to be uint64_ts */ + uint64_t val; + char *name; + + ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64); + VERIFY(nvpair_value_uint64(elem, &val) == 0); + name = nvpair_name(elem); + if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) { + if (val < version) + version = val; + } else { + error = zap_update(os, moid, name, 8, 1, &val, tx); + } + ASSERT(error == 0); + if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0) + norm = val; + else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0) + sense = val; + } + ASSERT(version != 0); + error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx); + + /* + * Create zap object used for SA attribute registration + */ + + if (version >= ZPL_VERSION_SA) { + sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE, + DMU_OT_NONE, 0, tx); + error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); + ASSERT(error == 0); + } else { + sa_obj = 0; + } + /* + * Create a delete queue. + */ + obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx); + + error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx); + ASSERT(error == 0); + + /* + * Create root znode. Create minimal znode/vnode/zfsvfs + * to allow zfs_mknode to work. 
+ */ + VATTR_NULL(&vattr); + vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE; + vattr.va_type = VDIR; + vattr.va_mode = S_IFDIR|0755; + vattr.va_uid = crgetuid(cr); + vattr.va_gid = crgetgid(cr); + + zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); + + rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP); + ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs)); + rootzp->z_moved = 0; + rootzp->z_unlinked = 0; + rootzp->z_atime_dirty = 0; + rootzp->z_is_sa = USE_SA(version, os); + + zfsvfs->z_os = os; + zfsvfs->z_parent = zfsvfs; + zfsvfs->z_version = version; + zfsvfs->z_use_fuids = USE_FUIDS(version, os); + zfsvfs->z_use_sa = USE_SA(version, os); + zfsvfs->z_norm = norm; + + error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, + &zfsvfs->z_attr_table); + + ASSERT(error == 0); + + /* + * Fold case on file systems that are always or sometimes case + * insensitive. + */ + if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED) + zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; + + mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), + offsetof(znode_t, z_link_node)); + + for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) + mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); + + rootzp->z_zfsvfs = zfsvfs; + VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr, + cr, NULL, &acl_ids)); + zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids); + ASSERT3P(zp, ==, rootzp); + error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx); + ASSERT(error == 0); + zfs_acl_ids_free(&acl_ids); + POINTER_INVALIDATE(&rootzp->z_zfsvfs); + + sa_handle_destroy(rootzp->z_sa_hdl); + kmem_cache_free(znode_cache, rootzp); + + /* + * Create shares directory + */ + + error = zfs_create_share_dir(zfsvfs, tx); + + ASSERT(error == 0); + + for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) + mutex_destroy(&zfsvfs->z_hold_mtx[i]); + kmem_free(zfsvfs, sizeof (zfsvfs_t)); +} +#endif /* _KERNEL */ + +static int +zfs_sa_setup(objset_t *osp, sa_attr_type_t **sa_table) +{ + uint64_t sa_obj = 0; + int error; + + error = zap_lookup(osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj); + if (error != 0 && error != ENOENT) + return (error); + + error = sa_setup(osp, sa_obj, zfs_attr_table, ZPL_END, sa_table); + return (error); +} + +static int +zfs_grab_sa_handle(objset_t *osp, uint64_t obj, sa_handle_t **hdlp, + dmu_buf_t **db, void *tag) +{ + dmu_object_info_t doi; + int error; + + if ((error = sa_buf_hold(osp, obj, tag, db)) != 0) + return (error); + + dmu_object_info_from_db(*db, &doi); + if ((doi.doi_bonus_type != DMU_OT_SA && + doi.doi_bonus_type != DMU_OT_ZNODE) || + doi.doi_bonus_type == DMU_OT_ZNODE && + doi.doi_bonus_size < sizeof (znode_phys_t)) { + sa_buf_rele(*db, tag); + return (SET_ERROR(ENOTSUP)); + } + + error = sa_handle_get(osp, obj, NULL, SA_HDL_PRIVATE, hdlp); + if (error != 0) { + sa_buf_rele(*db, tag); + return (error); + } + + return (0); +} + +void +zfs_release_sa_handle(sa_handle_t *hdl, dmu_buf_t *db, void *tag) +{ + sa_handle_destroy(hdl); + sa_buf_rele(db, tag); +} + +/* + * Given an object number, return its parent object number and whether + * or not the object is an extended attribute directory. 
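+ * + * As a usage sketch: zfs_obj_to_path_impl() below calls this in a loop, + * following ZPL_PARENT links from a leaf object toward the root (where + * the parent object number equals the object's own), prepending one + * "/name" component per step.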
+ */ +static int +zfs_obj_to_pobj(objset_t *osp, sa_handle_t *hdl, sa_attr_type_t *sa_table, + uint64_t *pobjp, int *is_xattrdir) +{ + uint64_t parent; + uint64_t pflags; + uint64_t mode; + uint64_t parent_mode; + sa_bulk_attr_t bulk[3]; + sa_handle_t *sa_hdl; + dmu_buf_t *sa_db; + int count = 0; + int error; + + SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_PARENT], NULL, + &parent, sizeof (parent)); + SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_FLAGS], NULL, + &pflags, sizeof (pflags)); + SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL, + &mode, sizeof (mode)); + + if ((error = sa_bulk_lookup(hdl, bulk, count)) != 0) + return (error); + + /* + * When a link is removed its parent pointer is not changed and will + * be invalid. There are two cases where a link is removed but the + * file stays around, when it goes to the delete queue and when there + * are additional links. + */ + error = zfs_grab_sa_handle(osp, parent, &sa_hdl, &sa_db, FTAG); + if (error != 0) + return (error); + + error = sa_lookup(sa_hdl, ZPL_MODE, &parent_mode, sizeof (parent_mode)); + zfs_release_sa_handle(sa_hdl, sa_db, FTAG); + if (error != 0) + return (error); + + *is_xattrdir = ((pflags & ZFS_XATTR) != 0) && S_ISDIR(mode); + + /* + * Extended attributes can be applied to files, directories, etc. + * Otherwise the parent must be a directory. + */ + if (!*is_xattrdir && !S_ISDIR(parent_mode)) + return (SET_ERROR(EINVAL)); + + *pobjp = parent; + + return (0); +} + +/* + * Given an object number, return some zpl level statistics + */ +static int +zfs_obj_to_stats_impl(sa_handle_t *hdl, sa_attr_type_t *sa_table, + zfs_stat_t *sb) +{ + sa_bulk_attr_t bulk[4]; + int count = 0; + + SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL, + &sb->zs_mode, sizeof (sb->zs_mode)); + SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_GEN], NULL, + &sb->zs_gen, sizeof (sb->zs_gen)); + SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_LINKS], NULL, + &sb->zs_links, sizeof (sb->zs_links)); + SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_CTIME], NULL, + &sb->zs_ctime, sizeof (sb->zs_ctime)); + + return (sa_bulk_lookup(hdl, bulk, count)); +} + +static int +zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl, + sa_attr_type_t *sa_table, char *buf, int len) +{ + sa_handle_t *sa_hdl; + sa_handle_t *prevhdl = NULL; + dmu_buf_t *prevdb = NULL; + dmu_buf_t *sa_db = NULL; + char *path = buf + len - 1; + int error; + + *path = '\0'; + sa_hdl = hdl; + + uint64_t deleteq_obj; + VERIFY0(zap_lookup(osp, MASTER_NODE_OBJ, + ZFS_UNLINKED_SET, sizeof (uint64_t), 1, &deleteq_obj)); + error = zap_lookup_int(osp, deleteq_obj, obj); + if (error == 0) { + return (ESTALE); + } else if (error != ENOENT) { + return (error); + } + error = 0; + + for (;;) { + uint64_t pobj; + char component[MAXNAMELEN + 2]; + size_t complen; + int is_xattrdir; + + if (prevdb) + zfs_release_sa_handle(prevhdl, prevdb, FTAG); + + if ((error = zfs_obj_to_pobj(osp, sa_hdl, sa_table, &pobj, + &is_xattrdir)) != 0) + break; + + if (pobj == obj) { + if (path[0] != '/') + *--path = '/'; + break; + } + + component[0] = '/'; + if (is_xattrdir) { + (void) sprintf(component + 1, "<xattrdir>"); + } else { + error = zap_value_search(osp, pobj, obj, + ZFS_DIRENT_OBJ(-1ULL), component + 1); + if (error != 0) + break; + } + + complen = strlen(component); + path -= complen; + ASSERT(path >= buf); + bcopy(component, path, complen); + obj = pobj; + + if (sa_hdl != hdl) { + prevhdl = sa_hdl; + prevdb = sa_db; + } + error = zfs_grab_sa_handle(osp, obj, &sa_hdl, &sa_db, FTAG); + if (error != 0) { + 
sa_hdl = prevhdl; + sa_db = prevdb; + break; + } + } + + if (sa_hdl != NULL && sa_hdl != hdl) { + ASSERT(sa_db != NULL); + zfs_release_sa_handle(sa_hdl, sa_db, FTAG); + } + + if (error == 0) + (void) memmove(buf, path, buf + len - path); + + return (error); +} + +int +zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len) +{ + sa_attr_type_t *sa_table; + sa_handle_t *hdl; + dmu_buf_t *db; + int error; + + error = zfs_sa_setup(osp, &sa_table); + if (error != 0) + return (error); + + error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG); + if (error != 0) + return (error); + + error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len); + + zfs_release_sa_handle(hdl, db, FTAG); + return (error); +} + +int +zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb, + char *buf, int len) +{ + char *path = buf + len - 1; + sa_attr_type_t *sa_table; + sa_handle_t *hdl; + dmu_buf_t *db; + int error; + + *path = '\0'; + + error = zfs_sa_setup(osp, &sa_table); + if (error != 0) + return (error); + + error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG); + if (error != 0) + return (error); + + error = zfs_obj_to_stats_impl(hdl, sa_table, sb); + if (error != 0) { + zfs_release_sa_handle(hdl, db, FTAG); + return (error); + } + + error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len); + + zfs_release_sa_handle(hdl, db, FTAG); + return (error); +} + +#ifdef _KERNEL +int +zfs_znode_parent_and_name(znode_t *zp, znode_t **dzpp, char *buf) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + uint64_t parent; + int is_xattrdir; + int err; + + /* Extended attributes should not be visible as regular files. */ + if ((zp->z_pflags & ZFS_XATTR) != 0) + return (SET_ERROR(EINVAL)); + + err = zfs_obj_to_pobj(zfsvfs->z_os, zp->z_sa_hdl, zfsvfs->z_attr_table, + &parent, &is_xattrdir); + if (err != 0) + return (err); + ASSERT0(is_xattrdir); + + /* No name as this is a root object. */ + if (parent == zp->z_id) + return (SET_ERROR(EINVAL)); + + err = zap_value_search(zfsvfs->z_os, parent, zp->z_id, + ZFS_DIRENT_OBJ(-1ULL), buf); + if (err != 0) + return (err); + err = zfs_zget(zfsvfs, parent, dzpp); + return (err); +} +#endif /* _KERNEL */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c new file mode 100644 index 000000000000..f31ac23ce703 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c @@ -0,0 +1,3318 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. 
+ * Copyright (c) 2014 Integros [integros.com] + */ + +/* Portions Copyright 2010 Robert Milkowski */ + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/spa_impl.h> +#include <sys/dmu.h> +#include <sys/zap.h> +#include <sys/arc.h> +#include <sys/stat.h> +#include <sys/resource.h> +#include <sys/zil.h> +#include <sys/zil_impl.h> +#include <sys/dsl_dataset.h> +#include <sys/vdev_impl.h> +#include <sys/dmu_tx.h> +#include <sys/dsl_pool.h> +#include <sys/abd.h> + +/* + * The ZFS Intent Log (ZIL) saves "transaction records" (itxs) of system + * calls that change the file system. Each itx has enough information to + * be able to replay it after a system crash, power loss, or + * equivalent failure mode. These are stored in memory until either: + * + * 1. they are committed to the pool by the DMU transaction group + * (txg), at which point they can be discarded; or + * 2. they are committed to the on-disk ZIL for the dataset being + * modified (e.g. due to an fsync, O_DSYNC, or other synchronous + * requirement). + * + * In the event of a crash or power loss, the itxs contained by each + * dataset's on-disk ZIL will be replayed when that dataset is first + * instantiated (e.g. if the dataset is a normal filesystem, when it is + * first mounted). + * + * As hinted at above, there is one ZIL per dataset (both the in-memory + * representation, and the on-disk representation). The on-disk format + * consists of 3 parts: + * + * - a single, per-dataset, ZIL header; which points to a chain of + * - zero or more ZIL blocks; each of which contains + * - zero or more ZIL records + * + * A ZIL record holds the information necessary to replay a single + * system call transaction. A ZIL block can hold many ZIL records, and + * the blocks are chained together, similarly to a singly linked list. + * + * Each ZIL block contains a block pointer (blkptr_t) to the next ZIL + * block in the chain, and the ZIL header points to the first block in + * the chain. + * + * Note, there is not a fixed place in the pool to hold these ZIL + * blocks; they are dynamically allocated and freed as needed from the + * blocks available on the pool, though they can be preferentially + * allocated from a dedicated "log" vdev. + */ + +/* + * This controls the amount of time that a ZIL block (lwb) will remain + * "open" when it isn't "full", and it has a thread waiting for it to be + * committed to stable storage. Please refer to the zil_commit_waiter() + * function (and the comments within it) for more details. + */ +int zfs_commit_timeout_pct = 5; + +/* + * Disable intent logging replay. This global ZIL switch affects all pools. + */ +int zil_replay_disable = 0; +SYSCTL_DECL(_vfs_zfs); +SYSCTL_INT(_vfs_zfs, OID_AUTO, zil_replay_disable, CTLFLAG_RWTUN, + &zil_replay_disable, 0, "Disable intent logging replay"); + +/* + * Tunable parameter for debugging or performance analysis. Setting + * zfs_nocacheflush will cause corruption on power loss if a volatile + * out-of-order write cache is enabled. + */ +boolean_t zfs_nocacheflush = B_FALSE; +SYSCTL_INT(_vfs_zfs, OID_AUTO, cache_flush_disable, CTLFLAG_RWTUN, + &zfs_nocacheflush, 0, "Disable cache flush"); +boolean_t zfs_trim_enabled = B_TRUE; +SYSCTL_DECL(_vfs_zfs_trim); +SYSCTL_INT(_vfs_zfs_trim, OID_AUTO, enabled, CTLFLAG_RDTUN, &zfs_trim_enabled, 0, + "Enable ZFS TRIM"); + +/* + * Limit SLOG write size per commit executed with synchronous priority.
+ * Any writes above that will be executed with lower (asynchronous) priority + * to limit potential SLOG device abuse by single active ZIL writer. + */ +uint64_t zil_slog_bulk = 768 * 1024; +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, zil_slog_bulk, CTLFLAG_RWTUN, + &zil_slog_bulk, 0, "Maximal SLOG commit size with sync priority"); + +static kmem_cache_t *zil_lwb_cache; +static kmem_cache_t *zil_zcw_cache; + +#define LWB_EMPTY(lwb) ((BP_GET_LSIZE(&lwb->lwb_blk) - \ + sizeof (zil_chain_t)) == (lwb->lwb_sz - lwb->lwb_nused)) + +static int +zil_bp_compare(const void *x1, const void *x2) +{ + const dva_t *dva1 = &((zil_bp_node_t *)x1)->zn_dva; + const dva_t *dva2 = &((zil_bp_node_t *)x2)->zn_dva; + + int cmp = AVL_CMP(DVA_GET_VDEV(dva1), DVA_GET_VDEV(dva2)); + if (likely(cmp)) + return (cmp); + + return (AVL_CMP(DVA_GET_OFFSET(dva1), DVA_GET_OFFSET(dva2))); +} + +static void +zil_bp_tree_init(zilog_t *zilog) +{ + avl_create(&zilog->zl_bp_tree, zil_bp_compare, + sizeof (zil_bp_node_t), offsetof(zil_bp_node_t, zn_node)); +} + +static void +zil_bp_tree_fini(zilog_t *zilog) +{ + avl_tree_t *t = &zilog->zl_bp_tree; + zil_bp_node_t *zn; + void *cookie = NULL; + + while ((zn = avl_destroy_nodes(t, &cookie)) != NULL) + kmem_free(zn, sizeof (zil_bp_node_t)); + + avl_destroy(t); +} + +int +zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp) +{ + avl_tree_t *t = &zilog->zl_bp_tree; + const dva_t *dva; + zil_bp_node_t *zn; + avl_index_t where; + + if (BP_IS_EMBEDDED(bp)) + return (0); + + dva = BP_IDENTITY(bp); + + if (avl_find(t, dva, &where) != NULL) + return (SET_ERROR(EEXIST)); + + zn = kmem_alloc(sizeof (zil_bp_node_t), KM_SLEEP); + zn->zn_dva = *dva; + avl_insert(t, zn, where); + + return (0); +} + +static zil_header_t * +zil_header_in_syncing_context(zilog_t *zilog) +{ + return ((zil_header_t *)zilog->zl_header); +} + +static void +zil_init_log_chain(zilog_t *zilog, blkptr_t *bp) +{ + zio_cksum_t *zc = &bp->blk_cksum; + + zc->zc_word[ZIL_ZC_GUID_0] = spa_get_random(-1ULL); + zc->zc_word[ZIL_ZC_GUID_1] = spa_get_random(-1ULL); + zc->zc_word[ZIL_ZC_OBJSET] = dmu_objset_id(zilog->zl_os); + zc->zc_word[ZIL_ZC_SEQ] = 1ULL; +} + +/* + * Read a log block and make sure it's valid. + */ +static int +zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst, + char **end) +{ + enum zio_flag zio_flags = ZIO_FLAG_CANFAIL; + arc_flags_t aflags = ARC_FLAG_WAIT; + arc_buf_t *abuf = NULL; + zbookmark_phys_t zb; + int error; + + if (zilog->zl_header->zh_claim_txg == 0) + zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB; + + if (!(zilog->zl_header->zh_flags & ZIL_CLAIM_LR_SEQ_VALID)) + zio_flags |= ZIO_FLAG_SPECULATIVE; + + SET_BOOKMARK(&zb, bp->blk_cksum.zc_word[ZIL_ZC_OBJSET], + ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); + + error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf, + ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); + + if (error == 0) { + zio_cksum_t cksum = bp->blk_cksum; + + /* + * Validate the checksummed log block. + * + * Sequence numbers should be... sequential. The checksum + * verifier for the next block should be bp's checksum plus 1. + * + * Also check the log chain linkage and size used. 
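+ * + * For example (illustrative numbers): if the block at bp carries + * ZIL_ZC_SEQ == 7, then the next-block pointer embedded in it + * (zc_next_blk) must carry ZIL_ZC_SEQ == 8; the bcmp() below treats + * any mismatch as ECKSUM, i.e. the end of the chain.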
+ */ + cksum.zc_word[ZIL_ZC_SEQ]++; + + if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) { + zil_chain_t *zilc = abuf->b_data; + char *lr = (char *)(zilc + 1); + uint64_t len = zilc->zc_nused - sizeof (zil_chain_t); + + if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum, + sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk)) { + error = SET_ERROR(ECKSUM); + } else { + ASSERT3U(len, <=, SPA_OLD_MAXBLOCKSIZE); + bcopy(lr, dst, len); + *end = (char *)dst + len; + *nbp = zilc->zc_next_blk; + } + } else { + char *lr = abuf->b_data; + uint64_t size = BP_GET_LSIZE(bp); + zil_chain_t *zilc = (zil_chain_t *)(lr + size) - 1; + + if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum, + sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk) || + (zilc->zc_nused > (size - sizeof (*zilc)))) { + error = SET_ERROR(ECKSUM); + } else { + ASSERT3U(zilc->zc_nused, <=, + SPA_OLD_MAXBLOCKSIZE); + bcopy(lr, dst, zilc->zc_nused); + *end = (char *)dst + zilc->zc_nused; + *nbp = zilc->zc_next_blk; + } + } + + arc_buf_destroy(abuf, &abuf); + } + + return (error); +} + +/* + * Read a TX_WRITE log data block. + */ +static int +zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf) +{ + enum zio_flag zio_flags = ZIO_FLAG_CANFAIL; + const blkptr_t *bp = &lr->lr_blkptr; + arc_flags_t aflags = ARC_FLAG_WAIT; + arc_buf_t *abuf = NULL; + zbookmark_phys_t zb; + int error; + + if (BP_IS_HOLE(bp)) { + if (wbuf != NULL) + bzero(wbuf, MAX(BP_GET_LSIZE(bp), lr->lr_length)); + return (0); + } + + if (zilog->zl_header->zh_claim_txg == 0) + zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB; + + SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os), lr->lr_foid, + ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp)); + + error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf, + ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); + + if (error == 0) { + if (wbuf != NULL) + bcopy(abuf->b_data, wbuf, arc_buf_size(abuf)); + arc_buf_destroy(abuf, &abuf); + } + + return (error); +} + +/* + * Parse the intent log, and call parse_func for each valid record within. + */ +int +zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, + zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg) +{ + const zil_header_t *zh = zilog->zl_header; + boolean_t claimed = !!zh->zh_claim_txg; + uint64_t claim_blk_seq = claimed ? zh->zh_claim_blk_seq : UINT64_MAX; + uint64_t claim_lr_seq = claimed ? zh->zh_claim_lr_seq : UINT64_MAX; + uint64_t max_blk_seq = 0; + uint64_t max_lr_seq = 0; + uint64_t blk_count = 0; + uint64_t lr_count = 0; + blkptr_t blk, next_blk; + char *lrbuf, *lrp; + int error = 0; + + /* + * Old logs didn't record the maximum zh_claim_lr_seq. + */ + if (!(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID)) + claim_lr_seq = UINT64_MAX; + + /* + * Starting at the block pointed to by zh_log we read the log chain. + * For each block in the chain we strongly check that block to + * ensure its validity. We stop when an invalid block is found. + * For each block pointer in the chain we call parse_blk_func(). + * For each record in each valid block we call parse_lr_func(). + * If the log has been claimed, stop if we encounter a sequence + * number greater than the highest claimed sequence number. 
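+ * + * For example (hypothetical values): with zh_claim_blk_seq == 3, blocks + * with sequence numbers 1..3 are parsed, and a block carrying sequence + * 4, written after the claim, terminates the walk.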
+ */ + lrbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE); + zil_bp_tree_init(zilog); + + for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) { + uint64_t blk_seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ]; + int reclen; + char *end; + + if (blk_seq > claim_blk_seq) + break; + if ((error = parse_blk_func(zilog, &blk, arg, txg)) != 0) + break; + ASSERT3U(max_blk_seq, <, blk_seq); + max_blk_seq = blk_seq; + blk_count++; + + if (max_lr_seq == claim_lr_seq && max_blk_seq == claim_blk_seq) + break; + + error = zil_read_log_block(zilog, &blk, &next_blk, lrbuf, &end); + if (error != 0) + break; + + for (lrp = lrbuf; lrp < end; lrp += reclen) { + lr_t *lr = (lr_t *)lrp; + reclen = lr->lrc_reclen; + ASSERT3U(reclen, >=, sizeof (lr_t)); + if (lr->lrc_seq > claim_lr_seq) + goto done; + if ((error = parse_lr_func(zilog, lr, arg, txg)) != 0) + goto done; + ASSERT3U(max_lr_seq, <, lr->lrc_seq); + max_lr_seq = lr->lrc_seq; + lr_count++; + } + } +done: + zilog->zl_parse_error = error; + zilog->zl_parse_blk_seq = max_blk_seq; + zilog->zl_parse_lr_seq = max_lr_seq; + zilog->zl_parse_blk_count = blk_count; + zilog->zl_parse_lr_count = lr_count; + + ASSERT(!claimed || !(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID) || + (max_blk_seq == claim_blk_seq && max_lr_seq == claim_lr_seq)); + + zil_bp_tree_fini(zilog); + zio_buf_free(lrbuf, SPA_OLD_MAXBLOCKSIZE); + + return (error); +} + +/* ARGSUSED */ +static int +zil_clear_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg) +{ + ASSERT(!BP_IS_HOLE(bp)); + + /* + * As we call this function from the context of a rewind to a + * checkpoint, each ZIL block whose txg is later than the txg + * that we rewind to is invalid. Thus, we return -1 so + * zil_parse() doesn't attempt to read it. + */ + if (bp->blk_birth >= first_txg) + return (-1); + + if (zil_bp_tree_add(zilog, bp) != 0) + return (0); + + zio_free(zilog->zl_spa, first_txg, bp); + return (0); +} + +/* ARGSUSED */ +static int +zil_noop_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg) +{ + return (0); +} + +static int +zil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg) +{ + /* + * Claim log block if not already committed and not already claimed. + * If tx == NULL, just verify that the block is claimable. + */ + if (BP_IS_HOLE(bp) || bp->blk_birth < first_txg || + zil_bp_tree_add(zilog, bp) != 0) + return (0); + + return (zio_wait(zio_claim(NULL, zilog->zl_spa, + tx == NULL ? 0 : first_txg, bp, spa_claim_notify, NULL, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB))); +} + +static int +zil_claim_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg) +{ + lr_write_t *lr = (lr_write_t *)lrc; + int error; + + if (lrc->lrc_txtype != TX_WRITE) + return (0); + + /* + * If the block is not readable, don't claim it. This can happen + * in normal operation when a log block is written to disk before + * some of the dmu_sync() blocks it points to. In this case, the + * transaction cannot have been committed to anyone (we would have + * waited for all writes to be stable first), so it is semantically + * correct to declare this the end of the log. 
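+ * + * For instance (a hypothetical crash scenario): a log block holding a + * TX_WRITE record can reach disk while the dmu_sync() data block it + * points at does not; since no fsync() could have returned success in + * that window, ending replay at the unreadable record loses nothing + * that was ever acknowledged to an application.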
+ */ + if (lr->lr_blkptr.blk_birth >= first_txg && + (error = zil_read_log_data(zilog, lr, NULL)) != 0) + return (error); + return (zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg)); +} + +/* ARGSUSED */ +static int +zil_free_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t claim_txg) +{ + zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp); + + return (0); +} + +static int +zil_free_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t claim_txg) +{ + lr_write_t *lr = (lr_write_t *)lrc; + blkptr_t *bp = &lr->lr_blkptr; + + /* + * If we previously claimed it, we need to free it. + */ + if (claim_txg != 0 && lrc->lrc_txtype == TX_WRITE && + bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0 && + !BP_IS_HOLE(bp)) + zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp); + + return (0); +} + +static int +zil_lwb_vdev_compare(const void *x1, const void *x2) +{ + const uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev; + const uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev; + + return (AVL_CMP(v1, v2)); +} + +static lwb_t * +zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, boolean_t slog, uint64_t txg) +{ + lwb_t *lwb; + + lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP); + lwb->lwb_zilog = zilog; + lwb->lwb_blk = *bp; + lwb->lwb_slog = slog; + lwb->lwb_state = LWB_STATE_CLOSED; + lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp)); + lwb->lwb_max_txg = txg; + lwb->lwb_write_zio = NULL; + lwb->lwb_root_zio = NULL; + lwb->lwb_tx = NULL; + lwb->lwb_issued_timestamp = 0; + if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) { + lwb->lwb_nused = sizeof (zil_chain_t); + lwb->lwb_sz = BP_GET_LSIZE(bp); + } else { + lwb->lwb_nused = 0; + lwb->lwb_sz = BP_GET_LSIZE(bp) - sizeof (zil_chain_t); + } + + mutex_enter(&zilog->zl_lock); + list_insert_tail(&zilog->zl_lwb_list, lwb); + mutex_exit(&zilog->zl_lock); + + ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock)); + ASSERT(avl_is_empty(&lwb->lwb_vdev_tree)); + VERIFY(list_is_empty(&lwb->lwb_waiters)); + + return (lwb); +} + +static void +zil_free_lwb(zilog_t *zilog, lwb_t *lwb) +{ + ASSERT(MUTEX_HELD(&zilog->zl_lock)); + ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock)); + VERIFY(list_is_empty(&lwb->lwb_waiters)); + ASSERT(avl_is_empty(&lwb->lwb_vdev_tree)); + ASSERT3P(lwb->lwb_write_zio, ==, NULL); + ASSERT3P(lwb->lwb_root_zio, ==, NULL); + ASSERT3U(lwb->lwb_max_txg, <=, spa_syncing_txg(zilog->zl_spa)); + ASSERT(lwb->lwb_state == LWB_STATE_CLOSED || + lwb->lwb_state == LWB_STATE_DONE); + + /* + * Clear the zilog's field to indicate this lwb is no longer + * valid, and prevent use-after-free errors. + */ + if (zilog->zl_last_lwb_opened == lwb) + zilog->zl_last_lwb_opened = NULL; + + kmem_cache_free(zil_lwb_cache, lwb); +} + +/* + * Called when we create in-memory log transactions so that we know + * to cleanup the itxs at the end of spa_sync(). + */ +void +zilog_dirty(zilog_t *zilog, uint64_t txg) +{ + dsl_pool_t *dp = zilog->zl_dmu_pool; + dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os); + + ASSERT(spa_writeable(zilog->zl_spa)); + + if (ds->ds_is_snapshot) + panic("dirtying snapshot!"); + + if (txg_list_add(&dp->dp_dirty_zilogs, zilog, txg)) { + /* up the hold count until we can be written out */ + dmu_buf_add_ref(ds->ds_dbuf, zilog); + + zilog->zl_dirty_max_txg = MAX(txg, zilog->zl_dirty_max_txg); + } +} + +/* + * Determine if the zil is dirty in the specified txg. Callers wanting to + * ensure that the dirty state does not change must hold the itxg_lock for + * the specified txg. 
Holding the lock will ensure that the zil cannot be + * dirtied (zil_itx_assign) or cleaned (zil_clean) while we check its current + * state. + */ +boolean_t +zilog_is_dirty_in_txg(zilog_t *zilog, uint64_t txg) +{ + dsl_pool_t *dp = zilog->zl_dmu_pool; + + if (txg_list_member(&dp->dp_dirty_zilogs, zilog, txg & TXG_MASK)) + return (B_TRUE); + return (B_FALSE); +} + +/* + * Determine if the zil is dirty. The zil is considered dirty if it has + * any pending itx records that have not been cleaned by zil_clean(). + */ +boolean_t +zilog_is_dirty(zilog_t *zilog) +{ + dsl_pool_t *dp = zilog->zl_dmu_pool; + + for (int t = 0; t < TXG_SIZE; t++) { + if (txg_list_member(&dp->dp_dirty_zilogs, zilog, t)) + return (B_TRUE); + } + return (B_FALSE); +} + +/* + * Create an on-disk intent log. + */ +static lwb_t * +zil_create(zilog_t *zilog) +{ + const zil_header_t *zh = zilog->zl_header; + lwb_t *lwb = NULL; + uint64_t txg = 0; + dmu_tx_t *tx = NULL; + blkptr_t blk; + int error = 0; + boolean_t slog = FALSE; + + /* + * Wait for any previous destroy to complete. + */ + txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); + + ASSERT(zh->zh_claim_txg == 0); + ASSERT(zh->zh_replay_seq == 0); + + blk = zh->zh_log; + + /* + * Allocate an initial log block if: + * - there isn't one already + * - the existing block has the wrong endianness + */ + if (BP_IS_HOLE(&blk) || BP_SHOULD_BYTESWAP(&blk)) { + tx = dmu_tx_create(zilog->zl_os); + VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); + txg = dmu_tx_get_txg(tx); + + if (!BP_IS_HOLE(&blk)) { + zio_free(zilog->zl_spa, txg, &blk); + BP_ZERO(&blk); + } + + error = zio_alloc_zil(zilog->zl_spa, + zilog->zl_os->os_dsl_dataset->ds_object, txg, &blk, NULL, + ZIL_MIN_BLKSZ, &slog); + + if (error == 0) + zil_init_log_chain(zilog, &blk); + } + + /* + * Allocate a log write block (lwb) for the first log block. + */ + if (error == 0) + lwb = zil_alloc_lwb(zilog, &blk, slog, txg); + + /* + * If we just allocated the first log block, commit our transaction + * and wait for zil_sync() to stuff the block pointer into zh_log. + * (zh is part of the MOS, so we cannot modify it in open context.) + */ + if (tx != NULL) { + dmu_tx_commit(tx); + txg_wait_synced(zilog->zl_dmu_pool, txg); + } + + ASSERT(bcmp(&blk, &zh->zh_log, sizeof (blk)) == 0); + + return (lwb); +} + +/* + * In one tx, free all log blocks and clear the log header. If keep_first + * is set, then we're replaying a log with no content. We want to keep the + * first block, however, so that the first synchronous transaction doesn't + * require a txg_wait_synced() in zil_create(). We don't need to + * txg_wait_synced() here either when keep_first is set, because both + * zil_create() and zil_destroy() will wait for any in-progress destroys + * to complete. + */ +void +zil_destroy(zilog_t *zilog, boolean_t keep_first) +{ + const zil_header_t *zh = zilog->zl_header; + lwb_t *lwb; + dmu_tx_t *tx; + uint64_t txg; + + /* + * Wait for any previous destroy to complete.
+ */ + txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); + + zilog->zl_old_header = *zh; /* debugging aid */ + + if (BP_IS_HOLE(&zh->zh_log)) + return; + + tx = dmu_tx_create(zilog->zl_os); + VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); + txg = dmu_tx_get_txg(tx); + + mutex_enter(&zilog->zl_lock); + + ASSERT3U(zilog->zl_destroy_txg, <, txg); + zilog->zl_destroy_txg = txg; + zilog->zl_keep_first = keep_first; + + if (!list_is_empty(&zilog->zl_lwb_list)) { + ASSERT(zh->zh_claim_txg == 0); + VERIFY(!keep_first); + while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) { + list_remove(&zilog->zl_lwb_list, lwb); + if (lwb->lwb_buf != NULL) + zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); + zio_free(zilog->zl_spa, txg, &lwb->lwb_blk); + zil_free_lwb(zilog, lwb); + } + } else if (!keep_first) { + zil_destroy_sync(zilog, tx); + } + mutex_exit(&zilog->zl_lock); + + dmu_tx_commit(tx); +} + +void +zil_destroy_sync(zilog_t *zilog, dmu_tx_t *tx) +{ + ASSERT(list_is_empty(&zilog->zl_lwb_list)); + (void) zil_parse(zilog, zil_free_log_block, + zil_free_log_record, tx, zilog->zl_header->zh_claim_txg); +} + +int +zil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg) +{ + dmu_tx_t *tx = txarg; + zilog_t *zilog; + uint64_t first_txg; + zil_header_t *zh; + objset_t *os; + int error; + + error = dmu_objset_own_obj(dp, ds->ds_object, + DMU_OST_ANY, B_FALSE, FTAG, &os); + if (error != 0) { + /* + * EBUSY indicates that the objset is inconsistent, in which + * case it can not have a ZIL. + */ + if (error != EBUSY) { + cmn_err(CE_WARN, "can't open objset for %llu, error %u", + (unsigned long long)ds->ds_object, error); + } + return (0); + } + + zilog = dmu_objset_zil(os); + zh = zil_header_in_syncing_context(zilog); + ASSERT3U(tx->tx_txg, ==, spa_first_txg(zilog->zl_spa)); + first_txg = spa_min_claim_txg(zilog->zl_spa); + + /* + * If the spa_log_state is not set to be cleared, check whether + * the current uberblock is a checkpoint one and if the current + * header has been claimed before moving on. + * + * If the current uberblock is a checkpointed uberblock then + * one of the following scenarios took place: + * + * 1] We are currently rewinding to the checkpoint of the pool. + * 2] We crashed in the middle of a checkpoint rewind but we + * did manage to write the checkpointed uberblock to the + * vdev labels, so when we tried to import the pool again + * the checkpointed uberblock was selected from the import + * procedure. + * + * In both cases we want to zero out all the ZIL blocks, except + * the ones that have been claimed at the time of the checkpoint + * (their zh_claim_txg != 0). The reason is that these blocks + * may be corrupted since we may have reused their locations on + * disk after we took the checkpoint. + * + * We could try to set spa_log_state to SPA_LOG_CLEAR earlier + * when we first figure out whether the current uberblock is + * checkpointed or not. Unfortunately, that would discard all + * the logs, including the ones that are claimed, and we would + * leak space. 
+ */ + if (spa_get_log_state(zilog->zl_spa) == SPA_LOG_CLEAR || + (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 && + zh->zh_claim_txg == 0)) { + if (!BP_IS_HOLE(&zh->zh_log)) { + (void) zil_parse(zilog, zil_clear_log_block, + zil_noop_log_record, tx, first_txg); + } + BP_ZERO(&zh->zh_log); + dsl_dataset_dirty(dmu_objset_ds(os), tx); + dmu_objset_disown(os, FTAG); + return (0); + } + + /* + * If we are not rewinding and opening the pool normally, then + * the min_claim_txg should be equal to the first txg of the pool. + */ + ASSERT3U(first_txg, ==, spa_first_txg(zilog->zl_spa)); + + /* + * Claim all log blocks if we haven't already done so, and remember + * the highest claimed sequence number. This ensures that if we can + * read only part of the log now (e.g. due to a missing device), + * but we can read the entire log later, we will not try to replay + * or destroy beyond the last block we successfully claimed. + */ + ASSERT3U(zh->zh_claim_txg, <=, first_txg); + if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) { + (void) zil_parse(zilog, zil_claim_log_block, + zil_claim_log_record, tx, first_txg); + zh->zh_claim_txg = first_txg; + zh->zh_claim_blk_seq = zilog->zl_parse_blk_seq; + zh->zh_claim_lr_seq = zilog->zl_parse_lr_seq; + if (zilog->zl_parse_lr_count || zilog->zl_parse_blk_count > 1) + zh->zh_flags |= ZIL_REPLAY_NEEDED; + zh->zh_flags |= ZIL_CLAIM_LR_SEQ_VALID; + dsl_dataset_dirty(dmu_objset_ds(os), tx); + } + + ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1)); + dmu_objset_disown(os, FTAG); + return (0); +} + +/* + * Check the log by walking the log chain. + * Checksum errors are ok as they indicate the end of the chain. + * Any other error (no device or read failure) returns an error. + */ +/* ARGSUSED */ +int +zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx) +{ + zilog_t *zilog; + objset_t *os; + blkptr_t *bp; + int error; + + ASSERT(tx == NULL); + + error = dmu_objset_from_ds(ds, &os); + if (error != 0) { + cmn_err(CE_WARN, "can't open objset %llu, error %d", + (unsigned long long)ds->ds_object, error); + return (0); + } + + zilog = dmu_objset_zil(os); + bp = (blkptr_t *)&zilog->zl_header->zh_log; + + if (!BP_IS_HOLE(bp)) { + vdev_t *vd; + boolean_t valid = B_TRUE; + + /* + * Check the first block and determine if it's on a log device + * which may have been removed or faulted prior to loading this + * pool. If so, there's no point in checking the rest of the + * log as its content should have already been synced to the + * pool. + */ + spa_config_enter(os->os_spa, SCL_STATE, FTAG, RW_READER); + vd = vdev_lookup_top(os->os_spa, DVA_GET_VDEV(&bp->blk_dva[0])); + if (vd->vdev_islog && vdev_is_dead(vd)) + valid = vdev_log_state_valid(vd); + spa_config_exit(os->os_spa, SCL_STATE, FTAG); + + if (!valid) + return (0); + + /* + * Check whether the current uberblock is checkpointed (e.g. + * we are rewinding) and whether the current header has been + * claimed or not. If it hasn't then skip verifying it. We + * do this because its ZIL blocks may be part of the pool's + * state before the rewind, which is no longer valid. + */ + zil_header_t *zh = zil_header_in_syncing_context(zilog); + if (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 && + zh->zh_claim_txg == 0) + return (0); + } + + /* + * Because tx == NULL, zil_claim_log_block() will not actually claim + * any blocks, but just determine whether it is possible to do so. 
+ * In addition to checking the log chain, zil_claim_log_block() + * will invoke zio_claim() with a done func of spa_claim_notify(), + * which will update spa_max_claim_txg. See spa_load() for details. + */ + error = zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx, + zilog->zl_header->zh_claim_txg ? -1ULL : + spa_min_claim_txg(os->os_spa)); + + return ((error == ECKSUM || error == ENOENT) ? 0 : error); +} + +/* + * When an itx is "skipped", this function is used to properly mark the + * waiter as "done", and signal any thread(s) waiting on it. An itx can + * be skipped (and not committed to an lwb) for a variety of reasons, + * one of them being that the itx was committed via spa_sync(), prior to + * it being committed to an lwb; this can happen if a thread calling + * zil_commit() is racing with spa_sync(). + */ +static void +zil_commit_waiter_skip(zil_commit_waiter_t *zcw) +{ + mutex_enter(&zcw->zcw_lock); + ASSERT3B(zcw->zcw_done, ==, B_FALSE); + zcw->zcw_done = B_TRUE; + cv_broadcast(&zcw->zcw_cv); + mutex_exit(&zcw->zcw_lock); +} + +/* + * This function is used when the given waiter is to be linked into an + * lwb's "lwb_waiters" list; i.e. when the itx is committed to the lwb. + * At this point, the waiter will no longer be referenced by the itx, + * and instead, will be referenced by the lwb. + */ +static void +zil_commit_waiter_link_lwb(zil_commit_waiter_t *zcw, lwb_t *lwb) +{ + /* + * The lwb_waiters field of the lwb is protected by the zilog's + * zl_lock, thus it must be held when calling this function. + */ + ASSERT(MUTEX_HELD(&lwb->lwb_zilog->zl_lock)); + + mutex_enter(&zcw->zcw_lock); + ASSERT(!list_link_active(&zcw->zcw_node)); + ASSERT3P(zcw->zcw_lwb, ==, NULL); + ASSERT3P(lwb, !=, NULL); + ASSERT(lwb->lwb_state == LWB_STATE_OPENED || + lwb->lwb_state == LWB_STATE_ISSUED); + + list_insert_tail(&lwb->lwb_waiters, zcw); + zcw->zcw_lwb = lwb; + mutex_exit(&zcw->zcw_lock); +} + +/* + * This function is used when zio_alloc_zil() fails to allocate a ZIL + * block, and the given waiter must be linked to the "nolwb waiters" + * list inside of zil_process_commit_list(). + */ +static void +zil_commit_waiter_link_nolwb(zil_commit_waiter_t *zcw, list_t *nolwb) +{ + mutex_enter(&zcw->zcw_lock); + ASSERT(!list_link_active(&zcw->zcw_node)); + ASSERT3P(zcw->zcw_lwb, ==, NULL); + list_insert_tail(nolwb, zcw); + mutex_exit(&zcw->zcw_lock); +} + +void +zil_lwb_add_block(lwb_t *lwb, const blkptr_t *bp) +{ + avl_tree_t *t = &lwb->lwb_vdev_tree; + avl_index_t where; + zil_vdev_node_t *zv, zvsearch; + int ndvas = BP_GET_NDVAS(bp); + int i; + + if (zfs_nocacheflush) + return; + + mutex_enter(&lwb->lwb_vdev_lock); + for (i = 0; i < ndvas; i++) { + zvsearch.zv_vdev = DVA_GET_VDEV(&bp->blk_dva[i]); + if (avl_find(t, &zvsearch, &where) == NULL) { + zv = kmem_alloc(sizeof (*zv), KM_SLEEP); + zv->zv_vdev = zvsearch.zv_vdev; + avl_insert(t, zv, where); + } + } + mutex_exit(&lwb->lwb_vdev_lock); +} + +void +zil_lwb_add_txg(lwb_t *lwb, uint64_t txg) +{ + lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg); +} + +/* + * This function is called after all VDEVs associated with a given lwb + * write have completed their DKIOCFLUSHWRITECACHE command; or as soon + * as the lwb write completes, if "zfs_nocacheflush" is set. + * + * The intention is for this function to be called as soon as the + * contents of an lwb are considered "stable" on disk, and will survive + * any sudden loss of power.
At this point, any threads waiting for the + * lwb to reach this state are signalled, and the "waiter" structures + * are marked "done". + */ +static void +zil_lwb_flush_vdevs_done(zio_t *zio) +{ + lwb_t *lwb = zio->io_private; + zilog_t *zilog = lwb->lwb_zilog; + dmu_tx_t *tx = lwb->lwb_tx; + zil_commit_waiter_t *zcw; + + spa_config_exit(zilog->zl_spa, SCL_STATE, lwb); + + zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); + + mutex_enter(&zilog->zl_lock); + + /* + * Ensure the lwb buffer pointer is cleared before releasing the + * txg. If we have had an allocation failure and the txg is + * waiting to sync then we want zil_sync() to remove the lwb so + * that it's not picked up as the next new one in + * zil_process_commit_list(). zil_sync() will only remove the + * lwb if lwb_buf is null. + */ + lwb->lwb_buf = NULL; + lwb->lwb_tx = NULL; + + ASSERT3U(lwb->lwb_issued_timestamp, >, 0); + zilog->zl_last_lwb_latency = gethrtime() - lwb->lwb_issued_timestamp; + + lwb->lwb_root_zio = NULL; + lwb->lwb_state = LWB_STATE_DONE; + + if (zilog->zl_last_lwb_opened == lwb) { + /* + * Remember the highest committed log sequence number + * for ztest. We only update this value when all the log + * writes succeeded, because ztest wants to ASSERT that + * it got the whole log chain. + */ + zilog->zl_commit_lr_seq = zilog->zl_lr_seq; + } + + while ((zcw = list_head(&lwb->lwb_waiters)) != NULL) { + mutex_enter(&zcw->zcw_lock); + + ASSERT(list_link_active(&zcw->zcw_node)); + list_remove(&lwb->lwb_waiters, zcw); + + ASSERT3P(zcw->zcw_lwb, ==, lwb); + zcw->zcw_lwb = NULL; + + zcw->zcw_zio_error = zio->io_error; + + ASSERT3B(zcw->zcw_done, ==, B_FALSE); + zcw->zcw_done = B_TRUE; + cv_broadcast(&zcw->zcw_cv); + + mutex_exit(&zcw->zcw_lock); + } + + mutex_exit(&zilog->zl_lock); + + /* + * Now that we've written this log block, we have a stable pointer + * to the next block in the chain, so it's OK to let the txg in + * which we allocated the next block sync. + */ + dmu_tx_commit(tx); +} + +/* + * This is called when an lwb write completes. This means this specific + * lwb was written to disk, and all dependent lwbs have also been + * written to disk. + * + * At this point, a DKIOCFLUSHWRITECACHE command hasn't been issued to + * the VDEVs involved in writing out this specific lwb. The lwb will be + * "done" once zil_lwb_flush_vdevs_done() is called, which occurs in the + * zio completion callback for the lwb's root zio. + */ +static void +zil_lwb_write_done(zio_t *zio) +{ + lwb_t *lwb = zio->io_private; + spa_t *spa = zio->io_spa; + zilog_t *zilog = lwb->lwb_zilog; + avl_tree_t *t = &lwb->lwb_vdev_tree; + void *cookie = NULL; + zil_vdev_node_t *zv; + + ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), !=, 0); + + ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF); + ASSERT(BP_GET_TYPE(zio->io_bp) == DMU_OT_INTENT_LOG); + ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); + ASSERT(BP_GET_BYTEORDER(zio->io_bp) == ZFS_HOST_BYTEORDER); + ASSERT(!BP_IS_GANG(zio->io_bp)); + ASSERT(!BP_IS_HOLE(zio->io_bp)); + ASSERT(BP_GET_FILL(zio->io_bp) == 0); + + abd_put(zio->io_abd); + + ASSERT3S(lwb->lwb_state, ==, LWB_STATE_ISSUED); + + mutex_enter(&zilog->zl_lock); + lwb->lwb_write_zio = NULL; + mutex_exit(&zilog->zl_lock); + + if (avl_numnodes(t) == 0) + return; + + /* + * If there was an IO error, we're not going to call zio_flush() + * on these vdevs, so we simply empty the tree and free the + * nodes. We avoid calling zio_flush() since there isn't any + * good reason for doing so, after the lwb block failed to be + * written out.
+ */ + if (zio->io_error != 0) { + while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) + kmem_free(zv, sizeof (*zv)); + return; + } + + while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) { + vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev); + if (vd != NULL) + zio_flush(lwb->lwb_root_zio, vd); + kmem_free(zv, sizeof (*zv)); + } +} + +/* + * This function's purpose is to "open" an lwb such that it is ready to + * accept new itxs being committed to it. To do this, the lwb's zio + * structures are created, and linked to the lwb. This function is + * idempotent; if the passed in lwb has already been opened, this + * function is essentially a no-op. + */ +static void +zil_lwb_write_open(zilog_t *zilog, lwb_t *lwb) +{ + zbookmark_phys_t zb; + zio_priority_t prio; + + ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); + ASSERT3P(lwb, !=, NULL); + EQUIV(lwb->lwb_root_zio == NULL, lwb->lwb_state == LWB_STATE_CLOSED); + EQUIV(lwb->lwb_root_zio != NULL, lwb->lwb_state == LWB_STATE_OPENED); + + SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET], + ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, + lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]); + + if (lwb->lwb_root_zio == NULL) { + abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf, + BP_GET_LSIZE(&lwb->lwb_blk)); + + if (!lwb->lwb_slog || zilog->zl_cur_used <= zil_slog_bulk) + prio = ZIO_PRIORITY_SYNC_WRITE; + else + prio = ZIO_PRIORITY_ASYNC_WRITE; + + lwb->lwb_root_zio = zio_root(zilog->zl_spa, + zil_lwb_flush_vdevs_done, lwb, ZIO_FLAG_CANFAIL); + ASSERT3P(lwb->lwb_root_zio, !=, NULL); + + lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio, + zilog->zl_spa, 0, &lwb->lwb_blk, lwb_abd, + BP_GET_LSIZE(&lwb->lwb_blk), zil_lwb_write_done, lwb, + prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE, &zb); + ASSERT3P(lwb->lwb_write_zio, !=, NULL); + + lwb->lwb_state = LWB_STATE_OPENED; + + mutex_enter(&zilog->zl_lock); + + /* + * The zilog's "zl_last_lwb_opened" field is used to + * build the lwb/zio dependency chain, which is used to + * preserve the ordering of lwb completions that is + * required by the semantics of the ZIL. Each new lwb + * zio becomes a parent of the "previous" lwb zio, such + * that the new lwb's zio cannot complete until the + * "previous" lwb's zio completes. + * + * This is required by the semantics of zil_commit(); + * the commit waiters attached to the lwbs will be woken + * in the lwb zio's completion callback, so this zio + * dependency graph ensures the waiters are woken in the + * correct order (the same order the lwbs were created). + */ + lwb_t *last_lwb_opened = zilog->zl_last_lwb_opened; + if (last_lwb_opened != NULL && + last_lwb_opened->lwb_state != LWB_STATE_DONE) { + ASSERT(last_lwb_opened->lwb_state == LWB_STATE_OPENED || + last_lwb_opened->lwb_state == LWB_STATE_ISSUED); + ASSERT3P(last_lwb_opened->lwb_root_zio, !=, NULL); + zio_add_child(lwb->lwb_root_zio, + last_lwb_opened->lwb_root_zio); + } + zilog->zl_last_lwb_opened = lwb; + + mutex_exit(&zilog->zl_lock); + } + + ASSERT3P(lwb->lwb_root_zio, !=, NULL); + ASSERT3P(lwb->lwb_write_zio, !=, NULL); + ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); +} + +/* + * Define a limited set of intent log block sizes. + * + * These must be a multiple of 4KB. Note only the amount used (again + * aligned to 4KB) actually gets written. However, we can't always just + * allocate SPA_OLD_MAXBLOCKSIZE as the slog space could be exhausted. 
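+ * + * As a worked example (using the default sizes declared below): a + * commit needing roughly 20K of log space skips the 4K and 12K + * (8192+4096) buckets and selects the 36K (32*1024 + 4096) bucket; + * anything above 36K falls through to UINT64_MAX and is clamped to + * SPA_OLD_MAXBLOCKSIZE in zil_lwb_write_issue(). The chosen size is + * then raised to the maximum of the last ZIL_PREV_BLKS sizes, so a + * 2k, 64k, 2k, 64k request stream keeps allocating 64k blocks rather + * than bouncing between sizes.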
+ */
+uint64_t zil_block_buckets[] = {
+    4096,		/* non TX_WRITE */
+    8192+4096,		/* data base */
+    32*1024 + 4096,	/* NFS writes */
+    UINT64_MAX
+};
+
+/*
+ * Start a log block write and advance to the next log block.
+ * Calls are serialized.
+ */
+static lwb_t *
+zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb)
+{
+	lwb_t *nlwb = NULL;
+	zil_chain_t *zilc;
+	spa_t *spa = zilog->zl_spa;
+	blkptr_t *bp;
+	dmu_tx_t *tx;
+	uint64_t txg;
+	uint64_t zil_blksz, wsz;
+	int i, error;
+	boolean_t slog;
+
+	ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
+	ASSERT3P(lwb->lwb_root_zio, !=, NULL);
+	ASSERT3P(lwb->lwb_write_zio, !=, NULL);
+	ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);
+
+	if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) {
+		zilc = (zil_chain_t *)lwb->lwb_buf;
+		bp = &zilc->zc_next_blk;
+	} else {
+		zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz);
+		bp = &zilc->zc_next_blk;
+	}
+
+	ASSERT(lwb->lwb_nused <= lwb->lwb_sz);
+
+	/*
+	 * Allocate the next block and save its address in this block
+	 * before writing it in order to establish the log chain.
+	 * Note that if the allocation of nlwb synced before we wrote
+	 * the block that points at it (lwb), we'd leak it if we crashed.
+	 * Therefore, we don't do dmu_tx_commit() until zil_lwb_write_done().
+	 * We dirty the dataset to ensure that zil_sync() will be called
+	 * to clean up in the event of allocation failure or I/O failure.
+	 */
+
+	tx = dmu_tx_create(zilog->zl_os);
+
+	/*
+	 * Since we are not going to create any new dirty data, and we
+	 * can even help with clearing the existing dirty data, we
+	 * should not be subject to the dirty data based delays. We
+	 * use TXG_NOTHROTTLE to bypass the delay mechanism.
+	 */
+	VERIFY0(dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE));
+
+	dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
+	txg = dmu_tx_get_txg(tx);
+
+	lwb->lwb_tx = tx;
+
+	/*
+	 * Log blocks are pre-allocated. Here we select the size of the next
+	 * block, based on size used in the last block.
+	 * - first find the smallest bucket that will fit the block from a
+	 *   limited set of block sizes. This is because it's faster to write
+	 *   blocks allocated from the same metaslab as they are adjacent or
+	 *   close.
+	 * - next find the maximum from the new suggested size and an array of
+	 *   previous sizes. This lessens a picket fence effect of wrongly
+	 *   guessing the size if we have a stream of say 2k, 64k, 2k, 64k
+	 *   requests.
+	 *
+	 * Note we only write what is used, but we can't just allocate
+	 * the maximum block size because we can exhaust the available
+	 * pool log space.
+	 */
+	zil_blksz = zilog->zl_cur_used + sizeof (zil_chain_t);
+	for (i = 0; zil_blksz > zil_block_buckets[i]; i++)
+		continue;
+	zil_blksz = zil_block_buckets[i];
+	if (zil_blksz == UINT64_MAX)
+		zil_blksz = SPA_OLD_MAXBLOCKSIZE;
+	zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz;
+	for (i = 0; i < ZIL_PREV_BLKS; i++)
+		zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]);
+	zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1);
+
+	BP_ZERO(bp);
+
+	/* pass the old blkptr in order to spread log blocks across devs */
+	error = zio_alloc_zil(spa, zilog->zl_os->os_dsl_dataset->ds_object,
+	    txg, bp, &lwb->lwb_blk, zil_blksz, &slog);
+	if (error == 0) {
+		ASSERT3U(bp->blk_birth, ==, txg);
+		bp->blk_cksum = lwb->lwb_blk.blk_cksum;
+		bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++;
+
+		/*
+		 * Allocate a new log write block (lwb).
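+		 * The new block inherits this block's checksum, with its
+		 * ZIL_ZC_SEQ word incremented (see above); at claim and
+		 * replay time the chain is walked via each block's
+		 * embedded "next" blkptr until a block's embedded seq
+		 * fails to match, which marks the end of the valid log.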
+ */ + nlwb = zil_alloc_lwb(zilog, bp, slog, txg); + } + + if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) { + /* For Slim ZIL only write what is used. */ + wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, uint64_t); + ASSERT3U(wsz, <=, lwb->lwb_sz); + zio_shrink(lwb->lwb_write_zio, wsz); + + } else { + wsz = lwb->lwb_sz; + } + + zilc->zc_pad = 0; + zilc->zc_nused = lwb->lwb_nused; + zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum; + + /* + * clear unused data for security + */ + bzero(lwb->lwb_buf + lwb->lwb_nused, wsz - lwb->lwb_nused); + + spa_config_enter(zilog->zl_spa, SCL_STATE, lwb, RW_READER); + + zil_lwb_add_block(lwb, &lwb->lwb_blk); + lwb->lwb_issued_timestamp = gethrtime(); + lwb->lwb_state = LWB_STATE_ISSUED; + + zio_nowait(lwb->lwb_root_zio); + zio_nowait(lwb->lwb_write_zio); + + /* + * If there was an allocation failure then nlwb will be null which + * forces a txg_wait_synced(). + */ + return (nlwb); +} + +static lwb_t * +zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) +{ + lr_t *lrcb, *lrc; + lr_write_t *lrwb, *lrw; + char *lr_buf; + uint64_t dlen, dnow, lwb_sp, reclen, txg; + + ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); + ASSERT3P(lwb, !=, NULL); + ASSERT3P(lwb->lwb_buf, !=, NULL); + + zil_lwb_write_open(zilog, lwb); + + lrc = &itx->itx_lr; + lrw = (lr_write_t *)lrc; + + /* + * A commit itx doesn't represent any on-disk state; instead + * it's simply used as a place holder on the commit list, and + * provides a mechanism for attaching a "commit waiter" onto the + * correct lwb (such that the waiter can be signalled upon + * completion of that lwb). Thus, we don't process this itx's + * log record if it's a commit itx (these itx's don't have log + * records), and instead link the itx's waiter onto the lwb's + * list of waiters. + * + * For more details, see the comment above zil_commit(). + */ + if (lrc->lrc_txtype == TX_COMMIT) { + mutex_enter(&zilog->zl_lock); + zil_commit_waiter_link_lwb(itx->itx_private, lwb); + itx->itx_private = NULL; + mutex_exit(&zilog->zl_lock); + return (lwb); + } + + if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) { + dlen = P2ROUNDUP_TYPED( + lrw->lr_length, sizeof (uint64_t), uint64_t); + } else { + dlen = 0; + } + reclen = lrc->lrc_reclen; + zilog->zl_cur_used += (reclen + dlen); + txg = lrc->lrc_txg; + + ASSERT3U(zilog->zl_cur_used, <, UINT64_MAX - (reclen + dlen)); + +cont: + /* + * If this record won't fit in the current log block, start a new one. + * For WR_NEED_COPY optimize layout for minimal number of chunks. + */ + lwb_sp = lwb->lwb_sz - lwb->lwb_nused; + if (reclen > lwb_sp || (reclen + dlen > lwb_sp && + lwb_sp < ZIL_MAX_WASTE_SPACE && (dlen % ZIL_MAX_LOG_DATA == 0 || + lwb_sp < reclen + dlen % ZIL_MAX_LOG_DATA))) { + lwb = zil_lwb_write_issue(zilog, lwb); + if (lwb == NULL) + return (NULL); + zil_lwb_write_open(zilog, lwb); + ASSERT(LWB_EMPTY(lwb)); + lwb_sp = lwb->lwb_sz - lwb->lwb_nused; + ASSERT3U(reclen + MIN(dlen, sizeof (uint64_t)), <=, lwb_sp); + } + + dnow = MIN(dlen, lwb_sp - reclen); + lr_buf = lwb->lwb_buf + lwb->lwb_nused; + bcopy(lrc, lr_buf, reclen); + lrcb = (lr_t *)lr_buf; /* Like lrc, but inside lwb. */ + lrwb = (lr_write_t *)lrcb; /* Like lrw, but inside lwb. */ + + /* + * If it's a write, fetch the data or get its blkptr as appropriate. 
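+	 * Three cases are possible below: WR_COPIED, where the data was
+	 * copied into the itx when the log record was created (the
+	 * bcopy above already brought it along); WR_NEED_COPY, where
+	 * only the lr_t was copied and zl_get_data must now read the
+	 * data into the lwb buffer directly after the record; and
+	 * WR_INDIRECT, where the data remains in the main pool and
+	 * zl_get_data instead fills in the record's block pointer.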
+ */ + if (lrc->lrc_txtype == TX_WRITE) { + if (txg > spa_freeze_txg(zilog->zl_spa)) + txg_wait_synced(zilog->zl_dmu_pool, txg); + if (itx->itx_wr_state != WR_COPIED) { + char *dbuf; + int error; + + if (itx->itx_wr_state == WR_NEED_COPY) { + dbuf = lr_buf + reclen; + lrcb->lrc_reclen += dnow; + if (lrwb->lr_length > dnow) + lrwb->lr_length = dnow; + lrw->lr_offset += dnow; + lrw->lr_length -= dnow; + } else { + ASSERT(itx->itx_wr_state == WR_INDIRECT); + dbuf = NULL; + } + + /* + * We pass in the "lwb_write_zio" rather than + * "lwb_root_zio" so that the "lwb_write_zio" + * becomes the parent of any zio's created by + * the "zl_get_data" callback. The vdevs are + * flushed after the "lwb_write_zio" completes, + * so we want to make sure that completion + * callback waits for these additional zio's, + * such that the vdevs used by those zio's will + * be included in the lwb's vdev tree, and those + * vdevs will be properly flushed. If we passed + * in "lwb_root_zio" here, then these additional + * vdevs may not be flushed; e.g. if these zio's + * completed after "lwb_write_zio" completed. + */ + error = zilog->zl_get_data(itx->itx_private, + lrwb, dbuf, lwb, lwb->lwb_write_zio); + + if (error == EIO) { + txg_wait_synced(zilog->zl_dmu_pool, txg); + return (lwb); + } + if (error != 0) { + ASSERT(error == ENOENT || error == EEXIST || + error == EALREADY); + return (lwb); + } + } + } + + /* + * We're actually making an entry, so update lrc_seq to be the + * log record sequence number. Note that this is generally not + * equal to the itx sequence number because not all transactions + * are synchronous, and sometimes spa_sync() gets there first. + */ + lrcb->lrc_seq = ++zilog->zl_lr_seq; + lwb->lwb_nused += reclen + dnow; + + zil_lwb_add_txg(lwb, txg); + + ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz); + ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t))); + + dlen -= dnow; + if (dlen > 0) { + zilog->zl_cur_used += reclen; + goto cont; + } + + return (lwb); +} + +itx_t * +zil_itx_create(uint64_t txtype, size_t lrsize) +{ + itx_t *itx; + + lrsize = P2ROUNDUP_TYPED(lrsize, sizeof (uint64_t), size_t); + + itx = kmem_alloc(offsetof(itx_t, itx_lr) + lrsize, KM_SLEEP); + itx->itx_lr.lrc_txtype = txtype; + itx->itx_lr.lrc_reclen = lrsize; + itx->itx_lr.lrc_seq = 0; /* defensive */ + itx->itx_sync = B_TRUE; /* default is synchronous */ + + return (itx); +} + +void +zil_itx_destroy(itx_t *itx) +{ + kmem_free(itx, offsetof(itx_t, itx_lr) + itx->itx_lr.lrc_reclen); +} + +/* + * Free up the sync and async itxs. The itxs_t has already been detached + * so no locks are needed. + */ +static void +zil_itxg_clean(itxs_t *itxs) +{ + itx_t *itx; + list_t *list; + avl_tree_t *t; + void *cookie; + itx_async_node_t *ian; + + list = &itxs->i_sync_list; + while ((itx = list_head(list)) != NULL) { + /* + * In the general case, commit itxs will not be found + * here, as they'll be committed to an lwb via + * zil_lwb_commit(), and free'd in that function. Having + * said that, it is still possible for commit itxs to be + * found here, due to the following race: + * + * - a thread calls zil_commit() which assigns the + * commit itx to a per-txg i_sync_list + * - zil_itxg_clean() is called (e.g. via spa_sync()) + * while the waiter is still on the i_sync_list + * + * There's nothing to prevent syncing the txg while the + * waiter is on the i_sync_list. This normally doesn't + * happen because spa_sync() is slower than zil_commit(), + * but if zil_commit() calls txg_wait_synced() (e.g. 
+ * because zil_create() or zil_commit_writer_stall() is + * called) we will hit this case. + */ + if (itx->itx_lr.lrc_txtype == TX_COMMIT) + zil_commit_waiter_skip(itx->itx_private); + + list_remove(list, itx); + zil_itx_destroy(itx); + } + + cookie = NULL; + t = &itxs->i_async_tree; + while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) { + list = &ian->ia_list; + while ((itx = list_head(list)) != NULL) { + list_remove(list, itx); + /* commit itxs should never be on the async lists. */ + ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT); + zil_itx_destroy(itx); + } + list_destroy(list); + kmem_free(ian, sizeof (itx_async_node_t)); + } + avl_destroy(t); + + kmem_free(itxs, sizeof (itxs_t)); +} + +static int +zil_aitx_compare(const void *x1, const void *x2) +{ + const uint64_t o1 = ((itx_async_node_t *)x1)->ia_foid; + const uint64_t o2 = ((itx_async_node_t *)x2)->ia_foid; + + return (AVL_CMP(o1, o2)); +} + +/* + * Remove all async itx with the given oid. + */ +static void +zil_remove_async(zilog_t *zilog, uint64_t oid) +{ + uint64_t otxg, txg; + itx_async_node_t *ian; + avl_tree_t *t; + avl_index_t where; + list_t clean_list; + itx_t *itx; + + ASSERT(oid != 0); + list_create(&clean_list, sizeof (itx_t), offsetof(itx_t, itx_node)); + + if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */ + otxg = ZILTEST_TXG; + else + otxg = spa_last_synced_txg(zilog->zl_spa) + 1; + + for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) { + itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK]; + + mutex_enter(&itxg->itxg_lock); + if (itxg->itxg_txg != txg) { + mutex_exit(&itxg->itxg_lock); + continue; + } + + /* + * Locate the object node and append its list. + */ + t = &itxg->itxg_itxs->i_async_tree; + ian = avl_find(t, &oid, &where); + if (ian != NULL) + list_move_tail(&clean_list, &ian->ia_list); + mutex_exit(&itxg->itxg_lock); + } + while ((itx = list_head(&clean_list)) != NULL) { + list_remove(&clean_list, itx); + /* commit itxs should never be on the async lists. */ + ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT); + zil_itx_destroy(itx); + } + list_destroy(&clean_list); +} + +void +zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx) +{ + uint64_t txg; + itxg_t *itxg; + itxs_t *itxs, *clean = NULL; + + /* + * Object ids can be re-instantiated in the next txg so + * remove any async transactions to avoid future leaks. + * This can happen if a fsync occurs on the re-instantiated + * object for a WR_INDIRECT or WR_NEED_COPY write, which gets + * the new file data and flushes a write record for the old object. + */ + if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_REMOVE) + zil_remove_async(zilog, itx->itx_oid); + + /* + * Ensure the data of a renamed file is committed before the rename. + */ + if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_RENAME) + zil_async_to_sync(zilog, itx->itx_oid); + + if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) + txg = ZILTEST_TXG; + else + txg = dmu_tx_get_txg(tx); + + itxg = &zilog->zl_itxg[txg & TXG_MASK]; + mutex_enter(&itxg->itxg_lock); + itxs = itxg->itxg_itxs; + if (itxg->itxg_txg != txg) { + if (itxs != NULL) { + /* + * The zil_clean callback hasn't got around to cleaning + * this itxg. Save the itxs for release below. + * This should be rare. 
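+			 * For example, with TXG_SIZE of 4, an itx for
+			 * txg 16 maps to the same itxg slot as txg 12;
+			 * if the zil_clean() taskq for txg 12 hasn't
+			 * run yet, the stale itxs found here are freed
+			 * below once the lock is dropped.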
+			 */
+			zfs_dbgmsg("zil_itx_assign: missed itx cleanup for "
+			    "txg %llu", itxg->itxg_txg);
+			clean = itxg->itxg_itxs;
+		}
+		itxg->itxg_txg = txg;
+		itxs = itxg->itxg_itxs = kmem_zalloc(sizeof (itxs_t), KM_SLEEP);
+
+		list_create(&itxs->i_sync_list, sizeof (itx_t),
+		    offsetof(itx_t, itx_node));
+		avl_create(&itxs->i_async_tree, zil_aitx_compare,
+		    sizeof (itx_async_node_t),
+		    offsetof(itx_async_node_t, ia_node));
+	}
+	if (itx->itx_sync) {
+		list_insert_tail(&itxs->i_sync_list, itx);
+	} else {
+		avl_tree_t *t = &itxs->i_async_tree;
+		uint64_t foid =
+		    LR_FOID_GET_OBJ(((lr_ooo_t *)&itx->itx_lr)->lr_foid);
+		itx_async_node_t *ian;
+		avl_index_t where;
+
+		ian = avl_find(t, &foid, &where);
+		if (ian == NULL) {
+			ian = kmem_alloc(sizeof (itx_async_node_t), KM_SLEEP);
+			list_create(&ian->ia_list, sizeof (itx_t),
+			    offsetof(itx_t, itx_node));
+			ian->ia_foid = foid;
+			avl_insert(t, ian, where);
+		}
+		list_insert_tail(&ian->ia_list, itx);
+	}
+
+	itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx);
+
+	/*
+	 * We don't want to dirty the ZIL using ZILTEST_TXG, because
+	 * zil_clean() will never be called using ZILTEST_TXG. Thus, we
+	 * need to be careful to always dirty the ZIL using the "real"
+	 * TXG (not itxg_txg) even when the SPA is frozen.
+	 */
+	zilog_dirty(zilog, dmu_tx_get_txg(tx));
+	mutex_exit(&itxg->itxg_lock);
+
+	/* Release the old itxs now we've dropped the lock */
+	if (clean != NULL)
+		zil_itxg_clean(clean);
+}
+
+/*
+ * If there are any in-memory intent log transactions which have now been
+ * synced then start up a taskq to free them. We should only do this after we
+ * have written out the uberblocks (i.e. txg has been committed) so that we
+ * don't inadvertently clean out in-memory log records that would be required
+ * by zil_commit().
+ */
+void
+zil_clean(zilog_t *zilog, uint64_t synced_txg)
+{
+	itxg_t *itxg = &zilog->zl_itxg[synced_txg & TXG_MASK];
+	itxs_t *clean_me;
+
+	ASSERT3U(synced_txg, <, ZILTEST_TXG);
+
+	mutex_enter(&itxg->itxg_lock);
+	if (itxg->itxg_itxs == NULL || itxg->itxg_txg == ZILTEST_TXG) {
+		mutex_exit(&itxg->itxg_lock);
+		return;
+	}
+	ASSERT3U(itxg->itxg_txg, <=, synced_txg);
+	ASSERT3U(itxg->itxg_txg, !=, 0);
+	clean_me = itxg->itxg_itxs;
+	itxg->itxg_itxs = NULL;
+	itxg->itxg_txg = 0;
+	mutex_exit(&itxg->itxg_lock);
+	/*
+	 * Preferably start a task queue to free up the old itxs but
+	 * if taskq_dispatch can't allocate resources to do that then
+	 * free it in-line. This should be rare. Note, using TQ_SLEEP
+	 * created a bad performance problem.
+	 */
+	ASSERT3P(zilog->zl_dmu_pool, !=, NULL);
+	ASSERT3P(zilog->zl_dmu_pool->dp_zil_clean_taskq, !=, NULL);
+	if (taskq_dispatch(zilog->zl_dmu_pool->dp_zil_clean_taskq,
+	    (void (*)(void *))zil_itxg_clean, clean_me, TQ_NOSLEEP) == 0)
+		zil_itxg_clean(clean_me);
+}
+
+/*
+ * This function will traverse the queue of itxs that need to be
+ * committed, and move them onto the ZIL's zl_itx_commit_list.
+ */
+static void
+zil_get_commit_list(zilog_t *zilog)
+{
+	uint64_t otxg, txg;
+	list_t *commit_list = &zilog->zl_itx_commit_list;
+
+	ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
+
+	if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */
+		otxg = ZILTEST_TXG;
+	else
+		otxg = spa_last_synced_txg(zilog->zl_spa) + 1;
+
+	/*
+	 * This is inherently racy, since there is nothing to prevent
+	 * the last synced txg from changing. That's okay since we'll
+	 * only commit things in the future.
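+	 * For example, if otxg is computed as 10, and spa_sync() then
+	 * advances the last synced txg to 10 while we scan, we may
+	 * still move txg 10's itxs onto the commit list; writing them
+	 * to an lwb is then redundant, but harmless.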
+ */ + for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) { + itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK]; + + mutex_enter(&itxg->itxg_lock); + if (itxg->itxg_txg != txg) { + mutex_exit(&itxg->itxg_lock); + continue; + } + + /* + * If we're adding itx records to the zl_itx_commit_list, + * then the zil better be dirty in this "txg". We can assert + * that here since we're holding the itxg_lock which will + * prevent spa_sync from cleaning it. Once we add the itxs + * to the zl_itx_commit_list we must commit it to disk even + * if it's unnecessary (i.e. the txg was synced). + */ + ASSERT(zilog_is_dirty_in_txg(zilog, txg) || + spa_freeze_txg(zilog->zl_spa) != UINT64_MAX); + list_move_tail(commit_list, &itxg->itxg_itxs->i_sync_list); + + mutex_exit(&itxg->itxg_lock); + } +} + +/* + * Move the async itxs for a specified object to commit into sync lists. + */ +void +zil_async_to_sync(zilog_t *zilog, uint64_t foid) +{ + uint64_t otxg, txg; + itx_async_node_t *ian; + avl_tree_t *t; + avl_index_t where; + + if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */ + otxg = ZILTEST_TXG; + else + otxg = spa_last_synced_txg(zilog->zl_spa) + 1; + + /* + * This is inherently racy, since there is nothing to prevent + * the last synced txg from changing. + */ + for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) { + itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK]; + + mutex_enter(&itxg->itxg_lock); + if (itxg->itxg_txg != txg) { + mutex_exit(&itxg->itxg_lock); + continue; + } + + /* + * If a foid is specified then find that node and append its + * list. Otherwise walk the tree appending all the lists + * to the sync list. We add to the end rather than the + * beginning to ensure the create has happened. + */ + t = &itxg->itxg_itxs->i_async_tree; + if (foid != 0) { + ian = avl_find(t, &foid, &where); + if (ian != NULL) { + list_move_tail(&itxg->itxg_itxs->i_sync_list, + &ian->ia_list); + } + } else { + void *cookie = NULL; + + while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) { + list_move_tail(&itxg->itxg_itxs->i_sync_list, + &ian->ia_list); + list_destroy(&ian->ia_list); + kmem_free(ian, sizeof (itx_async_node_t)); + } + } + mutex_exit(&itxg->itxg_lock); + } +} + +/* + * This function will prune commit itxs that are at the head of the + * commit list (it won't prune past the first non-commit itx), and + * either: a) attach them to the last lwb that's still pending + * completion, or b) skip them altogether. + * + * This is used as a performance optimization to prevent commit itxs + * from generating new lwbs when it's unnecessary to do so. + */ +static void +zil_prune_commit_list(zilog_t *zilog) +{ + itx_t *itx; + + ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); + + while (itx = list_head(&zilog->zl_itx_commit_list)) { + lr_t *lrc = &itx->itx_lr; + if (lrc->lrc_txtype != TX_COMMIT) + break; + + mutex_enter(&zilog->zl_lock); + + lwb_t *last_lwb = zilog->zl_last_lwb_opened; + if (last_lwb == NULL || last_lwb->lwb_state == LWB_STATE_DONE) { + /* + * All of the itxs this waiter was waiting on + * must have already completed (or there were + * never any itx's for it to wait on), so it's + * safe to skip this waiter and mark it done. 
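+			 * For example, if the last opened lwb has
+			 * already reached LWB_STATE_DONE, every itx
+			 * this waiter could depend on was written and
+			 * flushed with that (or an earlier) lwb, so
+			 * skipping it avoids forcing a new, nearly
+			 * empty lwb to disk.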
+			 */
+			zil_commit_waiter_skip(itx->itx_private);
+		} else {
+			zil_commit_waiter_link_lwb(itx->itx_private, last_lwb);
+			itx->itx_private = NULL;
+		}
+
+		mutex_exit(&zilog->zl_lock);
+
+		list_remove(&zilog->zl_itx_commit_list, itx);
+		zil_itx_destroy(itx);
+	}
+
+	IMPLY(itx != NULL, itx->itx_lr.lrc_txtype != TX_COMMIT);
+}
+
+static void
+zil_commit_writer_stall(zilog_t *zilog)
+{
+	/*
+	 * When zio_alloc_zil() fails to allocate the next lwb block on
+	 * disk, we must call txg_wait_synced() to ensure all of the
+	 * lwbs in the zilog's zl_lwb_list are synced and then freed (in
+	 * zil_sync()), such that any subsequent ZIL writer (i.e. a call
+	 * to zil_process_commit_list()) will have to call zil_create(),
+	 * and start a new ZIL chain.
+	 *
+	 * Since zio_alloc_zil() failed, the lwb that was previously
+	 * issued does not have a pointer to the "next" lwb on disk.
+	 * Thus, if another ZIL writer thread were to allocate the "next"
+	 * on-disk lwb, that block could be leaked in the event of a
+	 * crash (because the previous lwb on-disk would not point to
+	 * it).
+	 *
+	 * We must hold the zilog's zl_issuer_lock while we do this, to
+	 * ensure no new threads enter zil_process_commit_list() until
+	 * all lwb's in the zl_lwb_list have been synced and freed
+	 * (which is achieved via the txg_wait_synced() call).
+	 */
+	ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
+	txg_wait_synced(zilog->zl_dmu_pool, 0);
+	ASSERT3P(list_tail(&zilog->zl_lwb_list), ==, NULL);
+}
+
+/*
+ * This function will traverse the commit list, creating new lwbs as
+ * needed, and committing the itxs from the commit list to these newly
+ * created lwbs. Additionally, as a new lwb is created, the previous
+ * lwb will be issued to the zio layer to be written to disk.
+ */
+static void
+zil_process_commit_list(zilog_t *zilog)
+{
+	spa_t *spa = zilog->zl_spa;
+	list_t nolwb_waiters;
+	lwb_t *lwb;
+	itx_t *itx;
+
+	ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
+
+	/*
+	 * Return if there's nothing to commit before we dirty the fs by
+	 * calling zil_create().
+	 */
+	if (list_head(&zilog->zl_itx_commit_list) == NULL)
+		return;
+
+	list_create(&nolwb_waiters, sizeof (zil_commit_waiter_t),
+	    offsetof(zil_commit_waiter_t, zcw_node));
+
+	lwb = list_tail(&zilog->zl_lwb_list);
+	if (lwb == NULL) {
+		lwb = zil_create(zilog);
+	} else {
+		ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED);
+		ASSERT3S(lwb->lwb_state, !=, LWB_STATE_DONE);
+	}
+
+	while (itx = list_head(&zilog->zl_itx_commit_list)) {
+		lr_t *lrc = &itx->itx_lr;
+		uint64_t txg = lrc->lrc_txg;
+
+		ASSERT3U(txg, !=, 0);
+
+		if (lrc->lrc_txtype == TX_COMMIT) {
+			DTRACE_PROBE2(zil__process__commit__itx,
+			    zilog_t *, zilog, itx_t *, itx);
+		} else {
+			DTRACE_PROBE2(zil__process__normal__itx,
+			    zilog_t *, zilog, itx_t *, itx);
+		}
+
+		boolean_t synced = txg <= spa_last_synced_txg(spa);
+		boolean_t frozen = txg > spa_freeze_txg(spa);
+
+		/*
+		 * If the txg of this itx has already been synced out, then
+		 * we don't need to commit this itx to an lwb. This is
+		 * because the data of this itx will have already been
+		 * written to the main pool. This is inherently racy, and
+		 * it's still ok to commit an itx whose txg has already
+		 * been synced; this will result in a write that's
+		 * unnecessary, but will do no harm.
+		 *
+		 * With that said, we always want to commit TX_COMMIT itxs
+		 * to an lwb, regardless of whether or not that itx's txg
+		 * has been synced out. We do this to ensure any OPENED lwb
+		 * will always have at least one zil_commit_waiter_t linked
+		 * to the lwb.
+		 *
+		 * As a counter-example, if we skipped TX_COMMIT itx's
+		 * whose txg had already been synced, the following
+		 * situation could occur if we happened to be racing with
+		 * spa_sync:
+		 *
+		 * 1. we commit a non-TX_COMMIT itx to an lwb, where the
+		 *    itx's txg is 10 and the last synced txg is 9.
+		 * 2. spa_sync finishes syncing out txg 10.
+		 * 3. we move to the next itx in the list, it's a TX_COMMIT
+		 *    whose txg is 10, so we skip it rather than committing
+		 *    it to the lwb used in (1).
+		 *
+		 * If the itx that is skipped in (3) is the last TX_COMMIT
+		 * itx in the commit list, then it's possible for the lwb
+		 * used in (1) to remain in the OPENED state indefinitely.
+		 *
+		 * To prevent the above scenario from occurring, ensuring
+		 * that once an lwb is OPENED it will transition to ISSUED
+		 * and eventually DONE, we always commit TX_COMMIT itx's to
+		 * an lwb here, even if that itx's txg has already been
+		 * synced.
+		 *
+		 * Finally, if the pool is frozen, we _always_ commit the
+		 * itx. The point of freezing the pool is to prevent data
+		 * from being written to the main pool via spa_sync, and
+		 * instead rely solely on the ZIL to persistently store the
+		 * data; i.e. when the pool is frozen, the last synced txg
+		 * value can't be trusted.
+		 */
+		if (frozen || !synced || lrc->lrc_txtype == TX_COMMIT) {
+			if (lwb != NULL) {
+				lwb = zil_lwb_commit(zilog, itx, lwb);
+			} else if (lrc->lrc_txtype == TX_COMMIT) {
+				ASSERT3P(lwb, ==, NULL);
+				zil_commit_waiter_link_nolwb(
+				    itx->itx_private, &nolwb_waiters);
+			}
+		}
+
+		list_remove(&zilog->zl_itx_commit_list, itx);
+		zil_itx_destroy(itx);
+	}
+
+	if (lwb == NULL) {
+		/*
+		 * This indicates zio_alloc_zil() failed to allocate the
+		 * "next" lwb on-disk. When this happens, we must stall
+		 * the ZIL write pipeline; see the comment within
+		 * zil_commit_writer_stall() for more details.
+		 */
+		zil_commit_writer_stall(zilog);
+
+		/*
+		 * Additionally, we have to signal and mark the "nolwb"
+		 * waiters as "done" here, since without an lwb, we
+		 * can't do this via zil_lwb_flush_vdevs_done() like
+		 * normal.
+		 */
+		zil_commit_waiter_t *zcw;
+		while (zcw = list_head(&nolwb_waiters)) {
+			zil_commit_waiter_skip(zcw);
+			list_remove(&nolwb_waiters, zcw);
+		}
+	} else {
+		ASSERT(list_is_empty(&nolwb_waiters));
+		ASSERT3P(lwb, !=, NULL);
+		ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED);
+		ASSERT3S(lwb->lwb_state, !=, LWB_STATE_DONE);
+
+		/*
+		 * At this point, the ZIL block pointed at by the "lwb"
+		 * variable is in one of the following states: "closed"
+		 * or "open".
+		 *
+		 * If it's "closed", then no itxs have been committed to
+		 * it, so there's no point in issuing its zio (i.e.
+		 * it's "empty").
+		 *
+		 * If it's in the "open" state, then it contains one or
+		 * more itxs that eventually need to be committed to stable
+		 * storage. In this case we intentionally do not issue
+		 * the lwb's zio to disk yet, and instead rely on one of
+		 * the following two mechanisms for issuing the zio:
+		 *
+		 * 1. Ideally, there will be more ZIL activity occurring
+		 *    on the system, such that this function will be
+		 *    immediately called again (not necessarily by the same
+		 *    thread) and this lwb's zio will be issued via
+		 *    zil_lwb_commit(). This way, the lwb is guaranteed to
+		 *    be "full" when it is issued to disk, and we'll make
+		 *    use of the lwb's size the best we can.
+		 *
+		 * 2. If there isn't sufficient ZIL activity occurring on
+		 *    the system, such that this lwb's zio isn't issued via
+		 *    zil_lwb_commit(), zil_commit_waiter() will issue the
+		 *    lwb's zio.
+		 *    If this occurs, the lwb is not guaranteed
+		 *    to be "full" by the time its zio is issued, which means
+		 *    the size of the lwb was "too large" given the amount
+		 *    of ZIL activity occurring on the system at that time.
+		 *
+		 * We do this for a couple of reasons:
+		 *
+		 * 1. To try and reduce the number of IOPs needed to
+		 *    write the same number of itxs. If an lwb has space
+		 *    available in its buffer for more itxs, and more itxs
+		 *    will be committed relatively soon (relative to the
+		 *    latency of performing a write), then it's beneficial
+		 *    to wait for these "next" itxs. This way, more itxs
+		 *    can be committed to stable storage with fewer writes.
+		 *
+		 * 2. To try and use the largest lwb block size that the
+		 *    incoming rate of itxs can support. Again, this is to
+		 *    try and pack as many itxs into as few lwbs as
+		 *    possible, without significantly impacting the latency
+		 *    of each individual itx.
+		 */
+	}
+}
+
+/*
+ * This function is responsible for ensuring the passed in commit waiter
+ * (and associated commit itx) is committed to an lwb. If the waiter is
+ * not already committed to an lwb, all itxs in the zilog's queue of
+ * itxs will be processed. The assumption is the passed in waiter's
+ * commit itx will be found in the queue just like the other non-commit
+ * itxs, such that when the entire queue is processed, the waiter will
+ * have been committed to an lwb.
+ *
+ * The lwb associated with the passed in waiter is not guaranteed to
+ * have been issued by the time this function completes. If the lwb is
+ * not issued, we rely on future calls to zil_commit_writer() to issue
+ * the lwb, or the timeout mechanism found in zil_commit_waiter().
+ */
+static void
+zil_commit_writer(zilog_t *zilog, zil_commit_waiter_t *zcw)
+{
+	ASSERT(!MUTEX_HELD(&zilog->zl_lock));
+	ASSERT(spa_writeable(zilog->zl_spa));
+
+	mutex_enter(&zilog->zl_issuer_lock);
+
+	if (zcw->zcw_lwb != NULL || zcw->zcw_done) {
+		/*
+		 * It's possible that, while we were waiting to acquire
+		 * the "zl_issuer_lock", another thread committed this
+		 * waiter to an lwb. If that occurs, we bail out early,
+		 * without processing any of the zilog's queue of itxs.
+		 *
+		 * On certain workloads and system configurations, the
+		 * "zl_issuer_lock" can become highly contended. In an
+		 * attempt to reduce this contention, we immediately drop
+		 * the lock if the waiter has already been processed.
+		 *
+		 * We've measured this optimization to reduce CPU spent
+		 * contending on this lock by up to 5%, using a system
+		 * with 32 CPUs, low latency storage (~50 usec writes),
+		 * and 1024 threads performing sync writes.
+		 */
+		goto out;
+	}
+
+	zil_get_commit_list(zilog);
+	zil_prune_commit_list(zilog);
+	zil_process_commit_list(zilog);
+
+out:
+	mutex_exit(&zilog->zl_issuer_lock);
+}
+
+static void
+zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw)
+{
+	ASSERT(!MUTEX_HELD(&zilog->zl_issuer_lock));
+	ASSERT(MUTEX_HELD(&zcw->zcw_lock));
+	ASSERT3B(zcw->zcw_done, ==, B_FALSE);
+
+	lwb_t *lwb = zcw->zcw_lwb;
+	ASSERT3P(lwb, !=, NULL);
+	ASSERT3S(lwb->lwb_state, !=, LWB_STATE_CLOSED);
+
+	/*
+	 * If the lwb has already been issued by another thread, we can
+	 * immediately return since there's no work to be done (the
+	 * point of this function is to issue the lwb). Additionally, we
+	 * do this prior to acquiring the zl_issuer_lock, to avoid
+	 * acquiring it when it's not necessary to do so.
+	 */
+	if (lwb->lwb_state == LWB_STATE_ISSUED ||
+	    lwb->lwb_state == LWB_STATE_DONE)
+		return;
+
+	/*
+	 * In order to call zil_lwb_write_issue() we must hold the
+	 * zilog's "zl_issuer_lock". We can't simply acquire that lock,
+	 * since we're already holding the commit waiter's "zcw_lock",
+	 * and those two locks are acquired in the opposite order
+	 * elsewhere.
+	 */
+	mutex_exit(&zcw->zcw_lock);
+	mutex_enter(&zilog->zl_issuer_lock);
+	mutex_enter(&zcw->zcw_lock);
+
+	/*
+	 * Since we just dropped and re-acquired the commit waiter's
+	 * lock, we have to re-check to see if the waiter was marked
+	 * "done" during that process. If the waiter was marked "done",
+	 * the "lwb" pointer is no longer valid (it can be free'd after
+	 * the waiter is marked "done"), so without this check we could
+	 * wind up with a use-after-free error below.
+	 */
+	if (zcw->zcw_done)
+		goto out;
+
+	ASSERT3P(lwb, ==, zcw->zcw_lwb);
+
+	/*
+	 * We've already checked this above, but since we hadn't acquired
+	 * the zilog's zl_issuer_lock, we have to perform this check a
+	 * second time while holding the lock.
+	 *
+	 * We don't need to hold the zl_lock since the lwb cannot transition
+	 * from OPENED to ISSUED while we hold the zl_issuer_lock. The lwb
+	 * _can_ transition from ISSUED to DONE, but it's OK to race with
+	 * that transition since we treat the lwb the same, whether it's in
+	 * the ISSUED or DONE states.
+	 *
+	 * The important thing is we treat the lwb differently depending on
+	 * if it's ISSUED or OPENED, and block any other threads that might
+	 * attempt to issue this lwb. For that reason we hold the
+	 * zl_issuer_lock when checking the lwb_state; we must not call
+	 * zil_lwb_write_issue() if the lwb had already been issued.
+	 *
+	 * See the comment above the lwb_state_t structure definition for
+	 * more details on the lwb states, and locking requirements.
+	 */
+	if (lwb->lwb_state == LWB_STATE_ISSUED ||
+	    lwb->lwb_state == LWB_STATE_DONE)
+		goto out;
+
+	ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);
+
+	/*
+	 * As described in the comments above zil_commit_waiter() and
+	 * zil_process_commit_list(), we need to issue this lwb's zio
+	 * since we've reached the commit waiter's timeout and it still
+	 * hasn't been issued.
+	 */
+	lwb_t *nlwb = zil_lwb_write_issue(zilog, lwb);
+
+	IMPLY(nlwb != NULL, lwb->lwb_state != LWB_STATE_OPENED);
+
+	/*
+	 * Since the lwb's zio hadn't been issued by the time this thread
+	 * reached its timeout, we reset the zilog's "zl_cur_used" field
+	 * to influence the zil block size selection algorithm.
+	 *
+	 * By having to issue the lwb's zio here, it means the size of the
+	 * lwb was too large, given the incoming throughput of itxs. By
+	 * setting "zl_cur_used" to zero, we communicate this fact to the
+	 * block size selection algorithm, so it can take this information
+	 * into account, and potentially select a smaller size for the
+	 * next lwb block that is allocated.
+	 */
+	zilog->zl_cur_used = 0;
+
+	if (nlwb == NULL) {
+		/*
+		 * When zil_lwb_write_issue() returns NULL, this
+		 * indicates zio_alloc_zil() failed to allocate the
+		 * "next" lwb on-disk. When this occurs, the ZIL write
+		 * pipeline must be stalled; see the comment within the
+		 * zil_commit_writer_stall() function for more details.
+		 *
+		 * We must drop the commit waiter's lock prior to
+		 * calling zil_commit_writer_stall() or else we can wind
+		 * up with the following deadlock:
+		 *
+		 * - This thread is waiting for the txg to sync while
+		 *   holding the waiter's lock; txg_wait_synced() is
+		 *   used within zil_commit_writer_stall().
+		 *
+		 * - The txg can't sync because it is waiting for this
+		 *   lwb's zio callback to call dmu_tx_commit().
+		 *
+		 * - The lwb's zio callback can't call dmu_tx_commit()
+		 *   because it's blocked trying to acquire the waiter's
+		 *   lock, which occurs prior to calling dmu_tx_commit()
+		 */
+		mutex_exit(&zcw->zcw_lock);
+		zil_commit_writer_stall(zilog);
+		mutex_enter(&zcw->zcw_lock);
+	}
+
+out:
+	mutex_exit(&zilog->zl_issuer_lock);
+	ASSERT(MUTEX_HELD(&zcw->zcw_lock));
+}
+
+/*
+ * This function is responsible for performing the following two tasks:
+ *
+ * 1. its primary responsibility is to block until the given "commit
+ *    waiter" is considered "done".
+ *
+ * 2. its secondary responsibility is to issue the zio for the lwb that
+ *    the given "commit waiter" is waiting on, if this function has
+ *    waited "long enough" and the lwb is still in the "open" state.
+ *
+ * Given a sufficient amount of itxs being generated and written using
+ * the ZIL, the lwb's zio will be issued via the zil_lwb_commit()
+ * function. If this does not occur, this secondary responsibility will
+ * ensure the lwb is issued even if there is no other synchronous
+ * activity on the system.
+ *
+ * For more details, see zil_process_commit_list(); more specifically,
+ * the comment at the bottom of that function.
+ */
+static void
+zil_commit_waiter(zilog_t *zilog, zil_commit_waiter_t *zcw)
+{
+	ASSERT(!MUTEX_HELD(&zilog->zl_lock));
+	ASSERT(!MUTEX_HELD(&zilog->zl_issuer_lock));
+	ASSERT(spa_writeable(zilog->zl_spa));
+
+	mutex_enter(&zcw->zcw_lock);
+
+	/*
+	 * The timeout is scaled based on the lwb latency to avoid
+	 * significantly impacting the latency of each individual itx.
+	 * For more details, see the comment at the bottom of the
+	 * zil_process_commit_list() function.
+	 */
+	int pct = MAX(zfs_commit_timeout_pct, 1);
+#if defined(illumos) || !defined(_KERNEL)
+	hrtime_t sleep = (zilog->zl_last_lwb_latency * pct) / 100;
+	hrtime_t wakeup = gethrtime() + sleep;
+#else
+	sbintime_t sleep = nstosbt((zilog->zl_last_lwb_latency * pct) / 100);
+	sbintime_t wakeup = getsbinuptime() + sleep;
+#endif
+	boolean_t timedout = B_FALSE;
+
+	while (!zcw->zcw_done) {
+		ASSERT(MUTEX_HELD(&zcw->zcw_lock));
+
+		lwb_t *lwb = zcw->zcw_lwb;
+
+		/*
+		 * Usually, the waiter will have a non-NULL lwb field here,
+		 * but it's possible for it to be NULL as a result of
+		 * zil_commit() racing with spa_sync().
+		 *
+		 * When zil_clean() is called, it's possible for the itxg
+		 * list (which may be cleaned via a taskq) to contain
+		 * commit itxs. When this occurs, the commit waiters linked
+		 * off of these commit itxs will not be committed to an
+		 * lwb. Additionally, these commit waiters will not be
+		 * marked done until zil_commit_waiter_skip() is called via
+		 * zil_itxg_clean().
+		 *
+		 * Thus, it's possible for this commit waiter (i.e. the
+		 * "zcw" variable) to be found in this "in between" state;
+		 * where its "zcw_lwb" field is NULL, and it hasn't yet
+		 * been skipped, so its "zcw_done" field is still B_FALSE.
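+		 * For example (hypothetical interleaving): zil_commit()
+		 * assigns a commit itx for txg N; spa_sync() finishes
+		 * txg N and zil_clean() dispatches that itxg to the
+		 * cleanup taskq before zil_process_commit_list() ever
+		 * links the waiter to an lwb. Until zil_itxg_clean()
+		 * runs and skips the waiter, we simply cv_wait() below.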
+ */ + IMPLY(lwb != NULL, lwb->lwb_state != LWB_STATE_CLOSED); + + if (lwb != NULL && lwb->lwb_state == LWB_STATE_OPENED) { + ASSERT3B(timedout, ==, B_FALSE); + + /* + * If the lwb hasn't been issued yet, then we + * need to wait with a timeout, in case this + * function needs to issue the lwb after the + * timeout is reached; responsibility (2) from + * the comment above this function. + */ +#if defined(illumos) || !defined(_KERNEL) + clock_t timeleft = cv_timedwait_hires(&zcw->zcw_cv, + &zcw->zcw_lock, wakeup, USEC2NSEC(1), + CALLOUT_FLAG_ABSOLUTE); + + if (timeleft >= 0 || zcw->zcw_done) + continue; +#else + int wait_err = cv_timedwait_sbt(&zcw->zcw_cv, + &zcw->zcw_lock, wakeup, SBT_1NS, C_ABSOLUTE); + if (wait_err != EWOULDBLOCK || zcw->zcw_done) + continue; +#endif + + timedout = B_TRUE; + zil_commit_waiter_timeout(zilog, zcw); + + if (!zcw->zcw_done) { + /* + * If the commit waiter has already been + * marked "done", it's possible for the + * waiter's lwb structure to have already + * been freed. Thus, we can only reliably + * make these assertions if the waiter + * isn't done. + */ + ASSERT3P(lwb, ==, zcw->zcw_lwb); + ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED); + } + } else { + /* + * If the lwb isn't open, then it must have already + * been issued. In that case, there's no need to + * use a timeout when waiting for the lwb to + * complete. + * + * Additionally, if the lwb is NULL, the waiter + * will soon be signalled and marked done via + * zil_clean() and zil_itxg_clean(), so no timeout + * is required. + */ + + IMPLY(lwb != NULL, + lwb->lwb_state == LWB_STATE_ISSUED || + lwb->lwb_state == LWB_STATE_DONE); + cv_wait(&zcw->zcw_cv, &zcw->zcw_lock); + } + } + + mutex_exit(&zcw->zcw_lock); +} + +static zil_commit_waiter_t * +zil_alloc_commit_waiter() +{ + zil_commit_waiter_t *zcw = kmem_cache_alloc(zil_zcw_cache, KM_SLEEP); + + cv_init(&zcw->zcw_cv, NULL, CV_DEFAULT, NULL); + mutex_init(&zcw->zcw_lock, NULL, MUTEX_DEFAULT, NULL); + list_link_init(&zcw->zcw_node); + zcw->zcw_lwb = NULL; + zcw->zcw_done = B_FALSE; + zcw->zcw_zio_error = 0; + + return (zcw); +} + +static void +zil_free_commit_waiter(zil_commit_waiter_t *zcw) +{ + ASSERT(!list_link_active(&zcw->zcw_node)); + ASSERT3P(zcw->zcw_lwb, ==, NULL); + ASSERT3B(zcw->zcw_done, ==, B_TRUE); + mutex_destroy(&zcw->zcw_lock); + cv_destroy(&zcw->zcw_cv); + kmem_cache_free(zil_zcw_cache, zcw); +} + +/* + * This function is used to create a TX_COMMIT itx and assign it. This + * way, it will be linked into the ZIL's list of synchronous itxs, and + * then later committed to an lwb (or skipped) when + * zil_process_commit_list() is called. + */ +static void +zil_commit_itx_assign(zilog_t *zilog, zil_commit_waiter_t *zcw) +{ + dmu_tx_t *tx = dmu_tx_create(zilog->zl_os); + VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + + itx_t *itx = zil_itx_create(TX_COMMIT, sizeof (lr_t)); + itx->itx_sync = B_TRUE; + itx->itx_private = zcw; + + zil_itx_assign(zilog, itx, tx); + + dmu_tx_commit(tx); +} + +/* + * Commit ZFS Intent Log transactions (itxs) to stable storage. + * + * When writing ZIL transactions to the on-disk representation of the + * ZIL, the itxs are committed to a Log Write Block (lwb). Multiple + * itxs can be committed to a single lwb. Once a lwb is written and + * committed to stable storage (i.e. the lwb is written, and vdevs have + * been flushed), each itx that was committed to that lwb is also + * considered to be committed to stable storage. 
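+ *
+ * As a sketch of typical consumer usage (details elided; see the
+ * actual callers for the full sequence):
+ *
+ *	itx = zil_itx_create(txtype, sizeof (lr_t) + ...);
+ *	... fill in the itx's log record ...
+ *	zil_itx_assign(zilog, itx, tx);
+ *	dmu_tx_commit(tx);
+ *	...
+ *	zil_commit(zilog, foid);	(e.g. from an fsync)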
+ *
+ * When an itx is committed to an lwb, the log record (lr_t) contained
+ * by the itx is copied into the lwb's zio buffer, and once this buffer
+ * is written to disk, it becomes an on-disk ZIL block.
+ *
+ * As itxs are generated, they're inserted into the ZIL's queue of
+ * uncommitted itxs. The semantics of zil_commit() are such that it will
+ * block until all itxs that were in the queue when it was called, are
+ * committed to stable storage.
+ *
+ * If "foid" is zero, this means all "synchronous" and "asynchronous"
+ * itxs, for all objects in the dataset, will be committed to stable
+ * storage prior to zil_commit() returning. If "foid" is non-zero, all
+ * "synchronous" itxs for all objects, but only "asynchronous" itxs
+ * that correspond to the foid passed in, will be committed to stable
+ * storage prior to zil_commit() returning.
+ *
+ * Generally speaking, when zil_commit() is called, the consumer doesn't
+ * actually care about _all_ of the uncommitted itxs. Instead, they're
+ * simply trying to wait for a specific itx to be committed to disk,
+ * but the interface(s) for interacting with the ZIL don't allow such
+ * fine-grained communication. A better interface would allow a consumer
+ * to create and assign an itx, and then pass a reference to this itx to
+ * zil_commit(); such that zil_commit() would return as soon as that
+ * specific itx was committed to disk (instead of waiting for _all_
+ * itxs to be committed).
+ *
+ * When a thread calls zil_commit() a special "commit itx" will be
+ * generated, along with a corresponding "waiter" for this commit itx.
+ * zil_commit() will wait on this waiter's CV, such that when the waiter
+ * is marked done, and signalled, zil_commit() will return.
+ *
+ * This commit itx is inserted into the queue of uncommitted itxs. This
+ * provides an easy mechanism for determining which itxs were in the
+ * queue prior to zil_commit() having been called, and which itxs were
+ * added after zil_commit() was called.
+ *
+ * The commit itx is special; it doesn't have any on-disk representation.
+ * When a commit itx is "committed" to an lwb, the waiter associated
+ * with it is linked onto the lwb's list of waiters. Then, when that lwb
+ * completes, each waiter on the lwb's list is marked done and signalled
+ * -- allowing the thread waiting on the waiter to return from zil_commit().
+ *
+ * It's important to point out a few critical factors that allow us
+ * to make use of the commit itxs, commit waiters, per-lwb lists of
+ * commit waiters, and zio completion callbacks like we're doing:
+ *
+ *   1. The list of waiters for each lwb is traversed, and each commit
+ *      waiter is marked "done" and signalled, in the zio completion
+ *      callback of the lwb's zio[*].
+ *
+ *      * Actually, the waiters are signalled in the zio completion
+ *        callback of the root zio for the DKIOCFLUSHWRITECACHE commands
+ *        that are sent to the vdevs upon completion of the lwb zio.
+ *
+ *   2. When the itxs are inserted into the ZIL's queue of uncommitted
+ *      itxs, the order in which they are inserted is preserved[*]; as
+ *      itxs are added to the queue, they are added to the tail of
+ *      in-memory linked lists.
+ *
+ *      When committing the itxs to lwbs (to be written to disk), they
+ *      are committed in the same order in which the itxs were added to
+ *      the uncommitted queue's linked list(s); i.e. the linked list of
+ *      itxs to commit is traversed from head to tail, and each itx is
+ *      committed to an lwb in that order.
+ *
+ *      * To clarify:
+ *
+ *        - the order of "sync" itxs is preserved w.r.t. other
+ *          "sync" itxs, regardless of the corresponding objects.
+ *        - the order of "async" itxs is preserved w.r.t. other
+ *          "async" itxs corresponding to the same object.
+ *        - the order of "async" itxs is *not* preserved w.r.t. other
+ *          "async" itxs corresponding to different objects.
+ *        - the order of "sync" itxs w.r.t. "async" itxs (or vice
+ *          versa) is *not* preserved, even for itxs that correspond
+ *          to the same object.
+ *
+ *      For more details, see: zil_itx_assign(), zil_async_to_sync(),
+ *      zil_get_commit_list(), and zil_process_commit_list().
+ *
+ *   3. The lwbs represent a linked list of blocks on disk. Thus, any
+ *      lwb cannot be considered committed to stable storage, until its
+ *      "previous" lwb is also committed to stable storage. This fact,
+ *      coupled with the fact described above, means that itxs are
+ *      committed in (roughly) the order in which they were generated.
+ *      This is essential because itxs are dependent on prior itxs.
+ *      Thus, we *must not* deem an itx as being committed to stable
+ *      storage, until *all* prior itxs have also been committed to
+ *      stable storage.
+ *
+ *      To enforce this ordering of lwb zio's, while still leveraging as
+ *      much of the underlying storage performance as possible, we rely
+ *      on two fundamental concepts:
+ *
+ *          1. The creation and issuance of lwb zio's is protected by
+ *             the zilog's "zl_issuer_lock", which ensures only a single
+ *             thread is creating and/or issuing lwb's at a time.
+ *          2. The "previous" lwb is a child of the "current" lwb
+ *             (leveraging the zio parent-child dependency graph)
+ *
+ *      By relying on this parent-child zio relationship, we can have
+ *      many lwb zio's concurrently issued to the underlying storage,
+ *      but the order in which they complete will be the same order in
+ *      which they were created.
+ */
+void
+zil_commit(zilog_t *zilog, uint64_t foid)
+{
+	/*
+	 * We should never attempt to call zil_commit on a snapshot for
+	 * a couple of reasons:
+	 *
+	 * 1. A snapshot may never be modified, thus it cannot have any
+	 *    in-flight itxs that would have modified the dataset.
+	 *
+	 * 2. By design, when zil_commit() is called, a commit itx will
+	 *    be assigned to this zilog; as a result, the zilog will be
+	 *    dirtied. We must not dirty the zilog of a snapshot; there
+	 *    are checks in the code that enforce this invariant, and
+	 *    will cause a panic if it's not upheld.
+	 */
+	ASSERT3B(dmu_objset_is_snapshot(zilog->zl_os), ==, B_FALSE);
+
+	if (zilog->zl_sync == ZFS_SYNC_DISABLED)
+		return;
+
+	if (!spa_writeable(zilog->zl_spa)) {
+		/*
+		 * If the SPA is not writable, there should never be any
+		 * pending itxs waiting to be committed to disk. If that
+		 * weren't true, we'd skip writing those itxs out, and
+		 * would break the semantics of zil_commit(); thus, we're
+		 * verifying that truth before we return to the caller.
+		 */
+		ASSERT(list_is_empty(&zilog->zl_lwb_list));
+		ASSERT3P(zilog->zl_last_lwb_opened, ==, NULL);
+		for (int i = 0; i < TXG_SIZE; i++)
+			ASSERT3P(zilog->zl_itxg[i].itxg_itxs, ==, NULL);
+		return;
+	}
+
+	/*
+	 * If the ZIL is suspended, we don't want to dirty it by calling
+	 * zil_commit_itx_assign() below, nor can we write out
+	 * lwbs like would be done in zil_commit_write(). Thus, we
+	 * simply rely on txg_wait_synced() to maintain the necessary
+	 * semantics, and avoid calling those functions altogether.
+	 */
+	if (zilog->zl_suspend > 0) {
+		txg_wait_synced(zilog->zl_dmu_pool, 0);
+		return;
+	}
+
+	zil_commit_impl(zilog, foid);
+}
+
+void
+zil_commit_impl(zilog_t *zilog, uint64_t foid)
+{
+	/*
+	 * Move the "async" itxs for the specified foid to the "sync"
+	 * queues, such that they will be later committed (or skipped)
+	 * to an lwb when zil_process_commit_list() is called.
+	 *
+	 * Since these "async" itxs must be committed prior to this
+	 * call to zil_commit returning, we must perform this operation
+	 * before we call zil_commit_itx_assign().
+	 */
+	zil_async_to_sync(zilog, foid);
+
+	/*
+	 * We allocate a new "waiter" structure which will initially be
+	 * linked to the commit itx using the itx's "itx_private" field.
+	 * Since the commit itx doesn't represent any on-disk state,
+	 * when it's committed to an lwb, rather than copying its
+	 * lr_t into the lwb's buffer, the commit itx's "waiter" will be
+	 * added to the lwb's list of waiters. Then, when the lwb is
+	 * committed to stable storage, each waiter in the lwb's list of
+	 * waiters will be marked "done", and signalled.
+	 *
+	 * We must create the waiter and assign the commit itx prior to
+	 * calling zil_commit_writer(), or else our specific commit itx
+	 * is not guaranteed to be committed to an lwb prior to calling
+	 * zil_commit_waiter().
+	 */
+	zil_commit_waiter_t *zcw = zil_alloc_commit_waiter();
+	zil_commit_itx_assign(zilog, zcw);
+
+	zil_commit_writer(zilog, zcw);
+	zil_commit_waiter(zilog, zcw);
+
+	if (zcw->zcw_zio_error != 0) {
+		/*
+		 * If there was an error writing out the ZIL blocks that
+		 * this thread is waiting on, then we fall back to
+		 * relying on spa_sync() to write out the data this
+		 * thread is waiting on. Obviously this has performance
+		 * implications, but the expectation is for this to be
+		 * an exceptional case, and shouldn't occur often.
+		 */
+		DTRACE_PROBE2(zil__commit__io__error,
+		    zilog_t *, zilog, zil_commit_waiter_t *, zcw);
+		txg_wait_synced(zilog->zl_dmu_pool, 0);
+	}
+
+	zil_free_commit_waiter(zcw);
+}
+
+/*
+ * Called in syncing context to free committed log blocks and update log header.
+ */
+void
+zil_sync(zilog_t *zilog, dmu_tx_t *tx)
+{
+	zil_header_t *zh = zil_header_in_syncing_context(zilog);
+	uint64_t txg = dmu_tx_get_txg(tx);
+	spa_t *spa = zilog->zl_spa;
+	uint64_t *replayed_seq = &zilog->zl_replayed_seq[txg & TXG_MASK];
+	lwb_t *lwb;
+
+	/*
+	 * We don't zero out zl_destroy_txg, so make sure we don't try
+	 * to destroy it twice.
+	 */
+	if (spa_sync_pass(spa) != 1)
+		return;
+
+	mutex_enter(&zilog->zl_lock);
+
+	ASSERT(zilog->zl_stop_sync == 0);
+
+	if (*replayed_seq != 0) {
+		ASSERT(zh->zh_replay_seq < *replayed_seq);
+		zh->zh_replay_seq = *replayed_seq;
+		*replayed_seq = 0;
+	}
+
+	if (zilog->zl_destroy_txg == txg) {
+		blkptr_t blk = zh->zh_log;
+
+		ASSERT(list_head(&zilog->zl_lwb_list) == NULL);
+
+		bzero(zh, sizeof (zil_header_t));
+		bzero(zilog->zl_replayed_seq, sizeof (zilog->zl_replayed_seq));
+
+		if (zilog->zl_keep_first) {
+			/*
+			 * If this block was part of a log chain that couldn't
+			 * be claimed because a device was missing during
+			 * zil_claim(), but that device later returns,
+			 * then this block could erroneously appear valid.
+			 * To guard against this, assign a new GUID to the new
+			 * log chain so it doesn't matter what blk points to.
+ */ + zil_init_log_chain(zilog, &blk); + zh->zh_log = blk; + } + } + + while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) { + zh->zh_log = lwb->lwb_blk; + if (lwb->lwb_buf != NULL || lwb->lwb_max_txg > txg) + break; + list_remove(&zilog->zl_lwb_list, lwb); + zio_free(spa, txg, &lwb->lwb_blk); + zil_free_lwb(zilog, lwb); + + /* + * If we don't have anything left in the lwb list then + * we've had an allocation failure and we need to zero + * out the zil_header blkptr so that we don't end + * up freeing the same block twice. + */ + if (list_head(&zilog->zl_lwb_list) == NULL) + BP_ZERO(&zh->zh_log); + } + mutex_exit(&zilog->zl_lock); +} + +/* ARGSUSED */ +static int +zil_lwb_cons(void *vbuf, void *unused, int kmflag) +{ + lwb_t *lwb = vbuf; + list_create(&lwb->lwb_waiters, sizeof (zil_commit_waiter_t), + offsetof(zil_commit_waiter_t, zcw_node)); + avl_create(&lwb->lwb_vdev_tree, zil_lwb_vdev_compare, + sizeof (zil_vdev_node_t), offsetof(zil_vdev_node_t, zv_node)); + mutex_init(&lwb->lwb_vdev_lock, NULL, MUTEX_DEFAULT, NULL); + return (0); +} + +/* ARGSUSED */ +static void +zil_lwb_dest(void *vbuf, void *unused) +{ + lwb_t *lwb = vbuf; + mutex_destroy(&lwb->lwb_vdev_lock); + avl_destroy(&lwb->lwb_vdev_tree); + list_destroy(&lwb->lwb_waiters); +} + +void +zil_init(void) +{ + zil_lwb_cache = kmem_cache_create("zil_lwb_cache", + sizeof (lwb_t), 0, zil_lwb_cons, zil_lwb_dest, NULL, NULL, NULL, 0); + + zil_zcw_cache = kmem_cache_create("zil_zcw_cache", + sizeof (zil_commit_waiter_t), 0, NULL, NULL, NULL, NULL, NULL, 0); +} + +void +zil_fini(void) +{ + kmem_cache_destroy(zil_zcw_cache); + kmem_cache_destroy(zil_lwb_cache); +} + +void +zil_set_sync(zilog_t *zilog, uint64_t sync) +{ + zilog->zl_sync = sync; +} + +void +zil_set_logbias(zilog_t *zilog, uint64_t logbias) +{ + zilog->zl_logbias = logbias; +} + +zilog_t * +zil_alloc(objset_t *os, zil_header_t *zh_phys) +{ + zilog_t *zilog; + + zilog = kmem_zalloc(sizeof (zilog_t), KM_SLEEP); + + zilog->zl_header = zh_phys; + zilog->zl_os = os; + zilog->zl_spa = dmu_objset_spa(os); + zilog->zl_dmu_pool = dmu_objset_pool(os); + zilog->zl_destroy_txg = TXG_INITIAL - 1; + zilog->zl_logbias = dmu_objset_logbias(os); + zilog->zl_sync = dmu_objset_syncprop(os); + zilog->zl_dirty_max_txg = 0; + zilog->zl_last_lwb_opened = NULL; + zilog->zl_last_lwb_latency = 0; + + mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&zilog->zl_issuer_lock, NULL, MUTEX_DEFAULT, NULL); + + for (int i = 0; i < TXG_SIZE; i++) { + mutex_init(&zilog->zl_itxg[i].itxg_lock, NULL, + MUTEX_DEFAULT, NULL); + } + + list_create(&zilog->zl_lwb_list, sizeof (lwb_t), + offsetof(lwb_t, lwb_node)); + + list_create(&zilog->zl_itx_commit_list, sizeof (itx_t), + offsetof(itx_t, itx_node)); + + cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL); + + return (zilog); +} + +void +zil_free(zilog_t *zilog) +{ + zilog->zl_stop_sync = 1; + + ASSERT0(zilog->zl_suspend); + ASSERT0(zilog->zl_suspending); + + ASSERT(list_is_empty(&zilog->zl_lwb_list)); + list_destroy(&zilog->zl_lwb_list); + + ASSERT(list_is_empty(&zilog->zl_itx_commit_list)); + list_destroy(&zilog->zl_itx_commit_list); + + for (int i = 0; i < TXG_SIZE; i++) { + /* + * It's possible for an itx to be generated that doesn't dirty + * a txg (e.g. ztest TX_TRUNCATE). So there's no zil_clean() + * callback to remove the entry. We remove those here. + * + * Also free up the ziltest itxs. 
+ */ + if (zilog->zl_itxg[i].itxg_itxs) + zil_itxg_clean(zilog->zl_itxg[i].itxg_itxs); + mutex_destroy(&zilog->zl_itxg[i].itxg_lock); + } + + mutex_destroy(&zilog->zl_issuer_lock); + mutex_destroy(&zilog->zl_lock); + + cv_destroy(&zilog->zl_cv_suspend); + + kmem_free(zilog, sizeof (zilog_t)); +} + +/* + * Open an intent log. + */ +zilog_t * +zil_open(objset_t *os, zil_get_data_t *get_data) +{ + zilog_t *zilog = dmu_objset_zil(os); + + ASSERT3P(zilog->zl_get_data, ==, NULL); + ASSERT3P(zilog->zl_last_lwb_opened, ==, NULL); + ASSERT(list_is_empty(&zilog->zl_lwb_list)); + + zilog->zl_get_data = get_data; + + return (zilog); +} + +/* + * Close an intent log. + */ +void +zil_close(zilog_t *zilog) +{ + lwb_t *lwb; + uint64_t txg; + + if (!dmu_objset_is_snapshot(zilog->zl_os)) { + zil_commit(zilog, 0); + } else { + ASSERT3P(list_tail(&zilog->zl_lwb_list), ==, NULL); + ASSERT0(zilog->zl_dirty_max_txg); + ASSERT3B(zilog_is_dirty(zilog), ==, B_FALSE); + } + + mutex_enter(&zilog->zl_lock); + lwb = list_tail(&zilog->zl_lwb_list); + if (lwb == NULL) + txg = zilog->zl_dirty_max_txg; + else + txg = MAX(zilog->zl_dirty_max_txg, lwb->lwb_max_txg); + mutex_exit(&zilog->zl_lock); + + /* + * We need to use txg_wait_synced() to wait long enough for the + * ZIL to be clean, and to wait for all pending lwbs to be + * written out. + */ + if (txg) + txg_wait_synced(zilog->zl_dmu_pool, txg); + + if (txg < spa_freeze_txg(zilog->zl_spa)) + ASSERT(!zilog_is_dirty(zilog)); + + zilog->zl_get_data = NULL; + + /* + * We should have only one lwb left on the list; remove it now. + */ + mutex_enter(&zilog->zl_lock); + lwb = list_head(&zilog->zl_lwb_list); + if (lwb != NULL) { + ASSERT3P(lwb, ==, list_tail(&zilog->zl_lwb_list)); + ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED); + list_remove(&zilog->zl_lwb_list, lwb); + zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); + zil_free_lwb(zilog, lwb); + } + mutex_exit(&zilog->zl_lock); +} + +static char *suspend_tag = "zil suspending"; + +/* + * Suspend an intent log. While in suspended mode, we still honor + * synchronous semantics, but we rely on txg_wait_synced() to do it. + * On old version pools, we suspend the log briefly when taking a + * snapshot so that it will have an empty intent log. + * + * Long holds are not really intended to be used the way we do here -- + * held for such a short time. A concurrent caller of dsl_dataset_long_held() + * could fail. Therefore we take pains to only put a long hold if it is + * actually necessary. Fortunately, it will only be necessary if the + * objset is currently mounted (or the ZVOL equivalent). In that case it + * will already have a long hold, so we are not really making things any worse. + * + * Ideally, we would locate the existing long-holder (i.e. the zfsvfs_t or + * zvol_state_t), and use their mechanism to prevent their hold from being + * dropped (e.g. VFS_HOLD()). However, that would be even more pain for + * very little gain. + * + * if cookiep == NULL, this does both the suspend & resume. + * Otherwise, it returns with the dataset "long held", and the cookie + * should be passed into zil_resume(). 
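+ *
+ * A sketch of the cookie-based usage:
+ *
+ *	void *cookie;
+ *	error = zil_suspend(osname, &cookie);
+ *	if (error == 0) {
+ *		... intent log is now empty and suspended ...
+ *		zil_resume(cookie);
+ *	}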
+ */ +int +zil_suspend(const char *osname, void **cookiep) +{ + objset_t *os; + zilog_t *zilog; + const zil_header_t *zh; + int error; + + error = dmu_objset_hold(osname, suspend_tag, &os); + if (error != 0) + return (error); + zilog = dmu_objset_zil(os); + + mutex_enter(&zilog->zl_lock); + zh = zilog->zl_header; + + if (zh->zh_flags & ZIL_REPLAY_NEEDED) { /* unplayed log */ + mutex_exit(&zilog->zl_lock); + dmu_objset_rele(os, suspend_tag); + return (SET_ERROR(EBUSY)); + } + + /* + * Don't put a long hold in the cases where we can avoid it. This + * is when there is no cookie so we are doing a suspend & resume + * (i.e. called from zil_vdev_offline()), and there's nothing to do + * for the suspend because it's already suspended, or there's no ZIL. + */ + if (cookiep == NULL && !zilog->zl_suspending && + (zilog->zl_suspend > 0 || BP_IS_HOLE(&zh->zh_log))) { + mutex_exit(&zilog->zl_lock); + dmu_objset_rele(os, suspend_tag); + return (0); + } + + dsl_dataset_long_hold(dmu_objset_ds(os), suspend_tag); + dsl_pool_rele(dmu_objset_pool(os), suspend_tag); + + zilog->zl_suspend++; + + if (zilog->zl_suspend > 1) { + /* + * Someone else is already suspending it. + * Just wait for them to finish. + */ + + while (zilog->zl_suspending) + cv_wait(&zilog->zl_cv_suspend, &zilog->zl_lock); + mutex_exit(&zilog->zl_lock); + + if (cookiep == NULL) + zil_resume(os); + else + *cookiep = os; + return (0); + } + + /* + * If there is no pointer to an on-disk block, this ZIL must not + * be active (e.g. filesystem not mounted), so there's nothing + * to clean up. + */ + if (BP_IS_HOLE(&zh->zh_log)) { + ASSERT(cookiep != NULL); /* fast path already handled */ + + *cookiep = os; + mutex_exit(&zilog->zl_lock); + return (0); + } + + zilog->zl_suspending = B_TRUE; + mutex_exit(&zilog->zl_lock); + + /* + * We need to use zil_commit_impl to ensure we wait for all + * LWB_STATE_OPENED and LWB_STATE_ISSUED lwb's to be committed + * to disk before proceeding. If we used zil_commit instead, it + * would just call txg_wait_synced(), because zl_suspend is set. + * txg_wait_synced() doesn't wait for these lwb's to be + * LWB_STATE_DONE before returning. + */ + zil_commit_impl(zilog, 0); + + /* + * Now that we've ensured all lwb's are LWB_STATE_DONE, we use + * txg_wait_synced() to ensure the data from the zilog has + * migrated to the main pool before calling zil_destroy(). 
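+ *
+ * (A txg argument of 0 asks txg_wait_synced() to wait for all
+ * currently outstanding dirty data to sync, which is what we
+ * need here.)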
+ */ + txg_wait_synced(zilog->zl_dmu_pool, 0); + + zil_destroy(zilog, B_FALSE); + + mutex_enter(&zilog->zl_lock); + zilog->zl_suspending = B_FALSE; + cv_broadcast(&zilog->zl_cv_suspend); + mutex_exit(&zilog->zl_lock); + + if (cookiep == NULL) + zil_resume(os); + else + *cookiep = os; + return (0); +} + +void +zil_resume(void *cookie) +{ + objset_t *os = cookie; + zilog_t *zilog = dmu_objset_zil(os); + + mutex_enter(&zilog->zl_lock); + ASSERT(zilog->zl_suspend != 0); + zilog->zl_suspend--; + mutex_exit(&zilog->zl_lock); + dsl_dataset_long_rele(dmu_objset_ds(os), suspend_tag); + dsl_dataset_rele(dmu_objset_ds(os), suspend_tag); +} + +typedef struct zil_replay_arg { + zil_replay_func_t **zr_replay; + void *zr_arg; + boolean_t zr_byteswap; + char *zr_lr; +} zil_replay_arg_t; + +static int +zil_replay_error(zilog_t *zilog, lr_t *lr, int error) +{ + char name[ZFS_MAX_DATASET_NAME_LEN]; + + zilog->zl_replaying_seq--; /* didn't actually replay this one */ + + dmu_objset_name(zilog->zl_os, name); + + cmn_err(CE_WARN, "ZFS replay transaction error %d, " + "dataset %s, seq 0x%llx, txtype %llu %s\n", error, name, + (u_longlong_t)lr->lrc_seq, + (u_longlong_t)(lr->lrc_txtype & ~TX_CI), + (lr->lrc_txtype & TX_CI) ? "CI" : ""); + + return (error); +} + +static int +zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg) +{ + zil_replay_arg_t *zr = zra; + const zil_header_t *zh = zilog->zl_header; + uint64_t reclen = lr->lrc_reclen; + uint64_t txtype = lr->lrc_txtype; + int error = 0; + + zilog->zl_replaying_seq = lr->lrc_seq; + + if (lr->lrc_seq <= zh->zh_replay_seq) /* already replayed */ + return (0); + + if (lr->lrc_txg < claim_txg) /* already committed */ + return (0); + + /* Strip case-insensitive bit, still present in log record */ + txtype &= ~TX_CI; + + if (txtype == 0 || txtype >= TX_MAX_TYPE) + return (zil_replay_error(zilog, lr, EINVAL)); + + /* + * If this record type can be logged out of order, the object + * (lr_foid) may no longer exist. That's legitimate, not an error. + */ + if (TX_OOO(txtype)) { + error = dmu_object_info(zilog->zl_os, + LR_FOID_GET_OBJ(((lr_ooo_t *)lr)->lr_foid), NULL); + if (error == ENOENT || error == EEXIST) + return (0); + } + + /* + * Make a copy of the data so we can revise and extend it. + */ + bcopy(lr, zr->zr_lr, reclen); + + /* + * If this is a TX_WRITE with a blkptr, suck in the data. + */ + if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) { + error = zil_read_log_data(zilog, (lr_write_t *)lr, + zr->zr_lr + reclen); + if (error != 0) + return (zil_replay_error(zilog, lr, error)); + } + + /* + * The log block containing this lr may have been byteswapped + * so that we can easily examine common fields like lrc_txtype. + * However, the log is a mix of different record types, and only the + * replay vectors know how to byteswap their records. Therefore, if + * the lr was byteswapped, undo it before invoking the replay vector. + */ + if (zr->zr_byteswap) + byteswap_uint64_array(zr->zr_lr, reclen); + + /* + * We must now do two things atomically: replay this log record, + * and update the log header sequence number to reflect the fact that + * we did so. At the end of each replay function the sequence number + * is updated if we are in replay mode. + */ + error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, zr->zr_byteswap); + if (error != 0) { + /* + * The DMU's dnode layer doesn't see removes until the txg + * commits, so a subsequent claim can spuriously fail with + * EEXIST. 
So if we receive any error we try syncing out + * any removes then retry the transaction. Note that we + * specify B_FALSE for byteswap now, so we don't do it twice. + */ + txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0); + error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, B_FALSE); + if (error != 0) + return (zil_replay_error(zilog, lr, error)); + } + return (0); +} + +/* ARGSUSED */ +static int +zil_incr_blks(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) +{ + zilog->zl_replay_blks++; + + return (0); +} + +/* + * If this dataset has a non-empty intent log, replay it and destroy it. + */ +void +zil_replay(objset_t *os, void *arg, zil_replay_func_t *replay_func[TX_MAX_TYPE]) +{ + zilog_t *zilog = dmu_objset_zil(os); + const zil_header_t *zh = zilog->zl_header; + zil_replay_arg_t zr; + + if ((zh->zh_flags & ZIL_REPLAY_NEEDED) == 0) { + zil_destroy(zilog, B_TRUE); + return; + } + + zr.zr_replay = replay_func; + zr.zr_arg = arg; + zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log); + zr.zr_lr = kmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP); + + /* + * Wait for in-progress removes to sync before starting replay. + */ + txg_wait_synced(zilog->zl_dmu_pool, 0); + + zilog->zl_replay = B_TRUE; + zilog->zl_replay_time = ddi_get_lbolt(); + ASSERT(zilog->zl_replay_blks == 0); + (void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr, + zh->zh_claim_txg); + kmem_free(zr.zr_lr, 2 * SPA_MAXBLOCKSIZE); + + zil_destroy(zilog, B_FALSE); + txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); + zilog->zl_replay = B_FALSE; +} + +boolean_t +zil_replaying(zilog_t *zilog, dmu_tx_t *tx) +{ + if (zilog->zl_sync == ZFS_SYNC_DISABLED) + return (B_TRUE); + + if (zilog->zl_replay) { + dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); + zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] = + zilog->zl_replaying_seq; + return (B_TRUE); + } + + return (B_FALSE); +} + +/* ARGSUSED */ +int +zil_reset(const char *osname, void *arg) +{ + int error; + + error = zil_suspend(osname, NULL); + if (error != 0) + return (SET_ERROR(EEXIST)); + return (0); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c new file mode 100644 index 000000000000..ded3a63163b2 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c @@ -0,0 +1,4311 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. 
+ * Copyright (c) 2014 Integros [integros.com]
+ */
+
+#include <sys/sysmacros.h>
+#include <sys/zfs_context.h>
+#include <sys/fm/fs/zfs.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio_impl.h>
+#include <sys/zio_compress.h>
+#include <sys/zio_checksum.h>
+#include <sys/dmu_objset.h>
+#include <sys/arc.h>
+#include <sys/ddt.h>
+#include <sys/trim_map.h>
+#include <sys/blkptr.h>
+#include <sys/zfeature.h>
+#include <sys/dsl_scan.h>
+#include <sys/metaslab_impl.h>
+#include <sys/abd.h>
+#include <sys/cityhash.h>
+
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO");
+#if defined(__amd64__)
+static int zio_use_uma = 1;
+#else
+static int zio_use_uma = 0;
+#endif
+SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, use_uma, CTLFLAG_RDTUN, &zio_use_uma, 0,
+ "Use uma(9) for ZIO allocations");
+static int zio_exclude_metadata = 0;
+SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, exclude_metadata, CTLFLAG_RDTUN, &zio_exclude_metadata, 0,
+ "Exclude metadata buffers from dumps as well");
+
+zio_trim_stats_t zio_trim_stats = {
+ { "bytes", KSTAT_DATA_UINT64,
+ "Number of bytes successfully TRIMmed" },
+ { "success", KSTAT_DATA_UINT64,
+ "Number of successful TRIM requests" },
+ { "unsupported", KSTAT_DATA_UINT64,
+ "Number of TRIM requests that failed because TRIM is not supported" },
+ { "failed", KSTAT_DATA_UINT64,
+ "Number of TRIM requests that failed for reasons other than not supported" },
+};
+
+static kstat_t *zio_trim_ksp;
+
+/*
+ * ==========================================================================
+ * I/O type descriptions
+ * ==========================================================================
+ */
+const char *zio_type_name[ZIO_TYPES] = {
+ "zio_null", "zio_read", "zio_write", "zio_free", "zio_claim",
+ "zio_ioctl"
+};
+
+boolean_t zio_dva_throttle_enabled = B_TRUE;
+SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, dva_throttle_enabled, CTLFLAG_RWTUN,
+ &zio_dva_throttle_enabled, 0, "Enable allocation throttling");
+
+/*
+ * ==========================================================================
+ * I/O kmem caches
+ * ==========================================================================
+ */
+kmem_cache_t *zio_cache;
+kmem_cache_t *zio_link_cache;
+kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
+kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
+
+#ifdef _KERNEL
+extern vmem_t *zio_alloc_arena;
+#endif
+
+#define BP_SPANB(indblkshift, level) \
+ (((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT)))
+#define COMPARE_META_LEVEL 0x80000000ul
+/*
+ * The following actions directly affect the spa's sync-to-convergence logic.
+ * The values below define the sync pass when we start performing the action.
+ * Care should be taken when changing these values as they directly impact
+ * spa_sync() performance. Tuning these values may introduce subtle performance
+ * pathologies and should only be done in the context of performance analysis.
+ * These tunables will eventually be removed and replaced with #defines once
+ * enough analysis has been done to determine optimal values.
+ *
+ * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that
+ * regular blocks are not deferred.
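+ *
+ * To illustrate with the defaults below (a reading of the values,
+ * not a normative statement): pass 1 allocates freely and
+ * compresses; pass 2 and later defer frees
+ * ('zfs_sync_pass_deferred_free' = 2) and rewrite already-allocated
+ * blocks in place where possible ('zfs_sync_pass_rewrite' = 2);
+ * pass 5 and later ('zfs_sync_pass_dont_compress' = 5) stop
+ * compressing as well.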
+ */ +int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */ +SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_deferred_free, CTLFLAG_RDTUN, + &zfs_sync_pass_deferred_free, 0, "defer frees starting in this pass"); +int zfs_sync_pass_dont_compress = 5; /* don't compress starting in this pass */ +SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_dont_compress, CTLFLAG_RDTUN, + &zfs_sync_pass_dont_compress, 0, "don't compress starting in this pass"); +int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */ +SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_rewrite, CTLFLAG_RDTUN, + &zfs_sync_pass_rewrite, 0, "rewrite new bps starting in this pass"); + +/* + * An allocating zio is one that either currently has the DVA allocate + * stage set or will have it later in its lifetime. + */ +#define IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE) + +boolean_t zio_requeue_io_start_cut_in_line = B_TRUE; + +#ifdef illumos +#ifdef ZFS_DEBUG +int zio_buf_debug_limit = 16384; +#else +int zio_buf_debug_limit = 0; +#endif +#endif + +static void zio_taskq_dispatch(zio_t *, zio_taskq_type_t, boolean_t); + +void +zio_init(void) +{ + size_t c; + zio_cache = kmem_cache_create("zio_cache", + sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0); + zio_link_cache = kmem_cache_create("zio_link_cache", + sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0); + if (!zio_use_uma) + goto out; + + /* + * For small buffers, we want a cache for each multiple of + * SPA_MINBLOCKSIZE. For larger buffers, we want a cache + * for each quarter-power of 2. + */ + for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { + size_t size = (c + 1) << SPA_MINBLOCKSHIFT; + size_t p2 = size; + size_t align = 0; + int cflags = zio_exclude_metadata ? KMC_NODEBUG : 0; + + while (!ISP2(p2)) + p2 &= p2 - 1; + +#ifdef illumos +#ifndef _KERNEL + /* + * If we are using watchpoints, put each buffer on its own page, + * to eliminate the performance overhead of trapping to the + * kernel when modifying a non-watched buffer that shares the + * page with a watched buffer. + */ + if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE)) + continue; +#endif +#endif /* illumos */ + if (size <= 4 * SPA_MINBLOCKSIZE) { + align = SPA_MINBLOCKSIZE; + } else if (IS_P2ALIGNED(size, p2 >> 2)) { + align = MIN(p2 >> 2, PAGESIZE); + } + + if (align != 0) { + char name[36]; + (void) sprintf(name, "zio_buf_%lu", (ulong_t)size); + zio_buf_cache[c] = kmem_cache_create(name, size, + align, NULL, NULL, NULL, NULL, NULL, cflags); + + /* + * Since zio_data bufs do not appear in crash dumps, we + * pass KMC_NOTOUCH so that no allocator metadata is + * stored with the buffers. 
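+ *
+ * (Sizes for which no cache is created here are mapped to the
+ * next larger cache by the fill loop that follows, so every
+ * allocation size up to SPA_MAXBLOCKSIZE resolves to some cache.)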
+ */ + (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size); + zio_data_buf_cache[c] = kmem_cache_create(name, size, + align, NULL, NULL, NULL, NULL, NULL, + cflags | KMC_NOTOUCH | KMC_NODEBUG); + } + } + + while (--c != 0) { + ASSERT(zio_buf_cache[c] != NULL); + if (zio_buf_cache[c - 1] == NULL) + zio_buf_cache[c - 1] = zio_buf_cache[c]; + + ASSERT(zio_data_buf_cache[c] != NULL); + if (zio_data_buf_cache[c - 1] == NULL) + zio_data_buf_cache[c - 1] = zio_data_buf_cache[c]; + } +out: + + zio_inject_init(); + + zio_trim_ksp = kstat_create("zfs", 0, "zio_trim", "misc", + KSTAT_TYPE_NAMED, + sizeof(zio_trim_stats) / sizeof(kstat_named_t), + KSTAT_FLAG_VIRTUAL); + + if (zio_trim_ksp != NULL) { + zio_trim_ksp->ks_data = &zio_trim_stats; + kstat_install(zio_trim_ksp); + } +} + +void +zio_fini(void) +{ + size_t c; + kmem_cache_t *last_cache = NULL; + kmem_cache_t *last_data_cache = NULL; + + for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { + if (zio_buf_cache[c] != last_cache) { + last_cache = zio_buf_cache[c]; + kmem_cache_destroy(zio_buf_cache[c]); + } + zio_buf_cache[c] = NULL; + + if (zio_data_buf_cache[c] != last_data_cache) { + last_data_cache = zio_data_buf_cache[c]; + kmem_cache_destroy(zio_data_buf_cache[c]); + } + zio_data_buf_cache[c] = NULL; + } + + kmem_cache_destroy(zio_link_cache); + kmem_cache_destroy(zio_cache); + + zio_inject_fini(); + + if (zio_trim_ksp != NULL) { + kstat_delete(zio_trim_ksp); + zio_trim_ksp = NULL; + } +} + +/* + * ========================================================================== + * Allocate and free I/O buffers + * ========================================================================== + */ + +/* + * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a + * crashdump if the kernel panics, so use it judiciously. Obviously, it's + * useful to inspect ZFS metadata, but if possible, we should avoid keeping + * excess / transient data in-core during a crashdump. + */ +void * +zio_buf_alloc(size_t size) +{ + size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; + int flags = zio_exclude_metadata ? KM_NODEBUG : 0; + + VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); + + if (zio_use_uma) + return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE)); + else + return (kmem_alloc(size, KM_SLEEP|flags)); +} + +/* + * Use zio_data_buf_alloc to allocate data. The data will not appear in a + * crashdump if the kernel panics. This exists so that we will limit the amount + * of ZFS data that shows up in a kernel crashdump. 
(Thus reducing the amount + * of kernel heap dumped to disk when the kernel panics) + */ +void * +zio_data_buf_alloc(size_t size) +{ + size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; + + VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); + + if (zio_use_uma) + return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE)); + else + return (kmem_alloc(size, KM_SLEEP | KM_NODEBUG)); +} + +void +zio_buf_free(void *buf, size_t size) +{ + size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; + + VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); + + if (zio_use_uma) + kmem_cache_free(zio_buf_cache[c], buf); + else + kmem_free(buf, size); +} + +void +zio_data_buf_free(void *buf, size_t size) +{ + size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; + + VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); + + if (zio_use_uma) + kmem_cache_free(zio_data_buf_cache[c], buf); + else + kmem_free(buf, size); +} + +/* + * ========================================================================== + * Push and pop I/O transform buffers + * ========================================================================== + */ +void +zio_push_transform(zio_t *zio, abd_t *data, uint64_t size, uint64_t bufsize, + zio_transform_func_t *transform) +{ + zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP); + + /* + * Ensure that anyone expecting this zio to contain a linear ABD isn't + * going to get a nasty surprise when they try to access the data. + */ +#ifdef illumos + IMPLY(abd_is_linear(zio->io_abd), abd_is_linear(data)); +#else + IMPLY(zio->io_abd != NULL && abd_is_linear(zio->io_abd), + abd_is_linear(data)); +#endif + + zt->zt_orig_abd = zio->io_abd; + zt->zt_orig_size = zio->io_size; + zt->zt_bufsize = bufsize; + zt->zt_transform = transform; + + zt->zt_next = zio->io_transform_stack; + zio->io_transform_stack = zt; + + zio->io_abd = data; + zio->io_size = size; +} + +void +zio_pop_transforms(zio_t *zio) +{ + zio_transform_t *zt; + + while ((zt = zio->io_transform_stack) != NULL) { + if (zt->zt_transform != NULL) + zt->zt_transform(zio, + zt->zt_orig_abd, zt->zt_orig_size); + + if (zt->zt_bufsize != 0) + abd_free(zio->io_abd); + + zio->io_abd = zt->zt_orig_abd; + zio->io_size = zt->zt_orig_size; + zio->io_transform_stack = zt->zt_next; + + kmem_free(zt, sizeof (zio_transform_t)); + } +} + +/* + * ========================================================================== + * I/O transform callbacks for subblocks and decompression + * ========================================================================== + */ +static void +zio_subblock(zio_t *zio, abd_t *data, uint64_t size) +{ + ASSERT(zio->io_size > size); + + if (zio->io_type == ZIO_TYPE_READ) + abd_copy(data, zio->io_abd, size); +} + +static void +zio_decompress(zio_t *zio, abd_t *data, uint64_t size) +{ + if (zio->io_error == 0) { + void *tmp = abd_borrow_buf(data, size); + int ret = zio_decompress_data(BP_GET_COMPRESS(zio->io_bp), + zio->io_abd, tmp, zio->io_size, size); + abd_return_buf_copy(data, tmp, size); + + if (ret != 0) + zio->io_error = SET_ERROR(EIO); + } +} + +/* + * ========================================================================== + * I/O parent/child relationships and pipeline interlocks + * ========================================================================== + */ +zio_t * +zio_walk_parents(zio_t *cio, zio_link_t **zl) +{ + list_t *pl = &cio->io_parent_list; + + *zl = (*zl == NULL) ? 
list_head(pl) : list_next(pl, *zl); + if (*zl == NULL) + return (NULL); + + ASSERT((*zl)->zl_child == cio); + return ((*zl)->zl_parent); +} + +zio_t * +zio_walk_children(zio_t *pio, zio_link_t **zl) +{ + list_t *cl = &pio->io_child_list; + + ASSERT(MUTEX_HELD(&pio->io_lock)); + + *zl = (*zl == NULL) ? list_head(cl) : list_next(cl, *zl); + if (*zl == NULL) + return (NULL); + + ASSERT((*zl)->zl_parent == pio); + return ((*zl)->zl_child); +} + +zio_t * +zio_unique_parent(zio_t *cio) +{ + zio_link_t *zl = NULL; + zio_t *pio = zio_walk_parents(cio, &zl); + + VERIFY3P(zio_walk_parents(cio, &zl), ==, NULL); + return (pio); +} + +void +zio_add_child(zio_t *pio, zio_t *cio) +{ + zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP); + + /* + * Logical I/Os can have logical, gang, or vdev children. + * Gang I/Os can have gang or vdev children. + * Vdev I/Os can only have vdev children. + * The following ASSERT captures all of these constraints. + */ + ASSERT3S(cio->io_child_type, <=, pio->io_child_type); + + zl->zl_parent = pio; + zl->zl_child = cio; + + mutex_enter(&pio->io_lock); + mutex_enter(&cio->io_lock); + + ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0); + + for (int w = 0; w < ZIO_WAIT_TYPES; w++) + pio->io_children[cio->io_child_type][w] += !cio->io_state[w]; + + list_insert_head(&pio->io_child_list, zl); + list_insert_head(&cio->io_parent_list, zl); + + pio->io_child_count++; + cio->io_parent_count++; + + mutex_exit(&cio->io_lock); + mutex_exit(&pio->io_lock); +} + +static void +zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl) +{ + ASSERT(zl->zl_parent == pio); + ASSERT(zl->zl_child == cio); + + mutex_enter(&pio->io_lock); + mutex_enter(&cio->io_lock); + + list_remove(&pio->io_child_list, zl); + list_remove(&cio->io_parent_list, zl); + + pio->io_child_count--; + cio->io_parent_count--; + + mutex_exit(&cio->io_lock); + mutex_exit(&pio->io_lock); + kmem_cache_free(zio_link_cache, zl); +} + +static boolean_t +zio_wait_for_children(zio_t *zio, uint8_t childbits, enum zio_wait_type wait) +{ + boolean_t waiting = B_FALSE; + + mutex_enter(&zio->io_lock); + ASSERT(zio->io_stall == NULL); + for (int c = 0; c < ZIO_CHILD_TYPES; c++) { + if (!(ZIO_CHILD_BIT_IS_SET(childbits, c))) + continue; + + uint64_t *countp = &zio->io_children[c][wait]; + if (*countp != 0) { + zio->io_stage >>= 1; + ASSERT3U(zio->io_stage, !=, ZIO_STAGE_OPEN); + zio->io_stall = countp; + waiting = B_TRUE; + break; + } + } + mutex_exit(&zio->io_lock); + return (waiting); +} + +static void +zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait, + zio_t **next_to_executep) +{ + uint64_t *countp = &pio->io_children[zio->io_child_type][wait]; + int *errorp = &pio->io_child_error[zio->io_child_type]; + + mutex_enter(&pio->io_lock); + if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) + *errorp = zio_worst_error(*errorp, zio->io_error); + pio->io_reexecute |= zio->io_reexecute; + ASSERT3U(*countp, >, 0); + + (*countp)--; + + if (*countp == 0 && pio->io_stall == countp) { + zio_taskq_type_t type = + pio->io_stage < ZIO_STAGE_VDEV_IO_START ? ZIO_TASKQ_ISSUE : + ZIO_TASKQ_INTERRUPT; + pio->io_stall = NULL; + mutex_exit(&pio->io_lock); + + /* + * If we can tell the caller to execute this parent next, do + * so. Otherwise dispatch the parent zio as its own task. + * + * Having the caller execute the parent when possible reduces + * locking on the zio taskq's, reduces context switch + * overhead, and has no recursion penalty. 
Note that one + * read from disk typically causes at least 3 zio's: a + * zio_null(), the logical zio_read(), and then a physical + * zio. When the physical ZIO completes, we are able to call + * zio_done() on all 3 of these zio's from one invocation of + * zio_execute() by returning the parent back to + * zio_execute(). Since the parent isn't executed until this + * thread returns back to zio_execute(), the caller should do + * so promptly. + * + * In other cases, dispatching the parent prevents + * overflowing the stack when we have deeply nested + * parent-child relationships, as we do with the "mega zio" + * of writes for spa_sync(), and the chain of ZIL blocks. + */ + if (next_to_executep != NULL && *next_to_executep == NULL) { + *next_to_executep = pio; + } else { + zio_taskq_dispatch(pio, type, B_FALSE); + } + } else { + mutex_exit(&pio->io_lock); + } +} + +static void +zio_inherit_child_errors(zio_t *zio, enum zio_child c) +{ + if (zio->io_child_error[c] != 0 && zio->io_error == 0) + zio->io_error = zio->io_child_error[c]; +} + +int +zio_bookmark_compare(const void *x1, const void *x2) +{ + const zio_t *z1 = x1; + const zio_t *z2 = x2; + + if (z1->io_bookmark.zb_objset < z2->io_bookmark.zb_objset) + return (-1); + if (z1->io_bookmark.zb_objset > z2->io_bookmark.zb_objset) + return (1); + + if (z1->io_bookmark.zb_object < z2->io_bookmark.zb_object) + return (-1); + if (z1->io_bookmark.zb_object > z2->io_bookmark.zb_object) + return (1); + + if (z1->io_bookmark.zb_level < z2->io_bookmark.zb_level) + return (-1); + if (z1->io_bookmark.zb_level > z2->io_bookmark.zb_level) + return (1); + + if (z1->io_bookmark.zb_blkid < z2->io_bookmark.zb_blkid) + return (-1); + if (z1->io_bookmark.zb_blkid > z2->io_bookmark.zb_blkid) + return (1); + + if (z1 < z2) + return (-1); + if (z1 > z2) + return (1); + + return (0); +} + +/* + * ========================================================================== + * Create the various types of I/O (read, write, free, etc) + * ========================================================================== + */ +static zio_t * +zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, + abd_t *data, uint64_t lsize, uint64_t psize, zio_done_func_t *done, + void *private, zio_type_t type, zio_priority_t priority, + enum zio_flag flags, vdev_t *vd, uint64_t offset, + const zbookmark_phys_t *zb, enum zio_stage stage, enum zio_stage pipeline) +{ + zio_t *zio; + + IMPLY(type != ZIO_TYPE_FREE, psize <= SPA_MAXBLOCKSIZE); + ASSERT(P2PHASE(psize, SPA_MINBLOCKSIZE) == 0); + ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0); + + ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER)); + ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER)); + ASSERT(vd || stage == ZIO_STAGE_OPEN); + + IMPLY(lsize != psize, (flags & ZIO_FLAG_RAW) != 0); + + zio = kmem_cache_alloc(zio_cache, KM_SLEEP); + bzero(zio, sizeof (zio_t)); + + mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL); + + list_create(&zio->io_parent_list, sizeof (zio_link_t), + offsetof(zio_link_t, zl_parent_node)); + list_create(&zio->io_child_list, sizeof (zio_link_t), + offsetof(zio_link_t, zl_child_node)); + metaslab_trace_init(&zio->io_alloc_list); + + if (vd != NULL) + zio->io_child_type = ZIO_CHILD_VDEV; + else if (flags & ZIO_FLAG_GANG_CHILD) + zio->io_child_type = ZIO_CHILD_GANG; + else if (flags & ZIO_FLAG_DDT_CHILD) + zio->io_child_type = ZIO_CHILD_DDT; + else + zio->io_child_type = ZIO_CHILD_LOGICAL; + + if (bp != NULL) { + zio->io_bp = (blkptr_t *)bp; 
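+ /*
+ * Keep two private copies of the bp: io_bp_copy can stand in
+ * for the caller's bp (see the "so caller can free" case
+ * below), and io_bp_orig preserves the original contents so
+ * later pipeline stages can restore them.
+ */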
+ zio->io_bp_copy = *bp; + zio->io_bp_orig = *bp; + if (type != ZIO_TYPE_WRITE || + zio->io_child_type == ZIO_CHILD_DDT) + zio->io_bp = &zio->io_bp_copy; /* so caller can free */ + if (zio->io_child_type == ZIO_CHILD_LOGICAL) + zio->io_logical = zio; + if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp)) + pipeline |= ZIO_GANG_STAGES; + } + + zio->io_spa = spa; + zio->io_txg = txg; + zio->io_done = done; + zio->io_private = private; + zio->io_type = type; + zio->io_priority = priority; + zio->io_vd = vd; + zio->io_offset = offset; + zio->io_orig_abd = zio->io_abd = data; + zio->io_orig_size = zio->io_size = psize; + zio->io_lsize = lsize; + zio->io_orig_flags = zio->io_flags = flags; + zio->io_orig_stage = zio->io_stage = stage; + zio->io_orig_pipeline = zio->io_pipeline = pipeline; + zio->io_pipeline_trace = ZIO_STAGE_OPEN; + + zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY); + zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE); + + if (zb != NULL) + zio->io_bookmark = *zb; + + if (pio != NULL) { + if (zio->io_logical == NULL) + zio->io_logical = pio->io_logical; + if (zio->io_child_type == ZIO_CHILD_GANG) + zio->io_gang_leader = pio->io_gang_leader; + zio_add_child(pio, zio); + } + + return (zio); +} + +static void +zio_destroy(zio_t *zio) +{ + metaslab_trace_fini(&zio->io_alloc_list); + list_destroy(&zio->io_parent_list); + list_destroy(&zio->io_child_list); + mutex_destroy(&zio->io_lock); + cv_destroy(&zio->io_cv); + kmem_cache_free(zio_cache, zio); +} + +zio_t * +zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done, + void *private, enum zio_flag flags) +{ + zio_t *zio; + + zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private, + ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, + ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE); + + return (zio); +} + +zio_t * +zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags) +{ + return (zio_null(NULL, spa, NULL, done, private, flags)); +} + +void +zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp) +{ + if (!DMU_OT_IS_VALID(BP_GET_TYPE(bp))) { + zfs_panic_recover("blkptr at %p has invalid TYPE %llu", + bp, (longlong_t)BP_GET_TYPE(bp)); + } + if (BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS || + BP_GET_CHECKSUM(bp) <= ZIO_CHECKSUM_ON) { + zfs_panic_recover("blkptr at %p has invalid CHECKSUM %llu", + bp, (longlong_t)BP_GET_CHECKSUM(bp)); + } + if (BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_FUNCTIONS || + BP_GET_COMPRESS(bp) <= ZIO_COMPRESS_ON) { + zfs_panic_recover("blkptr at %p has invalid COMPRESS %llu", + bp, (longlong_t)BP_GET_COMPRESS(bp)); + } + if (BP_GET_LSIZE(bp) > SPA_MAXBLOCKSIZE) { + zfs_panic_recover("blkptr at %p has invalid LSIZE %llu", + bp, (longlong_t)BP_GET_LSIZE(bp)); + } + if (BP_GET_PSIZE(bp) > SPA_MAXBLOCKSIZE) { + zfs_panic_recover("blkptr at %p has invalid PSIZE %llu", + bp, (longlong_t)BP_GET_PSIZE(bp)); + } + + if (BP_IS_EMBEDDED(bp)) { + if (BPE_GET_ETYPE(bp) > NUM_BP_EMBEDDED_TYPES) { + zfs_panic_recover("blkptr at %p has invalid ETYPE %llu", + bp, (longlong_t)BPE_GET_ETYPE(bp)); + } + } + + /* + * Do not verify individual DVAs if the config is not trusted. This + * will be done once the zio is executed in vdev_mirror_map_alloc. + */ + if (!spa->spa_trust_config) + return; + + /* + * Pool-specific checks. + * + * Note: it would be nice to verify that the blk_birth and + * BP_PHYSICAL_BIRTH() are not too large. However, spa_freeze() + * allows the birth time of log blocks (and dmu_sync()-ed blocks + * that are in the log) to be arbitrarily large. 
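+ *
+ * Each DVA is checked below: it must name an existing, non-hole
+ * vdev, and offset + asize must fall within that vdev's asize
+ * (except for "missing" vdevs, whose detailed info is unknown).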
+ */ + for (int i = 0; i < BP_GET_NDVAS(bp); i++) { + uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[i]); + if (vdevid >= spa->spa_root_vdev->vdev_children) { + zfs_panic_recover("blkptr at %p DVA %u has invalid " + "VDEV %llu", + bp, i, (longlong_t)vdevid); + continue; + } + vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid]; + if (vd == NULL) { + zfs_panic_recover("blkptr at %p DVA %u has invalid " + "VDEV %llu", + bp, i, (longlong_t)vdevid); + continue; + } + if (vd->vdev_ops == &vdev_hole_ops) { + zfs_panic_recover("blkptr at %p DVA %u has hole " + "VDEV %llu", + bp, i, (longlong_t)vdevid); + continue; + } + if (vd->vdev_ops == &vdev_missing_ops) { + /* + * "missing" vdevs are valid during import, but we + * don't have their detailed info (e.g. asize), so + * we can't perform any more checks on them. + */ + continue; + } + uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]); + uint64_t asize = DVA_GET_ASIZE(&bp->blk_dva[i]); + if (BP_IS_GANG(bp)) + asize = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); + if (offset + asize > vd->vdev_asize) { + zfs_panic_recover("blkptr at %p DVA %u has invalid " + "OFFSET %llu", + bp, i, (longlong_t)offset); + } + } +} + +boolean_t +zfs_dva_valid(spa_t *spa, const dva_t *dva, const blkptr_t *bp) +{ + uint64_t vdevid = DVA_GET_VDEV(dva); + + if (vdevid >= spa->spa_root_vdev->vdev_children) + return (B_FALSE); + + vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid]; + if (vd == NULL) + return (B_FALSE); + + if (vd->vdev_ops == &vdev_hole_ops) + return (B_FALSE); + + if (vd->vdev_ops == &vdev_missing_ops) { + return (B_FALSE); + } + + uint64_t offset = DVA_GET_OFFSET(dva); + uint64_t asize = DVA_GET_ASIZE(dva); + + if (BP_IS_GANG(bp)) + asize = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); + if (offset + asize > vd->vdev_asize) + return (B_FALSE); + + return (B_TRUE); +} + +zio_t * +zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, + abd_t *data, uint64_t size, zio_done_func_t *done, void *private, + zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb) +{ + zio_t *zio; + + zfs_blkptr_verify(spa, bp); + + zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp, + data, size, size, done, private, + ZIO_TYPE_READ, priority, flags, NULL, 0, zb, + ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? + ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE); + + return (zio); +} + +zio_t * +zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, + abd_t *data, uint64_t lsize, uint64_t psize, const zio_prop_t *zp, + zio_done_func_t *ready, zio_done_func_t *children_ready, + zio_done_func_t *physdone, zio_done_func_t *done, + void *private, zio_priority_t priority, enum zio_flag flags, + const zbookmark_phys_t *zb) +{ + zio_t *zio; + + ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF && + zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS && + zp->zp_compress >= ZIO_COMPRESS_OFF && + zp->zp_compress < ZIO_COMPRESS_FUNCTIONS && + DMU_OT_IS_VALID(zp->zp_type) && + zp->zp_level < 32 && + zp->zp_copies > 0 && + zp->zp_copies <= spa_max_replication(spa)); + + zio = zio_create(pio, spa, txg, bp, data, lsize, psize, done, private, + ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, + ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? + ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE); + + zio->io_ready = ready; + zio->io_children_ready = children_ready; + zio->io_physdone = physdone; + zio->io_prop = *zp; + + /* + * Data can be NULL if we are going to call zio_write_override() to + * provide the already-allocated BP. But we may need the data to + * verify a dedup hit (if requested). 
In this case, don't try to + * dedup (just take the already-allocated BP verbatim). + */ + if (data == NULL && zio->io_prop.zp_dedup_verify) { + zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE; + } + + return (zio); +} + +zio_t * +zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data, + uint64_t size, zio_done_func_t *done, void *private, + zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb) +{ + zio_t *zio; + + zio = zio_create(pio, spa, txg, bp, data, size, size, done, private, + ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_IO_REWRITE, NULL, 0, zb, + ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE); + + return (zio); +} + +void +zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite) +{ + ASSERT(zio->io_type == ZIO_TYPE_WRITE); + ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); + ASSERT(zio->io_stage == ZIO_STAGE_OPEN); + ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa)); + + /* + * We must reset the io_prop to match the values that existed + * when the bp was first written by dmu_sync() keeping in mind + * that nopwrite and dedup are mutually exclusive. + */ + zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup; + zio->io_prop.zp_nopwrite = nopwrite; + zio->io_prop.zp_copies = copies; + zio->io_bp_override = bp; +} + +void +zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp) +{ + + zfs_blkptr_verify(spa, bp); + + /* + * The check for EMBEDDED is a performance optimization. We + * process the free here (by ignoring it) rather than + * putting it on the list and then processing it in zio_free_sync(). + */ + if (BP_IS_EMBEDDED(bp)) + return; + metaslab_check_free(spa, bp); + + /* + * Frees that are for the currently-syncing txg, are not going to be + * deferred, and which will not need to do a read (i.e. not GANG or + * DEDUP), can be processed immediately. Otherwise, put them on the + * in-memory list for later processing. + */ + if (zfs_trim_enabled || BP_IS_GANG(bp) || BP_GET_DEDUP(bp) || + txg != spa->spa_syncing_txg || + spa_sync_pass(spa) >= zfs_sync_pass_deferred_free) { + bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp); + } else { + VERIFY0(zio_wait(zio_free_sync(NULL, spa, txg, bp, + BP_GET_PSIZE(bp), 0))); + } +} + +zio_t * +zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, + uint64_t size, enum zio_flag flags) +{ + zio_t *zio; + enum zio_stage stage = ZIO_FREE_PIPELINE; + + ASSERT(!BP_IS_HOLE(bp)); + ASSERT(spa_syncing_txg(spa) == txg); + ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free); + + if (BP_IS_EMBEDDED(bp)) + return (zio_null(pio, spa, NULL, NULL, NULL, 0)); + + metaslab_check_free(spa, bp); + arc_freed(spa, bp); + dsl_scan_freed(spa, bp); + + if (zfs_trim_enabled) + stage |= ZIO_STAGE_ISSUE_ASYNC | ZIO_STAGE_VDEV_IO_START | + ZIO_STAGE_VDEV_IO_ASSESS; + /* + * GANG and DEDUP blocks can induce a read (for the gang block header, + * or the DDT), so issue them asynchronously so that this thread is + * not tied up. 
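+ * (ZIO_STAGE_ISSUE_ASYNC hands the free off to an issue taskq
+ * thread, so any such read happens off the caller's context.)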
+ */ + else if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp)) + stage |= ZIO_STAGE_ISSUE_ASYNC; + + flags |= ZIO_FLAG_DONT_QUEUE; + + zio = zio_create(pio, spa, txg, bp, NULL, size, + size, NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, + flags, NULL, 0, NULL, ZIO_STAGE_OPEN, stage); + + return (zio); +} + +zio_t * +zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, + zio_done_func_t *done, void *private, enum zio_flag flags) +{ + zio_t *zio; + + zfs_blkptr_verify(spa, bp); + + if (BP_IS_EMBEDDED(bp)) + return (zio_null(pio, spa, NULL, NULL, NULL, 0)); + + /* + * A claim is an allocation of a specific block. Claims are needed + * to support immediate writes in the intent log. The issue is that + * immediate writes contain committed data, but in a txg that was + * *not* committed. Upon opening the pool after an unclean shutdown, + * the intent log claims all blocks that contain immediate write data + * so that the SPA knows they're in use. + * + * All claims *must* be resolved in the first txg -- before the SPA + * starts allocating blocks -- so that nothing is allocated twice. + * If txg == 0 we just verify that the block is claimable. + */ + ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, + spa_min_claim_txg(spa)); + ASSERT(txg == spa_min_claim_txg(spa) || txg == 0); + ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(1M) */ + + zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), + BP_GET_PSIZE(bp), done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, + flags, NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE); + ASSERT0(zio->io_queued_timestamp); + + return (zio); +} + +zio_t * +zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, uint64_t offset, + uint64_t size, zio_done_func_t *done, void *private, + zio_priority_t priority, enum zio_flag flags) +{ + zio_t *zio; + int c; + + if (vd->vdev_children == 0) { + zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private, + ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, + ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); + + zio->io_cmd = cmd; + } else { + zio = zio_null(pio, spa, NULL, NULL, NULL, flags); + + for (c = 0; c < vd->vdev_children; c++) + zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd, + offset, size, done, private, priority, flags)); + } + + return (zio); +} + +zio_t * +zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, + abd_t *data, int checksum, zio_done_func_t *done, void *private, + zio_priority_t priority, enum zio_flag flags, boolean_t labels) +{ + zio_t *zio; + + ASSERT(vd->vdev_children == 0); + ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || + offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); + ASSERT3U(offset + size, <=, vd->vdev_psize); + + zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size, done, + private, ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd, + offset, NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE); + + zio->io_prop.zp_checksum = checksum; + + return (zio); +} + +zio_t * +zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, + abd_t *data, int checksum, zio_done_func_t *done, void *private, + zio_priority_t priority, enum zio_flag flags, boolean_t labels) +{ + zio_t *zio; + + ASSERT(vd->vdev_children == 0); + ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || + offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); + ASSERT3U(offset + size, <=, vd->vdev_psize); + + zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size, done, + private, ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd, + offset, 
NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE); + + zio->io_prop.zp_checksum = checksum; + + if (zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_EMBEDDED) { + /* + * zec checksums are necessarily destructive -- they modify + * the end of the write buffer to hold the verifier/checksum. + * Therefore, we must make a local copy in case the data is + * being written to multiple places in parallel. + */ + abd_t *wbuf = abd_alloc_sametype(data, size); + abd_copy(wbuf, data, size); + + zio_push_transform(zio, wbuf, size, size, NULL); + } + + return (zio); +} + +/* + * Create a child I/O to do some work for us. + */ +zio_t * +zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, + abd_t *data, uint64_t size, int type, zio_priority_t priority, + enum zio_flag flags, zio_done_func_t *done, void *private) +{ + enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE; + zio_t *zio; + + /* + * vdev child I/Os do not propagate their error to the parent. + * Therefore, for correct operation the caller *must* check for + * and handle the error in the child i/o's done callback. + * The only exceptions are i/os that we don't care about + * (OPTIONAL or REPAIR). + */ + ASSERT((flags & ZIO_FLAG_OPTIONAL) || (flags & ZIO_FLAG_IO_REPAIR) || + done != NULL); + + if (type == ZIO_TYPE_READ && bp != NULL) { + /* + * If we have the bp, then the child should perform the + * checksum and the parent need not. This pushes error + * detection as close to the leaves as possible and + * eliminates redundant checksums in the interior nodes. + */ + pipeline |= ZIO_STAGE_CHECKSUM_VERIFY; + pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; + } + + /* Not all IO types require vdev io done stage e.g. free */ + if (type == ZIO_TYPE_FREE && + !(pio->io_pipeline & ZIO_STAGE_VDEV_IO_DONE)) + pipeline &= ~ZIO_STAGE_VDEV_IO_DONE; + + if (vd->vdev_ops->vdev_op_leaf) { + ASSERT0(vd->vdev_children); + offset += VDEV_LABEL_START_SIZE; + } + + flags |= ZIO_VDEV_CHILD_FLAGS(pio); + + /* + * If we've decided to do a repair, the write is not speculative -- + * even if the original read was. + */ + if (flags & ZIO_FLAG_IO_REPAIR) + flags &= ~ZIO_FLAG_SPECULATIVE; + + /* + * If we're creating a child I/O that is not associated with a + * top-level vdev, then the child zio is not an allocating I/O. + * If this is a retried I/O then we ignore it since we will + * have already processed the original allocating I/O. 
+ */ + if (flags & ZIO_FLAG_IO_ALLOCATING && + (vd != vd->vdev_top || (flags & ZIO_FLAG_IO_RETRY))) { + metaslab_class_t *mc = spa_normal_class(pio->io_spa); + + ASSERT(mc->mc_alloc_throttle_enabled); + ASSERT(type == ZIO_TYPE_WRITE); + ASSERT(priority == ZIO_PRIORITY_ASYNC_WRITE); + ASSERT(!(flags & ZIO_FLAG_IO_REPAIR)); + ASSERT(!(pio->io_flags & ZIO_FLAG_IO_REWRITE) || + pio->io_child_type == ZIO_CHILD_GANG); + + flags &= ~ZIO_FLAG_IO_ALLOCATING; + } + + zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, size, + done, private, type, priority, flags, vd, offset, &pio->io_bookmark, + ZIO_STAGE_VDEV_IO_START >> 1, pipeline); + ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV); + + zio->io_physdone = pio->io_physdone; + if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL) + zio->io_logical->io_phys_children++; + + return (zio); +} + +zio_t * +zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size, + zio_type_t type, zio_priority_t priority, enum zio_flag flags, + zio_done_func_t *done, void *private) +{ + zio_t *zio; + + ASSERT(vd->vdev_ops->vdev_op_leaf); + + zio = zio_create(NULL, vd->vdev_spa, 0, NULL, + data, size, size, done, private, type, priority, + flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED, + vd, offset, NULL, + ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE); + + return (zio); +} + +void +zio_flush(zio_t *zio, vdev_t *vd) +{ + zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, 0, 0, + NULL, NULL, ZIO_PRIORITY_NOW, + ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY)); +} + +zio_t * +zio_trim(zio_t *zio, spa_t *spa, vdev_t *vd, uint64_t offset, uint64_t size) +{ + + ASSERT(vd->vdev_ops->vdev_op_leaf); + + return (zio_create(zio, spa, 0, NULL, NULL, size, size, NULL, NULL, + ZIO_TYPE_FREE, ZIO_PRIORITY_TRIM, ZIO_FLAG_DONT_AGGREGATE | + ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, + vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PHYS_PIPELINE)); +} + +void +zio_shrink(zio_t *zio, uint64_t size) +{ + ASSERT3P(zio->io_executor, ==, NULL); + ASSERT3P(zio->io_orig_size, ==, zio->io_size); + ASSERT3U(size, <=, zio->io_size); + + /* + * We don't shrink for raidz because of problems with the + * reconstruction when reading back less than the block size. + * Note, BP_IS_RAIDZ() assumes no compression. + */ + ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF); + if (!BP_IS_RAIDZ(zio->io_bp)) { + /* we are not doing a raw write */ + ASSERT3U(zio->io_size, ==, zio->io_lsize); + zio->io_orig_size = zio->io_size = zio->io_lsize = size; + } +} + +/* + * ========================================================================== + * Prepare to read and write logical blocks + * ========================================================================== + */ + +static zio_t * +zio_read_bp_init(zio_t *zio) +{ + blkptr_t *bp = zio->io_bp; + + ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy); + + if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && + zio->io_child_type == ZIO_CHILD_LOGICAL && + !(zio->io_flags & ZIO_FLAG_RAW)) { + uint64_t psize = + BP_IS_EMBEDDED(bp) ? 
BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp); + zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize), + psize, psize, zio_decompress); + } + + if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) { + zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; + + int psize = BPE_GET_PSIZE(bp); + void *data = abd_borrow_buf(zio->io_abd, psize); + decode_embedded_bp_compressed(bp, data); + abd_return_buf_copy(zio->io_abd, data, psize); + } else { + ASSERT(!BP_IS_EMBEDDED(bp)); + ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy); + } + + if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0) + zio->io_flags |= ZIO_FLAG_DONT_CACHE; + + if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP) + zio->io_flags |= ZIO_FLAG_DONT_CACHE; + + if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL) + zio->io_pipeline = ZIO_DDT_READ_PIPELINE; + + return (zio); +} + +static zio_t * +zio_write_bp_init(zio_t *zio) +{ + if (!IO_IS_ALLOCATING(zio)) + return (zio); + + ASSERT(zio->io_child_type != ZIO_CHILD_DDT); + + if (zio->io_bp_override) { + blkptr_t *bp = zio->io_bp; + zio_prop_t *zp = &zio->io_prop; + + ASSERT(bp->blk_birth != zio->io_txg); + ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0); + + *bp = *zio->io_bp_override; + zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; + + if (BP_IS_EMBEDDED(bp)) + return (zio); + + /* + * If we've been overridden and nopwrite is set then + * set the flag accordingly to indicate that a nopwrite + * has already occurred. + */ + if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) { + ASSERT(!zp->zp_dedup); + ASSERT3U(BP_GET_CHECKSUM(bp), ==, zp->zp_checksum); + zio->io_flags |= ZIO_FLAG_NOPWRITE; + return (zio); + } + + ASSERT(!zp->zp_nopwrite); + + if (BP_IS_HOLE(bp) || !zp->zp_dedup) + return (zio); + + ASSERT((zio_checksum_table[zp->zp_checksum].ci_flags & + ZCHECKSUM_FLAG_DEDUP) || zp->zp_dedup_verify); + + if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) { + BP_SET_DEDUP(bp, 1); + zio->io_pipeline |= ZIO_STAGE_DDT_WRITE; + return (zio); + } + + /* + * We were unable to handle this as an override bp, treat + * it as a regular write I/O. + */ + zio->io_bp_override = NULL; + *bp = zio->io_bp_orig; + zio->io_pipeline = zio->io_orig_pipeline; + } + + return (zio); +} + +static zio_t * +zio_write_compress(zio_t *zio) +{ + spa_t *spa = zio->io_spa; + zio_prop_t *zp = &zio->io_prop; + enum zio_compress compress = zp->zp_compress; + blkptr_t *bp = zio->io_bp; + uint64_t lsize = zio->io_lsize; + uint64_t psize = zio->io_size; + int pass = 1; + + EQUIV(lsize != psize, (zio->io_flags & ZIO_FLAG_RAW) != 0); + + /* + * If our children haven't all reached the ready stage, + * wait for them and then repeat this pipeline stage. + */ + if (zio_wait_for_children(zio, ZIO_CHILD_LOGICAL_BIT | + ZIO_CHILD_GANG_BIT, ZIO_WAIT_READY)) { + return (NULL); + } + + if (!IO_IS_ALLOCATING(zio)) + return (zio); + + if (zio->io_children_ready != NULL) { + /* + * Now that all our children are ready, run the callback + * associated with this zio in case it wants to modify the + * data to be written. + */ + ASSERT3U(zp->zp_level, >, 0); + zio->io_children_ready(zio); + } + + ASSERT(zio->io_child_type != ZIO_CHILD_DDT); + ASSERT(zio->io_bp_override == NULL); + + if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) { + /* + * We're rewriting an existing block, which means we're + * working on behalf of spa_sync(). For spa_sync() to + * converge, it must eventually be the case that we don't + * have to allocate new blocks. 
But compression changes
+ * the blocksize, which forces a reallocate, and makes
+ * convergence take longer. Therefore, after the first
+ * few passes, stop compressing to ensure convergence.
+ */
+ pass = spa_sync_pass(spa);
+
+ ASSERT(zio->io_txg == spa_syncing_txg(spa));
+ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+ ASSERT(!BP_GET_DEDUP(bp));
+
+ if (pass >= zfs_sync_pass_dont_compress)
+ compress = ZIO_COMPRESS_OFF;
+
+ /* Make sure someone doesn't change their mind on overwrites */
+ ASSERT(BP_IS_EMBEDDED(bp) || MIN(zp->zp_copies + BP_IS_GANG(bp),
+ spa_max_replication(spa)) == BP_GET_NDVAS(bp));
+ }
+
+ /* If it's a compressed write that is not raw, compress the buffer. */
+ if (compress != ZIO_COMPRESS_OFF && psize == lsize) {
+ void *cbuf = zio_buf_alloc(lsize);
+ psize = zio_compress_data(compress, zio->io_abd, cbuf, lsize);
+ if (psize == 0 || psize == lsize) {
+ compress = ZIO_COMPRESS_OFF;
+ zio_buf_free(cbuf, lsize);
+ } else if (!zp->zp_dedup && psize <= BPE_PAYLOAD_SIZE &&
+ zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) &&
+ spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) {
+ encode_embedded_bp_compressed(bp,
+ cbuf, compress, lsize, psize);
+ BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA);
+ BP_SET_TYPE(bp, zio->io_prop.zp_type);
+ BP_SET_LEVEL(bp, zio->io_prop.zp_level);
+ zio_buf_free(cbuf, lsize);
+ bp->blk_birth = zio->io_txg;
+ zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+ ASSERT(spa_feature_is_active(spa,
+ SPA_FEATURE_EMBEDDED_DATA));
+ return (zio);
+ } else {
+ /*
+ * Round the compressed size up to the ashift
+ * of the smallest-ashift device, and zero the tail.
+ * This ensures that the compressed size of the BP
+ * (and thus the compressratio property) is correct,
+ * in that we charge for the padding used to fill out
+ * the last sector.
+ */
+ ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT);
+ size_t rounded = (size_t)P2ROUNDUP(psize,
+ 1ULL << spa->spa_min_ashift);
+ if (rounded >= lsize) {
+ compress = ZIO_COMPRESS_OFF;
+ zio_buf_free(cbuf, lsize);
+ psize = lsize;
+ } else {
+ abd_t *cdata = abd_get_from_buf(cbuf, lsize);
+ abd_take_ownership_of_buf(cdata, B_TRUE);
+ abd_zero_off(cdata, psize, rounded - psize);
+ psize = rounded;
+ zio_push_transform(zio, cdata,
+ psize, lsize, NULL);
+ }
+ }
+
+ /*
+ * We were unable to handle this as an override bp, treat
+ * it as a regular write I/O.
+ */
+ zio->io_bp_override = NULL;
+ *bp = zio->io_bp_orig;
+ zio->io_pipeline = zio->io_orig_pipeline;
+ } else {
+ ASSERT3U(psize, !=, 0);
+ }
+
+ /*
+ * The final pass of spa_sync() must be all rewrites, but the first
+ * few passes offer a trade-off: allocating blocks defers convergence,
+ * but newly allocated blocks are sequential, so they can be written
+ * to disk faster. Therefore, we allow the first few passes of
+ * spa_sync() to allocate new blocks, but force rewrites after that.
+ * There should only be a handful of blocks after pass 1 in any case.
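+ *
+ * (With 'zfs_sync_pass_rewrite' at its default of 2, only pass 1
+ * allocates fresh blocks; later passes rewrite a block in place
+ * whenever it was already written this txg at the same psize.)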
+ */ + if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg && + BP_GET_PSIZE(bp) == psize && + pass >= zfs_sync_pass_rewrite) { + ASSERT(psize != 0); + enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES; + zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages; + zio->io_flags |= ZIO_FLAG_IO_REWRITE; + } else { + BP_ZERO(bp); + zio->io_pipeline = ZIO_WRITE_PIPELINE; + } + + if (psize == 0) { + if (zio->io_bp_orig.blk_birth != 0 && + spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) { + BP_SET_LSIZE(bp, lsize); + BP_SET_TYPE(bp, zp->zp_type); + BP_SET_LEVEL(bp, zp->zp_level); + BP_SET_BIRTH(bp, zio->io_txg, 0); + } + zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; + } else { + ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER); + BP_SET_LSIZE(bp, lsize); + BP_SET_TYPE(bp, zp->zp_type); + BP_SET_LEVEL(bp, zp->zp_level); + BP_SET_PSIZE(bp, psize); + BP_SET_COMPRESS(bp, compress); + BP_SET_CHECKSUM(bp, zp->zp_checksum); + BP_SET_DEDUP(bp, zp->zp_dedup); + BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); + if (zp->zp_dedup) { + ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); + ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); + zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE; + } + if (zp->zp_nopwrite) { + ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); + ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); + zio->io_pipeline |= ZIO_STAGE_NOP_WRITE; + } + } + return (zio); +} + +static zio_t * +zio_free_bp_init(zio_t *zio) +{ + blkptr_t *bp = zio->io_bp; + + if (zio->io_child_type == ZIO_CHILD_LOGICAL) { + if (BP_GET_DEDUP(bp)) + zio->io_pipeline = ZIO_DDT_FREE_PIPELINE; + } + + ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy); + + return (zio); +} + +/* + * ========================================================================== + * Execute the I/O pipeline + * ========================================================================== + */ + +static void +zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline) +{ + spa_t *spa = zio->io_spa; + zio_type_t t = zio->io_type; + int flags = (cutinline ? TQ_FRONT : 0); + + ASSERT(q == ZIO_TASKQ_ISSUE || q == ZIO_TASKQ_INTERRUPT); + + /* + * If we're a config writer or a probe, the normal issue and + * interrupt threads may all be blocked waiting for the config lock. + * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL. + */ + if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE)) + t = ZIO_TYPE_NULL; + + /* + * A similar issue exists for the L2ARC write thread until L2ARC 2.0. + */ + if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux) + t = ZIO_TYPE_NULL; + + /* + * If this is a high priority I/O, then use the high priority taskq if + * available. + */ + if (zio->io_priority == ZIO_PRIORITY_NOW && + spa->spa_zio_taskq[t][q + 1].stqs_count != 0) + q++; + + ASSERT3U(q, <, ZIO_TASKQ_TYPES); + + /* + * NB: We are assuming that the zio can only be dispatched + * to a single taskq at a time. It would be a grievous error + * to dispatch the zio to another taskq at the same time. 
+ */ +#if defined(illumos) || !defined(_KERNEL) + ASSERT(zio->io_tqent.tqent_next == NULL); +#else + ASSERT(zio->io_tqent.tqent_task.ta_pending == 0); +#endif + spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio, + flags, &zio->io_tqent); +} + +static boolean_t +zio_taskq_member(zio_t *zio, zio_taskq_type_t q) +{ + kthread_t *executor = zio->io_executor; + spa_t *spa = zio->io_spa; + + for (zio_type_t t = 0; t < ZIO_TYPES; t++) { + spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; + uint_t i; + for (i = 0; i < tqs->stqs_count; i++) { + if (taskq_member(tqs->stqs_taskq[i], executor)) + return (B_TRUE); + } + } + + return (B_FALSE); +} + +static zio_t * +zio_issue_async(zio_t *zio) +{ + zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); + + return (NULL); +} + +void +zio_interrupt(zio_t *zio) +{ + zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE); +} + +void +zio_delay_interrupt(zio_t *zio) +{ + /* + * The timeout_generic() function isn't defined in userspace, so + * rather than trying to implement the function, the zio delay + * functionality has been disabled for userspace builds. + */ + +#ifdef _KERNEL + /* + * If io_target_timestamp is zero, then no delay has been registered + * for this IO, thus jump to the end of this function and "skip" the + * delay; issuing it directly to the zio layer. + */ + if (zio->io_target_timestamp != 0) { + hrtime_t now = gethrtime(); + + if (now >= zio->io_target_timestamp) { + /* + * This IO has already taken longer than the target + * delay to complete, so we don't want to delay it + * any longer; we "miss" the delay and issue it + * directly to the zio layer. This is likely due to + * the target latency being set to a value less than + * the underlying hardware can satisfy (e.g. delay + * set to 1ms, but the disks take 10ms to complete an + * IO request). + */ + + DTRACE_PROBE2(zio__delay__miss, zio_t *, zio, + hrtime_t, now); + + zio_interrupt(zio); + } else { + hrtime_t diff = zio->io_target_timestamp - now; + + DTRACE_PROBE3(zio__delay__hit, zio_t *, zio, + hrtime_t, now, hrtime_t, diff); + + (void) timeout_generic(CALLOUT_NORMAL, + (void (*)(void *))zio_interrupt, zio, diff, 1, 0); + } + + return; + } +#endif + + DTRACE_PROBE1(zio__delay__skip, zio_t *, zio); + zio_interrupt(zio); +} + +/* + * Execute the I/O pipeline until one of the following occurs: + * + * (1) the I/O completes + * (2) the pipeline stalls waiting for dependent child I/Os + * (3) the I/O issues, so we're waiting for an I/O completion interrupt + * (4) the I/O is delegated by vdev-level caching or aggregation + * (5) the I/O is deferred due to vdev-level queueing + * (6) the I/O is handed off to another thread. + * + * In all cases, the pipeline stops whenever there's no CPU work; it never + * burns a thread in cv_wait(). + * + * There's no locking on io_stage because there's no legitimate way + * for multiple threads to be attempting to process the same I/O. 
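+ *
+ * (Illustrative walk-through: a typical read runs from OPEN up
+ * through ZIO_STAGE_VDEV_IO_START on the issuing thread, parks
+ * per (3) until the completion interrupt, and then runs its
+ * remaining stages through ZIO_STAGE_DONE on a taskq thread.)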
+ */ +static zio_pipe_stage_t *zio_pipeline[]; + +void +zio_execute(zio_t *zio) +{ + ASSERT3U(zio->io_queued_timestamp, >, 0); + + while (zio->io_stage < ZIO_STAGE_DONE) { + enum zio_stage pipeline = zio->io_pipeline; + enum zio_stage stage = zio->io_stage; + + zio->io_executor = curthread; + + ASSERT(!MUTEX_HELD(&zio->io_lock)); + ASSERT(ISP2(stage)); + ASSERT(zio->io_stall == NULL); + + do { + stage <<= 1; + } while ((stage & pipeline) == 0); + + ASSERT(stage <= ZIO_STAGE_DONE); + + /* + * If we are in interrupt context and this pipeline stage + * will grab a config lock that is held across I/O, + * or may wait for an I/O that needs an interrupt thread + * to complete, issue async to avoid deadlock. + * + * For VDEV_IO_START, we cut in line so that the io will + * be sent to disk promptly. + */ + if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL && + zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) { + boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ? + zio_requeue_io_start_cut_in_line : B_FALSE; + zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut); + return; + } + + zio->io_stage = stage; + zio->io_pipeline_trace |= zio->io_stage; + + /* + * The zio pipeline stage returns the next zio to execute + * (typically the same as this one), or NULL if we should + * stop. + */ + zio = zio_pipeline[highbit64(stage) - 1](zio); + + if (zio == NULL) + return; + } +} + +/* + * ========================================================================== + * Initiate I/O, either sync or async + * ========================================================================== + */ +int +zio_wait(zio_t *zio) +{ + int error; + + ASSERT3P(zio->io_stage, ==, ZIO_STAGE_OPEN); + ASSERT3P(zio->io_executor, ==, NULL); + + zio->io_waiter = curthread; + ASSERT0(zio->io_queued_timestamp); + zio->io_queued_timestamp = gethrtime(); + + zio_execute(zio); + + mutex_enter(&zio->io_lock); + while (zio->io_executor != NULL) + cv_wait(&zio->io_cv, &zio->io_lock); + mutex_exit(&zio->io_lock); + + error = zio->io_error; + zio_destroy(zio); + + return (error); +} + +void +zio_nowait(zio_t *zio) +{ + ASSERT3P(zio->io_executor, ==, NULL); + + if (zio->io_child_type == ZIO_CHILD_LOGICAL && + zio_unique_parent(zio) == NULL) { + /* + * This is a logical async I/O with no parent to wait for it. + * We add it to the spa_async_root_zio "Godfather" I/O which + * will ensure they complete prior to unloading the pool. 
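+ * ("Godfather" zios are root zios that the spa code creates when the
+ * pool is activated, one per CPU slot, flagged ZIO_FLAG_GODFATHER
+ * and waited on at pool unload; indexing by CPU_SEQID just spreads
+ * the io_lock contention across them.)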
+ */ + spa_t *spa = zio->io_spa; + + zio_add_child(spa->spa_async_zio_root[CPU_SEQID], zio); + } + + ASSERT0(zio->io_queued_timestamp); + zio->io_queued_timestamp = gethrtime(); + zio_execute(zio); +} + +/* + * ========================================================================== + * Reexecute, cancel, or suspend/resume failed I/O + * ========================================================================== + */ + +static void +zio_reexecute(zio_t *pio) +{ + zio_t *cio, *cio_next; + + ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL); + ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN); + ASSERT(pio->io_gang_leader == NULL); + ASSERT(pio->io_gang_tree == NULL); + + pio->io_flags = pio->io_orig_flags; + pio->io_stage = pio->io_orig_stage; + pio->io_pipeline = pio->io_orig_pipeline; + pio->io_reexecute = 0; + pio->io_flags |= ZIO_FLAG_REEXECUTED; + pio->io_pipeline_trace = 0; + pio->io_error = 0; + for (int w = 0; w < ZIO_WAIT_TYPES; w++) + pio->io_state[w] = 0; + for (int c = 0; c < ZIO_CHILD_TYPES; c++) + pio->io_child_error[c] = 0; + + if (IO_IS_ALLOCATING(pio)) + BP_ZERO(pio->io_bp); + + /* + * As we reexecute pio's children, new children could be created. + * New children go to the head of pio's io_child_list, however, + * so we will (correctly) not reexecute them. The key is that + * the remainder of pio's io_child_list, from 'cio_next' onward, + * cannot be affected by any side effects of reexecuting 'cio'. + */ + zio_link_t *zl = NULL; + mutex_enter(&pio->io_lock); + for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) { + cio_next = zio_walk_children(pio, &zl); + for (int w = 0; w < ZIO_WAIT_TYPES; w++) + pio->io_children[cio->io_child_type][w]++; + mutex_exit(&pio->io_lock); + zio_reexecute(cio); + mutex_enter(&pio->io_lock); + } + mutex_exit(&pio->io_lock); + + /* + * Now that all children have been reexecuted, execute the parent. + * We don't reexecute "The Godfather" I/O here as it's the + * responsibility of the caller to wait on it. + */ + if (!(pio->io_flags & ZIO_FLAG_GODFATHER)) { + pio->io_queued_timestamp = gethrtime(); + zio_execute(pio); + } +} + +void +zio_suspend(spa_t *spa, zio_t *zio) +{ + if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC) + fm_panic("Pool '%s' has encountered an uncorrectable I/O " + "failure and the failure mode property for this pool " + "is set to panic.", spa_name(spa)); + + zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0); + + mutex_enter(&spa->spa_suspend_lock); + + if (spa->spa_suspend_zio_root == NULL) + spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | + ZIO_FLAG_GODFATHER); + + spa->spa_suspended = B_TRUE; + + if (zio != NULL) { + ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); + ASSERT(zio != spa->spa_suspend_zio_root); + ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); + ASSERT(zio_unique_parent(zio) == NULL); + ASSERT(zio->io_stage == ZIO_STAGE_DONE); + zio_add_child(spa->spa_suspend_zio_root, zio); + } + + mutex_exit(&spa->spa_suspend_lock); +} + +int +zio_resume(spa_t *spa) +{ + zio_t *pio; + + /* + * Reexecute all previously suspended i/o. 
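+ * Every zio that failed while the pool was suspended was attached as
+ * a child of the suspend root in zio_suspend() above, so reexecuting
+ * that one "Godfather" zio and waiting on it retries the entire
+ * backlog.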
+ */ + mutex_enter(&spa->spa_suspend_lock); + spa->spa_suspended = B_FALSE; + cv_broadcast(&spa->spa_suspend_cv); + pio = spa->spa_suspend_zio_root; + spa->spa_suspend_zio_root = NULL; + mutex_exit(&spa->spa_suspend_lock); + + if (pio == NULL) + return (0); + + zio_reexecute(pio); + return (zio_wait(pio)); +} + +void +zio_resume_wait(spa_t *spa) +{ + mutex_enter(&spa->spa_suspend_lock); + while (spa_suspended(spa)) + cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock); + mutex_exit(&spa->spa_suspend_lock); +} + +/* + * ========================================================================== + * Gang blocks. + * + * A gang block is a collection of small blocks that looks to the DMU + * like one large block. When zio_dva_allocate() cannot find a block + * of the requested size, due to either severe fragmentation or the pool + * being nearly full, it calls zio_write_gang_block() to construct the + * block from smaller fragments. + * + * A gang block consists of a gang header (zio_gbh_phys_t) and up to + * three (SPA_GBH_NBLKPTRS) gang members. The gang header is just like + * an indirect block: it's an array of block pointers. It consumes + * only one sector and hence is allocatable regardless of fragmentation. + * The gang header's bps point to its gang members, which hold the data. + * + * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg> + * as the verifier to ensure uniqueness of the SHA256 checksum. + * Critically, the gang block bp's blk_cksum is the checksum of the data, + * not the gang header. This ensures that data block signatures (needed for + * deduplication) are independent of how the block is physically stored. + * + * Gang blocks can be nested: a gang member may itself be a gang block. + * Thus every gang block is a tree in which root and all interior nodes are + * gang headers, and the leaves are normal blocks that contain user data. + * The root of the gang tree is called the gang leader. + * + * To perform any operation (read, rewrite, free, claim) on a gang block, + * zio_gang_assemble() first assembles the gang tree (minus data leaves) + * in the io_gang_tree field of the original logical i/o by recursively + * reading the gang leader and all gang headers below it. This yields + * an in-core tree containing the contents of every gang header and the + * bps for every constituent of the gang block. + * + * With the gang tree now assembled, zio_gang_issue() just walks the gang tree + * and invokes a callback on each bp. To free a gang block, zio_gang_issue() + * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp. + * zio_claim_gang() provides a similarly trivial wrapper for zio_claim(). + * zio_read_gang() is a wrapper around zio_read() that omits reading gang + * headers, since we already have those in io_gang_tree. zio_rewrite_gang() + * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite() + * of the gang header plus zio_checksum_compute() of the data to update the + * gang header's blk_cksum as described above. + * + * The two-phase assemble/issue model solves the problem of partial failure -- + * what if you'd freed part of a gang block but then couldn't read the + * gang header for another part? Assembling the entire gang tree first + * ensures that all the necessary gang header I/O has succeeded before + * starting the actual work of free, claim, or write. Once the gang tree + * is assembled, free and claim are in-memory operations that cannot fail. 
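+ *
+ * As a worked example of how a gang block is carved up (using
+ * SPA_GBH_NBLKPTRS == 3 and SPA_MINBLOCKSIZE == 512 from spa.h):
+ * a 102400-byte write that gangs is split by zio_write_gang_block()
+ * below, which sizes gang member g as
+ *
+ * lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g), SPA_MINBLOCKSIZE)
+ *
+ * yielding members of 34304, 34304 and 33792 bytes -- three roughly
+ * equal, sector-aligned chunks summing to the original 102400.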
+ * + * In the event that a gang write fails, zio_dva_unallocate() walks the + * gang tree to immediately free (i.e. insert back into the space map) + * everything we've allocated. This ensures that we don't get ENOSPC + * errors during repeated suspend/resume cycles due to a flaky device. + * + * Gang rewrites only happen during sync-to-convergence. If we can't assemble + * the gang tree, we won't modify the block, so we can safely defer the free + * (knowing that the block is still intact). If we *can* assemble the gang + * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free + * each constituent bp and we can allocate a new block on the next sync pass. + * + * In all cases, the gang tree allows complete recovery from partial failure. + * ========================================================================== + */ + +static void +zio_gang_issue_func_done(zio_t *zio) +{ + abd_put(zio->io_abd); +} + +static zio_t * +zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, + uint64_t offset) +{ + if (gn != NULL) + return (pio); + + return (zio_read(pio, pio->io_spa, bp, abd_get_offset(data, offset), + BP_GET_PSIZE(bp), zio_gang_issue_func_done, + NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), + &pio->io_bookmark)); +} + +static zio_t * +zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, + uint64_t offset) +{ + zio_t *zio; + + if (gn != NULL) { + abd_t *gbh_abd = + abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE); + zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, + gbh_abd, SPA_GANGBLOCKSIZE, zio_gang_issue_func_done, NULL, + pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), + &pio->io_bookmark); + /* + * As we rewrite each gang header, the pipeline will compute + * a new gang block header checksum for it; but no one will + * compute a new data checksum, so we do that here. The one + * exception is the gang leader: the pipeline already computed + * its data checksum because that stage precedes gang assembly. + * (Presently, nothing actually uses interior data checksums; + * this is just good hygiene.) + */ + if (gn != pio->io_gang_leader->io_gang_tree) { + abd_t *buf = abd_get_offset(data, offset); + + zio_checksum_compute(zio, BP_GET_CHECKSUM(bp), + buf, BP_GET_PSIZE(bp)); + + abd_put(buf); + } + /* + * If we are here to damage data for testing purposes, + * leave the GBH alone so that we can detect the damage. + */ + if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE) + zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; + } else { + zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, + abd_get_offset(data, offset), BP_GET_PSIZE(bp), + zio_gang_issue_func_done, NULL, pio->io_priority, + ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); + } + + return (zio); +} + +/* ARGSUSED */ +static zio_t * +zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, + uint64_t offset) +{ + return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp, + BP_IS_GANG(bp) ? 
SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp), + ZIO_GANG_CHILD_FLAGS(pio))); +} + +/* ARGSUSED */ +static zio_t * +zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, + uint64_t offset) +{ + return (zio_claim(pio, pio->io_spa, pio->io_txg, bp, + NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio))); +} + +static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = { + NULL, + zio_read_gang, + zio_rewrite_gang, + zio_free_gang, + zio_claim_gang, + NULL +}; + +static void zio_gang_tree_assemble_done(zio_t *zio); + +static zio_gang_node_t * +zio_gang_node_alloc(zio_gang_node_t **gnpp) +{ + zio_gang_node_t *gn; + + ASSERT(*gnpp == NULL); + + gn = kmem_zalloc(sizeof (*gn), KM_SLEEP); + gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE); + *gnpp = gn; + + return (gn); +} + +static void +zio_gang_node_free(zio_gang_node_t **gnpp) +{ + zio_gang_node_t *gn = *gnpp; + + for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) + ASSERT(gn->gn_child[g] == NULL); + + zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE); + kmem_free(gn, sizeof (*gn)); + *gnpp = NULL; +} + +static void +zio_gang_tree_free(zio_gang_node_t **gnpp) +{ + zio_gang_node_t *gn = *gnpp; + + if (gn == NULL) + return; + + for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) + zio_gang_tree_free(&gn->gn_child[g]); + + zio_gang_node_free(gnpp); +} + +static void +zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp) +{ + zio_gang_node_t *gn = zio_gang_node_alloc(gnpp); + abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE); + + ASSERT(gio->io_gang_leader == gio); + ASSERT(BP_IS_GANG(bp)); + + zio_nowait(zio_read(gio, gio->io_spa, bp, gbh_abd, SPA_GANGBLOCKSIZE, + zio_gang_tree_assemble_done, gn, gio->io_priority, + ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark)); +} + +static void +zio_gang_tree_assemble_done(zio_t *zio) +{ + zio_t *gio = zio->io_gang_leader; + zio_gang_node_t *gn = zio->io_private; + blkptr_t *bp = zio->io_bp; + + ASSERT(gio == zio_unique_parent(zio)); + ASSERT(zio->io_child_count == 0); + + if (zio->io_error) + return; + + /* this ABD was created from a linear buf in zio_gang_tree_assemble */ + if (BP_SHOULD_BYTESWAP(bp)) + byteswap_uint64_array(abd_to_buf(zio->io_abd), zio->io_size); + + ASSERT3P(abd_to_buf(zio->io_abd), ==, gn->gn_gbh); + ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); + ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); + + abd_put(zio->io_abd); + + for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { + blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; + if (!BP_IS_GANG(gbp)) + continue; + zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]); + } +} + +static void +zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, abd_t *data, + uint64_t offset) +{ + zio_t *gio = pio->io_gang_leader; + zio_t *zio; + + ASSERT(BP_IS_GANG(bp) == !!gn); + ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp)); + ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree); + + /* + * If you're a gang header, your data is in gn->gn_gbh. + * If you're a gang member, your data is in 'data' and gn == NULL. 
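+ * Either way, the same issue function -- selected by the gang
+ * leader's io_type from the zio_gang_issue_func[] table above --
+ * is applied to this bp.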
+ */ + zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data, offset); + + if (gn != NULL) { + ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); + + for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { + blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; + if (BP_IS_HOLE(gbp)) + continue; + zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data, + offset); + offset += BP_GET_PSIZE(gbp); + } + } + + if (gn == gio->io_gang_tree && gio->io_abd != NULL) + ASSERT3U(gio->io_size, ==, offset); + + if (zio != pio) + zio_nowait(zio); +} + +static zio_t * +zio_gang_assemble(zio_t *zio) +{ + blkptr_t *bp = zio->io_bp; + + ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL); + ASSERT(zio->io_child_type > ZIO_CHILD_GANG); + + zio->io_gang_leader = zio; + + zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree); + + return (zio); +} + +static zio_t * +zio_gang_issue(zio_t *zio) +{ + blkptr_t *bp = zio->io_bp; + + if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT, ZIO_WAIT_DONE)) { + return (NULL); + } + + ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio); + ASSERT(zio->io_child_type > ZIO_CHILD_GANG); + + if (zio->io_child_error[ZIO_CHILD_GANG] == 0) + zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_abd, + 0); + else + zio_gang_tree_free(&zio->io_gang_tree); + + zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; + + return (zio); +} + +static void +zio_write_gang_member_ready(zio_t *zio) +{ + zio_t *pio = zio_unique_parent(zio); + zio_t *gio = zio->io_gang_leader; + dva_t *cdva = zio->io_bp->blk_dva; + dva_t *pdva = pio->io_bp->blk_dva; + uint64_t asize; + + if (BP_IS_HOLE(zio->io_bp)) + return; + + ASSERT(BP_IS_HOLE(&zio->io_bp_orig)); + + ASSERT(zio->io_child_type == ZIO_CHILD_GANG); + ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies); + ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp)); + ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp)); + ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp)); + + mutex_enter(&pio->io_lock); + for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) { + ASSERT(DVA_GET_GANG(&pdva[d])); + asize = DVA_GET_ASIZE(&pdva[d]); + asize += DVA_GET_ASIZE(&cdva[d]); + DVA_SET_ASIZE(&pdva[d], asize); + } + mutex_exit(&pio->io_lock); +} + +static void +zio_write_gang_done(zio_t *zio) +{ + /* + * The io_abd field will be NULL for a zio with no data. The io_flags + * will initially have the ZIO_FLAG_NODATA bit flag set, but we can't + * check for it here as it is cleared in zio_ready. + */ + if (zio->io_abd != NULL) + abd_put(zio->io_abd); +} + +static zio_t * +zio_write_gang_block(zio_t *pio) +{ + spa_t *spa = pio->io_spa; + metaslab_class_t *mc = spa_normal_class(spa); + blkptr_t *bp = pio->io_bp; + zio_t *gio = pio->io_gang_leader; + zio_t *zio; + zio_gang_node_t *gn, **gnpp; + zio_gbh_phys_t *gbh; + abd_t *gbh_abd; + uint64_t txg = pio->io_txg; + uint64_t resid = pio->io_size; + uint64_t lsize; + int copies = gio->io_prop.zp_copies; + int gbh_copies = MIN(copies + 1, spa_max_replication(spa)); + zio_prop_t zp; + int error; + boolean_t has_data = !(pio->io_flags & ZIO_FLAG_NODATA); + + int flags = METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER; + if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { + ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); + ASSERT(has_data); + + flags |= METASLAB_ASYNC_ALLOC; + VERIFY(refcount_held(&mc->mc_alloc_slots[pio->io_allocator], + pio)); + + /* + * The logical zio has already placed a reservation for + * 'copies' allocation slots but gang blocks may require + * additional copies. These additional copies + * (i.e. 
gbh_copies - copies) are guaranteed to succeed + * since metaslab_class_throttle_reserve() always allows + * additional reservations for gang blocks. + */ + VERIFY(metaslab_class_throttle_reserve(mc, gbh_copies - copies, + pio->io_allocator, pio, flags)); + } + + error = metaslab_alloc(spa, mc, SPA_GANGBLOCKSIZE, + bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, flags, + &pio->io_alloc_list, pio, pio->io_allocator); + if (error) { + if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { + ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); + ASSERT(has_data); + + /* + * If we failed to allocate the gang block header then + * we remove any additional allocation reservations that + * we placed here. The original reservation will + * be removed when the logical I/O goes to the ready + * stage. + */ + metaslab_class_throttle_unreserve(mc, + gbh_copies - copies, pio->io_allocator, pio); + } + pio->io_error = error; + return (pio); + } + + if (pio == gio) { + gnpp = &gio->io_gang_tree; + } else { + gnpp = pio->io_private; + ASSERT(pio->io_ready == zio_write_gang_member_ready); + } + + gn = zio_gang_node_alloc(gnpp); + gbh = gn->gn_gbh; + bzero(gbh, SPA_GANGBLOCKSIZE); + gbh_abd = abd_get_from_buf(gbh, SPA_GANGBLOCKSIZE); + + /* + * Create the gang header. + */ + zio = zio_rewrite(pio, spa, txg, bp, gbh_abd, SPA_GANGBLOCKSIZE, + zio_write_gang_done, NULL, pio->io_priority, + ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); + + /* + * Create and nowait the gang children. + */ + for (int g = 0; resid != 0; resid -= lsize, g++) { + lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g), + SPA_MINBLOCKSIZE); + ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid); + + zp.zp_checksum = gio->io_prop.zp_checksum; + zp.zp_compress = ZIO_COMPRESS_OFF; + zp.zp_type = DMU_OT_NONE; + zp.zp_level = 0; + zp.zp_copies = gio->io_prop.zp_copies; + zp.zp_dedup = B_FALSE; + zp.zp_dedup_verify = B_FALSE; + zp.zp_nopwrite = B_FALSE; + + zio_t *cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g], + has_data ? abd_get_offset(pio->io_abd, pio->io_size - + resid) : NULL, lsize, lsize, &zp, + zio_write_gang_member_ready, NULL, NULL, + zio_write_gang_done, &gn->gn_child[g], pio->io_priority, + ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); + + if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { + ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); + ASSERT(has_data); + + /* + * Gang children won't throttle but we should + * account for their work, so reserve an allocation + * slot for them here. + */ + VERIFY(metaslab_class_throttle_reserve(mc, + zp.zp_copies, cio->io_allocator, cio, flags)); + } + zio_nowait(cio); + } + + /* + * Set pio's pipeline to just wait for zio to finish. + */ + pio->io_pipeline = ZIO_INTERLOCK_PIPELINE; + + zio_nowait(zio); + + return (pio); +} + +/* + * The zio_nop_write stage in the pipeline determines if allocating a + * new bp is necessary. The nopwrite feature can handle writes in + * either syncing or open context (i.e. zil writes) and as a result is + * mutually exclusive with dedup. + * + * By leveraging a cryptographically secure checksum, such as SHA256, we + * can compare the checksums of the new data and the old to determine if + * allocating a new block is required. Note that our requirements for + * cryptographic strength are fairly weak: there can't be any accidental + * hash collisions, but we don't need to be secure against intentional + * (malicious) collisions. 
To trigger a nopwrite, you have to be able + * to write the file to begin with, and triggering an incorrect (hash + * collision) nopwrite is no worse than simply writing to the file. + * That said, there are no known attacks against the checksum algorithms + * used for nopwrite, assuming that the salt and the checksums + * themselves remain secret. + */ +static zio_t * +zio_nop_write(zio_t *zio) +{ + blkptr_t *bp = zio->io_bp; + blkptr_t *bp_orig = &zio->io_bp_orig; + zio_prop_t *zp = &zio->io_prop; + + ASSERT(BP_GET_LEVEL(bp) == 0); + ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); + ASSERT(zp->zp_nopwrite); + ASSERT(!zp->zp_dedup); + ASSERT(zio->io_bp_override == NULL); + ASSERT(IO_IS_ALLOCATING(zio)); + + /* + * Check to see if the original bp and the new bp have matching + * characteristics (i.e. same checksum, compression algorithms, etc). + * If they don't then just continue with the pipeline which will + * allocate a new bp. + */ + if (BP_IS_HOLE(bp_orig) || + !(zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_flags & + ZCHECKSUM_FLAG_NOPWRITE) || + BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) || + BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) || + BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) || + zp->zp_copies != BP_GET_NDVAS(bp_orig)) + return (zio); + + /* + * If the checksums match then reset the pipeline so that we + * avoid allocating a new bp and issuing any I/O. + */ + if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) { + ASSERT(zio_checksum_table[zp->zp_checksum].ci_flags & + ZCHECKSUM_FLAG_NOPWRITE); + ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig)); + ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig)); + ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF); + ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop, + sizeof (uint64_t)) == 0); + + *bp = *bp_orig; + zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; + zio->io_flags |= ZIO_FLAG_NOPWRITE; + } + + return (zio); +} + +/* + * ========================================================================== + * Dedup + * ========================================================================== + */ +static void +zio_ddt_child_read_done(zio_t *zio) +{ + blkptr_t *bp = zio->io_bp; + ddt_entry_t *dde = zio->io_private; + ddt_phys_t *ddp; + zio_t *pio = zio_unique_parent(zio); + + mutex_enter(&pio->io_lock); + ddp = ddt_phys_select(dde, bp); + if (zio->io_error == 0) + ddt_phys_clear(ddp); /* this ddp doesn't need repair */ + + if (zio->io_error == 0 && dde->dde_repair_abd == NULL) + dde->dde_repair_abd = zio->io_abd; + else + abd_free(zio->io_abd); + mutex_exit(&pio->io_lock); +} + +static zio_t * +zio_ddt_read_start(zio_t *zio) +{ + blkptr_t *bp = zio->io_bp; + + ASSERT(BP_GET_DEDUP(bp)); + ASSERT(BP_GET_PSIZE(bp) == zio->io_size); + ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); + + if (zio->io_child_error[ZIO_CHILD_DDT]) { + ddt_t *ddt = ddt_select(zio->io_spa, bp); + ddt_entry_t *dde = ddt_repair_start(ddt, bp); + ddt_phys_t *ddp = dde->dde_phys; + ddt_phys_t *ddp_self = ddt_phys_select(dde, bp); + blkptr_t blk; + + ASSERT(zio->io_vsd == NULL); + zio->io_vsd = dde; + + if (ddp_self == NULL) + return (zio); + + for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { + if (ddp->ddp_phys_birth == 0 || ddp == ddp_self) + continue; + ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp, + &blk); + zio_nowait(zio_read(zio, zio->io_spa, &blk, + abd_alloc_for_io(zio->io_size, B_TRUE), + zio->io_size, zio_ddt_child_read_done, dde, + zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio) | + ZIO_FLAG_DONT_PROPAGATE, &zio->io_bookmark)); + } + return (zio); 
+ } + + zio_nowait(zio_read(zio, zio->io_spa, bp, + zio->io_abd, zio->io_size, NULL, NULL, zio->io_priority, + ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark)); + + return (zio); +} + +static zio_t * +zio_ddt_read_done(zio_t *zio) +{ + blkptr_t *bp = zio->io_bp; + + if (zio_wait_for_children(zio, ZIO_CHILD_DDT_BIT, ZIO_WAIT_DONE)) { + return (NULL); + } + + ASSERT(BP_GET_DEDUP(bp)); + ASSERT(BP_GET_PSIZE(bp) == zio->io_size); + ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); + + if (zio->io_child_error[ZIO_CHILD_DDT]) { + ddt_t *ddt = ddt_select(zio->io_spa, bp); + ddt_entry_t *dde = zio->io_vsd; + if (ddt == NULL) { + ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE); + return (zio); + } + if (dde == NULL) { + zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1; + zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); + return (NULL); + } + if (dde->dde_repair_abd != NULL) { + abd_copy(zio->io_abd, dde->dde_repair_abd, + zio->io_size); + zio->io_child_error[ZIO_CHILD_DDT] = 0; + } + ddt_repair_done(ddt, dde); + zio->io_vsd = NULL; + } + + ASSERT(zio->io_vsd == NULL); + + return (zio); +} + +static boolean_t +zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) +{ + spa_t *spa = zio->io_spa; + boolean_t do_raw = (zio->io_flags & ZIO_FLAG_RAW); + + /* We should never get a raw, override zio */ + ASSERT(!(zio->io_bp_override && do_raw)); + + /* + * Note: we compare the original data, not the transformed data, + * because when zio->io_bp is an override bp, we will not have + * pushed the I/O transforms. That's an important optimization + * because otherwise we'd compress/encrypt all dmu_sync() data twice. + */ + for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { + zio_t *lio = dde->dde_lead_zio[p]; + + if (lio != NULL) { + return (lio->io_orig_size != zio->io_orig_size || + abd_cmp(zio->io_orig_abd, lio->io_orig_abd, + zio->io_orig_size) != 0); + } + } + + for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { + ddt_phys_t *ddp = &dde->dde_phys[p]; + + if (ddp->ddp_phys_birth != 0) { + arc_buf_t *abuf = NULL; + arc_flags_t aflags = ARC_FLAG_WAIT; + int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE; + blkptr_t blk = *zio->io_bp; + int error; + + ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); + + ddt_exit(ddt); + + /* + * Intuitively, it would make more sense to compare + * io_abd than io_orig_abd in the raw case since you + * don't want to look at any transformations that have + * happened to the data. However, for raw I/Os the + * data will actually be the same in io_abd and + * io_orig_abd, so all we have to do is issue this as + * a raw ARC read. 
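+ * (The assertions under do_raw below check exactly that: same size,
+ * byte-identical contents, and an empty transform stack.)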
+ */ + if (do_raw) { + zio_flags |= ZIO_FLAG_RAW; + ASSERT3U(zio->io_size, ==, zio->io_orig_size); + ASSERT0(abd_cmp(zio->io_abd, zio->io_orig_abd, + zio->io_size)); + ASSERT3P(zio->io_transform_stack, ==, NULL); + } + + error = arc_read(NULL, spa, &blk, + arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ, + zio_flags, &aflags, &zio->io_bookmark); + + if (error == 0) { + if (arc_buf_size(abuf) != zio->io_orig_size || + abd_cmp_buf(zio->io_orig_abd, abuf->b_data, + zio->io_orig_size) != 0) + error = SET_ERROR(EEXIST); + arc_buf_destroy(abuf, &abuf); + } + + ddt_enter(ddt); + return (error != 0); + } + } + + return (B_FALSE); +} + +static void +zio_ddt_child_write_ready(zio_t *zio) +{ + int p = zio->io_prop.zp_copies; + ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); + ddt_entry_t *dde = zio->io_private; + ddt_phys_t *ddp = &dde->dde_phys[p]; + zio_t *pio; + + if (zio->io_error) + return; + + ddt_enter(ddt); + + ASSERT(dde->dde_lead_zio[p] == zio); + + ddt_phys_fill(ddp, zio->io_bp); + + zio_link_t *zl = NULL; + while ((pio = zio_walk_parents(zio, &zl)) != NULL) + ddt_bp_fill(ddp, pio->io_bp, zio->io_txg); + + ddt_exit(ddt); +} + +static void +zio_ddt_child_write_done(zio_t *zio) +{ + int p = zio->io_prop.zp_copies; + ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); + ddt_entry_t *dde = zio->io_private; + ddt_phys_t *ddp = &dde->dde_phys[p]; + + ddt_enter(ddt); + + ASSERT(ddp->ddp_refcnt == 0); + ASSERT(dde->dde_lead_zio[p] == zio); + dde->dde_lead_zio[p] = NULL; + + if (zio->io_error == 0) { + zio_link_t *zl = NULL; + while (zio_walk_parents(zio, &zl) != NULL) + ddt_phys_addref(ddp); + } else { + ddt_phys_clear(ddp); + } + + ddt_exit(ddt); +} + +static void +zio_ddt_ditto_write_done(zio_t *zio) +{ + int p = DDT_PHYS_DITTO; + zio_prop_t *zp = &zio->io_prop; + blkptr_t *bp = zio->io_bp; + ddt_t *ddt = ddt_select(zio->io_spa, bp); + ddt_entry_t *dde = zio->io_private; + ddt_phys_t *ddp = &dde->dde_phys[p]; + ddt_key_t *ddk = &dde->dde_key; + + ddt_enter(ddt); + + ASSERT(ddp->ddp_refcnt == 0); + ASSERT(dde->dde_lead_zio[p] == zio); + dde->dde_lead_zio[p] = NULL; + + if (zio->io_error == 0) { + ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum)); + ASSERT(zp->zp_copies < SPA_DVAS_PER_BP); + ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp)); + if (ddp->ddp_phys_birth != 0) + ddt_phys_free(ddt, ddk, ddp, zio->io_txg); + ddt_phys_fill(ddp, bp); + } + + ddt_exit(ddt); +} + +static zio_t * +zio_ddt_write(zio_t *zio) +{ + spa_t *spa = zio->io_spa; + blkptr_t *bp = zio->io_bp; + uint64_t txg = zio->io_txg; + zio_prop_t *zp = &zio->io_prop; + int p = zp->zp_copies; + int ditto_copies; + zio_t *cio = NULL; + zio_t *dio = NULL; + ddt_t *ddt = ddt_select(spa, bp); + ddt_entry_t *dde; + ddt_phys_t *ddp; + + ASSERT(BP_GET_DEDUP(bp)); + ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum); + ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override); + ASSERT(!(zio->io_bp_override && (zio->io_flags & ZIO_FLAG_RAW))); + + ddt_enter(ddt); + dde = ddt_lookup(ddt, bp, B_TRUE); + ddp = &dde->dde_phys[p]; + + if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) { + /* + * If we're using a weak checksum, upgrade to a strong checksum + * and try again. If we're already using a strong checksum, + * we can't resolve it, so just convert to an ordinary write. + * (And automatically e-mail a paper to Nature?) 
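+ * For example, a block written with fletcher4 (which lacks
+ * ZCHECKSUM_FLAG_DEDUP in the checksum table) restarts from
+ * ZIO_STAGE_OPEN with the pool's dedup checksum, typically sha256;
+ * one already written with sha256 falls back to an ordinary,
+ * non-dedup write.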
+ */ + if (!(zio_checksum_table[zp->zp_checksum].ci_flags & + ZCHECKSUM_FLAG_DEDUP)) { + zp->zp_checksum = spa_dedup_checksum(spa); + zio_pop_transforms(zio); + zio->io_stage = ZIO_STAGE_OPEN; + BP_ZERO(bp); + } else { + zp->zp_dedup = B_FALSE; + BP_SET_DEDUP(bp, B_FALSE); + } + ASSERT(!BP_GET_DEDUP(bp)); + zio->io_pipeline = ZIO_WRITE_PIPELINE; + ddt_exit(ddt); + return (zio); + } + + ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp); + ASSERT(ditto_copies < SPA_DVAS_PER_BP); + + if (ditto_copies > ddt_ditto_copies_present(dde) && + dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) { + zio_prop_t czp = *zp; + + czp.zp_copies = ditto_copies; + + /* + * If we arrived here with an override bp, we won't have run + * the transform stack, so we won't have the data we need to + * generate a child i/o. So, toss the override bp and restart. + * This is safe, because using the override bp is just an + * optimization; and it's rare, so the cost doesn't matter. + */ + if (zio->io_bp_override) { + zio_pop_transforms(zio); + zio->io_stage = ZIO_STAGE_OPEN; + zio->io_pipeline = ZIO_WRITE_PIPELINE; + zio->io_bp_override = NULL; + BP_ZERO(bp); + ddt_exit(ddt); + return (zio); + } + + dio = zio_write(zio, spa, txg, bp, zio->io_orig_abd, + zio->io_orig_size, zio->io_orig_size, &czp, NULL, NULL, + NULL, zio_ddt_ditto_write_done, dde, zio->io_priority, + ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); + + zio_push_transform(dio, zio->io_abd, zio->io_size, 0, NULL); + dde->dde_lead_zio[DDT_PHYS_DITTO] = dio; + } + + if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) { + if (ddp->ddp_phys_birth != 0) + ddt_bp_fill(ddp, bp, txg); + if (dde->dde_lead_zio[p] != NULL) + zio_add_child(zio, dde->dde_lead_zio[p]); + else + ddt_phys_addref(ddp); + } else if (zio->io_bp_override) { + ASSERT(bp->blk_birth == txg); + ASSERT(BP_EQUAL(bp, zio->io_bp_override)); + ddt_phys_fill(ddp, bp); + ddt_phys_addref(ddp); + } else { + cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd, + zio->io_orig_size, zio->io_orig_size, zp, + zio_ddt_child_write_ready, NULL, NULL, + zio_ddt_child_write_done, dde, zio->io_priority, + ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); + + zio_push_transform(cio, zio->io_abd, zio->io_size, 0, NULL); + dde->dde_lead_zio[p] = cio; + } + + ddt_exit(ddt); + + if (cio) + zio_nowait(cio); + if (dio) + zio_nowait(dio); + + return (zio); +} + +ddt_entry_t *freedde; /* for debugging */ + +static zio_t * +zio_ddt_free(zio_t *zio) +{ + spa_t *spa = zio->io_spa; + blkptr_t *bp = zio->io_bp; + ddt_t *ddt = ddt_select(spa, bp); + ddt_entry_t *dde; + ddt_phys_t *ddp; + + ASSERT(BP_GET_DEDUP(bp)); + ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); + + ddt_enter(ddt); + freedde = dde = ddt_lookup(ddt, bp, B_TRUE); + ddp = ddt_phys_select(dde, bp); + ddt_phys_decref(ddp); + ddt_exit(ddt); + + return (zio); +} + +/* + * ========================================================================== + * Allocate and free blocks + * ========================================================================== + */ + +static zio_t * +zio_io_to_allocate(spa_t *spa, int allocator) +{ + zio_t *zio; + + ASSERT(MUTEX_HELD(&spa->spa_alloc_locks[allocator])); + + zio = avl_first(&spa->spa_alloc_trees[allocator]); + if (zio == NULL) + return (NULL); + + ASSERT(IO_IS_ALLOCATING(zio)); + + /* + * Try to place a reservation for this zio. If we're unable to + * reserve then we throttle. 
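+ * "Throttle" here just means returning NULL and leaving the zio on
+ * the allocator's AVL tree; a later zio_allocate_dispatch() call
+ * will pull it off once another zio unreserves a slot.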
+ */ + ASSERT3U(zio->io_allocator, ==, allocator); + if (!metaslab_class_throttle_reserve(spa_normal_class(spa), + zio->io_prop.zp_copies, zio->io_allocator, zio, 0)) { + return (NULL); + } + + avl_remove(&spa->spa_alloc_trees[allocator], zio); + ASSERT3U(zio->io_stage, <, ZIO_STAGE_DVA_ALLOCATE); + + return (zio); +} + +static zio_t * +zio_dva_throttle(zio_t *zio) +{ + spa_t *spa = zio->io_spa; + zio_t *nio; + + if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE || + !spa_normal_class(zio->io_spa)->mc_alloc_throttle_enabled || + zio->io_child_type == ZIO_CHILD_GANG || + zio->io_flags & ZIO_FLAG_NODATA) { + return (zio); + } + + ASSERT(zio->io_child_type > ZIO_CHILD_GANG); + + ASSERT3U(zio->io_queued_timestamp, >, 0); + ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE); + + zbookmark_phys_t *bm = &zio->io_bookmark; + /* + * We want to try to use as many allocators as possible to help improve + * performance, but we also want logically adjacent IOs to be physically + * adjacent to improve sequential read performance. We chunk each object + * into 2^20 block regions, and then hash based on the objset, object, + * level, and region to accomplish both of these goals. + */ + zio->io_allocator = cityhash4(bm->zb_objset, bm->zb_object, + bm->zb_level, bm->zb_blkid >> 20) % spa->spa_alloc_count; + mutex_enter(&spa->spa_alloc_locks[zio->io_allocator]); + + ASSERT(zio->io_type == ZIO_TYPE_WRITE); + avl_add(&spa->spa_alloc_trees[zio->io_allocator], zio); + + nio = zio_io_to_allocate(zio->io_spa, zio->io_allocator); + mutex_exit(&spa->spa_alloc_locks[zio->io_allocator]); + + return (nio); +} + +void +zio_allocate_dispatch(spa_t *spa, int allocator) +{ + zio_t *zio; + + mutex_enter(&spa->spa_alloc_locks[allocator]); + zio = zio_io_to_allocate(spa, allocator); + mutex_exit(&spa->spa_alloc_locks[allocator]); + if (zio == NULL) + return; + + ASSERT3U(zio->io_stage, ==, ZIO_STAGE_DVA_THROTTLE); + ASSERT0(zio->io_error); + zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_TRUE); +} + +static zio_t * +zio_dva_allocate(zio_t *zio) +{ + spa_t *spa = zio->io_spa; + metaslab_class_t *mc = spa_normal_class(spa); + blkptr_t *bp = zio->io_bp; + int error; + int flags = 0; + + if (zio->io_gang_leader == NULL) { + ASSERT(zio->io_child_type > ZIO_CHILD_GANG); + zio->io_gang_leader = zio; + } + + ASSERT(BP_IS_HOLE(bp)); + ASSERT0(BP_GET_NDVAS(bp)); + ASSERT3U(zio->io_prop.zp_copies, >, 0); + ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa)); + ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); + + if (zio->io_flags & ZIO_FLAG_NODATA) { + flags |= METASLAB_DONT_THROTTLE; + } + if (zio->io_flags & ZIO_FLAG_GANG_CHILD) { + flags |= METASLAB_GANG_CHILD; + } + if (zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE) { + flags |= METASLAB_ASYNC_ALLOC; + } + + error = metaslab_alloc(spa, mc, zio->io_size, bp, + zio->io_prop.zp_copies, zio->io_txg, NULL, flags, + &zio->io_alloc_list, zio, zio->io_allocator); + + if (error != 0) { + zfs_dbgmsg("%s: metaslab allocation failure: zio %p, " + "size %llu, error %d", spa_name(spa), zio, zio->io_size, + error); + if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) + return (zio_write_gang_block(zio)); + zio->io_error = error; + } + + return (zio); +} + +static zio_t * +zio_dva_free(zio_t *zio) +{ + metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE); + + return (zio); +} + +static zio_t * +zio_dva_claim(zio_t *zio) +{ + int error; + + error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); + if (error) + zio->io_error = error; + + return (zio); +} + +/* + * Undo an allocation. 
This is used by zio_done() when an I/O fails + * and we want to give back the block we just allocated. + * This handles both normal blocks and gang blocks. + */ +static void +zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) +{ + ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); + ASSERT(zio->io_bp_override == NULL); + + if (!BP_IS_HOLE(bp)) + metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE); + + if (gn != NULL) { + for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { + zio_dva_unallocate(zio, gn->gn_child[g], + &gn->gn_gbh->zg_blkptr[g]); + } + } +} + +/* + * Try to allocate an intent log block. Return 0 on success, errno on failure. + */ +int +zio_alloc_zil(spa_t *spa, uint64_t objset, uint64_t txg, blkptr_t *new_bp, + blkptr_t *old_bp, uint64_t size, boolean_t *slog) +{ + int error = 1; + zio_alloc_list_t io_alloc_list; + + ASSERT(txg > spa_syncing_txg(spa)); + + metaslab_trace_init(&io_alloc_list); + /* + * When allocating a zil block, we don't have information about + * the final destination of the block except the objset it's part + * of, so we just hash the objset ID to pick the allocator to get + * some parallelism. + */ + error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1, + txg, old_bp, METASLAB_HINTBP_AVOID, &io_alloc_list, NULL, + cityhash4(0, 0, 0, objset) % spa->spa_alloc_count); + if (error == 0) { + *slog = TRUE; + } else { + error = metaslab_alloc(spa, spa_normal_class(spa), size, + new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID, + &io_alloc_list, NULL, cityhash4(0, 0, 0, objset) % + spa->spa_alloc_count); + if (error == 0) + *slog = FALSE; + } + metaslab_trace_fini(&io_alloc_list); + + if (error == 0) { + BP_SET_LSIZE(new_bp, size); + BP_SET_PSIZE(new_bp, size); + BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF); + BP_SET_CHECKSUM(new_bp, + spa_version(spa) >= SPA_VERSION_SLIM_ZIL + ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG); + BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); + BP_SET_LEVEL(new_bp, 0); + BP_SET_DEDUP(new_bp, 0); + BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER); + } else { + zfs_dbgmsg("%s: zil block allocation failure: " + "size %llu, error %d", spa_name(spa), size, error); + } + + return (error); +} + +/* + * ========================================================================== + * Read, write and delete to physical devices + * ========================================================================== + */ + + +/* + * Issue an I/O to the underlying vdev. Typically the issue pipeline + * stops after this stage and will resume upon I/O completion. + * However, there are instances where the vdev layer may need to + * continue the pipeline when an I/O was not issued. Since the I/O + * that was sent to the vdev layer might be different than the one + * currently active in the pipeline (see vdev_queue_io()), we explicitly + * force the underlying vdev layers to call either zio_execute() or + * zio_interrupt() to ensure that the pipeline continues with the correct I/O. + */ +static zio_t * +zio_vdev_io_start(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + uint64_t align; + spa_t *spa = zio->io_spa; + int ret; + + ASSERT(zio->io_error == 0); + ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0); + + if (vd == NULL) { + if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) + spa_config_enter(spa, SCL_ZIO, zio, RW_READER); + + /* + * The mirror_ops handle multiple DVAs in a single BP. 
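+ * For example, a logical read of a bp with copies=2 has no single
+ * io_vd; vdev_mirror_ops treats the two DVAs like the legs of a
+ * mirror, reading from one of them (and, for writes, fanning out
+ * to all).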
+ */ + vdev_mirror_ops.vdev_op_io_start(zio); + return (NULL); + } + + if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_FREE && + zio->io_priority == ZIO_PRIORITY_NOW) { + trim_map_free(vd, zio->io_offset, zio->io_size, zio->io_txg); + return (zio); + } + + ASSERT3P(zio->io_logical, !=, zio); + if (zio->io_type == ZIO_TYPE_WRITE) { + ASSERT(spa->spa_trust_config); + + if (zio->io_vd->vdev_removing) { + /* + * Note: the code can handle other kinds of writes, + * but we don't expect them. + */ + ASSERT(zio->io_flags & + (ZIO_FLAG_PHYSICAL | ZIO_FLAG_SELF_HEAL | + ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE)); + } + } + + /* + * We keep track of time-sensitive I/Os so that the scan thread + * can quickly react to certain workloads. In particular, we care + * about non-scrubbing, top-level reads and writes with the following + * characteristics: + * - synchronous writes of user data to non-slog devices + * - any reads of user data + * When these conditions are met, adjust the timestamp of spa_last_io + * which allows the scan thread to adjust its workload accordingly. + */ + if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL && + vd == vd->vdev_top && !vd->vdev_islog && + zio->io_bookmark.zb_objset != DMU_META_OBJSET && + zio->io_txg != spa_syncing_txg(spa)) { + uint64_t old = spa->spa_last_io; + uint64_t new = ddi_get_lbolt64(); + if (old != new) + (void) atomic_cas_64(&spa->spa_last_io, old, new); + } + align = 1ULL << vd->vdev_top->vdev_ashift; + + if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && + P2PHASE(zio->io_size, align) != 0) { + /* Transform logical writes to be a full physical block size. */ + uint64_t asize = P2ROUNDUP(zio->io_size, align); + abd_t *abuf = NULL; + if (zio->io_type == ZIO_TYPE_READ || + zio->io_type == ZIO_TYPE_WRITE) + abuf = abd_alloc_sametype(zio->io_abd, asize); + ASSERT(vd == vd->vdev_top); + if (zio->io_type == ZIO_TYPE_WRITE) { + abd_copy(abuf, zio->io_abd, zio->io_size); + abd_zero_off(abuf, zio->io_size, asize - zio->io_size); + } + zio_push_transform(zio, abuf, asize, abuf ? asize : 0, + zio_subblock); + } + + /* + * If this is not a physical io, make sure that it is properly aligned + * before proceeding. + */ + if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) { + ASSERT0(P2PHASE(zio->io_offset, align)); + ASSERT0(P2PHASE(zio->io_size, align)); + } else { + /* + * For the physical io we allow alignment + * to a logical block size. + */ + uint64_t log_align = + 1ULL << vd->vdev_top->vdev_logical_ashift; + ASSERT0(P2PHASE(zio->io_offset, log_align)); + ASSERT0(P2PHASE(zio->io_size, log_align)); + } + + VERIFY(zio->io_type == ZIO_TYPE_READ || spa_writeable(spa)); + + /* + * If this is a repair I/O, and there's no self-healing involved -- + * that is, we're just resilvering what we expect to resilver -- + * then don't do the I/O unless zio's txg is actually in vd's DTL. + * This prevents spurious resilvering. + * + * There are a few ways that we can end up creating these spurious + * resilver i/os: + * + * 1. A resilver i/o will be issued if any DVA in the BP has a + * dirty DTL. The mirror code will issue resilver writes to + * each DVA, including the one(s) that are not on vdevs with dirty + * DTLs. + * + * 2. With nested replication, which happens when we have a + * "replacing" or "spare" vdev that's a child of a mirror or raidz. + * For example, given mirror(replacing(A+B), C), it's likely that + * only A is out of date (it's the new device). 
In this case, we'll + * read from C, then use the data to resilver A+B -- but we don't + * actually want to resilver B, just A. The top-level mirror has no + * way to know this, so instead we just discard unnecessary repairs + * as we work our way down the vdev tree. + * + * 3. ZTEST also creates mirrors of mirrors, mirrors of raidz, etc. + * The same logic applies to any form of nested replication: ditto + * + mirror, RAID-Z + replacing, etc. + * + * However, indirect vdevs point off to other vdevs which may have + * DTL's, so we never bypass them. The child i/os on concrete vdevs + * will be properly bypassed instead. + */ + if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) && + !(zio->io_flags & ZIO_FLAG_SELF_HEAL) && + zio->io_txg != 0 && /* not a delegated i/o */ + vd->vdev_ops != &vdev_indirect_ops && + !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) { + ASSERT(zio->io_type == ZIO_TYPE_WRITE); + zio_vdev_io_bypass(zio); + return (zio); + } + + if (vd->vdev_ops->vdev_op_leaf) { + switch (zio->io_type) { + case ZIO_TYPE_READ: + if (vdev_cache_read(zio)) + return (zio); + /* FALLTHROUGH */ + case ZIO_TYPE_WRITE: + case ZIO_TYPE_FREE: + if ((zio = vdev_queue_io(zio)) == NULL) + return (NULL); + + if (!vdev_accessible(vd, zio)) { + zio->io_error = SET_ERROR(ENXIO); + zio_interrupt(zio); + return (NULL); + } + break; + } + /* + * Note that we ignore repair writes for TRIM because they can + * conflict with normal writes. This isn't an issue because, by + * definition, we only repair blocks that aren't freed. + */ + if (zio->io_type == ZIO_TYPE_WRITE && + !(zio->io_flags & ZIO_FLAG_IO_REPAIR) && + !trim_map_write_start(zio)) + return (NULL); + } + + vd->vdev_ops->vdev_op_io_start(zio); + return (NULL); +} + +static zio_t * +zio_vdev_io_done(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops; + boolean_t unexpected_error = B_FALSE; + + if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) { + return (NULL); + } + + ASSERT(zio->io_type == ZIO_TYPE_READ || + zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_FREE); + + if (vd != NULL && vd->vdev_ops->vdev_op_leaf && + (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE || + zio->io_type == ZIO_TYPE_FREE)) { + + if (zio->io_type == ZIO_TYPE_WRITE && + !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) + trim_map_write_done(zio); + + vdev_queue_io_done(zio); + + if (zio->io_type == ZIO_TYPE_WRITE) + vdev_cache_write(zio); + + if (zio_injection_enabled && zio->io_error == 0) + zio->io_error = zio_handle_device_injection(vd, + zio, EIO); + + if (zio_injection_enabled && zio->io_error == 0) + zio->io_error = zio_handle_label_injection(zio, EIO); + + if (zio->io_error) { + if (zio->io_error == ENOTSUP && + zio->io_type == ZIO_TYPE_FREE) { + /* Not all devices support TRIM. */ + } else if (!vdev_accessible(vd, zio)) { + zio->io_error = SET_ERROR(ENXIO); + } else { + unexpected_error = B_TRUE; + } + } + } + + ops->vdev_op_io_done(zio); + + if (unexpected_error) + VERIFY(vdev_probe(vd, zio) == NULL); + + return (zio); +} + +/* + * This function is used to change the priority of an existing zio that is + * currently in-flight. This is used by the arc to upgrade priority in the + * event that a demand read is made for a block that is currently queued + * as a scrub or async read IO. Otherwise, the high priority read request + * would end up having to wait for the lower priority IO. 
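+ * For example, a demand read that finds its block queued as a
+ * ZIO_PRIORITY_SCRUB read can promote it to ZIO_PRIORITY_SYNC_READ;
+ * the promotion below recurses through the children so that
+ * in-flight leaf i/os move to the faster vdev queue as well.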
+ */ +void +zio_change_priority(zio_t *pio, zio_priority_t priority) +{ + zio_t *cio, *cio_next; + zio_link_t *zl = NULL; + + ASSERT3U(priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); + + if (pio->io_vd != NULL && pio->io_vd->vdev_ops->vdev_op_leaf) { + vdev_queue_change_io_priority(pio, priority); + } else { + pio->io_priority = priority; + } + + mutex_enter(&pio->io_lock); + for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) { + cio_next = zio_walk_children(pio, &zl); + zio_change_priority(cio, priority); + } + mutex_exit(&pio->io_lock); +} + +/* + * For non-raidz ZIOs, we can just copy aside the bad data read from the + * disk, and use that to finish the checksum ereport later. + */ +static void +zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr, + const void *good_buf) +{ + /* no processing needed */ + zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE); +} + +/*ARGSUSED*/ +void +zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored) +{ + void *buf = zio_buf_alloc(zio->io_size); + + abd_copy_to_buf(buf, zio->io_abd, zio->io_size); + + zcr->zcr_cbinfo = zio->io_size; + zcr->zcr_cbdata = buf; + zcr->zcr_finish = zio_vsd_default_cksum_finish; + zcr->zcr_free = zio_buf_free; +} + +static zio_t * +zio_vdev_io_assess(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + + if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) { + return (NULL); + } + + if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) + spa_config_exit(zio->io_spa, SCL_ZIO, zio); + + if (zio->io_vsd != NULL) { + zio->io_vsd_ops->vsd_free(zio); + zio->io_vsd = NULL; + } + + if (zio_injection_enabled && zio->io_error == 0) + zio->io_error = zio_handle_fault_injection(zio, EIO); + + if (zio->io_type == ZIO_TYPE_FREE && + zio->io_priority != ZIO_PRIORITY_NOW) { + switch (zio->io_error) { + case 0: + ZIO_TRIM_STAT_INCR(bytes, zio->io_size); + ZIO_TRIM_STAT_BUMP(success); + break; + case EOPNOTSUPP: + ZIO_TRIM_STAT_BUMP(unsupported); + break; + default: + ZIO_TRIM_STAT_BUMP(failed); + break; + } + } + + /* + * If the I/O failed, determine whether we should attempt to retry it. + * + * On retry, we cut in line in the issue queue, since we don't want + * compression/checksumming/etc. work to prevent our (cheap) IO reissue. + */ + if (zio->io_error && vd == NULL && + !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) { + ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */ + ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */ + zio->io_error = 0; + zio->io_flags |= ZIO_FLAG_IO_RETRY | + ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE; + zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1; + zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, + zio_requeue_io_start_cut_in_line); + return (NULL); + } + + /* + * If we got an error on a leaf device, convert it to ENXIO + * if the device is not accessible at all. + */ + if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf && + !vdev_accessible(vd, zio)) + zio->io_error = SET_ERROR(ENXIO); + + /* + * If we can't write to an interior vdev (mirror or RAID-Z), + * set vdev_cant_write so that we stop trying to allocate from it. + */ + if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE && + vd != NULL && !vd->vdev_ops->vdev_op_leaf) { + vd->vdev_cant_write = B_TRUE; + } + + /* + * If a cache flush returns ENOTSUP or ENOTTY, we know that no future + * attempts will ever succeed. In this case we set a persistent bit so + * that we don't bother with it in the future. 
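+ * ("Persistent" meaning for the life of this vdev_t only:
+ * vdev_nowritecache is in-core state, not written to the label, so
+ * a reopened or reimported pool will probe the cache flush again.)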
+ */
+ if ((zio->io_error == ENOTSUP || zio->io_error == ENOTTY) &&
+ zio->io_type == ZIO_TYPE_IOCTL &&
+ zio->io_cmd == DKIOCFLUSHWRITECACHE && vd != NULL)
+ vd->vdev_nowritecache = B_TRUE;
+
+ if (zio->io_error)
+ zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+
+ if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
+ zio->io_physdone != NULL) {
+ ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED));
+ ASSERT(zio->io_child_type == ZIO_CHILD_VDEV);
+ zio->io_physdone(zio->io_logical);
+ }
+
+ return (zio);
+}
+
+void
+zio_vdev_io_reissue(zio_t *zio)
+{
+ ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
+ ASSERT(zio->io_error == 0);
+
+ zio->io_stage >>= 1;
+}
+
+void
+zio_vdev_io_redone(zio_t *zio)
+{
+ ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);
+
+ zio->io_stage >>= 1;
+}
+
+void
+zio_vdev_io_bypass(zio_t *zio)
+{
+ ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
+ ASSERT(zio->io_error == 0);
+
+ zio->io_flags |= ZIO_FLAG_IO_BYPASS;
+ zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1;
+}
+
+/*
+ * ==========================================================================
+ * Generate and verify checksums
+ * ==========================================================================
+ */
+static zio_t *
+zio_checksum_generate(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+ enum zio_checksum checksum;
+
+ if (bp == NULL) {
+ /*
+ * This is zio_write_phys().
+ * We're either generating a label checksum, or none at all.
+ */
+ checksum = zio->io_prop.zp_checksum;
+
+ if (checksum == ZIO_CHECKSUM_OFF)
+ return (zio);
+
+ ASSERT(checksum == ZIO_CHECKSUM_LABEL);
+ } else {
+ if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) {
+ ASSERT(!IO_IS_ALLOCATING(zio));
+ checksum = ZIO_CHECKSUM_GANG_HEADER;
+ } else {
+ checksum = BP_GET_CHECKSUM(bp);
+ }
+ }
+
+ zio_checksum_compute(zio, checksum, zio->io_abd, zio->io_size);
+
+ return (zio);
+}
+
+static zio_t *
+zio_checksum_verify(zio_t *zio)
+{
+ zio_bad_cksum_t info;
+ blkptr_t *bp = zio->io_bp;
+ int error;
+
+ ASSERT(zio->io_vd != NULL);
+
+ if (bp == NULL) {
+ /*
+ * This is zio_read_phys().
+ * We're either verifying a label checksum, or nothing at all.
+ */
+ if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF)
+ return (zio);
+
+ ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL);
+ }
+
+ if ((error = zio_checksum_error(zio, &info)) != 0) {
+ zio->io_error = error;
+ if (error == ECKSUM &&
+ !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
+ zfs_ereport_start_checksum(zio->io_spa,
+ zio->io_vd, zio, zio->io_offset,
+ zio->io_size, NULL, &info);
+ }
+ }
+
+ return (zio);
+}
+
+/*
+ * Called by RAID-Z to ensure we don't compute the checksum twice.
+ */
+void
+zio_checksum_verified(zio_t *zio)
+{
+ zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
+}
+
+/*
+ * ==========================================================================
+ * Error rank. Errors are ranked in the order 0, ENXIO, ECKSUM, EIO, other.
+ * An error of 0 indicates success. ENXIO indicates whole-device failure,
+ * which may be transient (e.g. unplugged) or permanent. ECKSUM and EIO
+ * indicate errors that are specific to one I/O, and most likely permanent.
+ * Any other error is presumed to be worse because we weren't expecting it.
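+ *
+ * E.g. zio_worst_error(ENXIO, ECKSUM) ranks ECKSUM as worse, and an
+ * unlisted errno such as EINVAL runs off the end of the rank table
+ * below and is treated as worst of all.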
+ * ========================================================================== + */ +int +zio_worst_error(int e1, int e2) +{ + static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO }; + int r1, r2; + + for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++) + if (e1 == zio_error_rank[r1]) + break; + + for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++) + if (e2 == zio_error_rank[r2]) + break; + + return (r1 > r2 ? e1 : e2); +} + +/* + * ========================================================================== + * I/O completion + * ========================================================================== + */ +static zio_t * +zio_ready(zio_t *zio) +{ + blkptr_t *bp = zio->io_bp; + zio_t *pio, *pio_next; + zio_link_t *zl = NULL; + + if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT | ZIO_CHILD_DDT_BIT, + ZIO_WAIT_READY)) { + return (NULL); + } + + if (zio->io_ready) { + ASSERT(IO_IS_ALLOCATING(zio)); + ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) || + (zio->io_flags & ZIO_FLAG_NOPWRITE)); + ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0); + + zio->io_ready(zio); + } + + if (bp != NULL && bp != &zio->io_bp_copy) + zio->io_bp_copy = *bp; + + if (zio->io_error != 0) { + zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; + + if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) { + ASSERT(IO_IS_ALLOCATING(zio)); + ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); + /* + * We were unable to allocate anything, unreserve and + * issue the next I/O to allocate. + */ + metaslab_class_throttle_unreserve( + spa_normal_class(zio->io_spa), + zio->io_prop.zp_copies, zio->io_allocator, zio); + zio_allocate_dispatch(zio->io_spa, zio->io_allocator); + } + } + + mutex_enter(&zio->io_lock); + zio->io_state[ZIO_WAIT_READY] = 1; + pio = zio_walk_parents(zio, &zl); + mutex_exit(&zio->io_lock); + + /* + * As we notify zio's parents, new parents could be added. + * New parents go to the head of zio's io_parent_list, however, + * so we will (correctly) not notify them. The remainder of zio's + * io_parent_list, from 'pio_next' onward, cannot change because + * all parents must wait for us to be done before they can be done. + */ + for (; pio != NULL; pio = pio_next) { + pio_next = zio_walk_parents(zio, &zl); + zio_notify_parent(pio, zio, ZIO_WAIT_READY, NULL); + } + + if (zio->io_flags & ZIO_FLAG_NODATA) { + if (BP_IS_GANG(bp)) { + zio->io_flags &= ~ZIO_FLAG_NODATA; + } else { + ASSERT((uintptr_t)zio->io_abd < SPA_MAXBLOCKSIZE); + zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; + } + } + + if (zio_injection_enabled && + zio->io_spa->spa_syncing_txg == zio->io_txg) + zio_handle_ignored_writes(zio); + + return (zio); +} + +/* + * Update the allocation throttle accounting. 
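+ * This runs once per allocating vdev-child write: it decrements the
+ * allocator's queue depth for this vdev, gives back one of the
+ * allocating parent's reserved slots, and then calls
+ * zio_allocate_dispatch() to admit the next throttled zio.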
+ */ +static void +zio_dva_throttle_done(zio_t *zio) +{ + zio_t *lio = zio->io_logical; + zio_t *pio = zio_unique_parent(zio); + vdev_t *vd = zio->io_vd; + int flags = METASLAB_ASYNC_ALLOC; + + ASSERT3P(zio->io_bp, !=, NULL); + ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); + ASSERT3U(zio->io_priority, ==, ZIO_PRIORITY_ASYNC_WRITE); + ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV); + ASSERT(vd != NULL); + ASSERT3P(vd, ==, vd->vdev_top); + ASSERT(!(zio->io_flags & (ZIO_FLAG_IO_REPAIR | ZIO_FLAG_IO_RETRY))); + ASSERT(zio->io_flags & ZIO_FLAG_IO_ALLOCATING); + ASSERT(!(lio->io_flags & ZIO_FLAG_IO_REWRITE)); + ASSERT(!(lio->io_orig_flags & ZIO_FLAG_NODATA)); + + /* + * Parents of gang children can have two flavors -- ones that + * allocated the gang header (will have ZIO_FLAG_IO_REWRITE set) + * and ones that allocated the constituent blocks. The allocation + * throttle needs to know the allocating parent zio so we must find + * it here. + */ + if (pio->io_child_type == ZIO_CHILD_GANG) { + /* + * If our parent is a rewrite gang child then our grandparent + * would have been the one that performed the allocation. + */ + if (pio->io_flags & ZIO_FLAG_IO_REWRITE) + pio = zio_unique_parent(pio); + flags |= METASLAB_GANG_CHILD; + } + + ASSERT(IO_IS_ALLOCATING(pio)); + ASSERT3P(zio, !=, zio->io_logical); + ASSERT(zio->io_logical != NULL); + ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR)); + ASSERT0(zio->io_flags & ZIO_FLAG_NOPWRITE); + + mutex_enter(&pio->io_lock); + metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio, flags, + pio->io_allocator, B_TRUE); + mutex_exit(&pio->io_lock); + + metaslab_class_throttle_unreserve(spa_normal_class(zio->io_spa), + 1, pio->io_allocator, pio); + + /* + * Call into the pipeline to see if there is more work that + * needs to be done. If there is work to be done it will be + * dispatched to another taskq thread. + */ + zio_allocate_dispatch(zio->io_spa, pio->io_allocator); +} + +static zio_t * +zio_done(zio_t *zio) +{ + spa_t *spa = zio->io_spa; + zio_t *lio = zio->io_logical; + blkptr_t *bp = zio->io_bp; + vdev_t *vd = zio->io_vd; + uint64_t psize = zio->io_size; + zio_t *pio, *pio_next; + metaslab_class_t *mc = spa_normal_class(spa); + zio_link_t *zl = NULL; + + /* + * If our children haven't all completed, + * wait for them and then repeat this pipeline stage. + */ + if (zio_wait_for_children(zio, ZIO_CHILD_ALL_BITS, ZIO_WAIT_DONE)) { + return (NULL); + } + + /* + * If the allocation throttle is enabled, then update the accounting. + * We only track child I/Os that are part of an allocating async + * write. We must do this since the allocation is performed + * by the logical I/O but the actual write is done by child I/Os. + */ + if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING && + zio->io_child_type == ZIO_CHILD_VDEV) { + ASSERT(mc->mc_alloc_throttle_enabled); + zio_dva_throttle_done(zio); + } + + /* + * If the allocation throttle is enabled, verify that + * we have decremented the refcounts for every I/O that was throttled. 
+ */ + if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) { + ASSERT(zio->io_type == ZIO_TYPE_WRITE); + ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); + ASSERT(bp != NULL); + metaslab_group_alloc_verify(spa, zio->io_bp, zio, + zio->io_allocator); + VERIFY(refcount_not_held(&mc->mc_alloc_slots[zio->io_allocator], + zio)); + } + + for (int c = 0; c < ZIO_CHILD_TYPES; c++) + for (int w = 0; w < ZIO_WAIT_TYPES; w++) + ASSERT(zio->io_children[c][w] == 0); + + if (bp != NULL && !BP_IS_EMBEDDED(bp)) { + ASSERT(bp->blk_pad[0] == 0); + ASSERT(bp->blk_pad[1] == 0); + ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 || + (bp == zio_unique_parent(zio)->io_bp)); + if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) && + zio->io_bp_override == NULL && + !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { + ASSERT(!BP_SHOULD_BYTESWAP(bp)); + ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(bp)); + ASSERT(BP_COUNT_GANG(bp) == 0 || + (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp))); + } + if (zio->io_flags & ZIO_FLAG_NOPWRITE) + VERIFY(BP_EQUAL(bp, &zio->io_bp_orig)); + } + + /* + * If there were child vdev/gang/ddt errors, they apply to us now. + */ + zio_inherit_child_errors(zio, ZIO_CHILD_VDEV); + zio_inherit_child_errors(zio, ZIO_CHILD_GANG); + zio_inherit_child_errors(zio, ZIO_CHILD_DDT); + + /* + * If the I/O on the transformed data was successful, generate any + * checksum reports now while we still have the transformed data. + */ + if (zio->io_error == 0) { + while (zio->io_cksum_report != NULL) { + zio_cksum_report_t *zcr = zio->io_cksum_report; + uint64_t align = zcr->zcr_align; + uint64_t asize = P2ROUNDUP(psize, align); + char *abuf = NULL; + abd_t *adata = zio->io_abd; + + if (asize != psize) { + adata = abd_alloc_linear(asize, B_TRUE); + abd_copy(adata, zio->io_abd, psize); + abd_zero_off(adata, psize, asize - psize); + } + + if (adata != NULL) + abuf = abd_borrow_buf_copy(adata, asize); + + zio->io_cksum_report = zcr->zcr_next; + zcr->zcr_next = NULL; + zcr->zcr_finish(zcr, abuf); + zfs_ereport_free_checksum(zcr); + + if (adata != NULL) + abd_return_buf(adata, abuf, asize); + + if (asize != psize) + abd_free(adata); + } + } + + zio_pop_transforms(zio); /* note: may set zio->io_error */ + + vdev_stat_update(zio, psize); + + if (zio->io_error) { + /* + * If this I/O is attached to a particular vdev, + * generate an error message describing the I/O failure + * at the block level. We ignore these errors if the + * device is currently unavailable. + */ + if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd)) + zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0); + + if ((zio->io_error == EIO || !(zio->io_flags & + (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) && + zio == lio) { + /* + * For logical I/O requests, tell the SPA to log the + * error and generate a logical data ereport. + */ + spa_log_error(spa, zio); + zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio, + 0, 0); + } + } + + if (zio->io_error && zio == lio) { + /* + * Determine whether zio should be reexecuted. This will + * propagate all the way to the root via zio_notify_parent(). 
+ */ + ASSERT(vd == NULL && bp != NULL); + ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); + + if (IO_IS_ALLOCATING(zio) && + !(zio->io_flags & ZIO_FLAG_CANFAIL)) { + if (zio->io_error != ENOSPC) + zio->io_reexecute |= ZIO_REEXECUTE_NOW; + else + zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; + } + + if ((zio->io_type == ZIO_TYPE_READ || + zio->io_type == ZIO_TYPE_FREE) && + !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && + zio->io_error == ENXIO && + spa_load_state(spa) == SPA_LOAD_NONE && + spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE) + zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; + + if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute) + zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; + + /* + * Here is a possibly good place to attempt to do + * either combinatorial reconstruction or error correction + * based on checksums. It also might be a good place + * to send out preliminary ereports before we suspend + * processing. + */ + } + + /* + * If there were logical child errors, they apply to us now. + * We defer this until now to avoid conflating logical child + * errors with errors that happened to the zio itself when + * updating vdev stats and reporting FMA events above. + */ + zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL); + + if ((zio->io_error || zio->io_reexecute) && + IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio && + !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE))) + zio_dva_unallocate(zio, zio->io_gang_tree, bp); + + zio_gang_tree_free(&zio->io_gang_tree); + + /* + * Godfather I/Os should never suspend. + */ + if ((zio->io_flags & ZIO_FLAG_GODFATHER) && + (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) + zio->io_reexecute = 0; + + if (zio->io_reexecute) { + /* + * This is a logical I/O that wants to reexecute. + * + * Reexecute is top-down. When an i/o fails, if it's not + * the root, it simply notifies its parent and sticks around. + * The parent, seeing that it still has children in zio_done(), + * does the same. This percolates all the way up to the root. + * The root i/o will reexecute or suspend the entire tree. + * + * This approach ensures that zio_reexecute() honors + * all the original i/o dependency relationships, e.g. + * parents not executing until children are ready. + */ + ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); + + zio->io_gang_leader = NULL; + + mutex_enter(&zio->io_lock); + zio->io_state[ZIO_WAIT_DONE] = 1; + mutex_exit(&zio->io_lock); + + /* + * "The Godfather" I/O monitors its children but is + * not a true parent to them. It will track them through + * the pipeline but severs its ties whenever they get into + * trouble (e.g. suspended). This allows "The Godfather" + * I/O to return status without blocking. + */ + zl = NULL; + for (pio = zio_walk_parents(zio, &zl); pio != NULL; + pio = pio_next) { + zio_link_t *remove_zl = zl; + pio_next = zio_walk_parents(zio, &zl); + + if ((pio->io_flags & ZIO_FLAG_GODFATHER) && + (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) { + zio_remove_child(pio, zio, remove_zl); + /* + * This is a rare code path, so we don't + * bother with "next_to_execute". + */ + zio_notify_parent(pio, zio, ZIO_WAIT_DONE, + NULL); + } + } + + if ((pio = zio_unique_parent(zio)) != NULL) { + /* + * We're not a root i/o, so there's nothing to do + * but notify our parent. Don't propagate errors + * upward since we haven't permanently failed yet. 
+ */ + ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); + zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE; + /* + * This is a rare code path, so we don't bother with + * "next_to_execute". + */ + zio_notify_parent(pio, zio, ZIO_WAIT_DONE, NULL); + } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) { + /* + * We'd fail again if we reexecuted now, so suspend + * until conditions improve (e.g. device comes online). + */ + zio_suspend(spa, zio); + } else { + /* + * Reexecution is potentially a huge amount of work. + * Hand it off to the otherwise-unused claim taskq. + */ +#if defined(illumos) || !defined(_KERNEL) + ASSERT(zio->io_tqent.tqent_next == NULL); +#else + ASSERT(zio->io_tqent.tqent_task.ta_pending == 0); +#endif + spa_taskq_dispatch_ent(spa, ZIO_TYPE_CLAIM, + ZIO_TASKQ_ISSUE, (task_func_t *)zio_reexecute, zio, + 0, &zio->io_tqent); + } + return (NULL); + } + + ASSERT(zio->io_child_count == 0); + ASSERT(zio->io_reexecute == 0); + ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL)); + + /* + * Report any checksum errors, since the I/O is complete. + */ + while (zio->io_cksum_report != NULL) { + zio_cksum_report_t *zcr = zio->io_cksum_report; + zio->io_cksum_report = zcr->zcr_next; + zcr->zcr_next = NULL; + zcr->zcr_finish(zcr, NULL); + zfs_ereport_free_checksum(zcr); + } + + /* + * It is the responsibility of the done callback to ensure that this + * particular zio is no longer discoverable for adoption, and as + * such, cannot acquire any new parents. + */ + if (zio->io_done) + zio->io_done(zio); + + mutex_enter(&zio->io_lock); + zio->io_state[ZIO_WAIT_DONE] = 1; + mutex_exit(&zio->io_lock); + + /* + * We are done executing this zio. We may want to execute a parent + * next. See the comment in zio_notify_parent(). + */ + zio_t *next_to_execute = NULL; + zl = NULL; + for (pio = zio_walk_parents(zio, &zl); pio != NULL; pio = pio_next) { + zio_link_t *remove_zl = zl; + pio_next = zio_walk_parents(zio, &zl); + zio_remove_child(pio, zio, remove_zl); + zio_notify_parent(pio, zio, ZIO_WAIT_DONE, &next_to_execute); + } + + if (zio->io_waiter != NULL) { + mutex_enter(&zio->io_lock); + zio->io_executor = NULL; + cv_broadcast(&zio->io_cv); + mutex_exit(&zio->io_lock); + } else { + zio_destroy(zio); + } + + return (next_to_execute); +} + +/* + * ========================================================================== + * I/O pipeline definition + * ========================================================================== + */ +static zio_pipe_stage_t *zio_pipeline[] = { + NULL, + zio_read_bp_init, + zio_write_bp_init, + zio_free_bp_init, + zio_issue_async, + zio_write_compress, + zio_checksum_generate, + zio_nop_write, + zio_ddt_read_start, + zio_ddt_read_done, + zio_ddt_write, + zio_ddt_free, + zio_gang_assemble, + zio_gang_issue, + zio_dva_throttle, + zio_dva_allocate, + zio_dva_free, + zio_dva_claim, + zio_ready, + zio_vdev_io_start, + zio_vdev_io_done, + zio_vdev_io_assess, + zio_checksum_verify, + zio_done +}; + + + + +/* + * Compare two zbookmark_phys_t's to see which we would reach first in a + * pre-order traversal of the object tree. + * + * This is simple in every case aside from the meta-dnode object. For all other + * objects, we traverse them in order (object 1 before object 2, and so on). + * However, all of these objects are traversed while traversing object 0, since + * the data it points to is the list of objects. Thus, we need to convert to a + * canonical representation so we can compare meta-dnode bookmarks to + * non-meta-dnode bookmarks. 
+ * + * We do this by calculating "equivalents" for each field of the zbookmark. + * zbookmarks outside of the meta-dnode use their own object and level, and + * calculate the level 0 equivalent (the first L0 blkid that is contained in the + * blocks this bookmark refers to) by multiplying their blkid by their span + * (the number of L0 blocks contained within one block at their level). + * zbookmarks inside the meta-dnode calculate their object equivalent + * (which is L0equiv * dnodes per data block), use 0 for their L0equiv, and use + * level + 1<<31 (any value larger than a level could ever be) for their level. + * This causes them to always compare before a bookmark in their object + * equivalent, compare appropriately to bookmarks in other objects, and to + * compare appropriately to other bookmarks in the meta-dnode. + */ +int +zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, uint8_t ibs2, + const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2) +{ + /* + * These variables represent the "equivalent" values for the zbookmark, + * after converting zbookmarks inside the meta dnode to their + * normal-object equivalents. + */ + uint64_t zb1obj, zb2obj; + uint64_t zb1L0, zb2L0; + uint64_t zb1level, zb2level; + + if (zb1->zb_object == zb2->zb_object && + zb1->zb_level == zb2->zb_level && + zb1->zb_blkid == zb2->zb_blkid) + return (0); + + /* + * BP_SPANB calculates the span in blocks. + */ + zb1L0 = (zb1->zb_blkid) * BP_SPANB(ibs1, zb1->zb_level); + zb2L0 = (zb2->zb_blkid) * BP_SPANB(ibs2, zb2->zb_level); + + if (zb1->zb_object == DMU_META_DNODE_OBJECT) { + zb1obj = zb1L0 * (dbss1 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT)); + zb1L0 = 0; + zb1level = zb1->zb_level + COMPARE_META_LEVEL; + } else { + zb1obj = zb1->zb_object; + zb1level = zb1->zb_level; + } + + if (zb2->zb_object == DMU_META_DNODE_OBJECT) { + zb2obj = zb2L0 * (dbss2 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT)); + zb2L0 = 0; + zb2level = zb2->zb_level + COMPARE_META_LEVEL; + } else { + zb2obj = zb2->zb_object; + zb2level = zb2->zb_level; + } + + /* Now that we have a canonical representation, do the comparison. */ + if (zb1obj != zb2obj) + return (zb1obj < zb2obj ? -1 : 1); + else if (zb1L0 != zb2L0) + return (zb1L0 < zb2L0 ? -1 : 1); + else if (zb1level != zb2level) + return (zb1level > zb2level ? -1 : 1); + /* + * This can (theoretically) happen if the bookmarks have the same object + * and level, but different blkids, if the block sizes are not the same. + * There is presently no way to change the indirect block sizes + */ + return (0); +} + +/* + * This function checks the following: given that last_block is the place that + * our traversal stopped last time, does that guarantee that we've visited + * every node under subtree_root? Therefore, we can't just use the raw output + * of zbookmark_compare. We have to pass in a modified version of + * subtree_root; by incrementing the block id, and then checking whether + * last_block is before or equal to that, we can tell whether or not having + * visited last_block implies that all of subtree_root's children have been + * visited. + */ +boolean_t +zbookmark_subtree_completed(const dnode_phys_t *dnp, + const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block) +{ + zbookmark_phys_t mod_zb = *subtree_root; + mod_zb.zb_blkid++; + ASSERT(last_block->zb_level == 0); + + /* The objset_phys_t isn't before anything. 
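+	 * (A NULL dnp denotes the objset_phys_t itself, so the subtree is
+	 * conservatively reported as not yet fully visited.)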
 */
+	if (dnp == NULL)
+		return (B_FALSE);
+
+	/*
+	 * We pass in 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT) for the
+	 * data block size in sectors, because that variable is only used if
+	 * the bookmark refers to a block in the meta-dnode. Since we don't
+	 * know without examining it what object it refers to, and there's no
+	 * harm in passing in this value in other cases, we always pass it in.
+	 *
+	 * We pass in 0 for the indirect block size shift because zb2 must be
+	 * level 0. The indirect block size is only used to calculate the span
+	 * of the bookmark, but since the bookmark must be level 0, the span is
+	 * always 1, so the math works out.
+	 *
+	 * If you make changes to how the zbookmark_compare code works, be sure
+	 * that this code still works afterwards.
+	 */
+	return (zbookmark_compare(dnp->dn_datablkszsec, dnp->dn_indblkshift,
+	    1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0, &mod_zb,
+	    last_block) <= 0);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c
new file mode 100644
index 000000000000..8924804a6fcb
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c
@@ -0,0 +1,475 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/zio.h>
+#include <sys/zio_checksum.h>
+#include <sys/zil.h>
+#include <sys/abd.h>
+#include <zfs_fletcher.h>
+
+/*
+ * Checksum vectors.
+ *
+ * In the SPA, everything is checksummed. We support checksum vectors
+ * for three distinct reasons:
+ *
+ *   1. Different kinds of data need different levels of protection.
+ *      For SPA metadata, we always want a very strong checksum.
+ *      For user data, we let users make the trade-off between speed
+ *      and checksum strength.
+ *
+ *   2. Cryptographic hash and MAC algorithms are an area of active research.
+ *      It is likely that in the future, hash functions will be at least as
+ *      strong as current best-of-breed, and may be substantially faster as
+ *      well. We want the ability to take advantage of these new hashes as
+ *      soon as they become available.
+ *
+ *   3. If someone develops hardware that can compute a strong hash quickly,
+ *      we want the ability to take advantage of that hardware.
+ *
+ * Of course, we don't want a checksum upgrade to invalidate existing
+ * data, so we store the checksum *function* in eight bits of the bp.
+ * This gives us room for up to 256 different checksum functions. + * + * When writing a block, we always checksum it with the latest-and-greatest + * checksum function of the appropriate strength. When reading a block, + * we compare the expected checksum against the actual checksum, which we + * compute via the checksum function specified by BP_GET_CHECKSUM(bp). + * + * SALTED CHECKSUMS + * + * To enable the use of less secure hash algorithms with dedup, we + * introduce the notion of salted checksums (MACs, really). A salted + * checksum is fed both a random 256-bit value (the salt) and the data + * to be checksummed. This salt is kept secret (stored on the pool, but + * never shown to the user). Thus even if an attacker knew of collision + * weaknesses in the hash algorithm, they won't be able to mount a known + * plaintext attack on the DDT, since the actual hash value cannot be + * known ahead of time. How the salt is used is algorithm-specific + * (some might simply prefix it to the data block, others might need to + * utilize a full-blown HMAC). On disk the salt is stored in a ZAP + * object in the MOS (DMU_POOL_CHECKSUM_SALT). + * + * CONTEXT TEMPLATES + * + * Some hashing algorithms need to perform a substantial amount of + * initialization work (e.g. salted checksums above may need to pre-hash + * the salt) before being able to process data. Performing this + * redundant work for each block would be wasteful, so we instead allow + * a checksum algorithm to do the work once (the first time it's used) + * and then keep this pre-initialized context as a template inside the + * spa_t (spa_cksum_tmpls). If the zio_checksum_info_t contains + * non-NULL ci_tmpl_init and ci_tmpl_free callbacks, they are used to + * construct and destruct the pre-initialized checksum context. The + * pre-initialized context is then reused during each checksum + * invocation and passed to the checksum function. 
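+ */
+
+/*
+ * A minimal sketch of the template pattern described above (illustration
+ * only; the shipped logic is zio_checksum_template_init() below, and the
+ * helper name here is hypothetical). A pool's template for a given
+ * checksum is built once, on first use, double-checking under the lock.
+ */
+static void
+cksum_tmpl_sketch(spa_t *spa, enum zio_checksum c)
+{
+	zio_checksum_info_t *ci = &zio_checksum_table[c];
+
+	if (ci->ci_tmpl_init == NULL || spa->spa_cksum_tmpls[c] != NULL)
+		return;		/* no template needed, or already cached */
+
+	mutex_enter(&spa->spa_cksum_tmpls_lock);
+	if (spa->spa_cksum_tmpls[c] == NULL)	/* re-check under the lock */
+		spa->spa_cksum_tmpls[c] =
+		    ci->ci_tmpl_init(&spa->spa_cksum_salt);
+	mutex_exit(&spa->spa_cksum_tmpls_lock);
+}
+
+/*
+ * Checksum vector implementations.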
+ */ + +/*ARGSUSED*/ +static void +abd_checksum_off(abd_t *abd, uint64_t size, + const void *ctx_template, zio_cksum_t *zcp) +{ + ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); +} + +/*ARGSUSED*/ +void +abd_fletcher_2_native(abd_t *abd, uint64_t size, + const void *ctx_template, zio_cksum_t *zcp) +{ + fletcher_init(zcp); + (void) abd_iterate_func(abd, 0, size, + fletcher_2_incremental_native, zcp); +} + +/*ARGSUSED*/ +void +abd_fletcher_2_byteswap(abd_t *abd, uint64_t size, + const void *ctx_template, zio_cksum_t *zcp) +{ + fletcher_init(zcp); + (void) abd_iterate_func(abd, 0, size, + fletcher_2_incremental_byteswap, zcp); +} + +/*ARGSUSED*/ +void +abd_fletcher_4_native(abd_t *abd, uint64_t size, + const void *ctx_template, zio_cksum_t *zcp) +{ + fletcher_init(zcp); + (void) abd_iterate_func(abd, 0, size, + fletcher_4_incremental_native, zcp); +} + +/*ARGSUSED*/ +void +abd_fletcher_4_byteswap(abd_t *abd, uint64_t size, + const void *ctx_template, zio_cksum_t *zcp) +{ + fletcher_init(zcp); + (void) abd_iterate_func(abd, 0, size, + fletcher_4_incremental_byteswap, zcp); +} + +zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = { + {{NULL, NULL}, NULL, NULL, 0, "inherit"}, + {{NULL, NULL}, NULL, NULL, 0, "on"}, + {{abd_checksum_off, abd_checksum_off}, + NULL, NULL, 0, "off"}, + {{abd_checksum_SHA256, abd_checksum_SHA256}, + NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, + "label"}, + {{abd_checksum_SHA256, abd_checksum_SHA256}, + NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, + "gang_header"}, + {{abd_fletcher_2_native, abd_fletcher_2_byteswap}, + NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog"}, + {{abd_fletcher_2_native, abd_fletcher_2_byteswap}, + NULL, NULL, 0, "fletcher2"}, + {{abd_fletcher_4_native, abd_fletcher_4_byteswap}, + NULL, NULL, ZCHECKSUM_FLAG_METADATA, "fletcher4"}, + {{abd_checksum_SHA256, abd_checksum_SHA256}, + NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | + ZCHECKSUM_FLAG_NOPWRITE, "sha256"}, + {{abd_fletcher_4_native, abd_fletcher_4_byteswap}, + NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog2"}, + {{abd_checksum_off, abd_checksum_off}, + NULL, NULL, 0, "noparity"}, + {{abd_checksum_SHA512_native, abd_checksum_SHA512_byteswap}, + NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | + ZCHECKSUM_FLAG_NOPWRITE, "sha512"}, + {{abd_checksum_skein_native, abd_checksum_skein_byteswap}, + abd_checksum_skein_tmpl_init, abd_checksum_skein_tmpl_free, + ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | + ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"}, +#ifdef illumos + {{abd_checksum_edonr_native, abd_checksum_edonr_byteswap}, + abd_checksum_edonr_tmpl_init, abd_checksum_edonr_tmpl_free, + ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_SALTED | + ZCHECKSUM_FLAG_NOPWRITE, "edonr"}, +#endif +}; + +/* + * The flag corresponding to the "verify" in dedup=[checksum,]verify + * must be cleared first, so callers should use ZIO_CHECKSUM_MASK. 
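+ */
+
+/*
+ * Usage sketch (illustration only; "cksum_feature_of_prop" is a
+ * hypothetical helper, not part of this file): a dedup property such as
+ * "skein,verify" carries ZIO_CHECKSUM_VERIFY alongside the checksum
+ * index, so the flag is masked off before the feature lookup.
+ */
+static spa_feature_t
+cksum_feature_of_prop(enum zio_checksum prop)
+{
+	return (zio_checksum_to_feature(prop & ZIO_CHECKSUM_MASK));
+}
+
+/*
+ * Map a checksum function to the pool feature it depends on.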
+ */ +spa_feature_t +zio_checksum_to_feature(enum zio_checksum cksum) +{ + VERIFY((cksum & ~ZIO_CHECKSUM_MASK) == 0); + + switch (cksum) { + case ZIO_CHECKSUM_SHA512: + return (SPA_FEATURE_SHA512); + case ZIO_CHECKSUM_SKEIN: + return (SPA_FEATURE_SKEIN); +#ifdef illumos + case ZIO_CHECKSUM_EDONR: + return (SPA_FEATURE_EDONR); +#endif + } + return (SPA_FEATURE_NONE); +} + +enum zio_checksum +zio_checksum_select(enum zio_checksum child, enum zio_checksum parent) +{ + ASSERT(child < ZIO_CHECKSUM_FUNCTIONS); + ASSERT(parent < ZIO_CHECKSUM_FUNCTIONS); + ASSERT(parent != ZIO_CHECKSUM_INHERIT && parent != ZIO_CHECKSUM_ON); + + if (child == ZIO_CHECKSUM_INHERIT) + return (parent); + + if (child == ZIO_CHECKSUM_ON) + return (ZIO_CHECKSUM_ON_VALUE); + + return (child); +} + +enum zio_checksum +zio_checksum_dedup_select(spa_t *spa, enum zio_checksum child, + enum zio_checksum parent) +{ + ASSERT((child & ZIO_CHECKSUM_MASK) < ZIO_CHECKSUM_FUNCTIONS); + ASSERT((parent & ZIO_CHECKSUM_MASK) < ZIO_CHECKSUM_FUNCTIONS); + ASSERT(parent != ZIO_CHECKSUM_INHERIT && parent != ZIO_CHECKSUM_ON); + + if (child == ZIO_CHECKSUM_INHERIT) + return (parent); + + if (child == ZIO_CHECKSUM_ON) + return (spa_dedup_checksum(spa)); + + if (child == (ZIO_CHECKSUM_ON | ZIO_CHECKSUM_VERIFY)) + return (spa_dedup_checksum(spa) | ZIO_CHECKSUM_VERIFY); + + ASSERT((zio_checksum_table[child & ZIO_CHECKSUM_MASK].ci_flags & + ZCHECKSUM_FLAG_DEDUP) || + (child & ZIO_CHECKSUM_VERIFY) || child == ZIO_CHECKSUM_OFF); + + return (child); +} + +/* + * Set the external verifier for a gang block based on <vdev, offset, txg>, + * a tuple which is guaranteed to be unique for the life of the pool. + */ +static void +zio_checksum_gang_verifier(zio_cksum_t *zcp, blkptr_t *bp) +{ + dva_t *dva = BP_IDENTITY(bp); + uint64_t txg = BP_PHYSICAL_BIRTH(bp); + + ASSERT(BP_IS_GANG(bp)); + + ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0); +} + +/* + * Set the external verifier for a label block based on its offset. + * The vdev is implicit, and the txg is unknowable at pool open time -- + * hence the logic in vdev_uberblock_load() to find the most recent copy. + */ +static void +zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset) +{ + ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0); +} + +/* + * Calls the template init function of a checksum which supports context + * templates and installs the template into the spa_t. + */ +static void +zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa) +{ + zio_checksum_info_t *ci = &zio_checksum_table[checksum]; + + if (ci->ci_tmpl_init == NULL) + return; + if (spa->spa_cksum_tmpls[checksum] != NULL) + return; + + VERIFY(ci->ci_tmpl_free != NULL); + mutex_enter(&spa->spa_cksum_tmpls_lock); + if (spa->spa_cksum_tmpls[checksum] == NULL) { + spa->spa_cksum_tmpls[checksum] = + ci->ci_tmpl_init(&spa->spa_cksum_salt); + VERIFY(spa->spa_cksum_tmpls[checksum] != NULL); + } + mutex_exit(&spa->spa_cksum_tmpls_lock); +} + +/* + * Generate the checksum. 
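+ *
+ * For embedded checksums, the result is stored in a zio_eck_t inside the
+ * buffer itself (in the zil_chain_t for ZILOG2, at the tail of the buffer
+ * otherwise), located below as
+ *
+ *	eck = (zio_eck_t *)((char *)data + size) - 1;
+ *
+ * for everything else it is stored in bp->blk_cksum.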
+ */ +void +zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, + abd_t *abd, uint64_t size) +{ + blkptr_t *bp = zio->io_bp; + uint64_t offset = zio->io_offset; + zio_checksum_info_t *ci = &zio_checksum_table[checksum]; + zio_cksum_t cksum; + spa_t *spa = zio->io_spa; + + ASSERT((uint_t)checksum < ZIO_CHECKSUM_FUNCTIONS); + ASSERT(ci->ci_func[0] != NULL); + + zio_checksum_template_init(checksum, spa); + + if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) { + zio_eck_t *eck; + void *data = abd_to_buf(abd); + + if (checksum == ZIO_CHECKSUM_ZILOG2) { + zil_chain_t *zilc = data; + + size = P2ROUNDUP_TYPED(zilc->zc_nused, ZIL_MIN_BLKSZ, + uint64_t); + eck = &zilc->zc_eck; + } else { + eck = (zio_eck_t *)((char *)data + size) - 1; + } + if (checksum == ZIO_CHECKSUM_GANG_HEADER) + zio_checksum_gang_verifier(&eck->zec_cksum, bp); + else if (checksum == ZIO_CHECKSUM_LABEL) + zio_checksum_label_verifier(&eck->zec_cksum, offset); + else + bp->blk_cksum = eck->zec_cksum; + eck->zec_magic = ZEC_MAGIC; + ci->ci_func[0](abd, size, spa->spa_cksum_tmpls[checksum], + &cksum); + eck->zec_cksum = cksum; + } else { + ci->ci_func[0](abd, size, spa->spa_cksum_tmpls[checksum], + &bp->blk_cksum); + } +} + +int +zio_checksum_error_impl(spa_t *spa, blkptr_t *bp, enum zio_checksum checksum, + abd_t *abd, uint64_t size, uint64_t offset, zio_bad_cksum_t *info) +{ + zio_checksum_info_t *ci = &zio_checksum_table[checksum]; + zio_cksum_t actual_cksum, expected_cksum; + int byteswap; + + if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL) + return (SET_ERROR(EINVAL)); + + zio_checksum_template_init(checksum, spa); + + if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) { + zio_eck_t *eck; + zio_cksum_t verifier; + uint64_t data_size = size; + void *data = abd_borrow_buf_copy(abd, data_size); + + if (checksum == ZIO_CHECKSUM_ZILOG2) { + zil_chain_t *zilc = data; + uint64_t nused; + + eck = &zilc->zc_eck; + if (eck->zec_magic == ZEC_MAGIC) { + nused = zilc->zc_nused; + } else if (eck->zec_magic == BSWAP_64(ZEC_MAGIC)) { + nused = BSWAP_64(zilc->zc_nused); + } else { + abd_return_buf(abd, data, data_size); + return (SET_ERROR(ECKSUM)); + } + + if (nused > data_size) { + abd_return_buf(abd, data, data_size); + return (SET_ERROR(ECKSUM)); + } + + size = P2ROUNDUP_TYPED(nused, ZIL_MIN_BLKSZ, uint64_t); + } else { + eck = (zio_eck_t *)((char *)data + data_size) - 1; + } + + if (checksum == ZIO_CHECKSUM_GANG_HEADER) + zio_checksum_gang_verifier(&verifier, bp); + else if (checksum == ZIO_CHECKSUM_LABEL) + zio_checksum_label_verifier(&verifier, offset); + else + verifier = bp->blk_cksum; + + byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC)); + + if (byteswap) + byteswap_uint64_array(&verifier, sizeof (zio_cksum_t)); + + size_t eck_offset = (size_t)(&eck->zec_cksum) - (size_t)data; + expected_cksum = eck->zec_cksum; + eck->zec_cksum = verifier; + abd_return_buf_copy(abd, data, data_size); + + ci->ci_func[byteswap](abd, size, + spa->spa_cksum_tmpls[checksum], &actual_cksum); + abd_copy_from_buf_off(abd, &expected_cksum, + eck_offset, sizeof (zio_cksum_t)); + + if (byteswap) { + byteswap_uint64_array(&expected_cksum, + sizeof (zio_cksum_t)); + } + } else { + byteswap = BP_SHOULD_BYTESWAP(bp); + expected_cksum = bp->blk_cksum; + ci->ci_func[byteswap](abd, size, + spa->spa_cksum_tmpls[checksum], &actual_cksum); + } + + if (info != NULL) { + info->zbc_expected = expected_cksum; + info->zbc_actual = actual_cksum; + info->zbc_checksum_name = ci->ci_name; + info->zbc_byteswapped = byteswap; + info->zbc_injected = 0; + 
info->zbc_has_cksum = 1; + } + + if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) + return (SET_ERROR(ECKSUM)); + + return (0); +} + +int +zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info) +{ + blkptr_t *bp = zio->io_bp; + uint_t checksum = (bp == NULL ? zio->io_prop.zp_checksum : + (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp))); + int error; + uint64_t size = (bp == NULL ? zio->io_size : + (BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp))); + uint64_t offset = zio->io_offset; + abd_t *data = zio->io_abd; + spa_t *spa = zio->io_spa; + + error = zio_checksum_error_impl(spa, bp, checksum, data, size, + offset, info); + + if (zio_injection_enabled && error == 0 && zio->io_error == 0) { + error = zio_handle_fault_injection(zio, ECKSUM); + if (error != 0) + info->zbc_injected = 1; + } + + return (error); +} + +/* + * Called by a spa_t that's about to be deallocated. This steps through + * all of the checksum context templates and deallocates any that were + * initialized using the algorithm-specific template init function. + */ +void +zio_checksum_templates_free(spa_t *spa) +{ + for (enum zio_checksum checksum = 0; + checksum < ZIO_CHECKSUM_FUNCTIONS; checksum++) { + if (spa->spa_cksum_tmpls[checksum] != NULL) { + zio_checksum_info_t *ci = &zio_checksum_table[checksum]; + + VERIFY(ci->ci_tmpl_free != NULL); + ci->ci_tmpl_free(spa->spa_cksum_tmpls[checksum]); + spa->spa_cksum_tmpls[checksum] = NULL; + } + } +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c new file mode 100644 index 000000000000..b87303889ddb --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c @@ -0,0 +1,215 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. + * Copyright (c) 2013, 2018 by Delphix. All rights reserved. 
+ */ + +#include <sys/zfs_context.h> +#include <sys/compress.h> +#include <sys/kstat.h> +#include <sys/spa.h> +#include <sys/zfeature.h> +#include <sys/zio.h> +#include <sys/zio_compress.h> + +typedef struct zcomp_stats { + kstat_named_t zcompstat_attempts; + kstat_named_t zcompstat_empty; + kstat_named_t zcompstat_skipped_insufficient_gain; +} zcomp_stats_t; + +static zcomp_stats_t zcomp_stats = { + { "attempts", KSTAT_DATA_UINT64 }, + { "empty", KSTAT_DATA_UINT64 }, + { "skipped_insufficient_gain", KSTAT_DATA_UINT64 } +}; + +#define ZCOMPSTAT_INCR(stat, val) \ + atomic_add_64(&zcomp_stats.stat.value.ui64, (val)); + +#define ZCOMPSTAT_BUMP(stat) ZCOMPSTAT_INCR(stat, 1); + +kstat_t *zcomp_ksp; + +/* + * If nonzero, every 1/X decompression attempts will fail, simulating + * an undetected memory error. + */ +uint64_t zio_decompress_fail_fraction = 0; + +/* + * Compression vectors. + */ +zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = { + {"inherit", 0, NULL, NULL}, + {"on", 0, NULL, NULL}, + {"uncompressed", 0, NULL, NULL}, + {"lzjb", 0, lzjb_compress, lzjb_decompress}, + {"empty", 0, NULL, NULL}, + {"gzip-1", 1, gzip_compress, gzip_decompress}, + {"gzip-2", 2, gzip_compress, gzip_decompress}, + {"gzip-3", 3, gzip_compress, gzip_decompress}, + {"gzip-4", 4, gzip_compress, gzip_decompress}, + {"gzip-5", 5, gzip_compress, gzip_decompress}, + {"gzip-6", 6, gzip_compress, gzip_decompress}, + {"gzip-7", 7, gzip_compress, gzip_decompress}, + {"gzip-8", 8, gzip_compress, gzip_decompress}, + {"gzip-9", 9, gzip_compress, gzip_decompress}, + {"zle", 64, zle_compress, zle_decompress}, + {"lz4", 0, lz4_compress, lz4_decompress} +}; + +enum zio_compress +zio_compress_select(spa_t *spa, enum zio_compress child, + enum zio_compress parent) +{ + enum zio_compress result; + + ASSERT(child < ZIO_COMPRESS_FUNCTIONS); + ASSERT(parent < ZIO_COMPRESS_FUNCTIONS); + ASSERT(parent != ZIO_COMPRESS_INHERIT); + + result = child; + if (result == ZIO_COMPRESS_INHERIT) + result = parent; + + if (result == ZIO_COMPRESS_ON) { + if (spa_feature_is_active(spa, SPA_FEATURE_LZ4_COMPRESS)) + result = ZIO_COMPRESS_LZ4_ON_VALUE; + else + result = ZIO_COMPRESS_LEGACY_ON_VALUE; + } + + return (result); +} + +/*ARGSUSED*/ +static int +zio_compress_zeroed_cb(void *data, size_t len, void *private) +{ + uint64_t *end = (uint64_t *)((char *)data + len); + for (uint64_t *word = (uint64_t *)data; word < end; word++) + if (*word != 0) + return (1); + + return (0); +} + +size_t +zio_compress_data(enum zio_compress c, abd_t *src, void *dst, size_t s_len) +{ + size_t c_len, d_len; + zio_compress_info_t *ci = &zio_compress_table[c]; + + ASSERT((uint_t)c < ZIO_COMPRESS_FUNCTIONS); + ASSERT((uint_t)c == ZIO_COMPRESS_EMPTY || ci->ci_compress != NULL); + + ZCOMPSTAT_BUMP(zcompstat_attempts); + + /* + * If the data is all zeroes, we don't even need to allocate + * a block for it. We indicate this by returning zero size. 
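+	 * For example (illustrative): an 8K buffer of zeroes compresses
+	 * to size 0, and the caller records a hole rather than writing
+	 * a block.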
+ */
+	if (abd_iterate_func(src, 0, s_len, zio_compress_zeroed_cb, NULL) == 0) {
+		ZCOMPSTAT_BUMP(zcompstat_empty);
+		return (0);
+	}
+
+	if (c == ZIO_COMPRESS_EMPTY)
+		return (s_len);
+
+	/* Compress at least 12.5% */
+	d_len = s_len - (s_len >> 3);
+
+	/* No compression algorithms can read from ABDs directly */
+	void *tmp = abd_borrow_buf_copy(src, s_len);
+	c_len = ci->ci_compress(tmp, dst, s_len, d_len, ci->ci_level);
+	abd_return_buf(src, tmp, s_len);
+
+	if (c_len > d_len) {
+		ZCOMPSTAT_BUMP(zcompstat_skipped_insufficient_gain);
+		return (s_len);
+	}
+
+	ASSERT3U(c_len, <=, d_len);
+	return (c_len);
+}
+
+int
+zio_decompress_data_buf(enum zio_compress c, void *src, void *dst,
+    size_t s_len, size_t d_len)
+{
+	zio_compress_info_t *ci = &zio_compress_table[c];
+	if ((uint_t)c >= ZIO_COMPRESS_FUNCTIONS || ci->ci_decompress == NULL)
+		return (SET_ERROR(EINVAL));
+
+	return (ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level));
+}
+
+int
+zio_decompress_data(enum zio_compress c, abd_t *src, void *dst,
+    size_t s_len, size_t d_len)
+{
+	void *tmp = abd_borrow_buf_copy(src, s_len);
+	int ret = zio_decompress_data_buf(c, tmp, dst, s_len, d_len);
+	abd_return_buf(src, tmp, s_len);
+
+	/*
+	 * Decompression shouldn't fail, because we've already verified
+	 * the checksum. However, for extra protection (e.g. against bitflips
+	 * in non-ECC RAM), we handle this error (and test it).
+	 */
+	ASSERT0(ret);
+	if (zio_decompress_fail_fraction != 0 &&
+	    spa_get_random(zio_decompress_fail_fraction) == 0)
+		ret = SET_ERROR(EINVAL);
+
+	return (ret);
+}
+
+void
+zio_compress_init(void)
+{
+
+	zcomp_ksp = kstat_create("zfs", 0, "zcompstats", "misc",
+	    KSTAT_TYPE_NAMED, sizeof (zcomp_stats) / sizeof (kstat_named_t),
+	    KSTAT_FLAG_VIRTUAL);
+
+	if (zcomp_ksp != NULL) {
+		zcomp_ksp->ks_data = &zcomp_stats;
+		kstat_install(zcomp_ksp);
+	}
+}
+
+void
+zio_compress_fini(void)
+{
+	if (zcomp_ksp != NULL) {
+		kstat_delete(zcomp_ksp);
+		zcomp_ksp = NULL;
+	}
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c
new file mode 100644
index 000000000000..26f59af9968f
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c
@@ -0,0 +1,755 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ */
+
+/*
+ * ZFS fault injection
+ *
+ * To handle fault injection, we keep track of a series of zinject_record_t
+ * structures which describe which logical block(s) should be injected with a
+ * fault. These are kept in a global list.
Each record corresponds to a given + * spa_t and maintains a special hold on the spa_t so that it cannot be deleted + * or exported while the injection record exists. + * + * Device level injection is done using the 'zi_guid' field. If this is set, it + * means that the error is destined for a particular device, not a piece of + * data. + * + * This is a rather poor data structure and algorithm, but we don't expect more + * than a few faults at any one time, so it should be sufficient for our needs. + */ + +#include <sys/arc.h> +#include <sys/zio_impl.h> +#include <sys/zfs_ioctl.h> +#include <sys/vdev_impl.h> +#include <sys/dmu_objset.h> +#include <sys/fs/zfs.h> + +uint32_t zio_injection_enabled; + +/* + * Data describing each zinject handler registered on the system, and + * contains the list node linking the handler in the global zinject + * handler list. + */ +typedef struct inject_handler { + int zi_id; + spa_t *zi_spa; + zinject_record_t zi_record; + uint64_t *zi_lanes; + int zi_next_lane; + list_node_t zi_link; +} inject_handler_t; + +/* + * List of all zinject handlers registered on the system, protected by + * the inject_lock defined below. + */ +static list_t inject_handlers; + +/* + * This protects insertion into, and traversal of, the inject handler + * list defined above; as well as the inject_delay_count. Any time a + * handler is inserted or removed from the list, this lock should be + * taken as a RW_WRITER; and any time traversal is done over the list + * (without modification to it) this lock should be taken as a RW_READER. + */ +static krwlock_t inject_lock; + +/* + * This holds the number of zinject delay handlers that have been + * registered on the system. It is protected by the inject_lock defined + * above. Thus modifications to this count must be a RW_WRITER of the + * inject_lock, and reads of this count must be (at least) a RW_READER + * of the lock. + */ +static int inject_delay_count = 0; + +/* + * This lock is used only in zio_handle_io_delay(), refer to the comment + * in that function for more details. + */ +static kmutex_t inject_delay_mtx; + +/* + * Used to assign unique identifying numbers to each new zinject handler. + */ +static int inject_next_id = 1; + +/* + * Returns true if the given record matches the I/O in progress. + */ +static boolean_t +zio_match_handler(zbookmark_phys_t *zb, uint64_t type, + zinject_record_t *record, int error) +{ + /* + * Check for a match against the MOS, which is based on type + */ + if (zb->zb_objset == DMU_META_OBJSET && + record->zi_objset == DMU_META_OBJSET && + record->zi_object == DMU_META_DNODE_OBJECT) { + if (record->zi_type == DMU_OT_NONE || + type == record->zi_type) + return (record->zi_freq == 0 || + spa_get_random(100) < record->zi_freq); + else + return (B_FALSE); + } + + /* + * Check for an exact match. + */ + if (zb->zb_objset == record->zi_objset && + zb->zb_object == record->zi_object && + zb->zb_level == record->zi_level && + zb->zb_blkid >= record->zi_start && + zb->zb_blkid <= record->zi_end && + error == record->zi_error) + return (record->zi_freq == 0 || + spa_get_random(100) < record->zi_freq); + + return (B_FALSE); +} + +/* + * Panic the system when a config change happens in the function + * specified by tag. 
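+ *
+ * For example (illustrative), this is armed from userland via
+ * zinject(8)'s panic option:
+ *
+ *	zinject -p <function> <pool>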
+ */
+void
+zio_handle_panic_injection(spa_t *spa, char *tag, uint64_t type)
+{
+	inject_handler_t *handler;
+
+	rw_enter(&inject_lock, RW_READER);
+
+	for (handler = list_head(&inject_handlers); handler != NULL;
+	    handler = list_next(&inject_handlers, handler)) {
+
+		if (spa != handler->zi_spa)
+			continue;
+
+		if (handler->zi_record.zi_type == type &&
+		    strcmp(tag, handler->zi_record.zi_func) == 0)
+			panic("Panic requested in function %s\n", tag);
+	}
+
+	rw_exit(&inject_lock);
+}
+
+/*
+ * Determine if the I/O in question should return failure. Returns the errno
+ * to be returned to the caller.
+ */
+int
+zio_handle_fault_injection(zio_t *zio, int error)
+{
+	int ret = 0;
+	inject_handler_t *handler;
+
+	/*
+	 * Ignore I/O not associated with any logical data.
+	 */
+	if (zio->io_logical == NULL)
+		return (0);
+
+	/*
+	 * Currently, we only support fault injection on reads.
+	 */
+	if (zio->io_type != ZIO_TYPE_READ)
+		return (0);
+
+	rw_enter(&inject_lock, RW_READER);
+
+	for (handler = list_head(&inject_handlers); handler != NULL;
+	    handler = list_next(&inject_handlers, handler)) {
+
+		if (zio->io_spa != handler->zi_spa ||
+		    handler->zi_record.zi_cmd != ZINJECT_DATA_FAULT)
+			continue;
+
+		/* If this handler matches, return EIO */
+		if (zio_match_handler(&zio->io_logical->io_bookmark,
+		    zio->io_bp ? BP_GET_TYPE(zio->io_bp) : DMU_OT_NONE,
+		    &handler->zi_record, error)) {
+			ret = error;
+			break;
+		}
+	}
+
+	rw_exit(&inject_lock);
+
+	return (ret);
+}
+
+/*
+ * Determine if the zio is part of a label update and has an injection
+ * handler associated with that portion of the label. Currently, we
+ * allow error injection in either the nvlist or the uberblock region
+ * of the vdev label.
+ */
+int
+zio_handle_label_injection(zio_t *zio, int error)
+{
+	inject_handler_t *handler;
+	vdev_t *vd = zio->io_vd;
+	uint64_t offset = zio->io_offset;
+	int label;
+	int ret = 0;
+
+	if (offset >= VDEV_LABEL_START_SIZE &&
+	    offset < vd->vdev_psize - VDEV_LABEL_END_SIZE)
+		return (0);
+
+	rw_enter(&inject_lock, RW_READER);
+
+	for (handler = list_head(&inject_handlers); handler != NULL;
+	    handler = list_next(&inject_handlers, handler)) {
+		uint64_t start = handler->zi_record.zi_start;
+		uint64_t end = handler->zi_record.zi_end;
+
+		if (handler->zi_record.zi_cmd != ZINJECT_LABEL_FAULT)
+			continue;
+
+		/*
+		 * The injection region is the relative offsets within a
+		 * vdev label. We must determine the label which is being
+		 * updated and adjust our region accordingly.
+		 */
+		label = vdev_label_number(vd->vdev_psize, offset);
+		start = vdev_label_offset(vd->vdev_psize, label, start);
+		end = vdev_label_offset(vd->vdev_psize, label, end);
+
+		if (zio->io_vd->vdev_guid == handler->zi_record.zi_guid &&
+		    (offset >= start && offset <= end)) {
+			ret = error;
+			break;
+		}
+	}
+	rw_exit(&inject_lock);
+	return (ret);
+}
+
+
+int
+zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error)
+{
+	inject_handler_t *handler;
+	int ret = 0;
+
+	/*
+	 * We skip over faults in the labels unless it's during
+	 * device open (i.e. zio == NULL).
+ */ + if (zio != NULL) { + uint64_t offset = zio->io_offset; + + if (offset < VDEV_LABEL_START_SIZE || + offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE) + return (0); + } + + rw_enter(&inject_lock, RW_READER); + + for (handler = list_head(&inject_handlers); handler != NULL; + handler = list_next(&inject_handlers, handler)) { + + if (handler->zi_record.zi_cmd != ZINJECT_DEVICE_FAULT) + continue; + + if (vd->vdev_guid == handler->zi_record.zi_guid) { + if (handler->zi_record.zi_failfast && + (zio == NULL || (zio->io_flags & + (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))) { + continue; + } + + /* Handle type specific I/O failures */ + if (zio != NULL && + handler->zi_record.zi_iotype != ZIO_TYPES && + handler->zi_record.zi_iotype != zio->io_type) + continue; + + if (handler->zi_record.zi_error == error) { + /* + * For a failed open, pretend like the device + * has gone away. + */ + if (error == ENXIO) + vd->vdev_stat.vs_aux = + VDEV_AUX_OPEN_FAILED; + + /* + * Treat these errors as if they had been + * retried so that all the appropriate stats + * and FMA events are generated. + */ + if (!handler->zi_record.zi_failfast && + zio != NULL) + zio->io_flags |= ZIO_FLAG_IO_RETRY; + + ret = error; + break; + } + if (handler->zi_record.zi_error == ENXIO) { + ret = SET_ERROR(EIO); + break; + } + } + } + + rw_exit(&inject_lock); + + return (ret); +} + +/* + * Simulate hardware that ignores cache flushes. For requested number + * of seconds nix the actual writing to disk. + */ +void +zio_handle_ignored_writes(zio_t *zio) +{ + inject_handler_t *handler; + + rw_enter(&inject_lock, RW_READER); + + for (handler = list_head(&inject_handlers); handler != NULL; + handler = list_next(&inject_handlers, handler)) { + + /* Ignore errors not destined for this pool */ + if (zio->io_spa != handler->zi_spa || + handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES) + continue; + + /* + * Positive duration implies # of seconds, negative + * a number of txgs + */ + if (handler->zi_record.zi_timer == 0) { + if (handler->zi_record.zi_duration > 0) + handler->zi_record.zi_timer = ddi_get_lbolt64(); + else + handler->zi_record.zi_timer = zio->io_txg; + } + + /* Have a "problem" writing 60% of the time */ + if (spa_get_random(100) < 60) + zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; + break; + } + + rw_exit(&inject_lock); +} + +void +spa_handle_ignored_writes(spa_t *spa) +{ + inject_handler_t *handler; + + if (zio_injection_enabled == 0) + return; + + rw_enter(&inject_lock, RW_READER); + + for (handler = list_head(&inject_handlers); handler != NULL; + handler = list_next(&inject_handlers, handler)) { + + if (spa != handler->zi_spa || + handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES) + continue; + + if (handler->zi_record.zi_duration > 0) { + VERIFY(handler->zi_record.zi_timer == 0 || + handler->zi_record.zi_timer + + handler->zi_record.zi_duration * hz > + ddi_get_lbolt64()); + } else { + /* duration is negative so the subtraction here adds */ + VERIFY(handler->zi_record.zi_timer == 0 || + handler->zi_record.zi_timer - + handler->zi_record.zi_duration >= + spa_syncing_txg(spa)); + } + } + + rw_exit(&inject_lock); +} + +hrtime_t +zio_handle_io_delay(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + inject_handler_t *min_handler = NULL; + hrtime_t min_target = 0; + + rw_enter(&inject_lock, RW_READER); + + /* + * inject_delay_count is a subset of zio_injection_enabled that + * is only incremented for delay handlers. 
These checks are
+	 * mainly added to remind the reader why we're not explicitly
+	 * checking zio_injection_enabled like the other functions.
+	 */
+	IMPLY(inject_delay_count > 0, zio_injection_enabled > 0);
+	IMPLY(zio_injection_enabled == 0, inject_delay_count == 0);
+
+	/*
+	 * If there aren't any inject delay handlers registered, then we
+	 * can short circuit and simply return 0 here. A value of zero
+	 * informs zio_delay_interrupt() that this request should not be
+	 * delayed. This short circuit keeps us from acquiring the
+	 * inject_delay_mutex unnecessarily.
+	 */
+	if (inject_delay_count == 0) {
+		rw_exit(&inject_lock);
+		return (0);
+	}
+
+	/*
+	 * Each inject handler has a number of "lanes" associated with
+	 * it. Each lane is able to handle requests independently of one
+	 * another, and at a latency defined by the inject handler
+	 * record's zi_timer field. Thus if a handler is configured with
+	 * a single lane with a 10ms latency, it will delay requests
+	 * such that only a single request is completed every 10ms. So,
+	 * if more than one request is attempted per each 10ms interval,
+	 * the average latency of the requests will be greater than
+	 * 10ms; but if only a single request is submitted each 10ms
+	 * interval the average latency will be 10ms.
+	 *
+	 * We need to acquire this mutex to prevent multiple concurrent
+	 * threads being assigned to the same lane of a given inject
+	 * handler. The mutex allows us to perform the following two
+	 * operations atomically:
+	 *
+	 *	1. determine the minimum handler and minimum target
+	 *	   value of all the possible handlers
+	 *	2. update that minimum handler's lane array
+	 *
+	 * Without atomicity, two (or more) threads could pick the same
+	 * lane in step (1), and then conflict with each other in step
+	 * (2). This could allow a single lane handler to process
+	 * multiple requests simultaneously, which shouldn't be possible.
+	 */
+	mutex_enter(&inject_delay_mtx);
+
+	for (inject_handler_t *handler = list_head(&inject_handlers);
+	    handler != NULL; handler = list_next(&inject_handlers, handler)) {
+		if (handler->zi_record.zi_cmd != ZINJECT_DELAY_IO)
+			continue;
+
+		if (vd->vdev_guid != handler->zi_record.zi_guid)
+			continue;
+
+		/*
+		 * Defensive; should never happen as the array allocation
+		 * occurs prior to inserting this handler on the list.
+		 */
+		ASSERT3P(handler->zi_lanes, !=, NULL);
+
+		/*
+		 * This should never happen, the zinject command should
+		 * prevent a user from setting an IO delay with zero lanes.
+		 */
+		ASSERT3U(handler->zi_record.zi_nlanes, !=, 0);
+
+		ASSERT3U(handler->zi_record.zi_nlanes, >,
+		    handler->zi_next_lane);
+
+		/*
+		 * We want to issue this IO to the lane that will become
+		 * idle the soonest, so we compare the soonest this
+		 * specific handler can complete the IO with all other
+		 * handlers, to find the lowest value of all possible
+		 * lanes. We then use this lane to submit the request.
+		 *
+		 * Since each handler has a constant value for its
+		 * delay, we can just use the "next" lane for that
+		 * handler; as it will always be the lane with the
+		 * lowest value for that particular handler (i.e. the
+		 * lane that will become idle the soonest). This saves a
+		 * scan of each handler's lanes array.
+		 *
+		 * There are two cases to consider when determining when
+		 * this specific IO request should complete. If this
+		 * lane is idle, we want to "submit" the request now so
+		 * it will complete after zi_timer milliseconds. Thus,
+		 * we set the target to now + zi_timer.
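+		 * For example (illustrative): with zi_timer = 10ms, a
+		 * request arriving at t = 20ms on an idle lane is given
+		 * target = 30ms, and the lane is recorded busy until then.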
+ * + * If the lane is busy, we want this request to complete + * zi_timer milliseconds after the lane becomes idle. + * Since the 'zi_lanes' array holds the time at which + * each lane will become idle, we use that value to + * determine when this request should complete. + */ + hrtime_t idle = handler->zi_record.zi_timer + gethrtime(); + hrtime_t busy = handler->zi_record.zi_timer + + handler->zi_lanes[handler->zi_next_lane]; + hrtime_t target = MAX(idle, busy); + + if (min_handler == NULL) { + min_handler = handler; + min_target = target; + continue; + } + + ASSERT3P(min_handler, !=, NULL); + ASSERT3U(min_target, !=, 0); + + /* + * We don't yet increment the "next lane" variable since + * we still might find a lower value lane in another + * handler during any remaining iterations. Once we're + * sure we've selected the absolute minimum, we'll claim + * the lane and increment the handler's "next lane" + * field below. + */ + + if (target < min_target) { + min_handler = handler; + min_target = target; + } + } + + /* + * 'min_handler' will be NULL if no IO delays are registered for + * this vdev, otherwise it will point to the handler containing + * the lane that will become idle the soonest. + */ + if (min_handler != NULL) { + ASSERT3U(min_target, !=, 0); + min_handler->zi_lanes[min_handler->zi_next_lane] = min_target; + + /* + * If we've used all possible lanes for this handler, + * loop back and start using the first lane again; + * otherwise, just increment the lane index. + */ + min_handler->zi_next_lane = (min_handler->zi_next_lane + 1) % + min_handler->zi_record.zi_nlanes; + } + + mutex_exit(&inject_delay_mtx); + rw_exit(&inject_lock); + + return (min_target); +} + +/* + * Create a new handler for the given record. We add it to the list, adding + * a reference to the spa_t in the process. We increment zio_injection_enabled, + * which is the switch to trigger all fault injection. + */ +int +zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record) +{ + inject_handler_t *handler; + int error; + spa_t *spa; + + /* + * If this is pool-wide metadata, make sure we unload the corresponding + * spa_t, so that the next attempt to load it will trigger the fault. + * We call spa_reset() to unload the pool appropriately. + */ + if (flags & ZINJECT_UNLOAD_SPA) + if ((error = spa_reset(name)) != 0) + return (error); + + if (record->zi_cmd == ZINJECT_DELAY_IO) { + /* + * A value of zero for the number of lanes or for the + * delay time doesn't make sense. + */ + if (record->zi_timer == 0 || record->zi_nlanes == 0) + return (SET_ERROR(EINVAL)); + + /* + * The number of lanes is directly mapped to the size of + * an array used by the handler. Thus, to ensure the + * user doesn't trigger an allocation that's "too large" + * we cap the number of lanes here. + */ + if (record->zi_nlanes >= UINT16_MAX) + return (SET_ERROR(EINVAL)); + } + + if (!(flags & ZINJECT_NULL)) { + /* + * spa_inject_ref() will add an injection reference, which will + * prevent the pool from being removed from the namespace while + * still allowing it to be unloaded. 
+ */ + if ((spa = spa_inject_addref(name)) == NULL) + return (SET_ERROR(ENOENT)); + + handler = kmem_alloc(sizeof (inject_handler_t), KM_SLEEP); + + handler->zi_spa = spa; + handler->zi_record = *record; + + if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) { + handler->zi_lanes = kmem_zalloc( + sizeof (*handler->zi_lanes) * + handler->zi_record.zi_nlanes, KM_SLEEP); + handler->zi_next_lane = 0; + } else { + handler->zi_lanes = NULL; + handler->zi_next_lane = 0; + } + + rw_enter(&inject_lock, RW_WRITER); + + /* + * We can't move this increment into the conditional + * above because we need to hold the RW_WRITER lock of + * inject_lock, and we don't want to hold that while + * allocating the handler's zi_lanes array. + */ + if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) { + ASSERT3S(inject_delay_count, >=, 0); + inject_delay_count++; + ASSERT3S(inject_delay_count, >, 0); + } + + *id = handler->zi_id = inject_next_id++; + list_insert_tail(&inject_handlers, handler); + atomic_inc_32(&zio_injection_enabled); + + rw_exit(&inject_lock); + } + + /* + * Flush the ARC, so that any attempts to read this data will end up + * going to the ZIO layer. Note that this is a little overkill, but + * we don't have the necessary ARC interfaces to do anything else, and + * fault injection isn't a performance critical path. + */ + if (flags & ZINJECT_FLUSH_ARC) + /* + * We must use FALSE to ensure arc_flush returns, since + * we're not preventing concurrent ARC insertions. + */ + arc_flush(NULL, FALSE); + + return (0); +} + +/* + * Returns the next record with an ID greater than that supplied to the + * function. Used to iterate over all handlers in the system. + */ +int +zio_inject_list_next(int *id, char *name, size_t buflen, + zinject_record_t *record) +{ + inject_handler_t *handler; + int ret; + + mutex_enter(&spa_namespace_lock); + rw_enter(&inject_lock, RW_READER); + + for (handler = list_head(&inject_handlers); handler != NULL; + handler = list_next(&inject_handlers, handler)) + if (handler->zi_id > *id) + break; + + if (handler) { + *record = handler->zi_record; + *id = handler->zi_id; + (void) strncpy(name, spa_name(handler->zi_spa), buflen); + ret = 0; + } else { + ret = SET_ERROR(ENOENT); + } + + rw_exit(&inject_lock); + mutex_exit(&spa_namespace_lock); + + return (ret); +} + +/* + * Clear the fault handler with the given identifier, or return ENOENT if none + * exists. 
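+ * + * A sketch of the intended pairing with zio_inject_fault() above + * (illustrative; 'poolname', 'flags' and 'record' stand in for what + * the caller, e.g. the zinject ioctl path, supplies): + * + * int id; + * error = zio_inject_fault(poolname, flags, &id, &record); + * ... exercise the injected fault ... + * error = zio_clear_fault(id);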
+ */ +int +zio_clear_fault(int id) +{ + inject_handler_t *handler; + + rw_enter(&inject_lock, RW_WRITER); + + for (handler = list_head(&inject_handlers); handler != NULL; + handler = list_next(&inject_handlers, handler)) + if (handler->zi_id == id) + break; + + if (handler == NULL) { + rw_exit(&inject_lock); + return (SET_ERROR(ENOENT)); + } + + if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) { + ASSERT3S(inject_delay_count, >, 0); + inject_delay_count--; + ASSERT3S(inject_delay_count, >=, 0); + } + + list_remove(&inject_handlers, handler); + rw_exit(&inject_lock); + + if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) { + ASSERT3P(handler->zi_lanes, !=, NULL); + kmem_free(handler->zi_lanes, sizeof (*handler->zi_lanes) * + handler->zi_record.zi_nlanes); + } else { + ASSERT3P(handler->zi_lanes, ==, NULL); + } + + spa_inject_delref(handler->zi_spa); + kmem_free(handler, sizeof (inject_handler_t)); + atomic_dec_32(&zio_injection_enabled); + + return (0); +} + +void +zio_inject_init(void) +{ + rw_init(&inject_lock, NULL, RW_DEFAULT, NULL); + mutex_init(&inject_delay_mtx, NULL, MUTEX_DEFAULT, NULL); + list_create(&inject_handlers, sizeof (inject_handler_t), + offsetof(inject_handler_t, zi_link)); +} + +void +zio_inject_fini(void) +{ + list_destroy(&inject_handlers); + mutex_destroy(&inject_delay_mtx); + rw_destroy(&inject_lock); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zle.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zle.c new file mode 100644 index 000000000000..13c5673fbe26 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zle.c @@ -0,0 +1,86 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Zero-length encoding. This is a fast and simple algorithm to eliminate + * runs of zeroes. Each chunk of compressed data begins with a length byte, b. + * If b < n (where n is the compression parameter) then the next b + 1 bytes + * are literal values. If b >= n then the next (b - n + 1) bytes are zero.
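+ * + * A worked example, taking n = 64 purely for illustration (the caller + * supplies n): the input 11 22 33 followed by 200 zero bytes encodes + * as 02 11 22 33 (b = 2 < 64, so three literals), then ff (b = 255, + * i.e. 255 - 64 + 1 = 192 zeroes, the per-chunk maximum of 256 - n), + * then 47 (b = 0x47 = 71, i.e. 71 - 64 + 1 = 8 zeroes).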
+ */ +#include <sys/types.h> +#include <sys/sysmacros.h> + +size_t +zle_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) +{ + uchar_t *src = s_start; + uchar_t *dst = d_start; + uchar_t *s_end = src + s_len; + uchar_t *d_end = dst + d_len; + + while (src < s_end && dst < d_end - 1) { + uchar_t *first = src; + uchar_t *len = dst++; + if (src[0] == 0) { + uchar_t *last = src + (256 - n); + while (src < MIN(last, s_end) && src[0] == 0) + src++; + *len = src - first - 1 + n; + } else { + uchar_t *last = src + n; + if (d_end - dst < n) + break; + while (src < MIN(last, s_end) - 1 && (src[0] | src[1])) + *dst++ = *src++; + if (src[0]) + *dst++ = *src++; + *len = src - first - 1; + } + } + return (src == s_end ? dst - (uchar_t *)d_start : s_len); +} + +int +zle_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) +{ + uchar_t *src = s_start; + uchar_t *dst = d_start; + uchar_t *s_end = src + s_len; + uchar_t *d_end = dst + d_len; + + while (src < s_end && dst < d_end) { + int len = 1 + *src++; + if (len <= n) { + while (len-- != 0) + *dst++ = *src++; + } else { + len -= n; + while (len-- != 0) + *dst++ = 0; + } + } + return (dst == d_end ? 0 : -1); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zrlock.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zrlock.c new file mode 100644 index 000000000000..ec333e54d2a8 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zrlock.c @@ -0,0 +1,187 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, 2015 by Delphix. All rights reserved. + * Copyright 2016 The MathWorks, Inc. All rights reserved. + */ + +/* + * A Zero Reference Lock (ZRL) is a reference count that can lock out new + * references only when the count is zero and only without waiting if the count + * is not already zero. It is similar to a read-write lock in that it allows + * multiple readers and only a single writer, but it does not allow a writer to + * block while waiting for readers to exit, and therefore the question of + * reader/writer priority is moot (no WRWANT bit). Since the equivalent of + * rw_enter(&lock, RW_WRITER) is disallowed and only tryenter() is allowed, it + * is perfectly safe for the same reader to acquire the same lock multiple + * times. The fact that a ZRL is reentrant for readers (through multiple calls + * to zrl_add()) makes it convenient for determining whether something is + * actively referenced without the fuss of flagging lock ownership across + * function calls. 
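+ * + * A usage sketch (illustrative: 'obj' and its zr_lock field are + * hypothetical, and zrl_add() is assumed to be the sys/zrlock.h + * wrapper around zrl_add_impl() below): + * + * zrl_add(&obj->zr_lock); + * ... use obj; other holders may take references concurrently ... + * zrl_remove(&obj->zr_lock); + * + * while a would-be writer may only try, never block: + * + * if (zrl_tryenter(&obj->zr_lock)) { + * ... obj has zero references and new ones are locked out ... + * zrl_exit(&obj->zr_lock); + * }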
+ */ +#include <sys/zrlock.h> + +/* + * A ZRL can be locked only while there are zero references, so ZRL_LOCKED is + * treated as zero references. + */ +#define ZRL_LOCKED -1 +#define ZRL_DESTROYED -2 + +void +zrl_init(zrlock_t *zrl) +{ + mutex_init(&zrl->zr_mtx, NULL, MUTEX_DEFAULT, NULL); + zrl->zr_refcount = 0; + cv_init(&zrl->zr_cv, NULL, CV_DEFAULT, NULL); +#ifdef ZFS_DEBUG + zrl->zr_owner = NULL; + zrl->zr_caller = NULL; +#endif +} + +void +zrl_destroy(zrlock_t *zrl) +{ + ASSERT0(zrl->zr_refcount); + + mutex_destroy(&zrl->zr_mtx); + zrl->zr_refcount = ZRL_DESTROYED; + cv_destroy(&zrl->zr_cv); +} + +void +zrl_add_impl(zrlock_t *zrl, const char *zc) +{ + for (;;) { + uint32_t n = (uint32_t)zrl->zr_refcount; + while (n != ZRL_LOCKED) { + uint32_t cas = atomic_cas_32( + (uint32_t *)&zrl->zr_refcount, n, n + 1); + if (cas == n) { + ASSERT3S((int32_t)n, >=, 0); +#ifdef ZFS_DEBUG + if (zrl->zr_owner == curthread) { + DTRACE_PROBE2(zrlock__reentry, + zrlock_t *, zrl, uint32_t, n); + } + zrl->zr_owner = curthread; + zrl->zr_caller = zc; +#endif + return; + } + n = cas; + } + + mutex_enter(&zrl->zr_mtx); + while (zrl->zr_refcount == ZRL_LOCKED) { + cv_wait(&zrl->zr_cv, &zrl->zr_mtx); + } + mutex_exit(&zrl->zr_mtx); + } +} + +void +zrl_remove(zrlock_t *zrl) +{ + uint32_t n; + +#ifdef ZFS_DEBUG + if (zrl->zr_owner == curthread) { + zrl->zr_owner = NULL; + zrl->zr_caller = NULL; + } +#endif + n = atomic_dec_32_nv((uint32_t *)&zrl->zr_refcount); + ASSERT3S((int32_t)n, >=, 0); +} + +int +zrl_tryenter(zrlock_t *zrl) +{ + uint32_t n = (uint32_t)zrl->zr_refcount; + + if (n == 0) { + uint32_t cas = atomic_cas_32( + (uint32_t *)&zrl->zr_refcount, 0, ZRL_LOCKED); + if (cas == 0) { +#ifdef ZFS_DEBUG + ASSERT3P(zrl->zr_owner, ==, NULL); + zrl->zr_owner = curthread; +#endif + return (1); + } + } + + ASSERT3S((int32_t)n, >, ZRL_DESTROYED); + + return (0); +} + +void +zrl_exit(zrlock_t *zrl) +{ + ASSERT3S(zrl->zr_refcount, ==, ZRL_LOCKED); + + mutex_enter(&zrl->zr_mtx); +#ifdef ZFS_DEBUG + ASSERT3P(zrl->zr_owner, ==, curthread); + zrl->zr_owner = NULL; + membar_producer(); /* make sure the owner store happens first */ +#endif + zrl->zr_refcount = 0; + cv_broadcast(&zrl->zr_cv); + mutex_exit(&zrl->zr_mtx); +} + +int +zrl_refcount(zrlock_t *zrl) +{ + ASSERT3S(zrl->zr_refcount, >, ZRL_DESTROYED); + + int n = (int)zrl->zr_refcount; + return (n <= 0 ? 0 : n); +} + +int +zrl_is_zero(zrlock_t *zrl) +{ + ASSERT3S(zrl->zr_refcount, >, ZRL_DESTROYED); + + return (zrl->zr_refcount <= 0); +} + +int +zrl_is_locked(zrlock_t *zrl) +{ + ASSERT3S(zrl->zr_refcount, >, ZRL_DESTROYED); + + return (zrl->zr_refcount == ZRL_LOCKED); +} + +#ifdef ZFS_DEBUG +kthread_t * +zrl_owner(zrlock_t *zrl) +{ + return (zrl->zr_owner); +} +#endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zthr.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zthr.c new file mode 100644 index 000000000000..f26f911cb00e --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zthr.c @@ -0,0 +1,343 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2017 by Delphix. All rights reserved. 
+ */ + +/* + * ZTHR Infrastructure + * =================== + * + * ZTHR threads are used for isolated operations that span multiple txgs + * within a SPA. They generally exist from SPA creation/loading and until + * the SPA is exported/destroyed. The ideal requirements for an operation + * to be modeled with a zthr are the following: + * + * 1] The operation needs to run over multiple txgs. + * 2] There must be a single point of reference in memory or on disk that + * indicates whether the operation should run/is running or is + * stopped. + * + * If the operation satisfies the above then the following rules guarantee + * a certain level of correctness: + * + * 1] Any thread EXCEPT the zthr changes the work indicator from stopped + * to running but not the opposite. + * 2] Only the zthr can change the work indicator from running to stopped + * (e.g. when it is done) but not the opposite. + * + * This way a normal zthr cycle should go like this: + * + * 1] An external thread changes the work indicator from stopped to + * running and wakes up the zthr. + * 2] The zthr wakes up, checks the indicator and starts working. + * 3] When the zthr is done, it changes the indicator to stopped, allowing + * a new cycle to start. + * + * Besides being awakened by other threads, a zthr can be configured + * during creation to wake up on its own after a specified interval + * [see zthr_create_timer()]. + * + * == ZTHR creation + * + * Every zthr needs three inputs to start running: + * + * 1] A user-defined checker function (checkfunc) that decides whether + * the zthr should start working or go to sleep. The function should + * return TRUE when the zthr needs to work or FALSE to let it sleep, + * and should adhere to the following signature: + * boolean_t checkfunc_name(void *args, zthr_t *t); + * + * 2] A user-defined ZTHR function (func) which the zthr executes when + * it is not sleeping. The function should adhere to the following + * signature type: + * int func_name(void *args, zthr_t *t); + * + * 3] A void args pointer that will be passed to checkfunc and func + * implicitly by the infrastructure. + * + * The reason why the above API needs two different functions, + * instead of one that both checks and does the work, has to do with + * the zthr's internal lock (zthr_lock) and the allowed cancellation + * windows. We want to hold the zthr_lock while running checkfunc + * but not while running func. This way the zthr can be cancelled + * while doing work and not while checking for work. + * + * To start a zthr: + * zthr_t *zthr_pointer = zthr_create(checkfunc, func, args); + * or + * zthr_t *zthr_pointer = zthr_create_timer(checkfunc, func, + * args, max_sleep); + * + * After that you should be able to wake up, cancel, and resume the + * zthr from another thread using zthr_pointer. + * + * NOTE: ZTHR threads could potentially wake up spuriously and the + * user should take this into account when writing a checkfunc. + * [see ZTHR state transitions] + * + * == ZTHR cancellation + * + * ZTHR threads must be cancelled when their SPA is being exported + * or when they need to be paused so they don't interfere with other + * operations. + * + * To cancel a zthr: + * zthr_cancel(zthr_pointer); + * + * To resume it: + * zthr_resume(zthr_pointer); + * + * A zthr will implicitly check if it has received a cancellation + * signal every time func returns and every time it wakes up [see ZTHR + * state transitions below]. + * + * At times, waiting for the zthr's func to finish its job may take + * time.
That wait can be costly for operations that + * need to cancel the SPA's zthrs (e.g. spa_export). For this scenario + * the user can explicitly make their ZTHR function aware of incoming + * cancellation signals using zthr_iscancelled(). A common pattern for + * that looks like this: + * + * int + * func_name(void *args, zthr_t *t) + * { + * ... <unpack args> ... + * while (!work_done && !zthr_iscancelled(t)) { + * ... <do more work> ... + * } + * return (0); + * } + * + * == ZTHR exit + * + * For the rare cases where the zthr wants to stop running voluntarily + * while running its ZTHR function (func), we provide zthr_exit(). + * When a zthr has voluntarily stopped running, it can be resumed with + * zthr_resume(), just like it would if it had been cancelled by some + * other thread. + * + * == ZTHR cleanup + * + * Cancelling a zthr doesn't clean up its metadata (internal locks, + * function pointers to func and checkfunc, etc.). This is because + * we want to keep them around in case we want to resume the execution + * of the zthr later. The same applies to zthrs that exit themselves. + * + * To completely clean up a zthr, cancel it first to ensure that it + * is not running and then use zthr_destroy(). + * + * == ZTHR state transitions + * + * zthr creation + * + + * | + * | woke up + * | +--------------+ sleep + * | | ^ + * | | | + * | | | FALSE + * | | | + * v v FALSE + + * cancelled? +---------> checkfunc? + * + ^ + + * | | | + * | | | TRUE + * | | | + * | | func returned v + * | +---------------+ func + * | + * | TRUE + * | + * v + * zthr stopped running + * + */ + +#include <sys/zfs_context.h> +#include <sys/zthr.h> + +void +zthr_exit(zthr_t *t, int rc) +{ + ASSERT3P(t->zthr_thread, ==, curthread); + mutex_enter(&t->zthr_lock); + t->zthr_thread = NULL; + t->zthr_rc = rc; + cv_broadcast(&t->zthr_cv); + mutex_exit(&t->zthr_lock); + thread_exit(); +} + +static void +zthr_procedure(void *arg) +{ + zthr_t *t = arg; + int rc = 0; + + mutex_enter(&t->zthr_lock); + while (!t->zthr_cancel) { + if (t->zthr_checkfunc(t->zthr_arg, t)) { + mutex_exit(&t->zthr_lock); + rc = t->zthr_func(t->zthr_arg, t); + mutex_enter(&t->zthr_lock); + } else { + /* go to sleep */ + if (t->zthr_wait_time == 0) { + cv_wait(&t->zthr_cv, &t->zthr_lock); + } else { + (void) cv_timedwait_hires(&t->zthr_cv, + &t->zthr_lock, t->zthr_wait_time, + MSEC2NSEC(1), 0); + } + } + } + mutex_exit(&t->zthr_lock); + + zthr_exit(t, rc); +} + +zthr_t * +zthr_create(zthr_checkfunc_t *checkfunc, zthr_func_t *func, void *arg) +{ + return (zthr_create_timer(checkfunc, func, arg, (hrtime_t)0)); +} + +/* + * Create a zthr with a specified maximum sleep time. If the time spent + * sleeping exceeds max_sleep, the zthr wakes up on its own (i.e. it runs + * checkfunc and starts working if required).
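+ * + * For example (a sketch; my_checkfunc, my_func and my_arg are + * hypothetical), a zthr that re-checks for work at least once a + * second: + * + * zthr_t *t = zthr_create_timer(my_checkfunc, my_func, + * my_arg, SEC2NSEC(1));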
+ */ +zthr_t * +zthr_create_timer(zthr_checkfunc_t *checkfunc, zthr_func_t *func, + void *arg, hrtime_t max_sleep) +{ + zthr_t *t = kmem_zalloc(sizeof (*t), KM_SLEEP); + mutex_init(&t->zthr_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&t->zthr_cv, NULL, CV_DEFAULT, NULL); + + mutex_enter(&t->zthr_lock); + t->zthr_checkfunc = checkfunc; + t->zthr_func = func; + t->zthr_arg = arg; + t->zthr_wait_time = max_sleep; + + t->zthr_thread = thread_create(NULL, 0, zthr_procedure, t, + 0, &p0, TS_RUN, minclsyspri); + mutex_exit(&t->zthr_lock); + + return (t); +} + +void +zthr_destroy(zthr_t *t) +{ + VERIFY3P(t->zthr_thread, ==, NULL); + mutex_destroy(&t->zthr_lock); + cv_destroy(&t->zthr_cv); + kmem_free(t, sizeof (*t)); +} + +/* + * Note: If the zthr is not sleeping and misses the wakeup + * (e.g. it is running its ZTHR function), it will check if + * there is work to do before going to sleep using its checker + * function [see ZTHR state transition in ZTHR block comment]. + * Thus, missing the wakeup still yields the expected behavior. + */ +void +zthr_wakeup(zthr_t *t) +{ + mutex_enter(&t->zthr_lock); + cv_broadcast(&t->zthr_cv); + mutex_exit(&t->zthr_lock); +} + +/* + * Note: If the zthr is not running (e.g. has been cancelled + * already), this is a no-op. + */ +int +zthr_cancel(zthr_t *t) +{ + int rc = 0; + + mutex_enter(&t->zthr_lock); + + /* broadcast in case the zthr is sleeping */ + cv_broadcast(&t->zthr_cv); + + t->zthr_cancel = B_TRUE; + while (t->zthr_thread != NULL) + cv_wait(&t->zthr_cv, &t->zthr_lock); + t->zthr_cancel = B_FALSE; + rc = t->zthr_rc; + mutex_exit(&t->zthr_lock); + + return (rc); +} + +void +zthr_resume(zthr_t *t) +{ + ASSERT3P(t->zthr_thread, ==, NULL); + + mutex_enter(&t->zthr_lock); + + ASSERT3P(t->zthr_checkfunc, !=, NULL); + ASSERT3P(t->zthr_func, !=, NULL); + ASSERT(!t->zthr_cancel); + + t->zthr_thread = thread_create(NULL, 0, zthr_procedure, t, + 0, &p0, TS_RUN, minclsyspri); + + mutex_exit(&t->zthr_lock); +} + +/* + * This function is intended to be used by the zthr itself + * to check if another thread has signaled it to stop running. + * + * Returns TRUE if we are in the middle of trying to cancel + * this thread, FALSE otherwise. + */ +boolean_t +zthr_iscancelled(zthr_t *t) +{ + boolean_t cancelled; + + ASSERT3P(t->zthr_thread, ==, curthread); + + mutex_enter(&t->zthr_lock); + cancelled = t->zthr_cancel; + mutex_exit(&t->zthr_lock); + + return (cancelled); +} + +boolean_t +zthr_isrunning(zthr_t *t) +{ + boolean_t running; + + mutex_enter(&t->zthr_lock); + running = (t->zthr_thread != NULL); + mutex_exit(&t->zthr_lock); + + return (running); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c new file mode 100644 index 000000000000..3c7f669a351a --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c @@ -0,0 +1,3252 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2006-2010 Pawel Jakub Dawidek <pjd@FreeBSD.org> + * All rights reserved. + * + * Portions Copyright 2010 Robert Milkowski + * + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + */ + +/* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */ + +/* + * ZFS volume emulation driver. + * + * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes. + * Volumes are accessed through the symbolic links named: + * + * /dev/zvol/dsk/<pool_name>/<dataset_name> + * /dev/zvol/rdsk/<pool_name>/<dataset_name> + * + * These links are created by the /dev filesystem (sdev_zvolops.c). + * Volumes are persistent through reboot. No user command needs to be + * run before opening and using a device. + * + * FreeBSD notes. + * On FreeBSD ZVOLs are simply GEOM providers like any other storage device + * in the system. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/errno.h> +#include <sys/uio.h> +#include <sys/bio.h> +#include <sys/buf.h> +#include <sys/kmem.h> +#include <sys/conf.h> +#include <sys/cmn_err.h> +#include <sys/stat.h> +#include <sys/zap.h> +#include <sys/spa.h> +#include <sys/spa_impl.h> +#include <sys/zio.h> +#include <sys/disk.h> +#include <sys/dmu_traverse.h> +#include <sys/dnode.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_prop.h> +#include <sys/dkio.h> +#include <sys/byteorder.h> +#include <sys/sunddi.h> +#include <sys/dirent.h> +#include <sys/policy.h> +#include <sys/queue.h> +#include <sys/fs/zfs.h> +#include <sys/zfs_ioctl.h> +#include <sys/zil.h> +#include <sys/refcount.h> +#include <sys/zfs_znode.h> +#include <sys/zfs_rlock.h> +#include <sys/vdev_impl.h> +#include <sys/vdev_raidz.h> +#include <sys/zvol.h> +#include <sys/zil_impl.h> +#include <sys/dbuf.h> +#include <sys/dmu_tx.h> +#include <sys/zfeature.h> +#include <sys/zio_checksum.h> +#include <sys/zil_impl.h> +#include <sys/filio.h> + +#include <geom/geom.h> + +#include "zfs_namecheck.h" + +#ifndef illumos +struct g_class zfs_zvol_class = { + .name = "ZFS::ZVOL", + .version = G_VERSION, +}; + +DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol); + +#endif +void *zfsdev_state; +static char *zvol_tag = "zvol_tag"; + +#define ZVOL_DUMPSIZE "dumpsize" + +/* + * This lock protects the zfsdev_state structure from being modified + * while it's being used, e.g. an open that comes in before a create + * finishes. It also protects temporary opens of the dataset so that, + * e.g., an open doesn't get a spurious EBUSY. + */ +#ifdef illumos +kmutex_t zfsdev_state_lock; +#else +/* + * In FreeBSD we've replaced the upstream zfsdev_state_lock with the + * spa_namespace_lock in the ZVOL code. 
+ */ +#define zfsdev_state_lock spa_namespace_lock +#endif +static uint32_t zvol_minors; + +#ifndef illumos +SYSCTL_DECL(_vfs_zfs); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, vol, CTLFLAG_RW, 0, "ZFS VOLUME"); +static int volmode = ZFS_VOLMODE_GEOM; +SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, mode, CTLFLAG_RWTUN, &volmode, 0, + "Expose as GEOM providers (1), device files (2) or neither"); +static boolean_t zpool_on_zvol = B_FALSE; +SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, recursive, CTLFLAG_RWTUN, &zpool_on_zvol, 0, + "Allow zpools to use zvols as vdevs (DANGEROUS)"); + +#endif +typedef struct zvol_extent { + list_node_t ze_node; + dva_t ze_dva; /* dva associated with this extent */ + uint64_t ze_nblks; /* number of blocks in extent */ +} zvol_extent_t; + +/* + * The in-core state of each volume. + */ +typedef struct zvol_state { +#ifndef illumos + LIST_ENTRY(zvol_state) zv_links; +#endif + char zv_name[MAXPATHLEN]; /* pool/dd name */ + uint64_t zv_volsize; /* amount of space we advertise */ + uint64_t zv_volblocksize; /* volume block size */ +#ifdef illumos + minor_t zv_minor; /* minor number */ +#else + struct cdev *zv_dev; /* non-GEOM device */ + struct g_provider *zv_provider; /* GEOM provider */ +#endif + uint8_t zv_min_bs; /* minimum addressable block shift */ + uint8_t zv_flags; /* readonly, dumpified, etc. */ + objset_t *zv_objset; /* objset handle */ +#ifdef illumos + uint32_t zv_open_count[OTYPCNT]; /* open counts */ +#endif + uint32_t zv_total_opens; /* total open count */ + uint32_t zv_sync_cnt; /* synchronous open count */ + zilog_t *zv_zilog; /* ZIL handle */ + list_t zv_extents; /* List of extents for dump */ + znode_t zv_znode; /* for range locking */ + dnode_t *zv_dn; /* dnode hold */ +#ifndef illumos + int zv_state; + int zv_volmode; /* Provide GEOM or cdev */ + struct bio_queue_head zv_queue; + struct mtx zv_queue_mtx; /* zv_queue mutex */ +#endif +} zvol_state_t; + +#ifndef illumos +static LIST_HEAD(, zvol_state) all_zvols; +#endif +/* + * zvol specific flags + */ +#define ZVOL_RDONLY 0x1 +#define ZVOL_DUMPIFIED 0x2 +#define ZVOL_EXCL 0x4 +#define ZVOL_WCE 0x8 + +/* + * zvol maximum transfer in one DMU tx. + */ +int zvol_maxphys = DMU_MAX_ACCESS/2; + +/* + * Toggle unmap functionality. + */ +boolean_t zvol_unmap_enabled = B_TRUE; + +/* + * If true, unmaps requested as synchronous are executed synchronously, + * otherwise all unmaps are asynchronous. 
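+ * + * On FreeBSD both knobs are exposed as sysctls (see the SYSCTL_INT + * declarations below); e.g. (illustrative): + * + * sysctl vfs.zfs.vol.unmap_enabled=1 + * sysctl vfs.zfs.vol.unmap_sync_enabled=0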
+ */ +boolean_t zvol_unmap_sync_enabled = B_FALSE; + +#ifndef illumos +SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_enabled, CTLFLAG_RWTUN, + &zvol_unmap_enabled, 0, + "Enable UNMAP functionality"); + +SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_sync_enabled, CTLFLAG_RWTUN, + &zvol_unmap_sync_enabled, 0, + "UNMAPs requested as sync are executed synchronously"); + +static d_open_t zvol_d_open; +static d_close_t zvol_d_close; +static d_read_t zvol_read; +static d_write_t zvol_write; +static d_ioctl_t zvol_d_ioctl; +static d_strategy_t zvol_strategy; + +static struct cdevsw zvol_cdevsw = { + .d_version = D_VERSION, + .d_open = zvol_d_open, + .d_close = zvol_d_close, + .d_read = zvol_read, + .d_write = zvol_write, + .d_ioctl = zvol_d_ioctl, + .d_strategy = zvol_strategy, + .d_name = "zvol", + .d_flags = D_DISK | D_TRACKCLOSE, +}; + +static void zvol_geom_run(zvol_state_t *zv); +static void zvol_geom_destroy(zvol_state_t *zv); +static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace); +static void zvol_geom_start(struct bio *bp); +static void zvol_geom_worker(void *arg); +static void zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, + uint64_t len, boolean_t sync); +#endif /* !illumos */ + +extern int zfs_set_prop_nvlist(const char *, zprop_source_t, + nvlist_t *, nvlist_t *); +static int zvol_remove_zv(zvol_state_t *); +static int zvol_get_data(void *arg, lr_write_t *lr, char *buf, + struct lwb *lwb, zio_t *zio); +static int zvol_dumpify(zvol_state_t *zv); +static int zvol_dump_fini(zvol_state_t *zv); +static int zvol_dump_init(zvol_state_t *zv, boolean_t resize); + +static void +zvol_size_changed(zvol_state_t *zv, uint64_t volsize) +{ +#ifdef illumos + dev_t dev = makedevice(ddi_driver_major(zfs_dip), zv->zv_minor); + + zv->zv_volsize = volsize; + VERIFY(ddi_prop_update_int64(dev, zfs_dip, + "Size", volsize) == DDI_SUCCESS); + VERIFY(ddi_prop_update_int64(dev, zfs_dip, + "Nblocks", lbtodb(volsize)) == DDI_SUCCESS); + + /* Notify specfs to invalidate the cached size */ + spec_size_invalidate(dev, VBLK); + spec_size_invalidate(dev, VCHR); +#else /* !illumos */ + zv->zv_volsize = volsize; + if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { + struct g_provider *pp; + + pp = zv->zv_provider; + if (pp == NULL) + return; + g_topology_lock(); + + /* + * Do not invoke resize event when initial size was zero. + * ZVOL initializes the size on first open, this is not + * real resizing. 
+ */ + if (pp->mediasize == 0) + pp->mediasize = zv->zv_volsize; + else + g_resize_provider(pp, zv->zv_volsize); + g_topology_unlock(); + } +#endif /* illumos */ +} + +int +zvol_check_volsize(uint64_t volsize, uint64_t blocksize) +{ + if (volsize == 0) + return (SET_ERROR(EINVAL)); + + if (volsize % blocksize != 0) + return (SET_ERROR(EINVAL)); + +#ifdef _ILP32 + if (volsize - 1 > SPEC_MAXOFFSET_T) + return (SET_ERROR(EOVERFLOW)); +#endif + return (0); +} + +int +zvol_check_volblocksize(uint64_t volblocksize) +{ + if (volblocksize < SPA_MINBLOCKSIZE || + volblocksize > SPA_OLD_MAXBLOCKSIZE || + !ISP2(volblocksize)) + return (SET_ERROR(EDOM)); + + return (0); +} + +int +zvol_get_stats(objset_t *os, nvlist_t *nv) +{ + int error; + dmu_object_info_t doi; + uint64_t val; + + error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val); + if (error) + return (error); + + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLSIZE, val); + + error = dmu_object_info(os, ZVOL_OBJ, &doi); + + if (error == 0) { + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLBLOCKSIZE, + doi.doi_data_block_size); + } + + return (error); +} + +static zvol_state_t * +zvol_minor_lookup(const char *name) +{ +#ifdef illumos + minor_t minor; +#endif + zvol_state_t *zv; + + ASSERT(MUTEX_HELD(&zfsdev_state_lock)); + +#ifdef illumos + for (minor = 1; minor <= ZFSDEV_MAX_MINOR; minor++) { + zv = zfsdev_get_soft_state(minor, ZSST_ZVOL); + if (zv == NULL) + continue; +#else + LIST_FOREACH(zv, &all_zvols, zv_links) { +#endif + if (strcmp(zv->zv_name, name) == 0) + return (zv); + } + + return (NULL); +} + +/* extent mapping arg */ +struct maparg { + zvol_state_t *ma_zv; + uint64_t ma_blks; +}; + +/*ARGSUSED*/ +static int +zvol_map_block(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, + const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) +{ + struct maparg *ma = arg; + zvol_extent_t *ze; + int bs = ma->ma_zv->zv_volblocksize; + + if (bp == NULL || BP_IS_HOLE(bp) || + zb->zb_object != ZVOL_OBJ || zb->zb_level != 0) + return (0); + + VERIFY(!BP_IS_EMBEDDED(bp)); + + VERIFY3U(ma->ma_blks, ==, zb->zb_blkid); + ma->ma_blks++; + + /* Abort immediately if we have encountered gang blocks */ + if (BP_IS_GANG(bp)) + return (SET_ERROR(EFRAGS)); + + /* + * See if the block is at the end of the previous extent. + */ + ze = list_tail(&ma->ma_zv->zv_extents); + if (ze && + DVA_GET_VDEV(BP_IDENTITY(bp)) == DVA_GET_VDEV(&ze->ze_dva) && + DVA_GET_OFFSET(BP_IDENTITY(bp)) == + DVA_GET_OFFSET(&ze->ze_dva) + ze->ze_nblks * bs) { + ze->ze_nblks++; + return (0); + } + + dprintf_bp(bp, "%s", "next blkptr:"); + + /* start a new extent */ + ze = kmem_zalloc(sizeof (zvol_extent_t), KM_SLEEP); + ze->ze_dva = bp->blk_dva[0]; /* structure assignment */ + ze->ze_nblks = 1; + list_insert_tail(&ma->ma_zv->zv_extents, ze); + return (0); +} + +static void +zvol_free_extents(zvol_state_t *zv) +{ + zvol_extent_t *ze; + + while (ze = list_head(&zv->zv_extents)) { + list_remove(&zv->zv_extents, ze); + kmem_free(ze, sizeof (zvol_extent_t)); + } +} + +static int +zvol_get_lbas(zvol_state_t *zv) +{ + objset_t *os = zv->zv_objset; + struct maparg ma; + int err; + + ma.ma_zv = zv; + ma.ma_blks = 0; + zvol_free_extents(zv); + + /* commit any in-flight changes before traversing the dataset */ + txg_wait_synced(dmu_objset_pool(os), 0); + err = traverse_dataset(dmu_objset_ds(os), 0, + TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, zvol_map_block, &ma); + if (err || ma.ma_blks != (zv->zv_volsize / zv->zv_volblocksize)) { + zvol_free_extents(zv); + return (err ? 
err : EIO); + } + + return (0); +} + +/* ARGSUSED */ +void +zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) +{ + zfs_creat_t *zct = arg; + nvlist_t *nvprops = zct->zct_props; + int error; + uint64_t volblocksize, volsize; + + VERIFY(nvlist_lookup_uint64(nvprops, + zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) == 0); + if (nvlist_lookup_uint64(nvprops, + zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0) + volblocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE); + + /* + * These properties must be removed from the list so the generic + * property setting step won't apply to them. + */ + VERIFY(nvlist_remove_all(nvprops, + zfs_prop_to_name(ZFS_PROP_VOLSIZE)) == 0); + (void) nvlist_remove_all(nvprops, + zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE)); + + error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize, + DMU_OT_NONE, 0, tx); + ASSERT(error == 0); + + error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP, + DMU_OT_NONE, 0, tx); + ASSERT(error == 0); + + error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx); + ASSERT(error == 0); +} + +/* + * Replay a TX_TRUNCATE ZIL transaction if asked. TX_TRUNCATE is how we + * implement DKIOCFREE/free-long-range. + */ +static int +zvol_replay_truncate(void *arg1, void *arg2, boolean_t byteswap) +{ + zvol_state_t *zv = arg1; + lr_truncate_t *lr = arg2; + uint64_t offset, length; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + offset = lr->lr_offset; + length = lr->lr_length; + + return (dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, offset, length)); +} + +/* + * Replay a TX_WRITE ZIL transaction that didn't get committed + * after a system failure + */ +static int +zvol_replay_write(void *arg1, void *arg2, boolean_t byteswap) +{ + zvol_state_t *zv = arg1; + lr_write_t *lr = arg2; + objset_t *os = zv->zv_objset; + char *data = (char *)(lr + 1); /* data follows lr_write_t */ + uint64_t offset, length; + dmu_tx_t *tx; + int error; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + offset = lr->lr_offset; + length = lr->lr_length; + + /* If it's a dmu_sync() block, write the whole block */ + if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { + uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr); + if (length < blocksize) { + offset -= offset % blocksize; + length = blocksize; + } + } + + tx = dmu_tx_create(os); + dmu_tx_hold_write(tx, ZVOL_OBJ, offset, length); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + } else { + dmu_write(os, ZVOL_OBJ, offset, length, data, tx); + dmu_tx_commit(tx); + } + + return (error); +} + +/* ARGSUSED */ +static int +zvol_replay_err(void *arg1, void *arg2, boolean_t byteswap) +{ + return (SET_ERROR(ENOTSUP)); +} + +/* + * Callback vectors for replaying records. + * Only TX_WRITE and TX_TRUNCATE are needed for zvol. 
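+ * + * In sketch form, ZIL replay walks the log and dispatches each record + * through this table by transaction type, roughly: + * + * error = zvol_replay_vector[lrc_txtype](zv, lr, byteswap); + * + * so every slot other than TX_WRITE and TX_TRUNCATE points at + * zvol_replay_err(), which rejects the record with ENOTSUP.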
+ */ +zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = { + zvol_replay_err, /* 0 no such transaction type */ + zvol_replay_err, /* TX_CREATE */ + zvol_replay_err, /* TX_MKDIR */ + zvol_replay_err, /* TX_MKXATTR */ + zvol_replay_err, /* TX_SYMLINK */ + zvol_replay_err, /* TX_REMOVE */ + zvol_replay_err, /* TX_RMDIR */ + zvol_replay_err, /* TX_LINK */ + zvol_replay_err, /* TX_RENAME */ + zvol_replay_write, /* TX_WRITE */ + zvol_replay_truncate, /* TX_TRUNCATE */ + zvol_replay_err, /* TX_SETATTR */ + zvol_replay_err, /* TX_ACL */ + zvol_replay_err, /* TX_CREATE_ACL */ + zvol_replay_err, /* TX_CREATE_ATTR */ + zvol_replay_err, /* TX_CREATE_ACL_ATTR */ + zvol_replay_err, /* TX_MKDIR_ACL */ + zvol_replay_err, /* TX_MKDIR_ATTR */ + zvol_replay_err, /* TX_MKDIR_ACL_ATTR */ + zvol_replay_err, /* TX_WRITE2 */ +}; + +#ifdef illumos +int +zvol_name2minor(const char *name, minor_t *minor) +{ + zvol_state_t *zv; + + mutex_enter(&zfsdev_state_lock); + zv = zvol_minor_lookup(name); + if (minor && zv) + *minor = zv->zv_minor; + mutex_exit(&zfsdev_state_lock); + return (zv ? 0 : -1); +} +#endif /* illumos */ + +/* + * Create a minor node (plus a whole lot more) for the specified volume. + */ +int +zvol_create_minor(const char *name) +{ + zfs_soft_state_t *zs; + zvol_state_t *zv; + objset_t *os; +#ifdef illumos + dmu_object_info_t doi; + minor_t minor = 0; + char chrbuf[30], blkbuf[30]; +#else + struct g_provider *pp; + struct g_geom *gp; + uint64_t mode; +#endif + int error; + +#ifndef illumos + ZFS_LOG(1, "Creating ZVOL %s...", name); +#endif + + mutex_enter(&zfsdev_state_lock); + + if (zvol_minor_lookup(name) != NULL) { + mutex_exit(&zfsdev_state_lock); + return (SET_ERROR(EEXIST)); + } + + /* lie and say we're read-only */ + error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, FTAG, &os); + + if (error) { + mutex_exit(&zfsdev_state_lock); + return (error); + } + +#ifdef illumos + if ((minor = zfsdev_minor_alloc()) == 0) { + dmu_objset_disown(os, FTAG); + mutex_exit(&zfsdev_state_lock); + return (SET_ERROR(ENXIO)); + } + + if (ddi_soft_state_zalloc(zfsdev_state, minor) != DDI_SUCCESS) { + dmu_objset_disown(os, FTAG); + mutex_exit(&zfsdev_state_lock); + return (SET_ERROR(EAGAIN)); + } + (void) ddi_prop_update_string(minor, zfs_dip, ZVOL_PROP_NAME, + (char *)name); + + (void) snprintf(chrbuf, sizeof (chrbuf), "%u,raw", minor); + + if (ddi_create_minor_node(zfs_dip, chrbuf, S_IFCHR, + minor, DDI_PSEUDO, 0) == DDI_FAILURE) { + ddi_soft_state_free(zfsdev_state, minor); + dmu_objset_disown(os, FTAG); + mutex_exit(&zfsdev_state_lock); + return (SET_ERROR(EAGAIN)); + } + + (void) snprintf(blkbuf, sizeof (blkbuf), "%u", minor); + + if (ddi_create_minor_node(zfs_dip, blkbuf, S_IFBLK, + minor, DDI_PSEUDO, 0) == DDI_FAILURE) { + ddi_remove_minor_node(zfs_dip, chrbuf); + ddi_soft_state_free(zfsdev_state, minor); + dmu_objset_disown(os, FTAG); + mutex_exit(&zfsdev_state_lock); + return (SET_ERROR(EAGAIN)); + } + + zs = ddi_get_soft_state(zfsdev_state, minor); + zs->zss_type = ZSST_ZVOL; + zv = zs->zss_data = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP); +#else /* !illumos */ + + zv = kmem_zalloc(sizeof(*zv), KM_SLEEP); + zv->zv_state = 0; + error = dsl_prop_get_integer(name, + zfs_prop_to_name(ZFS_PROP_VOLMODE), &mode, NULL); + if (error != 0 || mode == ZFS_VOLMODE_DEFAULT) + mode = volmode; + + DROP_GIANT(); + zv->zv_volmode = mode; + if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { + g_topology_lock(); + gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name); + gp->start = zvol_geom_start; + gp->access = 
zvol_geom_access; + pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name); + pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND; + pp->sectorsize = DEV_BSIZE; + pp->mediasize = 0; + pp->private = zv; + + zv->zv_provider = pp; + bioq_init(&zv->zv_queue); + mtx_init(&zv->zv_queue_mtx, "zvol", NULL, MTX_DEF); + } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) { + struct make_dev_args args; + + make_dev_args_init(&args); + args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK; + args.mda_devsw = &zvol_cdevsw; + args.mda_cr = NULL; + args.mda_uid = UID_ROOT; + args.mda_gid = GID_OPERATOR; + args.mda_mode = 0640; + args.mda_si_drv2 = zv; + error = make_dev_s(&args, &zv->zv_dev, + "%s/%s", ZVOL_DRIVER, name); + if (error != 0) { + kmem_free(zv, sizeof(*zv)); + dmu_objset_disown(os, FTAG); + mutex_exit(&zfsdev_state_lock); + return (error); + } + zv->zv_dev->si_iosize_max = MAXPHYS; + } + LIST_INSERT_HEAD(&all_zvols, zv, zv_links); +#endif /* illumos */ + + (void) strlcpy(zv->zv_name, name, MAXPATHLEN); + zv->zv_min_bs = DEV_BSHIFT; +#ifdef illumos + zv->zv_minor = minor; +#endif + zv->zv_objset = os; + if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os))) + zv->zv_flags |= ZVOL_RDONLY; + mutex_init(&zv->zv_znode.z_range_lock, NULL, MUTEX_DEFAULT, NULL); + avl_create(&zv->zv_znode.z_range_avl, zfs_range_compare, + sizeof (rl_t), offsetof(rl_t, r_node)); + list_create(&zv->zv_extents, sizeof (zvol_extent_t), + offsetof(zvol_extent_t, ze_node)); +#ifdef illumos + /* get and cache the blocksize */ + error = dmu_object_info(os, ZVOL_OBJ, &doi); + ASSERT(error == 0); + zv->zv_volblocksize = doi.doi_data_block_size; +#endif + + if (spa_writeable(dmu_objset_spa(os))) { + if (zil_replay_disable) + zil_destroy(dmu_objset_zil(os), B_FALSE); + else + zil_replay(os, zv, zvol_replay_vector); + } + dmu_objset_disown(os, FTAG); + zv->zv_objset = NULL; + + zvol_minors++; + + mutex_exit(&zfsdev_state_lock); +#ifndef illumos + if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { + zvol_geom_run(zv); + g_topology_unlock(); + } + PICKUP_GIANT(); + + ZFS_LOG(1, "ZVOL %s created.", name); +#endif + + return (0); +} + +/* + * Remove minor node for the specified volume. 
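+ * + * The caller must hold zfsdev_state_lock (asserted below), and removal + * fails with EBUSY while the volume is still open.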
+ */ +static int +zvol_remove_zv(zvol_state_t *zv) +{ +#ifdef illumos + char nmbuf[20]; + minor_t minor = zv->zv_minor; +#endif + + ASSERT(MUTEX_HELD(&zfsdev_state_lock)); + if (zv->zv_total_opens != 0) + return (SET_ERROR(EBUSY)); + +#ifdef illumos + (void) snprintf(nmbuf, sizeof (nmbuf), "%u,raw", minor); + ddi_remove_minor_node(zfs_dip, nmbuf); + + (void) snprintf(nmbuf, sizeof (nmbuf), "%u", minor); + ddi_remove_minor_node(zfs_dip, nmbuf); +#else + ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name); + + LIST_REMOVE(zv, zv_links); + if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { + g_topology_lock(); + zvol_geom_destroy(zv); + g_topology_unlock(); + } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) { + if (zv->zv_dev != NULL) + destroy_dev(zv->zv_dev); + } +#endif + + avl_destroy(&zv->zv_znode.z_range_avl); + mutex_destroy(&zv->zv_znode.z_range_lock); + + kmem_free(zv, sizeof (zvol_state_t)); +#ifdef illumos + ddi_soft_state_free(zfsdev_state, minor); +#endif + zvol_minors--; + return (0); +} + +int +zvol_remove_minor(const char *name) +{ + zvol_state_t *zv; + int rc; + + mutex_enter(&zfsdev_state_lock); + if ((zv = zvol_minor_lookup(name)) == NULL) { + mutex_exit(&zfsdev_state_lock); + return (SET_ERROR(ENXIO)); + } + rc = zvol_remove_zv(zv); + mutex_exit(&zfsdev_state_lock); + return (rc); +} + +int +zvol_first_open(zvol_state_t *zv) +{ + dmu_object_info_t doi; + objset_t *os; + uint64_t volsize; + int error; + uint64_t readonly; + + /* lie and say we're read-only */ + error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, B_TRUE, + zvol_tag, &os); + if (error) + return (error); + + zv->zv_objset = os; + error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); + if (error) { + ASSERT(error == 0); + dmu_objset_disown(os, zvol_tag); + return (error); + } + + /* get and cache the blocksize */ + error = dmu_object_info(os, ZVOL_OBJ, &doi); + if (error) { + ASSERT(error == 0); + dmu_objset_disown(os, zvol_tag); + return (error); + } + zv->zv_volblocksize = doi.doi_data_block_size; + + error = dnode_hold(os, ZVOL_OBJ, zvol_tag, &zv->zv_dn); + if (error) { + dmu_objset_disown(os, zvol_tag); + return (error); + } + + zvol_size_changed(zv, volsize); + zv->zv_zilog = zil_open(os, zvol_get_data); + + VERIFY(dsl_prop_get_integer(zv->zv_name, "readonly", &readonly, + NULL) == 0); + if (readonly || dmu_objset_is_snapshot(os) || + !spa_writeable(dmu_objset_spa(os))) + zv->zv_flags |= ZVOL_RDONLY; + else + zv->zv_flags &= ~ZVOL_RDONLY; + return (error); +} + +void +zvol_last_close(zvol_state_t *zv) +{ + zil_close(zv->zv_zilog); + zv->zv_zilog = NULL; + + dnode_rele(zv->zv_dn, zvol_tag); + zv->zv_dn = NULL; + + /* + * Evict cached data + */ + if (dsl_dataset_is_dirty(dmu_objset_ds(zv->zv_objset)) && + !(zv->zv_flags & ZVOL_RDONLY)) + txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0); + dmu_objset_evict_dbufs(zv->zv_objset); + + dmu_objset_disown(zv->zv_objset, zvol_tag); + zv->zv_objset = NULL; +} + +#ifdef illumos +int +zvol_prealloc(zvol_state_t *zv) +{ + objset_t *os = zv->zv_objset; + dmu_tx_t *tx; + uint64_t refd, avail, usedobjs, availobjs; + uint64_t resid = zv->zv_volsize; + uint64_t off = 0; + + /* Check the space usage before attempting to allocate the space */ + dmu_objset_space(os, &refd, &avail, &usedobjs, &availobjs); + if (avail < zv->zv_volsize) + return (SET_ERROR(ENOSPC)); + + /* Free old extents if they exist */ + zvol_free_extents(zv); + + while (resid != 0) { + int error; + uint64_t bytes = MIN(resid, SPA_OLD_MAXBLOCKSIZE); + + tx = dmu_tx_create(os); + dmu_tx_hold_write(tx, ZVOL_OBJ, off, 
bytes); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + (void) dmu_free_long_range(os, ZVOL_OBJ, 0, off); + return (error); + } + dmu_prealloc(os, ZVOL_OBJ, off, bytes, tx); + dmu_tx_commit(tx); + off += bytes; + resid -= bytes; + } + txg_wait_synced(dmu_objset_pool(os), 0); + + return (0); +} +#endif /* illumos */ + +static int +zvol_update_volsize(objset_t *os, uint64_t volsize) +{ + dmu_tx_t *tx; + int error; + + ASSERT(MUTEX_HELD(&zfsdev_state_lock)); + + tx = dmu_tx_create(os); + dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL); + dmu_tx_mark_netfree(tx); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + return (error); + } + + error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, + &volsize, tx); + dmu_tx_commit(tx); + + if (error == 0) + error = dmu_free_long_range(os, + ZVOL_OBJ, volsize, DMU_OBJECT_END); + return (error); +} + +void +zvol_remove_minors(const char *name) +{ +#ifdef illumos + zvol_state_t *zv; + char *namebuf; + minor_t minor; + + namebuf = kmem_zalloc(strlen(name) + 2, KM_SLEEP); + (void) strncpy(namebuf, name, strlen(name)); + (void) strcat(namebuf, "/"); + mutex_enter(&zfsdev_state_lock); + for (minor = 1; minor <= ZFSDEV_MAX_MINOR; minor++) { + + zv = zfsdev_get_soft_state(minor, ZSST_ZVOL); + if (zv == NULL) + continue; + if (strncmp(namebuf, zv->zv_name, strlen(namebuf)) == 0) + (void) zvol_remove_zv(zv); + } + kmem_free(namebuf, strlen(name) + 2); + + mutex_exit(&zfsdev_state_lock); +#else /* !illumos */ + zvol_state_t *zv, *tzv; + size_t namelen; + + namelen = strlen(name); + + DROP_GIANT(); + mutex_enter(&zfsdev_state_lock); + + LIST_FOREACH_SAFE(zv, &all_zvols, zv_links, tzv) { + if (strcmp(zv->zv_name, name) == 0 || + (strncmp(zv->zv_name, name, namelen) == 0 && + strlen(zv->zv_name) > namelen && (zv->zv_name[namelen] == '/' || + zv->zv_name[namelen] == '@'))) { + (void) zvol_remove_zv(zv); + } + } + + mutex_exit(&zfsdev_state_lock); + PICKUP_GIANT(); +#endif /* illumos */ +} + +static int +zvol_update_live_volsize(zvol_state_t *zv, uint64_t volsize) +{ + uint64_t old_volsize = 0ULL; + int error = 0; + + ASSERT(MUTEX_HELD(&zfsdev_state_lock)); + + /* + * Reinitialize the dump area to the new size. If we + * failed to resize the dump area then restore it back to + * its original size. We must set the new volsize prior + * to calling dumpvp_resize() to ensure that the devices' + * size(9P) is not visible by the dump subsystem. + */ + old_volsize = zv->zv_volsize; + zvol_size_changed(zv, volsize); + +#ifdef ZVOL_DUMP + if (zv->zv_flags & ZVOL_DUMPIFIED) { + if ((error = zvol_dumpify(zv)) != 0 || + (error = dumpvp_resize()) != 0) { + int dumpify_error; + + (void) zvol_update_volsize(zv->zv_objset, old_volsize); + zvol_size_changed(zv, old_volsize); + dumpify_error = zvol_dumpify(zv); + error = dumpify_error ? dumpify_error : error; + } + } +#endif /* ZVOL_DUMP */ + +#ifdef illumos + /* + * Generate a LUN expansion event. 
+ */ + if (error == 0) { + sysevent_id_t eid; + nvlist_t *attr; + char *physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP); + + (void) snprintf(physpath, MAXPATHLEN, "%s%u", ZVOL_PSEUDO_DEV, + zv->zv_minor); + + VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0); + + (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS, + ESC_DEV_DLE, attr, &eid, DDI_SLEEP); + + nvlist_free(attr); + kmem_free(physpath, MAXPATHLEN); + } +#endif /* illumos */ + return (error); +} + +int +zvol_set_volsize(const char *name, uint64_t volsize) +{ + zvol_state_t *zv = NULL; + objset_t *os; + int error; + dmu_object_info_t doi; + uint64_t readonly; + boolean_t owned = B_FALSE; + + error = dsl_prop_get_integer(name, + zfs_prop_to_name(ZFS_PROP_READONLY), &readonly, NULL); + if (error != 0) + return (error); + if (readonly) + return (SET_ERROR(EROFS)); + + mutex_enter(&zfsdev_state_lock); + zv = zvol_minor_lookup(name); + + if (zv == NULL || zv->zv_objset == NULL) { + if ((error = dmu_objset_own(name, DMU_OST_ZVOL, B_FALSE, + FTAG, &os)) != 0) { + mutex_exit(&zfsdev_state_lock); + return (error); + } + owned = B_TRUE; + if (zv != NULL) + zv->zv_objset = os; + } else { + os = zv->zv_objset; + } + + if ((error = dmu_object_info(os, ZVOL_OBJ, &doi)) != 0 || + (error = zvol_check_volsize(volsize, doi.doi_data_block_size)) != 0) + goto out; + + error = zvol_update_volsize(os, volsize); + + if (error == 0 && zv != NULL) + error = zvol_update_live_volsize(zv, volsize); +out: + if (owned) { + dmu_objset_disown(os, FTAG); + if (zv != NULL) + zv->zv_objset = NULL; + } + mutex_exit(&zfsdev_state_lock); + return (error); +} + +/*ARGSUSED*/ +#ifdef illumos +int +zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr) +#else +static int +zvol_open(struct g_provider *pp, int flag, int count) +#endif +{ + zvol_state_t *zv; + int err = 0; +#ifdef illumos + + mutex_enter(&zfsdev_state_lock); + + zv = zfsdev_get_soft_state(getminor(*devp), ZSST_ZVOL); + if (zv == NULL) { + mutex_exit(&zfsdev_state_lock); + return (SET_ERROR(ENXIO)); + } + + if (zv->zv_total_opens == 0) + err = zvol_first_open(zv); + if (err) { + mutex_exit(&zfsdev_state_lock); + return (err); + } +#else /* !illumos */ + boolean_t locked = B_FALSE; + + if (!zpool_on_zvol && tsd_get(zfs_geom_probe_vdev_key) != NULL) { + /* + * if zfs_geom_probe_vdev_key is set, that means that zfs is + * attempting to probe geom providers while looking for a + * replacement for a missing VDEV. In this case, the + * spa_namespace_lock will not be held, but it is still illegal + * to use a zvol as a vdev. Deadlocks can result if another + * thread has spa_namespace_lock + */ + return (EOPNOTSUPP); + } + /* + * Protect against recursively entering spa_namespace_lock + * when spa_open() is used for a pool on a (local) ZVOL(s). + * This is needed since we replaced upstream zfsdev_state_lock + * with spa_namespace_lock in the ZVOL code. + * We are using the same trick as spa_open(). + * Note that calls in zvol_first_open which need to resolve + * pool name to a spa object will enter spa_open() + * recursively, but that function already has all the + * necessary protection. 
+ */ + if (!MUTEX_HELD(&zfsdev_state_lock)) { + mutex_enter(&zfsdev_state_lock); + locked = B_TRUE; + } + + zv = pp->private; + if (zv == NULL) { + if (locked) + mutex_exit(&zfsdev_state_lock); + return (SET_ERROR(ENXIO)); + } + + if (zv->zv_total_opens == 0) { + err = zvol_first_open(zv); + if (err) { + if (locked) + mutex_exit(&zfsdev_state_lock); + return (err); + } + pp->mediasize = zv->zv_volsize; + pp->stripeoffset = 0; + pp->stripesize = zv->zv_volblocksize; + } +#endif /* illumos */ + if ((flag & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) { + err = SET_ERROR(EROFS); + goto out; + } + if (zv->zv_flags & ZVOL_EXCL) { + err = SET_ERROR(EBUSY); + goto out; + } +#ifdef FEXCL + if (flag & FEXCL) { + if (zv->zv_total_opens != 0) { + err = SET_ERROR(EBUSY); + goto out; + } + zv->zv_flags |= ZVOL_EXCL; + } +#endif + +#ifdef illumos + if (zv->zv_open_count[otyp] == 0 || otyp == OTYP_LYR) { + zv->zv_open_count[otyp]++; + zv->zv_total_opens++; + } + mutex_exit(&zfsdev_state_lock); +#else + zv->zv_total_opens += count; + if (locked) + mutex_exit(&zfsdev_state_lock); +#endif + + return (err); +out: + if (zv->zv_total_opens == 0) + zvol_last_close(zv); +#ifdef illumos + mutex_exit(&zfsdev_state_lock); +#else + if (locked) + mutex_exit(&zfsdev_state_lock); +#endif + return (err); +} + +/*ARGSUSED*/ +#ifdef illumos +int +zvol_close(dev_t dev, int flag, int otyp, cred_t *cr) +{ + minor_t minor = getminor(dev); + zvol_state_t *zv; + int error = 0; + + mutex_enter(&zfsdev_state_lock); + + zv = zfsdev_get_soft_state(minor, ZSST_ZVOL); + if (zv == NULL) { + mutex_exit(&zfsdev_state_lock); +#else /* !illumos */ +static int +zvol_close(struct g_provider *pp, int flag, int count) +{ + zvol_state_t *zv; + int error = 0; + boolean_t locked = B_FALSE; + + /* See comment in zvol_open(). */ + if (!MUTEX_HELD(&zfsdev_state_lock)) { + mutex_enter(&zfsdev_state_lock); + locked = B_TRUE; + } + + zv = pp->private; + if (zv == NULL) { + if (locked) + mutex_exit(&zfsdev_state_lock); +#endif /* illumos */ + return (SET_ERROR(ENXIO)); + } + + if (zv->zv_flags & ZVOL_EXCL) { + ASSERT(zv->zv_total_opens == 1); + zv->zv_flags &= ~ZVOL_EXCL; + } + + /* + * If the open count is zero, this is a spurious close. + * That indicates a bug in the kernel / DDI framework. + */ +#ifdef illumos + ASSERT(zv->zv_open_count[otyp] != 0); +#endif + ASSERT(zv->zv_total_opens != 0); + + /* + * You may get multiple opens, but only one close. + */ +#ifdef illumos + zv->zv_open_count[otyp]--; + zv->zv_total_opens--; +#else + zv->zv_total_opens -= count; +#endif + + if (zv->zv_total_opens == 0) + zvol_last_close(zv); + +#ifdef illumos + mutex_exit(&zfsdev_state_lock); +#else + if (locked) + mutex_exit(&zfsdev_state_lock); +#endif + return (error); +} + +static void +zvol_get_done(zgd_t *zgd, int error) +{ + if (zgd->zgd_db) + dmu_buf_rele(zgd->zgd_db, zgd); + + zfs_range_unlock(zgd->zgd_rl); + + if (error == 0 && zgd->zgd_bp) + zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp); + + kmem_free(zgd, sizeof (zgd_t)); +} + +/* + * Get data to generate a TX_WRITE intent log record. 
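+ * + * The ZIL invokes this callback (registered via zil_open(os, + * zvol_get_data) in zvol_first_open()) at commit time, in the two + * flavors the code below distinguishes: with a buffer (buf != NULL) + * to copy out the data of a WR_NEED_COPY record, or without one to + * dmu_sync() the block of a WR_INDIRECT record. WR_COPIED records + * never get here; their data was copied when the itx was created. + * In sketch form, zvol_log_write() further below picks the write + * state roughly as follows: + * + * logbias=throughput -> WR_INDIRECT + * no slog && resid >= volblocksize && + * volblocksize > zvol_immediate_write_sz -> WR_INDIRECT + * synchronous write -> WR_COPIED + * otherwise -> WR_NEED_COPY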
+ */ +static int +zvol_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) +{ + zvol_state_t *zv = arg; + uint64_t offset = lr->lr_offset; + uint64_t size = lr->lr_length; /* length of user data */ + dmu_buf_t *db; + zgd_t *zgd; + int error; + + ASSERT3P(lwb, !=, NULL); + ASSERT3P(zio, !=, NULL); + ASSERT3U(size, !=, 0); + + zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP); + zgd->zgd_lwb = lwb; + + /* + * Write records come in two flavors: immediate and indirect. + * For small writes it's cheaper to store the data with the + * log record (immediate); for large writes it's cheaper to + * sync the data and get a pointer to it (indirect) so that + * we don't have to write the data twice. + */ + if (buf != NULL) { /* immediate write */ + zgd->zgd_rl = zfs_range_lock(&zv->zv_znode, offset, size, + RL_READER); + error = dmu_read_by_dnode(zv->zv_dn, offset, size, buf, + DMU_READ_NO_PREFETCH); + } else { /* indirect write */ + /* + * Have to lock the whole block to ensure when it's written out + * and its checksum is being calculated that no one can change + * the data. Contrarily to zfs_get_data we need not re-check + * blocksize after we get the lock because it cannot be changed. + */ + size = zv->zv_volblocksize; + offset = P2ALIGN(offset, size); + zgd->zgd_rl = zfs_range_lock(&zv->zv_znode, offset, size, + RL_READER); + error = dmu_buf_hold_by_dnode(zv->zv_dn, offset, zgd, &db, + DMU_READ_NO_PREFETCH); + if (error == 0) { + blkptr_t *bp = &lr->lr_blkptr; + + zgd->zgd_db = db; + zgd->zgd_bp = bp; + + ASSERT(db->db_offset == offset); + ASSERT(db->db_size == size); + + error = dmu_sync(zio, lr->lr_common.lrc_txg, + zvol_get_done, zgd); + + if (error == 0) + return (0); + } + } + + zvol_get_done(zgd, error); + + return (error); +} + +/* + * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions. + * + * We store data in the log buffers if it's small enough. + * Otherwise we will later flush the data out via dmu_sync(). + */ +ssize_t zvol_immediate_write_sz = 32768; +#ifdef _KERNEL +SYSCTL_LONG(_vfs_zfs_vol, OID_AUTO, immediate_write_sz, CTLFLAG_RWTUN, + &zvol_immediate_write_sz, 0, "Minimal size for indirect log write"); +#endif + +static void +zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t resid, + boolean_t sync) +{ + uint32_t blocksize = zv->zv_volblocksize; + zilog_t *zilog = zv->zv_zilog; + itx_wr_state_t write_state; + + if (zil_replaying(zilog, tx)) + return; + + if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT) + write_state = WR_INDIRECT; + else if (!spa_has_slogs(zilog->zl_spa) && + resid >= blocksize && blocksize > zvol_immediate_write_sz) + write_state = WR_INDIRECT; + else if (sync) + write_state = WR_COPIED; + else + write_state = WR_NEED_COPY; + + while (resid) { + itx_t *itx; + lr_write_t *lr; + itx_wr_state_t wr_state = write_state; + ssize_t len = resid; + + if (wr_state == WR_COPIED && resid > ZIL_MAX_COPIED_DATA) + wr_state = WR_NEED_COPY; + else if (wr_state == WR_INDIRECT) + len = MIN(blocksize - P2PHASE(off, blocksize), resid); + + itx = zil_itx_create(TX_WRITE, sizeof (*lr) + + (wr_state == WR_COPIED ? 
len : 0)); + lr = (lr_write_t *)&itx->itx_lr; + if (wr_state == WR_COPIED && dmu_read_by_dnode(zv->zv_dn, + off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) { + zil_itx_destroy(itx); + itx = zil_itx_create(TX_WRITE, sizeof (*lr)); + lr = (lr_write_t *)&itx->itx_lr; + wr_state = WR_NEED_COPY; + } + + itx->itx_wr_state = wr_state; + lr->lr_foid = ZVOL_OBJ; + lr->lr_offset = off; + lr->lr_length = len; + lr->lr_blkoff = 0; + BP_ZERO(&lr->lr_blkptr); + + itx->itx_private = zv; + + if (!sync && (zv->zv_sync_cnt == 0)) + itx->itx_sync = B_FALSE; + + zil_itx_assign(zilog, itx, tx); + + off += len; + resid -= len; + } +} + +#ifdef illumos +static int +zvol_dumpio_vdev(vdev_t *vd, void *addr, uint64_t offset, uint64_t origoffset, + uint64_t size, boolean_t doread, boolean_t isdump) +{ + vdev_disk_t *dvd; + int c; + int numerrors = 0; + + if (vd->vdev_ops == &vdev_mirror_ops || + vd->vdev_ops == &vdev_replacing_ops || + vd->vdev_ops == &vdev_spare_ops) { + for (c = 0; c < vd->vdev_children; c++) { + int err = zvol_dumpio_vdev(vd->vdev_child[c], + addr, offset, origoffset, size, doread, isdump); + if (err != 0) { + numerrors++; + } else if (doread) { + break; + } + } + } + + if (!vd->vdev_ops->vdev_op_leaf && vd->vdev_ops != &vdev_raidz_ops) + return (numerrors < vd->vdev_children ? 0 : EIO); + + if (doread && !vdev_readable(vd)) + return (SET_ERROR(EIO)); + else if (!doread && !vdev_writeable(vd)) + return (SET_ERROR(EIO)); + + if (vd->vdev_ops == &vdev_raidz_ops) { + return (vdev_raidz_physio(vd, + addr, size, offset, origoffset, doread, isdump)); + } + + offset += VDEV_LABEL_START_SIZE; + + if (ddi_in_panic() || isdump) { + ASSERT(!doread); + if (doread) + return (SET_ERROR(EIO)); + dvd = vd->vdev_tsd; + ASSERT3P(dvd, !=, NULL); + return (ldi_dump(dvd->vd_lh, addr, lbtodb(offset), + lbtodb(size))); + } else { + dvd = vd->vdev_tsd; + ASSERT3P(dvd, !=, NULL); + return (vdev_disk_ldi_physio(dvd->vd_lh, addr, size, + offset, doread ? B_READ : B_WRITE)); + } +} + +static int +zvol_dumpio(zvol_state_t *zv, void *addr, uint64_t offset, uint64_t size, + boolean_t doread, boolean_t isdump) +{ + vdev_t *vd; + int error; + zvol_extent_t *ze; + spa_t *spa = dmu_objset_spa(zv->zv_objset); + + /* Must be sector aligned, and not stradle a block boundary. 
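zvol_log_write() above picks an itx write state per record: WR_INDIRECT when logbias is set for throughput, or for block-sized writes on large-block zvols with no separate log device; WR_COPIED for synchronous writes small enough to embed in the log record (records larger than ZIL_MAX_COPIED_DATA are demoted to WR_NEED_COPY in the loop); and WR_NEED_COPY otherwise. A compilable sketch of just that decision; the 32K threshold mirrors zvol_immediate_write_sz, everything else is illustrative:

#include <stdio.h>

enum wr_state { WR_INDIRECT, WR_COPIED, WR_NEED_COPY };

#define IMMEDIATE_WRITE_SZ	32768	/* mirrors zvol_immediate_write_sz */

/*
 * Per-request choice; an indirect record is additionally clipped to a
 * single block so dmu_sync() can point the log at the synced block.
 */
static enum wr_state
pick_state(int throughput_bias, int has_slog, long resid, long blocksize,
    int sync)
{
	if (throughput_bias)
		return (WR_INDIRECT);
	if (!has_slog && resid >= blocksize && blocksize > IMMEDIATE_WRITE_SZ)
		return (WR_INDIRECT);
	return (sync ? WR_COPIED : WR_NEED_COPY);
}

int
main(void)
{
	/* 128K sync write, no slog, 128K volblocksize: sync the data once. */
	printf("%d\n", pick_state(0, 0, 131072, 131072, 1)); /* 0 = WR_INDIRECT */
	/* Small async write: defer the copy until commit, if ever. */
	printf("%d\n", pick_state(0, 1, 4096, 131072, 0));   /* 2 = WR_NEED_COPY */
	return (0);
}

The point of the split is to avoid writing data twice: immediate records carry the data in the log itself, while indirect records sync the data in place and log only a block pointer.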
*/ + if (P2PHASE(offset, DEV_BSIZE) || P2PHASE(size, DEV_BSIZE) || + P2BOUNDARY(offset, size, zv->zv_volblocksize)) { + return (SET_ERROR(EINVAL)); + } + ASSERT(size <= zv->zv_volblocksize); + + /* Locate the extent this belongs to */ + ze = list_head(&zv->zv_extents); + while (offset >= ze->ze_nblks * zv->zv_volblocksize) { + offset -= ze->ze_nblks * zv->zv_volblocksize; + ze = list_next(&zv->zv_extents, ze); + } + + if (ze == NULL) + return (SET_ERROR(EINVAL)); + + if (!ddi_in_panic()) + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); + + vd = vdev_lookup_top(spa, DVA_GET_VDEV(&ze->ze_dva)); + offset += DVA_GET_OFFSET(&ze->ze_dva); + error = zvol_dumpio_vdev(vd, addr, offset, DVA_GET_OFFSET(&ze->ze_dva), + size, doread, isdump); + + if (!ddi_in_panic()) + spa_config_exit(spa, SCL_STATE, FTAG); + + return (error); +} + +int +zvol_strategy(buf_t *bp) +{ + zfs_soft_state_t *zs = NULL; +#else /* !illumos */ +void +zvol_strategy(struct bio *bp) +{ +#endif /* illumos */ + zvol_state_t *zv; + uint64_t off, volsize; + size_t resid; + char *addr; + objset_t *os; + rl_t *rl; + int error = 0; +#ifdef illumos + boolean_t doread = bp->b_flags & B_READ; +#else + boolean_t doread = 0; +#endif + boolean_t is_dumpified; + boolean_t sync; + +#ifdef illumos + if (getminor(bp->b_edev) == 0) { + error = SET_ERROR(EINVAL); + } else { + zs = ddi_get_soft_state(zfsdev_state, getminor(bp->b_edev)); + if (zs == NULL) + error = SET_ERROR(ENXIO); + else if (zs->zss_type != ZSST_ZVOL) + error = SET_ERROR(EINVAL); + } + + if (error) { + bioerror(bp, error); + biodone(bp); + return (0); + } + + zv = zs->zss_data; + + if (!(bp->b_flags & B_READ) && (zv->zv_flags & ZVOL_RDONLY)) { + bioerror(bp, EROFS); + biodone(bp); + return (0); + } + + off = ldbtob(bp->b_blkno); +#else /* !illumos */ + if (bp->bio_to) + zv = bp->bio_to->private; + else + zv = bp->bio_dev->si_drv2; + + if (zv == NULL) { + error = SET_ERROR(ENXIO); + goto out; + } + + if (bp->bio_cmd != BIO_READ && (zv->zv_flags & ZVOL_RDONLY)) { + error = SET_ERROR(EROFS); + goto out; + } + + switch (bp->bio_cmd) { + case BIO_FLUSH: + goto sync; + case BIO_READ: + doread = 1; + case BIO_WRITE: + case BIO_DELETE: + break; + default: + error = EOPNOTSUPP; + goto out; + } + + off = bp->bio_offset; +#endif /* illumos */ + volsize = zv->zv_volsize; + + os = zv->zv_objset; + ASSERT(os != NULL); + +#ifdef illumos + bp_mapin(bp); + addr = bp->b_un.b_addr; + resid = bp->b_bcount; + + if (resid > 0 && (off < 0 || off >= volsize)) { + bioerror(bp, EIO); + biodone(bp); + return (0); + } + + is_dumpified = zv->zv_flags & ZVOL_DUMPIFIED; + sync = ((!(bp->b_flags & B_ASYNC) && + !(zv->zv_flags & ZVOL_WCE)) || + (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS)) && + !doread && !is_dumpified; +#else /* !illumos */ + addr = bp->bio_data; + resid = bp->bio_length; + + if (resid > 0 && (off < 0 || off >= volsize)) { + error = SET_ERROR(EIO); + goto out; + } + + is_dumpified = B_FALSE; + sync = !doread && !is_dumpified && + zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; +#endif /* illumos */ + + /* + * There must be no buffer changes when doing a dmu_sync() because + * we can't change the data whilst calculating the checksum. + */ + rl = zfs_range_lock(&zv->zv_znode, off, resid, + doread ? 
RL_READER : RL_WRITER); + +#ifndef illumos + if (bp->bio_cmd == BIO_DELETE) { + dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error != 0) { + dmu_tx_abort(tx); + } else { + zvol_log_truncate(zv, tx, off, resid, sync); + dmu_tx_commit(tx); + error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, + off, resid); + resid = 0; + } + goto unlock; + } +#endif + while (resid != 0 && off < volsize) { + size_t size = MIN(resid, zvol_maxphys); +#ifdef illumos + if (is_dumpified) { + size = MIN(size, P2END(off, zv->zv_volblocksize) - off); + error = zvol_dumpio(zv, addr, off, size, + doread, B_FALSE); + } else if (doread) { +#else + if (doread) { +#endif + error = dmu_read(os, ZVOL_OBJ, off, size, addr, + DMU_READ_PREFETCH); + } else { + dmu_tx_t *tx = dmu_tx_create(os); + dmu_tx_hold_write(tx, ZVOL_OBJ, off, size); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + } else { + dmu_write(os, ZVOL_OBJ, off, size, addr, tx); + zvol_log_write(zv, tx, off, size, sync); + dmu_tx_commit(tx); + } + } + if (error) { + /* convert checksum errors into IO errors */ + if (error == ECKSUM) + error = SET_ERROR(EIO); + break; + } + off += size; + addr += size; + resid -= size; + } +#ifndef illumos +unlock: +#endif + zfs_range_unlock(rl); + +#ifdef illumos + if ((bp->b_resid = resid) == bp->b_bcount) + bioerror(bp, off > volsize ? EINVAL : error); + + if (sync) + zil_commit(zv->zv_zilog, ZVOL_OBJ); + biodone(bp); + + return (0); +#else /* !illumos */ + bp->bio_completed = bp->bio_length - resid; + if (bp->bio_completed < bp->bio_length && off > volsize) + error = EINVAL; + + if (sync) { +sync: + zil_commit(zv->zv_zilog, ZVOL_OBJ); + } +out: + if (bp->bio_to) + g_io_deliver(bp, error); + else + biofinish(bp, NULL, error); +#endif /* illumos */ +} + +#ifdef illumos +/* + * Set the buffer count to the zvol maximum transfer. + * Using our own routine instead of the default minphys() + * means that for larger writes we write bigger buffers on X86 + * (128K instead of 56K) and flush the disk write cache less often + * (every zvol_maxphys - currently 1MB) instead of minphys (currently + * 56K on X86 and 128K on sparc). + */ +void +zvol_minphys(struct buf *bp) +{ + if (bp->b_bcount > zvol_maxphys) + bp->b_bcount = zvol_maxphys; +} + +int +zvol_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblocks) +{ + minor_t minor = getminor(dev); + zvol_state_t *zv; + int error = 0; + uint64_t size; + uint64_t boff; + uint64_t resid; + + zv = zfsdev_get_soft_state(minor, ZSST_ZVOL); + if (zv == NULL) + return (SET_ERROR(ENXIO)); + + if ((zv->zv_flags & ZVOL_DUMPIFIED) == 0) + return (SET_ERROR(EINVAL)); + + boff = ldbtob(blkno); + resid = ldbtob(nblocks); + + VERIFY3U(boff + resid, <=, zv->zv_volsize); + + while (resid) { + size = MIN(resid, P2END(boff, zv->zv_volblocksize) - boff); + error = zvol_dumpio(zv, addr, boff, size, B_FALSE, B_TRUE); + if (error) + break; + boff += size; + addr += size; + resid -= size; + } + + return (error); +} + +/*ARGSUSED*/ +int +zvol_read(dev_t dev, uio_t *uio, cred_t *cr) +{ + minor_t minor = getminor(dev); +#else /* !illumos */ +int +zvol_read(struct cdev *dev, struct uio *uio, int ioflag) +{ +#endif /* illumos */ + zvol_state_t *zv; + uint64_t volsize; + rl_t *rl; + int error = 0; + +#ifdef illumos + zv = zfsdev_get_soft_state(minor, ZSST_ZVOL); + if (zv == NULL) + return (SET_ERROR(ENXIO)); +#else + zv = dev->si_drv2; +#endif + + volsize = zv->zv_volsize; + /* uio_loffset == volsize isn't an error as its required for EOF processing. 
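zvol_strategy() above services a request in zvol_maxphys-sized chunks, advancing the offset, buffer address, and residual together and stopping at the volume end so that bio_completed can report a partial transfer. The same loop shape, reduced to userspace (names illustrative):

#include <stdio.h>

#define MAXPHYS_SKETCH	(128 * 1024)	/* stand-in for zvol_maxphys */

/* Returns bytes completed, mimicking bio_completed = length - resid. */
static long
chunked_io(long off, long resid, long volsize)
{
	long done = 0;

	while (resid != 0 && off < volsize) {
		long size = resid < MAXPHYS_SKETCH ? resid : MAXPHYS_SKETCH;

		/* dmu_read()/dmu_write() would run here on [off, off+size). */
		off += size;
		done += size;
		resid -= size;
	}
	return (done);
}

int
main(void)
{
	/* A 300K request against a 256K volume completes only 256K. */
	printf("%ld\n", chunked_io(0, 300 * 1024, 256 * 1024));
	return (0);
}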
*/ + if (uio->uio_resid > 0 && + (uio->uio_loffset < 0 || uio->uio_loffset > volsize)) + return (SET_ERROR(EIO)); + +#ifdef illumos + if (zv->zv_flags & ZVOL_DUMPIFIED) { + error = physio(zvol_strategy, NULL, dev, B_READ, + zvol_minphys, uio); + return (error); + } +#endif + + rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid, + RL_READER); + while (uio->uio_resid > 0 && uio->uio_loffset < volsize) { + uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1); + + /* don't read past the end */ + if (bytes > volsize - uio->uio_loffset) + bytes = volsize - uio->uio_loffset; + + error = dmu_read_uio_dnode(zv->zv_dn, uio, bytes); + if (error) { + /* convert checksum errors into IO errors */ + if (error == ECKSUM) + error = SET_ERROR(EIO); + break; + } + } + zfs_range_unlock(rl); + return (error); +} + +#ifdef illumos +/*ARGSUSED*/ +int +zvol_write(dev_t dev, uio_t *uio, cred_t *cr) +{ + minor_t minor = getminor(dev); +#else /* !illumos */ +int +zvol_write(struct cdev *dev, struct uio *uio, int ioflag) +{ +#endif /* illumos */ + zvol_state_t *zv; + uint64_t volsize; + rl_t *rl; + int error = 0; + boolean_t sync; + +#ifdef illumos + zv = zfsdev_get_soft_state(minor, ZSST_ZVOL); + if (zv == NULL) + return (SET_ERROR(ENXIO)); +#else + zv = dev->si_drv2; +#endif + + volsize = zv->zv_volsize; + /* uio_loffset == volsize isn't an error as its required for EOF processing. */ + if (uio->uio_resid > 0 && + (uio->uio_loffset < 0 || uio->uio_loffset > volsize)) + return (SET_ERROR(EIO)); + +#ifdef illumos + if (zv->zv_flags & ZVOL_DUMPIFIED) { + error = physio(zvol_strategy, NULL, dev, B_WRITE, + zvol_minphys, uio); + return (error); + } + + sync = !(zv->zv_flags & ZVOL_WCE) || +#else + sync = (ioflag & IO_SYNC) || +#endif + (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS); + + rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid, + RL_WRITER); + while (uio->uio_resid > 0 && uio->uio_loffset < volsize) { + uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1); + uint64_t off = uio->uio_loffset; + dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); + + if (bytes > volsize - off) /* don't write past the end */ + bytes = volsize - off; + + dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + break; + } + error = dmu_write_uio_dnode(zv->zv_dn, uio, bytes, tx); + if (error == 0) + zvol_log_write(zv, tx, off, bytes, sync); + dmu_tx_commit(tx); + + if (error) + break; + } + zfs_range_unlock(rl); + if (sync) + zil_commit(zv->zv_zilog, ZVOL_OBJ); + return (error); +} + +#ifdef illumos +int +zvol_getefi(void *arg, int flag, uint64_t vs, uint8_t bs) +{ + struct uuid uuid = EFI_RESERVED; + efi_gpe_t gpe = { 0 }; + uint32_t crc; + dk_efi_t efi; + int length; + char *ptr; + + if (ddi_copyin(arg, &efi, sizeof (dk_efi_t), flag)) + return (SET_ERROR(EFAULT)); + ptr = (char *)(uintptr_t)efi.dki_data_64; + length = efi.dki_length; + /* + * Some clients may attempt to request a PMBR for the + * zvol. Currently this interface will return EINVAL to + * such requests. These requests could be supported by + * adding a check for lba == 0 and consing up an appropriate + * PMBR. 
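The zvol_read()/zvol_write() pair above treats uio_loffset == volsize as a legal zero-byte EOF but rejects offsets beyond it, and clips each transfer so it never crosses the end of the volume. A small sketch of that boundary rule (hypothetical helper, not the driver's interface):

#include <stdio.h>

/*
 * Clamp one transfer the way zvol_read() does: offset == volsize is a
 * legal EOF, anything past it is an error, and a transfer straddling
 * the end is shortened. Returns -1 for the EIO-style failure.
 */
static long
clamp_xfer(long off, long resid, long volsize)
{
	if (resid > 0 && (off < 0 || off > volsize))
		return (-1);		/* EIO in the driver */
	if (resid > volsize - off)
		resid = volsize - off;	/* don't read past the end */
	return (resid);
}

int
main(void)
{
	printf("%ld\n", clamp_xfer(100, 50, 120));	/* 20 */
	printf("%ld\n", clamp_xfer(120, 50, 120));	/* 0: clean EOF */
	printf("%ld\n", clamp_xfer(121, 50, 120));	/* -1 */
	return (0);
}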
+ */ + if (efi.dki_lba < 1 || efi.dki_lba > 2 || length <= 0) + return (SET_ERROR(EINVAL)); + + gpe.efi_gpe_StartingLBA = LE_64(34ULL); + gpe.efi_gpe_EndingLBA = LE_64((vs >> bs) - 1); + UUID_LE_CONVERT(gpe.efi_gpe_PartitionTypeGUID, uuid); + + if (efi.dki_lba == 1) { + efi_gpt_t gpt = { 0 }; + + gpt.efi_gpt_Signature = LE_64(EFI_SIGNATURE); + gpt.efi_gpt_Revision = LE_32(EFI_VERSION_CURRENT); + gpt.efi_gpt_HeaderSize = LE_32(sizeof (gpt)); + gpt.efi_gpt_MyLBA = LE_64(1ULL); + gpt.efi_gpt_FirstUsableLBA = LE_64(34ULL); + gpt.efi_gpt_LastUsableLBA = LE_64((vs >> bs) - 1); + gpt.efi_gpt_PartitionEntryLBA = LE_64(2ULL); + gpt.efi_gpt_NumberOfPartitionEntries = LE_32(1); + gpt.efi_gpt_SizeOfPartitionEntry = + LE_32(sizeof (efi_gpe_t)); + CRC32(crc, &gpe, sizeof (gpe), -1U, crc32_table); + gpt.efi_gpt_PartitionEntryArrayCRC32 = LE_32(~crc); + CRC32(crc, &gpt, sizeof (gpt), -1U, crc32_table); + gpt.efi_gpt_HeaderCRC32 = LE_32(~crc); + if (ddi_copyout(&gpt, ptr, MIN(sizeof (gpt), length), + flag)) + return (SET_ERROR(EFAULT)); + ptr += sizeof (gpt); + length -= sizeof (gpt); + } + if (length > 0 && ddi_copyout(&gpe, ptr, MIN(sizeof (gpe), + length), flag)) + return (SET_ERROR(EFAULT)); + return (0); +} + +/* + * BEGIN entry points to allow external callers access to the volume. + */ +/* + * Return the volume parameters needed for access from an external caller. + * These values are invariant as long as the volume is held open. + */ +int +zvol_get_volume_params(minor_t minor, uint64_t *blksize, + uint64_t *max_xfer_len, void **minor_hdl, void **objset_hdl, void **zil_hdl, + void **rl_hdl, void **dnode_hdl) +{ + zvol_state_t *zv; + + zv = zfsdev_get_soft_state(minor, ZSST_ZVOL); + if (zv == NULL) + return (SET_ERROR(ENXIO)); + if (zv->zv_flags & ZVOL_DUMPIFIED) + return (SET_ERROR(ENXIO)); + + ASSERT(blksize && max_xfer_len && minor_hdl && + objset_hdl && zil_hdl && rl_hdl && dnode_hdl); + + *blksize = zv->zv_volblocksize; + *max_xfer_len = (uint64_t)zvol_maxphys; + *minor_hdl = zv; + *objset_hdl = zv->zv_objset; + *zil_hdl = zv->zv_zilog; + *rl_hdl = &zv->zv_znode; + *dnode_hdl = zv->zv_dn; + return (0); +} + +/* + * Return the current volume size to an external caller. + * The size can change while the volume is open. + */ +uint64_t +zvol_get_volume_size(void *minor_hdl) +{ + zvol_state_t *zv = minor_hdl; + + return (zv->zv_volsize); +} + +/* + * Return the current WCE setting to an external caller. + * The WCE setting can change while the volume is open. + */ +int +zvol_get_volume_wce(void *minor_hdl) +{ + zvol_state_t *zv = minor_hdl; + + return ((zv->zv_flags & ZVOL_WCE) ? 1 : 0); +} + +/* + * Entry point for external callers to zvol_log_write + */ +void +zvol_log_write_minor(void *minor_hdl, dmu_tx_t *tx, offset_t off, ssize_t resid, + boolean_t sync) +{ + zvol_state_t *zv = minor_hdl; + + zvol_log_write(zv, tx, off, resid, sync); +} +/* + * END entry points to allow external callers access to the volume. + */ +#endif /* illumos */ + +/* + * Log a DKIOCFREE/free-long-range to the ZIL with TX_TRUNCATE. 
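zvol_getefi() above builds the GPT header with a specific CRC ordering: the partition-entry-array CRC is computed and stored first, because it is an input to the header CRC, which in turn is computed while the header's own CRC field is still zero. A self-contained sketch of that ordering with a bitwise CRC-32 using the same seed/finalize convention as the CRC32(..., -1U, crc32_table) / ~crc pattern above; the struct is a toy stand-in for efi_gpt_t, not the real on-disk layout:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t
crc32_sketch(const void *buf, size_t len)
{
	const uint8_t *p = buf;
	uint32_t crc = 0xffffffffu;	/* seed, like -1U above */

	while (len--) {
		crc ^= *p++;
		for (int i = 0; i < 8; i++)
			crc = (crc >> 1) ^ (0xedb88320u & -(crc & 1u));
	}
	return (~crc);			/* finalize, like ~crc above */
}

struct gpt_sketch {			/* toy stand-in for efi_gpt_t */
	uint64_t my_lba;
	uint32_t entry_array_crc;
	uint32_t header_crc;
};

int
main(void)
{
	uint32_t entries[4] = { 0 };	/* stand-in partition entry array */
	struct gpt_sketch gpt;

	memset(&gpt, 0, sizeof (gpt));
	gpt.my_lba = 1;
	/* Order matters: the entry-array CRC is an input to the header CRC. */
	gpt.entry_array_crc = crc32_sketch(entries, sizeof (entries));
	/* The header CRC is computed while its own field is still zero. */
	gpt.header_crc = crc32_sketch(&gpt, sizeof (gpt));
	printf("%08x %08x\n", gpt.entry_array_crc, gpt.header_crc);
	return (0);
}

Reversing the two computations would bake a stale (zero) entry-array CRC into the header checksum, which is why the kernel code fills gpt.efi_gpt_PartitionEntryArrayCRC32 before checksumming the header.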
+ */ +static void +zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, uint64_t len, + boolean_t sync) +{ + itx_t *itx; + lr_truncate_t *lr; + zilog_t *zilog = zv->zv_zilog; + + if (zil_replaying(zilog, tx)) + return; + + itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr)); + lr = (lr_truncate_t *)&itx->itx_lr; + lr->lr_foid = ZVOL_OBJ; + lr->lr_offset = off; + lr->lr_length = len; + + itx->itx_sync = (sync || zv->zv_sync_cnt != 0); + zil_itx_assign(zilog, itx, tx); +} + +#ifdef illumos +/* + * Dirtbag ioctls to support mkfs(1M) for UFS filesystems. See dkio(7I). + * Also a dirtbag dkio ioctl for unmap/free-block functionality. + */ +/*ARGSUSED*/ +int +zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp) +{ + zvol_state_t *zv; + struct dk_callback *dkc; + int error = 0; + rl_t *rl; + + mutex_enter(&zfsdev_state_lock); + + zv = zfsdev_get_soft_state(getminor(dev), ZSST_ZVOL); + + if (zv == NULL) { + mutex_exit(&zfsdev_state_lock); + return (SET_ERROR(ENXIO)); + } + ASSERT(zv->zv_total_opens > 0); + + switch (cmd) { + + case DKIOCINFO: + { + struct dk_cinfo dki; + + bzero(&dki, sizeof (dki)); + (void) strcpy(dki.dki_cname, "zvol"); + (void) strcpy(dki.dki_dname, "zvol"); + dki.dki_ctype = DKC_UNKNOWN; + dki.dki_unit = getminor(dev); + dki.dki_maxtransfer = + 1 << (SPA_OLD_MAXBLOCKSHIFT - zv->zv_min_bs); + mutex_exit(&zfsdev_state_lock); + if (ddi_copyout(&dki, (void *)arg, sizeof (dki), flag)) + error = SET_ERROR(EFAULT); + return (error); + } + + case DKIOCGMEDIAINFO: + { + struct dk_minfo dkm; + + bzero(&dkm, sizeof (dkm)); + dkm.dki_lbsize = 1U << zv->zv_min_bs; + dkm.dki_capacity = zv->zv_volsize >> zv->zv_min_bs; + dkm.dki_media_type = DK_UNKNOWN; + mutex_exit(&zfsdev_state_lock); + if (ddi_copyout(&dkm, (void *)arg, sizeof (dkm), flag)) + error = SET_ERROR(EFAULT); + return (error); + } + + case DKIOCGMEDIAINFOEXT: + { + struct dk_minfo_ext dkmext; + + bzero(&dkmext, sizeof (dkmext)); + dkmext.dki_lbsize = 1U << zv->zv_min_bs; + dkmext.dki_pbsize = zv->zv_volblocksize; + dkmext.dki_capacity = zv->zv_volsize >> zv->zv_min_bs; + dkmext.dki_media_type = DK_UNKNOWN; + mutex_exit(&zfsdev_state_lock); + if (ddi_copyout(&dkmext, (void *)arg, sizeof (dkmext), flag)) + error = SET_ERROR(EFAULT); + return (error); + } + + case DKIOCGETEFI: + { + uint64_t vs = zv->zv_volsize; + uint8_t bs = zv->zv_min_bs; + + mutex_exit(&zfsdev_state_lock); + error = zvol_getefi((void *)arg, flag, vs, bs); + return (error); + } + + case DKIOCFLUSHWRITECACHE: + dkc = (struct dk_callback *)arg; + mutex_exit(&zfsdev_state_lock); + zil_commit(zv->zv_zilog, ZVOL_OBJ); + if ((flag & FKIOCTL) && dkc != NULL && dkc->dkc_callback) { + (*dkc->dkc_callback)(dkc->dkc_cookie, error); + error = 0; + } + return (error); + + case DKIOCGETWCE: + { + int wce = (zv->zv_flags & ZVOL_WCE) ? 
1 : 0; + if (ddi_copyout(&wce, (void *)arg, sizeof (int), + flag)) + error = SET_ERROR(EFAULT); + break; + } + case DKIOCSETWCE: + { + int wce; + if (ddi_copyin((void *)arg, &wce, sizeof (int), + flag)) { + error = SET_ERROR(EFAULT); + break; + } + if (wce) { + zv->zv_flags |= ZVOL_WCE; + mutex_exit(&zfsdev_state_lock); + } else { + zv->zv_flags &= ~ZVOL_WCE; + mutex_exit(&zfsdev_state_lock); + zil_commit(zv->zv_zilog, ZVOL_OBJ); + } + return (0); + } + + case DKIOCGGEOM: + case DKIOCGVTOC: + /* + * commands using these (like prtvtoc) expect ENOTSUP + * since we're emulating an EFI label + */ + error = SET_ERROR(ENOTSUP); + break; + + case DKIOCDUMPINIT: + rl = zfs_range_lock(&zv->zv_znode, 0, zv->zv_volsize, + RL_WRITER); + error = zvol_dumpify(zv); + zfs_range_unlock(rl); + break; + + case DKIOCDUMPFINI: + if (!(zv->zv_flags & ZVOL_DUMPIFIED)) + break; + rl = zfs_range_lock(&zv->zv_znode, 0, zv->zv_volsize, + RL_WRITER); + error = zvol_dump_fini(zv); + zfs_range_unlock(rl); + break; + + case DKIOCFREE: + { + dkioc_free_t df; + dmu_tx_t *tx; + + if (!zvol_unmap_enabled) + break; + + if (ddi_copyin((void *)arg, &df, sizeof (df), flag)) { + error = SET_ERROR(EFAULT); + break; + } + + /* + * Apply Postel's Law to length-checking. If they overshoot, + * just blank out until the end, if there's a need to blank + * out anything. + */ + if (df.df_start >= zv->zv_volsize) + break; /* No need to do anything... */ + + mutex_exit(&zfsdev_state_lock); + + rl = zfs_range_lock(&zv->zv_znode, df.df_start, df.df_length, + RL_WRITER); + tx = dmu_tx_create(zv->zv_objset); + dmu_tx_mark_netfree(tx); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error != 0) { + dmu_tx_abort(tx); + } else { + zvol_log_truncate(zv, tx, df.df_start, + df.df_length, B_TRUE); + dmu_tx_commit(tx); + error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, + df.df_start, df.df_length); + } + + zfs_range_unlock(rl); + + /* + * If the write-cache is disabled, 'sync' property + * is set to 'always', or if the caller is asking for + * a synchronous free, commit this operation to the zil. + * This will sync any previous uncommitted writes to the + * zvol object. + * Can be overridden by the zvol_unmap_sync_enabled tunable. 
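The DKIOCFREE case above applies Postel's law to the requested range: a start at or beyond the volume end is a silent no-op, and an overshooting length simply frees out to the end of the volume. In sketch form (hypothetical helper):

#include <stdio.h>

/*
 * Postel-style validation of a free request: a start past the end is
 * nothing to do, and a length that overshoots is clipped to the end.
 */
static long
clip_free(long start, long length, long volsize)
{
	if (start >= volsize || length <= 0)
		return (0);			/* no need to do anything */
	if (length > volsize - start)
		length = volsize - start;	/* blank out to the end */
	return (length);
}

int
main(void)
{
	printf("%ld\n", clip_free(90, 100, 120));	/* 30 */
	printf("%ld\n", clip_free(200, 10, 120));	/* 0 */
	return (0);
}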
+ */ + if ((error == 0) && zvol_unmap_sync_enabled && + (!(zv->zv_flags & ZVOL_WCE) || + (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS) || + (df.df_flags & DF_WAIT_SYNC))) { + zil_commit(zv->zv_zilog, ZVOL_OBJ); + } + + return (error); + } + + default: + error = SET_ERROR(ENOTTY); + break; + + } + mutex_exit(&zfsdev_state_lock); + return (error); +} +#endif /* illumos */ + +int +zvol_busy(void) +{ + return (zvol_minors != 0); +} + +void +zvol_init(void) +{ + VERIFY(ddi_soft_state_init(&zfsdev_state, sizeof (zfs_soft_state_t), + 1) == 0); +#ifdef illumos + mutex_init(&zfsdev_state_lock, NULL, MUTEX_DEFAULT, NULL); +#else + ZFS_LOG(1, "ZVOL Initialized."); +#endif +} + +void +zvol_fini(void) +{ +#ifdef illumos + mutex_destroy(&zfsdev_state_lock); +#endif + ddi_soft_state_fini(&zfsdev_state); + ZFS_LOG(1, "ZVOL Deinitialized."); +} + +#ifdef illumos +/*ARGSUSED*/ +static int +zfs_mvdev_dump_feature_check(void *arg, dmu_tx_t *tx) +{ + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + + if (spa_feature_is_active(spa, SPA_FEATURE_MULTI_VDEV_CRASH_DUMP)) + return (1); + return (0); +} + +/*ARGSUSED*/ +static void +zfs_mvdev_dump_activate_feature_sync(void *arg, dmu_tx_t *tx) +{ + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + + spa_feature_incr(spa, SPA_FEATURE_MULTI_VDEV_CRASH_DUMP, tx); +} + +static int +zvol_dump_init(zvol_state_t *zv, boolean_t resize) +{ + dmu_tx_t *tx; + int error; + objset_t *os = zv->zv_objset; + spa_t *spa = dmu_objset_spa(os); + vdev_t *vd = spa->spa_root_vdev; + nvlist_t *nv = NULL; + uint64_t version = spa_version(spa); + uint64_t checksum, compress, refresrv, vbs, dedup; + + ASSERT(MUTEX_HELD(&zfsdev_state_lock)); + ASSERT(vd->vdev_ops == &vdev_root_ops); + + error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, 0, + DMU_OBJECT_END); + if (error != 0) + return (error); + /* wait for dmu_free_long_range to actually free the blocks */ + txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0); + + /* + * If the pool on which the dump device is being initialized has more + * than one child vdev, check that the MULTI_VDEV_CRASH_DUMP feature is + * enabled. If so, bump that feature's counter to indicate that the + * feature is active. We also check the vdev type to handle the + * following case: + * # zpool create test raidz disk1 disk2 disk3 + * Now have spa_root_vdev->vdev_children == 1 (the raidz vdev), + * the raidz vdev itself has 3 children. 
+ */ + if (vd->vdev_children > 1 || vd->vdev_ops == &vdev_raidz_ops) { + if (!spa_feature_is_enabled(spa, + SPA_FEATURE_MULTI_VDEV_CRASH_DUMP)) + return (SET_ERROR(ENOTSUP)); + (void) dsl_sync_task(spa_name(spa), + zfs_mvdev_dump_feature_check, + zfs_mvdev_dump_activate_feature_sync, NULL, + 2, ZFS_SPACE_CHECK_RESERVED); + } + + if (!resize) { + error = dsl_prop_get_integer(zv->zv_name, + zfs_prop_to_name(ZFS_PROP_COMPRESSION), &compress, NULL); + if (error == 0) { + error = dsl_prop_get_integer(zv->zv_name, + zfs_prop_to_name(ZFS_PROP_CHECKSUM), &checksum, + NULL); + } + if (error == 0) { + error = dsl_prop_get_integer(zv->zv_name, + zfs_prop_to_name(ZFS_PROP_REFRESERVATION), + &refresrv, NULL); + } + if (error == 0) { + error = dsl_prop_get_integer(zv->zv_name, + zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &vbs, + NULL); + } + if (version >= SPA_VERSION_DEDUP && error == 0) { + error = dsl_prop_get_integer(zv->zv_name, + zfs_prop_to_name(ZFS_PROP_DEDUP), &dedup, NULL); + } + } + if (error != 0) + return (error); + + tx = dmu_tx_create(os); + dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL); + dmu_tx_hold_bonus(tx, ZVOL_OBJ); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error != 0) { + dmu_tx_abort(tx); + return (error); + } + + /* + * If we are resizing the dump device then we only need to + * update the refreservation to match the newly updated + * zvolsize. Otherwise, we save off the original state of the + * zvol so that we can restore them if the zvol is ever undumpified. + */ + if (resize) { + error = zap_update(os, ZVOL_ZAP_OBJ, + zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1, + &zv->zv_volsize, tx); + } else { + error = zap_update(os, ZVOL_ZAP_OBJ, + zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1, + &compress, tx); + if (error == 0) { + error = zap_update(os, ZVOL_ZAP_OBJ, + zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1, + &checksum, tx); + } + if (error == 0) { + error = zap_update(os, ZVOL_ZAP_OBJ, + zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1, + &refresrv, tx); + } + if (error == 0) { + error = zap_update(os, ZVOL_ZAP_OBJ, + zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1, + &vbs, tx); + } + if (error == 0) { + error = dmu_object_set_blocksize( + os, ZVOL_OBJ, SPA_OLD_MAXBLOCKSIZE, 0, tx); + } + if (version >= SPA_VERSION_DEDUP && error == 0) { + error = zap_update(os, ZVOL_ZAP_OBJ, + zfs_prop_to_name(ZFS_PROP_DEDUP), 8, 1, + &dedup, tx); + } + if (error == 0) + zv->zv_volblocksize = SPA_OLD_MAXBLOCKSIZE; + } + dmu_tx_commit(tx); + + /* + * We only need update the zvol's property if we are initializing + * the dump area for the first time. + */ + if (error == 0 && !resize) { + /* + * If MULTI_VDEV_CRASH_DUMP is active, use the NOPARITY checksum + * function. Otherwise, use the old default -- OFF. + */ + checksum = spa_feature_is_active(spa, + SPA_FEATURE_MULTI_VDEV_CRASH_DUMP) ? 
ZIO_CHECKSUM_NOPARITY : + ZIO_CHECKSUM_OFF; + + VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_uint64(nv, + zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 0) == 0); + VERIFY(nvlist_add_uint64(nv, + zfs_prop_to_name(ZFS_PROP_COMPRESSION), + ZIO_COMPRESS_OFF) == 0); + VERIFY(nvlist_add_uint64(nv, + zfs_prop_to_name(ZFS_PROP_CHECKSUM), + checksum) == 0); + if (version >= SPA_VERSION_DEDUP) { + VERIFY(nvlist_add_uint64(nv, + zfs_prop_to_name(ZFS_PROP_DEDUP), + ZIO_CHECKSUM_OFF) == 0); + } + + error = zfs_set_prop_nvlist(zv->zv_name, ZPROP_SRC_LOCAL, + nv, NULL); + nvlist_free(nv); + } + + /* Allocate the space for the dump */ + if (error == 0) + error = zvol_prealloc(zv); + return (error); +} + +static int +zvol_dumpify(zvol_state_t *zv) +{ + int error = 0; + uint64_t dumpsize = 0; + dmu_tx_t *tx; + objset_t *os = zv->zv_objset; + + if (zv->zv_flags & ZVOL_RDONLY) + return (SET_ERROR(EROFS)); + + if (zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, + 8, 1, &dumpsize) != 0 || dumpsize != zv->zv_volsize) { + boolean_t resize = (dumpsize > 0); + + if ((error = zvol_dump_init(zv, resize)) != 0) { + (void) zvol_dump_fini(zv); + return (error); + } + } + + /* + * Build up our lba mapping. + */ + error = zvol_get_lbas(zv); + if (error) { + (void) zvol_dump_fini(zv); + return (error); + } + + tx = dmu_tx_create(os); + dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + (void) zvol_dump_fini(zv); + return (error); + } + + zv->zv_flags |= ZVOL_DUMPIFIED; + error = zap_update(os, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, 8, 1, + &zv->zv_volsize, tx); + dmu_tx_commit(tx); + + if (error) { + (void) zvol_dump_fini(zv); + return (error); + } + + txg_wait_synced(dmu_objset_pool(os), 0); + return (0); +} + +static int +zvol_dump_fini(zvol_state_t *zv) +{ + dmu_tx_t *tx; + objset_t *os = zv->zv_objset; + nvlist_t *nv; + int error = 0; + uint64_t checksum, compress, refresrv, vbs, dedup; + uint64_t version = spa_version(dmu_objset_spa(zv->zv_objset)); + + /* + * Attempt to restore the zvol back to its pre-dumpified state. + * This is a best-effort attempt as it's possible that not all + * of these properties were initialized during the dumpify process + * (i.e. error during zvol_dump_init). 
+ */ + + tx = dmu_tx_create(os); + dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + return (error); + } + (void) zap_remove(os, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, tx); + dmu_tx_commit(tx); + + (void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, + zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1, &checksum); + (void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, + zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1, &compress); + (void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, + zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1, &refresrv); + (void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, + zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1, &vbs); + + VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); + (void) nvlist_add_uint64(nv, + zfs_prop_to_name(ZFS_PROP_CHECKSUM), checksum); + (void) nvlist_add_uint64(nv, + zfs_prop_to_name(ZFS_PROP_COMPRESSION), compress); + (void) nvlist_add_uint64(nv, + zfs_prop_to_name(ZFS_PROP_REFRESERVATION), refresrv); + if (version >= SPA_VERSION_DEDUP && + zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, + zfs_prop_to_name(ZFS_PROP_DEDUP), 8, 1, &dedup) == 0) { + (void) nvlist_add_uint64(nv, + zfs_prop_to_name(ZFS_PROP_DEDUP), dedup); + } + (void) zfs_set_prop_nvlist(zv->zv_name, ZPROP_SRC_LOCAL, + nv, NULL); + nvlist_free(nv); + + zvol_free_extents(zv); + zv->zv_flags &= ~ZVOL_DUMPIFIED; + (void) dmu_free_long_range(os, ZVOL_OBJ, 0, DMU_OBJECT_END); + /* wait for dmu_free_long_range to actually free the blocks */ + txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0); + tx = dmu_tx_create(os); + dmu_tx_hold_bonus(tx, ZVOL_OBJ); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + return (error); + } + if (dmu_object_set_blocksize(os, ZVOL_OBJ, vbs, 0, tx) == 0) + zv->zv_volblocksize = vbs; + dmu_tx_commit(tx); + + return (0); +} +#else /* !illumos */ + +static void +zvol_geom_run(zvol_state_t *zv) +{ + struct g_provider *pp; + + pp = zv->zv_provider; + g_error_provider(pp, 0); + + kproc_kthread_add(zvol_geom_worker, zv, &zfsproc, NULL, 0, 0, + "zfskern", "zvol %s", pp->name + sizeof(ZVOL_DRIVER)); +} + +static void +zvol_geom_destroy(zvol_state_t *zv) +{ + struct g_provider *pp; + + g_topology_assert(); + + mtx_lock(&zv->zv_queue_mtx); + zv->zv_state = 1; + wakeup_one(&zv->zv_queue); + while (zv->zv_state != 2) + msleep(&zv->zv_state, &zv->zv_queue_mtx, 0, "zvol:w", 0); + mtx_destroy(&zv->zv_queue_mtx); + + pp = zv->zv_provider; + zv->zv_provider = NULL; + pp->private = NULL; + g_wither_geom(pp->geom, ENXIO); +} + +static int +zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace) +{ + int count, error, flags; + + g_topology_assert(); + + /* + * To make it easier we expect either open or close, but not both + * at the same time. + */ + KASSERT((acr >= 0 && acw >= 0 && ace >= 0) || + (acr <= 0 && acw <= 0 && ace <= 0), + ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).", + pp->name, acr, acw, ace)); + + if (pp->private == NULL) { + if (acr <= 0 && acw <= 0 && ace <= 0) + return (0); + return (pp->error); + } + + /* + * We don't pass FEXCL flag to zvol_open()/zvol_close() if ace != 0, + * because GEOM already handles that and handles it a bit differently. + * GEOM allows for multiple read/exclusive consumers and ZFS allows + * only one exclusive consumer, no matter if it is reader or writer. + * I like better the way GEOM works so I'll leave it for GEOM to + * decide what to do. 
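zvol_geom_destroy() above shuts the worker down with a two-phase handshake: it sets zv_state to 1, wakes the queue, and sleeps until the worker acknowledges by setting zv_state to 2, so the queue mutex is never destroyed while the worker can still touch it. The same protocol with POSIX primitives (compile with -pthread; names illustrative):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
static int state;	/* 0 running, 1 stop requested, 2 stopped */

static void *
worker(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&mtx);
	while (state != 1)		/* ...drain queued I/O here... */
		pthread_cond_wait(&cv, &mtx);
	state = 2;			/* acknowledge, like zv_state = 2 */
	pthread_cond_broadcast(&cv);
	pthread_mutex_unlock(&mtx);
	return (NULL);
}

int
main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, worker, NULL);
	pthread_mutex_lock(&mtx);
	state = 1;			/* request shutdown */
	pthread_cond_broadcast(&cv);
	while (state != 2)		/* wait for the acknowledgement */
		pthread_cond_wait(&cv, &mtx);
	pthread_mutex_unlock(&mtx);
	pthread_join(t, NULL);
	printf("worker stopped\n");
	return (0);
}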
+ */ + + count = acr + acw + ace; + if (count == 0) + return (0); + + flags = 0; + if (acr != 0 || ace != 0) + flags |= FREAD; + if (acw != 0) + flags |= FWRITE; + + g_topology_unlock(); + if (count > 0) + error = zvol_open(pp, flags, count); + else + error = zvol_close(pp, flags, -count); + g_topology_lock(); + return (error); +} + +static void +zvol_geom_start(struct bio *bp) +{ + zvol_state_t *zv; + boolean_t first; + + zv = bp->bio_to->private; + ASSERT(zv != NULL); + switch (bp->bio_cmd) { + case BIO_FLUSH: + if (!THREAD_CAN_SLEEP()) + goto enqueue; + zil_commit(zv->zv_zilog, ZVOL_OBJ); + g_io_deliver(bp, 0); + break; + case BIO_READ: + case BIO_WRITE: + case BIO_DELETE: + if (!THREAD_CAN_SLEEP()) + goto enqueue; + zvol_strategy(bp); + break; + case BIO_GETATTR: { + spa_t *spa = dmu_objset_spa(zv->zv_objset); + uint64_t refd, avail, usedobjs, availobjs, val; + + if (g_handleattr_int(bp, "GEOM::candelete", 1)) + return; + if (strcmp(bp->bio_attribute, "blocksavail") == 0) { + dmu_objset_space(zv->zv_objset, &refd, &avail, + &usedobjs, &availobjs); + if (g_handleattr_off_t(bp, "blocksavail", + avail / DEV_BSIZE)) + return; + } else if (strcmp(bp->bio_attribute, "blocksused") == 0) { + dmu_objset_space(zv->zv_objset, &refd, &avail, + &usedobjs, &availobjs); + if (g_handleattr_off_t(bp, "blocksused", + refd / DEV_BSIZE)) + return; + } else if (strcmp(bp->bio_attribute, "poolblocksavail") == 0) { + avail = metaslab_class_get_space(spa_normal_class(spa)); + avail -= metaslab_class_get_alloc(spa_normal_class(spa)); + if (g_handleattr_off_t(bp, "poolblocksavail", + avail / DEV_BSIZE)) + return; + } else if (strcmp(bp->bio_attribute, "poolblocksused") == 0) { + refd = metaslab_class_get_alloc(spa_normal_class(spa)); + if (g_handleattr_off_t(bp, "poolblocksused", + refd / DEV_BSIZE)) + return; + } + /* FALLTHROUGH */ + } + default: + g_io_deliver(bp, EOPNOTSUPP); + break; + } + return; + +enqueue: + mtx_lock(&zv->zv_queue_mtx); + first = (bioq_first(&zv->zv_queue) == NULL); + bioq_insert_tail(&zv->zv_queue, bp); + mtx_unlock(&zv->zv_queue_mtx); + if (first) + wakeup_one(&zv->zv_queue); +} + +static void +zvol_geom_worker(void *arg) +{ + zvol_state_t *zv; + struct bio *bp; + + thread_lock(curthread); + sched_prio(curthread, PRIBIO); + thread_unlock(curthread); + + zv = arg; + for (;;) { + mtx_lock(&zv->zv_queue_mtx); + bp = bioq_takefirst(&zv->zv_queue); + if (bp == NULL) { + if (zv->zv_state == 1) { + zv->zv_state = 2; + wakeup(&zv->zv_state); + mtx_unlock(&zv->zv_queue_mtx); + kthread_exit(); + } + msleep(&zv->zv_queue, &zv->zv_queue_mtx, PRIBIO | PDROP, + "zvol:io", 0); + continue; + } + mtx_unlock(&zv->zv_queue_mtx); + switch (bp->bio_cmd) { + case BIO_FLUSH: + zil_commit(zv->zv_zilog, ZVOL_OBJ); + g_io_deliver(bp, 0); + break; + case BIO_READ: + case BIO_WRITE: + case BIO_DELETE: + zvol_strategy(bp); + break; + default: + g_io_deliver(bp, EOPNOTSUPP); + break; + } + } +} + +extern boolean_t dataset_name_hidden(const char *name); + +static int +zvol_create_snapshots(objset_t *os, const char *name) +{ + uint64_t cookie, obj; + char *sname; + int error, len; + + cookie = obj = 0; + sname = kmem_alloc(MAXPATHLEN, KM_SLEEP); + +#if 0 + (void) dmu_objset_find(name, dmu_objset_prefetch, NULL, + DS_FIND_SNAPSHOTS); +#endif + + for (;;) { + len = snprintf(sname, MAXPATHLEN, "%s@", name); + if (len >= MAXPATHLEN) { + dmu_objset_rele(os, FTAG); + error = ENAMETOOLONG; + break; + } + + dsl_pool_config_enter(dmu_objset_pool(os), FTAG); + error = dmu_snapshot_list_next(os, MAXPATHLEN - len, + sname + 
len, &obj, &cookie, NULL); + dsl_pool_config_exit(dmu_objset_pool(os), FTAG); + if (error != 0) { + if (error == ENOENT) + error = 0; + break; + } + + error = zvol_create_minor(sname); + if (error != 0 && error != EEXIST) { + printf("ZFS WARNING: Unable to create ZVOL %s (error=%d).\n", + sname, error); + break; + } + } + + kmem_free(sname, MAXPATHLEN); + return (error); +} + +int +zvol_create_minors(const char *name) +{ + uint64_t cookie; + objset_t *os; + char *osname, *p; + int error, len; + + if (dataset_name_hidden(name)) + return (0); + + if ((error = dmu_objset_hold(name, FTAG, &os)) != 0) { + printf("ZFS WARNING: Unable to put hold on %s (error=%d).\n", + name, error); + return (error); + } + if (dmu_objset_type(os) == DMU_OST_ZVOL) { + dsl_dataset_long_hold(os->os_dsl_dataset, FTAG); + dsl_pool_rele(dmu_objset_pool(os), FTAG); + error = zvol_create_minor(name); + if (error == 0 || error == EEXIST) { + error = zvol_create_snapshots(os, name); + } else { + printf("ZFS WARNING: Unable to create ZVOL %s (error=%d).\n", + name, error); + } + dsl_dataset_long_rele(os->os_dsl_dataset, FTAG); + dsl_dataset_rele(os->os_dsl_dataset, FTAG); + return (error); + } + if (dmu_objset_type(os) != DMU_OST_ZFS) { + dmu_objset_rele(os, FTAG); + return (0); + } + + osname = kmem_alloc(MAXPATHLEN, KM_SLEEP); + if (snprintf(osname, MAXPATHLEN, "%s/", name) >= MAXPATHLEN) { + dmu_objset_rele(os, FTAG); + kmem_free(osname, MAXPATHLEN); + return (ENOENT); + } + p = osname + strlen(osname); + len = MAXPATHLEN - (p - osname); + +#if 0 + /* Prefetch the datasets. */ + cookie = 0; + while (dmu_dir_list_next(os, len, p, NULL, &cookie) == 0) { + if (!dataset_name_hidden(osname)) + (void) dmu_objset_prefetch(osname, NULL); + } +#endif + + cookie = 0; + while (dmu_dir_list_next(os, MAXPATHLEN - (p - osname), p, NULL, + &cookie) == 0) { + dmu_objset_rele(os, FTAG); + (void)zvol_create_minors(osname); + if ((error = dmu_objset_hold(name, FTAG, &os)) != 0) { + printf("ZFS WARNING: Unable to put hold on %s (error=%d).\n", + name, error); + return (error); + } + } + + dmu_objset_rele(os, FTAG); + kmem_free(osname, MAXPATHLEN); + return (0); +} + +static void +zvol_rename_minor(zvol_state_t *zv, const char *newname) +{ + struct g_geom *gp; + struct g_provider *pp; + struct cdev *dev; + + ASSERT(MUTEX_HELD(&zfsdev_state_lock)); + + if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { + g_topology_lock(); + pp = zv->zv_provider; + ASSERT(pp != NULL); + gp = pp->geom; + ASSERT(gp != NULL); + + zv->zv_provider = NULL; + g_wither_provider(pp, ENXIO); + + pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname); + pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND; + pp->sectorsize = DEV_BSIZE; + pp->mediasize = zv->zv_volsize; + pp->private = zv; + zv->zv_provider = pp; + g_error_provider(pp, 0); + g_topology_unlock(); + } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) { + struct make_dev_args args; + + if ((dev = zv->zv_dev) != NULL) { + zv->zv_dev = NULL; + destroy_dev(dev); + if (zv->zv_total_opens > 0) { + zv->zv_flags &= ~ZVOL_EXCL; + zv->zv_total_opens = 0; + zvol_last_close(zv); + } + } + + make_dev_args_init(&args); + args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK; + args.mda_devsw = &zvol_cdevsw; + args.mda_cr = NULL; + args.mda_uid = UID_ROOT; + args.mda_gid = GID_OPERATOR; + args.mda_mode = 0640; + args.mda_si_drv2 = zv; + if (make_dev_s(&args, &zv->zv_dev, + "%s/%s", ZVOL_DRIVER, newname) == 0) + zv->zv_dev->si_iosize_max = MAXPHYS; + } + strlcpy(zv->zv_name, newname, sizeof(zv->zv_name)); +} + +void 
+zvol_rename_minors(const char *oldname, const char *newname) +{ + char name[MAXPATHLEN]; + struct g_provider *pp; + struct g_geom *gp; + size_t oldnamelen, newnamelen; + zvol_state_t *zv; + char *namebuf; + boolean_t locked = B_FALSE; + + oldnamelen = strlen(oldname); + newnamelen = strlen(newname); + + DROP_GIANT(); + /* See comment in zvol_open(). */ + if (!MUTEX_HELD(&zfsdev_state_lock)) { + mutex_enter(&zfsdev_state_lock); + locked = B_TRUE; + } + + LIST_FOREACH(zv, &all_zvols, zv_links) { + if (strcmp(zv->zv_name, oldname) == 0) { + zvol_rename_minor(zv, newname); + } else if (strncmp(zv->zv_name, oldname, oldnamelen) == 0 && + (zv->zv_name[oldnamelen] == '/' || + zv->zv_name[oldnamelen] == '@')) { + snprintf(name, sizeof(name), "%s%c%s", newname, + zv->zv_name[oldnamelen], + zv->zv_name + oldnamelen + 1); + zvol_rename_minor(zv, name); + } + } + + if (locked) + mutex_exit(&zfsdev_state_lock); + PICKUP_GIANT(); +} + +static int +zvol_d_open(struct cdev *dev, int flags, int fmt, struct thread *td) +{ + zvol_state_t *zv = dev->si_drv2; + int err = 0; + + mutex_enter(&zfsdev_state_lock); + if (zv->zv_total_opens == 0) + err = zvol_first_open(zv); + if (err) { + mutex_exit(&zfsdev_state_lock); + return (err); + } + if ((flags & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) { + err = SET_ERROR(EROFS); + goto out; + } + if (zv->zv_flags & ZVOL_EXCL) { + err = SET_ERROR(EBUSY); + goto out; + } +#ifdef FEXCL + if (flags & FEXCL) { + if (zv->zv_total_opens != 0) { + err = SET_ERROR(EBUSY); + goto out; + } + zv->zv_flags |= ZVOL_EXCL; + } +#endif + + zv->zv_total_opens++; + if (flags & (FSYNC | FDSYNC)) { + zv->zv_sync_cnt++; + if (zv->zv_sync_cnt == 1) + zil_async_to_sync(zv->zv_zilog, ZVOL_OBJ); + } + mutex_exit(&zfsdev_state_lock); + return (err); +out: + if (zv->zv_total_opens == 0) + zvol_last_close(zv); + mutex_exit(&zfsdev_state_lock); + return (err); +} + +static int +zvol_d_close(struct cdev *dev, int flags, int fmt, struct thread *td) +{ + zvol_state_t *zv = dev->si_drv2; + + mutex_enter(&zfsdev_state_lock); + if (zv->zv_flags & ZVOL_EXCL) { + ASSERT(zv->zv_total_opens == 1); + zv->zv_flags &= ~ZVOL_EXCL; + } + + /* + * If the open count is zero, this is a spurious close. + * That indicates a bug in the kernel / DDI framework. + */ + ASSERT(zv->zv_total_opens != 0); + + /* + * You may get multiple opens, but only one close. 
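zvol_rename_minor()/zvol_rename_minors() above rewrite device names after a dataset rename: an exact match is replaced outright, and descendants or snapshots, recognized as the old name followed by '/' or '@', keep their suffix, so a sibling like tank/volume is not caught by a tank/vol prefix. A userspace sketch of just the matching rule:

#include <stdio.h>
#include <string.h>

/*
 * Rewrite a minor name for a dataset rename: exact matches are
 * replaced, and children/snapshots (old name followed by '/' or '@')
 * keep their suffix; everything else is left alone.
 */
static void
rename_minor(char *name, size_t sz, const char *oldn, const char *newn)
{
	size_t oldlen = strlen(oldn);
	char buf[256];

	if (strcmp(name, oldn) == 0) {
		snprintf(name, sz, "%s", newn);
	} else if (strncmp(name, oldn, oldlen) == 0 &&
	    (name[oldlen] == '/' || name[oldlen] == '@')) {
		snprintf(buf, sizeof (buf), "%s%c%s", newn, name[oldlen],
		    name + oldlen + 1);
		snprintf(name, sz, "%s", buf);
	}
}

int
main(void)
{
	char a[256] = "tank/vol@snap1";
	char b[256] = "tank/volume";	/* shares a prefix, must not match */

	rename_minor(a, sizeof (a), "tank/vol", "tank/newvol");
	rename_minor(b, sizeof (b), "tank/vol", "tank/newvol");
	printf("%s\n%s\n", a, b);	/* tank/newvol@snap1, tank/volume */
	return (0);
}

The separator check is what makes the rename safe: without it, renaming tank/vol would corrupt every dataset that merely starts with those characters.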
+ */ + zv->zv_total_opens--; + if (flags & (FSYNC | FDSYNC)) + zv->zv_sync_cnt--; + + if (zv->zv_total_opens == 0) + zvol_last_close(zv); + + mutex_exit(&zfsdev_state_lock); + return (0); +} + +static int +zvol_d_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td) +{ + zvol_state_t *zv; + rl_t *rl; + off_t offset, length; + int i, error; + boolean_t sync; + + zv = dev->si_drv2; + + error = 0; + KASSERT(zv->zv_total_opens > 0, + ("Device with zero access count in zvol_d_ioctl")); + + i = IOCPARM_LEN(cmd); + switch (cmd) { + case DIOCGSECTORSIZE: + *(u_int *)data = DEV_BSIZE; + break; + case DIOCGMEDIASIZE: + *(off_t *)data = zv->zv_volsize; + break; + case DIOCGFLUSH: + zil_commit(zv->zv_zilog, ZVOL_OBJ); + break; + case DIOCGDELETE: + if (!zvol_unmap_enabled) + break; + + offset = ((off_t *)data)[0]; + length = ((off_t *)data)[1]; + if ((offset % DEV_BSIZE) != 0 || (length % DEV_BSIZE) != 0 || + offset < 0 || offset >= zv->zv_volsize || + length <= 0) { + printf("%s: offset=%jd length=%jd\n", __func__, offset, + length); + error = EINVAL; + break; + } + + rl = zfs_range_lock(&zv->zv_znode, offset, length, RL_WRITER); + dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error != 0) { + sync = FALSE; + dmu_tx_abort(tx); + } else { + sync = (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS); + zvol_log_truncate(zv, tx, offset, length, sync); + dmu_tx_commit(tx); + error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, + offset, length); + } + zfs_range_unlock(rl); + if (sync) + zil_commit(zv->zv_zilog, ZVOL_OBJ); + break; + case DIOCGSTRIPESIZE: + *(off_t *)data = zv->zv_volblocksize; + break; + case DIOCGSTRIPEOFFSET: + *(off_t *)data = 0; + break; + case DIOCGATTR: { + spa_t *spa = dmu_objset_spa(zv->zv_objset); + struct diocgattr_arg *arg = (struct diocgattr_arg *)data; + uint64_t refd, avail, usedobjs, availobjs; + + if (strcmp(arg->name, "GEOM::candelete") == 0) + arg->value.i = 1; + else if (strcmp(arg->name, "blocksavail") == 0) { + dmu_objset_space(zv->zv_objset, &refd, &avail, + &usedobjs, &availobjs); + arg->value.off = avail / DEV_BSIZE; + } else if (strcmp(arg->name, "blocksused") == 0) { + dmu_objset_space(zv->zv_objset, &refd, &avail, + &usedobjs, &availobjs); + arg->value.off = refd / DEV_BSIZE; + } else if (strcmp(arg->name, "poolblocksavail") == 0) { + avail = metaslab_class_get_space(spa_normal_class(spa)); + avail -= metaslab_class_get_alloc(spa_normal_class(spa)); + arg->value.off = avail / DEV_BSIZE; + } else if (strcmp(arg->name, "poolblocksused") == 0) { + refd = metaslab_class_get_alloc(spa_normal_class(spa)); + arg->value.off = refd / DEV_BSIZE; + } else + error = ENOIOCTL; + break; + } + case FIOSEEKHOLE: + case FIOSEEKDATA: { + off_t *off = (off_t *)data; + uint64_t noff; + boolean_t hole; + + hole = (cmd == FIOSEEKHOLE); + noff = *off; + error = dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff); + *off = noff; + break; + } + default: + error = ENOIOCTL; + } + + return (error); +} +#endif /* illumos */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/os/callb.c b/sys/cddl/contrib/opensolaris/uts/common/os/callb.c new file mode 100644 index 000000000000..da479087f869 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/os/callb.c @@ -0,0 +1,438 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/time.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <sys/mutex.h> +#include <sys/condvar.h> +#include <sys/callb.h> +#include <sys/kmem.h> +#include <sys/cmn_err.h> +#include <sys/debug.h> +#include <sys/kobj.h> +#include <sys/systm.h> /* for delay() */ +#include <sys/taskq.h> /* For TASKQ_NAMELEN */ +#include <sys/kernel.h> + +#define CB_MAXNAME TASKQ_NAMELEN + +/* + * The callb mechanism provides generic event scheduling/echoing. + * A callb function is registered and called on behalf of the event. + */ +typedef struct callb { + struct callb *c_next; /* next in class or on freelist */ + kthread_id_t c_thread; /* ptr to caller's thread struct */ + char c_flag; /* info about the callb state */ + uchar_t c_class; /* this callb's class */ + kcondvar_t c_done_cv; /* signal callb completion */ + boolean_t (*c_func)(); /* cb function: returns true if ok */ + void *c_arg; /* arg to c_func */ + char c_name[CB_MAXNAME+1]; /* debug:max func name length */ +} callb_t; + +/* + * callb c_flag bitmap definitions + */ +#define CALLB_FREE 0x0 +#define CALLB_TAKEN 0x1 +#define CALLB_EXECUTING 0x2 + +/* + * Basic structure for a callb table. + * All callbs are organized into different class groups described + * by ct_class array. + * The callbs within a class are single-linked and normally run by a + * serial execution. + */ +typedef struct callb_table { + kmutex_t ct_lock; /* protect all callb states */ + callb_t *ct_freelist; /* free callb structures */ + int ct_busy; /* != 0 prevents additions */ + kcondvar_t ct_busy_cv; /* to wait for not busy */ + int ct_ncallb; /* num of callbs allocated */ + callb_t *ct_first_cb[NCBCLASS]; /* ptr to 1st callb in a class */ +} callb_table_t; + +int callb_timeout_sec = CPR_KTHREAD_TIMEOUT_SEC; + +static callb_id_t callb_add_common(boolean_t (*)(void *, int), + void *, int, char *, kthread_id_t); + +static callb_table_t callb_table; /* system level callback table */ +static callb_table_t *ct = &callb_table; +static kmutex_t callb_safe_mutex; +callb_cpr_t callb_cprinfo_safe = { + &callb_safe_mutex, CALLB_CPR_ALWAYS_SAFE, 0, 0, 0 }; + +/* + * Init all callb tables in the system. + */ +void +callb_init(void *dummy __unused) +{ + callb_table.ct_busy = 0; /* mark table open for additions */ + mutex_init(&callb_safe_mutex, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&callb_table.ct_lock, NULL, MUTEX_DEFAULT, NULL); +} + +void +callb_fini(void *dummy __unused) +{ + callb_t *cp; + int i; + + mutex_enter(&ct->ct_lock); + for (i = 0; i < 16; i++) { + while ((cp = ct->ct_freelist) != NULL) { + ct->ct_freelist = cp->c_next; + ct->ct_ncallb--; + kmem_free(cp, sizeof (callb_t)); + } + if (ct->ct_ncallb == 0) + break; + /* Not all callbacks finished, waiting for the rest. 
*/
+ mutex_exit(&ct->ct_lock);
+ tsleep(ct, 0, "callb", hz / 4);
+ mutex_enter(&ct->ct_lock);
+ }
+ if (ct->ct_ncallb > 0)
+ printf("%s: Leaked %d callbacks!\n", __func__, ct->ct_ncallb);
+ mutex_exit(&ct->ct_lock);
+ mutex_destroy(&callb_safe_mutex);
+ mutex_destroy(&callb_table.ct_lock);
+}
+
+/*
+ * callb_add() is called to register func() to be called later.
+ */
+static callb_id_t
+callb_add_common(boolean_t (*func)(void *arg, int code),
+ void *arg, int class, char *name, kthread_id_t t)
+{
+ callb_t *cp;
+
+ ASSERT(class < NCBCLASS);
+
+ mutex_enter(&ct->ct_lock);
+ while (ct->ct_busy)
+ cv_wait(&ct->ct_busy_cv, &ct->ct_lock);
+ if ((cp = ct->ct_freelist) == NULL) {
+ ct->ct_ncallb++;
+ cp = (callb_t *)kmem_zalloc(sizeof (callb_t), KM_SLEEP);
+ }
+ ct->ct_freelist = cp->c_next;
+ cp->c_thread = t;
+ cp->c_func = func;
+ cp->c_arg = arg;
+ cp->c_class = (uchar_t)class;
+ cp->c_flag |= CALLB_TAKEN;
+#ifdef DEBUG
+ if (strlen(name) > CB_MAXNAME)
+ cmn_err(CE_WARN, "callb_add: name of callback function '%s' "
+ "too long -- truncated to %d chars",
+ name, CB_MAXNAME);
+#endif
+ (void) strncpy(cp->c_name, name, CB_MAXNAME);
+ cp->c_name[CB_MAXNAME] = '\0';
+
+ /*
+ * Insert the new callb at the head of its class list.
+ */
+ cp->c_next = ct->ct_first_cb[class];
+ ct->ct_first_cb[class] = cp;
+
+ mutex_exit(&ct->ct_lock);
+ return ((callb_id_t)cp);
+}
+
+/*
+ * The default function to add an entry to the callback table. Since
+ * it uses curthread as the thread identifier to store in the table,
+ * it should be used for the normal case of a thread which is calling
+ * to add ITSELF to the table.
+ */
+callb_id_t
+callb_add(boolean_t (*func)(void *arg, int code),
+ void *arg, int class, char *name)
+{
+ return (callb_add_common(func, arg, class, name, curthread));
+}
+
+/*
+ * A special version of callb_add() above for use by threads which
+ * might be adding an entry to the table on behalf of some other
+ * thread (for example, one which is constructed but not yet running).
+ * In this version the thread id is an argument.
+ */
+callb_id_t
+callb_add_thread(boolean_t (*func)(void *arg, int code),
+ void *arg, int class, char *name, kthread_id_t t)
+{
+ return (callb_add_common(func, arg, class, name, t));
+}
+
+/*
+ * callb_delete() is called to remove an entry identified by id
+ * that was originally placed there by a call to callb_add().
+ * Returns -1 on failure to delete a callb entry; otherwise returns 0.
+ */
+int
+callb_delete(callb_id_t id)
+{
+ callb_t **pp;
+ callb_t *me = (callb_t *)id;
+
+ mutex_enter(&ct->ct_lock);
+
+ for (;;) {
+ pp = &ct->ct_first_cb[me->c_class];
+ while (*pp != NULL && *pp != me)
+ pp = &(*pp)->c_next;
+
+#ifdef DEBUG
+ if (*pp != me) {
+ cmn_err(CE_WARN, "callb delete bogus entry 0x%p",
+ (void *)me);
+ mutex_exit(&ct->ct_lock);
+ return (-1);
+ }
+#endif /* DEBUG */
+
+ /*
+ * It is not allowed to delete a callb in the middle of
+ * executing; otherwise callb_execute() will be confused.
+ */
+ if (!(me->c_flag & CALLB_EXECUTING))
+ break;
+
+ cv_wait(&me->c_done_cv, &ct->ct_lock);
+ }
+ /* relink the class list */
+ *pp = me->c_next;
+
+ /* clean up myself and return the free callb to the head of freelist */
+ me->c_flag = CALLB_FREE;
+ me->c_next = ct->ct_freelist;
+ ct->ct_freelist = me;
+
+ mutex_exit(&ct->ct_lock);
+ return (0);
+}
+
+/*
+ * class: indicates to execute all callbs in the same class;
+ * code: optional argument for the callb functions.
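callb_add_common() and callb_delete() above keep one singly linked list per class plus a freelist of recycled entries, and callb_execute_class() below walks a class until a callback reports failure, returning that callback's name. A compact userspace model of the registration and the walk (locking, the freelist, and the executing-flag handshake omitted; names illustrative):

#include <stdio.h>

#define NCLASS	4

struct cb {
	struct cb *next;
	int (*func)(void *, int);	/* returns nonzero on success */
	void *arg;
	const char *name;
};

static struct cb *classes[NCLASS];

static void
cb_add(struct cb *cp, int class)
{
	cp->next = classes[class];	/* head insertion, as above */
	classes[class] = cp;
}

/* Returns NULL on success, else the name of the callback that failed. */
static const char *
cb_execute_class(int class, int code)
{
	for (struct cb *cp = classes[class]; cp != NULL; cp = cp->next)
		if (!cp->func(cp->arg, code))
			return (cp->name);
	return (NULL);
}

static int ok(void *a, int c) { (void)a; (void)c; return (1); }
static int bad(void *a, int c) { (void)a; (void)c; return (0); }

int
main(void)
{
	struct cb c1 = { NULL, ok, NULL, "ok-cb" };
	struct cb c2 = { NULL, bad, NULL, "bad-cb" };

	cb_add(&c1, 2);
	cb_add(&c2, 2);
	printf("%s\n", cb_execute_class(2, 0));	/* bad-cb */
	return (0);
}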
+ * return: = 0: success + * != 0: ptr to string supplied when callback was registered + */ +void * +callb_execute_class(int class, int code) +{ + callb_t *cp; + void *ret = NULL; + + ASSERT(class < NCBCLASS); + + mutex_enter(&ct->ct_lock); + + for (cp = ct->ct_first_cb[class]; + cp != NULL && ret == 0; cp = cp->c_next) { + while (cp->c_flag & CALLB_EXECUTING) + cv_wait(&cp->c_done_cv, &ct->ct_lock); + /* + * cont if the callb is deleted while we're sleeping + */ + if (cp->c_flag == CALLB_FREE) + continue; + cp->c_flag |= CALLB_EXECUTING; + +#ifdef CALLB_DEBUG + printf("callb_execute: name=%s func=%p arg=%p\n", + cp->c_name, (void *)cp->c_func, (void *)cp->c_arg); +#endif /* CALLB_DEBUG */ + + mutex_exit(&ct->ct_lock); + /* If callback function fails, pass back client's name */ + if (!(*cp->c_func)(cp->c_arg, code)) + ret = cp->c_name; + mutex_enter(&ct->ct_lock); + + cp->c_flag &= ~CALLB_EXECUTING; + cv_broadcast(&cp->c_done_cv); + } + mutex_exit(&ct->ct_lock); + return (ret); +} + +/* + * callers make sure no recursive entries to this func. + * dp->cc_lockp is registered by callb_add to protect callb_cpr_t structure. + * + * When calling to stop a kernel thread (code == CB_CODE_CPR_CHKPT) we + * use a cv_timedwait() in case the kernel thread is blocked. + * + * Note that this is a generic callback handler for daemon CPR and + * should NOT be changed to accommodate any specific requirement in a daemon. + * Individual daemons that require changes to the handler shall write + * callback routines in their own daemon modules. + */ +boolean_t +callb_generic_cpr(void *arg, int code) +{ + callb_cpr_t *cp = (callb_cpr_t *)arg; + clock_t ret = 0; /* assume success */ + + mutex_enter(cp->cc_lockp); + + switch (code) { + case CB_CODE_CPR_CHKPT: + cp->cc_events |= CALLB_CPR_START; +#ifdef CPR_NOT_THREAD_SAFE + while (!(cp->cc_events & CALLB_CPR_SAFE)) + /* cv_timedwait() returns -1 if it times out. */ + if ((ret = cv_reltimedwait(&cp->cc_callb_cv, + cp->cc_lockp, (callb_timeout_sec * hz), + TR_CLOCK_TICK)) == -1) + break; +#endif + break; + + case CB_CODE_CPR_RESUME: + cp->cc_events &= ~CALLB_CPR_START; + cv_signal(&cp->cc_stop_cv); + break; + } + mutex_exit(cp->cc_lockp); + return (ret != -1); +} + +/* + * The generic callback function associated with kernel threads which + * are always considered safe. + */ +/* ARGSUSED */ +boolean_t +callb_generic_cpr_safe(void *arg, int code) +{ + return (B_TRUE); +} +/* + * Prevent additions to callback table. + */ +void +callb_lock_table(void) +{ + mutex_enter(&ct->ct_lock); + ASSERT(ct->ct_busy == 0); + ct->ct_busy = 1; + mutex_exit(&ct->ct_lock); +} + +/* + * Allow additions to callback table. + */ +void +callb_unlock_table(void) +{ + mutex_enter(&ct->ct_lock); + ASSERT(ct->ct_busy != 0); + ct->ct_busy = 0; + cv_broadcast(&ct->ct_busy_cv); + mutex_exit(&ct->ct_lock); +} + +#ifdef illumos +/* + * Return a boolean value indicating whether a particular kernel thread is + * stopped in accordance with the cpr callback protocol. If returning + * false, also return a pointer to the thread name via the 2nd argument. + */ +boolean_t +callb_is_stopped(kthread_id_t tp, caddr_t *thread_name) +{ + callb_t *cp; + boolean_t ret_val; + + mutex_enter(&ct->ct_lock); + + for (cp = ct->ct_first_cb[CB_CL_CPR_DAEMON]; + cp != NULL && tp != cp->c_thread; cp = cp->c_next) + ; + + ret_val = (cp != NULL); + if (ret_val) { + /* + * We found the thread in the callback table and have + * provisionally set the return value to true. 
Now + * see if it is marked "safe" and is sleeping or stopped. + */ + callb_cpr_t *ccp = (callb_cpr_t *)cp->c_arg; + + *thread_name = cp->c_name; /* in case not stopped */ + mutex_enter(ccp->cc_lockp); + + if (ccp->cc_events & CALLB_CPR_SAFE) { + int retry; + + mutex_exit(ccp->cc_lockp); + for (retry = 0; retry < CALLB_MAX_RETRY; retry++) { + thread_lock(tp); + if (tp->t_state & (TS_SLEEP | TS_STOPPED)) { + thread_unlock(tp); + break; + } + thread_unlock(tp); + delay(CALLB_THREAD_DELAY); + } + ret_val = retry < CALLB_MAX_RETRY; + } else { + ret_val = + (ccp->cc_events & CALLB_CPR_ALWAYS_SAFE) != 0; + mutex_exit(ccp->cc_lockp); + } + } else { + /* + * Thread not found in callback table. Make the best + * attempt to identify the thread in the error message. + */ + ulong_t offset; + char *sym = kobj_getsymname((uintptr_t)tp->t_startpc, + &offset); + + *thread_name = sym ? sym : "*unknown*"; + } + + mutex_exit(&ct->ct_lock); + return (ret_val); +} +#endif /* illumos */ + +SYSINIT(sol_callb, SI_SUB_DRIVERS, SI_ORDER_FIRST, callb_init, NULL); +SYSUNINIT(sol_callb, SI_SUB_DRIVERS, SI_ORDER_FIRST, callb_fini, NULL); diff --git a/sys/cddl/contrib/opensolaris/uts/common/os/fm.c b/sys/cddl/contrib/opensolaris/uts/common/os/fm.c new file mode 100644 index 000000000000..57b23356b12d --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/os/fm.c @@ -0,0 +1,1399 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +/* + * Fault Management Architecture (FMA) Resource and Protocol Support + * + * The routines contained herein provide services to support kernel subsystems + * in publishing fault management telemetry (see PSARC 2002/412 and 2003/089). + * + * Name-Value Pair Lists + * + * The embodiment of an FMA protocol element (event, fmri or authority) is a + * name-value pair list (nvlist_t). FMA-specific nvlist construtor and + * destructor functions, fm_nvlist_create() and fm_nvlist_destroy(), are used + * to create an nvpair list using custom allocators. Callers may choose to + * allocate either from the kernel memory allocator, or from a preallocated + * buffer, useful in constrained contexts like high-level interrupt routines. + * + * Protocol Event and FMRI Construction + * + * Convenience routines are provided to construct nvlist events according to + * the FMA Event Protocol and Naming Schema specification for ereports and + * FMRIs for the dev, cpu, hc, mem, legacy hc and de schemes. + * + * ENA Manipulation + * + * Routines to generate ENA formats 0, 1 and 2 are available as well as + * routines to increment formats 1 and 2. 
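An ENA is one packed 64-bit value from which the format, generation, and timestamp fields can be pulled back out, as the fm_ena_*_get() accessors named just below do. The field widths in this sketch are hypothetical, chosen only to show the packing pattern; the real layout is defined by the FMA headers:

#include <stdint.h>
#include <stdio.h>

/* Hypothetical ENA-style layout: 2-bit format, 10-bit gen, timestamp above. */
#define ENA_FORMAT_MASK		0x3ULL
#define ENA_GEN_SHIFT		2
#define ENA_GEN_MASK		0x3ffULL
#define ENA_TIME_SHIFT		12

static uint64_t
ena_make(uint64_t format, uint64_t gen, uint64_t time)
{
	return ((format & ENA_FORMAT_MASK) |
	    ((gen & ENA_GEN_MASK) << ENA_GEN_SHIFT) |
	    (time << ENA_TIME_SHIFT));
}

int
main(void)
{
	uint64_t ena = ena_make(1, 7, 0xdeadULL);

	/* The matching extractors just mask and shift the fields back out. */
	printf("format=%llu gen=%llu time=%llx\n",
	    (unsigned long long)(ena & ENA_FORMAT_MASK),
	    (unsigned long long)((ena >> ENA_GEN_SHIFT) & ENA_GEN_MASK),
	    (unsigned long long)(ena >> ENA_TIME_SHIFT));
	return (0);
}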
Individual fields within the + * ENA are extractable via fm_ena_time_get(), fm_ena_id_get(), + * fm_ena_format_get() and fm_ena_gen_get(). + */ + +#include <sys/types.h> +#include <sys/time.h> +#include <sys/sysevent.h> +#include <sys/nvpair.h> +#include <sys/cmn_err.h> +#include <sys/cpuvar.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/compress.h> +#include <sys/cpuvar.h> +#include <sys/kobj.h> +#include <sys/kstat.h> +#include <sys/processor.h> +#include <sys/pcpu.h> +#include <sys/sunddi.h> +#include <sys/systeminfo.h> +#include <sys/sysevent/eventdefs.h> +#include <sys/fm/util.h> +#include <sys/fm/protocol.h> + +/* + * URL and SUNW-MSG-ID value to display for fm_panic(), defined below. These + * values must be kept in sync with the FMA source code in usr/src/cmd/fm. + */ +static const char *fm_url = "http://www.sun.com/msg"; +static const char *fm_msgid = "SUNOS-8000-0G"; +static char *volatile fm_panicstr = NULL; + +#ifdef illumos +errorq_t *ereport_errorq; +#endif +void *ereport_dumpbuf; +size_t ereport_dumplen; + +static uint_t ereport_chanlen = ERPT_EVCH_MAX; +static evchan_t *ereport_chan = NULL; +static ulong_t ereport_qlen = 0; +static size_t ereport_size = 0; +static int ereport_cols = 80; + +extern void fastreboot_disable_highpil(void); + +/* + * Common fault management kstats to record ereport generation + * failures + */ + +struct erpt_kstat { + kstat_named_t erpt_dropped; /* num erpts dropped on post */ + kstat_named_t erpt_set_failed; /* num erpt set failures */ + kstat_named_t fmri_set_failed; /* num fmri set failures */ + kstat_named_t payload_set_failed; /* num payload set failures */ +}; + +static struct erpt_kstat erpt_kstat_data = { + { "erpt-dropped", KSTAT_DATA_UINT64 }, + { "erpt-set-failed", KSTAT_DATA_UINT64 }, + { "fmri-set-failed", KSTAT_DATA_UINT64 }, + { "payload-set-failed", KSTAT_DATA_UINT64 } +}; + +#ifdef illumos +/*ARGSUSED*/ +static void +fm_drain(void *private, void *data, errorq_elem_t *eep) +{ + nvlist_t *nvl = errorq_elem_nvl(ereport_errorq, eep); + + if (!panicstr) + (void) fm_ereport_post(nvl, EVCH_TRYHARD); + else + fm_nvprint(nvl); +} +#endif + +void +fm_init(void) +{ + kstat_t *ksp; + +#ifdef illumos + (void) sysevent_evc_bind(FM_ERROR_CHAN, + &ereport_chan, EVCH_CREAT | EVCH_HOLD_PEND); + + (void) sysevent_evc_control(ereport_chan, + EVCH_SET_CHAN_LEN, &ereport_chanlen); +#endif + + if (ereport_qlen == 0) + ereport_qlen = ERPT_MAX_ERRS * MAX(max_ncpus, 4); + + if (ereport_size == 0) + ereport_size = ERPT_DATA_SZ; + +#ifdef illumos + ereport_errorq = errorq_nvcreate("fm_ereport_queue", + (errorq_func_t)fm_drain, NULL, ereport_qlen, ereport_size, + FM_ERR_PIL, ERRORQ_VITAL); + if (ereport_errorq == NULL) + panic("failed to create required ereport error queue"); +#endif + + ereport_dumpbuf = kmem_alloc(ereport_size, KM_SLEEP); + ereport_dumplen = ereport_size; + + /* Initialize ereport allocation and generation kstats */ + ksp = kstat_create("unix", 0, "fm", "misc", KSTAT_TYPE_NAMED, + sizeof (struct erpt_kstat) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + + if (ksp != NULL) { + ksp->ks_data = &erpt_kstat_data; + kstat_install(ksp); + } else { + cmn_err(CE_NOTE, "failed to create fm/misc kstat\n"); + + } +} + +#ifdef illumos +/* + * Formatting utility function for fm_nvprintr. We attempt to wrap chunks of + * output so they aren't split across console lines, and return the end column. + */ +/*PRINTFLIKE4*/ +static int +fm_printf(int depth, int c, int cols, const char *format, ...) 
+{ + va_list ap; + int width; + char c1; + + va_start(ap, format); + width = vsnprintf(&c1, sizeof (c1), format, ap); + va_end(ap); + + if (c + width >= cols) { + console_printf("\n\r"); + c = 0; + if (format[0] != ' ' && depth > 0) { + console_printf(" "); + c++; + } + } + + va_start(ap, format); + console_vprintf(format, ap); + va_end(ap); + + return ((c + width) % cols); +} + +/* + * Recursively print a nvlist in the specified column width and return the + * column we end up in. This function is called recursively by fm_nvprint(), + * below. We generically format the entire nvpair using hexadecimal + * integers and strings, and elide any integer arrays. Arrays are basically + * used for cache dumps right now, so we suppress them so as not to overwhelm + * the amount of console output we produce at panic time. This can be further + * enhanced as FMA technology grows based upon the needs of consumers. All + * FMA telemetry is logged using the dump device transport, so the console + * output serves only as a fallback in case this procedure is unsuccessful. + */ +static int +fm_nvprintr(nvlist_t *nvl, int d, int c, int cols) +{ + nvpair_t *nvp; + + for (nvp = nvlist_next_nvpair(nvl, NULL); + nvp != NULL; nvp = nvlist_next_nvpair(nvl, nvp)) { + + data_type_t type = nvpair_type(nvp); + const char *name = nvpair_name(nvp); + + boolean_t b; + uint8_t i8; + uint16_t i16; + uint32_t i32; + uint64_t i64; + char *str; + nvlist_t *cnv; + + if (strcmp(name, FM_CLASS) == 0) + continue; /* already printed by caller */ + + c = fm_printf(d, c, cols, " %s=", name); + + switch (type) { + case DATA_TYPE_BOOLEAN: + c = fm_printf(d + 1, c, cols, " 1"); + break; + + case DATA_TYPE_BOOLEAN_VALUE: + (void) nvpair_value_boolean_value(nvp, &b); + c = fm_printf(d + 1, c, cols, b ? "1" : "0"); + break; + + case DATA_TYPE_BYTE: + (void) nvpair_value_byte(nvp, &i8); + c = fm_printf(d + 1, c, cols, "%x", i8); + break; + + case DATA_TYPE_INT8: + (void) nvpair_value_int8(nvp, (void *)&i8); + c = fm_printf(d + 1, c, cols, "%x", i8); + break; + + case DATA_TYPE_UINT8: + (void) nvpair_value_uint8(nvp, &i8); + c = fm_printf(d + 1, c, cols, "%x", i8); + break; + + case DATA_TYPE_INT16: + (void) nvpair_value_int16(nvp, (void *)&i16); + c = fm_printf(d + 1, c, cols, "%x", i16); + break; + + case DATA_TYPE_UINT16: + (void) nvpair_value_uint16(nvp, &i16); + c = fm_printf(d + 1, c, cols, "%x", i16); + break; + + case DATA_TYPE_INT32: + (void) nvpair_value_int32(nvp, (void *)&i32); + c = fm_printf(d + 1, c, cols, "%x", i32); + break; + + case DATA_TYPE_UINT32: + (void) nvpair_value_uint32(nvp, &i32); + c = fm_printf(d + 1, c, cols, "%x", i32); + break; + + case DATA_TYPE_INT64: + (void) nvpair_value_int64(nvp, (void *)&i64); + c = fm_printf(d + 1, c, cols, "%llx", + (u_longlong_t)i64); + break; + + case DATA_TYPE_UINT64: + (void) nvpair_value_uint64(nvp, &i64); + c = fm_printf(d + 1, c, cols, "%llx", + (u_longlong_t)i64); + break; + + case DATA_TYPE_HRTIME: + (void) nvpair_value_hrtime(nvp, (void *)&i64); + c = fm_printf(d + 1, c, cols, "%llx", + (u_longlong_t)i64); + break; + + case DATA_TYPE_STRING: + (void) nvpair_value_string(nvp, &str); + c = fm_printf(d + 1, c, cols, "\"%s\"", + str ? 
str : "<NULL>"); + break; + + case DATA_TYPE_NVLIST: + c = fm_printf(d + 1, c, cols, "["); + (void) nvpair_value_nvlist(nvp, &cnv); + c = fm_nvprintr(cnv, d + 1, c, cols); + c = fm_printf(d + 1, c, cols, " ]"); + break; + + case DATA_TYPE_NVLIST_ARRAY: { + nvlist_t **val; + uint_t i, nelem; + + c = fm_printf(d + 1, c, cols, "["); + (void) nvpair_value_nvlist_array(nvp, &val, &nelem); + for (i = 0; i < nelem; i++) { + c = fm_nvprintr(val[i], d + 1, c, cols); + } + c = fm_printf(d + 1, c, cols, " ]"); + } + break; + + case DATA_TYPE_BOOLEAN_ARRAY: + case DATA_TYPE_BYTE_ARRAY: + case DATA_TYPE_INT8_ARRAY: + case DATA_TYPE_UINT8_ARRAY: + case DATA_TYPE_INT16_ARRAY: + case DATA_TYPE_UINT16_ARRAY: + case DATA_TYPE_INT32_ARRAY: + case DATA_TYPE_UINT32_ARRAY: + case DATA_TYPE_INT64_ARRAY: + case DATA_TYPE_UINT64_ARRAY: + case DATA_TYPE_STRING_ARRAY: + c = fm_printf(d + 1, c, cols, "[...]"); + break; + case DATA_TYPE_UNKNOWN: + c = fm_printf(d + 1, c, cols, "<unknown>"); + break; + } + } + + return (c); +} + +void +fm_nvprint(nvlist_t *nvl) +{ + char *class; + int c = 0; + + console_printf("\r"); + + if (nvlist_lookup_string(nvl, FM_CLASS, &class) == 0) + c = fm_printf(0, c, ereport_cols, "%s", class); + + if (fm_nvprintr(nvl, 0, c, ereport_cols) != 0) + console_printf("\n"); + + console_printf("\n"); +} + +/* + * Wrapper for panic() that first produces an FMA-style message for admins. + * Normally such messages are generated by fmd(1M)'s syslog-msgs agent: this + * is the one exception to that rule and the only error that gets messaged. + * This function is intended for use by subsystems that have detected a fatal + * error and enqueued appropriate ereports and wish to then force a panic. + */ +/*PRINTFLIKE1*/ +void +fm_panic(const char *format, ...) +{ + va_list ap; + + (void) atomic_cas_ptr((void *)&fm_panicstr, NULL, (void *)format); +#if defined(__i386) || defined(__amd64) + fastreboot_disable_highpil(); +#endif /* __i386 || __amd64 */ + va_start(ap, format); + vpanic(format, ap); + va_end(ap); +} + +/* + * Simply tell the caller if fm_panicstr is set, ie. an fma event has + * caused the panic. If so, something other than the default panic + * diagnosis method will diagnose the cause of the panic. + */ +int +is_fm_panic() +{ + if (fm_panicstr) + return (1); + else + return (0); +} + +/* + * Print any appropriate FMA banner message before the panic message. This + * function is called by panicsys() and prints the message for fm_panic(). + * We print the message here so that it comes after the system is quiesced. + * A one-line summary is recorded in the log only (cmn_err(9F) with "!" prefix). + * The rest of the message is for the console only and not needed in the log, + * so it is printed using console_printf(). We break it up into multiple + * chunks so as to avoid overflowing any small legacy prom_printf() buffers. 
+ */ +void +fm_banner(void) +{ + timespec_t tod; + hrtime_t now; + + if (!fm_panicstr) + return; /* panic was not initiated by fm_panic(); do nothing */ + + if (panicstr) { + tod = panic_hrestime; + now = panic_hrtime; + } else { + gethrestime(&tod); + now = gethrtime_waitfree(); + } + + cmn_err(CE_NOTE, "!SUNW-MSG-ID: %s, " + "TYPE: Error, VER: 1, SEVERITY: Major\n", fm_msgid); + + console_printf( +"\n\rSUNW-MSG-ID: %s, TYPE: Error, VER: 1, SEVERITY: Major\n" +"EVENT-TIME: 0x%lx.0x%lx (0x%llx)\n", + fm_msgid, tod.tv_sec, tod.tv_nsec, (u_longlong_t)now); + + console_printf( +"PLATFORM: %s, CSN: -, HOSTNAME: %s\n" +"SOURCE: %s, REV: %s %s\n", + platform, utsname.nodename, utsname.sysname, + utsname.release, utsname.version); + + console_printf( +"DESC: Errors have been detected that require a reboot to ensure system\n" +"integrity. See %s/%s for more information.\n", + fm_url, fm_msgid); + + console_printf( +"AUTO-RESPONSE: Solaris will attempt to save and diagnose the error telemetry\n" +"IMPACT: The system will sync files, save a crash dump if needed, and reboot\n" +"REC-ACTION: Save the error summary below in case telemetry cannot be saved\n"); + + console_printf("\n"); +} + +/* + * Utility function to write all of the pending ereports to the dump device. + * This function is called at either normal reboot or panic time, and simply + * iterates over the in-transit messages in the ereport sysevent channel. + */ +void +fm_ereport_dump(void) +{ + evchanq_t *chq; + sysevent_t *sep; + erpt_dump_t ed; + + timespec_t tod; + hrtime_t now; + char *buf; + size_t len; + + if (panicstr) { + tod = panic_hrestime; + now = panic_hrtime; + } else { + if (ereport_errorq != NULL) + errorq_drain(ereport_errorq); + gethrestime(&tod); + now = gethrtime_waitfree(); + } + + /* + * In the panic case, sysevent_evc_walk_init() will return NULL. + */ + if ((chq = sysevent_evc_walk_init(ereport_chan, NULL)) == NULL && + !panicstr) + return; /* event channel isn't initialized yet */ + + while ((sep = sysevent_evc_walk_step(chq)) != NULL) { + if ((buf = sysevent_evc_event_attr(sep, &len)) == NULL) + break; + + ed.ed_magic = ERPT_MAGIC; + ed.ed_chksum = checksum32(buf, len); + ed.ed_size = (uint32_t)len; + ed.ed_pad = 0; + ed.ed_hrt_nsec = SE_TIME(sep); + ed.ed_hrt_base = now; + ed.ed_tod_base.sec = tod.tv_sec; + ed.ed_tod_base.nsec = tod.tv_nsec; + + dumpvp_write(&ed, sizeof (ed)); + dumpvp_write(buf, len); + } + + sysevent_evc_walk_fini(chq); +} +#endif + +/* + * Post an error report (ereport) to the sysevent error channel. The error + * channel must be established with a prior call to sysevent_evc_create() + * before publication may occur. 
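+ *
+ * A minimal usage sketch (the device path and ereport class are
+ * hypothetical, and error handling is elided):
+ *
+ *	nvlist_t *det = fm_nvlist_create(NULL);
+ *	nvlist_t *erpt = fm_nvlist_create(NULL);
+ *
+ *	fm_fmri_dev_set(det, DEV_SCHEME_VERSION0, NULL,
+ *	    "/pci@0,0/disk@0,0", NULL, NULL);
+ *	fm_ereport_set(erpt, FM_EREPORT_VERS0, "io.example.timeout",
+ *	    fm_ena_generate(0, FM_ENA_FMT1), det, NULL);
+ *	fm_ereport_post(erpt, EVCH_SLEEP);
+ *
+ *	fm_nvlist_destroy(erpt, FM_NVA_FREE);
+ *	fm_nvlist_destroy(det, FM_NVA_FREE);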
+ */ +void +fm_ereport_post(nvlist_t *ereport, int evc_flag) +{ + size_t nvl_size = 0; + evchan_t *error_chan; + sysevent_id_t eid; + + (void) nvlist_size(ereport, &nvl_size, NV_ENCODE_NATIVE); + if (nvl_size > ERPT_DATA_SZ || nvl_size == 0) { + atomic_inc_64(&erpt_kstat_data.erpt_dropped.value.ui64); + return; + } + +#ifdef illumos + if (sysevent_evc_bind(FM_ERROR_CHAN, &error_chan, + EVCH_CREAT|EVCH_HOLD_PEND) != 0) { + atomic_inc_64(&erpt_kstat_data.erpt_dropped.value.ui64); + return; + } + + if (sysevent_evc_publish(error_chan, EC_FM, ESC_FM_ERROR, + SUNW_VENDOR, FM_PUB, ereport, evc_flag) != 0) { + atomic_inc_64(&erpt_kstat_data.erpt_dropped.value.ui64); + (void) sysevent_evc_unbind(error_chan); + return; + } + (void) sysevent_evc_unbind(error_chan); +#else + (void) ddi_log_sysevent(NULL, SUNW_VENDOR, EC_DEV_STATUS, + ESC_DEV_DLE, ereport, &eid, DDI_SLEEP); +#endif +} + +/* + * Wrapppers for FM nvlist allocators + */ +/* ARGSUSED */ +static void * +i_fm_alloc(nv_alloc_t *nva, size_t size) +{ + return (kmem_zalloc(size, KM_SLEEP)); +} + +/* ARGSUSED */ +static void +i_fm_free(nv_alloc_t *nva, void *buf, size_t size) +{ + kmem_free(buf, size); +} + +const nv_alloc_ops_t fm_mem_alloc_ops = { + NULL, + NULL, + i_fm_alloc, + i_fm_free, + NULL +}; + +/* + * Create and initialize a new nv_alloc_t for a fixed buffer, buf. A pointer + * to the newly allocated nv_alloc_t structure is returned upon success or NULL + * is returned to indicate that the nv_alloc structure could not be created. + */ +nv_alloc_t * +fm_nva_xcreate(char *buf, size_t bufsz) +{ + nv_alloc_t *nvhdl = kmem_zalloc(sizeof (nv_alloc_t), KM_SLEEP); + + if (bufsz == 0 || nv_alloc_init(nvhdl, nv_fixed_ops, buf, bufsz) != 0) { + kmem_free(nvhdl, sizeof (nv_alloc_t)); + return (NULL); + } + + return (nvhdl); +} + +/* + * Destroy a previously allocated nv_alloc structure. The fixed buffer + * associated with nva must be freed by the caller. + */ +void +fm_nva_xdestroy(nv_alloc_t *nva) +{ + nv_alloc_fini(nva); + kmem_free(nva, sizeof (nv_alloc_t)); +} + +/* + * Create a new nv list. A pointer to a new nv list structure is returned + * upon success or NULL is returned to indicate that the structure could + * not be created. The newly created nv list is created and managed by the + * operations installed in nva. If nva is NULL, the default FMA nva + * operations are installed and used. + * + * When called from the kernel and nva == NULL, this function must be called + * from passive kernel context with no locks held that can prevent a + * sleeping memory allocation from occurring. Otherwise, this function may + * be called from other kernel contexts as long a valid nva created via + * fm_nva_create() is supplied. + */ +nvlist_t * +fm_nvlist_create(nv_alloc_t *nva) +{ + int hdl_alloced = 0; + nvlist_t *nvl; + nv_alloc_t *nvhdl; + + if (nva == NULL) { + nvhdl = kmem_zalloc(sizeof (nv_alloc_t), KM_SLEEP); + + if (nv_alloc_init(nvhdl, &fm_mem_alloc_ops, NULL, 0) != 0) { + kmem_free(nvhdl, sizeof (nv_alloc_t)); + return (NULL); + } + hdl_alloced = 1; + } else { + nvhdl = nva; + } + + if (nvlist_xalloc(&nvl, NV_UNIQUE_NAME, nvhdl) != 0) { + if (hdl_alloced) { + nv_alloc_fini(nvhdl); + kmem_free(nvhdl, sizeof (nv_alloc_t)); + } + return (NULL); + } + + return (nvl); +} + +/* + * Destroy a previously allocated nvlist structure. flag indicates whether + * or not the associated nva structure should be freed (FM_NVA_FREE) or + * retained (FM_NVA_RETAIN). 
Retaining the nv alloc structure allows + * it to be re-used for future nvlist creation operations. + */ +void +fm_nvlist_destroy(nvlist_t *nvl, int flag) +{ + nv_alloc_t *nva = nvlist_lookup_nv_alloc(nvl); + + nvlist_free(nvl); + + if (nva != NULL) { + if (flag == FM_NVA_FREE) + fm_nva_xdestroy(nva); + } +} + +int +i_fm_payload_set(nvlist_t *payload, const char *name, va_list ap) +{ + int nelem, ret = 0; + data_type_t type; + + while (ret == 0 && name != NULL) { + type = va_arg(ap, data_type_t); + switch (type) { + case DATA_TYPE_BYTE: + ret = nvlist_add_byte(payload, name, + va_arg(ap, uint_t)); + break; + case DATA_TYPE_BYTE_ARRAY: + nelem = va_arg(ap, int); + ret = nvlist_add_byte_array(payload, name, + va_arg(ap, uchar_t *), nelem); + break; + case DATA_TYPE_BOOLEAN_VALUE: + ret = nvlist_add_boolean_value(payload, name, + va_arg(ap, boolean_t)); + break; + case DATA_TYPE_BOOLEAN_ARRAY: + nelem = va_arg(ap, int); + ret = nvlist_add_boolean_array(payload, name, + va_arg(ap, boolean_t *), nelem); + break; + case DATA_TYPE_INT8: + ret = nvlist_add_int8(payload, name, + va_arg(ap, int)); + break; + case DATA_TYPE_INT8_ARRAY: + nelem = va_arg(ap, int); + ret = nvlist_add_int8_array(payload, name, + va_arg(ap, int8_t *), nelem); + break; + case DATA_TYPE_UINT8: + ret = nvlist_add_uint8(payload, name, + va_arg(ap, uint_t)); + break; + case DATA_TYPE_UINT8_ARRAY: + nelem = va_arg(ap, int); + ret = nvlist_add_uint8_array(payload, name, + va_arg(ap, uint8_t *), nelem); + break; + case DATA_TYPE_INT16: + ret = nvlist_add_int16(payload, name, + va_arg(ap, int)); + break; + case DATA_TYPE_INT16_ARRAY: + nelem = va_arg(ap, int); + ret = nvlist_add_int16_array(payload, name, + va_arg(ap, int16_t *), nelem); + break; + case DATA_TYPE_UINT16: + ret = nvlist_add_uint16(payload, name, + va_arg(ap, uint_t)); + break; + case DATA_TYPE_UINT16_ARRAY: + nelem = va_arg(ap, int); + ret = nvlist_add_uint16_array(payload, name, + va_arg(ap, uint16_t *), nelem); + break; + case DATA_TYPE_INT32: + ret = nvlist_add_int32(payload, name, + va_arg(ap, int32_t)); + break; + case DATA_TYPE_INT32_ARRAY: + nelem = va_arg(ap, int); + ret = nvlist_add_int32_array(payload, name, + va_arg(ap, int32_t *), nelem); + break; + case DATA_TYPE_UINT32: + ret = nvlist_add_uint32(payload, name, + va_arg(ap, uint32_t)); + break; + case DATA_TYPE_UINT32_ARRAY: + nelem = va_arg(ap, int); + ret = nvlist_add_uint32_array(payload, name, + va_arg(ap, uint32_t *), nelem); + break; + case DATA_TYPE_INT64: + ret = nvlist_add_int64(payload, name, + va_arg(ap, int64_t)); + break; + case DATA_TYPE_INT64_ARRAY: + nelem = va_arg(ap, int); + ret = nvlist_add_int64_array(payload, name, + va_arg(ap, int64_t *), nelem); + break; + case DATA_TYPE_UINT64: + ret = nvlist_add_uint64(payload, name, + va_arg(ap, uint64_t)); + break; + case DATA_TYPE_UINT64_ARRAY: + nelem = va_arg(ap, int); + ret = nvlist_add_uint64_array(payload, name, + va_arg(ap, uint64_t *), nelem); + break; + case DATA_TYPE_STRING: + ret = nvlist_add_string(payload, name, + va_arg(ap, char *)); + break; + case DATA_TYPE_STRING_ARRAY: + nelem = va_arg(ap, int); + ret = nvlist_add_string_array(payload, name, + va_arg(ap, char **), nelem); + break; + case DATA_TYPE_NVLIST: + ret = nvlist_add_nvlist(payload, name, + va_arg(ap, nvlist_t *)); + break; + case DATA_TYPE_NVLIST_ARRAY: + nelem = va_arg(ap, int); + ret = nvlist_add_nvlist_array(payload, name, + va_arg(ap, nvlist_t **), nelem); + break; + default: + ret = EINVAL; + } + + name = va_arg(ap, char *); + } + return (ret); +} + +void 
+fm_payload_set(nvlist_t *payload, ...) +{ + int ret; + const char *name; + va_list ap; + + va_start(ap, payload); + name = va_arg(ap, char *); + ret = i_fm_payload_set(payload, name, ap); + va_end(ap); + + if (ret) + atomic_inc_64(&erpt_kstat_data.payload_set_failed.value.ui64); +} + +/* + * Set-up and validate the members of an ereport event according to: + * + * Member name Type Value + * ==================================================== + * class string ereport + * version uint8_t 0 + * ena uint64_t <ena> + * detector nvlist_t <detector> + * ereport-payload nvlist_t <var args> + * + * We don't actually add a 'version' member to the payload. Really, + * the version quoted to us by our caller is that of the category 1 + * "ereport" event class (and we require FM_EREPORT_VERS0) but + * the payload version of the actual leaf class event under construction + * may be something else. Callers should supply a version in the varargs, + * or (better) we could take two version arguments - one for the + * ereport category 1 classification (expect FM_EREPORT_VERS0) and one + * for the leaf class. + */ +void +fm_ereport_set(nvlist_t *ereport, int version, const char *erpt_class, + uint64_t ena, const nvlist_t *detector, ...) +{ + char ereport_class[FM_MAX_CLASS]; + const char *name; + va_list ap; + int ret; + + if (version != FM_EREPORT_VERS0) { + atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64); + return; + } + + (void) snprintf(ereport_class, FM_MAX_CLASS, "%s.%s", + FM_EREPORT_CLASS, erpt_class); + if (nvlist_add_string(ereport, FM_CLASS, ereport_class) != 0) { + atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64); + return; + } + + if (nvlist_add_uint64(ereport, FM_EREPORT_ENA, ena)) { + atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64); + } + + if (nvlist_add_nvlist(ereport, FM_EREPORT_DETECTOR, + (nvlist_t *)detector) != 0) { + atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64); + } + + va_start(ap, detector); + name = va_arg(ap, const char *); + ret = i_fm_payload_set(ereport, name, ap); + va_end(ap); + + if (ret) + atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64); +} + +/* + * Set-up and validate the members of an hc fmri according to; + * + * Member name Type Value + * =================================================== + * version uint8_t 0 + * auth nvlist_t <auth> + * hc-name string <name> + * hc-id string <id> + * + * Note that auth and hc-id are optional members. + */ + +#define HC_MAXPAIRS 20 +#define HC_MAXNAMELEN 50 + +static int +fm_fmri_hc_set_common(nvlist_t *fmri, int version, const nvlist_t *auth) +{ + if (version != FM_HC_SCHEME_VERSION) { + atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); + return (0); + } + + if (nvlist_add_uint8(fmri, FM_VERSION, version) != 0 || + nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_HC) != 0) { + atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); + return (0); + } + + if (auth != NULL && nvlist_add_nvlist(fmri, FM_FMRI_AUTHORITY, + (nvlist_t *)auth) != 0) { + atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); + return (0); + } + + return (1); +} + +void +fm_fmri_hc_set(nvlist_t *fmri, int version, const nvlist_t *auth, + nvlist_t *snvl, int npairs, ...) 
+{
+	nv_alloc_t *nva = nvlist_lookup_nv_alloc(fmri);
+	nvlist_t *pairs[HC_MAXPAIRS];
+	va_list ap;
+	int i;
+
+	if (!fm_fmri_hc_set_common(fmri, version, auth))
+		return;
+
+	npairs = MIN(npairs, HC_MAXPAIRS);
+
+	va_start(ap, npairs);
+	for (i = 0; i < npairs; i++) {
+		const char *name = va_arg(ap, const char *);
+		uint32_t id = va_arg(ap, uint32_t);
+		char idstr[11];
+
+		(void) snprintf(idstr, sizeof (idstr), "%u", id);
+
+		pairs[i] = fm_nvlist_create(nva);
+		if (nvlist_add_string(pairs[i], FM_FMRI_HC_NAME, name) != 0 ||
+		    nvlist_add_string(pairs[i], FM_FMRI_HC_ID, idstr) != 0) {
+			atomic_inc_64(
+			    &erpt_kstat_data.fmri_set_failed.value.ui64);
+		}
+	}
+	va_end(ap);
+
+	if (nvlist_add_nvlist_array(fmri, FM_FMRI_HC_LIST, pairs, npairs) != 0)
+		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
+
+	for (i = 0; i < npairs; i++)
+		fm_nvlist_destroy(pairs[i], FM_NVA_RETAIN);
+
+	if (snvl != NULL) {
+		if (nvlist_add_nvlist(fmri, FM_FMRI_HC_SPECIFIC, snvl) != 0) {
+			atomic_inc_64(
+			    &erpt_kstat_data.fmri_set_failed.value.ui64);
+		}
+	}
+}
+
+/*
+ * Set-up and validate the members of a dev fmri according to:
+ *
+ *	Member name		Type		Value
+ *	====================================================
+ *	version			uint8_t		0
+ *	auth			nvlist_t	<auth>
+ *	devpath			string		<devpath>
+ *	[devid]			string		<devid>
+ *	[target-port-l0id]	string		<target-port-lun0-id>
+ *
+ * Note that auth and devid are optional members.
+ */
+void
+fm_fmri_dev_set(nvlist_t *fmri_dev, int version, const nvlist_t *auth,
+    const char *devpath, const char *devid, const char *tpl0)
+{
+	int err = 0;
+
+	if (version != DEV_SCHEME_VERSION0) {
+		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
+		return;
+	}
+
+	err |= nvlist_add_uint8(fmri_dev, FM_VERSION, version);
+	err |= nvlist_add_string(fmri_dev, FM_FMRI_SCHEME, FM_FMRI_SCHEME_DEV);
+
+	if (auth != NULL) {
+		err |= nvlist_add_nvlist(fmri_dev, FM_FMRI_AUTHORITY,
+		    (nvlist_t *)auth);
+	}
+
+	err |= nvlist_add_string(fmri_dev, FM_FMRI_DEV_PATH, devpath);
+
+	if (devid != NULL)
+		err |= nvlist_add_string(fmri_dev, FM_FMRI_DEV_ID, devid);
+
+	if (tpl0 != NULL)
+		err |= nvlist_add_string(fmri_dev, FM_FMRI_DEV_TGTPTLUN0, tpl0);
+
+	if (err)
+		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
+}
+
+/*
+ * Set-up and validate the members of a cpu fmri according to:
+ *
+ *	Member name		Type		Value
+ *	====================================================
+ *	version			uint8_t		0
+ *	auth			nvlist_t	<auth>
+ *	cpuid			uint32_t	<cpu_id>
+ *	cpumask			uint8_t		<cpu_mask>
+ *	serial			string		<serial_id>
+ *
+ * Note that auth, cpumask, and serial are optional members.
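+ *
+ * For example (illustrative values; note that the code below still bumps
+ * the fmri-set-failed kstat when serial_idp is NULL):
+ *
+ *	fm_fmri_cpu_set(fmri, CPU_SCHEME_VERSION1, NULL, 3, NULL, "12345");
+ *
+ * yields an fmri with scheme "cpu", cpuid 3 and serial "12345".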
+ * + */ +void +fm_fmri_cpu_set(nvlist_t *fmri_cpu, int version, const nvlist_t *auth, + uint32_t cpu_id, uint8_t *cpu_maskp, const char *serial_idp) +{ + uint64_t *failedp = &erpt_kstat_data.fmri_set_failed.value.ui64; + + if (version < CPU_SCHEME_VERSION1) { + atomic_inc_64(failedp); + return; + } + + if (nvlist_add_uint8(fmri_cpu, FM_VERSION, version) != 0) { + atomic_inc_64(failedp); + return; + } + + if (nvlist_add_string(fmri_cpu, FM_FMRI_SCHEME, + FM_FMRI_SCHEME_CPU) != 0) { + atomic_inc_64(failedp); + return; + } + + if (auth != NULL && nvlist_add_nvlist(fmri_cpu, FM_FMRI_AUTHORITY, + (nvlist_t *)auth) != 0) + atomic_inc_64(failedp); + + if (nvlist_add_uint32(fmri_cpu, FM_FMRI_CPU_ID, cpu_id) != 0) + atomic_inc_64(failedp); + + if (cpu_maskp != NULL && nvlist_add_uint8(fmri_cpu, FM_FMRI_CPU_MASK, + *cpu_maskp) != 0) + atomic_inc_64(failedp); + + if (serial_idp == NULL || nvlist_add_string(fmri_cpu, + FM_FMRI_CPU_SERIAL_ID, (char *)serial_idp) != 0) + atomic_inc_64(failedp); +} + +/* + * Set-up and validate the members of a mem according to: + * + * Member name Type Value + * ==================================================== + * version uint8_t 0 + * auth nvlist_t <auth> [optional] + * unum string <unum> + * serial string <serial> [optional*] + * offset uint64_t <offset> [optional] + * + * * serial is required if offset is present + */ +void +fm_fmri_mem_set(nvlist_t *fmri, int version, const nvlist_t *auth, + const char *unum, const char *serial, uint64_t offset) +{ + if (version != MEM_SCHEME_VERSION0) { + atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); + return; + } + + if (!serial && (offset != (uint64_t)-1)) { + atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); + return; + } + + if (nvlist_add_uint8(fmri, FM_VERSION, version) != 0) { + atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); + return; + } + + if (nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_MEM) != 0) { + atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); + return; + } + + if (auth != NULL) { + if (nvlist_add_nvlist(fmri, FM_FMRI_AUTHORITY, + (nvlist_t *)auth) != 0) { + atomic_inc_64( + &erpt_kstat_data.fmri_set_failed.value.ui64); + } + } + + if (nvlist_add_string(fmri, FM_FMRI_MEM_UNUM, unum) != 0) { + atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); + } + + if (serial != NULL) { + if (nvlist_add_string_array(fmri, FM_FMRI_MEM_SERIAL_ID, + (char **)&serial, 1) != 0) { + atomic_inc_64( + &erpt_kstat_data.fmri_set_failed.value.ui64); + } + if (offset != (uint64_t)-1 && nvlist_add_uint64(fmri, + FM_FMRI_MEM_OFFSET, offset) != 0) { + atomic_inc_64( + &erpt_kstat_data.fmri_set_failed.value.ui64); + } + } +} + +void +fm_fmri_zfs_set(nvlist_t *fmri, int version, uint64_t pool_guid, + uint64_t vdev_guid) +{ + if (version != ZFS_SCHEME_VERSION0) { + atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); + return; + } + + if (nvlist_add_uint8(fmri, FM_VERSION, version) != 0) { + atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); + return; + } + + if (nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_ZFS) != 0) { + atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); + return; + } + + if (nvlist_add_uint64(fmri, FM_FMRI_ZFS_POOL, pool_guid) != 0) { + atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); + } + + if (vdev_guid != 0) { + if (nvlist_add_uint64(fmri, FM_FMRI_ZFS_VDEV, vdev_guid) != 0) { + atomic_inc_64( + &erpt_kstat_data.fmri_set_failed.value.ui64); + } + } +} + +uint64_t +fm_ena_increment(uint64_t ena) +{ + uint64_t 
new_ena; + + switch (ENA_FORMAT(ena)) { + case FM_ENA_FMT1: + new_ena = ena + (1 << ENA_FMT1_GEN_SHFT); + break; + case FM_ENA_FMT2: + new_ena = ena + (1 << ENA_FMT2_GEN_SHFT); + break; + default: + new_ena = 0; + } + + return (new_ena); +} + +uint64_t +fm_ena_generate_cpu(uint64_t timestamp, processorid_t cpuid, uchar_t format) +{ + uint64_t ena = 0; + + switch (format) { + case FM_ENA_FMT1: + if (timestamp) { + ena = (uint64_t)((format & ENA_FORMAT_MASK) | + ((cpuid << ENA_FMT1_CPUID_SHFT) & + ENA_FMT1_CPUID_MASK) | + ((timestamp << ENA_FMT1_TIME_SHFT) & + ENA_FMT1_TIME_MASK)); + } else { + ena = (uint64_t)((format & ENA_FORMAT_MASK) | + ((cpuid << ENA_FMT1_CPUID_SHFT) & + ENA_FMT1_CPUID_MASK) | + ((gethrtime_waitfree() << ENA_FMT1_TIME_SHFT) & + ENA_FMT1_TIME_MASK)); + } + break; + case FM_ENA_FMT2: + ena = (uint64_t)((format & ENA_FORMAT_MASK) | + ((timestamp << ENA_FMT2_TIME_SHFT) & ENA_FMT2_TIME_MASK)); + break; + default: + break; + } + + return (ena); +} + +uint64_t +fm_ena_generate(uint64_t timestamp, uchar_t format) +{ + return (fm_ena_generate_cpu(timestamp, PCPU_GET(cpuid), format)); +} + +uint64_t +fm_ena_generation_get(uint64_t ena) +{ + uint64_t gen; + + switch (ENA_FORMAT(ena)) { + case FM_ENA_FMT1: + gen = (ena & ENA_FMT1_GEN_MASK) >> ENA_FMT1_GEN_SHFT; + break; + case FM_ENA_FMT2: + gen = (ena & ENA_FMT2_GEN_MASK) >> ENA_FMT2_GEN_SHFT; + break; + default: + gen = 0; + break; + } + + return (gen); +} + +uchar_t +fm_ena_format_get(uint64_t ena) +{ + + return (ENA_FORMAT(ena)); +} + +uint64_t +fm_ena_id_get(uint64_t ena) +{ + uint64_t id; + + switch (ENA_FORMAT(ena)) { + case FM_ENA_FMT1: + id = (ena & ENA_FMT1_ID_MASK) >> ENA_FMT1_ID_SHFT; + break; + case FM_ENA_FMT2: + id = (ena & ENA_FMT2_ID_MASK) >> ENA_FMT2_ID_SHFT; + break; + default: + id = 0; + } + + return (id); +} + +uint64_t +fm_ena_time_get(uint64_t ena) +{ + uint64_t time; + + switch (ENA_FORMAT(ena)) { + case FM_ENA_FMT1: + time = (ena & ENA_FMT1_TIME_MASK) >> ENA_FMT1_TIME_SHFT; + break; + case FM_ENA_FMT2: + time = (ena & ENA_FMT2_TIME_MASK) >> ENA_FMT2_TIME_SHFT; + break; + default: + time = 0; + } + + return (time); +} + +#ifdef illumos +/* + * Convert a getpcstack() trace to symbolic name+offset, and add the resulting + * string array to a Fault Management ereport as FM_EREPORT_PAYLOAD_NAME_STACK. + */ +void +fm_payload_stack_add(nvlist_t *payload, const pc_t *stack, int depth) +{ + int i; + char *sym; + ulong_t off; + char *stkpp[FM_STK_DEPTH]; + char buf[FM_STK_DEPTH * FM_SYM_SZ]; + char *stkp = buf; + + for (i = 0; i < depth && i != FM_STK_DEPTH; i++, stkp += FM_SYM_SZ) { + if ((sym = kobj_getsymname(stack[i], &off)) != NULL) + (void) snprintf(stkp, FM_SYM_SZ, "%s+%lx", sym, off); + else + (void) snprintf(stkp, FM_SYM_SZ, "%lx", (long)stack[i]); + stkpp[i] = stkp; + } + + fm_payload_set(payload, FM_EREPORT_PAYLOAD_NAME_STACK, + DATA_TYPE_STRING_ARRAY, depth, stkpp, NULL); +} +#endif + +#ifdef illumos +void +print_msg_hwerr(ctid_t ct_id, proc_t *p) +{ + uprintf("Killed process %d (%s) in contract id %d " + "due to hardware error\n", p->p_pid, p->p_user.u_comm, ct_id); +} +#endif + +void +fm_fmri_hc_create(nvlist_t *fmri, int version, const nvlist_t *auth, + nvlist_t *snvl, nvlist_t *bboard, int npairs, ...) 
+{ + nv_alloc_t *nva = nvlist_lookup_nv_alloc(fmri); + nvlist_t *pairs[HC_MAXPAIRS]; + nvlist_t **hcl; + uint_t n; + int i, j; + va_list ap; + char *hcname, *hcid; + + if (!fm_fmri_hc_set_common(fmri, version, auth)) + return; + + /* + * copy the bboard nvpairs to the pairs array + */ + if (nvlist_lookup_nvlist_array(bboard, FM_FMRI_HC_LIST, &hcl, &n) + != 0) { + atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); + return; + } + + for (i = 0; i < n; i++) { + if (nvlist_lookup_string(hcl[i], FM_FMRI_HC_NAME, + &hcname) != 0) { + atomic_inc_64( + &erpt_kstat_data.fmri_set_failed.value.ui64); + return; + } + if (nvlist_lookup_string(hcl[i], FM_FMRI_HC_ID, &hcid) != 0) { + atomic_inc_64( + &erpt_kstat_data.fmri_set_failed.value.ui64); + return; + } + + pairs[i] = fm_nvlist_create(nva); + if (nvlist_add_string(pairs[i], FM_FMRI_HC_NAME, hcname) != 0 || + nvlist_add_string(pairs[i], FM_FMRI_HC_ID, hcid) != 0) { + for (j = 0; j <= i; j++) { + if (pairs[j] != NULL) + fm_nvlist_destroy(pairs[j], + FM_NVA_RETAIN); + } + atomic_inc_64( + &erpt_kstat_data.fmri_set_failed.value.ui64); + return; + } + } + + /* + * create the pairs from passed in pairs + */ + npairs = MIN(npairs, HC_MAXPAIRS); + + va_start(ap, npairs); + for (i = n; i < npairs + n; i++) { + const char *name = va_arg(ap, const char *); + uint32_t id = va_arg(ap, uint32_t); + char idstr[11]; + (void) snprintf(idstr, sizeof (idstr), "%u", id); + pairs[i] = fm_nvlist_create(nva); + if (nvlist_add_string(pairs[i], FM_FMRI_HC_NAME, name) != 0 || + nvlist_add_string(pairs[i], FM_FMRI_HC_ID, idstr) != 0) { + for (j = 0; j <= i; j++) { + if (pairs[j] != NULL) + fm_nvlist_destroy(pairs[j], + FM_NVA_RETAIN); + } + atomic_inc_64( + &erpt_kstat_data.fmri_set_failed.value.ui64); + return; + } + } + va_end(ap); + + /* + * Create the fmri hc list + */ + if (nvlist_add_nvlist_array(fmri, FM_FMRI_HC_LIST, pairs, + npairs + n) != 0) { + atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); + return; + } + + for (i = 0; i < npairs + n; i++) { + fm_nvlist_destroy(pairs[i], FM_NVA_RETAIN); + } + + if (snvl != NULL) { + if (nvlist_add_nvlist(fmri, FM_FMRI_HC_SPECIFIC, snvl) != 0) { + atomic_inc_64( + &erpt_kstat_data.fmri_set_failed.value.ui64); + return; + } + } +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/os/list.c b/sys/cddl/contrib/opensolaris/uts/common/os/list.c new file mode 100644 index 000000000000..e8db13a5cf68 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/os/list.c @@ -0,0 +1,245 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * Generic doubly-linked list implementation + */ + +#include <sys/list.h> +#include <sys/list_impl.h> +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/debug.h> + +#define list_d2l(a, obj) ((list_node_t *)(((char *)obj) + (a)->list_offset)) +#define list_object(a, node) ((void *)(((char *)node) - (a)->list_offset)) +#define list_empty(a) ((a)->list_head.list_next == &(a)->list_head) + +#define list_insert_after_node(list, node, object) { \ + list_node_t *lnew = list_d2l(list, object); \ + lnew->list_prev = (node); \ + lnew->list_next = (node)->list_next; \ + (node)->list_next->list_prev = lnew; \ + (node)->list_next = lnew; \ +} + +#define list_insert_before_node(list, node, object) { \ + list_node_t *lnew = list_d2l(list, object); \ + lnew->list_next = (node); \ + lnew->list_prev = (node)->list_prev; \ + (node)->list_prev->list_next = lnew; \ + (node)->list_prev = lnew; \ +} + +#define list_remove_node(node) \ + (node)->list_prev->list_next = (node)->list_next; \ + (node)->list_next->list_prev = (node)->list_prev; \ + (node)->list_next = (node)->list_prev = NULL + +void +list_create(list_t *list, size_t size, size_t offset) +{ + ASSERT(list); + ASSERT(size > 0); + ASSERT(size >= offset + sizeof (list_node_t)); + + list->list_size = size; + list->list_offset = offset; + list->list_head.list_next = list->list_head.list_prev = + &list->list_head; +} + +void +list_destroy(list_t *list) +{ + list_node_t *node = &list->list_head; + + ASSERT(list); + ASSERT(list->list_head.list_next == node); + ASSERT(list->list_head.list_prev == node); + + node->list_next = node->list_prev = NULL; +} + +void +list_insert_after(list_t *list, void *object, void *nobject) +{ + if (object == NULL) { + list_insert_head(list, nobject); + } else { + list_node_t *lold = list_d2l(list, object); + list_insert_after_node(list, lold, nobject); + } +} + +void +list_insert_before(list_t *list, void *object, void *nobject) +{ + if (object == NULL) { + list_insert_tail(list, nobject); + } else { + list_node_t *lold = list_d2l(list, object); + list_insert_before_node(list, lold, nobject); + } +} + +void +list_insert_head(list_t *list, void *object) +{ + list_node_t *lold = &list->list_head; + list_insert_after_node(list, lold, object); +} + +void +list_insert_tail(list_t *list, void *object) +{ + list_node_t *lold = &list->list_head; + list_insert_before_node(list, lold, object); +} + +void +list_remove(list_t *list, void *object) +{ + list_node_t *lold = list_d2l(list, object); + ASSERT(!list_empty(list)); + ASSERT(lold->list_next != NULL); + list_remove_node(lold); +} + +void * +list_remove_head(list_t *list) +{ + list_node_t *head = list->list_head.list_next; + if (head == &list->list_head) + return (NULL); + list_remove_node(head); + return (list_object(list, head)); +} + +void * +list_remove_tail(list_t *list) +{ + list_node_t *tail = list->list_head.list_prev; + if (tail == &list->list_head) + return (NULL); + list_remove_node(tail); + return (list_object(list, tail)); +} + +void * +list_head(list_t *list) +{ + if (list_empty(list)) + return (NULL); + return (list_object(list, list->list_head.list_next)); +} + +void * +list_tail(list_t *list) +{ + if (list_empty(list)) + return (NULL); + return (list_object(list, list->list_head.list_prev)); +} + +void * +list_next(list_t *list, void *object) +{ + list_node_t *node = list_d2l(list, object); + + if (node->list_next != &list->list_head) + return (list_object(list, node->list_next)); + + return (NULL); +} + 
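+/*
+ * A minimal usage sketch for this list implementation (the client structure
+ * is hypothetical; the embedded list_node_t is what makes it linkable):
+ *
+ *	struct foo {
+ *		int		foo_val;
+ *		list_node_t	foo_node;
+ *	};
+ *
+ *	list_t lst;
+ *	struct foo *fp;
+ *
+ *	list_create(&lst, sizeof (struct foo),
+ *	    offsetof(struct foo, foo_node));
+ *	list_insert_tail(&lst, fp);
+ *	for (fp = list_head(&lst); fp != NULL; fp = list_next(&lst, fp))
+ *		...;
+ *	while ((fp = list_remove_head(&lst)) != NULL)
+ *		...;
+ *	list_destroy(&lst);
+ */
+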
+void * +list_prev(list_t *list, void *object) +{ + list_node_t *node = list_d2l(list, object); + + if (node->list_prev != &list->list_head) + return (list_object(list, node->list_prev)); + + return (NULL); +} + +/* + * Insert src list after dst list. Empty src list thereafter. + */ +void +list_move_tail(list_t *dst, list_t *src) +{ + list_node_t *dstnode = &dst->list_head; + list_node_t *srcnode = &src->list_head; + + ASSERT(dst->list_size == src->list_size); + ASSERT(dst->list_offset == src->list_offset); + + if (list_empty(src)) + return; + + dstnode->list_prev->list_next = srcnode->list_next; + srcnode->list_next->list_prev = dstnode->list_prev; + dstnode->list_prev = srcnode->list_prev; + srcnode->list_prev->list_next = dstnode; + + /* empty src list */ + srcnode->list_next = srcnode->list_prev = srcnode; +} + +void +list_link_replace(list_node_t *lold, list_node_t *lnew) +{ + ASSERT(list_link_active(lold)); + ASSERT(!list_link_active(lnew)); + + lnew->list_next = lold->list_next; + lnew->list_prev = lold->list_prev; + lold->list_prev->list_next = lnew; + lold->list_next->list_prev = lnew; + lold->list_next = lold->list_prev = NULL; +} + +void +list_link_init(list_node_t *link) +{ + link->list_next = NULL; + link->list_prev = NULL; +} + +int +list_link_active(list_node_t *link) +{ + return (link->list_next != NULL); +} + +int +list_is_empty(list_t *list) +{ + return (list_empty(list)); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/os/nvpair_alloc_system.c b/sys/cddl/contrib/opensolaris/uts/common/os/nvpair_alloc_system.c new file mode 100644 index 000000000000..3682853de902 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/os/nvpair_alloc_system.c @@ -0,0 +1,63 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/nvpair.h> + +static void * +nv_alloc_sys(nv_alloc_t *nva, size_t size) +{ + return (kmem_alloc(size, (int)(uintptr_t)nva->nva_arg)); +} + +/*ARGSUSED*/ +static void +nv_free_sys(nv_alloc_t *nva, void *buf, size_t size) +{ + kmem_free(buf, size); +} + +static const nv_alloc_ops_t system_ops = { + NULL, /* nv_ao_init() */ + NULL, /* nv_ao_fini() */ + nv_alloc_sys, /* nv_ao_alloc() */ + nv_free_sys, /* nv_ao_free() */ + NULL /* nv_ao_reset() */ +}; + +nv_alloc_t nv_alloc_sleep_def = { + &system_ops, + (void *)KM_SLEEP +}; + +nv_alloc_t nv_alloc_nosleep_def = { + &system_ops, + (void *)KM_NOSLEEP +}; + +nv_alloc_t *nv_alloc_sleep = &nv_alloc_sleep_def; +nv_alloc_t *nv_alloc_nosleep = &nv_alloc_nosleep_def; diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/acl.h b/sys/cddl/contrib/opensolaris/uts/common/sys/acl.h new file mode 100644 index 000000000000..b81678ca07d2 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/sys/acl.h @@ -0,0 +1,313 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2014 Garrett D'Amore <garrett@damore.org> + * + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2017 RackTop Systems. + */ + +#ifndef _SYS_ACL_H +#define _SYS_ACL_H + +#include <sys/types.h> +#include <sys/acl_impl.h> + +#if defined(_KERNEL) +/* + * When compiling OpenSolaris kernel code, this file is included instead of the + * FreeBSD one. Include the original sys/acl.h as well. + */ +#undef _SYS_ACL_H +#include_next <sys/acl.h> +#define _SYS_ACL_H +#endif /* _KERNEL */ + +#ifdef __cplusplus +extern "C" { +#endif + +#define MAX_ACL_ENTRIES (1024) /* max entries of each type */ +typedef struct { + int a_type; /* the type of ACL entry */ + uid_t a_id; /* the entry in -uid or gid */ + o_mode_t a_perm; /* the permission field */ +} aclent_t; + +typedef struct ace { + uid_t a_who; /* uid or gid */ + uint32_t a_access_mask; /* read,write,... */ + uint16_t a_flags; /* see below */ + uint16_t a_type; /* allow or deny */ +} ace_t; + +#ifndef _KERNEL +typedef struct acl_info acl_t; +#endif + +/* + * The following are Defined types for an aclent_t. 
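+ *
+ * For example, a minimal ("trivial") ACL is built from the four base entry
+ * types; the (uid_t)-1 ids here are placeholders, since the *_OBJ entries
+ * take their identity from the file's own owner and group:
+ *
+ *	aclent_t trivial[4] = {
+ *		{ USER_OBJ,	(uid_t)-1, 06 },	/* owner: rw- */
+ *		{ GROUP_OBJ,	(uid_t)-1, 04 },	/* group: r-- */
+ *		{ CLASS_OBJ,	(uid_t)-1, 04 },	/* mask */
+ *		{ OTHER_OBJ,	(uid_t)-1, 04 },	/* other: r-- */
+ *	};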
+ */ +#define USER_OBJ (0x01) /* object owner */ +#define USER (0x02) /* additional users */ +#define GROUP_OBJ (0x04) /* owning group of the object */ +#define GROUP (0x08) /* additional groups */ +#define CLASS_OBJ (0x10) /* file group class and mask entry */ +#define OTHER_OBJ (0x20) /* other entry for the object */ +#define ACL_DEFAULT (0x1000) /* default flag */ +/* default object owner */ +#define DEF_USER_OBJ (ACL_DEFAULT | USER_OBJ) +/* default additional users */ +#define DEF_USER (ACL_DEFAULT | USER) +/* default owning group */ +#define DEF_GROUP_OBJ (ACL_DEFAULT | GROUP_OBJ) +/* default additional groups */ +#define DEF_GROUP (ACL_DEFAULT | GROUP) +/* default mask entry */ +#define DEF_CLASS_OBJ (ACL_DEFAULT | CLASS_OBJ) +/* default other entry */ +#define DEF_OTHER_OBJ (ACL_DEFAULT | OTHER_OBJ) + +/* + * The following are defined for ace_t. + */ +#define ACE_READ_DATA 0x00000001 +#define ACE_LIST_DIRECTORY 0x00000001 +#define ACE_WRITE_DATA 0x00000002 +#define ACE_ADD_FILE 0x00000002 +#define ACE_APPEND_DATA 0x00000004 +#define ACE_ADD_SUBDIRECTORY 0x00000004 +#define ACE_READ_NAMED_ATTRS 0x00000008 +#define ACE_WRITE_NAMED_ATTRS 0x00000010 +#define ACE_EXECUTE 0x00000020 +#define ACE_DELETE_CHILD 0x00000040 +#define ACE_READ_ATTRIBUTES 0x00000080 +#define ACE_WRITE_ATTRIBUTES 0x00000100 +#define ACE_DELETE 0x00010000 +#define ACE_READ_ACL 0x00020000 +#define ACE_WRITE_ACL 0x00040000 +#define ACE_WRITE_OWNER 0x00080000 +#define ACE_SYNCHRONIZE 0x00100000 + +#define ACE_FILE_INHERIT_ACE 0x0001 +#define ACE_DIRECTORY_INHERIT_ACE 0x0002 +#define ACE_NO_PROPAGATE_INHERIT_ACE 0x0004 +#define ACE_INHERIT_ONLY_ACE 0x0008 +#define ACE_SUCCESSFUL_ACCESS_ACE_FLAG 0x0010 +#define ACE_FAILED_ACCESS_ACE_FLAG 0x0020 +#define ACE_IDENTIFIER_GROUP 0x0040 +#define ACE_INHERITED_ACE 0x0080 +#define ACE_OWNER 0x1000 +#define ACE_GROUP 0x2000 +#define ACE_EVERYONE 0x4000 + +#define ACE_ACCESS_ALLOWED_ACE_TYPE 0x0000 +#define ACE_ACCESS_DENIED_ACE_TYPE 0x0001 +#define ACE_SYSTEM_AUDIT_ACE_TYPE 0x0002 +#define ACE_SYSTEM_ALARM_ACE_TYPE 0x0003 + +#define ACL_AUTO_INHERIT 0x0001 +#define ACL_PROTECTED 0x0002 +#define ACL_DEFAULTED 0x0004 +#define ACL_FLAGS_ALL (ACL_AUTO_INHERIT|ACL_PROTECTED| \ + ACL_DEFAULTED) + +#if defined(_KERNEL) || defined(_FAKE_KERNEL) + +/* + * These are only applicable in a CIFS context. + */ +#define ACE_ACCESS_ALLOWED_COMPOUND_ACE_TYPE 0x04 +#define ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE 0x05 +#define ACE_ACCESS_DENIED_OBJECT_ACE_TYPE 0x06 +#define ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE 0x07 +#define ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE 0x08 +#define ACE_ACCESS_ALLOWED_CALLBACK_ACE_TYPE 0x09 +#define ACE_ACCESS_DENIED_CALLBACK_ACE_TYPE 0x0A +#define ACE_ACCESS_ALLOWED_CALLBACK_OBJECT_ACE_TYPE 0x0B +#define ACE_ACCESS_DENIED_CALLBACK_OBJECT_ACE_TYPE 0x0C +#define ACE_SYSTEM_AUDIT_CALLBACK_ACE_TYPE 0x0D +#define ACE_SYSTEM_ALARM_CALLBACK_ACE_TYPE 0x0E +#define ACE_SYSTEM_AUDIT_CALLBACK_OBJECT_ACE_TYPE 0x0F +#define ACE_SYSTEM_ALARM_CALLBACK_OBJECT_ACE_TYPE 0x10 + +#define ACE_ALL_TYPES 0x001F + +typedef struct ace_object { + uid_t a_who; /* uid or gid */ + uint32_t a_access_mask; /* read,write,... 
*/
+	uint16_t	a_flags;		/* see below */
+	uint16_t	a_type;			/* allow or deny */
+	uint8_t		a_obj_type[16];		/* obj type */
+	uint8_t		a_inherit_obj_type[16];	/* inherit obj */
+} ace_object_t;
+
+#endif
+
+#define	ACE_ALL_PERMS	(ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \
+    ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_READ_NAMED_ATTRS| \
+    ACE_WRITE_NAMED_ATTRS|ACE_EXECUTE|ACE_DELETE_CHILD|ACE_READ_ATTRIBUTES| \
+    ACE_WRITE_ATTRIBUTES|ACE_DELETE|ACE_READ_ACL|ACE_WRITE_ACL| \
+    ACE_WRITE_OWNER|ACE_SYNCHRONIZE)
+
+#define	ACE_ALL_WRITE_PERMS (ACE_WRITE_DATA|ACE_APPEND_DATA| \
+    ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS|ACE_WRITE_ACL| \
+    ACE_WRITE_OWNER|ACE_DELETE|ACE_DELETE_CHILD)
+
+#define	ACE_READ_PERMS	(ACE_READ_DATA|ACE_READ_ACL|ACE_READ_ATTRIBUTES| \
+    ACE_READ_NAMED_ATTRS)
+
+#define	ACE_WRITE_PERMS	(ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES| \
+    ACE_WRITE_NAMED_ATTRS)
+
+#define	ACE_MODIFY_PERMS (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \
+    ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_READ_NAMED_ATTRS| \
+    ACE_WRITE_NAMED_ATTRS|ACE_EXECUTE|ACE_DELETE_CHILD|ACE_READ_ATTRIBUTES| \
+    ACE_WRITE_ATTRIBUTES|ACE_DELETE|ACE_READ_ACL|ACE_SYNCHRONIZE)
+/*
+ * The following flags are supported by both NFSv4 ACLs and ace_t.
+ */
+#define	ACE_NFSV4_SUP_FLAGS (ACE_FILE_INHERIT_ACE | \
+    ACE_DIRECTORY_INHERIT_ACE | \
+    ACE_NO_PROPAGATE_INHERIT_ACE | \
+    ACE_INHERIT_ONLY_ACE | \
+    ACE_INHERITED_ACE | \
+    ACE_IDENTIFIER_GROUP)
+
+#define	ACE_TYPE_FLAGS	(ACE_OWNER|ACE_GROUP|ACE_EVERYONE| \
+    ACE_IDENTIFIER_GROUP)
+#define	ACE_INHERIT_FLAGS	(ACE_FILE_INHERIT_ACE| ACE_INHERITED_ACE| \
+    ACE_DIRECTORY_INHERIT_ACE|ACE_NO_PROPAGATE_INHERIT_ACE|ACE_INHERIT_ONLY_ACE)
+
+/* cmd args to acl(2) for aclent_t */
+#define	GETACL		1
+#define	SETACL		2
+#define	GETACLCNT	3
+
+/* cmd's to manipulate ace acls. */
+#define	ACE_GETACL	4
+#define	ACE_SETACL	5
+#define	ACE_GETACLCNT	6
+
+/* minimal acl entries from GETACLCNT */
+#define	MIN_ACL_ENTRIES	4
+
+#if !defined(_KERNEL)
+
+/* acl check errors */
+#define	GRP_ERROR	1
+#define	USER_ERROR	2
+#define	OTHER_ERROR	3
+#define	CLASS_ERROR	4
+#define	DUPLICATE_ERROR	5
+#define	MISS_ERROR	6
+#define	MEM_ERROR	7
+#define	ENTRY_ERROR	8
+
+
+/*
+ * similar to ufs_acl.h: changed to char type for user commands (tar, cpio)
+ * Attribute types
+ */
+#define	UFSD_FREE	('0')	/* Free entry */
+#define	UFSD_ACL	('1')	/* Access Control Lists */
+#define	UFSD_DFACL	('2')	/* reserved for future use */
+#define	ACE_ACL		('3')	/* ace_t style acls */
+
+/*
+ * flag to [f]acl_get()
+ * controls whether a trivial acl should be returned.
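+ *
+ * A sketch of typical userland use (hypothetical path, error handling
+ * elided); with ACL_NO_TRIVIAL, acl_get() succeeds but returns a NULL
+ * acl_t pointer when the file carries only a trivial ACL:
+ *
+ *	acl_t *aclp;
+ *
+ *	if (acl_get("/export/home/file", ACL_NO_TRIVIAL, &aclp) == 0 &&
+ *	    aclp != NULL) {
+ *		char *text = acl_totext(aclp, ACL_COMPACT_FMT);
+ *		...
+ *		free(text);
+ *		acl_free(aclp);
+ *	}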
+ */ +#define ACL_NO_TRIVIAL 0x2 + + +/* + * Flags to control acl_totext() + */ + +#define ACL_APPEND_ID 0x1 /* append uid/gid to user/group entries */ +#define ACL_COMPACT_FMT 0x2 /* build ACL in ls -V format */ +#define ACL_NORESOLVE 0x4 /* don't do name service lookups */ +#define ACL_SID_FMT 0x8 /* use usersid/groupsid when appropriate */ + +/* + * Legacy aclcheck errors for aclent_t ACLs + */ +#define EACL_GRP_ERROR GRP_ERROR +#define EACL_USER_ERROR USER_ERROR +#define EACL_OTHER_ERROR OTHER_ERROR +#define EACL_CLASS_ERROR CLASS_ERROR +#define EACL_DUPLICATE_ERROR DUPLICATE_ERROR +#define EACL_MISS_ERROR MISS_ERROR +#define EACL_MEM_ERROR MEM_ERROR +#define EACL_ENTRY_ERROR ENTRY_ERROR + +#define EACL_INHERIT_ERROR 9 /* invalid inherit flags */ +#define EACL_FLAGS_ERROR 10 /* unknown flag value */ +#define EACL_PERM_MASK_ERROR 11 /* unknown permission */ +#define EACL_COUNT_ERROR 12 /* invalid acl count */ + +#define EACL_INVALID_SLOT 13 /* invalid acl slot */ +#define EACL_NO_ACL_ENTRY 14 /* Entry doesn't exist */ +#define EACL_DIFF_TYPE 15 /* acls aren't same type */ + +#define EACL_INVALID_USER_GROUP 16 /* need user/group name */ +#define EACL_INVALID_STR 17 /* invalid acl string */ +#define EACL_FIELD_NOT_BLANK 18 /* can't have blank field */ +#define EACL_INVALID_ACCESS_TYPE 19 /* invalid access type */ +#define EACL_UNKNOWN_DATA 20 /* Unrecognized data in ACL */ +#define EACL_MISSING_FIELDS 21 /* missing fields in acl */ + +#define EACL_INHERIT_NOTDIR 22 /* Need dir for inheritance */ + +extern int aclcheck(aclent_t *, int, int *); +extern int acltomode(aclent_t *, int, mode_t *); +extern int aclfrommode(aclent_t *, int, mode_t *); +extern int aclsort(int, int, aclent_t *); +extern char *acltotext(aclent_t *, int); +extern aclent_t *aclfromtext(char *, int *); +extern void acl_free(acl_t *); +extern int acl_get(const char *, int, acl_t **); +extern int facl_get(int, int, acl_t **); +extern int acl_set(const char *, acl_t *acl); +extern int facl_set(int, acl_t *acl); +extern int acl_strip(const char *, uid_t, gid_t, mode_t); +extern int acl_trivial(const char *); +extern char *acl_totext(acl_t *, int); +extern int acl_fromtext(const char *, acl_t **); +extern int acl_check(acl_t *, int); + +#else /* !defined(_KERNEL) */ + +extern void ksort(caddr_t, int, int, int (*)(void *, void *)); +extern int cmp2acls(void *, void *); + +#endif /* !defined(_KERNEL) */ + +extern int acl(const char *path, int cmd, int cnt, void *buf); +extern int facl(int fd, int cmd, int cnt, void *buf); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ACL_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/acl_impl.h b/sys/cddl/contrib/opensolaris/uts/common/sys/acl_impl.h new file mode 100644 index 000000000000..8718f5bcf63f --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/sys/acl_impl.h @@ -0,0 +1,61 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_ACL_IMPL_H +#define _SYS_ACL_IMPL_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * acl flags + * + * ACL_AUTO_INHERIT, ACL_PROTECTED and ACL_DEFAULTED + * flags can also be stored in this field. + */ +#define ACL_IS_TRIVIAL 0x10000 +#define ACL_IS_DIR 0x20000 + +typedef enum acl_type { + ACLENT_T = 0, + ACE_T = 1 +} zfs_acl_type_t; + +struct acl_info { + zfs_acl_type_t acl_type; /* style of acl */ + int acl_cnt; /* number of acl entries */ + int acl_entry_size; /* sizeof acl entry */ + int acl_flags; /* special flags about acl */ + void *acl_aclp; /* the acl */ +}; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ACL_IMPL_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/asm_linkage.h b/sys/cddl/contrib/opensolaris/uts/common/sys/asm_linkage.h new file mode 100644 index 000000000000..d694ea4844c3 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/sys/asm_linkage.h @@ -0,0 +1,87 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_ASM_LINKAGE_H +#define _SYS_ASM_LINKAGE_H + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _ASM /* The remainder of this file is only for assembly files */ + +/* + * make annoying differences in assembler syntax go away + */ + +#if defined(__i386__) || defined(__amd64__) + +#define ASM_ENTRY_ALIGN 16 + +#elif defined(__sparc64__) + +/* GCC uses 32-byte function alignment for UltraSPARC CPUs. */ +#define ASM_ENTRY_ALIGN 32 + +#else + +#error Unsupported architecture. + +#endif + +/* + * ENTRY provides the standard procedure entry code and an easy way to + * insert the calls to mcount for profiling. ENTRY_NP is identical, but + * never calls mcount. + */ +#define ENTRY(x) \ + .text; \ + .align ASM_ENTRY_ALIGN; \ + .globl x; \ + .type x, @function; \ +x: + +/* + * ALTENTRY provides for additional entry points. + */ +#define ALTENTRY(x) \ + .globl x; \ + .type x, @function; \ +x: + +/* + * SET_SIZE trails a function and set the size for the ELF symbol table. 
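+ *
+ * Typical use brackets each assembly routine between ENTRY() and
+ * SET_SIZE() (routine name and body are illustrative):
+ *
+ *	ENTRY(atomic_add_64)
+ *		lock; addq	%rsi, (%rdi)
+ *		ret
+ *	SET_SIZE(atomic_add_64)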
+ */ +#define SET_SIZE(x) \ + .size x, [.-x] + +#endif /* _ASM */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ASM_LINKAGE_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/avl.h b/sys/cddl/contrib/opensolaris/uts/common/sys/avl.h new file mode 100644 index 000000000000..fea46c90481d --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/sys/avl.h @@ -0,0 +1,326 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 2014 by Delphix. All rights reserved. + */ + +#ifndef _AVL_H +#define _AVL_H + +/* + * This is a private header file. Applications should not directly include + * this file. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/types.h> +#include <sys/avl_impl.h> + +/* + * This is a generic implementation of AVL trees for use in the Solaris kernel. + * The interfaces provide an efficient way of implementing an ordered set of + * data structures. + * + * AVL trees provide an alternative to using an ordered linked list. Using AVL + * trees will usually be faster, but they require more storage. An ordered + * linked list in general requires 2 pointers in each data structure. The + * AVL tree implementation uses 3 pointers. The following chart gives the + * approximate performance of operations with the different approaches: + * + * Operation Linked List AVL tree + * --------- -------- -------- + * lookup O(n) O(log(n)) + * + * insert 1 node constant constant + * + * delete 1 node constant between constant and O(log(n)) + * + * delete all nodes O(n) O(n) + * + * visit the next + * or prev node constant between constant and O(log(n)) + * + * + * The data structure nodes are anchored at an "avl_tree_t" (the equivalent + * of a list header) and the individual nodes will have a field of + * type "avl_node_t" (corresponding to list pointers). + * + * The type "avl_index_t" is used to indicate a position in the list for + * certain calls. + * + * The usage scenario is generally: + * + * 1. Create the list/tree with: avl_create() + * + * followed by any mixture of: + * + * 2a. Insert nodes with: avl_add(), or avl_find() and avl_insert() + * + * 2b. Visit elements with: + * avl_first() - returns the lowest valued node + * avl_last() - returns the highest valued node + * AVL_NEXT() - given a node go to next higher one + * AVL_PREV() - given a node go to previous lower one + * + * 2c. Find the node with the closest value either less than or greater + * than a given value with avl_nearest(). + * + * 2d. Remove individual nodes from the list/tree with avl_remove(). + * + * and finally when the list is being destroyed + * + * 3.
Use avl_destroy_nodes() to quickly process/free up any remaining nodes. + * Note that once you use avl_destroy_nodes(), you can no longer + * use any routine except avl_destroy_nodes() and avl_destroy(). + * + * 4. Use avl_destroy() to destroy the AVL tree itself. + * + * Any locking for multiple thread access is up to the user to provide, just + * as is needed for any linked list implementation. + */ + + +/* + * AVL comparator helpers + */ +#define AVL_ISIGN(a) (((a) > 0) - ((a) < 0)) +#define AVL_CMP(a, b) (((a) > (b)) - ((a) < (b))) +#define AVL_PCMP(a, b) \ + (((uintptr_t)(a) > (uintptr_t)(b)) - ((uintptr_t)(a) < (uintptr_t)(b))) + +/* + * Type used for the root of the AVL tree. + */ +typedef struct avl_tree avl_tree_t; + +/* + * The data nodes in the AVL tree must have a field of this type. + */ +typedef struct avl_node avl_node_t; + +/* + * An opaque type used to locate a position in the tree where a node + * would be inserted. + */ +typedef uintptr_t avl_index_t; + + +/* + * Direction constants used for avl_nearest(). + */ +#define AVL_BEFORE (0) +#define AVL_AFTER (1) + + +/* + * Prototypes + * + * Where not otherwise mentioned, "void *" arguments are a pointer to the + * user data structure which must contain a field of type avl_node_t. + * + * Also assume the user data structure looks like: + * struct my_type { + * ... + * avl_node_t my_link; + * ... + * }; + */ + +/* + * Initialize an AVL tree. Arguments are: + * + * tree - the tree to be initialized + * compar - function to compare two nodes; it must return exactly: -1, 0, or +1 + * -1 for <, 0 for ==, and +1 for > + * size - the value of sizeof(struct my_type) + * offset - the value of OFFSETOF(struct my_type, my_link) + */ +extern void avl_create(avl_tree_t *tree, + int (*compar) (const void *, const void *), size_t size, size_t offset); + + +/* + * Find a node with a matching value in the tree. Returns the matching node + * found. If not found, it returns NULL and then if "where" is not NULL it sets + * "where" for use with avl_insert() or avl_nearest(). + * + * node - node that has the value being looked for + * where - position for use with avl_nearest() or avl_insert(), may be NULL + */ +extern void *avl_find(avl_tree_t *tree, const void *node, avl_index_t *where); + +/* + * Insert a node into the tree. + * + * node - the node to insert + * where - position as returned from avl_find() + */ +extern void avl_insert(avl_tree_t *tree, void *node, avl_index_t where); + +/* + * Insert "new_data" in "tree" in the given "direction" either after + * or before the data "here". + * + * This might be useful for avl clients caching recently accessed + * data to avoid doing avl_find() again for insertion. + * + * new_data - new data to insert + * here - existing node in "tree" + * direction - either AVL_AFTER or AVL_BEFORE the data "here". + */ +extern void avl_insert_here(avl_tree_t *tree, void *new_data, void *here, + int direction); + + +/* + * Return the first or last valued node in the tree. Will return NULL + * if the tree is empty. + * + */ +extern void *avl_first(avl_tree_t *tree); +extern void *avl_last(avl_tree_t *tree); + + +/* + * Return the next or previous valued node in the tree. + * AVL_NEXT() will return NULL if at the last node. + * AVL_PREV() will return NULL if at the first node.
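+ *
+ * For example, a full in-order walk of a tree (an illustrative sketch,
+ * not part of the original header; "tree" and process() are placeholders):
+ *
+ *	struct my_type *p;
+ *
+ *	for (p = avl_first(tree); p != NULL; p = AVL_NEXT(tree, p))
+ *		process(p);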
+ * + * node - the node from which the next or previous node is found + */ +#define AVL_NEXT(tree, node) avl_walk(tree, node, AVL_AFTER) +#define AVL_PREV(tree, node) avl_walk(tree, node, AVL_BEFORE) + + +/* + * Find the node with the nearest value either greater or less than + * the value from a previous avl_find(). Returns the node or NULL if + * there isn't a matching one. + * + * where - position as returned from avl_find() + * direction - either AVL_BEFORE or AVL_AFTER + * + * EXAMPLE: get the greatest node that is less than a given value: + * + * avl_tree_t *tree; + * struct my_data look_for_value = {....}; + * struct my_data *node; + * struct my_data *less; + * avl_index_t where; + * + * node = avl_find(tree, &look_for_value, &where); + * if (node != NULL) + * less = AVL_PREV(tree, node); + * else + * less = avl_nearest(tree, where, AVL_BEFORE); + */ +extern void *avl_nearest(avl_tree_t *tree, avl_index_t where, int direction); + + +/* + * Add a single node to the tree. + * The node must not be in the tree, and it must not + * compare equal to any other node already in the tree. + * + * node - the node to add + */ +extern void avl_add(avl_tree_t *tree, void *node); + + +/* + * Remove a single node from the tree. The node must be in the tree. + * + * node - the node to remove + */ +extern void avl_remove(avl_tree_t *tree, void *node); + +/* + * Reinsert a node only if its order has changed relative to its nearest + * neighbors. To optimize performance avl_update_lt() checks only the previous + * node and avl_update_gt() checks only the next node. Use avl_update_lt() and + * avl_update_gt() only if you know the direction in which the order of the + * node may change. + */ +extern boolean_t avl_update(avl_tree_t *, void *); +extern boolean_t avl_update_lt(avl_tree_t *, void *); +extern boolean_t avl_update_gt(avl_tree_t *, void *); + +/* + * Swaps the contents of the two trees. + */ +extern void avl_swap(avl_tree_t *tree1, avl_tree_t *tree2); + +/* + * Return the number of nodes in the tree + */ +extern ulong_t avl_numnodes(avl_tree_t *tree); + +/* + * Return B_TRUE if there are zero nodes in the tree, B_FALSE otherwise. + */ +extern boolean_t avl_is_empty(avl_tree_t *tree); + +/* + * Used to destroy any remaining nodes in a tree. The cookie argument should + * be initialized to NULL before the first call. Returns a node that has been + * removed from the tree and may be free()'d. Returns NULL when the tree is + * empty. + * + * Once you call avl_destroy_nodes(), you can only continue calling it and + * finally avl_destroy(). No other AVL routines will be valid. + * + * cookie - a "void *" used to save state between calls to avl_destroy_nodes() + * + * EXAMPLE: + * avl_tree_t *tree; + * struct my_data *node; + * void *cookie; + * + * cookie = NULL; + * while ((node = avl_destroy_nodes(tree, &cookie)) != NULL) + * free(node); + * avl_destroy(tree); + */ +extern void *avl_destroy_nodes(avl_tree_t *tree, void **cookie); + + +/* + * Final destroy of an AVL tree.
Arguments are: + * + * tree - the empty tree to destroy + */ +extern void avl_destroy(avl_tree_t *tree); + + + +#ifdef __cplusplus +} +#endif + +#endif /* _AVL_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/avl_impl.h b/sys/cddl/contrib/opensolaris/uts/common/sys/avl_impl.h new file mode 100644 index 000000000000..620685f370d4 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/sys/avl_impl.h @@ -0,0 +1,164 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _AVL_IMPL_H +#define _AVL_IMPL_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * This is a private header file. Applications should not directly include + * this file. + */ + +#include <sys/types.h> + +#ifdef __cplusplus +extern "C" { +#endif + + +/* + * generic AVL tree implementation for kernel use + * + * There are 5 pieces of information stored for each node in an AVL tree + * + * pointer to less than child + * pointer to greater than child + * a pointer to the parent of this node + * an indication [0/1] of which child I am of my parent + * a "balance" (-1, 0, +1) indicating which child tree is taller + * + * Since they only need 3 bits, the last two fields are packed into the + * bottom bits of the parent pointer on 64 bit machines to save on space. 
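+ *
+ * As a worked example (illustrative; the parent address is assumed), a
+ * node whose parent sits at 0x1000, which is its parent's right child
+ * (child index 1) and whose balance is -1, stores in avl_pcb:
+ *
+ *	0x1000 | (1 << 2) | (-1 + 1) == 0x1004
+ *
+ * The parent pointer is recoverable because nodes are at least 8-byte
+ * aligned, so its low 3 bits are always zero.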
+ */ + +#ifndef _LP64 + +struct avl_node { + struct avl_node *avl_child[2]; /* left/right children */ + struct avl_node *avl_parent; /* this node's parent */ + unsigned short avl_child_index; /* my index in parent's avl_child[] */ + short avl_balance; /* balance value: -1, 0, +1 */ +}; + +#define AVL_XPARENT(n) ((n)->avl_parent) +#define AVL_SETPARENT(n, p) ((n)->avl_parent = (p)) + +#define AVL_XCHILD(n) ((n)->avl_child_index) +#define AVL_SETCHILD(n, c) ((n)->avl_child_index = (unsigned short)(c)) + +#define AVL_XBALANCE(n) ((n)->avl_balance) +#define AVL_SETBALANCE(n, b) ((n)->avl_balance = (short)(b)) + +#else /* _LP64 */ + +/* + * for 64 bit machines, avl_pcb contains parent pointer, balance and child_index + * values packed in the following manner: + * + * |63 3| 2 |1 0 | + * |-------------------------------------|-----------------|-------------| + * | avl_parent hi order bits | avl_child_index | avl_balance | + * | | | + 1 | + * |-------------------------------------|-----------------|-------------| + * + */ +struct avl_node { + struct avl_node *avl_child[2]; /* left/right children nodes */ + uintptr_t avl_pcb; /* parent, child_index, balance */ +}; + +/* + * macros to extract/set fields in avl_pcb + * + * pointer to the parent of the current node is the high order bits + */ +#define AVL_XPARENT(n) ((struct avl_node *)((n)->avl_pcb & ~7)) +#define AVL_SETPARENT(n, p) \ + ((n)->avl_pcb = (((n)->avl_pcb & 7) | (uintptr_t)(p))) + +/* + * index of this node in its parent's avl_child[]: bit #2 + */ +#define AVL_XCHILD(n) (((n)->avl_pcb >> 2) & 1) +#define AVL_SETCHILD(n, c) \ + ((n)->avl_pcb = (uintptr_t)(((n)->avl_pcb & ~4) | ((c) << 2))) + +/* + * balance indication for a node, lowest 2 bits. A valid balance is + * -1, 0, or +1, and is encoded by adding 1 to the value to get the + * unsigned values of 0, 1, 2. + */ +#define AVL_XBALANCE(n) ((int)(((n)->avl_pcb & 3) - 1)) +#define AVL_SETBALANCE(n, b) \ + ((n)->avl_pcb = (uintptr_t)((((n)->avl_pcb & ~3) | ((b) + 1)))) + +#endif /* _LP64 */ + + + +/* + * switch between a node and data pointer for a given tree + * the value of "o" is tree->avl_offset + */ +#define AVL_NODE2DATA(n, o) ((void *)((uintptr_t)(n) - (o))) +#define AVL_DATA2NODE(d, o) ((struct avl_node *)((uintptr_t)(d) + (o))) + + + +/* + * macros used to create/access an avl_index_t + */ +#define AVL_INDEX2NODE(x) ((avl_node_t *)((x) & ~1)) +#define AVL_INDEX2CHILD(x) ((x) & 1) +#define AVL_MKINDEX(n, c) ((avl_index_t)(n) | (c)) + + +/* + * The tree structure. The fields avl_root, avl_compar, and avl_offset come + * first since they are needed for avl_find(). We want them to fit into + * a single 64 byte cache line to make avl_find() as fast as possible. 
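+ * (On an LP64 kernel those three leading fields occupy 24 bytes, and the
+ * whole structure 40 bytes, so it fits comfortably in one such line.)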
+ */ +struct avl_tree { + struct avl_node *avl_root; /* root node in tree */ + int (*avl_compar)(const void *, const void *); + size_t avl_offset; /* offsetof(type, avl_link_t field) */ + ulong_t avl_numnodes; /* number of nodes in the tree */ + size_t avl_size; /* sizeof user type struct */ +}; + + +/* + * This will only be used via AVL_NEXT() or AVL_PREV() + */ +extern void *avl_walk(struct avl_tree *, void *, int); + +#ifdef __cplusplus +} +#endif + +#endif /* _AVL_IMPL_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/bitmap.h b/sys/cddl/contrib/opensolaris/uts/common/sys/bitmap.h new file mode 100644 index 000000000000..485abc48f1a2 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/sys/bitmap.h @@ -0,0 +1,198 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 2014 by Delphix. All rights reserved. + * Copyright 2017 RackTop Systems. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + + +#ifndef _SYS_BITMAP_H +#define _SYS_BITMAP_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/feature_tests.h> +#if defined(__GNUC__) && defined(_ASM_INLINES) && \ + (defined(__i386) || defined(__amd64)) +#include <asm/bitmap.h> +#endif + +/* + * Operations on bitmaps of arbitrary size + * A bitmap is a vector of 1 or more ulong_t's. + * The user of the package is responsible for range checks and keeping + * track of sizes. + */ + +#ifdef _LP64 +#define BT_ULSHIFT 6 /* log base 2 of BT_NBIPUL, to extract word index */ +#define BT_ULSHIFT32 5 /* log base 2 of BT_NBIPUL32, to extract word index */ +#else +#define BT_ULSHIFT 5 /* log base 2 of BT_NBIPUL, to extract word index */ +#endif + +#define BT_NBIPUL (1 << BT_ULSHIFT) /* n bits per ulong_t */ +#define BT_ULMASK (BT_NBIPUL - 1) /* to extract bit index */ + +#ifdef _LP64 +#define BT_NBIPUL32 (1 << BT_ULSHIFT32) /* n bits per ulong_t */ +#define BT_ULMASK32 (BT_NBIPUL32 - 1) /* to extract bit index */ +#define BT_ULMAXMASK 0xffffffffffffffff /* used by bt_getlowbit */ +#else +#define BT_ULMAXMASK 0xffffffff +#endif + +/* + * bitmap is a ulong_t *, bitindex an index_t + * + * The macros BT_WIM and BT_BIW are internal; there is no need + * for users of this package to use them.
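+ *
+ * As a worked example (illustrative): on a 64-bit kernel, where
+ * BT_ULSHIFT is 6, bit index 68 lives in word bitmap[68 >> 6], i.e.
+ * bitmap[1], under the in-word mask 1UL << (68 & BT_ULMASK), i.e. 1UL << 4.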
+ */ + +/* + * word in map + */ +#define BT_WIM(bitmap, bitindex) \ + ((bitmap)[(bitindex) >> BT_ULSHIFT]) +/* + * bit in word + */ +#define BT_BIW(bitindex) \ + (1UL << ((bitindex) & BT_ULMASK)) + +#ifdef _LP64 +#define BT_WIM32(bitmap, bitindex) \ + ((bitmap)[(bitindex) >> BT_ULSHIFT32]) + +#define BT_BIW32(bitindex) \ + (1UL << ((bitindex) & BT_ULMASK32)) +#endif + +/* + * These are public macros + * + * BT_BITOUL == n bits to n ulong_t's + */ +#define BT_BITOUL(nbits) \ + (((nbits) + BT_NBIPUL - 1l) / BT_NBIPUL) +#define BT_SIZEOFMAP(nbits) \ + (BT_BITOUL(nbits) * sizeof (ulong_t)) +#define BT_TEST(bitmap, bitindex) \ + ((BT_WIM((bitmap), (bitindex)) & BT_BIW(bitindex)) ? 1 : 0) +#define BT_SET(bitmap, bitindex) \ + { BT_WIM((bitmap), (bitindex)) |= BT_BIW(bitindex); } +#define BT_CLEAR(bitmap, bitindex) \ + { BT_WIM((bitmap), (bitindex)) &= ~BT_BIW(bitindex); } + +#ifdef _LP64 +#define BT_BITOUL32(nbits) \ + (((nbits) + BT_NBIPUL32 - 1l) / BT_NBIPUL32) +#define BT_SIZEOFMAP32(nbits) \ + (BT_BITOUL32(nbits) * sizeof (uint_t)) +#define BT_TEST32(bitmap, bitindex) \ + ((BT_WIM32((bitmap), (bitindex)) & BT_BIW32(bitindex)) ? 1 : 0) +#define BT_SET32(bitmap, bitindex) \ + { BT_WIM32((bitmap), (bitindex)) |= BT_BIW32(bitindex); } +#define BT_CLEAR32(bitmap, bitindex) \ + { BT_WIM32((bitmap), (bitindex)) &= ~BT_BIW32(bitindex); } +#endif /* _LP64 */ + + +/* + * BIT_ONLYONESET is a private macro not designed for bitmaps of + * arbitrary size. u must be an unsigned integer/long. It returns + * true if one and only one bit is set in u. + */ +#define BIT_ONLYONESET(u) \ + ((((u) == 0) ? 0 : ((u) & ((u) - 1)) == 0)) + +#if (defined(_KERNEL) || defined(_FAKE_KERNEL)) && !defined(_ASM) +#include <sys/atomic.h> + +/* + * return next available bit index from map with specified number of bits + */ +extern index_t bt_availbit(ulong_t *bitmap, size_t nbits); +/* + * find the highest order bit that is on, and is within or below + * the word specified by wx + */ +extern int bt_gethighbit(ulong_t *mapp, int wx); +extern int bt_range(ulong_t *bitmap, size_t *pos1, size_t *pos2, + size_t end_pos); +/* + * Find highest and lowest one bit set. + * Returns bit number + 1 of bit that is set, otherwise returns 0. + * Low order bit is 0, high order bit is 31 (63 for the 64-bit variants). + */ +extern int highbit(ulong_t); +extern int highbit64(uint64_t); +extern int lowbit(ulong_t); +extern int bt_getlowbit(ulong_t *bitmap, size_t start, size_t stop); +extern void bt_copy(ulong_t *, ulong_t *, ulong_t); + +/* + * find the parity + */ +extern int odd_parity(ulong_t); + +/* + * Atomically set/clear bits + * Atomic exclusive operations will set "result" to "-1" + * if the bit is already set/cleared. "result" will be set + * to 0 otherwise. + */ +#define BT_ATOMIC_SET(bitmap, bitindex) \ + { atomic_or_ulong(&(BT_WIM(bitmap, bitindex)), BT_BIW(bitindex)); } +#define BT_ATOMIC_CLEAR(bitmap, bitindex) \ + { atomic_and_ulong(&(BT_WIM(bitmap, bitindex)), ~BT_BIW(bitindex)); } + +#define BT_ATOMIC_SET_EXCL(bitmap, bitindex, result) \ + { result = atomic_set_long_excl(&(BT_WIM(bitmap, bitindex)), \ + (bitindex) % BT_NBIPUL); } +#define BT_ATOMIC_CLEAR_EXCL(bitmap, bitindex, result) \ + { result = atomic_clear_long_excl(&(BT_WIM(bitmap, bitindex)), \ + (bitindex) % BT_NBIPUL); } + +/* + * Extracts bits between index h (high, inclusive) and l (low, inclusive) from + * u, which must be an unsigned integer.
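+ *
+ * For example (illustrative): BITX(0xA5, 7, 4) == 0xA and
+ * BITX(0xA5, 3, 0) == 0x5.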
+ */ +#define BITX(u, h, l) (((u) >> (l)) & ((1LU << ((h) - (l) + 1LU)) - 1LU)) + +#endif /* (_KERNEL || _FAKE_KERNEL) && !_ASM */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_BITMAP_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/callb.h b/sys/cddl/contrib/opensolaris/uts/common/sys/callb.h new file mode 100644 index 000000000000..43f14ebb369d --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/sys/callb.h @@ -0,0 +1,215 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_CALLB_H +#define _SYS_CALLB_H + +#include <sys/kcondvar.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * definitions of callback classes (c_class) + * + * Callbacks belong in the same class if (1) their callback routines + * do the same kind of processing (ideally, using the same callback function) + * and (2) they can/should be executed at the same time in a cpr + * suspend/resume operation. + * + * Note: The DAEMON class, in particular, is for stopping kernel threads + * and nothing else. The CALLB_* macros below should be used to deal + * with kernel threads, and the callback function should be callb_generic_cpr. + * Another idiosyncrasy of the DAEMON class is that if a suspend operation + * fails, some of the callback functions may be called with the RESUME + * code even though they were never called with SUSPEND. Not a problem + * currently, but see bug 4201851. + */ +#define CB_CL_CPR_DAEMON 0 +#define CB_CL_CPR_VM 1 +#define CB_CL_CPR_CALLOUT 2 +#define CB_CL_CPR_OBP 3 +#define CB_CL_CPR_FB 4 +#define CB_CL_PANIC 5 +#define CB_CL_CPR_RPC 6 +#define CB_CL_CPR_PROMPRINTF 7 +#define CB_CL_UADMIN 8 +#define CB_CL_CPR_PM 9 +#define CB_CL_HALT 10 +#define CB_CL_CPR_DMA 11 +#define CB_CL_CPR_POST_USER 12 +#define CB_CL_UADMIN_PRE_VFS 13 +#define CB_CL_MDBOOT CB_CL_UADMIN +#define CB_CL_ENTER_DEBUGGER 14 +#define CB_CL_CPR_POST_KERNEL 15 +#define CB_CL_CPU_DEEP_IDLE 16 +#define NCBCLASS 17 /* CHANGE ME if classes are added/removed */ + +/* + * CB_CL_CPR_DAEMON class specific definitions are given below: + */ + +/* + * code for CPR callb_execute_class + */ +#define CB_CODE_CPR_CHKPT 0 +#define CB_CODE_CPR_RESUME 1 + +typedef void * callb_id_t; +/* + * Per kernel thread structure for CPR daemon callbacks. + * Must be protected by either an existing lock in the daemon or + * a new lock created for such a purpose.
+ */ +typedef struct callb_cpr { + kmutex_t *cc_lockp; /* lock to protect this struct */ + char cc_events; /* various events for CPR */ + callb_id_t cc_id; /* callb id address */ + kcondvar_t cc_callb_cv; /* cv for callback waiting */ + kcondvar_t cc_stop_cv; /* cv to checkpoint block */ +} callb_cpr_t; + +/* + * cc_events definitions + */ +#define CALLB_CPR_START 1 /* a checkpoint request's started */ +#define CALLB_CPR_SAFE 2 /* thread is safe for CPR */ +#define CALLB_CPR_ALWAYS_SAFE 4 /* thread is ALWAYS safe for CPR */ + +/* + * Used when checking that all kernel threads are stopped. + */ +#define CALLB_MAX_RETRY 3 /* when waiting for kthread to sleep */ +#define CALLB_THREAD_DELAY 10 /* ticks allowed to reach sleep */ +#define CPR_KTHREAD_TIMEOUT_SEC 90 /* secs before callback times out -- */ + /* due to pwr mgmt of disks, make -- */ + /* big enough for worst spinup time */ + +#ifdef _KERNEL +/* + * + * CALLB_CPR_INIT macro is used by kernel threads to add their entry to + * the callback table and perform other initialization. It automatically + * adds the thread as being in the callback class CB_CL_CPR_DAEMON. + * + * cp - ptr to the callb_cpr_t structure for this kernel thread + * + * lockp - pointer to mutex protecting the callb_cpr_t struct + * + * func - pointer to the callback function for this kernel thread. + * It has the prototype boolean_t <func>(void *arg, int code) + * where: arg - ptr to the callb_cpr_t structure + * code - not used for this type of callback + * returns: B_TRUE if successful; B_FALSE if unsuccessful. + * + * name - a string giving the name of the kernel thread + * + * Note: lockp is the lock to protect the callb_cpr_t (cp) structure + * later on. No lock needs to be held for this initialization. + */ +#define CALLB_CPR_INIT(cp, lockp, func, name) { \ + strlcpy(curthread->td_name, (name), \ + sizeof(curthread->td_name)); \ + bzero((caddr_t)(cp), sizeof (callb_cpr_t)); \ + (cp)->cc_lockp = lockp; \ + (cp)->cc_id = callb_add(func, (void *)(cp), \ + CB_CL_CPR_DAEMON, name); \ + cv_init(&(cp)->cc_callb_cv, NULL, CV_DEFAULT, NULL); \ + cv_init(&(cp)->cc_stop_cv, NULL, CV_DEFAULT, NULL); \ + } + +#ifndef __lock_lint +#define CALLB_CPR_ASSERT(cp) ASSERT(MUTEX_HELD((cp)->cc_lockp)); +#else +#define CALLB_CPR_ASSERT(cp) +#endif +/* + * Some threads (like the idle threads) do not adhere to the callback + * protocol and are always considered safe. Such threads must never exit. + * They register their presence by calling this macro during their + * initialization. + * + * Args: + * t - thread pointer of the client kernel thread + * name - a string giving the name of the kernel thread + */ +#define CALLB_CPR_INIT_SAFE(t, name) { \ + (void) callb_add_thread(callb_generic_cpr_safe, \ + (void *) &callb_cprinfo_safe, CB_CL_CPR_DAEMON, \ + name, t); \ + } +/* + * The lock to protect cp's content must be held before + * calling the following two macros. + * + * Any code region between CALLB_CPR_SAFE_BEGIN and CALLB_CPR_SAFE_END + * is safe for checkpoint/resume. + */ +#define CALLB_CPR_SAFE_BEGIN(cp) { \ + CALLB_CPR_ASSERT(cp) \ + (cp)->cc_events |= CALLB_CPR_SAFE; \ + if ((cp)->cc_events & CALLB_CPR_START) \ + cv_signal(&(cp)->cc_callb_cv); \ + } +#define CALLB_CPR_SAFE_END(cp, lockp) { \ + CALLB_CPR_ASSERT(cp) \ + while ((cp)->cc_events & CALLB_CPR_START) \ + cv_wait(&(cp)->cc_stop_cv, lockp); \ + (cp)->cc_events &= ~CALLB_CPR_SAFE; \ + } +/* + * cv_destroy is a nop right now but may be needed in the future.
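+ *
+ * Taken together, a CPR-aware daemon loop typically has this shape
+ * (an illustrative sketch, not part of the original header; my_lock,
+ * my_cv and the work itself are placeholders):
+ *
+ *	callb_cpr_t cprinfo;
+ *
+ *	CALLB_CPR_INIT(&cprinfo, &my_lock, callb_generic_cpr, "my_daemon");
+ *	mutex_enter(&my_lock);
+ *	for (;;) {
+ *		CALLB_CPR_SAFE_BEGIN(&cprinfo);
+ *		cv_wait(&my_cv, &my_lock);
+ *		CALLB_CPR_SAFE_END(&cprinfo, &my_lock);
+ *		(do one unit of work with my_lock held)
+ *	}
+ *	CALLB_CPR_EXIT(&cprinfo);	(note: also drops my_lock)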
+ */ +#define CALLB_CPR_EXIT(cp) { \ + CALLB_CPR_ASSERT(cp) \ + (cp)->cc_events |= CALLB_CPR_SAFE; \ + if ((cp)->cc_events & CALLB_CPR_START) \ + cv_signal(&(cp)->cc_callb_cv); \ + mutex_exit((cp)->cc_lockp); \ + (void) callb_delete((cp)->cc_id); \ + cv_destroy(&(cp)->cc_callb_cv); \ + cv_destroy(&(cp)->cc_stop_cv); \ + } + +extern callb_cpr_t callb_cprinfo_safe; +extern callb_id_t callb_add(boolean_t (*)(void *, int), void *, int, char *); +extern callb_id_t callb_add_thread(boolean_t (*)(void *, int), + void *, int, char *, kthread_id_t); +extern int callb_delete(callb_id_t); +extern void callb_execute(callb_id_t, int); +extern void *callb_execute_class(int, int); +extern boolean_t callb_generic_cpr(void *, int); +extern boolean_t callb_generic_cpr_safe(void *, int); +extern boolean_t callb_is_stopped(kthread_id_t, caddr_t *); +extern void callb_lock_table(void); +extern void callb_unlock_table(void); +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_CALLB_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/ccompile.h b/sys/cddl/contrib/opensolaris/uts/common/sys/ccompile.h new file mode 100644 index 000000000000..c9857b086575 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/sys/ccompile.h @@ -0,0 +1,127 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_CCOMPILE_H +#define _SYS_CCOMPILE_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * This file contains definitions designed to enable different compilers + * to be used harmoniously on Solaris systems. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Allow for version tests for compiler bugs and features. + */ +#if defined(__GNUC__) +#define __GNUC_VERSION \ + (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) +#else +#define __GNUC_VERSION 0 +#endif + +#if defined(__ATTRIBUTE_IMPLEMENTED) || defined(__GNUC__) + +/* + * analogous to lint's PRINTFLIKEn + */ +#define __sun_attr___PRINTFLIKE__(__n) \ + __attribute__((__format__(printf, __n, (__n)+1))) +#define __sun_attr___VPRINTFLIKE__(__n) \ + __attribute__((__format__(printf, __n, 0))) + +/* + * Handle the kernel printf routines that can take '%b' too + */ +#if __GNUC_VERSION < 30402 +/* + * XX64 at least this doesn't work correctly yet with 3.4.1 anyway! 
+ */ +#define __sun_attr___KPRINTFLIKE__ __sun_attr___PRINTFLIKE__ +#define __sun_attr___KVPRINTFLIKE__ __sun_attr___VPRINTFLIKE__ +#else +#define __sun_attr___KPRINTFLIKE__(__n) \ + __attribute__((__format__(cmn_err, __n, (__n)+1))) +#define __sun_attr___KVPRINTFLIKE__(__n) \ + __attribute__((__format__(cmn_err, __n, 0))) +#endif + +/* + * This one's pretty obvious -- the function never returns + */ +#define __sun_attr___noreturn__ __attribute__((__noreturn__)) + + +/* + * This is an appropriate label for functions that do not + * modify their arguments, e.g. strlen() + */ +#define __sun_attr___pure__ __attribute__((__pure__)) + +/* + * This is a stronger form of __pure__. Can be used for functions + * that do not modify their arguments and don't depend on global + * memory. + */ +#define __sun_attr___const__ __attribute__((__const__)) + +/* + * structure packing like #pragma pack(1) + */ +#define __sun_attr___packed__ __attribute__((__packed__)) + +#define ___sun_attr_inner(__a) __sun_attr_##__a +#define __sun_attr__(__a) ___sun_attr_inner __a + +#else /* __ATTRIBUTE_IMPLEMENTED || __GNUC__ */ + +#define __sun_attr__(__a) + +#endif /* __ATTRIBUTE_IMPLEMENTED || __GNUC__ */ + +/* + * Shorthand versions for readability + */ + +#define __PRINTFLIKE(__n) __sun_attr__((__PRINTFLIKE__(__n))) +#define __VPRINTFLIKE(__n) __sun_attr__((__VPRINTFLIKE__(__n))) +#define __KPRINTFLIKE(__n) __sun_attr__((__KPRINTFLIKE__(__n))) +#define __KVPRINTFLIKE(__n) __sun_attr__((__KVPRINTFLIKE__(__n))) +#define __NORETURN __sun_attr__((__noreturn__)) +#define __CONST __sun_attr__((__const__)) +#define __PURE __sun_attr__((__pure__)) + + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_CCOMPILE_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/cmn_err.h b/sys/cddl/contrib/opensolaris/uts/common/sys/cmn_err.h new file mode 100644 index 000000000000..e710d8e5c30b --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/sys/cmn_err.h @@ -0,0 +1,128 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + + +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _SYS_CMN_ERR_H +#define _SYS_CMN_ERR_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#if defined(_KERNEL) && !defined(_ASM) +#include <sys/va_list.h> +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/* Common error handling severity levels */ + +#define CE_CONT 0 /* continuation */ +#define CE_NOTE 1 /* notice */ +#define CE_WARN 2 /* warning */ +#define CE_PANIC 3 /* panic */ +#define CE_IGNORE 4 /* print nothing */ + +#ifndef _ASM + +#ifdef _KERNEL + +/*PRINTFLIKE2*/ +extern void cmn_err(int, const char *, ...) + __KPRINTFLIKE(2); +#pragma rarely_called(cmn_err) + +extern void vzcmn_err(zoneid_t, int, const char *, __va_list) + __KVPRINTFLIKE(3); +#pragma rarely_called(vzcmn_err) + +extern void vcmn_err(int, const char *, __va_list) + __KVPRINTFLIKE(2); +#pragma rarely_called(vcmn_err) + +/*PRINTFLIKE3*/ +extern void zcmn_err(zoneid_t, int, const char *, ...) + __KPRINTFLIKE(3); +#pragma rarely_called(zcmn_err) + +/*PRINTFLIKE1*/ +extern void printf(const char *, ...) + __KPRINTFLIKE(1); +#pragma rarely_called(printf) + +extern void vzprintf(zoneid_t, const char *, __va_list) + __KVPRINTFLIKE(2); +#pragma rarely_called(vzprintf) + +/*PRINTFLIKE2*/ +extern void zprintf(zoneid_t, const char *, ...) + __KPRINTFLIKE(2); +#pragma rarely_called(zprintf) + +extern void vprintf(const char *, __va_list) + __KVPRINTFLIKE(1); +#pragma rarely_called(vprintf) + +/*PRINTFLIKE1*/ +extern void uprintf(const char *, ...) + __KPRINTFLIKE(1); +#pragma rarely_called(uprintf) + +extern void vuprintf(const char *, __va_list) + __KVPRINTFLIKE(1); +#pragma rarely_called(vuprintf) + +/*PRINTFLIKE3*/ +extern size_t snprintf(char *, size_t, const char *, ...) + __KPRINTFLIKE(3); +extern size_t vsnprintf(char *, size_t, const char *, __va_list) + __KVPRINTFLIKE(3); +/*PRINTFLIKE2*/ +extern char *sprintf(char *, const char *, ...) + __KPRINTFLIKE(2); +extern char *vsprintf(char *, const char *, __va_list) + __KVPRINTFLIKE(2); + +/*PRINTFLIKE1*/ +extern void panic(const char *, ...) + __KPRINTFLIKE(1) __NORETURN; +#pragma rarely_called(panic) + +extern void vpanic(const char *, __va_list) + __KVPRINTFLIKE(1) __NORETURN; +#pragma rarely_called(vpanic) + +#endif /* _KERNEL */ +#endif /* !_ASM */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_CMN_ERR_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/compress.h b/sys/cddl/contrib/opensolaris/uts/common/sys/compress.h new file mode 100644 index 000000000000..3d79d9511092 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/sys/compress.h @@ -0,0 +1,46 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 1998 by Sun Microsystems, Inc. + * All rights reserved. 
+ */ + +#ifndef _SYS_COMPRESS_H +#define _SYS_COMPRESS_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> + +#ifdef __cplusplus +extern "C" { +#endif + +extern size_t compress(void *, void *, size_t); +extern size_t decompress(void *, void *, size_t, size_t); +extern uint32_t checksum32(void *, size_t); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_COMPRESS_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/cpupart.h b/sys/cddl/contrib/opensolaris/uts/common/sys/cpupart.h new file mode 100644 index 000000000000..a91c989bbff1 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/sys/cpupart.h @@ -0,0 +1,158 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 1996, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2017 RackTop Systems. + */ + +#ifndef _SYS_CPUPART_H +#define _SYS_CPUPART_H + +#include <sys/types.h> +#include <sys/processor.h> +#include <sys/cpuvar.h> +#include <sys/disp.h> +#include <sys/pset.h> +#include <sys/lgrp.h> +#include <sys/lgrp_user.h> +#include <sys/pg.h> +#include <sys/bitset.h> +#include <sys/time.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#if defined(_KERNEL) || defined(_FAKE_KERNEL) + +typedef int cpupartid_t; + +/* + * Special partition id. + */ +#define CP_DEFAULT 0 + +/* + * Flags for cpupart_list() + */ +#define CP_ALL 0 /* return all cpu partitions */ +#define CP_NONEMPTY 1 /* return only non-empty ones */ + +typedef struct cpupart { + disp_t cp_kp_queue; /* partition-wide kpreempt queue */ + cpupartid_t cp_id; /* partition ID */ + int cp_ncpus; /* number of online processors */ + struct cpupart *cp_next; /* next partition in list */ + struct cpupart *cp_prev; /* previous partition in list */ + struct cpu *cp_cpulist; /* processor list */ + struct kstat *cp_kstat; /* per-partition statistics */ + + /* + * cp_nrunnable and cp_nrunning are used to calculate load average. + */ + uint_t cp_nrunnable; /* current # of runnable threads */ + uint_t cp_nrunning; /* current # of running threads */ + + /* + * cp_updates, cp_nrunnable_cum, cp_nwaiting_cum, and cp_hp_avenrun + * are used to generate kstat information on an as-needed basis. + */ + uint64_t cp_updates; /* number of statistics updates */ + uint64_t cp_nrunnable_cum; /* cum. # of runnable threads */ + uint64_t cp_nwaiting_cum; /* cum. 
# of waiting threads */ + + struct loadavg_s cp_loadavg; /* cpupart loadavg */ + + klgrpset_t cp_lgrpset; /* set of lgroups on which this */ + /* partition has cpus */ + lpl_t *cp_lgrploads; /* table of load averages for this */ + /* partition, indexed by lgrp ID */ + int cp_nlgrploads; /* size of cp_lgrploads table */ + uint64_t cp_hp_avenrun[3]; /* high-precision load average */ + uint_t cp_attr; /* bitmask of attributes */ + lgrp_gen_t cp_gen; /* generation number */ + lgrp_id_t cp_lgrp_hint; /* last home lgroup chosen */ + bitset_t cp_cmt_pgs; /* CMT PGs represented */ + bitset_t cp_haltset; /* halted CPUs */ +} cpupart_t; + +typedef struct cpupart_kstat { + kstat_named_t cpk_updates; /* number of updates */ + kstat_named_t cpk_runnable; /* cum # of runnable threads */ + kstat_named_t cpk_waiting; /* cum # waiting for I/O */ + kstat_named_t cpk_ncpus; /* current # of CPUs */ + kstat_named_t cpk_avenrun_1min; /* 1-minute load average */ + kstat_named_t cpk_avenrun_5min; /* 5-minute load average */ + kstat_named_t cpk_avenrun_15min; /* 15-minute load average */ +} cpupart_kstat_t; + +/* + * Macro to obtain the maximum run priority for the global queue associated + * with given cpu partition. + */ +#define CP_MAXRUNPRI(cp) ((cp)->cp_kp_queue.disp_maxrunpri) + +/* + * This macro is used to determine if the given thread must surrender + * CPU to higher priority runnable threads on one of its dispatch queues. + * This should really be defined in <sys/disp.h> but it is not because + * including <sys/cpupart.h> there would cause recursive includes. + */ +#define DISP_MUST_SURRENDER(t) \ + ((DISP_MAXRUNPRI(t) > DISP_PRIO(t)) || \ + (CP_MAXRUNPRI(t->t_cpupart) > DISP_PRIO(t))) + +extern cpupart_t cp_default; +extern cpupart_t *cp_list_head; +extern uint_t cp_numparts; +extern uint_t cp_numparts_nonempty; + +/* + * Each partition contains a bitset that indicates which CPUs are halted and + * which ones are running. Given the growing number of CPUs in current and + * future platforms, it's important to fanout each CPU within its partition's + * haltset to prevent contention due to false sharing. The fanout factor + * is platform specific, and declared accordingly. + */ +extern uint_t cp_haltset_fanout; + +extern void cpupart_initialize_default(); +extern cpupart_t *cpupart_find(psetid_t); +extern int cpupart_create(psetid_t *); +extern int cpupart_destroy(psetid_t); +extern psetid_t cpupart_query_cpu(cpu_t *); +extern int cpupart_attach_cpu(psetid_t, cpu_t *, int); +extern int cpupart_get_cpus(psetid_t *, processorid_t *, uint_t *); +extern int cpupart_bind_thread(kthread_id_t, psetid_t, int, void *, + void *); +extern void cpupart_kpqalloc(pri_t); +extern int cpupart_get_loadavg(psetid_t, int *, int); +extern uint_t cpupart_list(psetid_t *, uint_t, int); +extern int cpupart_setattr(psetid_t, uint_t); +extern int cpupart_getattr(psetid_t, uint_t *); + +#endif /* _KERNEL || _FAKE_KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_CPUPART_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/cpuvar.h b/sys/cddl/contrib/opensolaris/uts/common/sys/cpuvar.h new file mode 100644 index 000000000000..f526c85872e7 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/sys/cpuvar.h @@ -0,0 +1,830 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2014 Igor Kozhukhov <ikozhukhov@gmail.com>. + * Copyright 2017 RackTop Systems. + */ + +#ifndef _SYS_CPUVAR_H +#define _SYS_CPUVAR_H + +#include <sys/thread.h> +#include <sys/sysinfo.h> /* has cpu_stat_t definition */ +#include <sys/disp.h> +#include <sys/processor.h> +#include <sys/kcpc.h> /* has kcpc_ctx_t definition */ + +#include <sys/loadavg.h> +#if (defined(_KERNEL) || defined(_KMEMUSER)) && defined(_MACHDEP) +#include <sys/machcpuvar.h> +#endif + +#include <sys/types.h> +#include <sys/file.h> +#include <sys/bitmap.h> +#include <sys/rwlock.h> +#include <sys/msacct.h> +#if defined(__GNUC__) && defined(_ASM_INLINES) && defined(_KERNEL) && \ + (defined(__i386) || defined(__amd64)) +#include <asm/cpuvar.h> +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +struct squeue_set_s; + +#define CPU_CACHE_COHERENCE_SIZE 64 + +/* + * For fast event tracing. + */ +struct ftrace_record; +typedef struct ftrace_data { + int ftd_state; /* ftrace flags */ + kmutex_t ftd_unused; /* ftrace buffer lock, unused */ + struct ftrace_record *ftd_cur; /* current record */ + struct ftrace_record *ftd_first; /* first record */ + struct ftrace_record *ftd_last; /* last record */ +} ftrace_data_t; + +struct cyc_cpu; +struct nvlist; + +/* + * Per-CPU data. + * + * Be careful adding new members: if they are not the same in all modules (e.g. + * change size depending on a #define), CTF uniquification can fail to work + * properly. Furthermore, this is transitive in that it applies recursively to + * all types pointed to by cpu_t. + */ +typedef struct cpu { + processorid_t cpu_id; /* CPU number */ + processorid_t cpu_seqid; /* sequential CPU id (0..ncpus-1) */ + volatile cpu_flag_t cpu_flags; /* flags indicating CPU state */ + struct cpu *cpu_self; /* pointer to itself */ + kthread_t *cpu_thread; /* current thread */ + kthread_t *cpu_idle_thread; /* idle thread for this CPU */ + kthread_t *cpu_pause_thread; /* pause thread for this CPU */ + klwp_id_t cpu_lwp; /* current lwp (if any) */ + klwp_id_t cpu_fpowner; /* currently loaded fpu owner */ + struct cpupart *cpu_part; /* partition with this CPU */ + struct lgrp_ld *cpu_lpl; /* pointer to this cpu's load */ + int cpu_cache_offset; /* see kmem.c for details */ + + /* + * Links to other CPUs. 
It is safe to walk these lists if + * one of the following is true: + * - cpu_lock held + * - preemption disabled via kpreempt_disable + * - PIL >= DISP_LEVEL + * - acting thread is an interrupt thread + * - all other CPUs are paused + */ + struct cpu *cpu_next; /* next existing CPU */ + struct cpu *cpu_prev; /* prev existing CPU */ + struct cpu *cpu_next_onln; /* next online (enabled) CPU */ + struct cpu *cpu_prev_onln; /* prev online (enabled) CPU */ + struct cpu *cpu_next_part; /* next CPU in partition */ + struct cpu *cpu_prev_part; /* prev CPU in partition */ + struct cpu *cpu_next_lgrp; /* next CPU in latency group */ + struct cpu *cpu_prev_lgrp; /* prev CPU in latency group */ + struct cpu *cpu_next_lpl; /* next CPU in lgrp partition */ + struct cpu *cpu_prev_lpl; + + struct cpu_pg *cpu_pg; /* cpu's processor groups */ + + void *cpu_reserved[4]; /* reserved for future use */ + + /* + * Scheduling variables. + */ + disp_t *cpu_disp; /* dispatch queue data */ + /* + * Note that cpu_disp is set before the CPU is added to the system + * and is never modified. Hence, no additional locking is needed + * beyond what's necessary to access the cpu_t structure. + */ + char cpu_runrun; /* scheduling flag - set to preempt */ + char cpu_kprunrun; /* force kernel preemption */ + pri_t cpu_chosen_level; /* priority at which cpu */ + /* was chosen for scheduling */ + kthread_t *cpu_dispthread; /* thread selected for dispatch */ + disp_lock_t cpu_thread_lock; /* dispatcher lock on current thread */ + uint8_t cpu_disp_flags; /* flags used by dispatcher */ + /* + * The following field is updated whenever the cpu_dispthread + * changes, and also in places where the current thread's + * (cpu_dispthread's) priority changes. This is used in disp_lowpri_cpu() + */ + pri_t cpu_dispatch_pri; /* priority of cpu_dispthread */ + clock_t cpu_last_swtch; /* last time switched to new thread */ + + /* + * Interrupt data. + */ + caddr_t cpu_intr_stack; /* interrupt stack */ + kthread_t *cpu_intr_thread; /* interrupt thread list */ + uint_t cpu_intr_actv; /* interrupt levels active (bitmask) */ + int cpu_base_spl; /* priority for highest rupt active */ + + /* + * Statistics. + */ + cpu_stats_t cpu_stats; /* per-CPU statistics */ + struct kstat *cpu_info_kstat; /* kstat for cpu info */ + + uintptr_t cpu_profile_pc; /* kernel PC in profile interrupt */ + uintptr_t cpu_profile_upc; /* user PC in profile interrupt */ + uintptr_t cpu_profile_pil; /* PIL when profile interrupted */ + + ftrace_data_t cpu_ftrace; /* per cpu ftrace data */ + + clock_t cpu_deadman_counter; /* used by deadman() */ + uint_t cpu_deadman_countdown; /* used by deadman() */ + + kmutex_t cpu_cpc_ctxlock; /* protects context for idle thread */ + kcpc_ctx_t *cpu_cpc_ctx; /* performance counter context */ + + /* + * Configuration information for the processor_info system call.
+ */ + processor_info_t cpu_type_info; /* config info */ + time_t cpu_state_begin; /* when CPU entered current state */ + char cpu_cpr_flags; /* CPR related info */ + struct cyc_cpu *cpu_cyclic; /* per cpu cyclic subsystem data */ + struct squeue_set_s *cpu_squeue_set; /* per cpu squeue set */ + struct nvlist *cpu_props; /* pool-related properties */ + + krwlock_t cpu_ft_lock; /* DTrace: fasttrap lock */ + uintptr_t cpu_dtrace_caller; /* DTrace: caller, if any */ + hrtime_t cpu_dtrace_chillmark; /* DTrace: chill mark time */ + hrtime_t cpu_dtrace_chilled; /* DTrace: total chill time */ + volatile uint16_t cpu_mstate; /* cpu microstate */ + volatile uint16_t cpu_mstate_gen; /* generation counter */ + volatile hrtime_t cpu_mstate_start; /* cpu microstate start time */ + volatile hrtime_t cpu_acct[NCMSTATES]; /* cpu microstate data */ + hrtime_t cpu_intracct[NCMSTATES]; /* interrupt mstate data */ + hrtime_t cpu_waitrq; /* cpu run-queue wait time */ + struct loadavg_s cpu_loadavg; /* loadavg info for this cpu */ + + char *cpu_idstr; /* for printing and debugging */ + char *cpu_brandstr; /* for printing */ + + /* + * Sum of all device interrupt weights that are currently directed at + * this cpu. Cleared at start of interrupt redistribution. + */ + int32_t cpu_intr_weight; + void *cpu_vm_data; + + struct cpu_physid *cpu_physid; /* physical associations */ + + uint64_t cpu_curr_clock; /* current clock freq in Hz */ + char *cpu_supp_freqs; /* supported freqs in Hz */ + + uintptr_t cpu_cpcprofile_pc; /* kernel PC in cpc interrupt */ + uintptr_t cpu_cpcprofile_upc; /* user PC in cpc interrupt */ + + /* + * Interrupt load factor used by dispatcher & softcall + */ + hrtime_t cpu_intrlast; /* total interrupt time (nsec) */ + int cpu_intrload; /* interrupt load factor (0-99%) */ + + uint_t cpu_rotor; /* for cheap pseudo-random numbers */ + + struct cu_cpu_info *cpu_cu_info; /* capacity & util. info */ + + /* + * cpu_generation is updated whenever CPU goes on-line or off-line. + * Updates to cpu_generation are protected by cpu_lock. + * + * See CPU_NEW_GENERATION() macro below. + */ + volatile uint_t cpu_generation; /* tracking on/off-line */ + + /* + * New members must be added /before/ this member, as the CTF tools + * rely on this being the last field before cpu_m, so they can + * correctly calculate the offset when synthetically adding the cpu_m + * member in objects that do not have it. This fixup is required for + * uniquification to work correctly. + */ + uintptr_t cpu_m_pad; + +#if (defined(_KERNEL) || defined(_KMEMUSER)) && defined(_MACHDEP) + struct machcpu cpu_m; /* per architecture info */ +#endif +} cpu_t; + +/* + * The cpu_core structure consists of per-CPU state available in any context. + * On some architectures, this may mean that the page(s) containing the + * NCPU-sized array of cpu_core structures must be locked in the TLB -- it + * is up to the platform to assure that this is performed properly. Note that + * the structure is sized to avoid false sharing. 
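+ * (As a worked example, assuming an LP64 kernel with an 8-byte kmutex_t:
+ * CPUC_SIZE is 2 + 1 + 8 + 8 == 19, so cpuc_pad[] adds 45 bytes to fill
+ * the 64-byte CPU_CACHE_COHERENCE_SIZE line.)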
+ */ +#define CPUC_SIZE (sizeof (uint16_t) + sizeof (uint8_t) + \ + sizeof (uintptr_t) + sizeof (kmutex_t)) +#define CPUC_PADSIZE CPU_CACHE_COHERENCE_SIZE - CPUC_SIZE + +typedef struct cpu_core { + uint16_t cpuc_dtrace_flags; /* DTrace flags */ + uint8_t cpuc_dcpc_intr_state; /* DCPC provider intr state */ + uint8_t cpuc_pad[CPUC_PADSIZE]; /* padding */ + uintptr_t cpuc_dtrace_illval; /* DTrace illegal value */ + kmutex_t cpuc_pid_lock; /* DTrace pid provider lock */ +} cpu_core_t; + +#ifdef _KERNEL +extern cpu_core_t cpu_core[]; +#endif /* _KERNEL */ + +/* + * CPU_ON_INTR() macro. Returns non-zero if currently on interrupt stack. + * Note that this isn't a test for a high PIL. For example, cpu_intr_actv + * does not get updated when we go through sys_trap from TL>0 at high PIL. + * getpil() should be used instead to check for PIL levels. + */ +#define CPU_ON_INTR(cpup) ((cpup)->cpu_intr_actv >> (LOCK_LEVEL + 1)) + +/* + * Check to see if an interrupt thread might be active at a given ipl. + * If so return true. + * We must be conservative--it is ok to give a false yes, but a false no + * will cause disaster. (But if the situation changes after we check it is + * ok--the caller is trying to ensure that an interrupt routine has been + * exited). + * This is used when trying to remove an interrupt handler from an autovector + * list in avintr.c. + */ +#define INTR_ACTIVE(cpup, level) \ + ((level) <= LOCK_LEVEL ? \ + ((cpup)->cpu_intr_actv & (1 << (level))) : (CPU_ON_INTR(cpup))) + +/* + * CPU_PSEUDO_RANDOM() returns a per CPU value that changes each time one + * looks at it. It's meant as a cheap mechanism to be incorporated in routines + * wanting to avoid biasing, but where true randomness isn't needed (just + * something that changes). + */ +#define CPU_PSEUDO_RANDOM() (CPU->cpu_rotor++) + +#if defined(_KERNEL) || defined(_KMEMUSER) + +#define INTR_STACK_SIZE MAX(DEFAULTSTKSZ, PAGESIZE) + +/* MEMBERS PROTECTED BY "atomicity": cpu_flags */ + +/* + * Flags in the CPU structure. + * + * These are protected by cpu_lock (except during creation). + * + * Offlined-CPUs have three stages of being offline: + * + * CPU_ENABLE indicates that the CPU is participating in I/O interrupts + * that can be directed at a number of different CPUs. If CPU_ENABLE + * is off, the CPU will not be given interrupts that can be sent elsewhere, + * but will still get interrupts from devices associated with that CPU only, + * and from other CPUs. + * + * CPU_OFFLINE indicates that the dispatcher should not allow any threads + * other than interrupt threads to run on that CPU. A CPU will not have + * CPU_OFFLINE set if there are any bound threads (besides interrupts). + * + * CPU_QUIESCED is set if p_offline was able to completely turn idle the + * CPU and it will not have to run interrupt threads. In this case it'll + * stay in the idle loop until CPU_QUIESCED is turned off. + * + * CPU_FROZEN is used only by CPR to mark CPUs that have been successfully + * suspended (in the suspend path), or have yet to be resumed (in the resume + * case). + * + * On some platforms CPUs can be individually powered off. + * The following flags are set for powered off CPUs: CPU_QUIESCED, + * CPU_OFFLINE, and CPU_POWEROFF. The following flags are cleared: + * CPU_RUNNING, CPU_READY, CPU_EXISTS, and CPU_ENABLE. 
+ */ +#define CPU_RUNNING 0x001 /* CPU running */ +#define CPU_READY 0x002 /* CPU ready for cross-calls */ +#define CPU_QUIESCED 0x004 /* CPU will stay in idle */ +#define CPU_EXISTS 0x008 /* CPU is configured */ +#define CPU_ENABLE 0x010 /* CPU enabled for interrupts */ +#define CPU_OFFLINE 0x020 /* CPU offline via p_online */ +#define CPU_POWEROFF 0x040 /* CPU is powered off */ +#define CPU_FROZEN 0x080 /* CPU is frozen via CPR suspend */ +#define CPU_SPARE 0x100 /* CPU offline available for use */ +#define CPU_FAULTED 0x200 /* CPU offline diagnosed faulty */ + +#define FMT_CPU_FLAGS \ + "\20\12fault\11spare\10frozen" \ + "\7poweroff\6offline\5enable\4exist\3quiesced\2ready\1run" + +#define CPU_ACTIVE(cpu) (((cpu)->cpu_flags & CPU_OFFLINE) == 0) + +/* + * Flags for cpu_offline(), cpu_faulted(), and cpu_spare(). + */ +#define CPU_FORCED 0x0001 /* Force CPU offline */ + +/* + * DTrace flags. + */ +#define CPU_DTRACE_NOFAULT 0x0001 /* Don't fault */ +#define CPU_DTRACE_DROP 0x0002 /* Drop this ECB */ +#define CPU_DTRACE_BADADDR 0x0004 /* DTrace fault: bad address */ +#define CPU_DTRACE_BADALIGN 0x0008 /* DTrace fault: bad alignment */ +#define CPU_DTRACE_DIVZERO 0x0010 /* DTrace fault: divide by zero */ +#define CPU_DTRACE_ILLOP 0x0020 /* DTrace fault: illegal operation */ +#define CPU_DTRACE_NOSCRATCH 0x0040 /* DTrace fault: out of scratch */ +#define CPU_DTRACE_KPRIV 0x0080 /* DTrace fault: bad kernel access */ +#define CPU_DTRACE_UPRIV 0x0100 /* DTrace fault: bad user access */ +#define CPU_DTRACE_TUPOFLOW 0x0200 /* DTrace fault: tuple stack overflow */ +#if defined(__sparc) +#define CPU_DTRACE_FAKERESTORE 0x0400 /* pid provider hint to getreg */ +#endif +#define CPU_DTRACE_ENTRY 0x0800 /* pid provider hint to ustack() */ +#define CPU_DTRACE_BADSTACK 0x1000 /* DTrace fault: bad stack */ + +#define CPU_DTRACE_FAULT (CPU_DTRACE_BADADDR | CPU_DTRACE_BADALIGN | \ + CPU_DTRACE_DIVZERO | CPU_DTRACE_ILLOP | \ + CPU_DTRACE_NOSCRATCH | CPU_DTRACE_KPRIV | \ + CPU_DTRACE_UPRIV | CPU_DTRACE_TUPOFLOW | \ + CPU_DTRACE_BADSTACK) +#define CPU_DTRACE_ERROR (CPU_DTRACE_FAULT | CPU_DTRACE_DROP) + +/* + * Dispatcher flags + * These flags must be changed only by the current CPU. + */ +#define CPU_DISP_DONTSTEAL 0x01 /* CPU undergoing context swtch */ +#define CPU_DISP_HALTED 0x02 /* CPU halted waiting for interrupt */ + +#endif /* _KERNEL || _KMEMUSER */ + +#if (defined(_KERNEL) || defined(_KMEMUSER)) && defined(_MACHDEP) + +/* + * Macros for manipulating sets of CPUs as a bitmap. Note that this + * bitmap may vary in size depending on the maximum CPU id a specific + * platform supports. This may be different than the number of CPUs + * the platform supports, since CPU ids can be sparse. We define two + * sets of macros; one for platforms where the maximum CPU id is less + * than the number of bits in a single word (32 in a 32-bit kernel, + * 64 in a 64-bit kernel), and one for platforms that require bitmaps + * of more than one word. + */ + +#define CPUSET_WORDS BT_BITOUL(NCPU) +#define CPUSET_NOTINSET ((uint_t)-1) + +#if CPUSET_WORDS > 1 + +typedef struct cpuset { + ulong_t cpub[CPUSET_WORDS]; +} cpuset_t; + +/* + * Private functions for manipulating cpusets that do not fit in a + * single word. These should not be used directly; instead the + * CPUSET_* macros should be used so the code will be portable + * across different definitions of NCPU. 
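+ *
+ * For example (an illustrative sketch), code written purely in terms of
+ * the CPUSET_* macros builds unchanged with either representation:
+ *
+ *	cpuset_t set;
+ *	uint_t c;
+ *
+ *	CPUSET_ZERO(set);
+ *	CPUSET_ADD(set, CPU->cpu_id);
+ *	CPUSET_FIND(set, c);	(here c becomes CPU->cpu_id)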
+ */ +extern void cpuset_all(cpuset_t *); +extern void cpuset_all_but(cpuset_t *, uint_t); +extern int cpuset_isnull(cpuset_t *); +extern int cpuset_cmp(cpuset_t *, cpuset_t *); +extern void cpuset_only(cpuset_t *, uint_t); +extern uint_t cpuset_find(cpuset_t *); +extern void cpuset_bounds(cpuset_t *, uint_t *, uint_t *); + +#define CPUSET_ALL(set) cpuset_all(&(set)) +#define CPUSET_ALL_BUT(set, cpu) cpuset_all_but(&(set), cpu) +#define CPUSET_ONLY(set, cpu) cpuset_only(&(set), cpu) +#define CPU_IN_SET(set, cpu) BT_TEST((set).cpub, cpu) +#define CPUSET_ADD(set, cpu) BT_SET((set).cpub, cpu) +#define CPUSET_DEL(set, cpu) BT_CLEAR((set).cpub, cpu) +#define CPUSET_ISNULL(set) cpuset_isnull(&(set)) +#define CPUSET_ISEQUAL(set1, set2) cpuset_cmp(&(set1), &(set2)) + +/* + * Find one CPU in the cpuset. + * Sets "cpu" to the id of the found CPU, or CPUSET_NOTINSET if no cpu + * could be found. (i.e. empty set) + */ +#define CPUSET_FIND(set, cpu) { \ + cpu = cpuset_find(&(set)); \ +} + +/* + * Determine the smallest and largest CPU id in the set. Returns + * CPUSET_NOTINSET in smallest and largest when set is empty. + */ +#define CPUSET_BOUNDS(set, smallest, largest) { \ + cpuset_bounds(&(set), &(smallest), &(largest)); \ +} + +/* + * Atomic cpuset operations + * These are safe to use for concurrent cpuset manipulations. + * "xdel" and "xadd" are exclusive operations, that set "result" to "0" + * if the add or del was successful, or "-1" if not successful. + * (e.g. attempting to add a cpu to a cpuset that's already there, or + * deleting a cpu that's not in the cpuset) + */ + +#define CPUSET_ATOMIC_DEL(set, cpu) BT_ATOMIC_CLEAR((set).cpub, (cpu)) +#define CPUSET_ATOMIC_ADD(set, cpu) BT_ATOMIC_SET((set).cpub, (cpu)) + +#define CPUSET_ATOMIC_XADD(set, cpu, result) \ + BT_ATOMIC_SET_EXCL((set).cpub, cpu, result) + +#define CPUSET_ATOMIC_XDEL(set, cpu, result) \ + BT_ATOMIC_CLEAR_EXCL((set).cpub, cpu, result) + + +#define CPUSET_OR(set1, set2) { \ + int _i; \ + for (_i = 0; _i < CPUSET_WORDS; _i++) \ + (set1).cpub[_i] |= (set2).cpub[_i]; \ +} + +#define CPUSET_XOR(set1, set2) { \ + int _i; \ + for (_i = 0; _i < CPUSET_WORDS; _i++) \ + (set1).cpub[_i] ^= (set2).cpub[_i]; \ +} + +#define CPUSET_AND(set1, set2) { \ + int _i; \ + for (_i = 0; _i < CPUSET_WORDS; _i++) \ + (set1).cpub[_i] &= (set2).cpub[_i]; \ +} + +#define CPUSET_ZERO(set) { \ + int _i; \ + for (_i = 0; _i < CPUSET_WORDS; _i++) \ + (set).cpub[_i] = 0; \ +} + +#elif CPUSET_WORDS == 1 + +typedef ulong_t cpuset_t; /* a set of CPUs */ + +#define CPUSET(cpu) (1UL << (cpu)) + +#define CPUSET_ALL(set) ((void)((set) = ~0UL)) +#define CPUSET_ALL_BUT(set, cpu) ((void)((set) = ~CPUSET(cpu))) +#define CPUSET_ONLY(set, cpu) ((void)((set) = CPUSET(cpu))) +#define CPU_IN_SET(set, cpu) ((set) & CPUSET(cpu)) +#define CPUSET_ADD(set, cpu) ((void)((set) |= CPUSET(cpu))) +#define CPUSET_DEL(set, cpu) ((void)((set) &= ~CPUSET(cpu))) +#define CPUSET_ISNULL(set) ((set) == 0) +#define CPUSET_ISEQUAL(set1, set2) ((set1) == (set2)) +#define CPUSET_OR(set1, set2) ((void)((set1) |= (set2))) +#define CPUSET_XOR(set1, set2) ((void)((set1) ^= (set2))) +#define CPUSET_AND(set1, set2) ((void)((set1) &= (set2))) +#define CPUSET_ZERO(set) ((void)((set) = 0)) + +#define CPUSET_FIND(set, cpu) { \ + cpu = (uint_t)(lowbit(set) - 1); \ +} + +#define CPUSET_BOUNDS(set, smallest, largest) { \ + smallest = (uint_t)(lowbit(set) - 1); \ + largest = (uint_t)(highbit(set) - 1); \ +} + +#define CPUSET_ATOMIC_DEL(set, cpu) atomic_and_ulong(&(set), ~CPUSET(cpu)) +#define 
CPUSET_ATOMIC_ADD(set, cpu) atomic_or_ulong(&(set), CPUSET(cpu)) + +#define CPUSET_ATOMIC_XADD(set, cpu, result) \ + { result = atomic_set_long_excl(&(set), (cpu)); } + +#define CPUSET_ATOMIC_XDEL(set, cpu, result) \ + { result = atomic_clear_long_excl(&(set), (cpu)); } + +#else /* CPUSET_WORDS <= 0 */ + +#error NCPU is undefined or invalid + +#endif /* CPUSET_WORDS */ + +extern cpuset_t cpu_seqid_inuse; + +#endif /* (_KERNEL || _KMEMUSER) && _MACHDEP */ + +#define CPU_CPR_OFFLINE 0x0 +#define CPU_CPR_ONLINE 0x1 +#define CPU_CPR_IS_OFFLINE(cpu) (((cpu)->cpu_cpr_flags & CPU_CPR_ONLINE) == 0) +#define CPU_CPR_IS_ONLINE(cpu) ((cpu)->cpu_cpr_flags & CPU_CPR_ONLINE) +#define CPU_SET_CPR_FLAGS(cpu, flag) ((cpu)->cpu_cpr_flags |= flag) + +#if defined(_KERNEL) || defined(_KMEMUSER) + +extern struct cpu *cpu[]; /* indexed by CPU number */ +extern struct cpu **cpu_seq; /* indexed by sequential CPU id */ +extern cpu_t *cpu_list; /* list of CPUs */ +extern cpu_t *cpu_active; /* list of active CPUs */ +extern int ncpus; /* number of CPUs present */ +extern int ncpus_online; /* number of CPUs not quiesced */ +extern int max_ncpus; /* max present before ncpus is known */ +extern int boot_max_ncpus; /* like max_ncpus but for real */ +extern int boot_ncpus; /* # cpus present @ boot */ +extern processorid_t max_cpuid; /* maximum CPU number */ +extern struct cpu *cpu_inmotion; /* offline or partition move target */ +extern cpu_t *clock_cpu_list; +extern processorid_t max_cpu_seqid_ever; /* maximum seqid ever given */ + +#if defined(__i386) || defined(__amd64) +extern struct cpu *curcpup(void); +#define CPU (curcpup()) /* Pointer to current CPU */ +#else +#define CPU (curthread->t_cpu) /* Pointer to current CPU */ +#endif + +/* + * CPU_CURRENT indicates to thread_affinity_set to use CPU->cpu_id + * as the target and to grab cpu_lock instead of requiring the caller + * to grab it. + */ +#define CPU_CURRENT -3 + +/* + * Per-CPU statistics + * + * cpu_stats_t contains numerous system and VM-related statistics, in the form + * of gauges or monotonically-increasing event occurrence counts. + */ + +#define CPU_STATS_ENTER_K() kpreempt_disable() +#define CPU_STATS_EXIT_K() kpreempt_enable() + +#define CPU_STATS_ADD_K(class, stat, amount) \ + { kpreempt_disable(); /* keep from switching CPUs */\ + CPU_STATS_ADDQ(CPU, class, stat, amount); \ + kpreempt_enable(); \ + } + +#define CPU_STATS_ADDQ(cp, class, stat, amount) { \ + extern void __dtrace_probe___cpu_##class##info_##stat(uint_t, \ + uint64_t *, cpu_t *); \ + uint64_t *stataddr = &((cp)->cpu_stats.class.stat); \ + __dtrace_probe___cpu_##class##info_##stat((amount), \ + stataddr, cp); \ + *(stataddr) += (amount); \ +} + +#define CPU_STATS(cp, stat) \ + ((cp)->cpu_stats.stat) + +/* + * Increment CPU generation value. + * This macro should be called whenever CPU goes on-line or off-line. + * Updates to cpu_generation should be protected by cpu_lock. 
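+ *
+ * An illustrative call pattern (our sketch; the actual state-transition
+ * step is elided):
+ *
+ *	mutex_enter(&cpu_lock);
+ *	... bring the CPU on-line or off-line ...
+ *	CPU_NEW_GENERATION(cp);
+ *	mutex_exit(&cpu_lock);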
+ */ +#define CPU_NEW_GENERATION(cp) ((cp)->cpu_generation++) + +#endif /* _KERNEL || _KMEMUSER */ + +/* + * CPU support routines (not for genassym.c) + */ +#if (defined(_KERNEL) || defined(_FAKE_KERNEL)) && defined(__STDC__) + +struct zone; + +void cpu_list_init(cpu_t *); +void cpu_add_unit(cpu_t *); +void cpu_del_unit(int cpuid); +void cpu_add_active(cpu_t *); +void cpu_kstat_init(cpu_t *); +void cpu_visibility_add(cpu_t *, struct zone *); +void cpu_visibility_remove(cpu_t *, struct zone *); +void cpu_visibility_configure(cpu_t *, struct zone *); +void cpu_visibility_unconfigure(cpu_t *, struct zone *); +void cpu_visibility_online(cpu_t *, struct zone *); +void cpu_visibility_offline(cpu_t *, struct zone *); +void cpu_create_intrstat(cpu_t *); +void cpu_delete_intrstat(cpu_t *); +int cpu_kstat_intrstat_update(kstat_t *, int); +void cpu_intr_swtch_enter(kthread_t *); +void cpu_intr_swtch_exit(kthread_t *); + +void mbox_lock_init(void); /* initialize cross-call locks */ +void mbox_init(int cpun); /* initialize cross-calls */ +void poke_cpu(int cpun); /* interrupt another CPU (to preempt) */ + +/* + * values for safe_list. Pause state that CPUs are in. + */ +#define PAUSE_IDLE 0 /* normal state */ +#define PAUSE_READY 1 /* paused thread ready to spl */ +#define PAUSE_WAIT 2 /* paused thread is spl-ed high */ +#define PAUSE_DIE 3 /* tell pause thread to leave */ +#define PAUSE_DEAD 4 /* pause thread has left */ + +void mach_cpu_pause(volatile char *); + +void pause_cpus(cpu_t *off_cp, void *(*func)(void *)); +void start_cpus(void); +int cpus_paused(void); + +void cpu_pause_init(void); +cpu_t *cpu_get(processorid_t cpun); /* get the CPU struct associated */ + +int cpu_online(cpu_t *cp); /* take cpu online */ +int cpu_offline(cpu_t *cp, int flags); /* take cpu offline */ +int cpu_spare(cpu_t *cp, int flags); /* take cpu to spare */ +int cpu_faulted(cpu_t *cp, int flags); /* take cpu to faulted */ +int cpu_poweron(cpu_t *cp); /* take powered-off cpu to offline */ +int cpu_poweroff(cpu_t *cp); /* take offline cpu to powered-off */ + +cpu_t *cpu_intr_next(cpu_t *cp); /* get next online CPU taking intrs */ +int cpu_intr_count(cpu_t *cp); /* count # of CPUs handling intrs */ +int cpu_intr_on(cpu_t *cp); /* CPU taking I/O interrupts? */ +void cpu_intr_enable(cpu_t *cp); /* enable I/O interrupts */ +int cpu_intr_disable(cpu_t *cp); /* disable I/O interrupts */ +void cpu_intr_alloc(cpu_t *cp, int n); /* allocate interrupt threads */ + +/* + * Routines for checking CPU states. + */ +int cpu_is_online(cpu_t *); /* check if CPU is online */ +int cpu_is_nointr(cpu_t *); /* check if CPU can service intrs */ +int cpu_is_active(cpu_t *); /* check if CPU can run threads */ +int cpu_is_offline(cpu_t *); /* check if CPU is offline */ +int cpu_is_poweredoff(cpu_t *); /* check if CPU is powered off */ + +int cpu_flagged_online(cpu_flag_t); /* flags show CPU is online */ +int cpu_flagged_nointr(cpu_flag_t); /* flags show CPU not handling intrs */ +int cpu_flagged_active(cpu_flag_t); /* flags show CPU scheduling threads */ +int cpu_flagged_offline(cpu_flag_t); /* flags show CPU is offline */ +int cpu_flagged_poweredoff(cpu_flag_t); /* flags show CPU is powered off */ + +/* + * The processor_info(2) state of a CPU is a simplified representation suitable + * for use by an application program. 
Kernel subsystems should utilize the
+ * internal per-CPU state as given by the cpu_flags member of the cpu structure,
+ * as this information may include platform- or architecture-specific state
+ * critical to a subsystem's disposition of a particular CPU.
+ */
+void	cpu_set_state(cpu_t *);		/* record/timestamp current state */
+int	cpu_get_state(cpu_t *);		/* get current cpu state */
+const char *cpu_get_state_str(cpu_t *);	/* get current cpu state as string */
+
+
+void	cpu_set_curr_clock(uint64_t);	/* indicate the current CPU's freq */
+void	cpu_set_supp_freqs(cpu_t *, const char *); /* set the CPU supported */
+						/* frequencies */
+
+int	cpu_configure(int);
+int	cpu_unconfigure(int);
+void	cpu_destroy_bound_threads(cpu_t *cp);
+
+extern int cpu_bind_thread(kthread_t *tp, processorid_t bind,
+    processorid_t *obind, int *error);
+extern int cpu_unbind(processorid_t cpu_id, boolean_t force);
+extern void thread_affinity_set(kthread_t *t, int cpu_id);
+extern void thread_affinity_clear(kthread_t *t);
+extern void affinity_set(int cpu_id);
+extern void affinity_clear(void);
+extern void init_cpu_mstate(struct cpu *, int);
+extern void term_cpu_mstate(struct cpu *);
+extern void new_cpu_mstate(int, hrtime_t);
+extern void get_cpu_mstate(struct cpu *, hrtime_t *);
+extern void thread_nomigrate(void);
+extern void thread_allowmigrate(void);
+extern void weakbinding_stop(void);
+extern void weakbinding_start(void);
+
+/*
+ * The following routines affect a CPU's participation in interrupt processing,
+ * if that is applicable on the architecture.  This only affects interrupts
+ * which aren't directed at the processor (not cross calls).
+ *
+ * cpu_disable_intr returns non-zero if interrupts were previously enabled.
+ */
+int	cpu_disable_intr(struct cpu *cp);	/* stop issuing interrupts to cpu */
+void	cpu_enable_intr(struct cpu *cp);	/* start issuing interrupts to cpu */
+
+/*
+ * The mutex cpu_lock protects cpu_flags for all CPUs, as well as the ncpus
+ * and ncpus_online counts.
+ */
+extern kmutex_t cpu_lock;	/* lock protecting CPU data */
+
+/*
+ * CPU state change events
+ *
+ * Various subsystems need to know when CPUs change their state.  They get this
+ * information by registering CPU state change callbacks using
+ * register_cpu_setup_func().  Whenever any CPU changes its state, the callback
+ * function is called.  The callback function is passed three arguments:
+ *
+ *	Event, described by cpu_setup_t
+ *	CPU ID
+ *	Transparent pointer passed when registering the callback
+ *
+ * The callback function is called with cpu_lock held.  The return value from
+ * the callback function is usually ignored, except for CPU_CONFIG and
+ * CPU_UNCONFIG events.  For these two events, a non-zero return value indicates
+ * a failure and prevents successful completion of the operation.
+ *
+ * New events may be added in the future.  Callback functions should ignore any
+ * events that they do not understand.
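+ *
+ * As a minimal illustrative sketch only (the callback body and helper names
+ * here are hypothetical, and registering with cpu_lock held is our
+ * assumption, not a statement made by this header):
+ *
+ *	static int
+ *	xx_cpu_setup(cpu_setup_t what, int cpuid, void *arg)
+ *	{
+ *		switch (what) {
+ *		case CPU_CONFIG:
+ *			return (xx_prepare(cpuid));	(non-zero fails DR)
+ *		case CPU_UNCONFIG:
+ *			xx_teardown(cpuid);
+ *			break;
+ *		default:
+ *			break;			(ignore unknown events)
+ *		}
+ *		return (0);
+ *	}
+ *
+ *	mutex_enter(&cpu_lock);
+ *	register_cpu_setup_func(xx_cpu_setup, NULL);
+ *	mutex_exit(&cpu_lock);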
+ *
+ * The following events provide notification callbacks:
+ *
+ *	CPU_INIT	A new CPU is started and added to the list of active
+ *			CPUs.  This event is only used during boot.
+ *
+ *	CPU_CONFIG	A newly inserted CPU is prepared to start running
+ *			code.  This event is called by DR code.
+ *
+ *	CPU_UNCONFIG	CPU has been powered off and needs cleanup.  This
+ *			event is called by DR code.
+ *
+ *	CPU_ON		CPU is enabled but does not run anything yet.
+ *
+ *	CPU_INTR_ON	CPU is enabled and has interrupts enabled.
+ *
+ *	CPU_OFF		CPU is going offline but can still run threads.
+ *
+ *	CPU_CPUPART_OUT	CPU is going to move out of its partition.
+ *
+ *	CPU_CPUPART_IN	CPU is going to move to a new partition.
+ *
+ *	CPU_SETUP	CPU is set up during boot and can run threads.
+ */
+typedef enum {
+	CPU_INIT,
+	CPU_CONFIG,
+	CPU_UNCONFIG,
+	CPU_ON,
+	CPU_OFF,
+	CPU_CPUPART_IN,
+	CPU_CPUPART_OUT,
+	CPU_SETUP,
+	CPU_INTR_ON
+} cpu_setup_t;
+
+typedef int cpu_setup_func_t(cpu_setup_t, int, void *);
+
+/*
+ * Routines used to register interest in CPUs being added to or removed
+ * from the system.
+ */
+extern void register_cpu_setup_func(cpu_setup_func_t *, void *);
+extern void unregister_cpu_setup_func(cpu_setup_func_t *, void *);
+extern void cpu_state_change_notify(int, cpu_setup_t);
+
+/*
+ * Call the specified function on the given CPU.
+ */
+typedef void (*cpu_call_func_t)(uintptr_t, uintptr_t);
+extern void cpu_call(cpu_t *, cpu_call_func_t, uintptr_t, uintptr_t);
+
+
+/*
+ * Create various strings that describe the given CPU for the
+ * processor_info system call and configuration-related kstats.
+ */
+#define	CPU_IDSTRLEN	100
+
+extern void init_cpu_info(struct cpu *);
+extern void populate_idstr(struct cpu *);
+extern void cpu_vm_data_init(struct cpu *);
+extern void cpu_vm_data_destroy(struct cpu *);
+
+#endif	/* _KERNEL || _FAKE_KERNEL */
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif /* _SYS_CPUVAR_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/cred.h b/sys/cddl/contrib/opensolaris/uts/common/sys/cred.h
new file mode 100644
index 000000000000..5056f9a51105
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/cred.h
@@ -0,0 +1,193 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
+/*	  All Rights Reserved	*/
+
+/*
+ * Portions of this source code were derived from Berkeley 4.3 BSD
+ * under license from the Regents of the University of California.
+ */ + +#ifndef _SYS_CRED_H +#define _SYS_CRED_H + +#include <sys/types.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * The credential is an opaque kernel private data structure defined in + * <sys/cred_impl.h>. + */ + +typedef struct cred cred_t; + +#ifdef _KERNEL + +#define CRED() curthread->t_cred + +struct proc; /* cred.h is included in proc.h */ +struct prcred; +struct ksid; +struct ksidlist; +struct credklpd; +struct credgrp; + +struct auditinfo_addr; /* cred.h is included in audit.h */ + +extern int ngroups_max; +/* + * kcred is used when you need all privileges. + */ +extern struct cred *kcred; + +extern void cred_init(void); +extern void crhold(cred_t *); +extern void crfree(cred_t *); +extern cred_t *cralloc(void); /* all but ref uninitialized */ +extern cred_t *cralloc_ksid(void); /* cralloc() + ksid alloc'ed */ +extern cred_t *crget(void); /* initialized */ +extern cred_t *crcopy(cred_t *); +extern void crcopy_to(cred_t *, cred_t *); +extern cred_t *crdup(cred_t *); +extern void crdup_to(cred_t *, cred_t *); +extern cred_t *crgetcred(void); +extern void crset(struct proc *, cred_t *); +extern void crset_zone_privall(cred_t *); +extern int groupmember(gid_t, const cred_t *); +extern int supgroupmember(gid_t, const cred_t *); +extern int hasprocperm(const cred_t *, const cred_t *); +extern int prochasprocperm(struct proc *, struct proc *, const cred_t *); +extern int crcmp(const cred_t *, const cred_t *); +extern cred_t *zone_kcred(void); + +extern uid_t crgetuid(const cred_t *); +extern uid_t crgetruid(const cred_t *); +extern uid_t crgetsuid(const cred_t *); +extern gid_t crgetgid(const cred_t *); +extern gid_t crgetrgid(const cred_t *); +extern gid_t crgetsgid(const cred_t *); +extern zoneid_t crgetzoneid(const cred_t *); +extern projid_t crgetprojid(const cred_t *); + +extern cred_t *crgetmapped(const cred_t *); + + +extern const struct auditinfo_addr *crgetauinfo(const cred_t *); +extern struct auditinfo_addr *crgetauinfo_modifiable(cred_t *); + +extern uint_t crgetref(const cred_t *); + +extern const gid_t *crgetgroups(const cred_t *); +extern const gid_t *crgetggroups(const struct credgrp *); + +extern int crgetngroups(const cred_t *); + +/* + * Sets real, effective and/or saved uid/gid; + * -1 argument accepted as "no change". + */ +extern int crsetresuid(cred_t *, uid_t, uid_t, uid_t); +extern int crsetresgid(cred_t *, gid_t, gid_t, gid_t); + +/* + * Sets real, effective and saved uids/gids all to the same + * values. Both values must be non-negative and <= MAXUID + */ +extern int crsetugid(cred_t *, uid_t, gid_t); + +/* + * Functions to handle the supplemental group list. + */ +extern int crsetgroups(cred_t *, int, gid_t *); +extern struct credgrp *crgrpcopyin(int, gid_t *); +extern void crgrprele(struct credgrp *); +extern void crsetcredgrp(cred_t *, struct credgrp *); + +/* + * Private interface for setting zone association of credential. + */ +struct zone; +extern void crsetzone(cred_t *, struct zone *); +extern struct zone *crgetzone(const cred_t *); + +/* + * Private interface for setting project id in credential. + */ +extern void crsetprojid(cred_t *, projid_t); + +/* + * Private interface for nfs. + */ +extern cred_t *crnetadjust(cred_t *); + +/* + * Private interface for procfs. + */ +extern void cred2prcred(const cred_t *, struct prcred *); + +/* + * Private interfaces for Rampart Trusted Solaris. 
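+ *
+ * (Illustrative sketch, not part of the original header) a label-aware
+ * consumer might combine the declarations that follow with the CRED()
+ * accessor above:
+ *
+ *	struct ts_label_s *lbl = crgetlabel(CRED());
+ *
+ *	if (lbl != NULL && crisremote(CRED()))
+ *		... apply remote-label policy here ...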
+ */ +struct ts_label_s; +extern struct ts_label_s *crgetlabel(const cred_t *); +extern boolean_t crisremote(const cred_t *); + +/* + * Private interfaces for ephemeral uids. + */ +#define VALID_UID(id, zn) \ + ((id) <= MAXUID || valid_ephemeral_uid((zn), (id))) + +#define VALID_GID(id, zn) \ + ((id) <= MAXUID || valid_ephemeral_gid((zn), (id))) + +extern boolean_t valid_ephemeral_uid(struct zone *, uid_t); +extern boolean_t valid_ephemeral_gid(struct zone *, gid_t); + +extern int eph_uid_alloc(struct zone *, int, uid_t *, int); +extern int eph_gid_alloc(struct zone *, int, gid_t *, int); + +extern void crsetsid(cred_t *, struct ksid *, int); +extern void crsetsidlist(cred_t *, struct ksidlist *); + +extern struct ksid *crgetsid(const cred_t *, int); +extern struct ksidlist *crgetsidlist(const cred_t *); + +extern int crsetpriv(cred_t *, ...); + +extern struct credklpd *crgetcrklpd(const cred_t *); +extern void crsetcrklpd(cred_t *, struct credklpd *); + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_CRED_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/ctf.h b/sys/cddl/contrib/opensolaris/uts/common/sys/ctf.h new file mode 100644 index 000000000000..528814118e1c --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/sys/ctf.h @@ -0,0 +1,360 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _CTF_H +#define _CTF_H + +#ifdef illumos +#pragma ident "%Z%%M% %I% %E% SMI" +#endif + +#include <sys/types.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * CTF - Compact ANSI-C Type Format + * + * This file format can be used to compactly represent the information needed + * by a debugger to interpret the ANSI-C types used by a given program. + * Traditionally, this kind of information is generated by the compiler when + * invoked with the -g flag and is stored in "stabs" strings or in the more + * modern DWARF format. CTF provides a representation of only the information + * that is relevant to debugging a complex, optimized C program such as the + * operating system kernel in a form that is significantly more compact than + * the equivalent stabs or DWARF representation. The format is data-model + * independent, so consumers do not need different code depending on whether + * they are 32-bit or 64-bit programs. CTF assumes that a standard ELF symbol + * table is available for use in the debugger, and uses the structure and data + * of the symbol table to avoid storing redundant information. The CTF data + * may be compressed on disk or in memory, indicated by a bit in the header. 
+ * CTF may be interpreted in a raw disk file, or it may be stored in an ELF
+ * section, typically named .SUNW_ctf.  Data structures are aligned so that
+ * a raw CTF file or CTF ELF section may be manipulated using mmap(2).
+ *
+ * The CTF file or section itself has the following structure:
+ *
+ * +--------+--------+---------+----------+-------+--------+
+ * |  file  |  type  |  data   | function | data  | string |
+ * | header | labels | objects |   info   | types | table  |
+ * +--------+--------+---------+----------+-------+--------+
+ *
+ * The file header stores a magic number and version information, encoding
+ * flags, and the byte offset of each of the sections relative to the end of the
+ * header itself.  If the CTF data has been uniquified against another set of
+ * CTF data, a reference to that data also appears in the header.  This
+ * reference is the name of the label corresponding to the types uniquified
+ * against.
+ *
+ * Following the header is a list of labels, used to group the types included in
+ * the data types section.  Each label is accompanied by a type ID i.  A given
+ * label refers to the group of types whose IDs are in the range [0, i].
+ *
+ * Data object and function records are stored in the same order as they appear
+ * in the corresponding symbol table, except that symbols marked SHN_UNDEF are
+ * not stored and symbols that have no type data are padded out with zeroes.
+ * For each data object, the type ID (a small integer) is recorded.  For each
+ * function, the type ID of the return type and argument types is recorded.
+ *
+ * The data types section is a list of variable-size records that represent each
+ * type, in order by their ID.  The types themselves form a directed graph,
+ * where each node may contain one or more outgoing edges to other type nodes,
+ * denoted by their ID.
+ *
+ * Strings are recorded as a string table ID (0 or 1) and a byte offset into the
+ * string table.  String table 0 is the internal CTF string table.  String table
+ * 1 is the external string table, which is the string table associated with the
+ * ELF symbol table for this object.  CTF does not record any strings that are
+ * already in the symbol table, and the CTF string table does not contain any
+ * duplicated strings.
+ *
+ * If the CTF data has been merged with another parent CTF object, some outgoing
+ * edges may refer to type nodes that exist in another CTF object.  The debugger
+ * and libctf library are responsible for connecting the appropriate objects
+ * together so that the full set of types can be explored and manipulated.
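+ *
+ * As a hedged illustration (ours, not part of the format definition), a
+ * consumer that mmap(2)s a raw CTF file might begin by validating the
+ * preamble declared below, where "base" is the mapping address:
+ *
+ *	const ctf_preamble_t *pp = (const ctf_preamble_t *)base;
+ *
+ *	if (pp->ctp_magic != CTF_MAGIC || pp->ctp_version != CTF_VERSION)
+ *		return (error);			(reject unknown formats)
+ *	if (pp->ctp_flags & CTF_F_COMPRESS)
+ *		... decompress the data that follows the header ...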
+ */ + +#define CTF_MAX_TYPE 0xffff /* max type identifier value */ +#define CTF_MAX_NAME 0x7fffffff /* max offset into a string table */ +#define CTF_MAX_VLEN 0x3ff /* max struct, union, enum members or args */ +#define CTF_MAX_INTOFF 0xff /* max offset of intrinsic value in bits */ +#define CTF_MAX_INTBITS 0xffff /* max size of an intrinsic in bits */ + +/* See ctf_type_t */ +#define CTF_MAX_SIZE 0xfffe /* max size of a type in bytes */ +#define CTF_LSIZE_SENT 0xffff /* sentinel for ctt_size */ +#define CTF_MAX_LSIZE UINT64_MAX + +typedef struct ctf_preamble { + ushort_t ctp_magic; /* magic number (CTF_MAGIC) */ + uchar_t ctp_version; /* data format version number (CTF_VERSION) */ + uchar_t ctp_flags; /* flags (see below) */ +} ctf_preamble_t; + +typedef struct ctf_header { + ctf_preamble_t cth_preamble; + uint_t cth_parlabel; /* ref to name of parent lbl uniq'd against */ + uint_t cth_parname; /* ref to basename of parent */ + uint_t cth_lbloff; /* offset of label section */ + uint_t cth_objtoff; /* offset of object section */ + uint_t cth_funcoff; /* offset of function section */ + uint_t cth_typeoff; /* offset of type section */ + uint_t cth_stroff; /* offset of string section */ + uint_t cth_strlen; /* length of string section in bytes */ +} ctf_header_t; + +#define cth_magic cth_preamble.ctp_magic +#define cth_version cth_preamble.ctp_version +#define cth_flags cth_preamble.ctp_flags + +#ifdef CTF_OLD_VERSIONS + +typedef struct ctf_header_v1 { + ctf_preamble_t cth_preamble; + uint_t cth_objtoff; + uint_t cth_funcoff; + uint_t cth_typeoff; + uint_t cth_stroff; + uint_t cth_strlen; +} ctf_header_v1_t; + +#endif /* CTF_OLD_VERSIONS */ + +#define CTF_MAGIC 0xcff1 /* magic number identifying header */ + +/* data format version number */ +#define CTF_VERSION_1 1 +#define CTF_VERSION_2 2 +#define CTF_VERSION CTF_VERSION_2 /* current version */ + +#define CTF_F_COMPRESS 0x1 /* data buffer is compressed */ + +typedef struct ctf_lblent { + uint_t ctl_label; /* ref to name of label */ + uint_t ctl_typeidx; /* last type associated with this label */ +} ctf_lblent_t; + +typedef struct ctf_stype { + uint_t ctt_name; /* reference to name in string table */ + ushort_t ctt_info; /* encoded kind, variant length (see below) */ + union { + ushort_t _size; /* size of entire type in bytes */ + ushort_t _type; /* reference to another type */ + } _u; +} ctf_stype_t; + +/* + * type sizes, measured in bytes, come in two flavors. 99% of them fit within + * (USHRT_MAX - 1), and thus can be stored in the ctt_size member of a + * ctf_stype_t. The maximum value for these sizes is CTF_MAX_SIZE. The sizes + * larger than CTF_MAX_SIZE must be stored in the ctt_lsize member of a + * ctf_type_t. Use of this member is indicated by the presence of + * CTF_LSIZE_SENT in ctt_size. + */ +typedef struct ctf_type { + uint_t ctt_name; /* reference to name in string table */ + ushort_t ctt_info; /* encoded kind, variant length (see below) */ + union { + ushort_t _size; /* always CTF_LSIZE_SENT */ + ushort_t _type; /* do not use */ + } _u; + uint_t ctt_lsizehi; /* high 32 bits of type size in bytes */ + uint_t ctt_lsizelo; /* low 32 bits of type size in bytes */ +} ctf_type_t; + +#define ctt_size _u._size /* for fundamental types that have a size */ +#define ctt_type _u._type /* for types that reference another type */ + +/* + * The following macros compose and decompose values for ctt_info and + * ctt_name, as well as other structures that contain name references. 
+ * + * ------------------------ + * ctt_info: | kind | isroot | vlen | + * ------------------------ + * 15 11 10 9 0 + * + * kind = CTF_INFO_KIND(c.ctt_info); <-- CTF_K_* value (see below) + * vlen = CTF_INFO_VLEN(c.ctt_info); <-- length of variable data list + * + * stid = CTF_NAME_STID(c.ctt_name); <-- string table id number (0 or 1) + * offset = CTF_NAME_OFFSET(c.ctt_name); <-- string table byte offset + * + * c.ctt_info = CTF_TYPE_INFO(kind, vlen); + * c.ctt_name = CTF_TYPE_NAME(stid, offset); + */ + +#define CTF_INFO_KIND(info) (((info) & 0xf800) >> 11) +#define CTF_INFO_ISROOT(info) (((info) & 0x0400) >> 10) +#define CTF_INFO_VLEN(info) (((info) & CTF_MAX_VLEN)) + +#define CTF_NAME_STID(name) ((name) >> 31) +#define CTF_NAME_OFFSET(name) ((name) & 0x7fffffff) + +#define CTF_TYPE_INFO(kind, isroot, vlen) \ + (((kind) << 11) | (((isroot) ? 1 : 0) << 10) | ((vlen) & CTF_MAX_VLEN)) + +#define CTF_TYPE_NAME(stid, offset) \ + (((stid) << 31) | ((offset) & 0x7fffffff)) + +#define CTF_TYPE_ISPARENT(id) ((id) < 0x8000) +#define CTF_TYPE_ISCHILD(id) ((id) > 0x7fff) + +#define CTF_TYPE_TO_INDEX(id) ((id) & 0x7fff) +#define CTF_INDEX_TO_TYPE(id, child) ((child) ? ((id) | 0x8000) : (id)) +#define CTF_PARENT_SHIFT 15 + +#define CTF_STRTAB_0 0 /* symbolic define for string table id 0 */ +#define CTF_STRTAB_1 1 /* symbolic define for string table id 1 */ + +#define CTF_TYPE_LSIZE(cttp) \ + (((uint64_t)(cttp)->ctt_lsizehi) << 32 | (cttp)->ctt_lsizelo) +#define CTF_SIZE_TO_LSIZE_HI(size) ((uint32_t)((uint64_t)(size) >> 32)) +#define CTF_SIZE_TO_LSIZE_LO(size) ((uint32_t)(size)) + +#ifdef CTF_OLD_VERSIONS + +#define CTF_INFO_KIND_V1(info) (((info) & 0xf000) >> 12) +#define CTF_INFO_ISROOT_V1(info) (((info) & 0x0800) >> 11) +#define CTF_INFO_VLEN_V1(info) (((info) & 0x07ff)) + +#define CTF_TYPE_INFO_V1(kind, isroot, vlen) \ + (((kind) << 12) | (((isroot) ? 1 : 0) << 11) | ((vlen) & 0x07ff)) + +#endif /* CTF_OLD_VERSIONS */ + +/* + * Values for CTF_TYPE_KIND(). If the kind has an associated data list, + * CTF_INFO_VLEN() will extract the number of elements in the list, and + * the type of each element is shown in the comments below. + */ +#define CTF_K_UNKNOWN 0 /* unknown type (used for padding) */ +#define CTF_K_INTEGER 1 /* variant data is CTF_INT_DATA() (see below) */ +#define CTF_K_FLOAT 2 /* variant data is CTF_FP_DATA() (see below) */ +#define CTF_K_POINTER 3 /* ctt_type is referenced type */ +#define CTF_K_ARRAY 4 /* variant data is single ctf_array_t */ +#define CTF_K_FUNCTION 5 /* ctt_type is return type, variant data is */ + /* list of argument types (ushort_t's) */ +#define CTF_K_STRUCT 6 /* variant data is list of ctf_member_t's */ +#define CTF_K_UNION 7 /* variant data is list of ctf_member_t's */ +#define CTF_K_ENUM 8 /* variant data is list of ctf_enum_t's */ +#define CTF_K_FORWARD 9 /* no additional data; ctt_name is tag */ +#define CTF_K_TYPEDEF 10 /* ctt_type is referenced type */ +#define CTF_K_VOLATILE 11 /* ctt_type is base type */ +#define CTF_K_CONST 12 /* ctt_type is base type */ +#define CTF_K_RESTRICT 13 /* ctt_type is base type */ + +#define CTF_K_MAX 31 /* Maximum possible CTF_K_* value */ + +/* + * Values for ctt_type when kind is CTF_K_INTEGER. The flags, offset in bits, + * and size in bits are encoded as a single word using the following macros. 
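+ *
+ * For example (an illustrative note, not original text), a signed 32-bit
+ * integer at bit offset zero would round-trip through these macros as:
+ *
+ *	uint_t data = CTF_INT_DATA(CTF_INT_SIGNED, 0, 32);
+ *
+ *	CTF_INT_ENCODING(data) == CTF_INT_SIGNED
+ *	CTF_INT_OFFSET(data) == 0
+ *	CTF_INT_BITS(data) == 32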
+ */ +#define CTF_INT_ENCODING(data) (((data) & 0xff000000) >> 24) +#define CTF_INT_OFFSET(data) (((data) & 0x00ff0000) >> 16) +#define CTF_INT_BITS(data) (((data) & 0x0000ffff)) + +#define CTF_INT_DATA(encoding, offset, bits) \ + (((encoding) << 24) | ((offset) << 16) | (bits)) + +#define CTF_INT_SIGNED 0x01 /* integer is signed (otherwise unsigned) */ +#define CTF_INT_CHAR 0x02 /* character display format */ +#define CTF_INT_BOOL 0x04 /* boolean display format */ +#define CTF_INT_VARARGS 0x08 /* varargs display format */ + +/* + * Values for ctt_type when kind is CTF_K_FLOAT. The encoding, offset in bits, + * and size in bits are encoded as a single word using the following macros. + */ +#define CTF_FP_ENCODING(data) (((data) & 0xff000000) >> 24) +#define CTF_FP_OFFSET(data) (((data) & 0x00ff0000) >> 16) +#define CTF_FP_BITS(data) (((data) & 0x0000ffff)) + +#define CTF_FP_DATA(encoding, offset, bits) \ + (((encoding) << 24) | ((offset) << 16) | (bits)) + +#define CTF_FP_SINGLE 1 /* IEEE 32-bit float encoding */ +#define CTF_FP_DOUBLE 2 /* IEEE 64-bit float encoding */ +#define CTF_FP_CPLX 3 /* Complex encoding */ +#define CTF_FP_DCPLX 4 /* Double complex encoding */ +#define CTF_FP_LDCPLX 5 /* Long double complex encoding */ +#define CTF_FP_LDOUBLE 6 /* Long double encoding */ +#define CTF_FP_INTRVL 7 /* Interval (2x32-bit) encoding */ +#define CTF_FP_DINTRVL 8 /* Double interval (2x64-bit) encoding */ +#define CTF_FP_LDINTRVL 9 /* Long double interval (2x128-bit) encoding */ +#define CTF_FP_IMAGRY 10 /* Imaginary (32-bit) encoding */ +#define CTF_FP_DIMAGRY 11 /* Long imaginary (64-bit) encoding */ +#define CTF_FP_LDIMAGRY 12 /* Long double imaginary (128-bit) encoding */ + +#define CTF_FP_MAX 12 /* Maximum possible CTF_FP_* value */ + +typedef struct ctf_array { + ushort_t cta_contents; /* reference to type of array contents */ + ushort_t cta_index; /* reference to type of array index */ + uint_t cta_nelems; /* number of elements */ +} ctf_array_t; + +/* + * Most structure members have bit offsets that can be expressed using a + * short. Some don't. ctf_member_t is used for structs which cannot + * contain any of these large offsets, whereas ctf_lmember_t is used in the + * latter case. If ctt_size for a given struct is >= 8192 bytes, all members + * will be stored as type ctf_lmember_t. 
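+ *
+ * An illustrative reader-side sketch (our assumption of typical usage):
+ * given a struct whose ctt_size is at least CTF_LSTRUCT_THRESH, each member
+ * is a ctf_lmember_t and its bit offset is recovered as:
+ *
+ *	uint64_t off = CTF_LMEM_OFFSET(lmp);	(lmp: const ctf_lmember_t *)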
+ */
+
+#define	CTF_LSTRUCT_THRESH	8192
+
+typedef struct ctf_member {
+	uint_t	ctm_name;	/* reference to name in string table */
+	ushort_t ctm_type;	/* reference to type of member */
+	ushort_t ctm_offset;	/* offset of this member in bits */
+} ctf_member_t;
+
+typedef struct ctf_lmember {
+	uint_t	ctlm_name;	/* reference to name in string table */
+	ushort_t ctlm_type;	/* reference to type of member */
+	ushort_t ctlm_pad;	/* padding */
+	uint_t	ctlm_offsethi;	/* high 32 bits of member offset in bits */
+	uint_t	ctlm_offsetlo;	/* low 32 bits of member offset in bits */
+} ctf_lmember_t;
+
+#define	CTF_LMEM_OFFSET(ctlmp) \
+	(((uint64_t)(ctlmp)->ctlm_offsethi) << 32 | (ctlmp)->ctlm_offsetlo)
+#define	CTF_OFFSET_TO_LMEMHI(offset)	((uint32_t)((uint64_t)(offset) >> 32))
+#define	CTF_OFFSET_TO_LMEMLO(offset)	((uint32_t)(offset))
+
+typedef struct ctf_enum {
+	uint_t	cte_name;	/* reference to name in string table */
+	int	cte_value;	/* value associated with this name */
+} ctf_enum_t;
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _CTF_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/ctf_api.h b/sys/cddl/contrib/opensolaris/uts/common/sys/ctf_api.h
new file mode 100644
index 000000000000..6b7ab01b6929
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/ctf_api.h
@@ -0,0 +1,251 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright (c) 2013, Joyent, Inc.  All rights reserved.
+ */
+
+/*
+ * This header file defines the interfaces available from the CTF debugger
+ * library, libctf, and an equivalent kernel module.  This API can be used by
+ * a debugger to operate on data in the Compact ANSI-C Type Format (CTF).
+ * This is NOT a public interface, although it may eventually become one in
+ * the fullness of time after we gain more experience with the interfaces.
+ *
+ * In the meantime, be aware that any program linked with this API in this
+ * release of Solaris is almost guaranteed to break in the next release.
+ *
+ * In short, do not use this header file or the CTF routines for any purpose.
+ */
+
+#ifndef	_CTF_API_H
+#define	_CTF_API_H
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/elf.h>
+#include <sys/ctf.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/*
+ * Clients can open one or more CTF containers and obtain a pointer to an
+ * opaque ctf_file_t.  Types are identified by an opaque ctf_id_t token.
+ * These opaque definitions allow libctf to evolve without breaking clients.
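+ *
+ * A minimal illustrative open/close sequence (error handling reduced to
+ * its essentials; not a statement of the full API contract):
+ *
+ *	int err;
+ *	ctf_file_t *fp = ctf_open("a.out", &err);
+ *
+ *	if (fp == NULL)
+ *		(void) fprintf(stderr, "%s\n", ctf_errmsg(err));
+ *	else
+ *		ctf_close(fp);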
+ */ +typedef struct ctf_file ctf_file_t; +typedef long ctf_id_t; + +/* + * If the debugger needs to provide the CTF library with a set of raw buffers + * for use as the CTF data, symbol table, and string table, it can do so by + * filling in ctf_sect_t structures and passing them to ctf_bufopen(): + */ +typedef struct ctf_sect { + const char *cts_name; /* section name (if any) */ + ulong_t cts_type; /* section type (ELF SHT_... value) */ + ulong_t cts_flags; /* section flags (ELF SHF_... value) */ +#ifdef illumos + const void *cts_data; /* pointer to section data */ +#else + void *cts_data; /* pointer to section data */ +#endif + size_t cts_size; /* size of data in bytes */ + size_t cts_entsize; /* size of each section entry (symtab only) */ + off64_t cts_offset; /* file offset of this section (if any) */ +} ctf_sect_t; + +/* + * Encoding information for integers, floating-point values, and certain other + * intrinsics can be obtained by calling ctf_type_encoding(), below. The flags + * field will contain values appropriate for the type defined in <sys/ctf.h>. + */ +typedef struct ctf_encoding { + uint_t cte_format; /* data format (CTF_INT_* or CTF_FP_* flags) */ + uint_t cte_offset; /* offset of value in bits */ + uint_t cte_bits; /* size of storage in bits */ +} ctf_encoding_t; + +typedef struct ctf_membinfo { + ctf_id_t ctm_type; /* type of struct or union member */ + ulong_t ctm_offset; /* offset of member in bits */ +} ctf_membinfo_t; + +typedef struct ctf_arinfo { + ctf_id_t ctr_contents; /* type of array contents */ + ctf_id_t ctr_index; /* type of array index */ + uint_t ctr_nelems; /* number of elements */ +} ctf_arinfo_t; + +typedef struct ctf_funcinfo { + ctf_id_t ctc_return; /* function return type */ + uint_t ctc_argc; /* number of typed arguments to function */ + uint_t ctc_flags; /* function attributes (see below) */ +} ctf_funcinfo_t; + +typedef struct ctf_lblinfo { + ctf_id_t ctb_typeidx; /* last type associated with the label */ +} ctf_lblinfo_t; + +#define CTF_FUNC_VARARG 0x1 /* function arguments end with varargs */ + +/* + * Functions that return integer status or a ctf_id_t use the following value + * to indicate failure. ctf_errno() can be used to obtain an error code. + */ +#define CTF_ERR (-1L) + +/* + * The CTF data model is inferred to be the caller's data model or the data + * model of the given object, unless ctf_setmodel() is explicitly called. + */ +#define CTF_MODEL_ILP32 1 /* object data model is ILP32 */ +#define CTF_MODEL_LP64 2 /* object data model is LP64 */ +#ifdef _LP64 +#define CTF_MODEL_NATIVE CTF_MODEL_LP64 +#else +#define CTF_MODEL_NATIVE CTF_MODEL_ILP32 +#endif + +/* + * Dynamic CTF containers can be created using ctf_create(). The ctf_add_* + * routines can be used to add new definitions to the dynamic container. + * New types are labeled as root or non-root to determine whether they are + * visible at the top-level program scope when subsequently doing a lookup. 
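+ *
+ * An illustrative sketch of building a dynamic container (assembled from
+ * the ctf_create(), ctf_add_*() and ctf_update() interfaces declared below;
+ * not taken from the original documentation):
+ *
+ *	int err;
+ *	ctf_encoding_t enc = { CTF_INT_SIGNED, 0, 32 };
+ *	ctf_file_t *fp = ctf_create(&err);
+ *	ctf_id_t id = ctf_add_integer(fp, CTF_ADD_ROOT, "int", &enc);
+ *
+ *	if (id == CTF_ERR || ctf_update(fp) == CTF_ERR)
+ *		err = ctf_errno(fp);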
+ */ +#define CTF_ADD_NONROOT 0 /* type only visible in nested scope */ +#define CTF_ADD_ROOT 1 /* type visible at top-level scope */ + +/* + * These typedefs are used to define the signature for callback functions + * that can be used with the iteration and visit functions below: + */ +typedef int ctf_visit_f(const char *, ctf_id_t, ulong_t, int, void *); +typedef int ctf_member_f(const char *, ctf_id_t, ulong_t, void *); +typedef int ctf_enum_f(const char *, int, void *); +typedef int ctf_type_f(ctf_id_t, void *); +typedef int ctf_label_f(const char *, const ctf_lblinfo_t *, void *); + +extern ctf_file_t *ctf_bufopen(const ctf_sect_t *, const ctf_sect_t *, + const ctf_sect_t *, int *); +extern ctf_file_t *ctf_fdopen(int, int *); +extern ctf_file_t *ctf_open(const char *, int *); +extern ctf_file_t *ctf_create(int *); +extern ctf_file_t *ctf_dup(ctf_file_t *); +extern void ctf_close(ctf_file_t *); + +extern ctf_file_t *ctf_parent_file(ctf_file_t *); +extern const char *ctf_parent_name(ctf_file_t *); + +extern int ctf_import(ctf_file_t *, ctf_file_t *); +extern int ctf_setmodel(ctf_file_t *, int); +extern int ctf_getmodel(ctf_file_t *); + +extern void ctf_setspecific(ctf_file_t *, void *); +extern void *ctf_getspecific(ctf_file_t *); + +extern int ctf_errno(ctf_file_t *); +extern const char *ctf_errmsg(int); +extern int ctf_version(int); + +extern int ctf_func_info(ctf_file_t *, ulong_t, ctf_funcinfo_t *); +extern int ctf_func_args(ctf_file_t *, ulong_t, uint_t, ctf_id_t *); + +extern ctf_id_t ctf_lookup_by_name(ctf_file_t *, const char *); +extern ctf_id_t ctf_lookup_by_symbol(ctf_file_t *, ulong_t); + +extern ctf_id_t ctf_type_resolve(ctf_file_t *, ctf_id_t); +extern ssize_t ctf_type_lname(ctf_file_t *, ctf_id_t, char *, size_t); +extern char *ctf_type_name(ctf_file_t *, ctf_id_t, char *, size_t); +extern char *ctf_type_qname(ctf_file_t *, ctf_id_t, char *, size_t, + const char *); +extern ssize_t ctf_type_size(ctf_file_t *, ctf_id_t); +extern ssize_t ctf_type_align(ctf_file_t *, ctf_id_t); +extern int ctf_type_kind(ctf_file_t *, ctf_id_t); +extern ctf_id_t ctf_type_reference(ctf_file_t *, ctf_id_t); +extern ctf_id_t ctf_type_pointer(ctf_file_t *, ctf_id_t); +extern int ctf_type_encoding(ctf_file_t *, ctf_id_t, ctf_encoding_t *); +extern int ctf_type_visit(ctf_file_t *, ctf_id_t, ctf_visit_f *, void *); +extern int ctf_type_cmp(ctf_file_t *, ctf_id_t, ctf_file_t *, ctf_id_t); +extern int ctf_type_compat(ctf_file_t *, ctf_id_t, ctf_file_t *, ctf_id_t); + +extern int ctf_member_info(ctf_file_t *, ctf_id_t, const char *, + ctf_membinfo_t *); +extern int ctf_array_info(ctf_file_t *, ctf_id_t, ctf_arinfo_t *); + +extern const char *ctf_enum_name(ctf_file_t *, ctf_id_t, int); +extern int ctf_enum_value(ctf_file_t *, ctf_id_t, const char *, int *); + +extern const char *ctf_label_topmost(ctf_file_t *); +extern int ctf_label_info(ctf_file_t *, const char *, ctf_lblinfo_t *); + +extern int ctf_member_iter(ctf_file_t *, ctf_id_t, ctf_member_f *, void *); +extern int ctf_enum_iter(ctf_file_t *, ctf_id_t, ctf_enum_f *, void *); +extern int ctf_type_iter(ctf_file_t *, ctf_type_f *, void *); +extern int ctf_label_iter(ctf_file_t *, ctf_label_f *, void *); + +extern ctf_id_t ctf_add_array(ctf_file_t *, uint_t, const ctf_arinfo_t *); +extern ctf_id_t ctf_add_const(ctf_file_t *, uint_t, ctf_id_t); +extern ctf_id_t ctf_add_enum(ctf_file_t *, uint_t, const char *); +extern ctf_id_t ctf_add_float(ctf_file_t *, uint_t, + const char *, const ctf_encoding_t *); +extern ctf_id_t ctf_add_forward(ctf_file_t *, 
uint_t, const char *, uint_t); +extern ctf_id_t ctf_add_function(ctf_file_t *, uint_t, + const ctf_funcinfo_t *, const ctf_id_t *); +extern ctf_id_t ctf_add_integer(ctf_file_t *, uint_t, + const char *, const ctf_encoding_t *); +extern ctf_id_t ctf_add_pointer(ctf_file_t *, uint_t, ctf_id_t); +extern ctf_id_t ctf_add_type(ctf_file_t *, ctf_file_t *, ctf_id_t); +extern ctf_id_t ctf_add_typedef(ctf_file_t *, uint_t, const char *, ctf_id_t); +extern ctf_id_t ctf_add_restrict(ctf_file_t *, uint_t, ctf_id_t); +extern ctf_id_t ctf_add_struct(ctf_file_t *, uint_t, const char *); +extern ctf_id_t ctf_add_union(ctf_file_t *, uint_t, const char *); +extern ctf_id_t ctf_add_volatile(ctf_file_t *, uint_t, ctf_id_t); + +extern int ctf_add_enumerator(ctf_file_t *, ctf_id_t, const char *, int); +extern int ctf_add_member(ctf_file_t *, ctf_id_t, const char *, ctf_id_t); + +extern int ctf_set_array(ctf_file_t *, ctf_id_t, const ctf_arinfo_t *); + +extern int ctf_delete_type(ctf_file_t *, ctf_id_t); + +extern int ctf_update(ctf_file_t *); +extern int ctf_discard(ctf_file_t *); +extern int ctf_write(ctf_file_t *, int); + +#ifdef _KERNEL + +struct module; +extern ctf_file_t *ctf_modopen(struct module *, int *); + +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _CTF_API_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/debug.h b/sys/cddl/contrib/opensolaris/uts/common/sys/debug.h new file mode 100644 index 000000000000..cf8a15e5b6cc --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/sys/debug.h @@ -0,0 +1,159 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2014 Garrett D'Amore <garrett@damore.org> + * + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright 2013 Saso Kiselkov. All rights reserved. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +#ifndef _SYS_DEBUG_H +#define _SYS_DEBUG_H + +#include <sys/types.h> +#include <sys/note.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * ASSERT(ex) causes a panic or debugger entry if expression ex is not + * true. ASSERT() is included only for debugging, and is a no-op in + * production kernels. VERIFY(ex), on the other hand, behaves like + * ASSERT and is evaluated on both debug and non-debug kernels. 
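+ *
+ * Illustrative usage (following the behavior described above):
+ *
+ *	VERIFY(ptr != NULL);		(checked in all kernels)
+ *	ASSERT(ptr != NULL);		(checked only when built with DEBUG)
+ *	ASSERT3P(ptr, !=, NULL);	(also prints both operand values)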
+ */ + +extern int assfail(const char *, const char *, int); +#define VERIFY(EX) ((void)((EX) || assfail(#EX, __FILE__, __LINE__))) +#ifdef DEBUG +#define ASSERT(EX) ((void)((EX) || assfail(#EX, __FILE__, __LINE__))) +#else +#define ASSERT(x) ((void)0) +#endif + +/* + * Assertion variants sensitive to the compilation data model + */ +#if defined(_LP64) +#define ASSERT64(x) ASSERT(x) +#define ASSERT32(x) +#else +#define ASSERT64(x) +#define ASSERT32(x) ASSERT(x) +#endif + +/* + * IMPLY and EQUIV are assertions of the form: + * + * if (a) then (b) + * and + * if (a) then (b) *AND* if (b) then (a) + */ +#ifdef DEBUG +#define IMPLY(A, B) \ + ((void)(((!(A)) || (B)) || \ + assfail("(" #A ") implies (" #B ")", __FILE__, __LINE__))) +#define EQUIV(A, B) \ + ((void)((!!(A) == !!(B)) || \ + assfail("(" #A ") is equivalent to (" #B ")", __FILE__, __LINE__))) +#else +#define IMPLY(A, B) ((void)0) +#define EQUIV(A, B) ((void)0) +#endif + +/* + * ASSERT3() behaves like ASSERT() except that it is an explicit conditional, + * and prints out the values of the left and right hand expressions as part of + * the panic message to ease debugging. The three variants imply the type + * of their arguments. ASSERT3S() is for signed data types, ASSERT3U() is + * for unsigned, and ASSERT3P() is for pointers. The VERIFY3*() macros + * have the same relationship as above. + */ +extern void assfail3(const char *, uintmax_t, const char *, uintmax_t, + const char *, int); +#define VERIFY3_IMPL(LEFT, OP, RIGHT, TYPE) do { \ + const TYPE __left = (TYPE)(LEFT); \ + const TYPE __right = (TYPE)(RIGHT); \ + if (!(__left OP __right)) \ + assfail3(#LEFT " " #OP " " #RIGHT, \ + (uintmax_t)__left, #OP, (uintmax_t)__right, \ + __FILE__, __LINE__); \ +_NOTE(CONSTCOND) } while (0) + +#define VERIFY3B(x, y, z) VERIFY3_IMPL(x, y, z, boolean_t) +#define VERIFY3S(x, y, z) VERIFY3_IMPL(x, y, z, int64_t) +#define VERIFY3U(x, y, z) VERIFY3_IMPL(x, y, z, uint64_t) +#define VERIFY3P(x, y, z) VERIFY3_IMPL(x, y, z, uintptr_t) +#define VERIFY0(x) VERIFY3_IMPL(x, ==, 0, uintmax_t) + +#ifdef DEBUG +#define ASSERT3B(x, y, z) VERIFY3_IMPL(x, y, z, boolean_t) +#define ASSERT3S(x, y, z) VERIFY3_IMPL(x, y, z, int64_t) +#define ASSERT3U(x, y, z) VERIFY3_IMPL(x, y, z, uint64_t) +#define ASSERT3P(x, y, z) VERIFY3_IMPL(x, y, z, uintptr_t) +#define ASSERT0(x) VERIFY3_IMPL(x, ==, 0, uintmax_t) +#else +#define ASSERT3B(x, y, z) ((void)0) +#define ASSERT3S(x, y, z) ((void)0) +#define ASSERT3U(x, y, z) ((void)0) +#define ASSERT3P(x, y, z) ((void)0) +#define ASSERT0(x) ((void)0) +#endif + +/* + * Compile-time assertion. The condition 'x' must be constant. 
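+ *
+ * For example (illustrative; foo_t is a hypothetical type):
+ *
+ *	CTASSERT(sizeof (foo_t) == 64);
+ *
+ * which fails the build if the expression is not true.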
+ */ +#ifndef CTASSERT +#define CTASSERT(x) _CTASSERT(x, __LINE__) +#define _CTASSERT(x, y) __CTASSERT(x, y) +#define __CTASSERT(x, y) \ + _Static_assert((x), "Static assert failed at " #y) +#endif + +#ifdef _KERNEL + +extern void abort_sequence_enter(char *); +extern void debug_enter(char *); + +#endif /* _KERNEL */ + +#if defined(DEBUG) && !defined(__sun) +/* CSTYLED */ +#define STATIC +#else +/* CSTYLED */ +#define STATIC static +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DEBUG_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/dtrace.h b/sys/cddl/contrib/opensolaris/uts/common/sys/dtrace.h new file mode 100644 index 000000000000..b474f91ce01d --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/sys/dtrace.h @@ -0,0 +1,2510 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2013 by Delphix. All rights reserved. + */ + +#ifndef _SYS_DTRACE_H +#define _SYS_DTRACE_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * DTrace Dynamic Tracing Software: Kernel Interfaces + * + * Note: The contents of this file are private to the implementation of the + * Solaris system and DTrace subsystem and are subject to change at any time + * without notice. Applications and drivers using these interfaces will fail + * to run on future releases. These interfaces should not be used for any + * purpose except those expressly outlined in dtrace(7D) and libdtrace(3LIB). + * Please refer to the "Solaris Dynamic Tracing Guide" for more information. 
+ */
+
+#ifndef _ASM
+
+#include <sys/types.h>
+#include <sys/modctl.h>
+#include <sys/processor.h>
+#ifdef illumos
+#include <sys/systm.h>
+#else
+#include <sys/cpuvar.h>
+#include <sys/param.h>
+#include <sys/linker.h>
+#include <sys/ioccom.h>
+#include <sys/ucred.h>
+typedef int model_t;
+#endif
+#include <sys/ctf_api.h>
+#ifdef illumos
+#include <sys/cyclic.h>
+#include <sys/int_limits.h>
+#else
+#include <sys/stdint.h>
+#endif
+
+/*
+ * DTrace Universal Constants and Typedefs
+ */
+#define	DTRACE_CPUALL		-1	/* all CPUs */
+#define	DTRACE_IDNONE		0	/* invalid probe identifier */
+#define	DTRACE_EPIDNONE		0	/* invalid enabled probe identifier */
+#define	DTRACE_AGGIDNONE	0	/* invalid aggregation identifier */
+#define	DTRACE_AGGVARIDNONE	0	/* invalid aggregation variable ID */
+#define	DTRACE_CACHEIDNONE	0	/* invalid predicate cache */
+#define	DTRACE_PROVNONE		0	/* invalid provider identifier */
+#define	DTRACE_METAPROVNONE	0	/* invalid meta-provider identifier */
+#define	DTRACE_ARGNONE		-1	/* invalid argument index */
+
+#define	DTRACE_PROVNAMELEN	64
+#define	DTRACE_MODNAMELEN	64
+#define	DTRACE_FUNCNAMELEN	192
+#define	DTRACE_NAMELEN		64
+#define	DTRACE_FULLNAMELEN	(DTRACE_PROVNAMELEN + DTRACE_MODNAMELEN + \
+				DTRACE_FUNCNAMELEN + DTRACE_NAMELEN + 4)
+#define	DTRACE_ARGTYPELEN	128
+
+typedef uint32_t dtrace_id_t;		/* probe identifier */
+typedef uint32_t dtrace_epid_t;		/* enabled probe identifier */
+typedef uint32_t dtrace_aggid_t;	/* aggregation identifier */
+typedef int64_t dtrace_aggvarid_t;	/* aggregation variable identifier */
+typedef uint16_t dtrace_actkind_t;	/* action kind */
+typedef int64_t dtrace_optval_t;	/* option value */
+typedef uint32_t dtrace_cacheid_t;	/* predicate cache identifier */
+
+typedef enum dtrace_probespec {
+	DTRACE_PROBESPEC_NONE = -1,
+	DTRACE_PROBESPEC_PROVIDER = 0,
+	DTRACE_PROBESPEC_MOD,
+	DTRACE_PROBESPEC_FUNC,
+	DTRACE_PROBESPEC_NAME
+} dtrace_probespec_t;
+
+/*
+ * DTrace Intermediate Format (DIF)
+ *
+ * The following definitions describe the DTrace Intermediate Format (DIF),
+ * a RISC-like instruction set and program encoding used to represent
+ * predicates and actions that can be bound to DTrace probes.  The constants
+ * below defining the number of available registers are suggested minimums; the
+ * compiler should use DTRACEIOC_CONF to dynamically obtain the number of
+ * registers provided by the current DTrace implementation.
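+ *
+ * As an illustrative sketch (ours, not normative), the encoding macros
+ * defined below would assemble and decode "add %r1, %r2, %r3" as:
+ *
+ *	dif_instr_t in = DIF_INSTR_FMT(DIF_OP_ADD, 1, 2, 3);
+ *
+ *	DIF_INSTR_OP(in) == DIF_OP_ADD
+ *	DIF_INSTR_R1(in) == 1
+ *	DIF_INSTR_R2(in) == 2
+ *	DIF_INSTR_RD(in) == 3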
+ */ +#define DIF_VERSION_1 1 /* DIF version 1: Solaris 10 Beta */ +#define DIF_VERSION_2 2 /* DIF version 2: Solaris 10 FCS */ +#define DIF_VERSION DIF_VERSION_2 /* latest DIF instruction set version */ +#define DIF_DIR_NREGS 8 /* number of DIF integer registers */ +#define DIF_DTR_NREGS 8 /* number of DIF tuple registers */ + +#define DIF_OP_OR 1 /* or r1, r2, rd */ +#define DIF_OP_XOR 2 /* xor r1, r2, rd */ +#define DIF_OP_AND 3 /* and r1, r2, rd */ +#define DIF_OP_SLL 4 /* sll r1, r2, rd */ +#define DIF_OP_SRL 5 /* srl r1, r2, rd */ +#define DIF_OP_SUB 6 /* sub r1, r2, rd */ +#define DIF_OP_ADD 7 /* add r1, r2, rd */ +#define DIF_OP_MUL 8 /* mul r1, r2, rd */ +#define DIF_OP_SDIV 9 /* sdiv r1, r2, rd */ +#define DIF_OP_UDIV 10 /* udiv r1, r2, rd */ +#define DIF_OP_SREM 11 /* srem r1, r2, rd */ +#define DIF_OP_UREM 12 /* urem r1, r2, rd */ +#define DIF_OP_NOT 13 /* not r1, rd */ +#define DIF_OP_MOV 14 /* mov r1, rd */ +#define DIF_OP_CMP 15 /* cmp r1, r2 */ +#define DIF_OP_TST 16 /* tst r1 */ +#define DIF_OP_BA 17 /* ba label */ +#define DIF_OP_BE 18 /* be label */ +#define DIF_OP_BNE 19 /* bne label */ +#define DIF_OP_BG 20 /* bg label */ +#define DIF_OP_BGU 21 /* bgu label */ +#define DIF_OP_BGE 22 /* bge label */ +#define DIF_OP_BGEU 23 /* bgeu label */ +#define DIF_OP_BL 24 /* bl label */ +#define DIF_OP_BLU 25 /* blu label */ +#define DIF_OP_BLE 26 /* ble label */ +#define DIF_OP_BLEU 27 /* bleu label */ +#define DIF_OP_LDSB 28 /* ldsb [r1], rd */ +#define DIF_OP_LDSH 29 /* ldsh [r1], rd */ +#define DIF_OP_LDSW 30 /* ldsw [r1], rd */ +#define DIF_OP_LDUB 31 /* ldub [r1], rd */ +#define DIF_OP_LDUH 32 /* lduh [r1], rd */ +#define DIF_OP_LDUW 33 /* lduw [r1], rd */ +#define DIF_OP_LDX 34 /* ldx [r1], rd */ +#define DIF_OP_RET 35 /* ret rd */ +#define DIF_OP_NOP 36 /* nop */ +#define DIF_OP_SETX 37 /* setx intindex, rd */ +#define DIF_OP_SETS 38 /* sets strindex, rd */ +#define DIF_OP_SCMP 39 /* scmp r1, r2 */ +#define DIF_OP_LDGA 40 /* ldga var, ri, rd */ +#define DIF_OP_LDGS 41 /* ldgs var, rd */ +#define DIF_OP_STGS 42 /* stgs var, rs */ +#define DIF_OP_LDTA 43 /* ldta var, ri, rd */ +#define DIF_OP_LDTS 44 /* ldts var, rd */ +#define DIF_OP_STTS 45 /* stts var, rs */ +#define DIF_OP_SRA 46 /* sra r1, r2, rd */ +#define DIF_OP_CALL 47 /* call subr, rd */ +#define DIF_OP_PUSHTR 48 /* pushtr type, rs, rr */ +#define DIF_OP_PUSHTV 49 /* pushtv type, rs, rv */ +#define DIF_OP_POPTS 50 /* popts */ +#define DIF_OP_FLUSHTS 51 /* flushts */ +#define DIF_OP_LDGAA 52 /* ldgaa var, rd */ +#define DIF_OP_LDTAA 53 /* ldtaa var, rd */ +#define DIF_OP_STGAA 54 /* stgaa var, rs */ +#define DIF_OP_STTAA 55 /* sttaa var, rs */ +#define DIF_OP_LDLS 56 /* ldls var, rd */ +#define DIF_OP_STLS 57 /* stls var, rs */ +#define DIF_OP_ALLOCS 58 /* allocs r1, rd */ +#define DIF_OP_COPYS 59 /* copys r1, r2, rd */ +#define DIF_OP_STB 60 /* stb r1, [rd] */ +#define DIF_OP_STH 61 /* sth r1, [rd] */ +#define DIF_OP_STW 62 /* stw r1, [rd] */ +#define DIF_OP_STX 63 /* stx r1, [rd] */ +#define DIF_OP_ULDSB 64 /* uldsb [r1], rd */ +#define DIF_OP_ULDSH 65 /* uldsh [r1], rd */ +#define DIF_OP_ULDSW 66 /* uldsw [r1], rd */ +#define DIF_OP_ULDUB 67 /* uldub [r1], rd */ +#define DIF_OP_ULDUH 68 /* ulduh [r1], rd */ +#define DIF_OP_ULDUW 69 /* ulduw [r1], rd */ +#define DIF_OP_ULDX 70 /* uldx [r1], rd */ +#define DIF_OP_RLDSB 71 /* rldsb [r1], rd */ +#define DIF_OP_RLDSH 72 /* rldsh [r1], rd */ +#define DIF_OP_RLDSW 73 /* rldsw [r1], rd */ +#define DIF_OP_RLDUB 74 /* rldub [r1], rd */ +#define DIF_OP_RLDUH 75 /* rlduh 
[r1], rd */ +#define DIF_OP_RLDUW 76 /* rlduw [r1], rd */ +#define DIF_OP_RLDX 77 /* rldx [r1], rd */ +#define DIF_OP_XLATE 78 /* xlate xlrindex, rd */ +#define DIF_OP_XLARG 79 /* xlarg xlrindex, rd */ + +#define DIF_INTOFF_MAX 0xffff /* highest integer table offset */ +#define DIF_STROFF_MAX 0xffff /* highest string table offset */ +#define DIF_REGISTER_MAX 0xff /* highest register number */ +#define DIF_VARIABLE_MAX 0xffff /* highest variable identifier */ +#define DIF_SUBROUTINE_MAX 0xffff /* highest subroutine code */ + +#define DIF_VAR_ARRAY_MIN 0x0000 /* lowest numbered array variable */ +#define DIF_VAR_ARRAY_UBASE 0x0080 /* lowest user-defined array */ +#define DIF_VAR_ARRAY_MAX 0x00ff /* highest numbered array variable */ + +#define DIF_VAR_OTHER_MIN 0x0100 /* lowest numbered scalar or assc */ +#define DIF_VAR_OTHER_UBASE 0x0500 /* lowest user-defined scalar or assc */ +#define DIF_VAR_OTHER_MAX 0xffff /* highest numbered scalar or assc */ + +#define DIF_VAR_ARGS 0x0000 /* arguments array */ +#define DIF_VAR_REGS 0x0001 /* registers array */ +#define DIF_VAR_UREGS 0x0002 /* user registers array */ +#define DIF_VAR_CURTHREAD 0x0100 /* thread pointer */ +#define DIF_VAR_TIMESTAMP 0x0101 /* timestamp */ +#define DIF_VAR_VTIMESTAMP 0x0102 /* virtual timestamp */ +#define DIF_VAR_IPL 0x0103 /* interrupt priority level */ +#define DIF_VAR_EPID 0x0104 /* enabled probe ID */ +#define DIF_VAR_ID 0x0105 /* probe ID */ +#define DIF_VAR_ARG0 0x0106 /* first argument */ +#define DIF_VAR_ARG1 0x0107 /* second argument */ +#define DIF_VAR_ARG2 0x0108 /* third argument */ +#define DIF_VAR_ARG3 0x0109 /* fourth argument */ +#define DIF_VAR_ARG4 0x010a /* fifth argument */ +#define DIF_VAR_ARG5 0x010b /* sixth argument */ +#define DIF_VAR_ARG6 0x010c /* seventh argument */ +#define DIF_VAR_ARG7 0x010d /* eighth argument */ +#define DIF_VAR_ARG8 0x010e /* ninth argument */ +#define DIF_VAR_ARG9 0x010f /* tenth argument */ +#define DIF_VAR_STACKDEPTH 0x0110 /* stack depth */ +#define DIF_VAR_CALLER 0x0111 /* caller */ +#define DIF_VAR_PROBEPROV 0x0112 /* probe provider */ +#define DIF_VAR_PROBEMOD 0x0113 /* probe module */ +#define DIF_VAR_PROBEFUNC 0x0114 /* probe function */ +#define DIF_VAR_PROBENAME 0x0115 /* probe name */ +#define DIF_VAR_PID 0x0116 /* process ID */ +#define DIF_VAR_TID 0x0117 /* (per-process) thread ID */ +#define DIF_VAR_EXECNAME 0x0118 /* name of executable */ +#define DIF_VAR_ZONENAME 0x0119 /* zone name associated with process */ +#define DIF_VAR_WALLTIMESTAMP 0x011a /* wall-clock timestamp */ +#define DIF_VAR_USTACKDEPTH 0x011b /* user-land stack depth */ +#define DIF_VAR_UCALLER 0x011c /* user-level caller */ +#define DIF_VAR_PPID 0x011d /* parent process ID */ +#define DIF_VAR_UID 0x011e /* process user ID */ +#define DIF_VAR_GID 0x011f /* process group ID */ +#define DIF_VAR_ERRNO 0x0120 /* thread errno */ +#define DIF_VAR_EXECARGS 0x0121 /* process arguments */ +#define DIF_VAR_JID 0x0122 /* process jail id */ +#define DIF_VAR_JAILNAME 0x0123 /* process jail name */ + +#ifndef illumos +#define DIF_VAR_CPU 0x0200 +#endif + +#define DIF_SUBR_RAND 0 +#define DIF_SUBR_MUTEX_OWNED 1 +#define DIF_SUBR_MUTEX_OWNER 2 +#define DIF_SUBR_MUTEX_TYPE_ADAPTIVE 3 +#define DIF_SUBR_MUTEX_TYPE_SPIN 4 +#define DIF_SUBR_RW_READ_HELD 5 +#define DIF_SUBR_RW_WRITE_HELD 6 +#define DIF_SUBR_RW_ISWRITER 7 +#define DIF_SUBR_COPYIN 8 +#define DIF_SUBR_COPYINSTR 9 +#define DIF_SUBR_SPECULATION 10 +#define DIF_SUBR_PROGENYOF 11 +#define DIF_SUBR_STRLEN 12 +#define DIF_SUBR_COPYOUT 13 +#define 
DIF_SUBR_COPYOUTSTR 14 +#define DIF_SUBR_ALLOCA 15 +#define DIF_SUBR_BCOPY 16 +#define DIF_SUBR_COPYINTO 17 +#define DIF_SUBR_MSGDSIZE 18 +#define DIF_SUBR_MSGSIZE 19 +#define DIF_SUBR_GETMAJOR 20 +#define DIF_SUBR_GETMINOR 21 +#define DIF_SUBR_DDI_PATHNAME 22 +#define DIF_SUBR_STRJOIN 23 +#define DIF_SUBR_LLTOSTR 24 +#define DIF_SUBR_BASENAME 25 +#define DIF_SUBR_DIRNAME 26 +#define DIF_SUBR_CLEANPATH 27 +#define DIF_SUBR_STRCHR 28 +#define DIF_SUBR_STRRCHR 29 +#define DIF_SUBR_STRSTR 30 +#define DIF_SUBR_STRTOK 31 +#define DIF_SUBR_SUBSTR 32 +#define DIF_SUBR_INDEX 33 +#define DIF_SUBR_RINDEX 34 +#define DIF_SUBR_HTONS 35 +#define DIF_SUBR_HTONL 36 +#define DIF_SUBR_HTONLL 37 +#define DIF_SUBR_NTOHS 38 +#define DIF_SUBR_NTOHL 39 +#define DIF_SUBR_NTOHLL 40 +#define DIF_SUBR_INET_NTOP 41 +#define DIF_SUBR_INET_NTOA 42 +#define DIF_SUBR_INET_NTOA6 43 +#define DIF_SUBR_TOUPPER 44 +#define DIF_SUBR_TOLOWER 45 +#define DIF_SUBR_MEMREF 46 +#define DIF_SUBR_SX_SHARED_HELD 47 +#define DIF_SUBR_SX_EXCLUSIVE_HELD 48 +#define DIF_SUBR_SX_ISEXCLUSIVE 49 +#define DIF_SUBR_MEMSTR 50 +#define DIF_SUBR_GETF 51 +#define DIF_SUBR_JSON 52 +#define DIF_SUBR_STRTOLL 53 +#define DIF_SUBR_MAX 53 /* max subroutine value */ + +typedef uint32_t dif_instr_t; + +#define DIF_INSTR_OP(i) (((i) >> 24) & 0xff) +#define DIF_INSTR_R1(i) (((i) >> 16) & 0xff) +#define DIF_INSTR_R2(i) (((i) >> 8) & 0xff) +#define DIF_INSTR_RD(i) ((i) & 0xff) +#define DIF_INSTR_RS(i) ((i) & 0xff) +#define DIF_INSTR_LABEL(i) ((i) & 0xffffff) +#define DIF_INSTR_VAR(i) (((i) >> 8) & 0xffff) +#define DIF_INSTR_INTEGER(i) (((i) >> 8) & 0xffff) +#define DIF_INSTR_STRING(i) (((i) >> 8) & 0xffff) +#define DIF_INSTR_SUBR(i) (((i) >> 8) & 0xffff) +#define DIF_INSTR_TYPE(i) (((i) >> 16) & 0xff) +#define DIF_INSTR_XLREF(i) (((i) >> 8) & 0xffff) + +#define DIF_INSTR_FMT(op, r1, r2, d) \ + (((op) << 24) | ((r1) << 16) | ((r2) << 8) | (d)) + +#define DIF_INSTR_NOT(r1, d) (DIF_INSTR_FMT(DIF_OP_NOT, r1, 0, d)) +#define DIF_INSTR_MOV(r1, d) (DIF_INSTR_FMT(DIF_OP_MOV, r1, 0, d)) +#define DIF_INSTR_CMP(op, r1, r2) (DIF_INSTR_FMT(op, r1, r2, 0)) +#define DIF_INSTR_TST(r1) (DIF_INSTR_FMT(DIF_OP_TST, r1, 0, 0)) +#define DIF_INSTR_BRANCH(op, label) (((op) << 24) | (label)) +#define DIF_INSTR_LOAD(op, r1, d) (DIF_INSTR_FMT(op, r1, 0, d)) +#define DIF_INSTR_STORE(op, r1, d) (DIF_INSTR_FMT(op, r1, 0, d)) +#define DIF_INSTR_SETX(i, d) ((DIF_OP_SETX << 24) | ((i) << 8) | (d)) +#define DIF_INSTR_SETS(s, d) ((DIF_OP_SETS << 24) | ((s) << 8) | (d)) +#define DIF_INSTR_RET(d) (DIF_INSTR_FMT(DIF_OP_RET, 0, 0, d)) +#define DIF_INSTR_NOP (DIF_OP_NOP << 24) +#define DIF_INSTR_LDA(op, v, r, d) (DIF_INSTR_FMT(op, v, r, d)) +#define DIF_INSTR_LDV(op, v, d) (((op) << 24) | ((v) << 8) | (d)) +#define DIF_INSTR_STV(op, v, rs) (((op) << 24) | ((v) << 8) | (rs)) +#define DIF_INSTR_CALL(s, d) ((DIF_OP_CALL << 24) | ((s) << 8) | (d)) +#define DIF_INSTR_PUSHTS(op, t, r2, rs) (DIF_INSTR_FMT(op, t, r2, rs)) +#define DIF_INSTR_POPTS (DIF_OP_POPTS << 24) +#define DIF_INSTR_FLUSHTS (DIF_OP_FLUSHTS << 24) +#define DIF_INSTR_ALLOCS(r1, d) (DIF_INSTR_FMT(DIF_OP_ALLOCS, r1, 0, d)) +#define DIF_INSTR_COPYS(r1, r2, d) (DIF_INSTR_FMT(DIF_OP_COPYS, r1, r2, d)) +#define DIF_INSTR_XLATE(op, r, d) (((op) << 24) | ((r) << 8) | (d)) + +#define DIF_REG_R0 0 /* %r0 is always set to zero */ + +/* + * A DTrace Intermediate Format Type (DIF Type) is used to represent the types + * of variables, function and associative array arguments, and the return type + * for each DIF object (shown below). 
It contains a description of the type, + * its size in bytes, and a module identifier. + */ +typedef struct dtrace_diftype { + uint8_t dtdt_kind; /* type kind (see below) */ + uint8_t dtdt_ckind; /* type kind in CTF */ + uint8_t dtdt_flags; /* type flags (see below) */ + uint8_t dtdt_pad; /* reserved for future use */ + uint32_t dtdt_size; /* type size in bytes (unless string) */ +} dtrace_diftype_t; + +#define DIF_TYPE_CTF 0 /* type is a CTF type */ +#define DIF_TYPE_STRING 1 /* type is a D string */ + +#define DIF_TF_BYREF 0x1 /* type is passed by reference */ +#define DIF_TF_BYUREF 0x2 /* user type is passed by reference */ + +/* + * A DTrace Intermediate Format variable record is used to describe each of the + * variables referenced by a given DIF object. It contains an integer variable + * identifier along with variable scope and properties, as shown below. The + * size of this structure must be sizeof (int) aligned. + */ +typedef struct dtrace_difv { + uint32_t dtdv_name; /* variable name index in dtdo_strtab */ + uint32_t dtdv_id; /* variable reference identifier */ + uint8_t dtdv_kind; /* variable kind (see below) */ + uint8_t dtdv_scope; /* variable scope (see below) */ + uint16_t dtdv_flags; /* variable flags (see below) */ + dtrace_diftype_t dtdv_type; /* variable type (see above) */ +} dtrace_difv_t; + +#define DIFV_KIND_ARRAY 0 /* variable is an array of quantities */ +#define DIFV_KIND_SCALAR 1 /* variable is a scalar quantity */ + +#define DIFV_SCOPE_GLOBAL 0 /* variable has global scope */ +#define DIFV_SCOPE_THREAD 1 /* variable has thread scope */ +#define DIFV_SCOPE_LOCAL 2 /* variable has local scope */ + +#define DIFV_F_REF 0x1 /* variable is referenced by DIFO */ +#define DIFV_F_MOD 0x2 /* variable is written by DIFO */ + +/* + * DTrace Actions + * + * The upper byte determines the class of the action; the low byte determines + * the specific action within that class.
The classes of actions are as + * follows: + * + * [ no class ] <= May record process- or kernel-related data + * DTRACEACT_PROC <= Only records process-related data + * DTRACEACT_PROC_DESTRUCTIVE <= Potentially destructive to processes + * DTRACEACT_KERNEL <= Only records kernel-related data + * DTRACEACT_KERNEL_DESTRUCTIVE <= Potentially destructive to the kernel + * DTRACEACT_SPECULATIVE <= Speculation-related action + * DTRACEACT_AGGREGATION <= Aggregating action + */ +#define DTRACEACT_NONE 0 /* no action */ +#define DTRACEACT_DIFEXPR 1 /* action is DIF expression */ +#define DTRACEACT_EXIT 2 /* exit() action */ +#define DTRACEACT_PRINTF 3 /* printf() action */ +#define DTRACEACT_PRINTA 4 /* printa() action */ +#define DTRACEACT_LIBACT 5 /* library-controlled action */ +#define DTRACEACT_TRACEMEM 6 /* tracemem() action */ +#define DTRACEACT_TRACEMEM_DYNSIZE 7 /* dynamic tracemem() size */ +#define DTRACEACT_PRINTM 8 /* printm() action (BSD) */ + +#define DTRACEACT_PROC 0x0100 +#define DTRACEACT_USTACK (DTRACEACT_PROC + 1) +#define DTRACEACT_JSTACK (DTRACEACT_PROC + 2) +#define DTRACEACT_USYM (DTRACEACT_PROC + 3) +#define DTRACEACT_UMOD (DTRACEACT_PROC + 4) +#define DTRACEACT_UADDR (DTRACEACT_PROC + 5) + +#define DTRACEACT_PROC_DESTRUCTIVE 0x0200 +#define DTRACEACT_STOP (DTRACEACT_PROC_DESTRUCTIVE + 1) +#define DTRACEACT_RAISE (DTRACEACT_PROC_DESTRUCTIVE + 2) +#define DTRACEACT_SYSTEM (DTRACEACT_PROC_DESTRUCTIVE + 3) +#define DTRACEACT_FREOPEN (DTRACEACT_PROC_DESTRUCTIVE + 4) + +#define DTRACEACT_PROC_CONTROL 0x0300 + +#define DTRACEACT_KERNEL 0x0400 +#define DTRACEACT_STACK (DTRACEACT_KERNEL + 1) +#define DTRACEACT_SYM (DTRACEACT_KERNEL + 2) +#define DTRACEACT_MOD (DTRACEACT_KERNEL + 3) + +#define DTRACEACT_KERNEL_DESTRUCTIVE 0x0500 +#define DTRACEACT_BREAKPOINT (DTRACEACT_KERNEL_DESTRUCTIVE + 1) +#define DTRACEACT_PANIC (DTRACEACT_KERNEL_DESTRUCTIVE + 2) +#define DTRACEACT_CHILL (DTRACEACT_KERNEL_DESTRUCTIVE + 3) + +#define DTRACEACT_SPECULATIVE 0x0600 +#define DTRACEACT_SPECULATE (DTRACEACT_SPECULATIVE + 1) +#define DTRACEACT_COMMIT (DTRACEACT_SPECULATIVE + 2) +#define DTRACEACT_DISCARD (DTRACEACT_SPECULATIVE + 3) + +#define DTRACEACT_CLASS(x) ((x) & 0xff00) + +#define DTRACEACT_ISDESTRUCTIVE(x) \ + (DTRACEACT_CLASS(x) == DTRACEACT_PROC_DESTRUCTIVE || \ + DTRACEACT_CLASS(x) == DTRACEACT_KERNEL_DESTRUCTIVE) + +#define DTRACEACT_ISSPECULATIVE(x) \ + (DTRACEACT_CLASS(x) == DTRACEACT_SPECULATIVE) + +#define DTRACEACT_ISPRINTFLIKE(x) \ + ((x) == DTRACEACT_PRINTF || (x) == DTRACEACT_PRINTA || \ + (x) == DTRACEACT_SYSTEM || (x) == DTRACEACT_FREOPEN) + +/* + * DTrace Aggregating Actions + * + * These are functions f(x) for which the following is true: + * + * f(f(x_0) U f(x_1) U ... U f(x_n)) = f(x_0 U x_1 U ... U x_n) + * + * where x_n is a set of arbitrary data. Aggregating actions are in their own + * DTrace action class, DTRACEACT_AGGREGATION. The macros provided here allow + * for easier processing of the aggregation argument and data payload for a few + * aggregating actions (notably: quantize(), lquantize(), and ustack()).
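+ *
+ * As a concrete (illustrative) instance: max() satisfies this property,
+ * since the maximum of per-CPU maxima is the maximum over all of the data,
+ * e.g. max(max{1, 7}, max{3, 5}) = max{1, 7, 3, 5} = 7. A function such as
+ * median() does not, which is why it cannot be an aggregating action.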
+ */ +#define DTRACEACT_AGGREGATION 0x0700 +#define DTRACEAGG_COUNT (DTRACEACT_AGGREGATION + 1) +#define DTRACEAGG_MIN (DTRACEACT_AGGREGATION + 2) +#define DTRACEAGG_MAX (DTRACEACT_AGGREGATION + 3) +#define DTRACEAGG_AVG (DTRACEACT_AGGREGATION + 4) +#define DTRACEAGG_SUM (DTRACEACT_AGGREGATION + 5) +#define DTRACEAGG_STDDEV (DTRACEACT_AGGREGATION + 6) +#define DTRACEAGG_QUANTIZE (DTRACEACT_AGGREGATION + 7) +#define DTRACEAGG_LQUANTIZE (DTRACEACT_AGGREGATION + 8) +#define DTRACEAGG_LLQUANTIZE (DTRACEACT_AGGREGATION + 9) + +#define DTRACEACT_ISAGG(x) \ + (DTRACEACT_CLASS(x) == DTRACEACT_AGGREGATION) + +#define DTRACE_QUANTIZE_NBUCKETS \ + (((sizeof (uint64_t) * NBBY) - 1) * 2 + 1) + +#define DTRACE_QUANTIZE_ZEROBUCKET ((sizeof (uint64_t) * NBBY) - 1) + +#define DTRACE_QUANTIZE_BUCKETVAL(buck) \ + (int64_t)((buck) < DTRACE_QUANTIZE_ZEROBUCKET ? \ + -(1LL << (DTRACE_QUANTIZE_ZEROBUCKET - 1 - (buck))) : \ + (buck) == DTRACE_QUANTIZE_ZEROBUCKET ? 0 : \ + 1LL << ((buck) - DTRACE_QUANTIZE_ZEROBUCKET - 1)) + +#define DTRACE_LQUANTIZE_STEPSHIFT 48 +#define DTRACE_LQUANTIZE_STEPMASK ((uint64_t)UINT16_MAX << 48) +#define DTRACE_LQUANTIZE_LEVELSHIFT 32 +#define DTRACE_LQUANTIZE_LEVELMASK ((uint64_t)UINT16_MAX << 32) +#define DTRACE_LQUANTIZE_BASESHIFT 0 +#define DTRACE_LQUANTIZE_BASEMASK UINT32_MAX + +#define DTRACE_LQUANTIZE_STEP(x) \ + (uint16_t)(((x) & DTRACE_LQUANTIZE_STEPMASK) >> \ + DTRACE_LQUANTIZE_STEPSHIFT) + +#define DTRACE_LQUANTIZE_LEVELS(x) \ + (uint16_t)(((x) & DTRACE_LQUANTIZE_LEVELMASK) >> \ + DTRACE_LQUANTIZE_LEVELSHIFT) + +#define DTRACE_LQUANTIZE_BASE(x) \ + (int32_t)(((x) & DTRACE_LQUANTIZE_BASEMASK) >> \ + DTRACE_LQUANTIZE_BASESHIFT) + +#define DTRACE_LLQUANTIZE_FACTORSHIFT 48 +#define DTRACE_LLQUANTIZE_FACTORMASK ((uint64_t)UINT16_MAX << 48) +#define DTRACE_LLQUANTIZE_LOWSHIFT 32 +#define DTRACE_LLQUANTIZE_LOWMASK ((uint64_t)UINT16_MAX << 32) +#define DTRACE_LLQUANTIZE_HIGHSHIFT 16 +#define DTRACE_LLQUANTIZE_HIGHMASK ((uint64_t)UINT16_MAX << 16) +#define DTRACE_LLQUANTIZE_NSTEPSHIFT 0 +#define DTRACE_LLQUANTIZE_NSTEPMASK UINT16_MAX + +#define DTRACE_LLQUANTIZE_FACTOR(x) \ + (uint16_t)(((x) & DTRACE_LLQUANTIZE_FACTORMASK) >> \ + DTRACE_LLQUANTIZE_FACTORSHIFT) + +#define DTRACE_LLQUANTIZE_LOW(x) \ + (uint16_t)(((x) & DTRACE_LLQUANTIZE_LOWMASK) >> \ + DTRACE_LLQUANTIZE_LOWSHIFT) + +#define DTRACE_LLQUANTIZE_HIGH(x) \ + (uint16_t)(((x) & DTRACE_LLQUANTIZE_HIGHMASK) >> \ + DTRACE_LLQUANTIZE_HIGHSHIFT) + +#define DTRACE_LLQUANTIZE_NSTEP(x) \ + (uint16_t)(((x) & DTRACE_LLQUANTIZE_NSTEPMASK) >> \ + DTRACE_LLQUANTIZE_NSTEPSHIFT) + +#define DTRACE_USTACK_NFRAMES(x) (uint32_t)((x) & UINT32_MAX) +#define DTRACE_USTACK_STRSIZE(x) (uint32_t)((x) >> 32) +#define DTRACE_USTACK_ARG(x, y) \ + ((((uint64_t)(y)) << 32) | ((x) & UINT32_MAX)) + +#ifndef _LP64 +#if BYTE_ORDER == _BIG_ENDIAN +#define DTRACE_PTR(type, name) uint32_t name##pad; type *name +#else +#define DTRACE_PTR(type, name) type *name; uint32_t name##pad +#endif +#else +#define DTRACE_PTR(type, name) type *name +#endif + +/* + * DTrace Object Format (DOF) + * + * DTrace programs can be persistently encoded in the DOF format so that they + * may be embedded in other programs (for example, in an ELF file) or in the + * dtrace driver configuration file for use in anonymous tracing. The DOF + * format is versioned and extensible so that it can be revised and so that + * internal data structures can be modified or extended compatibly. 
All DOF + * structures use fixed-size types, so the 32-bit and 64-bit representations + * are identical and consumers can use either data model transparently. + * + * The file layout is structured as follows: + * + * +---------------+-------------------+----- ... ----+---- ... ------+ + * | dof_hdr_t | dof_sec_t[ ... ] | loadable | non-loadable | + * | (file header) | (section headers) | section data | section data | + * +---------------+-------------------+----- ... ----+---- ... ------+ + * |<------------ dof_hdr.dofh_loadsz --------------->| | + * |<------------ dof_hdr.dofh_filesz ------------------------------->| + * + * The file header stores meta-data including a magic number, data model for + * the instrumentation, data encoding, and properties of the DIF code within. + * The header describes its own size and the size of the section headers. By + * convention, an array of section headers follows the file header, and then + * the data for all loadable sections and unloadable sections. This permits + * consumer code to easily download the headers and all loadable data into the + * DTrace driver in one contiguous chunk, omitting other extraneous sections. + * + * The section headers describe the size, offset, alignment, and section type + * for each section. Sections are described using a set of #defines that tell + * the consumer what kind of data is expected. Sections can contain links to + * other sections by storing a dof_secidx_t, an index into the section header + * array, inside of the section data structures. The section header includes + * an entry size so that sections with data arrays can grow their structures. + * + * The DOF data itself can contain many snippets of DIF (i.e. >1 DIFOs), which + * are represented themselves as a collection of related DOF sections. This + * permits us to change the set of sections associated with a DIFO over time, + * and also permits us to encode DIFOs that contain different sets of sections. + * When a DOF section wants to refer to a DIFO, it stores the dof_secidx_t of a + * section of type DOF_SECT_DIFOHDR. This section's data is then an array of + * dof_secidx_t's which in turn denote the sections associated with this DIFO. + * + * This loose coupling of the file structure (header and sections) to the + * structure of the DTrace program itself (ECB descriptions, action + * descriptions, and DIFOs) permits activities such as relocation processing + * to occur in a single pass without having to understand D program structure. + * + * Finally, strings are always stored in ELF-style string tables along with a + * string table section index and string table offset. Therefore strings in + * DOF are always arbitrary-length and not bound to the current implementation. 
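+ *
+ * A hedged consumer-side sketch (not part of the original header), assuming
+ * a complete DOF image has been mapped at "dof": iteration over the section
+ * headers should honor dofh_secsize rather than sizeof (dof_sec_t), so that
+ * the section header can grow compatibly:
+ *
+ *        dof_hdr_t *hdr = (dof_hdr_t *)dof;
+ *        uint32_t i;
+ *
+ *        for (i = 0; i < hdr->dofh_secnum; i++) {
+ *                dof_sec_t *sec = (dof_sec_t *)((uintptr_t)dof +
+ *                    hdr->dofh_secoff + (uint64_t)i * hdr->dofh_secsize);
+ *                -- examine sec->dofs_type, dofs_offset, dofs_size here
+ *        }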
+ */ + +#define DOF_ID_SIZE 16 /* total size of dofh_ident[] in bytes */ + +typedef struct dof_hdr { + uint8_t dofh_ident[DOF_ID_SIZE]; /* identification bytes (see below) */ + uint32_t dofh_flags; /* file attribute flags (if any) */ + uint32_t dofh_hdrsize; /* size of file header in bytes */ + uint32_t dofh_secsize; /* size of section header in bytes */ + uint32_t dofh_secnum; /* number of section headers */ + uint64_t dofh_secoff; /* file offset of section headers */ + uint64_t dofh_loadsz; /* file size of loadable portion */ + uint64_t dofh_filesz; /* file size of entire DOF file */ + uint64_t dofh_pad; /* reserved for future use */ +} dof_hdr_t; + +#define DOF_ID_MAG0 0 /* first byte of magic number */ +#define DOF_ID_MAG1 1 /* second byte of magic number */ +#define DOF_ID_MAG2 2 /* third byte of magic number */ +#define DOF_ID_MAG3 3 /* fourth byte of magic number */ +#define DOF_ID_MODEL 4 /* DOF data model (see below) */ +#define DOF_ID_ENCODING 5 /* DOF data encoding (see below) */ +#define DOF_ID_VERSION 6 /* DOF file format major version (see below) */ +#define DOF_ID_DIFVERS 7 /* DIF instruction set version */ +#define DOF_ID_DIFIREG 8 /* DIF integer registers used by compiler */ +#define DOF_ID_DIFTREG 9 /* DIF tuple registers used by compiler */ +#define DOF_ID_PAD 10 /* start of padding bytes (all zeroes) */ + +#define DOF_MAG_MAG0 0x7F /* DOF_ID_MAG[0-3] */ +#define DOF_MAG_MAG1 'D' +#define DOF_MAG_MAG2 'O' +#define DOF_MAG_MAG3 'F' + +#define DOF_MAG_STRING "\177DOF" +#define DOF_MAG_STRLEN 4 + +#define DOF_MODEL_NONE 0 /* DOF_ID_MODEL */ +#define DOF_MODEL_ILP32 1 +#define DOF_MODEL_LP64 2 + +#ifdef _LP64 +#define DOF_MODEL_NATIVE DOF_MODEL_LP64 +#else +#define DOF_MODEL_NATIVE DOF_MODEL_ILP32 +#endif + +#define DOF_ENCODE_NONE 0 /* DOF_ID_ENCODING */ +#define DOF_ENCODE_LSB 1 +#define DOF_ENCODE_MSB 2 + +#if BYTE_ORDER == _BIG_ENDIAN +#define DOF_ENCODE_NATIVE DOF_ENCODE_MSB +#else +#define DOF_ENCODE_NATIVE DOF_ENCODE_LSB +#endif + +#define DOF_VERSION_1 1 /* DOF version 1: Solaris 10 FCS */ +#define DOF_VERSION_2 2 /* DOF version 2: Solaris Express 6/06 */ +#define DOF_VERSION DOF_VERSION_2 /* Latest DOF version */ + +#define DOF_FL_VALID 0 /* mask of all valid dofh_flags bits */ + +typedef uint32_t dof_secidx_t; /* section header table index type */ +typedef uint32_t dof_stridx_t; /* string table index type */ + +#define DOF_SECIDX_NONE (-1U) /* null value for section indices */ +#define DOF_STRIDX_NONE (-1U) /* null value for string indices */ + +typedef struct dof_sec { + uint32_t dofs_type; /* section type (see below) */ + uint32_t dofs_align; /* section data memory alignment */ + uint32_t dofs_flags; /* section flags (if any) */ + uint32_t dofs_entsize; /* size of section entry (if table) */ + uint64_t dofs_offset; /* offset of section data within file */ + uint64_t dofs_size; /* size of section data in bytes */ +} dof_sec_t; + +#define DOF_SECT_NONE 0 /* null section */ +#define DOF_SECT_COMMENTS 1 /* compiler comments */ +#define DOF_SECT_SOURCE 2 /* D program source code */ +#define DOF_SECT_ECBDESC 3 /* dof_ecbdesc_t */ +#define DOF_SECT_PROBEDESC 4 /* dof_probedesc_t */ +#define DOF_SECT_ACTDESC 5 /* dof_actdesc_t array */ +#define DOF_SECT_DIFOHDR 6 /* dof_difohdr_t (variable length) */ +#define DOF_SECT_DIF 7 /* uint32_t array of byte code */ +#define DOF_SECT_STRTAB 8 /* string table */ +#define DOF_SECT_VARTAB 9 /* dtrace_difv_t array */ +#define DOF_SECT_RELTAB 10 /* dof_relodesc_t array */ +#define DOF_SECT_TYPTAB 11 /* dtrace_diftype_t array */ 
+#define DOF_SECT_URELHDR 12 /* dof_relohdr_t (user relocations) */ +#define DOF_SECT_KRELHDR 13 /* dof_relohdr_t (kernel relocations) */ +#define DOF_SECT_OPTDESC 14 /* dof_optdesc_t array */ +#define DOF_SECT_PROVIDER 15 /* dof_provider_t */ +#define DOF_SECT_PROBES 16 /* dof_probe_t array */ +#define DOF_SECT_PRARGS 17 /* uint8_t array (probe arg mappings) */ +#define DOF_SECT_PROFFS 18 /* uint32_t array (probe arg offsets) */ +#define DOF_SECT_INTTAB 19 /* uint64_t array */ +#define DOF_SECT_UTSNAME 20 /* struct utsname */ +#define DOF_SECT_XLTAB 21 /* dof_xlref_t array */ +#define DOF_SECT_XLMEMBERS 22 /* dof_xlmember_t array */ +#define DOF_SECT_XLIMPORT 23 /* dof_xlator_t */ +#define DOF_SECT_XLEXPORT 24 /* dof_xlator_t */ +#define DOF_SECT_PREXPORT 25 /* dof_secidx_t array (exported objs) */ +#define DOF_SECT_PRENOFFS 26 /* uint32_t array (enabled offsets) */ + +#define DOF_SECF_LOAD 1 /* section should be loaded */ + +#define DOF_SEC_ISLOADABLE(x) \ + (((x) == DOF_SECT_ECBDESC) || ((x) == DOF_SECT_PROBEDESC) || \ + ((x) == DOF_SECT_ACTDESC) || ((x) == DOF_SECT_DIFOHDR) || \ + ((x) == DOF_SECT_DIF) || ((x) == DOF_SECT_STRTAB) || \ + ((x) == DOF_SECT_VARTAB) || ((x) == DOF_SECT_RELTAB) || \ + ((x) == DOF_SECT_TYPTAB) || ((x) == DOF_SECT_URELHDR) || \ + ((x) == DOF_SECT_KRELHDR) || ((x) == DOF_SECT_OPTDESC) || \ + ((x) == DOF_SECT_PROVIDER) || ((x) == DOF_SECT_PROBES) || \ + ((x) == DOF_SECT_PRARGS) || ((x) == DOF_SECT_PROFFS) || \ + ((x) == DOF_SECT_INTTAB) || ((x) == DOF_SECT_XLTAB) || \ + ((x) == DOF_SECT_XLMEMBERS) || ((x) == DOF_SECT_XLIMPORT) || \ + ((x) == DOF_SECT_XLEXPORT) || ((x) == DOF_SECT_PREXPORT) || \ + ((x) == DOF_SECT_PRENOFFS)) + +typedef struct dof_ecbdesc { + dof_secidx_t dofe_probes; /* link to DOF_SECT_PROBEDESC */ + dof_secidx_t dofe_pred; /* link to DOF_SECT_DIFOHDR */ + dof_secidx_t dofe_actions; /* link to DOF_SECT_ACTDESC */ + uint32_t dofe_pad; /* reserved for future use */ + uint64_t dofe_uarg; /* user-supplied library argument */ +} dof_ecbdesc_t; + +typedef struct dof_probedesc { + dof_secidx_t dofp_strtab; /* link to DOF_SECT_STRTAB section */ + dof_stridx_t dofp_provider; /* provider string */ + dof_stridx_t dofp_mod; /* module string */ + dof_stridx_t dofp_func; /* function string */ + dof_stridx_t dofp_name; /* name string */ + uint32_t dofp_id; /* probe identifier (or zero) */ +} dof_probedesc_t; + +typedef struct dof_actdesc { + dof_secidx_t dofa_difo; /* link to DOF_SECT_DIFOHDR */ + dof_secidx_t dofa_strtab; /* link to DOF_SECT_STRTAB section */ + uint32_t dofa_kind; /* action kind (DTRACEACT_* constant) */ + uint32_t dofa_ntuple; /* number of subsequent tuple actions */ + uint64_t dofa_arg; /* kind-specific argument */ + uint64_t dofa_uarg; /* user-supplied argument */ +} dof_actdesc_t; + +typedef struct dof_difohdr { + dtrace_diftype_t dofd_rtype; /* return type for this fragment */ + dof_secidx_t dofd_links[1]; /* variable length array of indices */ +} dof_difohdr_t; + +typedef struct dof_relohdr { + dof_secidx_t dofr_strtab; /* link to DOF_SECT_STRTAB for names */ + dof_secidx_t dofr_relsec; /* link to DOF_SECT_RELTAB for relos */ + dof_secidx_t dofr_tgtsec; /* link to section we are relocating */ +} dof_relohdr_t; + +typedef struct dof_relodesc { + dof_stridx_t dofr_name; /* string name of relocation symbol */ + uint32_t dofr_type; /* relo type (DOF_RELO_* constant) */ + uint64_t dofr_offset; /* byte offset for relocation */ + uint64_t dofr_data; /* additional type-specific data */ +} dof_relodesc_t; + +#define DOF_RELO_NONE 0 /* empty 
relocation entry */ +#define DOF_RELO_SETX 1 /* relocate setx value */ +#define DOF_RELO_DOFREL 2 /* relocate DOF-relative value */ + +typedef struct dof_optdesc { + uint32_t dofo_option; /* option identifier */ + dof_secidx_t dofo_strtab; /* string table, if string option */ + uint64_t dofo_value; /* option value or string index */ +} dof_optdesc_t; + +typedef uint32_t dof_attr_t; /* encoded stability attributes */ + +#define DOF_ATTR(n, d, c) (((n) << 24) | ((d) << 16) | ((c) << 8)) +#define DOF_ATTR_NAME(a) (((a) >> 24) & 0xff) +#define DOF_ATTR_DATA(a) (((a) >> 16) & 0xff) +#define DOF_ATTR_CLASS(a) (((a) >> 8) & 0xff) + +typedef struct dof_provider { + dof_secidx_t dofpv_strtab; /* link to DOF_SECT_STRTAB section */ + dof_secidx_t dofpv_probes; /* link to DOF_SECT_PROBES section */ + dof_secidx_t dofpv_prargs; /* link to DOF_SECT_PRARGS section */ + dof_secidx_t dofpv_proffs; /* link to DOF_SECT_PROFFS section */ + dof_stridx_t dofpv_name; /* provider name string */ + dof_attr_t dofpv_provattr; /* provider attributes */ + dof_attr_t dofpv_modattr; /* module attributes */ + dof_attr_t dofpv_funcattr; /* function attributes */ + dof_attr_t dofpv_nameattr; /* name attributes */ + dof_attr_t dofpv_argsattr; /* args attributes */ + dof_secidx_t dofpv_prenoffs; /* link to DOF_SECT_PRENOFFS section */ +} dof_provider_t; + +typedef struct dof_probe { + uint64_t dofpr_addr; /* probe base address or offset */ + dof_stridx_t dofpr_func; /* probe function string */ + dof_stridx_t dofpr_name; /* probe name string */ + dof_stridx_t dofpr_nargv; /* native argument type strings */ + dof_stridx_t dofpr_xargv; /* translated argument type strings */ + uint32_t dofpr_argidx; /* index of first argument mapping */ + uint32_t dofpr_offidx; /* index of first offset entry */ + uint8_t dofpr_nargc; /* native argument count */ + uint8_t dofpr_xargc; /* translated argument count */ + uint16_t dofpr_noffs; /* number of offset entries for probe */ + uint32_t dofpr_enoffidx; /* index of first is-enabled offset */ + uint16_t dofpr_nenoffs; /* number of is-enabled offsets */ + uint16_t dofpr_pad1; /* reserved for future use */ + uint32_t dofpr_pad2; /* reserved for future use */ +} dof_probe_t; + +typedef struct dof_xlator { + dof_secidx_t dofxl_members; /* link to DOF_SECT_XLMEMBERS section */ + dof_secidx_t dofxl_strtab; /* link to DOF_SECT_STRTAB section */ + dof_stridx_t dofxl_argv; /* input parameter type strings */ + uint32_t dofxl_argc; /* input parameter list length */ + dof_stridx_t dofxl_type; /* output type string name */ + dof_attr_t dofxl_attr; /* output stability attributes */ +} dof_xlator_t; + +typedef struct dof_xlmember { + dof_secidx_t dofxm_difo; /* member link to DOF_SECT_DIFOHDR */ + dof_stridx_t dofxm_name; /* member name */ + dtrace_diftype_t dofxm_type; /* member type */ +} dof_xlmember_t; + +typedef struct dof_xlref { + dof_secidx_t dofxr_xlator; /* link to DOF_SECT_XLATORS section */ + uint32_t dofxr_member; /* index of referenced dof_xlmember */ + uint32_t dofxr_argn; /* index of argument for DIF_OP_XLARG */ +} dof_xlref_t; + +/* + * DTrace Intermediate Format Object (DIFO) + * + * A DIFO is used to store the compiled DIF for a D expression, its return + * type, and its string and variable tables. The string table is a single + * buffer of character data into which sets instructions and variable + * references can reference strings using a byte offset. 
The variable table + * is an array of dtrace_difv_t structures that describe the name and type of + * each variable and the id used in the DIF code. This structure is described + * above in the DIF section of this header file. The DIFO is used at both + * user-level (in the library) and in the kernel, but the structure is never + * passed between the two: the DOF structures form the only interface. As a + * result, the definition can change depending on the presence of _KERNEL. + */ +typedef struct dtrace_difo { + dif_instr_t *dtdo_buf; /* instruction buffer */ + uint64_t *dtdo_inttab; /* integer table (optional) */ + char *dtdo_strtab; /* string table (optional) */ + dtrace_difv_t *dtdo_vartab; /* variable table (optional) */ + uint_t dtdo_len; /* length of instruction buffer */ + uint_t dtdo_intlen; /* length of integer table */ + uint_t dtdo_strlen; /* length of string table */ + uint_t dtdo_varlen; /* length of variable table */ + dtrace_diftype_t dtdo_rtype; /* return type */ + uint_t dtdo_refcnt; /* owner reference count */ + uint_t dtdo_destructive; /* invokes destructive subroutines */ +#ifndef _KERNEL + dof_relodesc_t *dtdo_kreltab; /* kernel relocations */ + dof_relodesc_t *dtdo_ureltab; /* user relocations */ + struct dt_node **dtdo_xlmtab; /* translator references */ + uint_t dtdo_krelen; /* length of krelo table */ + uint_t dtdo_urelen; /* length of urelo table */ + uint_t dtdo_xlmlen; /* length of translator table */ +#endif +} dtrace_difo_t; + +/* + * DTrace Enabling Description Structures + * + * When DTrace is tracking the description of a DTrace enabling entity (probe, + * predicate, action, ECB, record, etc.), it does so in a description + * structure. These structures all end in "desc", and are used at both + * user-level and in the kernel -- but (with the exception of + * dtrace_probedesc_t) they are never passed between them. Typically, + * user-level will use the description structures when assembling an enabling. + * It will then distill those description structures into a DOF object (see + * above), and send it into the kernel. The kernel will again use the + * description structures to create a description of the enabling as it reads + * the DOF. When the description is complete, the enabling will be actually + * created -- turning it into the structures that represent the enabling + * instead of merely describing it. Not surprisingly, the description + * structures bear a strong resemblance to the DOF structures that act as their + * conduit. + */ +struct dtrace_predicate; + +typedef struct dtrace_probedesc { + dtrace_id_t dtpd_id; /* probe identifier */ + char dtpd_provider[DTRACE_PROVNAMELEN]; /* probe provider name */ + char dtpd_mod[DTRACE_MODNAMELEN]; /* probe module name */ + char dtpd_func[DTRACE_FUNCNAMELEN]; /* probe function name */ + char dtpd_name[DTRACE_NAMELEN]; /* probe name */ +} dtrace_probedesc_t; + +typedef struct dtrace_repldesc { + dtrace_probedesc_t dtrpd_match; /* probe descr. to match */ + dtrace_probedesc_t dtrpd_create; /* probe descr. 
to create */ +} dtrace_repldesc_t; + +typedef struct dtrace_preddesc { + dtrace_difo_t *dtpdd_difo; /* pointer to DIF object */ + struct dtrace_predicate *dtpdd_predicate; /* pointer to predicate */ +} dtrace_preddesc_t; + +typedef struct dtrace_actdesc { + dtrace_difo_t *dtad_difo; /* pointer to DIF object */ + struct dtrace_actdesc *dtad_next; /* next action */ + dtrace_actkind_t dtad_kind; /* kind of action */ + uint32_t dtad_ntuple; /* number in tuple */ + uint64_t dtad_arg; /* action argument */ + uint64_t dtad_uarg; /* user argument */ + int dtad_refcnt; /* reference count */ +} dtrace_actdesc_t; + +typedef struct dtrace_ecbdesc { + dtrace_actdesc_t *dted_action; /* action description(s) */ + dtrace_preddesc_t dted_pred; /* predicate description */ + dtrace_probedesc_t dted_probe; /* probe description */ + uint64_t dted_uarg; /* library argument */ + int dted_refcnt; /* reference count */ +} dtrace_ecbdesc_t; + +/* + * DTrace Metadata Description Structures + * + * DTrace separates the trace data stream from the metadata stream. The only + * metadata tokens placed in the data stream are the dtrace_rechdr_t (EPID + + * timestamp) or (in the case of aggregations) aggregation identifiers. To + * determine the structure of the data, DTrace consumers pass the token to the + * kernel, and receive in return a corresponding description of the enabled + * probe (via the dtrace_eprobedesc structure) or the aggregation (via the + * dtrace_aggdesc structure). Both of these structures are expressed in terms + * of record descriptions (via the dtrace_recdesc structure) that describe the + * exact structure of the data. Some record descriptions may also contain a + * format identifier; this additional bit of metadata can be retrieved from the + * kernel, for which a format description is returned via the dtrace_fmtdesc + * structure. Note that all four of these structures must be bitness-neutral + * to allow for a 32-bit DTrace consumer on a 64-bit kernel. + */ +typedef struct dtrace_recdesc { + dtrace_actkind_t dtrd_action; /* kind of action */ + uint32_t dtrd_size; /* size of record */ + uint32_t dtrd_offset; /* offset in ECB's data */ + uint16_t dtrd_alignment; /* required alignment */ + uint16_t dtrd_format; /* format, if any */ + uint64_t dtrd_arg; /* action argument */ + uint64_t dtrd_uarg; /* user argument */ +} dtrace_recdesc_t; + +typedef struct dtrace_eprobedesc { + dtrace_epid_t dtepd_epid; /* enabled probe ID */ + dtrace_id_t dtepd_probeid; /* probe ID */ + uint64_t dtepd_uarg; /* library argument */ + uint32_t dtepd_size; /* total size */ + int dtepd_nrecs; /* number of records */ + dtrace_recdesc_t dtepd_rec[1]; /* records themselves */ +} dtrace_eprobedesc_t; + +typedef struct dtrace_aggdesc { + DTRACE_PTR(char, dtagd_name); /* not filled in by kernel */ + dtrace_aggvarid_t dtagd_varid; /* not filled in by kernel */ + int dtagd_flags; /* not filled in by kernel */ + dtrace_aggid_t dtagd_id; /* aggregation ID */ + dtrace_epid_t dtagd_epid; /* enabled probe ID */ + uint32_t dtagd_size; /* size in bytes */ + int dtagd_nrecs; /* number of records */ + uint32_t dtagd_pad; /* explicit padding */ + dtrace_recdesc_t dtagd_rec[1]; /* record descriptions */ +} dtrace_aggdesc_t; + +typedef struct dtrace_fmtdesc { + DTRACE_PTR(char, dtfd_string); /* format string */ + int dtfd_length; /* length of format string */ + uint16_t dtfd_format; /* format identifier */ +} dtrace_fmtdesc_t; + +#define DTRACE_SIZEOF_EPROBEDESC(desc) \ + (sizeof (dtrace_eprobedesc_t) + ((desc)->dtepd_nrecs ? 
\ + (((desc)->dtepd_nrecs - 1) * sizeof (dtrace_recdesc_t)) : 0)) + +#define DTRACE_SIZEOF_AGGDESC(desc) \ + (sizeof (dtrace_aggdesc_t) + ((desc)->dtagd_nrecs ? \ + (((desc)->dtagd_nrecs - 1) * sizeof (dtrace_recdesc_t)) : 0)) + +/* + * DTrace Option Interface + * + * Run-time DTrace options are set and retrieved via DOF_SECT_OPTDESC sections + * in a DOF image. The dof_optdesc structure contains an option identifier and + * an option value. The valid option identifiers are found below; the mapping + * between option identifiers and option identifying strings is maintained at + * user-level. Note that the value of DTRACEOPT_UNSET is such that all of the + * following are potentially valid option values: all positive integers, zero + * and negative one. Some options (notably "bufpolicy" and "bufresize") take + * predefined tokens as their values; these are defined with + * DTRACEOPT_{option}_{token}. + */ +#define DTRACEOPT_BUFSIZE 0 /* buffer size */ +#define DTRACEOPT_BUFPOLICY 1 /* buffer policy */ +#define DTRACEOPT_DYNVARSIZE 2 /* dynamic variable size */ +#define DTRACEOPT_AGGSIZE 3 /* aggregation size */ +#define DTRACEOPT_SPECSIZE 4 /* speculation size */ +#define DTRACEOPT_NSPEC 5 /* number of speculations */ +#define DTRACEOPT_STRSIZE 6 /* string size */ +#define DTRACEOPT_CLEANRATE 7 /* dynvar cleaning rate */ +#define DTRACEOPT_CPU 8 /* CPU to trace */ +#define DTRACEOPT_BUFRESIZE 9 /* buffer resizing policy */ +#define DTRACEOPT_GRABANON 10 /* grab anonymous state, if any */ +#define DTRACEOPT_FLOWINDENT 11 /* indent function entry/return */ +#define DTRACEOPT_QUIET 12 /* only output explicitly traced data */ +#define DTRACEOPT_STACKFRAMES 13 /* number of stack frames */ +#define DTRACEOPT_USTACKFRAMES 14 /* number of user stack frames */ +#define DTRACEOPT_AGGRATE 15 /* aggregation snapshot rate */ +#define DTRACEOPT_SWITCHRATE 16 /* buffer switching rate */ +#define DTRACEOPT_STATUSRATE 17 /* status rate */ +#define DTRACEOPT_DESTRUCTIVE 18 /* destructive actions allowed */ +#define DTRACEOPT_STACKINDENT 19 /* output indent for stack traces */ +#define DTRACEOPT_RAWBYTES 20 /* always print bytes in raw form */ +#define DTRACEOPT_JSTACKFRAMES 21 /* number of jstack() frames */ +#define DTRACEOPT_JSTACKSTRSIZE 22 /* size of jstack() string table */ +#define DTRACEOPT_AGGSORTKEY 23 /* sort aggregations by key */ +#define DTRACEOPT_AGGSORTREV 24 /* reverse-sort aggregations */ +#define DTRACEOPT_AGGSORTPOS 25 /* agg. position to sort on */ +#define DTRACEOPT_AGGSORTKEYPOS 26 /* agg. key position to sort on */ +#define DTRACEOPT_TEMPORAL 27 /* temporally ordered output */ +#define DTRACEOPT_AGGHIST 28 /* histogram aggregation output */ +#define DTRACEOPT_AGGPACK 29 /* packed aggregation output */ +#define DTRACEOPT_AGGZOOM 30 /* zoomed aggregation scaling */ +#define DTRACEOPT_ZONE 31 /* zone in which to enable probes */ +#define DTRACEOPT_MAX 32 /* number of options */ + +#define DTRACEOPT_UNSET (dtrace_optval_t)-2 /* unset option */ + +#define DTRACEOPT_BUFPOLICY_RING 0 /* ring buffer */ +#define DTRACEOPT_BUFPOLICY_FILL 1 /* fill buffer, then stop */ +#define DTRACEOPT_BUFPOLICY_SWITCH 2 /* switch buffers */ + +#define DTRACEOPT_BUFRESIZE_AUTO 0 /* automatic resizing */ +#define DTRACEOPT_BUFRESIZE_MANUAL 1 /* manual resizing */ + +/* + * DTrace Buffer Interface + * + * In order to get a snapshot of the principal or aggregation buffer, + * user-level passes a buffer description to the kernel with the dtrace_bufdesc + * structure. 
This describes which CPU user-level is interested in, and + * where user-level wishes the kernel to snapshot the buffer to (the + * dtbd_data field). The kernel uses the same structure to pass back some + * information regarding the buffer: the size of data actually copied out, the + * number of drops, the number of errors, the offset of the oldest record, + * and the time of the snapshot. + * + * If the buffer policy is a "switch" policy, taking a snapshot of the + * principal buffer has the additional effect of switching the active and + * inactive buffers. Taking a snapshot of the aggregation buffer _always_ has + * the additional effect of switching the active and inactive buffers. + */ +typedef struct dtrace_bufdesc { + uint64_t dtbd_size; /* size of buffer */ + uint32_t dtbd_cpu; /* CPU or DTRACE_CPUALL */ + uint32_t dtbd_errors; /* number of errors */ + uint64_t dtbd_drops; /* number of drops */ + DTRACE_PTR(char, dtbd_data); /* data */ + uint64_t dtbd_oldest; /* offset of oldest record */ + uint64_t dtbd_timestamp; /* hrtime of snapshot */ +} dtrace_bufdesc_t; + +/* + * Each record in the buffer (dtbd_data) begins with a header that includes + * the epid and a timestamp. The timestamp is split into two 4-byte parts + * so that we do not require 8-byte alignment. + */ +typedef struct dtrace_rechdr { + dtrace_epid_t dtrh_epid; /* enabled probe id */ + uint32_t dtrh_timestamp_hi; /* high bits of hrtime_t */ + uint32_t dtrh_timestamp_lo; /* low bits of hrtime_t */ +} dtrace_rechdr_t; + +#define DTRACE_RECORD_LOAD_TIMESTAMP(dtrh) \ + ((dtrh)->dtrh_timestamp_lo + \ + ((uint64_t)(dtrh)->dtrh_timestamp_hi << 32)) + +#define DTRACE_RECORD_STORE_TIMESTAMP(dtrh, hrtime) { \ + (dtrh)->dtrh_timestamp_lo = (uint32_t)hrtime; \ + (dtrh)->dtrh_timestamp_hi = hrtime >> 32; \ +} + +/* + * DTrace Status + * + * The status of DTrace is relayed via the dtrace_status structure. This + * structure contains members to count drops other than the capacity drops + * available via the buffer interface (see above). This consists of dynamic + * drops (including capacity dynamic drops, rinsing drops and dirty drops), and + * speculative drops (including capacity speculative drops, drops due to busy + * speculative buffers and drops due to unavailable speculative buffers). + * Additionally, the status structure contains a field to indicate the number + * of "fill"-policy buffers that have been filled and a boolean field to indicate + * that exit() has been called. If the dtst_exiting field is non-zero, no + * further data will be generated until tracing is stopped (at which time any + * enablings of the END action will be processed); if user-level sees that + * this field is non-zero, tracing should be stopped as soon as possible.
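+ *
+ * A minimal consumer-side sketch (not part of the original header),
+ * assuming "fd" is an open descriptor on the dtrace control node and using
+ * the DTRACEIOC_STATUS ioctl defined below:
+ *
+ *        dtrace_status_t st;
+ *
+ *        if (ioctl(fd, DTRACEIOC_STATUS, &st) == 0 && st.dtst_exiting != 0) {
+ *                -- an exit() action has fired; stop tracing promptly
+ *        }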
+ */ +typedef struct dtrace_status { + uint64_t dtst_dyndrops; /* dynamic drops */ + uint64_t dtst_dyndrops_rinsing; /* dyn drops due to rinsing */ + uint64_t dtst_dyndrops_dirty; /* dyn drops due to dirty */ + uint64_t dtst_specdrops; /* speculative drops */ + uint64_t dtst_specdrops_busy; /* spec drops due to busy */ + uint64_t dtst_specdrops_unavail; /* spec drops due to unavail */ + uint64_t dtst_errors; /* total errors */ + uint64_t dtst_filled; /* number of filled bufs */ + uint64_t dtst_stkstroverflows; /* stack string tab overflows */ + uint64_t dtst_dblerrors; /* errors in ERROR probes */ + char dtst_killed; /* non-zero if killed */ + char dtst_exiting; /* non-zero if exit() called */ + char dtst_pad[6]; /* pad out to 64-bit align */ +} dtrace_status_t; + +/* + * DTrace Configuration + * + * User-level may need to understand some elements of the kernel DTrace + * configuration in order to generate correct DIF. This information is + * conveyed via the dtrace_conf structure. + */ +typedef struct dtrace_conf { + uint_t dtc_difversion; /* supported DIF version */ + uint_t dtc_difintregs; /* # of DIF integer registers */ + uint_t dtc_diftupregs; /* # of DIF tuple registers */ + uint_t dtc_ctfmodel; /* CTF data model */ + uint_t dtc_pad[8]; /* reserved for future use */ +} dtrace_conf_t; + +/* + * DTrace Faults + * + * The constants below DTRACEFLT_LIBRARY indicate probe processing faults; + * constants at or above DTRACEFLT_LIBRARY indicate faults in probe + * postprocessing at user-level. Probe processing faults induce an ERROR + * probe and are replicated in unistd.d to allow users' ERROR probes to decode + * the error condition using these symbolic labels. + */ +#define DTRACEFLT_UNKNOWN 0 /* Unknown fault */ +#define DTRACEFLT_BADADDR 1 /* Bad address */ +#define DTRACEFLT_BADALIGN 2 /* Bad alignment */ +#define DTRACEFLT_ILLOP 3 /* Illegal operation */ +#define DTRACEFLT_DIVZERO 4 /* Divide-by-zero */ +#define DTRACEFLT_NOSCRATCH 5 /* Out of scratch space */ +#define DTRACEFLT_KPRIV 6 /* Illegal kernel access */ +#define DTRACEFLT_UPRIV 7 /* Illegal user access */ +#define DTRACEFLT_TUPOFLOW 8 /* Tuple stack overflow */ +#define DTRACEFLT_BADSTACK 9 /* Bad stack */ + +#define DTRACEFLT_LIBRARY 1000 /* Library-level fault */ + +/* + * DTrace Argument Types + * + * Because it would waste both space and time, argument types do not reside + * with the probe. In order to determine argument types for args[X] + * variables, the D compiler queries for argument types on a probe-by-probe + * basis. (This optimizes for the common case that arguments are either not + * used or used in an untyped fashion.) Typed arguments are specified with a + * string of the type name in the dtargd_native member of the argument + * description structure. Typed arguments may be further translated to types + * of greater stability; the provider indicates such a translated argument by + * filling in the dtargd_xlate member with the string of the translated type. + * Finally, the provider may indicate which argument value a given argument + * maps to by setting the dtargd_mapping member -- allowing a single argument + * to map to multiple args[X] variables.
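+ *
+ * A hedged sketch (not part of the original header) of the compiler-side
+ * query, assuming "fd" is an open control descriptor and "id" a probe
+ * identifier; DTRACEIOC_PROBEARG is defined below:
+ *
+ *        dtrace_argdesc_t ad;
+ *
+ *        ad.dtargd_id = id;
+ *        ad.dtargd_ndx = 0;        -- ask about args[0]
+ *        if (ioctl(fd, DTRACEIOC_PROBEARG, &ad) == 0) {
+ *                -- ad.dtargd_native (and possibly ad.dtargd_xlate and
+ *                -- ad.dtargd_mapping) now describe the argument
+ *        }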
+ */ +typedef struct dtrace_argdesc { + dtrace_id_t dtargd_id; /* probe identifier */ + int dtargd_ndx; /* arg number (-1 iff none) */ + int dtargd_mapping; /* value mapping */ + char dtargd_native[DTRACE_ARGTYPELEN]; /* native type name */ + char dtargd_xlate[DTRACE_ARGTYPELEN]; /* translated type name */ +} dtrace_argdesc_t; + +/* + * DTrace Stability Attributes + * + * Each DTrace provider advertises the name and data stability of each of its + * probe description components, as well as its architectural dependencies. + * The D compiler can query the provider attributes (dtrace_pattr_t below) in + * order to compute the properties of an input program and report them. + */ +typedef uint8_t dtrace_stability_t; /* stability code (see attributes(5)) */ +typedef uint8_t dtrace_class_t; /* architectural dependency class */ + +#define DTRACE_STABILITY_INTERNAL 0 /* private to DTrace itself */ +#define DTRACE_STABILITY_PRIVATE 1 /* private to Sun (see docs) */ +#define DTRACE_STABILITY_OBSOLETE 2 /* scheduled for removal */ +#define DTRACE_STABILITY_EXTERNAL 3 /* not controlled by Sun */ +#define DTRACE_STABILITY_UNSTABLE 4 /* new or rapidly changing */ +#define DTRACE_STABILITY_EVOLVING 5 /* less rapidly changing */ +#define DTRACE_STABILITY_STABLE 6 /* mature interface from Sun */ +#define DTRACE_STABILITY_STANDARD 7 /* industry standard */ +#define DTRACE_STABILITY_MAX 7 /* maximum valid stability */ + +#define DTRACE_CLASS_UNKNOWN 0 /* unknown architectural dependency */ +#define DTRACE_CLASS_CPU 1 /* CPU-module-specific */ +#define DTRACE_CLASS_PLATFORM 2 /* platform-specific (uname -i) */ +#define DTRACE_CLASS_GROUP 3 /* hardware-group-specific (uname -m) */ +#define DTRACE_CLASS_ISA 4 /* ISA-specific (uname -p) */ +#define DTRACE_CLASS_COMMON 5 /* common to all systems */ +#define DTRACE_CLASS_MAX 5 /* maximum valid class */ + +#define DTRACE_PRIV_NONE 0x0000 +#define DTRACE_PRIV_KERNEL 0x0001 +#define DTRACE_PRIV_USER 0x0002 +#define DTRACE_PRIV_PROC 0x0004 +#define DTRACE_PRIV_OWNER 0x0008 +#define DTRACE_PRIV_ZONEOWNER 0x0010 + +#define DTRACE_PRIV_ALL \ + (DTRACE_PRIV_KERNEL | DTRACE_PRIV_USER | \ + DTRACE_PRIV_PROC | DTRACE_PRIV_OWNER | DTRACE_PRIV_ZONEOWNER) + +typedef struct dtrace_ppriv { + uint32_t dtpp_flags; /* privilege flags */ + uid_t dtpp_uid; /* user ID */ + zoneid_t dtpp_zoneid; /* zone ID */ +} dtrace_ppriv_t; + +typedef struct dtrace_attribute { + dtrace_stability_t dtat_name; /* entity name stability */ + dtrace_stability_t dtat_data; /* entity data stability */ + dtrace_class_t dtat_class; /* entity data dependency */ +} dtrace_attribute_t; + +typedef struct dtrace_pattr { + dtrace_attribute_t dtpa_provider; /* provider attributes */ + dtrace_attribute_t dtpa_mod; /* module attributes */ + dtrace_attribute_t dtpa_func; /* function attributes */ + dtrace_attribute_t dtpa_name; /* name attributes */ + dtrace_attribute_t dtpa_args; /* args[] attributes */ +} dtrace_pattr_t; + +typedef struct dtrace_providerdesc { + char dtvd_name[DTRACE_PROVNAMELEN]; /* provider name */ + dtrace_pattr_t dtvd_attr; /* stability attributes */ + dtrace_ppriv_t dtvd_priv; /* privileges required */ +} dtrace_providerdesc_t; + +/* + * DTrace Pseudodevice Interface + * + * DTrace is controlled through ioctl(2)'s to the in-kernel dtrace:dtrace + * pseudodevice driver. These ioctls comprise the user-kernel interface to + * DTrace. 
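+ *
+ * Illustratively (not part of the original header), a consumer typically
+ * begins by opening the DTRACEMNR_DTRACE node -- on FreeBSD conventionally
+ * /dev/dtrace/dtrace -- and then issues the ioctls below against it:
+ *
+ *        int fd = open("/dev/dtrace/dtrace", O_RDONLY);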
+ */ +#ifdef illumos +#define DTRACEIOC (('d' << 24) | ('t' << 16) | ('r' << 8)) +#define DTRACEIOC_PROVIDER (DTRACEIOC | 1) /* provider query */ +#define DTRACEIOC_PROBES (DTRACEIOC | 2) /* probe query */ +#define DTRACEIOC_BUFSNAP (DTRACEIOC | 4) /* snapshot buffer */ +#define DTRACEIOC_PROBEMATCH (DTRACEIOC | 5) /* match probes */ +#define DTRACEIOC_ENABLE (DTRACEIOC | 6) /* enable probes */ +#define DTRACEIOC_AGGSNAP (DTRACEIOC | 7) /* snapshot agg. */ +#define DTRACEIOC_EPROBE (DTRACEIOC | 8) /* get eprobe desc. */ +#define DTRACEIOC_PROBEARG (DTRACEIOC | 9) /* get probe arg */ +#define DTRACEIOC_CONF (DTRACEIOC | 10) /* get config. */ +#define DTRACEIOC_STATUS (DTRACEIOC | 11) /* get status */ +#define DTRACEIOC_GO (DTRACEIOC | 12) /* start tracing */ +#define DTRACEIOC_STOP (DTRACEIOC | 13) /* stop tracing */ +#define DTRACEIOC_AGGDESC (DTRACEIOC | 15) /* get agg. desc. */ +#define DTRACEIOC_FORMAT (DTRACEIOC | 16) /* get format str */ +#define DTRACEIOC_DOFGET (DTRACEIOC | 17) /* get DOF */ +#define DTRACEIOC_REPLICATE (DTRACEIOC | 18) /* replicate enab */ +#else +#define DTRACEIOC_PROVIDER _IOWR('x',1,dtrace_providerdesc_t) + /* provider query */ +#define DTRACEIOC_PROBES _IOWR('x',2,dtrace_probedesc_t) + /* probe query */ +#define DTRACEIOC_BUFSNAP _IOW('x',4,dtrace_bufdesc_t *) + /* snapshot buffer */ +#define DTRACEIOC_PROBEMATCH _IOWR('x',5,dtrace_probedesc_t) + /* match probes */ +typedef struct { + void *dof; /* DOF userland address written to driver. */ + int n_matched; /* # matches returned by driver. */ +} dtrace_enable_io_t; +#define DTRACEIOC_ENABLE _IOWR('x',6,dtrace_enable_io_t) + /* enable probes */ +#define DTRACEIOC_AGGSNAP _IOW('x',7,dtrace_bufdesc_t *) + /* snapshot agg. */ +#define DTRACEIOC_EPROBE _IOW('x',8,dtrace_eprobedesc_t) + /* get eprobe desc. */ +#define DTRACEIOC_PROBEARG _IOWR('x',9,dtrace_argdesc_t) + /* get probe arg */ +#define DTRACEIOC_CONF _IOR('x',10,dtrace_conf_t) + /* get config. */ +#define DTRACEIOC_STATUS _IOR('x',11,dtrace_status_t) + /* get status */ +#define DTRACEIOC_GO _IOR('x',12,processorid_t) + /* start tracing */ +#define DTRACEIOC_STOP _IOWR('x',13,processorid_t) + /* stop tracing */ +#define DTRACEIOC_AGGDESC _IOW('x',15,dtrace_aggdesc_t *) + /* get agg. desc. */ +#define DTRACEIOC_FORMAT _IOWR('x',16,dtrace_fmtdesc_t) + /* get format str */ +#define DTRACEIOC_DOFGET _IOW('x',17,dof_hdr_t *) + /* get DOF */ +#define DTRACEIOC_REPLICATE _IOW('x',18,dtrace_repldesc_t) + /* replicate enab */ +#endif + +/* + * DTrace Helpers + * + * In general, DTrace establishes probes in processes and takes actions on + * processes without knowing their specific user-level structures. Instead of + * existing in the framework, process-specific knowledge is contained by the + * enabling D program -- which can apply process-specific knowledge by making + * appropriate use of DTrace primitives like copyin() and copyinstr() to + * operate on user-level data. However, there may exist some specific probes + * of particular semantic relevance that the application developer may wish to + * explicitly export. For example, an application may wish to export a probe + * at the point that it begins and ends certain well-defined transactions. In + * addition to providing probes, programs may wish to offer assistance for + * certain actions. 
For example, in highly dynamic environments (e.g., Java), + * it may be difficult to obtain a stack trace in terms of meaningful symbol + * names (the translation from instruction addresses to corresponding symbol + * names may only be possible in situ); these environments may wish to define + * a series of actions to be applied in situ to obtain a meaningful stack + * trace. + * + * These two mechanisms -- user-level statically defined tracing and assisting + * DTrace actions -- are provided via DTrace _helpers_. Helpers are specified + * via DOF, but unlike enabling DOF, helper DOF may contain definitions of + * providers, probes and their arguments. If a helper wishes to provide + * action assistance, probe descriptions and corresponding DIF actions may be + * specified in the helper DOF. For such helper actions, however, the probe + * description describes the specific helper: all DTrace helpers have the + * provider name "dtrace" and the module name "helper", and the name of the + * helper is contained in the function name (for example, the ustack() helper + * is named "ustack"). Any helper-specific name may be contained in the name + * (for example, if a helper were to have a constructor, it might be named + * "dtrace:helper:<helper>:init"). Helper actions are only called when the + * action that they are helping is taken. Helper actions may only return DIF + * expressions, and may only call the following subroutines: + * + * alloca() <= Allocates memory out of the consumer's scratch space + * bcopy() <= Copies memory to scratch space + * copyin() <= Copies memory from user-level into consumer's scratch + * copyinto() <= Copies memory into a specific location in scratch + * copyinstr() <= Copies a string into a specific location in scratch + * + * Helper actions may only access the following built-in variables: + * + * curthread <= Current kthread_t pointer + * tid <= Current thread identifier + * pid <= Current process identifier + * ppid <= Parent process identifier + * uid <= Current user ID + * gid <= Current group ID + * execname <= Current executable name + * zonename <= Current zone name + * + * Helper actions may not manipulate or allocate dynamic variables, but they + * may have clause-local and statically-allocated global variables. The + * helper action variable state is specific to the helper action -- variables + * used by the helper action may not be accessed outside of the helper + * action, and the helper action may not access variables that live outside + * of it. Helper actions may not load from kernel memory at-large; they are + * restricted to loading current user state (via copyin() and variants) and + * scratch space. As with probe enablings, helper actions are executed in + * program order. The result of the helper action is the result of the last + * executing helper expression. + * + * Helpers -- composed of either providers/probes or probes/actions (or both) + * -- are added by opening the "helper" minor node, and issuing an ioctl(2) + * (DTRACEHIOC_ADDDOF) that specifies the dof_helper_t structure. This + * encapsulates the name and base address of the user-level library or + * executable publishing the helpers and probes as well as the DOF that + * contains the definitions of those helpers and probes. + * + * The DTRACEHIOC_ADD and DTRACEHIOC_REMOVE are left in place for legacy + * helpers and should no longer be used. No other ioctls are valid on the + * helper minor node.
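+ *
+ * A hedged sketch (not part of the original header) of helper registration,
+ * assuming "helper_fd" is open on the helper node and "dof" points at the
+ * helper DOF for a hypothetical library "libfoo.so" loaded at "base":
+ *
+ *        dof_helper_t dh;
+ *
+ *        (void) strlcpy(dh.dofhp_mod, "libfoo.so", sizeof (dh.dofhp_mod));
+ *        dh.dofhp_addr = base;
+ *        dh.dofhp_dof = (uint64_t)(uintptr_t)dof;
+ *        if (ioctl(helper_fd, DTRACEHIOC_ADDDOF, &dh) == -1) {
+ *                -- registration failed
+ *        }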
+ */ +#ifdef illumos +#define DTRACEHIOC (('d' << 24) | ('t' << 16) | ('h' << 8)) +#define DTRACEHIOC_ADD (DTRACEHIOC | 1) /* add helper */ +#define DTRACEHIOC_REMOVE (DTRACEHIOC | 2) /* remove helper */ +#define DTRACEHIOC_ADDDOF (DTRACEHIOC | 3) /* add helper DOF */ +#else +#define DTRACEHIOC_REMOVE _IOW('z', 2, int) /* remove helper */ +#define DTRACEHIOC_ADDDOF _IOWR('z', 3, dof_helper_t)/* add helper DOF */ +#endif + +typedef struct dof_helper { + char dofhp_mod[DTRACE_MODNAMELEN]; /* executable or library name */ + uint64_t dofhp_addr; /* base address of object */ + uint64_t dofhp_dof; /* address of helper DOF */ +#ifdef __FreeBSD__ + pid_t dofhp_pid; /* target process ID */ + int dofhp_gen; +#endif +} dof_helper_t; + +#define DTRACEMNR_DTRACE "dtrace" /* node for DTrace ops */ +#define DTRACEMNR_HELPER "helper" /* node for helpers */ +#define DTRACEMNRN_DTRACE 0 /* minor for DTrace ops */ +#define DTRACEMNRN_HELPER 1 /* minor for helpers */ +#define DTRACEMNRN_CLONE 2 /* first clone minor */ + +#ifdef _KERNEL + +/* + * DTrace Provider API + * + * The following functions are implemented by the DTrace framework and are + * used to implement separate in-kernel DTrace providers. Common functions + * are provided in uts/common/os/dtrace.c. ISA-dependent subroutines are + * defined in uts/<isa>/dtrace/dtrace_asm.s or uts/<isa>/dtrace/dtrace_isa.c. + * + * The provider API has two halves: the API that the providers consume from + * DTrace, and the API that providers make available to DTrace. + * + * 1 Framework-to-Provider API + * + * 1.1 Overview + * + * The Framework-to-Provider API is represented by the dtrace_pops structure + * that the provider passes to the framework when registering itself. This + * structure consists of the following members: + * + * dtps_provide() <-- Provide all probes, all modules + * dtps_provide_module() <-- Provide all probes in specified module + * dtps_enable() <-- Enable specified probe + * dtps_disable() <-- Disable specified probe + * dtps_suspend() <-- Suspend specified probe + * dtps_resume() <-- Resume specified probe + * dtps_getargdesc() <-- Get the argument description for args[X] + * dtps_getargval() <-- Get the value for an argX or args[X] variable + * dtps_usermode() <-- Find out if the probe was fired in user mode + * dtps_destroy() <-- Destroy all state associated with this probe + * + * 1.2 void dtps_provide(void *arg, const dtrace_probedesc_t *spec) + * + * 1.2.1 Overview + * + * Called to indicate that the provider should provide all probes. If the + * specified description is non-NULL, dtps_provide() is being called because + * no probe matched a specified probe -- if the provider has the ability to + * create custom probes, it may wish to create a probe that matches the + * specified description. + * + * 1.2.2 Arguments and notes + * + * The first argument is the cookie as passed to dtrace_register(). The + * second argument is a pointer to a probe description that the provider may + * wish to consider when creating custom probes. The provider is expected to + * call back into the DTrace framework via dtrace_probe_create() to create + * any necessary probes. dtps_provide() may be called even if the provider + * has made available all probes; the provider should check the return value + * of dtrace_probe_create() to handle this case. Note that the provider need + * not implement both dtps_provide() and dtps_provide_module(); see + * "Arguments and Notes" for dtrace_register(), below. + * + * 1.2.3 Return value + * + * None. 
+ * + * 1.2.4 Caller's context + * + * dtps_provide() is typically called from open() or ioctl() context, but may + * be called from other contexts as well. The DTrace framework is locked in + * such a way that providers may not register or unregister. This means that + * the provider may not call any DTrace API that affects its registration with + * the framework, including dtrace_register(), dtrace_unregister(), + * dtrace_invalidate(), and dtrace_condense(). However, the context is such + * that the provider may (and indeed, is expected to) call probe-related + * DTrace routines, including dtrace_probe_create(), dtrace_probe_lookup(), + * and dtrace_probe_arg(). + * + * 1.3 void dtps_provide_module(void *arg, modctl_t *mp) + * + * 1.3.1 Overview + * + * Called to indicate that the provider should provide all probes in the + * specified module. + * + * 1.3.2 Arguments and notes + * + * The first argument is the cookie as passed to dtrace_register(). The + * second argument is a pointer to a modctl structure that indicates the + * module for which probes should be created. + * + * 1.3.3 Return value + * + * None. + * + * 1.3.4 Caller's context + * + * dtps_provide_module() may be called from open() or ioctl() context, but + * may also be called from a module loading context. mod_lock is held, and + * the DTrace framework is locked in such a way that providers may not + * register or unregister. This means that the provider may not call any + * DTrace API that affects its registration with the framework, including + * dtrace_register(), dtrace_unregister(), dtrace_invalidate(), and + * dtrace_condense(). However, the context is such that the provider may (and + * indeed, is expected to) call probe-related DTrace routines, including + * dtrace_probe_create(), dtrace_probe_lookup(), and dtrace_probe_arg(). Note + * that the provider need not implement both dtps_provide() and + * dtps_provide_module(); see "Arguments and Notes" for dtrace_register(), + * below. + * + * 1.4 void dtps_enable(void *arg, dtrace_id_t id, void *parg) + * + * 1.4.1 Overview + * + * Called to enable the specified probe. + * + * 1.4.2 Arguments and notes + * + * The first argument is the cookie as passed to dtrace_register(). The + * second argument is the identifier of the probe to be enabled. The third + * argument is the probe argument as passed to dtrace_probe_create(). + * dtps_enable() will be called when a probe transitions from not being + * enabled at all to having one or more ECBs. The number of ECBs associated + * with the probe may change without subsequent calls into the provider. + * When the number of ECBs drops to zero, the provider will be explicitly + * told to disable the probe via dtps_disable(). dtrace_probe() should never + * be called for a probe identifier that hasn't been explicitly enabled via + * dtps_enable(). + * + * 1.4.3 Return value + * + * None. + * + * 1.4.4 Caller's context + * + * The DTrace framework is locked in such a way that it may not be called + * back into at all. cpu_lock is held. mod_lock is not held and may not + * be acquired. + * + * 1.5 void dtps_disable(void *arg, dtrace_id_t id, void *parg) + * + * 1.5.1 Overview + * + * Called to disable the specified probe. + * + * 1.5.2 Arguments and notes + * + * The first argument is the cookie as passed to dtrace_register(). The + * second argument is the identifier of the probe to be disabled. The third + * argument is the probe argument as passed to dtrace_probe_create().
+ * dtps_disable() will be called when a probe transitions from being enabled + * to having zero ECBs. dtrace_probe() should never be called for a probe + * identifier that has been explicitly disabled via dtps_disable(). + * + * 1.5.3 Return value + * + * None. + * + * 1.5.4 Caller's context + * + * The DTrace framework is locked in such a way that it may not be called + * back into at all. cpu_lock is held. mod_lock is not held and may not + * be acquired. + * + * 1.6 void dtps_suspend(void *arg, dtrace_id_t id, void *parg) + * + * 1.6.1 Overview + * + * Called to suspend the specified enabled probe. This entry point is for + * providers that may need to suspend some or all of their probes when CPUs + * are being powered on or when the boot monitor is being entered for a + * prolonged period of time. + * + * 1.6.2 Arguments and notes + * + * The first argument is the cookie as passed to dtrace_register(). The + * second argument is the identifier of the probe to be suspended. The + * third argument is the probe argument as passed to dtrace_probe_create(). + * dtps_suspend will only be called on an enabled probe. Providers that + * provide a dtps_suspend entry point will want to take roughly the action + * that they take for dtps_disable(). + * + * 1.6.3 Return value + * + * None. + * + * 1.6.4 Caller's context + * + * Interrupts are disabled. The DTrace framework is in a state such that the + * specified probe cannot be disabled or destroyed for the duration of + * dtps_suspend(). As interrupts are disabled, the provider is afforded + * little latitude; the provider is expected to do no more than a store to + * memory. + * + * 1.7 void dtps_resume(void *arg, dtrace_id_t id, void *parg) + * + * 1.7.1 Overview + * + * Called to resume the specified enabled probe. This entry point is for + * providers that may need to resume some or all of their probes after the + * completion of an event that induced a call to dtps_suspend(). + * + * 1.7.2 Arguments and notes + * + * The first argument is the cookie as passed to dtrace_register(). The + * second argument is the identifier of the probe to be resumed. The + * third argument is the probe argument as passed to dtrace_probe_create(). + * dtps_resume will only be called on an enabled probe. Providers that + * provide a dtps_resume entry point will want to take roughly the action + * that they take for dtps_enable(). + * + * 1.7.3 Return value + * + * None. + * + * 1.7.4 Caller's context + * + * Interrupts are disabled. The DTrace framework is in a state such that the + * specified probe cannot be disabled or destroyed for the duration of + * dtps_resume(). As interrupts are disabled, the provider is afforded + * little latitude; the provider is expected to do no more than a store to + * memory. + * + * 1.8 void dtps_getargdesc(void *arg, dtrace_id_t id, void *parg, + * dtrace_argdesc_t *desc) + * + * 1.8.1 Overview + * + * Called to retrieve the argument description for an args[X] variable. + * + * 1.8.2 Arguments and notes + * + * The first argument is the cookie as passed to dtrace_register(). The + * second argument is the identifier of the current probe. The third + * argument is the probe argument as passed to dtrace_probe_create(). The + * fourth argument is a pointer to the argument description. This + * description is both an input and output parameter: it contains the + * index of the desired argument in the dtargd_ndx field, and expects + * the other fields to be filled in upon return.
If there is no argument + * corresponding to the specified index, the dtargd_ndx field should be set + * to DTRACE_ARGNONE. + * + * 1.8.3 Return value + * + * None. The dtargd_ndx, dtargd_native, dtargd_xlate and dtargd_mapping + * members of the dtrace_argdesc_t structure are all output values. + * + * 1.8.4 Caller's context + * + * dtps_getargdesc() is called from ioctl() context. mod_lock is held, and + * the DTrace framework is locked in such a way that providers may not + * register or unregister. This means that the provider may not call any + * DTrace API that affects its registration with the framework, including + * dtrace_register(), dtrace_unregister(), dtrace_invalidate(), and + * dtrace_condense(). + * + * 1.9 uint64_t dtps_getargval(void *arg, dtrace_id_t id, void *parg, + * int argno, int aframes) + * + * 1.9.1 Overview + * + * Called to retrieve a value for an argX or args[X] variable. + * + * 1.9.2 Arguments and notes + * + * The first argument is the cookie as passed to dtrace_register(). The + * second argument is the identifier of the current probe. The third + * argument is the probe argument as passed to dtrace_probe_create(). The + * fourth argument is the number of the argument (the X in the example in + * 1.9.1). The fifth argument is the number of stack frames that were used + * to get from the actual place in the code that fired the probe to + * dtrace_probe() itself, the so-called artificial frames. This argument may + * be used to descend an appropriate number of frames to find the correct + * values. If this entry point is left NULL, the dtrace_getarg() built-in + * function is used. + * + * 1.9.3 Return value + * + * The value of the argument. + * + * 1.9.4 Caller's context + * + * This is called from within dtrace_probe() meaning that interrupts + * are disabled. No locks should be taken within this entry point. + * + * 1.10 int dtps_usermode(void *arg, dtrace_id_t id, void *parg) + * + * 1.10.1 Overview + * + * Called to determine if the probe was fired in a user context. + * + * 1.10.2 Arguments and notes + * + * The first argument is the cookie as passed to dtrace_register(). The + * second argument is the identifier of the current probe. The third + * argument is the probe argument as passed to dtrace_probe_create(). This + * entry point must not be left NULL for providers whose probes allow for + * mixed mode tracing, that is to say those probes that can fire during + * kernel- _or_ user-mode execution. + * + * 1.10.3 Return value + * + * A bitwise OR that encapsulates both the mode (either DTRACE_MODE_KERNEL + * or DTRACE_MODE_USER) and the policy when the privilege of the enabling + * is insufficient for that mode (a combination of DTRACE_MODE_NOPRIV_DROP, + * DTRACE_MODE_NOPRIV_RESTRICT, and DTRACE_MODE_LIMITEDPRIV_RESTRICT). If + * the DTRACE_MODE_NOPRIV_DROP bit is set, insufficient privilege will result + * in the probe firing being silently ignored for the enabling; if the + * DTRACE_MODE_NOPRIV_RESTRICT bit is set, insufficient privilege will not + * prevent probe processing for the enabling, but restrictions will be in + * place that induce a UPRIV fault upon attempt to examine probe arguments + * or current process state. If the DTRACE_MODE_LIMITEDPRIV_RESTRICT bit + * is set, similar restrictions will be placed upon operation if the + * privilege is sufficient to process the enabling, but does not otherwise + * entitle the enabling to all zones.
The DTRACE_MODE_NOPRIV_DROP and + * DTRACE_MODE_NOPRIV_RESTRICT are mutually exclusive (and one of these + * two policies must be specified), but either may be combined (or not) + * with DTRACE_MODE_LIMITEDPRIV_RESTRICT. + * + * 1.10.4 Caller's context + * + * This is called from within dtrace_probe() meaning that interrupts + * are disabled. No locks should be taken within this entry point. + * + * 1.11 void dtps_destroy(void *arg, dtrace_id_t id, void *parg) + * + * 1.11.1 Overview + * + * Called to destroy the specified probe. + * + * 1.11.2 Arguments and notes + * + * The first argument is the cookie as passed to dtrace_register(). The + * second argument is the identifier of the probe to be destroyed. The third + * argument is the probe argument as passed to dtrace_probe_create(). The + * provider should free all state associated with the probe. The framework + * guarantees that dtps_destroy() is only called for probes that have either + * been disabled via dtps_disable() or were never enabled via dtps_enable(). + * Once dtps_destroy() has been called for a probe, no further call will be + * made specifying the probe. + * + * 1.11.3 Return value + * + * None. + * + * 1.11.4 Caller's context + * + * The DTrace framework is locked in such a way that it may not be called + * back into at all. mod_lock is held. cpu_lock is not held, and may not be + * acquired. + * + * + * 2 Provider-to-Framework API + * + * 2.1 Overview + * + * The Provider-to-Framework API provides the mechanism for the provider to + * register itself with the DTrace framework, to create probes, to lookup + * probes and (most importantly) to fire probes. The Provider-to-Framework + * API consists of: + * + * dtrace_register() <-- Register a provider with the DTrace framework + * dtrace_unregister() <-- Remove a provider's DTrace registration + * dtrace_invalidate() <-- Invalidate the specified provider + * dtrace_condense() <-- Remove a provider's unenabled probes + * dtrace_attached() <-- Indicates whether or not DTrace has attached + * dtrace_probe_create() <-- Create a DTrace probe + * dtrace_probe_lookup() <-- Lookup a DTrace probe based on its name + * dtrace_probe_arg() <-- Return the probe argument for a specific probe + * dtrace_probe() <-- Fire the specified probe + * + * 2.2 int dtrace_register(const char *name, const dtrace_pattr_t *pap, + * uint32_t priv, cred_t *cr, const dtrace_pops_t *pops, void *arg, + * dtrace_provider_id_t *idp) + * + * 2.2.1 Overview + * + * dtrace_register() registers the calling provider with the DTrace + * framework. It should generally be called by DTrace providers in their + * attach(9E) entry point. + * + * 2.2.2 Arguments and Notes + * + * The first argument is the name of the provider. The second argument is a + * pointer to the stability attributes for the provider. The third argument + * is the privilege flags for the provider, and must be some combination of: + * + * DTRACE_PRIV_NONE <= All users may enable probes from this provider + * + * DTRACE_PRIV_PROC <= Any user with privilege of PRIV_DTRACE_PROC may + * enable probes from this provider + * + * DTRACE_PRIV_USER <= Any user with privilege of PRIV_DTRACE_USER may + * enable probes from this provider + * + * DTRACE_PRIV_KERNEL <= Any user with privilege of PRIV_DTRACE_KERNEL + * may enable probes from this provider + * + * DTRACE_PRIV_OWNER <= This flag places an additional constraint on + * the privilege requirements above.
These probes + * require either (a) a user ID matching the user + * ID of the cred passed in the fourth argument + * or (b) the PRIV_PROC_OWNER privilege. + * + * DTRACE_PRIV_ZONEOWNER<= This flag places an additional constraint on + * the privilege requirements above. These probes + * require either (a) a zone ID matching the zone + * ID of the cred passed in the fourth argument + * or (b) the PRIV_PROC_ZONE privilege. + * + * Note that these flags designate the _visibility_ of the probes, not + * the conditions under which they may or may not fire. + * + * The fourth argument is the credential that is associated with the + * provider. This argument should be NULL if the privilege flags don't + * include DTRACE_PRIV_OWNER or DTRACE_PRIV_ZONEOWNER. If non-NULL, the + * framework stashes the uid and zoneid represented by this credential + * for use at probe-time, in implicit predicates. These limit visibility + * of the probes to users and/or zones which have sufficient privilege to + * access them. + * + * The fifth argument is a DTrace provider operations vector, which provides + * the implementation for the Framework-to-Provider API. (See Section 1, + * above.) This must be non-NULL, and each member must be non-NULL. The + * exceptions to this are (1) the dtps_provide() and dtps_provide_module() + * members (if the provider so desires, _one_ of these members may be left + * NULL -- denoting that the provider only implements the other) and (2) + * the dtps_suspend() and dtps_resume() members, which must either both be + * NULL or both be non-NULL. + * + * The sixth argument is a cookie to be specified as the first argument for + * each function in the Framework-to-Provider API. This argument may have + * any value. + * + * The final argument is a pointer to dtrace_provider_id_t. If + * dtrace_register() successfully completes, the provider identifier will be + * stored in the memory pointed to by this argument. This argument must be + * non-NULL. + * + * 2.2.3 Return value + * + * On success, dtrace_register() returns 0 and stores the new provider's + * identifier into the memory pointed to by the idp argument. On failure, + * dtrace_register() returns an errno: + * + * EINVAL The arguments passed to dtrace_register() were somehow invalid. + * This may be because a parameter that must be non-NULL was NULL, + * because the name was invalid (either empty or an illegal + * provider name) or because the attributes were invalid. + * + * No other failure code is returned. + * + * 2.2.4 Caller's context + * + * dtrace_register() may induce calls to dtps_provide(); the provider must + * hold no locks across dtrace_register() that may also be acquired by + * dtps_provide(). cpu_lock and mod_lock must not be held. + * + * 2.3 int dtrace_unregister(dtrace_provider_t id) + * + * 2.3.1 Overview + * + * Unregisters the specified provider from the DTrace framework. It should + * generally be called by DTrace providers in their detach(9E) entry point. + * + * 2.3.2 Arguments and Notes + * + * The only argument is the provider identifier, as returned from a + * successful call to dtrace_register(). As a result of calling + * dtrace_unregister(), the DTrace framework will call back into the provider + * via the dtps_destroy() entry point. Once dtrace_unregister() successfully + * completes, however, the DTrace framework will no longer make calls through + * the Framework-to-Provider API. + * + * 2.3.3 Return value + * + * On success, dtrace_unregister() returns 0.
On failure, dtrace_unregister() + * returns an errno: + * + * EBUSY There are currently processes that have the DTrace pseudodevice + * open, or there exists an anonymous enabling that hasn't yet + * been claimed. + * + * No other failure code is returned. + * + * 2.3.4 Caller's context + * + * Because a call to dtrace_unregister() may induce calls through the + * Framework-to-Provider API, the caller may not hold any lock across + * dtrace_unregister() that is also acquired in any of the Framework-to- + * Provider API functions. Additionally, mod_lock may not be held. + * + * 2.4 void dtrace_invalidate(dtrace_provider_id_t id) + * + * 2.4.1 Overview + * + * Invalidates the specified provider. All subsequent probe lookups for the + * specified provider will fail, but its probes will not be removed. + * + * 2.4.2 Arguments and notes + * + * The only argument is the provider identifier, as returned from a + * successful call to dtrace_register(). In general, a provider's probes + * always remain valid; dtrace_invalidate() is a mechanism for invalidating + * an entire provider, regardless of whether or not its probes are enabled. + * Note that dtrace_invalidate() will _not_ prevent already enabled + * probes from firing -- it will merely prevent any new enablings of the + * provider's probes. + * + * 2.5 int dtrace_condense(dtrace_provider_id_t id) + * + * 2.5.1 Overview + * + * Removes all the unenabled probes for the given provider. This function is + * not unlike dtrace_unregister(), except that it doesn't remove the + * provider, just as many of its associated probes as it can. + * + * 2.5.2 Arguments and Notes + * + * As with dtrace_unregister(), the sole argument is the provider identifier + * as returned from a successful call to dtrace_register(). As a result of + * calling dtrace_condense(), the DTrace framework will call back into the + * given provider's dtps_destroy() entry point for each of the provider's + * unenabled probes. + * + * 2.5.3 Return value + * + * Currently, dtrace_condense() always returns 0. However, consumers of this + * function should check the return value as appropriate; its behavior may + * change in the future. + * + * 2.5.4 Caller's context + * + * As with dtrace_unregister(), the caller may not hold any lock across + * dtrace_condense() that is also acquired in the provider's entry points. + * Also, mod_lock may not be held. + * + * 2.6 int dtrace_attached() + * + * 2.6.1 Overview + * + * Indicates whether or not DTrace has attached. + * + * 2.6.2 Arguments and Notes + * + * For most providers, DTrace makes initial contact beyond registration. + * That is, once a provider has registered with DTrace, it waits to hear + * from DTrace to create probes. However, some providers may wish to + * proactively create probes without first being told by DTrace to do so. + * If providers wish to do this, they must first call dtrace_attached() to + * determine if DTrace itself has attached. If dtrace_attached() returns 0, + * the provider must not make any other Provider-to-Framework API call. + * + * 2.6.3 Return value + * + * dtrace_attached() returns 1 if DTrace has attached, 0 otherwise. + * + * 2.7 int dtrace_probe_create(dtrace_provider_t id, const char *mod, + * const char *func, const char *name, int aframes, void *arg) + * + * 2.7.1 Overview + * + * Creates a probe with specified module name, function name, and name.
+ * + * 2.7.2 Arguments and Notes + * + * The first argument is the provider identifier, as returned from a + * successful call to dtrace_register(). The second, third, and fourth + * arguments are the module name, function name, and probe name, + * respectively. Of these, module name and function name may both be NULL + * (in which case the probe is considered to be unanchored), or they may both + * be non-NULL. The name must be non-NULL, and must point to a non-empty + * string. + * + * The fifth argument is the number of artificial stack frames that will be + * found on the stack when dtrace_probe() is called for the new probe. These + * artificial frames will automatically be pruned should the stack() or + * stackdepth() functions be called as part of one of the probe's ECBs. If + * the provider doesn't add an artificial frame, this parameter should be + * zero. + * + * The final argument is a probe argument that will be passed back to the + * provider when a probe-specific operation is called. (e.g., via + * dtps_enable(), dtps_disable(), etc.) + * + * Note that it is up to the provider to be sure that the probe that it + * creates does not already exist -- if the provider is unsure of the probe's + * existence, it should assure its absence with dtrace_probe_lookup() before + * calling dtrace_probe_create(). + * + * 2.7.3 Return value + * + * dtrace_probe_create() always succeeds, and always returns the identifier + * of the newly-created probe. + * + * 2.7.4 Caller's context + * + * While dtrace_probe_create() is generally expected to be called from + * dtps_provide() and/or dtps_provide_module(), it may be called from other + * non-DTrace contexts. Neither cpu_lock nor mod_lock may be held. + * + * 2.8 dtrace_id_t dtrace_probe_lookup(dtrace_provider_t id, const char *mod, + * const char *func, const char *name) + * + * 2.8.1 Overview + * + * Looks up a probe based on provider and one or more of module name, + * function name and probe name. + * + * 2.8.2 Arguments and Notes + * + * The first argument is the provider identifier, as returned from a + * successful call to dtrace_register(). The second, third, and fourth + * arguments are the module name, function name, and probe name, + * respectively. Any of these may be NULL; dtrace_probe_lookup() will return + * the identifier of the first probe that is provided by the specified + * provider and matches all of the non-NULL matching criteria. + * dtrace_probe_lookup() is generally used by a provider to check for the + * existence of a probe before creating it with dtrace_probe_create(). + * + * 2.8.3 Return value + * + * If the probe exists, returns its identifier. If the probe does not exist, + * returns DTRACE_IDNONE. + * + * 2.8.4 Caller's context + * + * While dtrace_probe_lookup() is generally expected to be called from + * dtps_provide() and/or dtps_provide_module(), it may also be called from + * other non-DTrace contexts. Neither cpu_lock nor mod_lock may be held. + * + * 2.9 void *dtrace_probe_arg(dtrace_provider_t id, dtrace_id_t probe) + * + * 2.9.1 Overview + * + * Returns the probe argument associated with the specified probe. + * + * 2.9.2 Arguments and Notes + * + * The first argument is the provider identifier, as returned from a + * successful call to dtrace_register(). The second argument is a probe + * identifier, as returned from dtrace_probe_lookup() or + * dtrace_probe_create().
This is useful if a probe has multiple + * provider-specific components to it: the provider can create the probe + * once with provider-specific state, and then add to the state by looking + * up the probe based on probe identifier. + * + * 2.9.3 Return value + * + * Returns the argument associated with the specified probe. If the + * specified probe does not exist, or if the specified probe is not provided + * by the specified provider, NULL is returned. + * + * 2.9.4 Caller's context + * + * While dtrace_probe_arg() is generally expected to be called from + * dtps_provide() and/or dtps_provide_module(), it may also be called from + * other non-DTrace contexts. Neither cpu_lock nor mod_lock may be held. + * + * 2.10 void dtrace_probe(dtrace_id_t probe, uintptr_t arg0, uintptr_t arg1, + * uintptr_t arg2, uintptr_t arg3, uintptr_t arg4) + * + * 2.10.1 Overview + * + * The epicenter of DTrace: fires the specified probe with the specified + * arguments. + * + * 2.10.2 Arguments and Notes + * + * The first argument is a probe identifier as returned by + * dtrace_probe_create() or dtrace_probe_lookup(). The second through sixth + * arguments are the values to which the D variables "arg0" through "arg4" + * will be mapped. + * + * dtrace_probe() should be called whenever the specified probe has fired -- + * however the provider defines it. + * + * 2.10.3 Return value + * + * None. + * + * 2.10.4 Caller's context + * + * dtrace_probe() may be called in virtually any context: kernel, user, + * interrupt, high-level interrupt, with arbitrary adaptive locks held, with + * dispatcher locks held, with interrupts disabled, etc. The only latitude + * that must be afforded to DTrace is the ability to make calls within + * itself (and to its in-kernel subroutines) and the ability to access + * arbitrary (but mapped) memory. On some platforms, this constrains + * context. For example, on UltraSPARC, dtrace_probe() cannot be called + * from any context in which TL is greater than zero. dtrace_probe() may + * also not be called from any routine which may be called by dtrace_probe() + * -- which includes functions in the DTrace framework and some in-kernel + * DTrace subroutines. All such functions begin with "dtrace_"; providers + * that instrument the kernel arbitrarily should be sure to not instrument + * these routines.
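+ *
+ * To tie the two halves together, a condensed sketch of a hypothetical
+ * in-kernel provider follows. The "foo" names are invented, stability
+ * attribute initialization and the bodies of most entry points are elided,
+ * and no error handling is shown; this is an outline of the calling
+ * convention rather than a complete provider.
+ *
+ *	static dtrace_provider_id_t foo_id;
+ *	static dtrace_id_t foo_tick_id;
+ *
+ *	static void
+ *	foo_provide(void *arg, dtrace_probedesc_t *desc)
+ *	{
+ *		if (dtrace_probe_lookup(foo_id, NULL, NULL,
+ *		    "tick") == DTRACE_IDNONE)
+ *			(void) dtrace_probe_create(foo_id, NULL, NULL,
+ *			    "tick", 0, NULL);
+ *	}
+ *
+ *	static void
+ *	foo_enable(void *arg, dtrace_id_t id, void *parg)
+ *	{
+ *		foo_tick_id = id;	<- arm the probe
+ *	}
+ *
+ *	... foo_disable(), foo_getargdesc(), foo_getargval(),
+ *	    foo_usermode() and foo_destroy() are analogous and elided ...
+ *
+ *	static dtrace_pops_t foo_pops = {
+ *		.dtps_provide = foo_provide,
+ *		.dtps_provide_module = NULL,
+ *		.dtps_enable = foo_enable,
+ *		.dtps_disable = foo_disable,
+ *		.dtps_suspend = NULL,
+ *		.dtps_resume = NULL,
+ *		.dtps_getargdesc = foo_getargdesc,
+ *		.dtps_getargval = foo_getargval,
+ *		.dtps_usermode = foo_usermode,
+ *		.dtps_destroy = foo_destroy
+ *	};
+ *
+ *	error = dtrace_register("foo", &foo_attr, DTRACE_PRIV_KERNEL,
+ *	    NULL, &foo_pops, NULL, &foo_id);
+ *
+ * Once a consumer enables the probe, the provider's periodic routine would
+ * fire it with dtrace_probe(foo_tick_id, 0, 0, 0, 0, 0).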
+ */ +typedef struct dtrace_pops { + void (*dtps_provide)(void *arg, dtrace_probedesc_t *spec); + void (*dtps_provide_module)(void *arg, modctl_t *mp); + void (*dtps_enable)(void *arg, dtrace_id_t id, void *parg); + void (*dtps_disable)(void *arg, dtrace_id_t id, void *parg); + void (*dtps_suspend)(void *arg, dtrace_id_t id, void *parg); + void (*dtps_resume)(void *arg, dtrace_id_t id, void *parg); + void (*dtps_getargdesc)(void *arg, dtrace_id_t id, void *parg, + dtrace_argdesc_t *desc); + uint64_t (*dtps_getargval)(void *arg, dtrace_id_t id, void *parg, + int argno, int aframes); + int (*dtps_usermode)(void *arg, dtrace_id_t id, void *parg); + void (*dtps_destroy)(void *arg, dtrace_id_t id, void *parg); +} dtrace_pops_t; + +#define DTRACE_MODE_KERNEL 0x01 +#define DTRACE_MODE_USER 0x02 +#define DTRACE_MODE_NOPRIV_DROP 0x10 +#define DTRACE_MODE_NOPRIV_RESTRICT 0x20 +#define DTRACE_MODE_LIMITEDPRIV_RESTRICT 0x40 + +typedef uintptr_t dtrace_provider_id_t; + +extern int dtrace_register(const char *, const dtrace_pattr_t *, uint32_t, + cred_t *, const dtrace_pops_t *, void *, dtrace_provider_id_t *); +extern int dtrace_unregister(dtrace_provider_id_t); +extern int dtrace_condense(dtrace_provider_id_t); +extern void dtrace_invalidate(dtrace_provider_id_t); +extern dtrace_id_t dtrace_probe_lookup(dtrace_provider_id_t, char *, + char *, char *); +extern dtrace_id_t dtrace_probe_create(dtrace_provider_id_t, const char *, + const char *, const char *, int, void *); +extern void *dtrace_probe_arg(dtrace_provider_id_t, dtrace_id_t); +extern void dtrace_probe(dtrace_id_t, uintptr_t arg0, uintptr_t arg1, + uintptr_t arg2, uintptr_t arg3, uintptr_t arg4); + +/* + * DTrace Meta Provider API + * + * The following functions are implemented by the DTrace framework and are + * used to implement meta providers. Meta providers plug into the DTrace + * framework and are used to instantiate new providers on the fly. At + * present, there is only one type of meta provider and only one meta + * provider may be registered with the DTrace framework at a time. The + * sole meta provider type provides user-land static tracing facilities + * by taking meta probe descriptions and adding a corresponding provider + * into the DTrace framework. + * + * 1 Framework-to-Provider + * + * 1.1 Overview + * + * The Framework-to-Provider API is represented by the dtrace_mops structure + * that the meta provider passes to the framework when registering itself as + * a meta provider. This structure consists of the following members: + * + * dtms_create_probe() <-- Add a new probe to a created provider + * dtms_provide_pid() <-- Create a new provider for a given process + * dtms_remove_pid() <-- Remove a previously created provider + * + * 1.2 void dtms_create_probe(void *arg, void *parg, + * dtrace_helper_probedesc_t *probedesc); + * + * 1.2.1 Overview + * + * Called by the DTrace framework to create a new probe in a provider + * created by this meta provider. + * + * 1.2.2 Arguments and notes + * + * The first argument is the cookie as passed to dtrace_meta_register(). + * The second argument is the provider cookie for the associated provider; + * this is obtained from the return value of dtms_provide_pid(). The third + * argument is the helper probe description. 
+ * + * 1.2.3 Return value + * + * None + * + * 1.2.4 Caller's context + * + * dtms_create_probe() is called from either ioctl() or module load context + * in the context of a newly-created provider (that is, a provider that + * is a result of a call to dtms_provide_pid()). The DTrace framework is + * locked in such a way that meta providers may not register or unregister, + * such that no other thread can call into a meta provider operation and that + * atomicity is assured with respect to meta provider operations across + * dtms_provide_pid() and subsequent calls to dtms_create_probe(). + * The context is thus effectively single-threaded with respect to the meta + * provider, and the meta provider cannot call dtrace_meta_register() + * or dtrace_meta_unregister(). However, the context is such that the + * provider may (and is expected to) call provider-related DTrace provider + * APIs including dtrace_probe_create(). + * + * 1.3 void *dtms_provide_pid(void *arg, dtrace_meta_provider_t *mprov, + * pid_t pid) + * + * 1.3.1 Overview + * + * Called by the DTrace framework to instantiate a new provider given the + * description of the provider and probes in the mprov argument. The + * meta provider should call dtrace_register() to insert the new provider + * into the DTrace framework. + * + * 1.3.2 Arguments and notes + * + * The first argument is the cookie as passed to dtrace_meta_register(). + * The second argument is a pointer to a structure describing the new + * helper provider. The third argument is the process identifier for the + * process associated with this new provider. Note that the name of the + * provider as passed to dtrace_register() should be the concatenation of + * the dthpv_provname member of the mprov argument and the process + * identifier as a string. + * + * 1.3.3 Return value + * + * The cookie for the provider that the meta provider creates. This is + * the same value that it passed to dtrace_register(). + * + * 1.3.4 Caller's context + * + * dtms_provide_pid() is called from either ioctl() or module load context. + * The DTrace framework is locked in such a way that meta providers may not + * register or unregister. This means that the meta provider cannot call + * dtrace_meta_register() or dtrace_meta_unregister(). However, the context + * is such that the provider may -- and is expected to -- call + * provider-related DTrace provider APIs including dtrace_register(). + * + * 1.4 void dtms_remove_pid(void *arg, dtrace_meta_provider_t *mprov, + * pid_t pid) + * + * 1.4.1 Overview + * + * Called by the DTrace framework to remove a provider that had previously + * been instantiated via the dtms_provide_pid() entry point. The meta + * provider need not remove the provider immediately, but this entry + * point indicates that the provider should be removed as soon as possible + * using the dtrace_unregister() API. + * + * 1.4.2 Arguments and notes + * + * The first argument is the cookie as passed to dtrace_meta_register(). + * The second argument is a pointer to a structure describing the helper + * provider. The third argument is the process identifier for the process + * associated with this provider. + * + * 1.4.3 Return value + * + * None + * + * 1.4.4 Caller's context + * + * dtms_remove_pid() is called from either ioctl() or exit() context. + * The DTrace framework is locked in such a way that meta providers may not + * register or unregister. This means that the meta provider cannot call + * dtrace_meta_register() or dtrace_meta_unregister().
However, the context + * is such that the provider may -- and is expected to -- call + * provider-related DTrace provider APIs including dtrace_unregister(). + */ +typedef struct dtrace_helper_probedesc { + char *dthpb_mod; /* probe module */ + char *dthpb_func; /* probe function */ + char *dthpb_name; /* probe name */ + uint64_t dthpb_base; /* base address */ + uint32_t *dthpb_offs; /* offsets array */ + uint32_t *dthpb_enoffs; /* is-enabled offsets array */ + uint32_t dthpb_noffs; /* offsets count */ + uint32_t dthpb_nenoffs; /* is-enabled offsets count */ + uint8_t *dthpb_args; /* argument mapping array */ + uint8_t dthpb_xargc; /* translated argument count */ + uint8_t dthpb_nargc; /* native argument count */ + char *dthpb_xtypes; /* translated types strings */ + char *dthpb_ntypes; /* native types strings */ +} dtrace_helper_probedesc_t; + +typedef struct dtrace_helper_provdesc { + char *dthpv_provname; /* provider name */ + dtrace_pattr_t dthpv_pattr; /* stability attributes */ +} dtrace_helper_provdesc_t; + +typedef struct dtrace_mops { + void (*dtms_create_probe)(void *, void *, dtrace_helper_probedesc_t *); + void *(*dtms_provide_pid)(void *, dtrace_helper_provdesc_t *, pid_t); + void (*dtms_remove_pid)(void *, dtrace_helper_provdesc_t *, pid_t); +} dtrace_mops_t; + +typedef uintptr_t dtrace_meta_provider_id_t; + +extern int dtrace_meta_register(const char *, const dtrace_mops_t *, void *, + dtrace_meta_provider_id_t *); +extern int dtrace_meta_unregister(dtrace_meta_provider_id_t); + +/* + * DTrace Kernel Hooks + * + * The following functions are implemented by the base kernel and form a set of + * hooks used by the DTrace framework. DTrace hooks are implemented in either + * uts/common/os/dtrace_subr.c, an ISA-specific assembly file, or in a + * uts/<platform>/os/dtrace_subr.c corresponding to each hardware platform. 
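+ *
+ * Of the hooks declared below, dtrace_interrupt_disable() and
+ * dtrace_interrupt_enable() embody the pattern the framework relies on for
+ * per-CPU atomicity without taking locks. A representative (hypothetical)
+ * use:
+ *
+ *	dtrace_icookie_t cookie;
+ *
+ *	cookie = dtrace_interrupt_disable();
+ *	... examine or switch per-CPU DTrace state; nothing in this
+ *	    window may block or acquire adaptive locks ...
+ *	dtrace_interrupt_enable(cookie);
+ *
+ * Because the cookie captures the interrupt state at disable time, such
+ * sections may nest safely; the buffer-switch cross call described later
+ * in this change follows the same discipline.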
+ */ + +typedef enum dtrace_vtime_state { + DTRACE_VTIME_INACTIVE = 0, /* No DTrace, no TNF */ + DTRACE_VTIME_ACTIVE, /* DTrace virtual time, no TNF */ + DTRACE_VTIME_INACTIVE_TNF, /* No DTrace, TNF active */ + DTRACE_VTIME_ACTIVE_TNF /* DTrace virtual time _and_ TNF */ +} dtrace_vtime_state_t; + +#ifdef illumos +extern dtrace_vtime_state_t dtrace_vtime_active; +#endif +extern void dtrace_vtime_switch(kthread_t *next); +extern void dtrace_vtime_enable_tnf(void); +extern void dtrace_vtime_disable_tnf(void); +extern void dtrace_vtime_enable(void); +extern void dtrace_vtime_disable(void); + +struct regs; +struct reg; + +#ifdef illumos +extern int (*dtrace_pid_probe_ptr)(struct reg *); +extern int (*dtrace_return_probe_ptr)(struct reg *); +extern void (*dtrace_fasttrap_fork_ptr)(proc_t *, proc_t *); +extern void (*dtrace_fasttrap_exec_ptr)(proc_t *); +extern void (*dtrace_fasttrap_exit_ptr)(proc_t *); +extern void dtrace_fasttrap_fork(proc_t *, proc_t *); +#endif + +typedef uintptr_t dtrace_icookie_t; +typedef void (*dtrace_xcall_t)(void *); + +extern dtrace_icookie_t dtrace_interrupt_disable(void); +extern void dtrace_interrupt_enable(dtrace_icookie_t); + +extern void dtrace_membar_producer(void); +extern void dtrace_membar_consumer(void); + +extern void (*dtrace_cpu_init)(processorid_t); +#ifdef illumos +extern void (*dtrace_modload)(modctl_t *); +extern void (*dtrace_modunload)(modctl_t *); +#endif +extern void (*dtrace_helpers_cleanup)(void); +extern void (*dtrace_helpers_fork)(proc_t *parent, proc_t *child); +extern void (*dtrace_cpustart_init)(void); +extern void (*dtrace_cpustart_fini)(void); +extern void (*dtrace_closef)(void); + +extern void (*dtrace_debugger_init)(void); +extern void (*dtrace_debugger_fini)(void); +extern dtrace_cacheid_t dtrace_predcache_id; + +#ifdef illumos +extern hrtime_t dtrace_gethrtime(void); +#else +void dtrace_debug_printf(const char *, ...) 
__printflike(1, 2); +#endif +extern void dtrace_sync(void); +extern void dtrace_toxic_ranges(void (*)(uintptr_t, uintptr_t)); +extern void dtrace_xcall(processorid_t, dtrace_xcall_t, void *); +extern void dtrace_vpanic(const char *, __va_list); +extern void dtrace_panic(const char *, ...); + +extern int dtrace_safe_defer_signal(void); +extern void dtrace_safe_synchronous_signal(void); + +extern int dtrace_mach_aframes(void); + +#if defined(__i386) || defined(__amd64) +extern int dtrace_instr_size(uchar_t *instr); +extern int dtrace_instr_size_isa(uchar_t *, model_t, int *); +extern void dtrace_invop_callsite(void); +#endif +extern void dtrace_invop_add(int (*)(uintptr_t, struct trapframe *, uintptr_t)); +extern void dtrace_invop_remove(int (*)(uintptr_t, struct trapframe *, + uintptr_t)); + +#ifdef __sparc +extern int dtrace_blksuword32(uintptr_t, uint32_t *, int); +extern void dtrace_getfsr(uint64_t *); +#endif + +#ifndef illumos +extern void dtrace_helpers_duplicate(proc_t *, proc_t *); +extern void dtrace_helpers_destroy(proc_t *); +#endif + +#define DTRACE_CPUFLAG_ISSET(flag) \ + (cpu_core[curcpu].cpuc_dtrace_flags & (flag)) + +#define DTRACE_CPUFLAG_SET(flag) \ + (cpu_core[curcpu].cpuc_dtrace_flags |= (flag)) + +#define DTRACE_CPUFLAG_CLEAR(flag) \ + (cpu_core[curcpu].cpuc_dtrace_flags &= ~(flag)) + +#endif /* _KERNEL */ + +#endif /* _ASM */ + +#if defined(__i386) || defined(__amd64) + +#define DTRACE_INVOP_PUSHL_EBP 1 +#define DTRACE_INVOP_PUSHQ_RBP DTRACE_INVOP_PUSHL_EBP +#define DTRACE_INVOP_POPL_EBP 2 +#define DTRACE_INVOP_POPQ_RBP DTRACE_INVOP_POPL_EBP +#define DTRACE_INVOP_LEAVE 3 +#define DTRACE_INVOP_NOP 4 +#define DTRACE_INVOP_RET 5 + +#elif defined(__powerpc__) + +#define DTRACE_INVOP_BCTR 1 +#define DTRACE_INVOP_BLR 2 +#define DTRACE_INVOP_JUMP 3 +#define DTRACE_INVOP_MFLR_R0 4 +#define DTRACE_INVOP_NOP 5 + +#elif defined(__arm__) + +#define DTRACE_INVOP_SHIFT 4 +#define DTRACE_INVOP_MASK ((1 << DTRACE_INVOP_SHIFT) - 1) +#define DTRACE_INVOP_DATA(x) ((x) >> DTRACE_INVOP_SHIFT) + +#define DTRACE_INVOP_PUSHM 1 +#define DTRACE_INVOP_POPM 2 +#define DTRACE_INVOP_B 3 + +#elif defined(__aarch64__) + +#define INSN_SIZE 4 + +#define B_MASK 0xff000000 +#define B_DATA_MASK 0x00ffffff +#define B_INSTR 0x14000000 + +#define RET_INSTR 0xd65f03c0 + +#define LDP_STP_MASK 0xffc00000 +#define STP_32 0x29800000 +#define STP_64 0xa9800000 +#define LDP_32 0x28c00000 +#define LDP_64 0xa8c00000 +#define LDP_STP_PREIND (1 << 24) +#define LDP_STP_DIR (1 << 22) /* Load instruction */ +#define ARG1_SHIFT 0 +#define ARG1_MASK 0x1f +#define ARG2_SHIFT 10 +#define ARG2_MASK 0x1f +#define OFFSET_SHIFT 15 +#define OFFSET_SIZE 7 +#define OFFSET_MASK ((1 << OFFSET_SIZE) - 1) + +#define DTRACE_INVOP_PUSHM 1 +#define DTRACE_INVOP_RET 2 +#define DTRACE_INVOP_B 3 + +#elif defined(__mips__) + +#define INSN_SIZE 4 + +/* Load/Store double RA to/from SP */ +#define LDSD_RA_SP_MASK 0xffff0000 +#define LDSD_DATA_MASK 0x0000ffff +#define SD_RA_SP 0xffbf0000 +#define LD_RA_SP 0xdfbf0000 + +#define DTRACE_INVOP_SD 1 +#define DTRACE_INVOP_LD 2 + +#elif defined(__riscv) + +#define DTRACE_INVOP_SD 1 +#define DTRACE_INVOP_C_SDSP 2 +#define DTRACE_INVOP_RET 3 +#define DTRACE_INVOP_C_RET 4 +#define DTRACE_INVOP_NOP 5 + +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DTRACE_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/dtrace_impl.h b/sys/cddl/contrib/opensolaris/uts/common/sys/dtrace_impl.h new file mode 100644 index 000000000000..0b8df9834fa6 --- /dev/null +++ 
b/sys/cddl/contrib/opensolaris/uts/common/sys/dtrace_impl.h @@ -0,0 +1,1351 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + * + * $FreeBSD$ + */ + +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright 2016 Joyent, Inc. + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + +#ifndef _SYS_DTRACE_IMPL_H +#define _SYS_DTRACE_IMPL_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * DTrace Dynamic Tracing Software: Kernel Implementation Interfaces + * + * Note: The contents of this file are private to the implementation of the + * Solaris system and DTrace subsystem and are subject to change at any time + * without notice. Applications and drivers using these interfaces will fail + * to run on future releases. These interfaces should not be used for any + * purpose except those expressly outlined in dtrace(7D) and libdtrace(3LIB). + * Please refer to the "Solaris Dynamic Tracing Guide" for more information. + */ + +#include <sys/dtrace.h> + +#ifndef illumos +#ifdef __sparcv9 +typedef uint32_t pc_t; +#else +typedef uintptr_t pc_t; +#endif +typedef u_long greg_t; +#endif + +/* + * DTrace Implementation Constants and Typedefs + */ +#define DTRACE_MAXPROPLEN 128 +#define DTRACE_DYNVAR_CHUNKSIZE 256 + +#ifdef __FreeBSD__ +#define NCPU MAXCPU +#endif /* __FreeBSD__ */ + +struct dtrace_probe; +struct dtrace_ecb; +struct dtrace_predicate; +struct dtrace_action; +struct dtrace_provider; +struct dtrace_state; + +typedef struct dtrace_probe dtrace_probe_t; +typedef struct dtrace_ecb dtrace_ecb_t; +typedef struct dtrace_predicate dtrace_predicate_t; +typedef struct dtrace_action dtrace_action_t; +typedef struct dtrace_provider dtrace_provider_t; +typedef struct dtrace_meta dtrace_meta_t; +typedef struct dtrace_state dtrace_state_t; +typedef uint32_t dtrace_optid_t; +typedef uint32_t dtrace_specid_t; +typedef uint64_t dtrace_genid_t; + +/* + * DTrace Probes + * + * The probe is the fundamental unit of the DTrace architecture. Probes are + * created by DTrace providers, and managed by the DTrace framework. A probe + * is identified by a unique <provider, module, function, name> tuple, and has + * a unique probe identifier assigned to it. (Some probes are not associated + * with a specific point in text; these are called _unanchored probes_ and have + * no module or function associated with them.) Probes are represented as a + * dtrace_probe structure. To allow quick lookups based on each element of the + * probe tuple, probes are hashed by each of provider, module, function and + * name. (If a lookup is performed based on a regular expression, a + * dtrace_probekey is prepared, and a linear search is performed.) 
Each probe + * is additionally pointed to by a linear array indexed by its identifier. The + * identifier is the provider's mechanism for indicating to the DTrace + * framework that a probe has fired: the identifier is passed as the first + * argument to dtrace_probe(), where it is then mapped into the corresponding + * dtrace_probe structure. From the dtrace_probe structure, dtrace_probe() can + * iterate over the probe's list of enabling control blocks; see "DTrace + * Enabling Control Blocks", below. + */ +struct dtrace_probe { + dtrace_id_t dtpr_id; /* probe identifier */ + dtrace_ecb_t *dtpr_ecb; /* ECB list; see below */ + dtrace_ecb_t *dtpr_ecb_last; /* last ECB in list */ + void *dtpr_arg; /* provider argument */ + dtrace_cacheid_t dtpr_predcache; /* predicate cache ID */ + int dtpr_aframes; /* artificial frames */ + dtrace_provider_t *dtpr_provider; /* pointer to provider */ + char *dtpr_mod; /* probe's module name */ + char *dtpr_func; /* probe's function name */ + char *dtpr_name; /* probe's name */ + dtrace_probe_t *dtpr_nextmod; /* next in module hash */ + dtrace_probe_t *dtpr_prevmod; /* previous in module hash */ + dtrace_probe_t *dtpr_nextfunc; /* next in function hash */ + dtrace_probe_t *dtpr_prevfunc; /* previous in function hash */ + dtrace_probe_t *dtpr_nextname; /* next in name hash */ + dtrace_probe_t *dtpr_prevname; /* previous in name hash */ + dtrace_genid_t dtpr_gen; /* probe generation ID */ +}; + +typedef int dtrace_probekey_f(const char *, const char *, int); + +typedef struct dtrace_probekey { + char *dtpk_prov; /* provider name to match */ + dtrace_probekey_f *dtpk_pmatch; /* provider matching function */ + char *dtpk_mod; /* module name to match */ + dtrace_probekey_f *dtpk_mmatch; /* module matching function */ + char *dtpk_func; /* func name to match */ + dtrace_probekey_f *dtpk_fmatch; /* func matching function */ + char *dtpk_name; /* name to match */ + dtrace_probekey_f *dtpk_nmatch; /* name matching function */ + dtrace_id_t dtpk_id; /* identifier to match */ +} dtrace_probekey_t; + +typedef struct dtrace_hashbucket { + struct dtrace_hashbucket *dthb_next; /* next on hash chain */ + dtrace_probe_t *dthb_chain; /* chain of probes */ + int dthb_len; /* number of probes here */ +} dtrace_hashbucket_t; + +typedef struct dtrace_hash { + dtrace_hashbucket_t **dth_tab; /* hash table */ + int dth_size; /* size of hash table */ + int dth_mask; /* mask to index into table */ + int dth_nbuckets; /* total number of buckets */ + uintptr_t dth_nextoffs; /* offset of next in probe */ + uintptr_t dth_prevoffs; /* offset of prev in probe */ + uintptr_t dth_stroffs; /* offset of str in probe */ +} dtrace_hash_t; + +/* + * DTrace Enabling Control Blocks + * + * When a provider wishes to fire a probe, it calls into dtrace_probe(), + * passing the probe identifier as the first argument. As described above, + * dtrace_probe() maps the identifier into a pointer to a dtrace_probe_t + * structure. This structure contains information about the probe, and a + * pointer to the list of Enabling Control Blocks (ECBs). Each ECB points to + * DTrace consumer state, and contains an optional predicate, and a list of + * actions. (Shown schematically below.) The ECB abstraction allows a single + * probe to be multiplexed across disjoint consumers, or across disjoint + * enablings of a single probe within one consumer.
+ * + * Enabling Control Block + * dtrace_ecb_t + * +------------------------+ + * | dtrace_epid_t ---------+--------------> Enabled Probe ID (EPID) + * | dtrace_state_t * ------+--------------> State associated with this ECB + * | dtrace_predicate_t * --+---------+ + * | dtrace_action_t * -----+----+ | + * | dtrace_ecb_t * ---+ | | | Predicate (if any) + * +-------------------+----+ | | dtrace_predicate_t + * | | +---> +--------------------+ + * | | | dtrace_difo_t * ---+----> DIFO + * | | +--------------------+ + * | | + * Next ECB | | Action + * (if any) | | dtrace_action_t + * : +--> +-------------------+ + * : | dtrace_actkind_t -+------> kind + * v | dtrace_difo_t * --+------> DIFO (if any) + * | dtrace_recdesc_t -+------> record descr. + * | dtrace_action_t * +------+ + * +-------------------+ | + * | Next action + * +-------------------------------+ (if any) + * | + * | Action + * | dtrace_action_t + * +--> +-------------------+ + * | dtrace_actkind_t -+------> kind + * | dtrace_difo_t * --+------> DIFO (if any) + * | dtrace_action_t * +------+ + * +-------------------+ | + * | Next action + * +-------------------------------+ (if any) + * | + * : + * v + * + * + * dtrace_probe() iterates over the ECB list. If the ECB needs less space + * than is available in the principal buffer, the ECB is processed: if the + * predicate is non-NULL, the DIF object is executed. If the result is + * non-zero, the action list is processed, with each action being executed + * accordingly. When the action list has been completely executed, processing + * advances to the next ECB. The ECB abstraction allows disjoint consumers + * to multiplex on single probes. + * + * Execution of the ECB results in consuming dte_size bytes in the buffer + * to record data. During execution, dte_needed bytes must be available in + * the buffer. This space is used for both recorded data and tuple data. 
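+ *
+ * A heavily simplified sketch of that loop follows. It is illustrative
+ * only: the names mirror the structures defined below, the variable
+ * declarations, buffer reservation, alignment and error handling are
+ * elided, and dtrace_dif_emulate() stands in for DIF execution.
+ *
+ *	for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {
+ *		if (ecb->dte_predicate != NULL) {
+ *			dtrace_difo_t *dp = ecb->dte_predicate->dtp_difo;
+ *
+ *			if (dtrace_dif_emulate(dp, &mstate, vstate,
+ *			    state) == 0)
+ *				continue;	<- predicate was false
+ *		}
+ *
+ *		for (act = ecb->dte_action; act != NULL; act = act->dta_next)
+ *			... execute the action, recording into the
+ *			    dte_size bytes reserved for this ECB ...
+ *	}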
+ */ +struct dtrace_ecb { + dtrace_epid_t dte_epid; /* enabled probe ID */ + uint32_t dte_alignment; /* required alignment */ + size_t dte_needed; /* space needed for execution */ + size_t dte_size; /* size of recorded payload */ + dtrace_predicate_t *dte_predicate; /* predicate, if any */ + dtrace_action_t *dte_action; /* actions, if any */ + dtrace_ecb_t *dte_next; /* next ECB on probe */ + dtrace_state_t *dte_state; /* pointer to state */ + uint32_t dte_cond; /* security condition */ + dtrace_probe_t *dte_probe; /* pointer to probe */ + dtrace_action_t *dte_action_last; /* last action on ECB */ + uint64_t dte_uarg; /* library argument */ +}; + +struct dtrace_predicate { + dtrace_difo_t *dtp_difo; /* DIF object */ + dtrace_cacheid_t dtp_cacheid; /* cache identifier */ + int dtp_refcnt; /* reference count */ +}; + +struct dtrace_action { + dtrace_actkind_t dta_kind; /* kind of action */ + uint16_t dta_intuple; /* boolean: in aggregation */ + uint32_t dta_refcnt; /* reference count */ + dtrace_difo_t *dta_difo; /* pointer to DIFO */ + dtrace_recdesc_t dta_rec; /* record description */ + dtrace_action_t *dta_prev; /* previous action */ + dtrace_action_t *dta_next; /* next action */ +}; + +typedef struct dtrace_aggregation { + dtrace_action_t dtag_action; /* action; must be first */ + dtrace_aggid_t dtag_id; /* identifier */ + dtrace_ecb_t *dtag_ecb; /* corresponding ECB */ + dtrace_action_t *dtag_first; /* first action in tuple */ + uint32_t dtag_base; /* base of aggregation */ + uint8_t dtag_hasarg; /* boolean: has argument */ + uint64_t dtag_initial; /* initial value */ + void (*dtag_aggregate)(uint64_t *, uint64_t, uint64_t); +} dtrace_aggregation_t; + +/* + * DTrace Buffers + * + * Principal buffers, aggregation buffers, and speculative buffers are all + * managed with the dtrace_buffer structure. By default, this structure + * includes twin data buffers -- dtb_tomax and dtb_xamot -- that serve as the + * active and passive buffers, respectively. For speculative buffers, + * dtb_xamot will be NULL; for "ring" and "fill" buffers, dtb_xamot will point + * to a scratch buffer. For all buffer types, the dtrace_buffer structure is + * always allocated on a per-CPU basis; a single dtrace_buffer structure is + * never shared among CPUs. (That is, there is never true sharing of the + * dtrace_buffer structure; to prevent false sharing of the structure, it must + * always be aligned to the coherence granularity -- generally 64 bytes.) + * + * One of the critical design decisions of DTrace is that a given ECB always + * stores the same quantity and type of data. This is done to assure that the + * only metadata required for an ECB's traced data is the EPID. That is, from + * the EPID, the consumer can determine the data layout. (The data buffer + * layout is shown schematically below.) By assuring that one can determine + * data layout from the EPID, the metadata stream can be separated from the + * data stream -- simplifying the data stream enormously. The EPID always + * precedes the recorded data, as part of the dtrace_rechdr_t structure that + * also includes a high-resolution timestamp used for output ordering + * consistency. + * + * base of data buffer ---> +--------+--------------------+--------+ + * | rechdr | data | rechdr | + * +--------+------+--------+----+--------+ + * | data | rechdr | data | + * +---------------+--------+-------------+ + * | data, cont.
| + * +--------+--------------------+--------+ + * | rechdr | data | | + * +--------+--------------------+ | + * | || | + * | || | + * | \/ | + * : : + * . . + * . . + * . . + * : : + * | | + * limit of data buffer ---> +--------------------------------------+ + * + * When evaluating an ECB, dtrace_probe() determines if the ECB's needs of the + * principal buffer (both scratch and payload) exceed the available space. If + * the ECB's needs exceed available space (and if the principal buffer policy + * is the default "switch" policy), the ECB is dropped, the buffer's drop count + * is incremented, and processing advances to the next ECB. If the ECB's needs + * can be met with the available space, the ECB is processed, but the offset in + * the principal buffer is only advanced if the ECB completes processing + * without error. + * + * When a buffer is to be switched (either because the buffer is the principal + * buffer with a "switch" policy or because it is an aggregation buffer), a + * cross call is issued to the CPU associated with the buffer. In the cross + * call context, interrupts are disabled, and the active and the inactive + * buffers are atomically switched. This involves switching the data pointers, + * copying the various state fields (offset, drops, errors, etc.) into their + * inactive equivalents, and clearing the state fields. Because interrupts are + * disabled during this procedure, the switch is guaranteed to appear atomic to + * dtrace_probe(). + * + * DTrace Ring Buffering + * + * To process a ring buffer correctly, one must know the oldest valid record. + * Processing starts at the oldest record in the buffer and continues until + * the end of the buffer is reached. Processing then resumes starting with + * the record stored at offset 0 in the buffer, and continues until the + * youngest record is processed. If trace records are of a fixed length, + * determining the oldest record is trivial: + * + * - If the ring buffer has not wrapped, the oldest record is the record + * stored at offset 0. + * + * - If the ring buffer has wrapped, the oldest record is the record stored + * at the current offset. + * + * With variable length records, however, just knowing the current offset + * doesn't suffice for determining the oldest valid record: assuming that one + * allows for arbitrary data, one has no way of searching forward from the + * current offset to find the oldest valid record. (That is, one has no way + * of separating data from metadata.) It would be possible to simply refuse to + * process any data in the ring buffer between the current offset and the + * limit, but this leaves (potentially) an enormous amount of otherwise valid + * data unprocessed. + * + * To effect ring buffering, we track two offsets in the buffer: the current + * offset and the _wrapped_ offset. If a request is made to reserve some + * amount of data, and the buffer has wrapped, the wrapped offset is + * incremented until the wrapped offset minus the current offset is greater + * than or equal to the reserve request. This is done by repeatedly looking + * up the ECB corresponding to the EPID at the current wrapped offset, and + * incrementing the wrapped offset by the size of the data payload + * corresponding to that ECB. If this offset is greater than or equal to the + * limit of the data buffer, the wrapped offset is set to 0. Thus, the + * current offset effectively "chases" the wrapped offset around the buffer.
+ * Schematically: + * + * base of data buffer ---> +------+--------------------+------+ + * | EPID | data | EPID | + * +------+--------+------+----+------+ + * | data | EPID | data | + * +---------------+------+-----------+ + * | data, cont. | + * +------+---------------------------+ + * | EPID | data | + * current offset ---> +------+---------------------------+ + * | invalid data | + * wrapped offset ---> +------+--------------------+------+ + * | EPID | data | EPID | + * +------+--------+------+----+------+ + * | data | EPID | data | + * +---------------+------+-----------+ + * : : + * . . + * . ... valid data ... . + * . . + * : : + * +------+-------------+------+------+ + * | EPID | data | EPID | data | + * +------+------------++------+------+ + * | data, cont. | leftover | + * limit of data buffer ---> +-------------------+--------------+ + * + * If the amount of requested buffer space exceeds the amount of space + * available between the current offset and the end of the buffer: + * + * (1) all words in the data buffer between the current offset and the limit + * of the data buffer (marked "leftover", above) are set to + * DTRACE_EPIDNONE + * + * (2) the wrapped offset is set to zero + * + * (3) the iteration process described above occurs until the wrapped offset + * is greater than the amount of desired space. + * + * The wrapped offset is implemented by (re-)using the inactive offset. + * In a "switch" buffer policy, the inactive offset stores the offset in + * the inactive buffer; in a "ring" buffer policy, it stores the wrapped + * offset. + * + * DTrace Scratch Buffering + * + * Some ECBs may wish to allocate dynamically-sized temporary scratch memory. + * To accommodate such requests easily, scratch memory may be allocated in + * the buffer beyond the current offset plus the needed memory of the current + * ECB. If there isn't sufficient room in the buffer for the requested amount + * of scratch space, the allocation fails and an error is generated. Scratch + * memory is tracked in the dtrace_mstate_t and is automatically freed when + * the ECB ceases processing. Note that ring buffers cannot allocate their + * scratch from the principal buffer -- lest they needlessly overwrite older, + * valid data. Ring buffers therefore have their own dedicated scratch buffer + * from which scratch is allocated. 
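+ *
+ * A simplified sketch of the wrapped-offset advance described under
+ * "DTrace Ring Buffering" above follows. It is illustrative only: offs,
+ * woffs and needed stand for the current offset, the wrapped offset and
+ * the reserve request, the wrap of woffs back to zero at the buffer limit
+ * is elided, and dts_necbs/dts_ecbs are the EPID-to-ECB mapping kept in
+ * the dtrace_state structure defined later in this file.
+ *
+ *	while (offs + needed > woffs) {
+ *		dtrace_epid_t epid;
+ *		size_t size;
+ *
+ *		epid = *(dtrace_epid_t *)((uintptr_t)buf->dtb_tomax + woffs);
+ *
+ *		if (epid == DTRACE_EPIDNONE) {
+ *			size = sizeof (dtrace_epid_t);	<- leftover padding
+ *		} else {
+ *			ASSERT(epid <= state->dts_necbs);
+ *			size = state->dts_ecbs[epid - 1]->dte_size;
+ *		}
+ *
+ *		woffs += size;	<- the oldest record is reclaimed
+ *	}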
+ */
+#define DTRACEBUF_RING 0x0001 /* bufpolicy set to "ring" */
+#define DTRACEBUF_FILL 0x0002 /* bufpolicy set to "fill" */
+#define DTRACEBUF_NOSWITCH 0x0004 /* do not switch buffer */
+#define DTRACEBUF_WRAPPED 0x0008 /* ring buffer has wrapped */
+#define DTRACEBUF_DROPPED 0x0010 /* drops occurred */
+#define DTRACEBUF_ERROR 0x0020 /* errors occurred */
+#define DTRACEBUF_FULL 0x0040 /* "fill" buffer is full */
+#define DTRACEBUF_CONSUMED 0x0080 /* buffer has been consumed */
+#define DTRACEBUF_INACTIVE 0x0100 /* buffer is not yet active */
+
+typedef struct dtrace_buffer {
+ uint64_t dtb_offset; /* current offset in buffer */
+ uint64_t dtb_size; /* size of buffer */
+ uint32_t dtb_flags; /* flags */
+ uint32_t dtb_drops; /* number of drops */
+ caddr_t dtb_tomax; /* active buffer */
+ caddr_t dtb_xamot; /* inactive buffer */
+ uint32_t dtb_xamot_flags; /* inactive flags */
+ uint32_t dtb_xamot_drops; /* drops in inactive buffer */
+ uint64_t dtb_xamot_offset; /* offset in inactive buffer */
+ uint32_t dtb_errors; /* number of errors */
+ uint32_t dtb_xamot_errors; /* errors in inactive buffer */
+#ifndef _LP64
+ uint64_t dtb_pad1; /* pad out to 64 bytes */
+#endif
+ uint64_t dtb_switched; /* time of last switch */
+ uint64_t dtb_interval; /* observed switch interval */
+ uint64_t dtb_pad2[6]; /* pad to avoid false sharing */
+} dtrace_buffer_t;
+
+/*
+ * DTrace Aggregation Buffers
+ *
+ * Aggregation buffers use much of the same mechanism as described above
+ * ("DTrace Buffers"). However, because an aggregation is fundamentally a
+ * hash, there exists dynamic metadata associated with an aggregation buffer
+ * that is not associated with other kinds of buffers. This aggregation
+ * metadata is _only_ relevant for the in-kernel implementation of
+ * aggregations; it is not actually relevant to user-level consumers. To do
+ * this, we allocate dynamic aggregation data (hash keys and hash buckets)
+ * starting below the _limit_ of the buffer, and we allocate data from the
+ * _base_ of the buffer. When the aggregation buffer is copied out, _only_ the
+ * data is copied out; the metadata is simply discarded. Schematically,
+ * aggregation buffers look like:
+ *
+ * base of data buffer ---> +-------+------+-----------+-------+
+ * | aggid | key | value | aggid |
+ * +-------+------+-----------+-------+
+ * | key |
+ * +-------+-------+-----+------------+
+ * | value | aggid | key | value |
+ * +-------+------++-----+------+-----+
+ * | aggid | key | value | |
+ * +-------+------+-------------+ |
+ * | || |
+ * | || |
+ * | \/ |
+ * : :
+ * . .
+ * . .
+ * . .
+ * : :
+ * | /\ |
+ * | || +------------+
+ * | || | |
+ * +---------------------+ |
+ * | hash keys |
+ * | (dtrace_aggkey structures) |
+ * | |
+ * +----------------------------------+
+ * | hash buckets |
+ * | (dtrace_aggbuffer structure) |
+ * | |
+ * limit of data buffer ---> +----------------------------------+
+ *
+ *
+ * As implied above, just as we assure that ECBs always store a constant
+ * amount of data, we assure that a given aggregation -- identified by its
+ * aggregation ID -- always stores data of a constant quantity and type.
+ * As with EPIDs, this allows the aggregation ID to serve as the metadata for a
+ * given record.
+ *
+ * Note that the size of the dtrace_aggkey structure must be sizeof (uintptr_t)
+ * aligned. (If the structure changes such that this becomes false, an
+ * assertion will fail in dtrace_aggregate().)
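+ *
+ * As a rough sketch of the metadata carve-out (the actual initialization
+ * happens lazily, on the first record aggregated into the buffer; the
+ * layout arithmetic here is illustrative rather than a copy of that code),
+ * the dtrace_aggbuffer structure is placed at the very top of the buffer
+ * and the bucket array is carved out just beneath it:
+ *
+ *	agb = (dtrace_aggbuffer_t *)(base + size -
+ *	    sizeof (dtrace_aggbuffer_t));
+ *	agb->dtagb_hash = (dtrace_aggkey_t **)((uintptr_t)agb -
+ *	    hashsize * sizeof (dtrace_aggkey_t *));
+ *	agb->dtagb_hashsize = hashsize;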
+ */
+typedef struct dtrace_aggkey {
+ uint32_t dtak_hashval; /* hash value */
+ uint32_t dtak_action:4; /* action -- 4 bits */
+ uint32_t dtak_size:28; /* size -- 28 bits */
+ caddr_t dtak_data; /* data pointer */
+ struct dtrace_aggkey *dtak_next; /* next in hash chain */
+} dtrace_aggkey_t;
+
+typedef struct dtrace_aggbuffer {
+ uintptr_t dtagb_hashsize; /* number of buckets */
+ uintptr_t dtagb_free; /* free list of keys */
+ dtrace_aggkey_t **dtagb_hash; /* hash table */
+} dtrace_aggbuffer_t;
+
+/*
+ * DTrace Speculations
+ *
+ * Speculations have a per-CPU buffer and a global state. Once a speculation
+ * buffer has been committed or discarded, it cannot be reused until all CPUs
+ * have taken the same action (commit or discard) on their respective
+ * speculative buffer. However, because DTrace probes may execute in arbitrary
+ * context, other CPUs cannot simply be cross-called at probe firing time to
+ * perform the necessary commit or discard. The speculation states thus
+ * optimize for the case that a speculative buffer is only active on one CPU at
+ * the time of a commit() or discard() -- for if this is the case, other CPUs
+ * need not take action, and the speculation is immediately available for
+ * reuse. If the speculation is active on multiple CPUs, it must be
+ * asynchronously cleaned -- potentially leading to a higher rate of dirty
+ * speculative drops. The speculation states are as follows:
+ *
+ * DTRACESPEC_INACTIVE <= Initial state; inactive speculation
+ * DTRACESPEC_ACTIVE <= Allocated, but not yet speculatively traced to
+ * DTRACESPEC_ACTIVEONE <= Speculatively traced to on one CPU
+ * DTRACESPEC_ACTIVEMANY <= Speculatively traced to on more than one CPU
+ * DTRACESPEC_COMMITTING <= Currently being committed on one CPU
+ * DTRACESPEC_COMMITTINGMANY <= Currently being committed on many CPUs
+ * DTRACESPEC_DISCARDING <= Currently being discarded on many CPUs
+ *
+ * The state transition diagram is as follows:
+ *
+ * +----------------------------------------------------------+
+ * | |
+ * | +------------+ |
+ * | +-------------------| COMMITTING |<-----------------+ |
+ * | | +------------+ | |
+ * | | copied spec. ^ commit() on | | discard() on
+ * | | into principal | active CPU | | active CPU
+ * | | | commit() | |
+ * V V | | |
+ * +----------+ +--------+ +-----------+
+ * | INACTIVE |---------------->| ACTIVE |--------------->| ACTIVEONE |
+ * +----------+ speculation() +--------+ speculate() +-----------+
+ * ^ ^ | | |
+ * | | | discard() | |
+ * | | asynchronously | discard() on | | speculate()
+ * | | cleaned V inactive CPU | | on inactive
+ * | | +------------+ | | CPU
+ * | +-------------------| DISCARDING |<-----------------+ |
+ * | +------------+ |
+ * | asynchronously ^ |
+ * | copied spec. 
| discard() | + * | into principal +------------------------+ | + * | | V + * +----------------+ commit() +------------+ + * | COMMITTINGMANY |<----------------------------------| ACTIVEMANY | + * +----------------+ +------------+ + */ +typedef enum dtrace_speculation_state { + DTRACESPEC_INACTIVE = 0, + DTRACESPEC_ACTIVE, + DTRACESPEC_ACTIVEONE, + DTRACESPEC_ACTIVEMANY, + DTRACESPEC_COMMITTING, + DTRACESPEC_COMMITTINGMANY, + DTRACESPEC_DISCARDING +} dtrace_speculation_state_t; + +typedef struct dtrace_speculation { + dtrace_speculation_state_t dtsp_state; /* current speculation state */ + int dtsp_cleaning; /* non-zero if being cleaned */ + dtrace_buffer_t *dtsp_buffer; /* speculative buffer */ +} dtrace_speculation_t; + +/* + * DTrace Dynamic Variables + * + * The dynamic variable problem is obviously decomposed into two subproblems: + * allocating new dynamic storage, and freeing old dynamic storage. The + * presence of the second problem makes the first much more complicated -- or + * rather, the absence of the second renders the first trivial. This is the + * case with aggregations, for which there is effectively no deallocation of + * dynamic storage. (Or more accurately, all dynamic storage is deallocated + * when a snapshot is taken of the aggregation.) As DTrace dynamic variables + * allow for both dynamic allocation and dynamic deallocation, the + * implementation of dynamic variables is quite a bit more complicated than + * that of their aggregation kin. + * + * We observe that allocating new dynamic storage is tricky only because the + * size can vary -- the allocation problem is much easier if allocation sizes + * are uniform. We further observe that in D, the size of dynamic variables is + * actually _not_ dynamic -- dynamic variable sizes may be determined by static + * analysis of DIF text. (This is true even of putatively dynamically-sized + * objects like strings and stacks, the sizes of which are dictated by the + * "stringsize" and "stackframes" variables, respectively.) We exploit this by + * performing this analysis on all DIF before enabling any probes. For each + * dynamic load or store, we calculate the dynamically-allocated size plus the + * size of the dtrace_dynvar structure plus the storage required to key the + * data. For all DIF, we take the largest value and dub it the _chunksize_. + * We then divide dynamic memory into two parts: a hash table that is wide + * enough to have every chunk in its own bucket, and a larger region of equal + * chunksize units. Whenever we wish to dynamically allocate a variable, we + * always allocate a single chunk of memory. Depending on the uniformity of + * allocation, this will waste some amount of memory -- but it eliminates the + * non-determinism inherent in traditional heap fragmentation. + * + * Dynamic objects are allocated by storing a non-zero value to them; they are + * deallocated by storing a zero value to them. Dynamic variables are + * complicated enormously by being shared between CPUs. 
In particular,
+ * consider the following scenario:
+ *
+ * CPU A CPU B
+ * +---------------------------------+ +---------------------------------+
+ * | | | |
+ * | allocates dynamic object a[123] | | |
+ * | by storing the value 345 to it | | |
+ * | ---------> |
+ * | | | wishing to load from object |
+ * | | | a[123], performs lookup in |
+ * | | | dynamic variable space |
+ * | <--------- |
+ * | deallocates object a[123] by | | |
+ * | storing 0 to it | | |
+ * | | | |
+ * | allocates dynamic object b[567] | | performs load from a[123] |
+ * | by storing the value 789 to it | | |
+ * : : : :
+ * . . . .
+ *
+ * This is obviously a race in the D program, but there are nonetheless only
+ * two valid values for CPU B's load from a[123]: 345 or 0. Most importantly,
+ * CPU B may _not_ see the value 789 for a[123].
+ *
+ * There are essentially two ways to deal with this:
+ *
+ * (1) Explicitly spin-lock variables. That is, if CPU B wishes to load
+ * from a[123], it needs to lock a[123] and hold the lock for the
+ * duration that it wishes to manipulate it.
+ *
+ * (2) Avoid reusing freed chunks until it is known that no CPU is referring
+ * to them.
+ *
+ * The implementation of (1) is rife with complexity, because it requires the
+ * user of a dynamic variable to explicitly decree when they are done using it.
+ * Were all variables by value, this perhaps wouldn't be debilitating -- but
+ * dynamic variables of non-scalar types are tracked by reference. That is, if
+ * a dynamic variable is, say, a string, and that variable is to be traced to,
+ * say, the principal buffer, the DIF emulation code returns to the main
+ * dtrace_probe() loop a pointer to the underlying storage, not the contents of
+ * the storage. Further, code calling on DIF emulation would have to be aware
+ * that the DIF emulation has returned a reference to a dynamic variable that
+ * has been potentially locked. The variable would have to be unlocked after
+ * the main dtrace_probe() loop is finished with the variable, and the main
+ * dtrace_probe() loop would have to be careful to not call any further DIF
+ * emulation while the variable is locked to avoid deadlock. More generally,
+ * if one were to implement (1), DIF emulation code dealing with dynamic
+ * variables could only deal with one dynamic variable at a time (lest deadlock
+ * result). In sum, (1) exports too much subtlety to the users of dynamic
+ * variables -- increasing maintenance burden and imposing serious constraints
+ * on future DTrace development.
+ *
+ * The implementation of (2) is also complex, but the complexity is more
+ * manageable. We need to be sure that when a variable is deallocated, it is
+ * not placed on a traditional free list, but rather on a _dirty_ list. Once a
+ * variable is on a dirty list, it cannot be found by CPUs performing a
+ * subsequent lookup of the variable -- but it may still be in use by other
+ * CPUs. To assure that all CPUs that may be seeing the old variable have
+ * cleared out of probe context, a dtrace_sync() can be issued. Once the
+ * dtrace_sync() has completed, it can be known that all CPUs are done
+ * manipulating the dynamic variable -- the dirty list can be atomically
+ * appended to the free list. Unfortunately, there's a slight hiccup in this
+ * mechanism: dtrace_sync() may not be issued from probe context. The
+ * dtrace_sync() must therefore be issued asynchronously from non-probe
+ * context. For this we rely on the DTrace cleaner, a cyclic that runs at the
+ * "cleanrate" frequency. 
To ease this implementation, we define several chunk
+ * lists:
+ *
+ * - Dirty. Deallocated chunks, not yet cleaned. Not available.
+ *
+ * - Rinsing. Formerly dirty chunks that are currently being asynchronously
+ * cleaned. Not available, but will be shortly. Dynamic variable
+ * allocation may not spin or block for availability, however.
+ *
+ * - Clean. Clean chunks, ready for allocation -- but not on the free list.
+ *
+ * - Free. Available for allocation.
+ *
+ * Moreover, to avoid absurd contention, _each_ of these lists is implemented
+ * on a per-CPU basis. This is only for performance, not correctness; chunks
+ * may be allocated from another CPU's free list. The algorithm for allocation
+ * then is this:
+ *
+ * (1) Attempt to atomically allocate from current CPU's free list. If list
+ * is non-empty and allocation is successful, allocation is complete.
+ *
+ * (2) If the clean list is non-empty, atomically move it to the free list,
+ * and reattempt (1).
+ *
+ * (3) If the dynamic variable space is in the CLEAN state, look for free
+ * and clean lists on other CPUs by setting the current CPU to the next
+ * CPU, and reattempting (1). If the next CPU is the current CPU (that
+ * is, if all CPUs have been checked), atomically switch the state of
+ * the dynamic variable space based on the following:
+ *
+ * - If no free chunks were found and no dirty chunks were found,
+ * atomically set the state to EMPTY.
+ *
+ * - If dirty chunks were found, atomically set the state to DIRTY.
+ *
+ * - If rinsing chunks were found, atomically set the state to RINSING.
+ *
+ * (4) Based on the state of the dynamic variable space, increment the
+ * appropriate counter to indicate dynamic drops (if in EMPTY state) vs.
+ * dynamic dirty drops (if in DIRTY state) vs. dynamic rinsing drops (if
+ * in RINSING state). Fail the allocation.
+ *
+ * The cleaning cyclic operates with the following algorithm: for all CPUs
+ * with a non-empty dirty list, atomically move the dirty list to the rinsing
+ * list. Perform a dtrace_sync(). For all CPUs with a non-empty rinsing list,
+ * atomically move the rinsing list to the clean list. Perform another
+ * dtrace_sync(). By this point, all CPUs have seen the new clean list; the
+ * state of the dynamic variable space can be restored to CLEAN.
+ *
+ * There exist two final races that merit explanation. The first is a simple
+ * allocation race:
+ *
+ * CPU A CPU B
+ * +---------------------------------+ +---------------------------------+
+ * | | | |
+ * | allocates dynamic object a[123] | | allocates dynamic object a[123] |
+ * | by storing the value 345 to it | | by storing the value 567 to it |
+ * | | | |
+ * : : : :
+ * . . . .
+ *
+ * Again, this is a race in the D program. It can be resolved by having a[123]
+ * hold the value 345 or a[123] hold the value 567 -- but it must be true that
+ * a[123] have only _one_ of these values. (That is, the racing CPUs may not
+ * put the same element twice on the same hash chain.) This is resolved
+ * simply: before the allocation is undertaken, the start of the new chunk's
+ * hash chain is noted. Later, after the allocation is complete, the hash
+ * chain is atomically switched to point to the new element. If this fails
+ * (because of either concurrent allocations or an allocation concurrent with a
+ * deletion), the newly allocated chunk is deallocated to the dirty list, and
+ * the whole process of looking up (and potentially allocating) the dynamic
+ * variable is reattempted.
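+ *
+ * In terms of the dtrace_casptr() primitive declared later in this file,
+ * that atomic switch amounts to something like the following (a sketch of
+ * the idea, not a copy of the code in dtrace_dynvar(); dvar is the freshly
+ * allocated chunk):
+ *
+ *	start = bucket->dtdh_chain;
+ *	dvar->dtdv_next = start;
+ *	if (dtrace_casptr(&bucket->dtdh_chain, start, dvar) != start)
+ *		goto retry;
+ *
+ * where the retry path first retires dvar to the dirty list, as described
+ * above.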
+ * + * The final race is a simple deallocation race: + * + * CPU A CPU B + * +---------------------------------+ +---------------------------------+ + * | | | | + * | deallocates dynamic object | | deallocates dynamic object | + * | a[123] by storing the value 0 | | a[123] by storing the value 0 | + * | to it | | to it | + * | | | | + * : : : : + * . . . . + * + * Once again, this is a race in the D program, but it is one that we must + * handle without corrupting the underlying data structures. Because + * deallocations require the deletion of a chunk from the middle of a hash + * chain, we cannot use a single-word atomic operation to remove it. For this, + * we add a spin lock to the hash buckets that is _only_ used for deallocations + * (allocation races are handled as above). Further, this spin lock is _only_ + * held for the duration of the delete; before control is returned to the DIF + * emulation code, the hash bucket is unlocked. + */ +typedef struct dtrace_key { + uint64_t dttk_value; /* data value or data pointer */ + uint64_t dttk_size; /* 0 if by-val, >0 if by-ref */ +} dtrace_key_t; + +typedef struct dtrace_tuple { + uint32_t dtt_nkeys; /* number of keys in tuple */ + uint32_t dtt_pad; /* padding */ + dtrace_key_t dtt_key[1]; /* array of tuple keys */ +} dtrace_tuple_t; + +typedef struct dtrace_dynvar { + uint64_t dtdv_hashval; /* hash value -- 0 if free */ + struct dtrace_dynvar *dtdv_next; /* next on list or hash chain */ + void *dtdv_data; /* pointer to data */ + dtrace_tuple_t dtdv_tuple; /* tuple key */ +} dtrace_dynvar_t; + +typedef enum dtrace_dynvar_op { + DTRACE_DYNVAR_ALLOC, + DTRACE_DYNVAR_NOALLOC, + DTRACE_DYNVAR_DEALLOC +} dtrace_dynvar_op_t; + +typedef struct dtrace_dynhash { + dtrace_dynvar_t *dtdh_chain; /* hash chain for this bucket */ + uintptr_t dtdh_lock; /* deallocation lock */ +#ifdef _LP64 + uintptr_t dtdh_pad[6]; /* pad to avoid false sharing */ +#else + uintptr_t dtdh_pad[14]; /* pad to avoid false sharing */ +#endif +} dtrace_dynhash_t; + +typedef struct dtrace_dstate_percpu { + dtrace_dynvar_t *dtdsc_free; /* free list for this CPU */ + dtrace_dynvar_t *dtdsc_dirty; /* dirty list for this CPU */ + dtrace_dynvar_t *dtdsc_rinsing; /* rinsing list for this CPU */ + dtrace_dynvar_t *dtdsc_clean; /* clean list for this CPU */ + uint64_t dtdsc_drops; /* number of capacity drops */ + uint64_t dtdsc_dirty_drops; /* number of dirty drops */ + uint64_t dtdsc_rinsing_drops; /* number of rinsing drops */ +#ifdef _LP64 + uint64_t dtdsc_pad; /* pad to avoid false sharing */ +#else + uint64_t dtdsc_pad[2]; /* pad to avoid false sharing */ +#endif +} dtrace_dstate_percpu_t; + +typedef enum dtrace_dstate_state { + DTRACE_DSTATE_CLEAN = 0, + DTRACE_DSTATE_EMPTY, + DTRACE_DSTATE_DIRTY, + DTRACE_DSTATE_RINSING +} dtrace_dstate_state_t; + +typedef struct dtrace_dstate { + void *dtds_base; /* base of dynamic var. space */ + size_t dtds_size; /* size of dynamic var. space */ + size_t dtds_hashsize; /* number of buckets in hash */ + size_t dtds_chunksize; /* size of each chunk */ + dtrace_dynhash_t *dtds_hash; /* pointer to hash table */ + dtrace_dstate_state_t dtds_state; /* current dynamic var. state */ + dtrace_dstate_percpu_t *dtds_percpu; /* per-CPU dyn. var. state */ +} dtrace_dstate_t; + +/* + * DTrace Variable State + * + * The DTrace variable state tracks user-defined variables in its dtrace_vstate + * structure. 
Each DTrace consumer has exactly one dtrace_vstate structure,
+ * but some dtrace_vstate structures may exist without a corresponding DTrace
+ * consumer (see "DTrace Helpers", below). As described in <sys/dtrace.h>,
+ * user-defined variables can have one of three scopes:
+ *
+ * DIFV_SCOPE_GLOBAL => global scope
+ * DIFV_SCOPE_THREAD => thread-local scope (i.e. "self->" variables)
+ * DIFV_SCOPE_LOCAL => clause-local scope (i.e. "this->" variables)
+ *
+ * The variable state tracks variables by both their scope and their allocation
+ * type:
+ *
+ * - The dtvs_globals and dtvs_locals members each point to an array of
+ * dtrace_statvar structures. These structures contain both the variable
+ * metadata (dtrace_difv structures) and the underlying storage for all
+ * statically allocated variables, including statically allocated
+ * DIFV_SCOPE_GLOBAL variables and all DIFV_SCOPE_LOCAL variables.
+ *
+ * - The dtvs_tlocals member points to an array of dtrace_difv structures for
+ * DIFV_SCOPE_THREAD variables. As such, this array tracks _only_ the
+ * variable metadata for DIFV_SCOPE_THREAD variables; the underlying storage
+ * is allocated out of the dynamic variable space.
+ *
+ * - The dtvs_dynvars member is the dynamic variable state associated with the
+ * variable state. The dynamic variable state (described in "DTrace Dynamic
+ * Variables", above) tracks all DIFV_SCOPE_THREAD variables and all
+ * dynamically-allocated DIFV_SCOPE_GLOBAL variables.
+ */
+typedef struct dtrace_statvar {
+ uint64_t dtsv_data; /* data or pointer to it */
+ size_t dtsv_size; /* size of pointed-to data */
+ int dtsv_refcnt; /* reference count */
+ dtrace_difv_t dtsv_var; /* variable metadata */
+} dtrace_statvar_t;
+
+typedef struct dtrace_vstate {
+ dtrace_state_t *dtvs_state; /* back pointer to state */
+ dtrace_statvar_t **dtvs_globals; /* statically-allocated glbls */
+ int dtvs_nglobals; /* number of globals */
+ dtrace_difv_t *dtvs_tlocals; /* thread-local metadata */
+ int dtvs_ntlocals; /* number of thread-locals */
+ dtrace_statvar_t **dtvs_locals; /* clause-local data */
+ int dtvs_nlocals; /* number of clause-locals */
+ dtrace_dstate_t dtvs_dynvars; /* dynamic variable state */
+} dtrace_vstate_t;
+
+/*
+ * DTrace Machine State
+ *
+ * While processing a fired probe, DTrace needs to track and/or cache some
+ * per-CPU state associated with that particular firing. This is state that
+ * is always discarded after the probe firing has completed, and much of it
+ * is not specific to any DTrace consumer, remaining valid across all ECBs.
+ * This state is tracked in the dtrace_mstate structure.
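+ *
+ * Much of this state is computed lazily: a bit in dtms_present (one of the
+ * DTRACE_MSTATE_* flags below) records whether a given member has been
+ * filled in yet. An illustrative sketch, patterned on (but not identical
+ * to) the timestamp handling in dtrace_dif_variable():
+ *
+ *	if (!(mstate->dtms_present & DTRACE_MSTATE_TIMESTAMP)) {
+ *		mstate->dtms_timestamp = dtrace_gethrtime();
+ *		mstate->dtms_present |= DTRACE_MSTATE_TIMESTAMP;
+ *	}
+ *	return (mstate->dtms_timestamp);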
+ */ +#define DTRACE_MSTATE_ARGS 0x00000001 +#define DTRACE_MSTATE_PROBE 0x00000002 +#define DTRACE_MSTATE_EPID 0x00000004 +#define DTRACE_MSTATE_TIMESTAMP 0x00000008 +#define DTRACE_MSTATE_STACKDEPTH 0x00000010 +#define DTRACE_MSTATE_CALLER 0x00000020 +#define DTRACE_MSTATE_IPL 0x00000040 +#define DTRACE_MSTATE_FLTOFFS 0x00000080 +#define DTRACE_MSTATE_WALLTIMESTAMP 0x00000100 +#define DTRACE_MSTATE_USTACKDEPTH 0x00000200 +#define DTRACE_MSTATE_UCALLER 0x00000400 + +typedef struct dtrace_mstate { + uintptr_t dtms_scratch_base; /* base of scratch space */ + uintptr_t dtms_scratch_ptr; /* current scratch pointer */ + size_t dtms_scratch_size; /* scratch size */ + uint32_t dtms_present; /* variables that are present */ + uint64_t dtms_arg[5]; /* cached arguments */ + dtrace_epid_t dtms_epid; /* current EPID */ + uint64_t dtms_timestamp; /* cached timestamp */ + hrtime_t dtms_walltimestamp; /* cached wall timestamp */ + int dtms_stackdepth; /* cached stackdepth */ + int dtms_ustackdepth; /* cached ustackdepth */ + struct dtrace_probe *dtms_probe; /* current probe */ + uintptr_t dtms_caller; /* cached caller */ + uint64_t dtms_ucaller; /* cached user-level caller */ + int dtms_ipl; /* cached interrupt pri lev */ + int dtms_fltoffs; /* faulting DIFO offset */ + uintptr_t dtms_strtok; /* saved strtok() pointer */ + uintptr_t dtms_strtok_limit; /* upper bound of strtok ptr */ + uint32_t dtms_access; /* memory access rights */ + dtrace_difo_t *dtms_difo; /* current dif object */ + file_t *dtms_getf; /* cached rval of getf() */ +} dtrace_mstate_t; + +#define DTRACE_COND_OWNER 0x1 +#define DTRACE_COND_USERMODE 0x2 +#define DTRACE_COND_ZONEOWNER 0x4 + +#define DTRACE_PROBEKEY_MAXDEPTH 8 /* max glob recursion depth */ + +/* + * Access flag used by dtrace_mstate.dtms_access. + */ +#define DTRACE_ACCESS_KERNEL 0x1 /* the priv to read kmem */ + + +/* + * DTrace Activity + * + * Each DTrace consumer is in one of several states, which (for purposes of + * avoiding yet-another overloading of the noun "state") we call the current + * _activity_. The activity transitions on dtrace_go() (from DTRACIOCGO), on + * dtrace_stop() (from DTRACIOCSTOP) and on the exit() action. Activities may + * only transition in one direction; the activity transition diagram is a + * directed acyclic graph. The activity transition diagram is as follows: + * + * + * +----------+ +--------+ +--------+ + * | INACTIVE |------------------>| WARMUP |------------------>| ACTIVE | + * +----------+ dtrace_go(), +--------+ dtrace_go(), +--------+ + * before BEGIN | after BEGIN | | | + * | | | | + * exit() action | | | | + * from BEGIN ECB | | | | + * | | | | + * v | | | + * +----------+ exit() action | | | + * +-----------------------------| DRAINING |<-------------------+ | | + * | +----------+ | | + * | | | | + * | dtrace_stop(), | | | + * | before END | | | + * | | | | + * | v | | + * | +---------+ +----------+ | | + * | | STOPPED |<----------------| COOLDOWN |<----------------------+ | + * | +---------+ dtrace_stop(), +----------+ dtrace_stop(), | + * | after END before END | + * | | + * | +--------+ | + * +----------------------------->| KILLED |<--------------------------+ + * deadman timeout or +--------+ deadman timeout or + * killed consumer killed consumer + * + * Note that once a DTrace consumer has stopped tracing, there is no way to + * restart it; if a DTrace consumer wishes to restart tracing, it must reopen + * the DTrace pseudodevice. 
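+ *
+ * For example, the exit() action moves a consumer toward DRAINING with a
+ * single atomic operation -- a simplified sketch, using the dtrace_cas32()
+ * primitive declared later in this file (the in-kernel handling also deals
+ * with the WARMUP and COOLDOWN cases):
+ *
+ *	(void) dtrace_cas32((uint32_t *)&state->dts_activity,
+ *	    DTRACE_ACTIVITY_ACTIVE, DTRACE_ACTIVITY_DRAINING);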
+ */ +typedef enum dtrace_activity { + DTRACE_ACTIVITY_INACTIVE = 0, /* not yet running */ + DTRACE_ACTIVITY_WARMUP, /* while starting */ + DTRACE_ACTIVITY_ACTIVE, /* running */ + DTRACE_ACTIVITY_DRAINING, /* before stopping */ + DTRACE_ACTIVITY_COOLDOWN, /* while stopping */ + DTRACE_ACTIVITY_STOPPED, /* after stopping */ + DTRACE_ACTIVITY_KILLED /* killed */ +} dtrace_activity_t; + +/* + * DTrace Helper Implementation + * + * A description of the helper architecture may be found in <sys/dtrace.h>. + * Each process contains a pointer to its helpers in its p_dtrace_helpers + * member. This is a pointer to a dtrace_helpers structure, which contains an + * array of pointers to dtrace_helper structures, helper variable state (shared + * among a process's helpers) and a generation count. (The generation count is + * used to provide an identifier when a helper is added so that it may be + * subsequently removed.) The dtrace_helper structure is self-explanatory, + * containing pointers to the objects needed to execute the helper. Note that + * helpers are _duplicated_ across fork(2), and destroyed on exec(2). No more + * than dtrace_helpers_max are allowed per-process. + */ +#define DTRACE_HELPER_ACTION_USTACK 0 +#define DTRACE_NHELPER_ACTIONS 1 + +typedef struct dtrace_helper_action { + int dtha_generation; /* helper action generation */ + int dtha_nactions; /* number of actions */ + dtrace_difo_t *dtha_predicate; /* helper action predicate */ + dtrace_difo_t **dtha_actions; /* array of actions */ + struct dtrace_helper_action *dtha_next; /* next helper action */ +} dtrace_helper_action_t; + +typedef struct dtrace_helper_provider { + int dthp_generation; /* helper provider generation */ + uint32_t dthp_ref; /* reference count */ + dof_helper_t dthp_prov; /* DOF w/ provider and probes */ +} dtrace_helper_provider_t; + +typedef struct dtrace_helpers { + dtrace_helper_action_t **dthps_actions; /* array of helper actions */ + dtrace_vstate_t dthps_vstate; /* helper action var. state */ + dtrace_helper_provider_t **dthps_provs; /* array of providers */ + uint_t dthps_nprovs; /* count of providers */ + uint_t dthps_maxprovs; /* provider array size */ + int dthps_generation; /* current generation */ + pid_t dthps_pid; /* pid of associated proc */ + int dthps_deferred; /* helper in deferred list */ + struct dtrace_helpers *dthps_next; /* next pointer */ + struct dtrace_helpers *dthps_prev; /* prev pointer */ +} dtrace_helpers_t; + +/* + * DTrace Helper Action Tracing + * + * Debugging helper actions can be arduous. To ease the development and + * debugging of helpers, DTrace contains a tracing-framework-within-a-tracing- + * framework: helper tracing. If dtrace_helptrace_enabled is non-zero (which + * it is by default on DEBUG kernels), all helper activity will be traced to a + * global, in-kernel ring buffer. Each entry includes a pointer to the specific + * helper, the location within the helper, and a trace of all local variables. + * The ring buffer may be displayed in a human-readable format with the + * ::dtrace_helptrace mdb(1) dcmd. 
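+ *
+ * Records in the ring buffer are variable length, so a consumer steps from
+ * one record to the next using dtht_nlocals (a sketch only; ent is assumed
+ * to point at a valid record, and wrap-around is not handled here):
+ *
+ *	next = (dtrace_helptrace_t *)((uintptr_t)ent +
+ *	    sizeof (dtrace_helptrace_t) +
+ *	    (ent->dtht_nlocals - 1) * sizeof (uint64_t));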
+ */
+#define DTRACE_HELPTRACE_NEXT (-1)
+#define DTRACE_HELPTRACE_DONE (-2)
+#define DTRACE_HELPTRACE_ERR (-3)
+
+typedef struct dtrace_helptrace {
+ dtrace_helper_action_t *dtht_helper; /* helper action */
+ int dtht_where; /* where in helper action */
+ int dtht_nlocals; /* number of locals */
+ int dtht_fault; /* type of fault (if any) */
+ int dtht_fltoffs; /* DIF offset */
+ uint64_t dtht_illval; /* faulting value */
+ uint64_t dtht_locals[1]; /* local variables */
+} dtrace_helptrace_t;
+
+/*
+ * DTrace Credentials
+ *
+ * In probe context, we have limited flexibility to examine the credentials
+ * of the DTrace consumer that created a particular enabling. We use
+ * the Least Privilege interfaces to cache the consumer's cred pointer and
+ * some facts about that credential in a dtrace_cred_t structure. These
+ * can limit the consumer's breadth of visibility and what actions the
+ * consumer may take.
+ */
+#define DTRACE_CRV_ALLPROC 0x01
+#define DTRACE_CRV_KERNEL 0x02
+#define DTRACE_CRV_ALLZONE 0x04
+
+#define DTRACE_CRV_ALL (DTRACE_CRV_ALLPROC | DTRACE_CRV_KERNEL | \
+ DTRACE_CRV_ALLZONE)
+
+#define DTRACE_CRA_PROC 0x0001
+#define DTRACE_CRA_PROC_CONTROL 0x0002
+#define DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER 0x0004
+#define DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE 0x0008
+#define DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG 0x0010
+#define DTRACE_CRA_KERNEL 0x0020
+#define DTRACE_CRA_KERNEL_DESTRUCTIVE 0x0040
+
+#define DTRACE_CRA_ALL (DTRACE_CRA_PROC | \
+ DTRACE_CRA_PROC_CONTROL | \
+ DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER | \
+ DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE | \
+ DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG | \
+ DTRACE_CRA_KERNEL | \
+ DTRACE_CRA_KERNEL_DESTRUCTIVE)
+
+typedef struct dtrace_cred {
+ cred_t *dcr_cred;
+ uint8_t dcr_destructive;
+ uint8_t dcr_visible;
+ uint16_t dcr_action;
+} dtrace_cred_t;
+
+/*
+ * DTrace Consumer State
+ *
+ * Each DTrace consumer has an associated dtrace_state structure that contains
+ * its in-kernel DTrace state -- including options, credentials, statistics and
+ * pointers to ECBs, buffers, speculations and formats. A dtrace_state
+ * structure is also allocated for anonymous enablings. When anonymous state
+ * is grabbed, the grabbing consumer's dts_anon pointer is set to the grabbed
+ * dtrace_state structure.
+ */
+struct dtrace_state {
+#ifdef illumos
+ dev_t dts_dev; /* device */
+#else
+ struct cdev *dts_dev; /* device */
+#endif
+ int dts_necbs; /* total number of ECBs */
+ dtrace_ecb_t **dts_ecbs; /* array of ECBs */
+ dtrace_epid_t dts_epid; /* next EPID to allocate */
+ size_t dts_needed; /* greatest needed space */
+ struct dtrace_state *dts_anon; /* anon. state, if grabbed */
+ dtrace_activity_t dts_activity; /* current activity */
+ dtrace_vstate_t dts_vstate; /* variable state */
+ dtrace_buffer_t *dts_buffer; /* principal buffer */
+ dtrace_buffer_t *dts_aggbuffer; /* aggregation buffer */
+ dtrace_speculation_t *dts_speculations; /* speculation array */
+ int dts_nspeculations; /* number of speculations */
+ int dts_naggregations; /* number of aggregations */
+ dtrace_aggregation_t **dts_aggregations; /* aggregation array */
+#ifdef illumos
+ vmem_t *dts_aggid_arena; /* arena for aggregation IDs */
+#else
+ struct unrhdr *dts_aggid_arena; /* arena for aggregation IDs */
+#endif
+ uint64_t dts_errors; /* total number of errors */
+ uint32_t dts_speculations_busy; /* number of spec. 
busy */ + uint32_t dts_speculations_unavail; /* number of spec unavail */ + uint32_t dts_stkstroverflows; /* stack string tab overflows */ + uint32_t dts_dblerrors; /* errors in ERROR probes */ + uint32_t dts_reserve; /* space reserved for END */ + hrtime_t dts_laststatus; /* time of last status */ +#ifdef illumos + cyclic_id_t dts_cleaner; /* cleaning cyclic */ + cyclic_id_t dts_deadman; /* deadman cyclic */ +#else + struct callout dts_cleaner; /* Cleaning callout. */ + struct callout dts_deadman; /* Deadman callout. */ +#endif + hrtime_t dts_alive; /* time last alive */ + char dts_speculates; /* boolean: has speculations */ + char dts_destructive; /* boolean: has dest. actions */ + int dts_nformats; /* number of formats */ + char **dts_formats; /* format string array */ + dtrace_optval_t dts_options[DTRACEOPT_MAX]; /* options */ + dtrace_cred_t dts_cred; /* credentials */ + size_t dts_nretained; /* number of retained enabs */ + int dts_getf; /* number of getf() calls */ + uint64_t dts_rstate[NCPU][2]; /* per-CPU random state */ +}; + +struct dtrace_provider { + dtrace_pattr_t dtpv_attr; /* provider attributes */ + dtrace_ppriv_t dtpv_priv; /* provider privileges */ + dtrace_pops_t dtpv_pops; /* provider operations */ + char *dtpv_name; /* provider name */ + void *dtpv_arg; /* provider argument */ + hrtime_t dtpv_defunct; /* when made defunct */ + struct dtrace_provider *dtpv_next; /* next provider */ +}; + +struct dtrace_meta { + dtrace_mops_t dtm_mops; /* meta provider operations */ + char *dtm_name; /* meta provider name */ + void *dtm_arg; /* meta provider user arg */ + uint64_t dtm_count; /* no. of associated provs. */ +}; + +/* + * DTrace Enablings + * + * A dtrace_enabling structure is used to track a collection of ECB + * descriptions -- before they have been turned into actual ECBs. This is + * created as a result of DOF processing, and is generally used to generate + * ECBs immediately thereafter. However, enablings are also generally + * retained should the probes they describe be created at a later time; as + * each new module or provider registers with the framework, the retained + * enablings are reevaluated, with any new match resulting in new ECBs. To + * prevent probes from being matched more than once, the enabling tracks the + * last probe generation matched, and only matches probes from subsequent + * generations. + */ +typedef struct dtrace_enabling { + dtrace_ecbdesc_t **dten_desc; /* all ECB descriptions */ + int dten_ndesc; /* number of ECB descriptions */ + int dten_maxdesc; /* size of ECB array */ + dtrace_vstate_t *dten_vstate; /* associated variable state */ + dtrace_genid_t dten_probegen; /* matched probe generation */ + dtrace_ecbdesc_t *dten_current; /* current ECB description */ + int dten_error; /* current error value */ + int dten_primed; /* boolean: set if primed */ + struct dtrace_enabling *dten_prev; /* previous enabling */ + struct dtrace_enabling *dten_next; /* next enabling */ +} dtrace_enabling_t; + +/* + * DTrace Anonymous Enablings + * + * Anonymous enablings are DTrace enablings that are not associated with a + * controlling process, but rather derive their enabling from DOF stored as + * properties in the dtrace.conf file. If there is an anonymous enabling, a + * DTrace consumer state and enabling are created on attach. The state may be + * subsequently grabbed by the first consumer specifying the "grabanon" + * option. As long as an anonymous DTrace enabling exists, dtrace(7D) will + * refuse to unload. 
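+ *
+ * The grab itself reduces to a hand-off of the state pointer; its net
+ * effect (a sketch -- the real work is done by dtrace_anon_grab() at open
+ * time, which also transfers the retained enabling) is:
+ *
+ *	state->dts_anon = dtrace_anon.dta_state;
+ *	dtrace_anon.dta_state = NULL;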
+ */
+typedef struct dtrace_anon {
+ dtrace_state_t *dta_state; /* DTrace consumer state */
+ dtrace_enabling_t *dta_enabling; /* pointer to enabling */
+ processorid_t dta_beganon; /* which CPU BEGIN ran on */
+} dtrace_anon_t;
+
+/*
+ * DTrace Error Debugging
+ */
+#ifdef DEBUG
+#define DTRACE_ERRDEBUG
+#endif
+
+#ifdef DTRACE_ERRDEBUG
+
+typedef struct dtrace_errhash {
+ const char *dter_msg; /* error message */
+ int dter_count; /* number of times seen */
+} dtrace_errhash_t;
+
+#define DTRACE_ERRHASHSZ 256 /* must be > number of err msgs */
+
+#endif /* DTRACE_ERRDEBUG */
+
+/*
+ * DTrace Toxic Ranges
+ *
+ * DTrace supports safe loads from probe context; if the address turns out to
+ * be invalid, a bit will be set by the kernel indicating that DTrace
+ * encountered a memory error, and DTrace will propagate the error to the user
+ * accordingly. However, there may exist some regions of memory in which an
+ * arbitrary load can change system state, and from which it is impossible to
+ * recover after such a load has been attempted. Examples of this may
+ * include memory in which programmable I/O registers are mapped (for which a
+ * read may have some implications for the device) or (in the specific case of
+ * UltraSPARC-I and -II) the virtual address hole. The platform is required
+ * to make DTrace aware of these toxic ranges; DTrace will then check that
+ * target addresses are not in a toxic range before attempting to issue a
+ * safe load.
+ */
+typedef struct dtrace_toxrange {
+ uintptr_t dtt_base; /* base of toxic range */
+ uintptr_t dtt_limit; /* limit of toxic range */
+} dtrace_toxrange_t;
+
+#ifdef illumos
+extern uint64_t dtrace_getarg(int, int);
+#else
+extern uint64_t __noinline dtrace_getarg(int, int);
+#endif
+extern greg_t dtrace_getfp(void);
+extern int dtrace_getipl(void);
+extern uintptr_t dtrace_caller(int);
+extern uint32_t dtrace_cas32(uint32_t *, uint32_t, uint32_t);
+extern void *dtrace_casptr(volatile void *, volatile void *, volatile void *);
+extern void dtrace_copyin(uintptr_t, uintptr_t, size_t, volatile uint16_t *);
+extern void dtrace_copyinstr(uintptr_t, uintptr_t, size_t, volatile uint16_t *);
+extern void dtrace_copyout(uintptr_t, uintptr_t, size_t, volatile uint16_t *);
+extern void dtrace_copyoutstr(uintptr_t, uintptr_t, size_t,
+ volatile uint16_t *);
+extern void dtrace_getpcstack(pc_t *, int, int, uint32_t *);
+extern ulong_t dtrace_getreg(struct trapframe *, uint_t);
+extern int dtrace_getstackdepth(int);
+extern void dtrace_getupcstack(uint64_t *, int);
+extern void dtrace_getufpstack(uint64_t *, uint64_t *, int);
+extern int dtrace_getustackdepth(void);
+extern uintptr_t dtrace_fulword(void *);
+extern uint8_t dtrace_fuword8(void *);
+extern uint16_t dtrace_fuword16(void *);
+extern uint32_t dtrace_fuword32(void *);
+extern uint64_t dtrace_fuword64(void *);
+extern void dtrace_probe_error(dtrace_state_t *, dtrace_epid_t, int, int,
+ int, uintptr_t);
+extern int dtrace_assfail(const char *, const char *, int);
+extern int dtrace_attached(void);
+#ifdef illumos
+extern hrtime_t dtrace_gethrestime(void);
+#endif
+
+#ifdef __sparc
+extern void dtrace_flush_windows(void);
+extern void dtrace_flush_user_windows(void);
+extern uint_t dtrace_getotherwin(void);
+extern uint_t dtrace_getfprs(void);
+#else
+extern void dtrace_copy(uintptr_t, uintptr_t, size_t);
+extern void dtrace_copystr(uintptr_t, uintptr_t, size_t, volatile uint16_t *);
+#endif
+
+/*
+ * DTrace Assertions
+ *
+ * DTrace calls ASSERT and VERIFY from probe context. 
To assure that a failed + * ASSERT or VERIFY does not induce a markedly more catastrophic failure (e.g., + * one from which a dump cannot be gleaned), DTrace must define its own ASSERT + * and VERIFY macros to be ones that may safely be called from probe context. + * This header file must thus be included by any DTrace component that calls + * ASSERT and/or VERIFY from probe context, and _only_ by those components. + * (The only exception to this is kernel debugging infrastructure at user-level + * that doesn't depend on calling ASSERT.) + */ +#undef ASSERT +#undef VERIFY +#define VERIFY(EX) ((void)((EX) || \ + dtrace_assfail(#EX, __FILE__, __LINE__))) +#ifdef DEBUG +#define ASSERT(EX) ((void)((EX) || \ + dtrace_assfail(#EX, __FILE__, __LINE__))) +#else +#define ASSERT(X) ((void)0) +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DTRACE_IMPL_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/errorq.h b/sys/cddl/contrib/opensolaris/uts/common/sys/errorq.h new file mode 100644 index 000000000000..971b19e6ccd8 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/sys/errorq.h @@ -0,0 +1,83 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _ERRORQ_H +#define _ERRORQ_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/time.h> +#include <sys/nvpair.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct errorq errorq_t; +typedef struct errorq_elem errorq_elem_t; +typedef void (*errorq_func_t)(void *, const void *, const errorq_elem_t *); + +/* + * Public flags for errorq_create(): bit range 0-15 + */ +#define ERRORQ_VITAL 0x0001 /* drain queue automatically on system reset */ + +/* + * Public flags for errorq_dispatch(): + */ +#define ERRORQ_ASYNC 0 /* schedule async queue drain for caller */ +#define ERRORQ_SYNC 1 /* do not schedule drain; caller will drain */ + +#ifdef _KERNEL + +extern errorq_t *errorq_create(const char *, errorq_func_t, void *, + ulong_t, size_t, uint_t, uint_t); + +extern errorq_t *errorq_nvcreate(const char *, errorq_func_t, void *, + ulong_t, size_t, uint_t, uint_t); + +extern void errorq_destroy(errorq_t *); +extern void errorq_dispatch(errorq_t *, const void *, size_t, uint_t); +extern void errorq_drain(errorq_t *); +extern void errorq_init(void); +extern void errorq_panic(void); +extern errorq_elem_t *errorq_reserve(errorq_t *); +extern void errorq_commit(errorq_t *, errorq_elem_t *, uint_t); +extern void errorq_cancel(errorq_t *, errorq_elem_t *); +extern nvlist_t *errorq_elem_nvl(errorq_t *, const errorq_elem_t *); +extern nv_alloc_t *errorq_elem_nva(errorq_t *, const errorq_elem_t *); +extern void *errorq_elem_dup(errorq_t *, const errorq_elem_t *, + errorq_elem_t **); +extern void errorq_dump(); + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _ERRORQ_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/extdirent.h b/sys/cddl/contrib/opensolaris/uts/common/sys/extdirent.h new file mode 100644 index 000000000000..3f9a665f0076 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/sys/extdirent.h @@ -0,0 +1,77 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_EXTDIRENT_H +#define _SYS_EXTDIRENT_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/types.h> + +#if defined(_KERNEL) + +/* + * Extended file-system independent directory entry. This style of + * dirent provides additional informational flag bits for each + * directory entry. This dirent will be returned instead of the + * standard dirent if a VOP_READDIR() requests dirent flags via + * V_RDDIR_ENTFLAGS, and if the file system supports the flags. 
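+ *
+ * A consumer might walk a buffer of such entries by stepping through the
+ * ed_reclen fields (a sketch; buf and bytes are assumed to describe a
+ * buffer already filled in by VOP_READDIR(), and the macros used are
+ * defined below):
+ *
+ *	edirent_t *ep = (edirent_t *)buf;
+ *
+ *	while ((caddr_t)ep < (caddr_t)buf + bytes) {
+ *		if (ED_CASE_CONFLICTS(ep))
+ *			conflicts++;
+ *		ep = (edirent_t *)((caddr_t)ep + ep->ed_reclen);
+ *	}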
+ */
+typedef struct edirent {
+ ino64_t ed_ino; /* "inode number" of entry */
+ off64_t ed_off; /* offset of disk directory entry */
+ uint32_t ed_eflags; /* per-entry flags */
+ unsigned short ed_reclen; /* length of this record */
+ char ed_name[1]; /* name of file */
+} edirent_t;
+
+#define EDIRENT_RECLEN(namelen) \
+ ((offsetof(edirent_t, ed_name[0]) + 1 + (namelen) + 7) & ~ 7)
+#define EDIRENT_NAMELEN(reclen) \
+ ((reclen) - (offsetof(edirent_t, ed_name[0])))
+
+/*
+ * Extended entry flags
+ * Extended entries include a bitfield of extra information
+ * regarding that entry.
+ */
+#define ED_CASE_CONFLICT 0x10 /* Ignoring case, entry is not unique */
+
+/*
+ * Extended flags accessor function
+ */
+#define ED_CASE_CONFLICTS(x) ((x)->ed_eflags & ED_CASE_CONFLICT)
+
+#endif /* defined(_KERNEL) */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_EXTDIRENT_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/fasttrap.h b/sys/cddl/contrib/opensolaris/uts/common/sys/fasttrap.h
new file mode 100644
index 000000000000..ec3582f5daa8
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/fasttrap.h
@@ -0,0 +1,98 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_FASTTRAP_H
+#define _SYS_FASTTRAP_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/fasttrap_isa.h>
+#include <sys/dtrace.h>
+#include <sys/types.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef illumos
+#define FASTTRAPIOC (('m' << 24) | ('r' << 16) | ('f' << 8))
+#define FASTTRAPIOC_MAKEPROBE (FASTTRAPIOC | 1)
+#define FASTTRAPIOC_GETINSTR (FASTTRAPIOC | 2)
+#else
+#define FASTTRAPIOC_GETINSTR _IOWR('f', 2, uint8_t)
+#define FASTTRAPIOC_MAKEPROBE _IO('f', 3)
+#endif
+
+typedef enum fasttrap_probe_type {
+ DTFTP_NONE = 0,
+ DTFTP_ENTRY,
+ DTFTP_RETURN,
+ DTFTP_OFFSETS,
+ DTFTP_POST_OFFSETS,
+ DTFTP_IS_ENABLED
+} fasttrap_probe_type_t;
+
+typedef struct fasttrap_probe_spec {
+ pid_t ftps_pid;
+ fasttrap_probe_type_t ftps_type;
+
+ char ftps_func[DTRACE_FUNCNAMELEN];
+ char ftps_mod[DTRACE_MODNAMELEN];
+
+ uint64_t ftps_pc;
+ uint64_t ftps_size;
+ uint64_t ftps_noffs;
+ uint64_t ftps_offs[1];
+} fasttrap_probe_spec_t;
+
+typedef struct fasttrap_instr_query {
+ uint64_t ftiq_pc;
+ pid_t ftiq_pid;
+ fasttrap_instr_t ftiq_instr;
+} fasttrap_instr_query_t;
+
+/*
+ * To support the fasttrap provider from very early in a process's life,
+ * the run-time linker, ld.so.1, has a program header of type PT_SUNWDTRACE
+ * which points to a data object which must be PT_SUNWDTRACE_SIZE bytes.
+ * This structure mimics the fasttrap provider section of the ulwp_t structure.
+ * When the fasttrap provider is changed to require new or different + * instructions, the data object in ld.so.1 and the thread initializers in libc + * (libc_init() and _thrp_create()) need to be updated to include the new + * instructions, and PT_SUNWDTRACE needs to be changed to a new unique number + * (while the old value gets assigned something like PT_SUNWDTRACE_1). Since the + * linker must be backward compatible with old Solaris releases, it must have + * program headers for each of the PT_SUNWDTRACE versions. The kernel's + * elfexec() function only has to look for the latest version of the + * PT_SUNWDTRACE program header. + */ +#define PT_SUNWDTRACE_SIZE FASTTRAP_SUNWDTRACE_SIZE + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FASTTRAP_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/fasttrap_impl.h b/sys/cddl/contrib/opensolaris/uts/common/sys/fasttrap_impl.h new file mode 100644 index 000000000000..d2fbf5f4981a --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/sys/fasttrap_impl.h @@ -0,0 +1,235 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _FASTTRAP_IMPL_H +#define _FASTTRAP_IMPL_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/dtrace.h> +#include <sys/proc.h> +#include <sys/queue.h> +#include <sys/fasttrap.h> +#include <sys/fasttrap_isa.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Fasttrap Providers, Probes and Tracepoints + * + * Each Solaris process can have multiple providers -- the pid provider as + * well as any number of user-level statically defined tracing (USDT) + * providers. Those providers are each represented by a fasttrap_provider_t. + * All providers for a given process have a pointer to a shared + * fasttrap_proc_t. The fasttrap_proc_t has two states: active or defunct. + * When the count of active providers goes to zero it becomes defunct; a + * provider drops its active count when it is removed individually or as part + * of a mass removal when a process exits or performs an exec. + * + * Each probe is represented by a fasttrap_probe_t which has a pointer to + * its associated provider as well as a list of fasttrap_id_tp_t structures + * which are tuples combining a fasttrap_id_t and a fasttrap_tracepoint_t. + * A fasttrap_tracepoint_t represents the actual point of instrumentation + * and it contains two lists of fasttrap_id_t structures (to be fired pre- + * and post-instruction emulation) that identify the probes attached to the + * tracepoint. 
Tracepoints also have a pointer to the fasttrap_proc_t for the
+ * process they trace, which is used when looking up a tracepoint both when a
+ * probe fires and when enabling and disabling probes.
+ *
+ * It's important to note that probes are preallocated with the necessary
+ * number of tracepoints, but that tracepoints can be shared by probes and
+ * swapped between probes. If a probe's preallocated tracepoint is enabled
+ * (and, therefore, the associated probe is enabled), and that probe is
+ * then disabled, ownership of that tracepoint may be exchanged for an
+ * unused tracepoint belonging to another probe that was attached to the
+ * enabled tracepoint.
+ *
+ * On FreeBSD, fasttrap providers also maintain per-thread scratch space for use
+ * by the ISA-specific fasttrap code. The fasttrap_scrblock_t type stores the
+ * virtual address of a page-sized memory block that is mapped into a process'
+ * address space. Each block is carved up into chunks (fasttrap_scrspace_t) for
+ * use by individual threads, which keep the address of their scratch space
+ * chunk in their struct kdtrace_thread. A thread's scratch space isn't released
+ * until it exits.
+ */
+
+#ifndef illumos
+typedef struct fasttrap_scrblock {
+ vm_offset_t ftsb_addr; /* address of a scratch block */
+ LIST_ENTRY(fasttrap_scrblock) ftsb_next;/* next block in list */
+} fasttrap_scrblock_t;
+#define FASTTRAP_SCRBLOCK_SIZE PAGE_SIZE
+
+typedef struct fasttrap_scrspace {
+ uintptr_t ftss_addr; /* scratch space address */
+ LIST_ENTRY(fasttrap_scrspace) ftss_next;/* next in list */
+} fasttrap_scrspace_t;
+#define FASTTRAP_SCRSPACE_SIZE 64
+#endif
+
+typedef struct fasttrap_proc {
+ pid_t ftpc_pid; /* process ID for this proc */
+ uint64_t ftpc_acount; /* count of active providers */
+ uint64_t ftpc_rcount; /* count of extant providers */
+ kmutex_t ftpc_mtx; /* lock on all but acount */
+ struct fasttrap_proc *ftpc_next; /* next proc in hash chain */
+#ifndef illumos
+ LIST_HEAD(, fasttrap_scrblock) ftpc_scrblks; /* mapped scratch blocks */
+ LIST_HEAD(, fasttrap_scrspace) ftpc_fscr; /* free scratch space */
+ LIST_HEAD(, fasttrap_scrspace) ftpc_ascr; /* used scratch space */
+#endif
+} fasttrap_proc_t;
+
+typedef struct fasttrap_provider {
+ pid_t ftp_pid; /* process ID for this prov */
+ char ftp_name[DTRACE_PROVNAMELEN]; /* prov name (w/o the pid) */
+ dtrace_provider_id_t ftp_provid; /* DTrace provider handle */
+ uint_t ftp_marked; /* mark for possible removal */
+ uint_t ftp_retired; /* mark when retired */
+ kmutex_t ftp_mtx; /* provider lock */
+ kmutex_t ftp_cmtx; /* lock on creating probes */
+ uint64_t ftp_rcount; /* enabled probes ref count */
+ uint64_t ftp_ccount; /* consumers creating probes */
+ uint64_t ftp_mcount; /* meta provider count */
+ fasttrap_proc_t *ftp_proc; /* shared proc for all provs */
+ struct fasttrap_provider *ftp_next; /* next prov in hash chain */
+} fasttrap_provider_t;
+
+typedef struct fasttrap_id fasttrap_id_t;
+typedef struct fasttrap_probe fasttrap_probe_t;
+typedef struct fasttrap_tracepoint fasttrap_tracepoint_t;
+
+struct fasttrap_id {
+ fasttrap_probe_t *fti_probe; /* referring probe */
+ fasttrap_id_t *fti_next; /* enabled probe list on tp */
+ fasttrap_probe_type_t fti_ptype; /* probe type */
+};
+
+typedef struct fasttrap_id_tp {
+ fasttrap_id_t fit_id;
+ fasttrap_tracepoint_t *fit_tp;
+} fasttrap_id_tp_t;
+
+struct fasttrap_probe {
+ dtrace_id_t ftp_id; /* DTrace probe identifier */
+ pid_t ftp_pid; /* pid for this probe */
+ fasttrap_provider_t *ftp_prov; /* this 
probe's provider */ + uintptr_t ftp_faddr; /* associated function's addr */ + size_t ftp_fsize; /* associated function's size */ + uint64_t ftp_gen; /* modification generation */ + uint64_t ftp_ntps; /* number of tracepoints */ + uint8_t *ftp_argmap; /* native to translated args */ + uint8_t ftp_nargs; /* translated argument count */ + uint8_t ftp_enabled; /* is this probe enabled */ + char *ftp_xtypes; /* translated types index */ + char *ftp_ntypes; /* native types index */ + fasttrap_id_tp_t ftp_tps[1]; /* flexible array */ +}; + +#define FASTTRAP_ID_INDEX(id) \ +((fasttrap_id_tp_t *)(((char *)(id) - offsetof(fasttrap_id_tp_t, fit_id))) - \ +&(id)->fti_probe->ftp_tps[0]) + +struct fasttrap_tracepoint { + fasttrap_proc_t *ftt_proc; /* associated process struct */ + uintptr_t ftt_pc; /* address of tracepoint */ + pid_t ftt_pid; /* pid of tracepoint */ + fasttrap_machtp_t ftt_mtp; /* ISA-specific portion */ + fasttrap_id_t *ftt_ids; /* NULL-terminated list */ + fasttrap_id_t *ftt_retids; /* NULL-terminated list */ + fasttrap_tracepoint_t *ftt_next; /* link in global hash */ +}; + +typedef struct fasttrap_bucket { + kmutex_t ftb_mtx; /* bucket lock */ + void *ftb_data; /* data payload */ + + uint8_t ftb_pad[64 - sizeof (kmutex_t) - sizeof (void *)]; +} fasttrap_bucket_t; + +typedef struct fasttrap_hash { + ulong_t fth_nent; /* power-of-2 num. of entries */ + ulong_t fth_mask; /* fth_nent - 1 */ + fasttrap_bucket_t *fth_table; /* array of buckets */ +} fasttrap_hash_t; + +/* + * If at some future point these assembly functions become observable by + * DTrace, then these defines should become separate functions so that the + * fasttrap provider doesn't trigger probes during internal operations. + */ +#define fasttrap_copyout copyout +#define fasttrap_fuword32 fuword32 +#define fasttrap_suword32 suword32 +#define fasttrap_suword64 suword64 + +#ifdef __amd64__ +#define fasttrap_fulword fuword64 +#define fasttrap_sulword suword64 +#else +#define fasttrap_fulword fuword32 +#define fasttrap_sulword suword32 +#endif + +extern void fasttrap_sigtrap(proc_t *, kthread_t *, uintptr_t); +#ifndef illumos +extern fasttrap_scrspace_t *fasttrap_scraddr(struct thread *, + fasttrap_proc_t *); +#endif + +extern dtrace_id_t fasttrap_probe_id; +extern fasttrap_hash_t fasttrap_tpoints; + +#ifndef illumos +extern struct rmlock fasttrap_tp_lock; +#endif + +#define FASTTRAP_TPOINTS_INDEX(pid, pc) \ + (((pc) / sizeof (fasttrap_instr_t) + (pid)) & fasttrap_tpoints.fth_mask) + +/* + * Must be implemented by fasttrap_isa.c + */ +extern int fasttrap_tracepoint_init(proc_t *, fasttrap_tracepoint_t *, + uintptr_t, fasttrap_probe_type_t); +extern int fasttrap_tracepoint_install(proc_t *, fasttrap_tracepoint_t *); +extern int fasttrap_tracepoint_remove(proc_t *, fasttrap_tracepoint_t *); + +struct trapframe; +extern int fasttrap_pid_probe(struct trapframe *); +extern int fasttrap_return_probe(struct trapframe *); + +extern uint64_t fasttrap_pid_getarg(void *, dtrace_id_t, void *, int, int); +extern uint64_t fasttrap_usdt_getarg(void *, dtrace_id_t, void *, int, int); + +#ifdef __cplusplus +} +#endif + +#endif /* _FASTTRAP_IMPL_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/feature_tests.h b/sys/cddl/contrib/opensolaris/uts/common/sys/feature_tests.h new file mode 100644 index 000000000000..717da1e3a00e --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/sys/feature_tests.h @@ -0,0 +1,431 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common 
Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2013 Garrett D'Amore <garrett@damore.org> + * + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_FEATURE_TESTS_H +#define _SYS_FEATURE_TESTS_H + +#include <sys/ccompile.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Values of _POSIX_C_SOURCE + * + * undefined not a POSIX compilation + * 1 POSIX.1-1990 compilation + * 2 POSIX.2-1992 compilation + * 199309L POSIX.1b-1993 compilation (Real Time) + * 199506L POSIX.1c-1995 compilation (POSIX Threads) + * 200112L POSIX.1-2001 compilation (Austin Group Revision) + * 200809L POSIX.1-2008 compilation + */ +#if defined(_POSIX_SOURCE) && !defined(_POSIX_C_SOURCE) +#define _POSIX_C_SOURCE 1 +#endif + +/* + * The feature test macros __XOPEN_OR_POSIX, _STRICT_STDC, _STRICT_SYMBOLS, + * and _STDC_C99 are Sun implementation-specific macros created in order to + * compress common standards-specified feature test macros for easier reading. + * These macros should not be used by the application developer as + * unexpected results may occur. Instead, the user should reference + * standards(5) for correct usage of the standards feature test macros. + * + * __XOPEN_OR_POSIX Used in cases where a symbol is defined by both + * X/Open and POSIX or, in the negative, when neither + * X/Open nor POSIX defines a symbol. + * + * _STRICT_STDC __STDC__ is specified by the C Standards and defined + * by the compiler. For Sun compilers the value of + * __STDC__ is either 1, 0, or not defined based on the + * compilation mode (see cc(1)). When the value of + * __STDC__ is 1 and in the absence of any other feature + * test macros, the namespace available to the application + * is limited to only those symbols defined by the C + * Standard. _STRICT_STDC provides a more readable means + * of identifying symbols defined by the standard, or in + * the negative, symbols that are extensions to the C + * Standard. See additional comments for GNU C differences. + * + * _STDC_C99 __STDC_VERSION__ is specified by the C standards and + * defined by the compiler and indicates the version of + * the C standard. A value of 199901L indicates a + * compiler that complies with ISO/IEC 9899:1999, other- + * wise known as the C99 standard. + * + * _STRICT_SYMBOLS Used in cases where symbol visibility is restricted + * by the standards, and the user has not explicitly + * relaxed the strictness via __EXTENSIONS__. + */ + +#if defined(_XOPEN_SOURCE) || defined(_POSIX_C_SOURCE) +#define __XOPEN_OR_POSIX +#endif + +/* + * ISO/IEC 9899:1990 and its revision, ISO/IEC 9899:1999, specify the + * following predefined macro name: + * + * __STDC__ The integer constant 1, intended to indicate a conforming + * implementation.
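The guard logic above is easiest to see from the application side. Here is a minimal, self-contained sketch of the same idea; MY_XOPEN_OR_POSIX is an illustrative stand-in of my own, not the header's private __XOPEN_OR_POSIX. Building with cc -D_POSIX_C_SOURCE=200112L demo.c takes the first branch; a plain cc demo.c takes the second.

    #include <stdio.h>

    /*
     * Stand-in for the private __XOPEN_OR_POSIX macro: set whenever the
     * application has requested an X/Open or POSIX namespace.
     */
    #if defined(_XOPEN_SOURCE) || defined(_POSIX_C_SOURCE)
    #define MY_XOPEN_OR_POSIX
    #endif

    int
    main(void)
    {
    #ifdef MY_XOPEN_OR_POSIX
        (void) printf("standards-conforming namespace requested\n");
    #else
        (void) printf("default namespace (extensions visible)\n");
    #endif
        return (0);
    }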
+ * + * Furthermore, a strictly conforming program shall use only those features + * of the language and library specified in these standards. A conforming + * implementation shall accept any strictly conforming program. + * + * Based on these requirements, Sun's C compiler defines __STDC__ to 1 for + * strictly conforming environments and __STDC__ to 0 for environments that + * use ANSI C semantics but allow extensions to the C standard. For non-ANSI + * C semantics, Sun's C compiler does not define __STDC__. + * + * The GNU C project interpretation is that __STDC__ should always be defined + * to 1 for compilation modes that accept ANSI C syntax regardless of whether + * or not extensions to the C standard are used. Violations of conforming + * behavior are conditionally flagged as warnings via the use of the + * -pedantic option. In addition to defining __STDC__ to 1, the GNU C + * compiler also defines __STRICT_ANSI__ as a means of specifying strictly + * conforming environments using the -ansi or -std=<standard> options. + * + * In the absence of any other compiler options, the Sun and GNU compilers + * set the value of __STDC__ as follows for the options shown: + * + * Value of __STDC__ __STRICT_ANSI__ + * + * cc -Xa (default) 0 undefined + * cc -Xt (transitional) 0 undefined + * cc -Xc (strictly conforming) 1 undefined + * cc -Xs (K&R C) undefined undefined + * + * gcc (default) 1 undefined + * gcc -ansi, -std={c89, c99, ...} 1 defined + * gcc -traditional (K&R) undefined undefined + * + * The default compilation modes for Sun C compilers versus GNU C compilers + * result in differing values for __STDC__, which results in a more + * restricted namespace when using Sun compilers. To allow both GNU and Sun + * interpretations to peacefully co-exist, we use the following Sun + * implementation _STRICT_STDC macro: + */ + +#if (__STDC__ - 0 == 1 && !defined(__GNUC__)) || \ + (defined(__GNUC__) && defined(__STRICT_ANSI__)) +#define _STRICT_STDC +#else +#undef _STRICT_STDC +#endif + +/* + * Compiler complies with ISO/IEC 9899:1999 + */ + +#if __STDC_VERSION__ - 0 >= 199901L +#ifndef _STDC_C99 +#define _STDC_C99 +#endif +#endif + +/* + * Use strict symbol visibility. + */ +#if (defined(_STRICT_STDC) || defined(__XOPEN_OR_POSIX)) && \ + !defined(__EXTENSIONS__) +#define _STRICT_SYMBOLS +#endif + +/* + * Large file interfaces: + * + * _LARGEFILE_SOURCE + * 1 large file-related additions to POSIX + * interfaces requested (fseeko, etc.) + * _LARGEFILE64_SOURCE + * 1 transitional large-file-related interfaces + * requested (seek64, stat64, etc.) + * + * The corresponding announcement macros are respectively: + * _LFS_LARGEFILE + * _LFS64_LARGEFILE + * (These are set in <unistd.h>.) + * + * Requesting _LARGEFILE64_SOURCE implies requesting _LARGEFILE_SOURCE as + * well. + * + * The large file interfaces are made visible regardless of the initial values + * of the feature test macros under certain circumstances: + * - If no explicit standards-conforming environment is requested (neither + * _POSIX_SOURCE nor _XOPEN_SOURCE is defined and the value of + * __STDC__ does not imply standards conformance). + * - Extended system interfaces are explicitly requested (__EXTENSIONS__ + * is defined). + * - Access to in-kernel interfaces is requested (_KERNEL or _KMEMUSER is + * defined). (Note that this dependency is an artifact of the current + * kernel implementation and may change in future releases.)
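For the large-file interfaces, the practical effect of requesting _LARGEFILE_SOURCE is that fseeko() and ftello() become visible and traffic in off_t rather than long. A hedged user-level sketch (offdemo.c is a hypothetical file name; on a 32-bit target, cc -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 offdemo.c yields a 64-bit off_t):

    #include <stdio.h>
    #include <sys/types.h>

    int
    main(int argc, char *argv[])
    {
        FILE *fp;
        off_t end;

        if (argc < 2 || (fp = fopen(argv[1], "rb")) == NULL)
            return (1);

        /* fseeko/ftello use off_t, which is 64 bits wide here */
        if (fseeko(fp, 0, SEEK_END) != 0) {
            (void) fclose(fp);
            return (1);
        }
        end = ftello(fp);
        (void) printf("size: %lld bytes (sizeof (off_t) = %zu)\n",
            (long long)end, sizeof (off_t));
        (void) fclose(fp);
        return (0);
    }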
+ */ +#if (!defined(_STRICT_STDC) && !defined(__XOPEN_OR_POSIX)) || \ + defined(_KERNEL) || defined(_KMEMUSER) || \ + defined(__EXTENSIONS__) +#undef _LARGEFILE64_SOURCE +#define _LARGEFILE64_SOURCE 1 +#endif +#if _LARGEFILE64_SOURCE - 0 == 1 +#undef _LARGEFILE_SOURCE +#define _LARGEFILE_SOURCE 1 +#endif + +/* + * Large file compilation environment control: + * + * The setting of _FILE_OFFSET_BITS controls the size of various file-related + * types and governs the mapping between file-related source function symbol + * names and the corresponding binary entry points. + * + * In the 32-bit environment, the default value is 32; if not set, set it to + * the default here, to simplify tests in other headers. + * + * In the 64-bit compilation environment, the only value allowed is 64. + */ +#if defined(_LP64) +#ifndef _FILE_OFFSET_BITS +#define _FILE_OFFSET_BITS 64 +#endif +#if _FILE_OFFSET_BITS - 0 != 64 +#error "invalid _FILE_OFFSET_BITS value specified" +#endif +#else /* _LP64 */ +#ifndef _FILE_OFFSET_BITS +#define _FILE_OFFSET_BITS 32 +#endif +#if _FILE_OFFSET_BITS - 0 != 32 && _FILE_OFFSET_BITS - 0 != 64 +#error "invalid _FILE_OFFSET_BITS value specified" +#endif +#endif /* _LP64 */ + +/* + * Use of _XOPEN_SOURCE + * + * The following X/Open specifications are supported: + * + * X/Open Portability Guide, Issue 3 (XPG3) + * X/Open CAE Specification, Issue 4 (XPG4) + * X/Open CAE Specification, Issue 4, Version 2 (XPG4v2) + * X/Open CAE Specification, Issue 5 (XPG5) + * Open Group Technical Standard, Issue 6 (XPG6), also referred to as + * IEEE Std. 1003.1-2001 and ISO/IEC 9945:2002. + * Open Group Technical Standard, Issue 7 (XPG7), also referred to as + * IEEE Std. 1003.1-2008 and ISO/IEC 9945:2009. + * + * XPG4v2 is also referred to as UNIX 95 (SUS or SUSv1). + * XPG5 is also referred to as UNIX 98 or the Single Unix Specification, + * Version 2 (SUSv2) + * XPG6 is the result of a merge of the X/Open and POSIX specifications + * and as such is also referred to as IEEE Std. 1003.1-2001 in + * addition to UNIX 03 and SUSv3. + * XPG7 is also referred to as UNIX 08 and SUSv4. + * + * When writing a conforming X/Open application, as per the specification + * requirements, the appropriate feature test macros must be defined at + * compile time. These are as follows. For more info, see standards(5). + * + * Feature Test Macro Specification + * ------------------------------------------------ ------------- + * _XOPEN_SOURCE XPG3 + * _XOPEN_SOURCE && _XOPEN_VERSION = 4 XPG4 + * _XOPEN_SOURCE && _XOPEN_SOURCE_EXTENDED = 1 XPG4v2 + * _XOPEN_SOURCE = 500 XPG5 + * _XOPEN_SOURCE = 600 (or POSIX_C_SOURCE=200112L) XPG6 + * _XOPEN_SOURCE = 700 (or POSIX_C_SOURCE=200809L) XPG7 + * + * In order to simplify the guards within the headers, the following + * implementation private test macros have been created. Applications + * must NOT use these private test macros as unexpected results will + * occur. + * + * Note that in general, the use of these private macros is cumulative. + * For example, the use of _XPG3 with no other restrictions on the X/Open + * namespace will make the symbols visible for XPG3 through XPG6 + * compilation environments. The use of _XPG4_2 with no other X/Open + * namespace restrictions indicates that the symbols were introduced in + * XPG4v2 and are therefore visible for XPG4v2 through XPG6 compilation + * environments, but not for XPG3 or XPG4 compilation environments. 
+ * + * _XPG3 X/Open Portability Guide, Issue 3 (XPG3) + * _XPG4 X/Open CAE Specification, Issue 4 (XPG4) + * _XPG4_2 X/Open CAE Specification, Issue 4, Version 2 (XPG4v2/UNIX 95/SUS) + * _XPG5 X/Open CAE Specification, Issue 5 (XPG5/UNIX 98/SUSv2) + * _XPG6 Open Group Technical Standard, Issue 6 (XPG6/UNIX 03/SUSv3) + * _XPG7 Open Group Technical Standard, Issue 7 (XPG7/UNIX 08/SUSv4) + */ + +/* X/Open Portability Guide, Issue 3 */ +#if defined(_XOPEN_SOURCE) && (_XOPEN_SOURCE - 0 < 500) && \ + (_XOPEN_VERSION - 0 < 4) && !defined(_XOPEN_SOURCE_EXTENDED) +#define _XPG3 +/* X/Open CAE Specification, Issue 4 */ +#elif (defined(_XOPEN_SOURCE) && _XOPEN_VERSION - 0 == 4) +#define _XPG4 +#define _XPG3 +/* X/Open CAE Specification, Issue 4, Version 2 */ +#elif (defined(_XOPEN_SOURCE) && _XOPEN_SOURCE_EXTENDED - 0 == 1) +#define _XPG4_2 +#define _XPG4 +#define _XPG3 +/* X/Open CAE Specification, Issue 5 */ +#elif (_XOPEN_SOURCE - 0 == 500) +#define _XPG5 +#define _XPG4_2 +#define _XPG4 +#define _XPG3 +#undef _POSIX_C_SOURCE +#define _POSIX_C_SOURCE 199506L +/* Open Group Technical Standard, Issue 6 */ +#elif (_XOPEN_SOURCE - 0 == 600) || (_POSIX_C_SOURCE - 0 == 200112L) +#define _XPG6 +#define _XPG5 +#define _XPG4_2 +#define _XPG4 +#define _XPG3 +#undef _POSIX_C_SOURCE +#define _POSIX_C_SOURCE 200112L +#undef _XOPEN_SOURCE +#define _XOPEN_SOURCE 600 + +/* Open Group Technical Standard, Issue 7 */ +#elif (_XOPEN_SOURCE - 0 == 700) || (_POSIX_C_SOURCE - 0 == 200809L) +#define _XPG7 +#define _XPG6 +#define _XPG5 +#define _XPG4_2 +#define _XPG4 +#define _XPG3 +#undef _POSIX_C_SOURCE +#define _POSIX_C_SOURCE 200809L +#undef _XOPEN_SOURCE +#define _XOPEN_SOURCE 700 +#endif + +/* + * _XOPEN_VERSION is defined by the X/Open specifications and is not + * normally defined by the application, except in the case of an XPG4 + * application. On the implementation side, _XOPEN_VERSION defined with + * the value of 3 indicates an XPG3 application. _XOPEN_VERSION defined + * with the value of 4 indicates an XPG4 or XPG4v2 (UNIX 95) application. + * _XOPEN_VERSION defined with a value of 500 indicates an XPG5 (UNIX 98) + * application, with a value of 600 indicates an XPG6 (UNIX 03) + * application, and with a value of 700 indicates an XPG7 (UNIX 08) + * application. The appropriate version is determined by the use of the + * feature test macros described earlier. The value of _XOPEN_VERSION + * otherwise defaults to 3, indicating support for XPG3 applications. + */ +#ifndef _XOPEN_VERSION +#if defined(_XPG7) +#define _XOPEN_VERSION 700 +#elif defined(_XPG6) +#define _XOPEN_VERSION 600 +#elif defined(_XPG5) +#define _XOPEN_VERSION 500 +#elif defined(_XPG4_2) +#define _XOPEN_VERSION 4 +#else +#define _XOPEN_VERSION 3 +#endif +#endif + +/* + * ANSI C and ISO 9899:1990 say the type long long doesn't exist in strictly + * conforming environments. ISO 9899:1999 says it does. + * + * The presence of _LONGLONG_TYPE says "long long exists" which is therefore + * defined in all but strictly conforming environments that disallow it. + */ +#if !defined(_STDC_C99) && defined(_STRICT_STDC) && !defined(__GNUC__) +/* + * Resist attempts to force the definition of long long in this case. + */ +#if defined(_LONGLONG_TYPE) +#error "No long long in strictly conforming ANSI C & 1990 ISO C environments" +#endif +#else +#if !defined(_LONGLONG_TYPE) +#define _LONGLONG_TYPE +#endif +#endif + +/* + * It is invalid to compile an XPG3, XPG4, XPG4v2, or XPG5 application + * using c99. 
The same is true for POSIX.1-1990, POSIX.2-1992, POSIX.1b, + * and POSIX.1c applications. Likewise, it is invalid to compile an XPG6 + * or a POSIX.1-2001 application with anything other than a c99 or later + * compiler. Therefore, we force an error in both cases. + */ +#if defined(_STDC_C99) && (defined(__XOPEN_OR_POSIX) && !defined(_XPG6)) +#error "Compiler or options invalid for pre-UNIX 03 X/Open applications \ + and pre-2001 POSIX applications" +#elif !defined(_STDC_C99) && \ + (defined(__XOPEN_OR_POSIX) && defined(_XPG6)) +#error "Compiler or options invalid; UNIX 03 and POSIX.1-2001 applications \ + require the use of c99" +#endif + +/* + * The following macro defines a value for the ISO C99 restrict + * keyword so that _RESTRICT_KYWD resolves to "restrict" if + * an ISO C99 compiler is used and "" (null string) if any other + * compiler is used. This allows for the use of single prototype + * declarations regardless of compiler version. + */ +#if (defined(__STDC__) && defined(_STDC_C99)) && !defined(__cplusplus) +#define _RESTRICT_KYWD restrict +#else +#define _RESTRICT_KYWD +#endif + +/* + * The following macro indicates header support for the ANSI C++ + * standard. The ISO/IEC designation for this is ISO/IEC FDIS 14882. + */ +#define _ISO_CPP_14882_1998 + +/* + * The following macro indicates header support for the C99 standard, + * ISO/IEC 9899:1999, Programming Languages - C. + */ +#define _ISO_C_9899_1999 + +/* + * The following macro indicates header support for DTrace. The value is an + * integer that corresponds to the major version number for DTrace. + */ +#define _DTRACE_VERSION 1 + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FEATURE_TESTS_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/fm/fs/zfs.h b/sys/cddl/contrib/opensolaris/uts/common/sys/fm/fs/zfs.h new file mode 100644 index 000000000000..029af540b3c7 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/sys/fm/fs/zfs.h @@ -0,0 +1,97 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
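The _RESTRICT_KYWD idiom above can be mirrored directly in user code, so one prototype text serves both C99 and pre-C99 compilers. A minimal sketch; DEMO_RESTRICT is an illustrative stand-in of my own, not the header's macro:

    #include <stdio.h>

    /* DEMO_RESTRICT mirrors _RESTRICT_KYWD: "restrict" under C99, empty otherwise */
    #if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L && \
        !defined(__cplusplus)
    #define DEMO_RESTRICT restrict
    #else
    #define DEMO_RESTRICT
    #endif

    static void
    vec_add(double *DEMO_RESTRICT d, const double *DEMO_RESTRICT a,
        const double *DEMO_RESTRICT b, int n)
    {
        int i;

        for (i = 0; i < n; i++)
            d[i] = a[i] + b[i];
    }

    int
    main(void)
    {
        double a[3] = { 1, 2, 3 }, b[3] = { 4, 5, 6 }, d[3];

        vec_add(d, a, b, 3);
        (void) printf("%g %g %g\n", d[0], d[1], d[2]);
        return (0);
    }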
+ */ + +#ifndef _SYS_FM_FS_ZFS_H +#define _SYS_FM_FS_ZFS_H + +#ifdef __cplusplus +extern "C" { +#endif + +#define ZFS_ERROR_CLASS "fs.zfs" + +#define FM_EREPORT_ZFS_CHECKSUM "checksum" +#define FM_EREPORT_ZFS_IO "io" +#define FM_EREPORT_ZFS_DATA "data" +#define FM_EREPORT_ZFS_POOL "zpool" +#define FM_EREPORT_ZFS_DEVICE_UNKNOWN "vdev.unknown" +#define FM_EREPORT_ZFS_DEVICE_OPEN_FAILED "vdev.open_failed" +#define FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA "vdev.corrupt_data" +#define FM_EREPORT_ZFS_DEVICE_NO_REPLICAS "vdev.no_replicas" +#define FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM "vdev.bad_guid_sum" +#define FM_EREPORT_ZFS_DEVICE_TOO_SMALL "vdev.too_small" +#define FM_EREPORT_ZFS_DEVICE_BAD_LABEL "vdev.bad_label" +#define FM_EREPORT_ZFS_IO_FAILURE "io_failure" +#define FM_EREPORT_ZFS_PROBE_FAILURE "probe_failure" +#define FM_EREPORT_ZFS_LOG_REPLAY "log_replay" +#define FM_EREPORT_ZFS_CONFIG_CACHE_WRITE "config_cache_write" + +#define FM_EREPORT_PAYLOAD_ZFS_POOL "pool" +#define FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE "pool_failmode" +#define FM_EREPORT_PAYLOAD_ZFS_POOL_GUID "pool_guid" +#define FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT "pool_context" +#define FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID "vdev_guid" +#define FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE "vdev_type" +#define FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH "vdev_path" +#define FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID "vdev_devid" +#define FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU "vdev_fru" +#define FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID "parent_guid" +#define FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE "parent_type" +#define FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH "parent_path" +#define FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID "parent_devid" +#define FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET "zio_objset" +#define FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT "zio_object" +#define FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL "zio_level" +#define FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID "zio_blkid" +#define FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR "zio_err" +#define FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET "zio_offset" +#define FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE "zio_size" +#define FM_EREPORT_PAYLOAD_ZFS_PREV_STATE "prev_state" +#define FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED "cksum_expected" +#define FM_EREPORT_PAYLOAD_ZFS_CKSUM_ACTUAL "cksum_actual" +#define FM_EREPORT_PAYLOAD_ZFS_CKSUM_ALGO "cksum_algorithm" +#define FM_EREPORT_PAYLOAD_ZFS_CKSUM_BYTESWAP "cksum_byteswap" +#define FM_EREPORT_PAYLOAD_ZFS_BAD_OFFSET_RANGES "bad_ranges" +#define FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_MIN_GAP "bad_ranges_min_gap" +#define FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_SETS "bad_range_sets" +#define FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_CLEARS "bad_range_clears" +#define FM_EREPORT_PAYLOAD_ZFS_BAD_SET_BITS "bad_set_bits" +#define FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_BITS "bad_cleared_bits" +#define FM_EREPORT_PAYLOAD_ZFS_BAD_SET_HISTOGRAM "bad_set_histogram" +#define FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_HISTOGRAM "bad_cleared_histogram" + +#define FM_EREPORT_FAILMODE_WAIT "wait" +#define FM_EREPORT_FAILMODE_CONTINUE "continue" +#define FM_EREPORT_FAILMODE_PANIC "panic" + +#define FM_RESOURCE_REMOVED "removed" +#define FM_RESOURCE_AUTOREPLACE "autoreplace" +#define FM_RESOURCE_STATECHANGE "statechange" + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FM_FS_ZFS_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/fm/protocol.h b/sys/cddl/contrib/opensolaris/uts/common/sys/fm/protocol.h new file mode 100644 index 000000000000..f5f93421bd74 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/sys/fm/protocol.h @@ -0,0 +1,369 @@ +/* + * CDDL HEADER START + * + * The contents of this file 
are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#ifndef _SYS_FM_PROTOCOL_H +#define _SYS_FM_PROTOCOL_H + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _KERNEL +#include <sys/varargs.h> +#include <sys/nvpair.h> +#else +#include <libnvpair.h> +#include <stdarg.h> +#endif + +/* FM common member names */ +#define FM_CLASS "class" +#define FM_VERSION "version" + +/* FM protocol category 1 class names */ +#define FM_EREPORT_CLASS "ereport" +#define FM_FAULT_CLASS "fault" +#define FM_DEFECT_CLASS "defect" +#define FM_RSRC_CLASS "resource" +#define FM_LIST_EVENT "list" +#define FM_IREPORT_CLASS "ireport" + +/* FM list.* event class values */ +#define FM_LIST_SUSPECT_CLASS FM_LIST_EVENT ".suspect" +#define FM_LIST_ISOLATED_CLASS FM_LIST_EVENT ".isolated" +#define FM_LIST_REPAIRED_CLASS FM_LIST_EVENT ".repaired" +#define FM_LIST_UPDATED_CLASS FM_LIST_EVENT ".updated" +#define FM_LIST_RESOLVED_CLASS FM_LIST_EVENT ".resolved" + +/* ereport class subcategory values */ +#define FM_ERROR_CPU "cpu" +#define FM_ERROR_IO "io" + +/* ereport version and payload member names */ +#define FM_EREPORT_VERS0 0 +#define FM_EREPORT_VERSION FM_EREPORT_VERS0 + +/* ereport payload member names */ +#define FM_EREPORT_DETECTOR "detector" +#define FM_EREPORT_ENA "ena" + +/* list.* event payload member names */ +#define FM_LIST_EVENT_SIZE "list-sz" + +/* ireport.* event payload member names */ +#define FM_IREPORT_DETECTOR "detector" +#define FM_IREPORT_UUID "uuid" +#define FM_IREPORT_PRIORITY "pri" +#define FM_IREPORT_ATTRIBUTES "attr" + +/* + * list.suspect, isolated, updated, repaired and resolved + * versions/payload member names. 
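The class-name constants above are fragments that get joined into a dotted event-class string. A sketch of that composition, assuming the usual category.subsystem.event layout of FM event classes (FM_MAX_CLASS is the class-length limit from <sys/fm/util.h> later in this diff; the values are copied from the headers above):

    #include <stdio.h>

    #define FM_MAX_CLASS            100     /* from <sys/fm/util.h> */
    #define FM_EREPORT_CLASS        "ereport"
    #define ZFS_ERROR_CLASS         "fs.zfs"    /* from <sys/fm/fs/zfs.h> */
    #define FM_EREPORT_ZFS_CHECKSUM "checksum"

    int
    main(void)
    {
        char class[FM_MAX_CLASS];

        /* Category-1 class + subsystem + event name, dot-separated */
        (void) snprintf(class, sizeof (class), "%s.%s.%s",
            FM_EREPORT_CLASS, ZFS_ERROR_CLASS, FM_EREPORT_ZFS_CHECKSUM);
        (void) printf("%s\n", class);   /* ereport.fs.zfs.checksum */
        return (0);
    }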
+ */ +#define FM_SUSPECT_UUID "uuid" +#define FM_SUSPECT_DIAG_CODE "code" +#define FM_SUSPECT_DIAG_TIME "diag-time" +#define FM_SUSPECT_DE "de" +#define FM_SUSPECT_FAULT_LIST "fault-list" +#define FM_SUSPECT_FAULT_SZ "fault-list-sz" +#define FM_SUSPECT_FAULT_STATUS "fault-status" +#define FM_SUSPECT_INJECTED "__injected" +#define FM_SUSPECT_MESSAGE "message" +#define FM_SUSPECT_RETIRE "retire" +#define FM_SUSPECT_RESPONSE "response" +#define FM_SUSPECT_SEVERITY "severity" + +#define FM_SUSPECT_VERS0 0 +#define FM_SUSPECT_VERSION FM_SUSPECT_VERS0 + +#define FM_SUSPECT_FAULTY 0x1 +#define FM_SUSPECT_UNUSABLE 0x2 +#define FM_SUSPECT_NOT_PRESENT 0x4 +#define FM_SUSPECT_DEGRADED 0x8 +#define FM_SUSPECT_REPAIRED 0x10 +#define FM_SUSPECT_REPLACED 0x20 +#define FM_SUSPECT_ACQUITTED 0x40 + +/* fault event versions and payload member names */ +#define FM_FAULT_VERS0 0 +#define FM_FAULT_VERSION FM_FAULT_VERS0 + +#define FM_FAULT_ASRU "asru" +#define FM_FAULT_FRU "fru" +#define FM_FAULT_FRU_LABEL "fru-label" +#define FM_FAULT_CERTAINTY "certainty" +#define FM_FAULT_RESOURCE "resource" +#define FM_FAULT_LOCATION "location" + +/* resource event versions and payload member names */ +#define FM_RSRC_VERS0 0 +#define FM_RSRC_VERSION FM_RSRC_VERS0 +#define FM_RSRC_RESOURCE "resource" + +/* resource.fm.asru.* payload member names */ +#define FM_RSRC_ASRU_UUID "uuid" +#define FM_RSRC_ASRU_CODE "code" +#define FM_RSRC_ASRU_FAULTY "faulty" +#define FM_RSRC_ASRU_REPAIRED "repaired" +#define FM_RSRC_ASRU_REPLACED "replaced" +#define FM_RSRC_ASRU_ACQUITTED "acquitted" +#define FM_RSRC_ASRU_RESOLVED "resolved" +#define FM_RSRC_ASRU_UNUSABLE "unusable" +#define FM_RSRC_ASRU_EVENT "event" + +/* resource.fm.xprt.* versions and payload member names */ +#define FM_RSRC_XPRT_VERS0 0 +#define FM_RSRC_XPRT_VERSION FM_RSRC_XPRT_VERS0 +#define FM_RSRC_XPRT_UUID "uuid" +#define FM_RSRC_XPRT_SUBCLASS "subclass" +#define FM_RSRC_XPRT_FAULT_STATUS "fault-status" +#define FM_RSRC_XPRT_FAULT_HAS_ASRU "fault-has-asru" + +/* + * FM ENA Format Macros + */ +#define ENA_FORMAT_MASK 0x3 +#define ENA_FORMAT(ena) ((ena) & ENA_FORMAT_MASK) + +/* ENA format types */ +#define FM_ENA_FMT0 0 +#define FM_ENA_FMT1 1 +#define FM_ENA_FMT2 2 + +/* Format 1 */ +#define ENA_FMT1_GEN_MASK 0x00000000000003FCull +#define ENA_FMT1_ID_MASK 0xFFFFFFFFFFFFFC00ull +#define ENA_FMT1_CPUID_MASK 0x00000000000FFC00ull +#define ENA_FMT1_TIME_MASK 0xFFFFFFFFFFF00000ull +#define ENA_FMT1_GEN_SHFT 2 +#define ENA_FMT1_ID_SHFT 10 +#define ENA_FMT1_CPUID_SHFT ENA_FMT1_ID_SHFT +#define ENA_FMT1_TIME_SHFT 20 + +/* Format 2 */ +#define ENA_FMT2_GEN_MASK 0x00000000000003FCull +#define ENA_FMT2_ID_MASK 0xFFFFFFFFFFFFFC00ull +#define ENA_FMT2_TIME_MASK ENA_FMT2_ID_MASK +#define ENA_FMT2_GEN_SHFT 2 +#define ENA_FMT2_ID_SHFT 10 +#define ENA_FMT2_TIME_SHFT ENA_FMT2_ID_SHFT + +/* Common FMRI type names */ +#define FM_FMRI_AUTHORITY "authority" +#define FM_FMRI_SCHEME "scheme" +#define FM_FMRI_SVC_AUTHORITY "svc-authority" +#define FM_FMRI_FACILITY "facility" + +/* FMRI authority-type member names */ +#define FM_FMRI_AUTH_CHASSIS "chassis-id" +#define FM_FMRI_AUTH_PRODUCT_SN "product-sn" +#define FM_FMRI_AUTH_PRODUCT "product-id" +#define FM_FMRI_AUTH_DOMAIN "domain-id" +#define FM_FMRI_AUTH_SERVER "server-id" +#define FM_FMRI_AUTH_HOST "host-id" + +#define FM_AUTH_VERS0 0 +#define FM_FMRI_AUTH_VERSION FM_AUTH_VERS0 + +/* scheme name values */ +#define FM_FMRI_SCHEME_FMD "fmd" +#define FM_FMRI_SCHEME_DEV "dev" +#define FM_FMRI_SCHEME_HC "hc" +#define FM_FMRI_SCHEME_SVC "svc" 
+#define FM_FMRI_SCHEME_CPU "cpu" +#define FM_FMRI_SCHEME_MEM "mem" +#define FM_FMRI_SCHEME_MOD "mod" +#define FM_FMRI_SCHEME_PKG "pkg" +#define FM_FMRI_SCHEME_LEGACY "legacy-hc" +#define FM_FMRI_SCHEME_ZFS "zfs" +#define FM_FMRI_SCHEME_SW "sw" + +/* Scheme versions */ +#define FMD_SCHEME_VERSION0 0 +#define FM_FMD_SCHEME_VERSION FMD_SCHEME_VERSION0 +#define DEV_SCHEME_VERSION0 0 +#define FM_DEV_SCHEME_VERSION DEV_SCHEME_VERSION0 +#define FM_HC_VERS0 0 +#define FM_HC_SCHEME_VERSION FM_HC_VERS0 +#define CPU_SCHEME_VERSION0 0 +#define CPU_SCHEME_VERSION1 1 +#define FM_CPU_SCHEME_VERSION CPU_SCHEME_VERSION1 +#define MEM_SCHEME_VERSION0 0 +#define FM_MEM_SCHEME_VERSION MEM_SCHEME_VERSION0 +#define MOD_SCHEME_VERSION0 0 +#define FM_MOD_SCHEME_VERSION MOD_SCHEME_VERSION0 +#define PKG_SCHEME_VERSION0 0 +#define FM_PKG_SCHEME_VERSION PKG_SCHEME_VERSION0 +#define LEGACY_SCHEME_VERSION0 0 +#define FM_LEGACY_SCHEME_VERSION LEGACY_SCHEME_VERSION0 +#define SVC_SCHEME_VERSION0 0 +#define FM_SVC_SCHEME_VERSION SVC_SCHEME_VERSION0 +#define ZFS_SCHEME_VERSION0 0 +#define FM_ZFS_SCHEME_VERSION ZFS_SCHEME_VERSION0 +#define SW_SCHEME_VERSION0 0 +#define FM_SW_SCHEME_VERSION SW_SCHEME_VERSION0 + +/* hc scheme member names */ +#define FM_FMRI_HC_SERIAL_ID "serial" +#define FM_FMRI_HC_PART "part" +#define FM_FMRI_HC_REVISION "revision" +#define FM_FMRI_HC_ROOT "hc-root" +#define FM_FMRI_HC_LIST_SZ "hc-list-sz" +#define FM_FMRI_HC_LIST "hc-list" +#define FM_FMRI_HC_SPECIFIC "hc-specific" + +/* facility member names */ +#define FM_FMRI_FACILITY_NAME "facility-name" +#define FM_FMRI_FACILITY_TYPE "facility-type" + +/* hc-list version and member names */ +#define FM_FMRI_HC_NAME "hc-name" +#define FM_FMRI_HC_ID "hc-id" + +#define HC_LIST_VERSION0 0 +#define FM_HC_LIST_VERSION HC_LIST_VERSION0 + +/* hc-specific member names */ +#define FM_FMRI_HC_SPECIFIC_OFFSET "offset" +#define FM_FMRI_HC_SPECIFIC_PHYSADDR "physaddr" + +/* fmd module scheme member names */ +#define FM_FMRI_FMD_NAME "mod-name" +#define FM_FMRI_FMD_VERSION "mod-version" + +/* dev scheme member names */ +#define FM_FMRI_DEV_ID "devid" +#define FM_FMRI_DEV_TGTPTLUN0 "target-port-l0id" +#define FM_FMRI_DEV_PATH "device-path" + +/* pkg scheme member names */ +#define FM_FMRI_PKG_BASEDIR "pkg-basedir" +#define FM_FMRI_PKG_INST "pkg-inst" +#define FM_FMRI_PKG_VERSION "pkg-version" + +/* svc scheme member names */ +#define FM_FMRI_SVC_NAME "svc-name" +#define FM_FMRI_SVC_INSTANCE "svc-instance" +#define FM_FMRI_SVC_CONTRACT_ID "svc-contract-id" + +/* svc-authority member names */ +#define FM_FMRI_SVC_AUTH_SCOPE "scope" +#define FM_FMRI_SVC_AUTH_SYSTEM_FQN "system-fqn" + +/* cpu scheme member names */ +#define FM_FMRI_CPU_ID "cpuid" +#define FM_FMRI_CPU_SERIAL_ID "serial" +#define FM_FMRI_CPU_MASK "cpumask" +#define FM_FMRI_CPU_VID "cpuvid" +#define FM_FMRI_CPU_CPUFRU "cpufru" +#define FM_FMRI_CPU_CACHE_INDEX "cacheindex" +#define FM_FMRI_CPU_CACHE_WAY "cacheway" +#define FM_FMRI_CPU_CACHE_BIT "cachebit" +#define FM_FMRI_CPU_CACHE_TYPE "cachetype" + +#define FM_FMRI_CPU_CACHE_TYPE_L2 0 +#define FM_FMRI_CPU_CACHE_TYPE_L3 1 + +/* legacy-hc scheme member names */ +#define FM_FMRI_LEGACY_HC "component" +#define FM_FMRI_LEGACY_HC_PREFIX FM_FMRI_SCHEME_HC":///" \ + FM_FMRI_LEGACY_HC"=" + +/* mem scheme member names */ +#define FM_FMRI_MEM_UNUM "unum" +#define FM_FMRI_MEM_SERIAL_ID "serial" +#define FM_FMRI_MEM_PHYSADDR "physaddr" +#define FM_FMRI_MEM_MEMCONFIG "memconfig" +#define FM_FMRI_MEM_OFFSET "offset" + +/* mod scheme member names */ +#define 
FM_FMRI_MOD_PKG "mod-pkg" +#define FM_FMRI_MOD_NAME "mod-name" +#define FM_FMRI_MOD_ID "mod-id" +#define FM_FMRI_MOD_DESC "mod-desc" + +/* zfs scheme member names */ +#define FM_FMRI_ZFS_POOL "pool" +#define FM_FMRI_ZFS_VDEV "vdev" + +/* sw scheme member names - extra indentation for members of an nvlist */ +#define FM_FMRI_SW_OBJ "object" +#define FM_FMRI_SW_OBJ_PATH "path" +#define FM_FMRI_SW_OBJ_ROOT "root" +#define FM_FMRI_SW_OBJ_PKG "pkg" +#define FM_FMRI_SW_SITE "site" +#define FM_FMRI_SW_SITE_TOKEN "token" +#define FM_FMRI_SW_SITE_MODULE "module" +#define FM_FMRI_SW_SITE_FILE "file" +#define FM_FMRI_SW_SITE_LINE "line" +#define FM_FMRI_SW_SITE_FUNC "func" +#define FM_FMRI_SW_CTXT "context" +#define FM_FMRI_SW_CTXT_ORIGIN "origin" +#define FM_FMRI_SW_CTXT_EXECNAME "execname" +#define FM_FMRI_SW_CTXT_PID "pid" +#define FM_FMRI_SW_CTXT_ZONE "zone" +#define FM_FMRI_SW_CTXT_CTID "ctid" +#define FM_FMRI_SW_CTXT_STACK "stack" + +extern nv_alloc_t *fm_nva_xcreate(char *, size_t); +extern void fm_nva_xdestroy(nv_alloc_t *); + +extern nvlist_t *fm_nvlist_create(nv_alloc_t *); +extern void fm_nvlist_destroy(nvlist_t *, int); + +#define FM_NVA_FREE 0 /* free allocator on nvlist_destroy */ +#define FM_NVA_RETAIN 1 /* keep allocator on nvlist_destroy */ + +extern void fm_ereport_set(nvlist_t *, int, const char *, uint64_t, + const nvlist_t *, ...); +extern void fm_payload_set(nvlist_t *, ...); +extern int i_fm_payload_set(nvlist_t *, const char *, va_list); +extern void fm_fmri_hc_set(nvlist_t *, int, const nvlist_t *, nvlist_t *, + int, ...); +extern void fm_fmri_dev_set(nvlist_t *, int, const nvlist_t *, const char *, + const char *, const char *); +extern void fm_fmri_de_set(nvlist_t *, int, const nvlist_t *, const char *); +extern void fm_fmri_cpu_set(nvlist_t *, int, const nvlist_t *, uint32_t, + uint8_t *, const char *); +extern void fm_fmri_mem_set(nvlist_t *, int, const nvlist_t *, const char *, + const char *, uint64_t); +extern void fm_authority_set(nvlist_t *, int, const char *, const char *, + const char *, const char *); +extern void fm_fmri_zfs_set(nvlist_t *, int, uint64_t, uint64_t); +extern void fm_fmri_hc_create(nvlist_t *, int, const nvlist_t *, nvlist_t *, + nvlist_t *, int, ...); + +extern uint64_t fm_ena_increment(uint64_t); +extern uint64_t fm_ena_generate(uint64_t, uchar_t); +extern uint64_t fm_ena_generation_get(uint64_t); +extern uchar_t fm_ena_format_get(uint64_t); +extern uint64_t fm_ena_id_get(uint64_t); +extern uint64_t fm_ena_time_get(uint64_t); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FM_PROTOCOL_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/fm/util.h b/sys/cddl/contrib/opensolaris/uts/common/sys/fm/util.h new file mode 100644 index 000000000000..e99a370af7ae --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/sys/fm/util.h @@ -0,0 +1,102 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2017 RackTop Systems. + */ + +#ifndef _SYS_FM_UTIL_H +#define _SYS_FM_UTIL_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/nvpair.h> +#include <sys/errorq.h> + +/* + * Shared user/kernel definitions for class length, error channel name, + * and kernel event publisher string. + */ +#define FM_MAX_CLASS 100 +#define FM_ERROR_CHAN "com.sun:fm:error" +#define FM_PUB "fm" + +/* + * ereport dump device transport support + * + * Ereports are written out to the dump device at a prescribed offset from the + * end, similar to in-transit log messages. The ereports are represented as an + * erpt_dump_t header followed by ed_size bytes of packed native nvlist data. + * + * NOTE: All of these constants and the header must be defined so they have the + * same representation for *both* 32-bit and 64-bit producers and consumers. + */ +#define ERPT_MAGIC 0xf00d4eddU +#define ERPT_MAX_ERRS 16 +#define ERPT_DATA_SZ (6 * 1024) +#define ERPT_EVCH_MAX 256 +#define ERPT_HIWAT 64 + +typedef struct erpt_dump { + uint32_t ed_magic; /* ERPT_MAGIC or zero to indicate end */ + uint32_t ed_chksum; /* checksum32() of packed nvlist data */ + uint32_t ed_size; /* ereport (nvl) fixed buf size */ + uint32_t ed_pad; /* reserved for future use */ + hrtime_t ed_hrt_nsec; /* hrtime of this ereport */ + hrtime_t ed_hrt_base; /* hrtime sample corresponding to ed_tod_base */ + struct { + uint64_t sec; /* seconds since gettimeofday() Epoch */ + uint64_t nsec; /* nanoseconds past ed_tod_base.sec */ + } ed_tod_base; +} erpt_dump_t; + +#if defined(_KERNEL) || defined(_FAKE_KERNEL) +#include <sys/systm.h> + +#define FM_STK_DEPTH 20 /* maximum stack depth */ +#define FM_SYM_SZ 64 /* maximum symbol size */ +#define FM_ERR_PIL 2 /* PIL for ereport_errorq drain processing */ + +#define FM_EREPORT_PAYLOAD_NAME_STACK "stack" + +extern errorq_t *ereport_errorq; +extern void *ereport_dumpbuf; +extern size_t ereport_dumplen; + +extern void fm_init(void); +extern void fm_nvprint(nvlist_t *); +#define fm_panic panic +extern void fm_banner(void); + +extern void fm_ereport_dump(void); +extern void fm_ereport_post(nvlist_t *, int); + +extern int is_fm_panic(); +#endif /* _KERNEL || _FAKE_KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FM_UTIL_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h b/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h new file mode 100644 index 000000000000..461ecc34c80c --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h @@ -0,0 +1,1190 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
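The NOTE in fm/util.h above, that erpt_dump_t must look the same to 32-bit and 64-bit producers and consumers, can be checked mechanically: every field is an explicitly sized type, so the struct is 48 bytes with no padding on both ILP32 and LP64. A sketch; the hrtime_t typedef below is an assumption for a standalone build (a 64-bit signed count, as on illumos):

    #include <stdio.h>
    #include <stdint.h>

    typedef int64_t hrtime_t;   /* assumption: 64-bit high-resolution time */

    typedef struct erpt_dump {
        uint32_t ed_magic;
        uint32_t ed_chksum;
        uint32_t ed_size;
        uint32_t ed_pad;
        hrtime_t ed_hrt_nsec;
        hrtime_t ed_hrt_base;
        struct {
            uint64_t sec;
            uint64_t nsec;
        } ed_tod_base;
    } erpt_dump_t;

    int
    main(void)
    {
        /* 4x4 + 2x8 + 2x8 = 48 bytes, all fields naturally aligned */
        (void) printf("sizeof (erpt_dump_t) = %zu\n", sizeof (erpt_dump_t));
        return (0);
    }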
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2016 by Delphix. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2012, Martin Matuska <mm@FreeBSD.org>. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + * Copyright 2017 Joyent, Inc. + * Copyright (c) 2017 Datto Inc. + */ + +/* Portions Copyright 2010 Robert Milkowski */ + +#ifndef _SYS_FS_ZFS_H +#define _SYS_FS_ZFS_H + +#include <sys/types.h> +#include <sys/ioccom.h> +#include <sys/time.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Types and constants shared between userland and the kernel. + */ + +/* + * Each dataset can be one of the following types. These constants can be + * combined into masks that can be passed to various functions. + */ +typedef enum { + ZFS_TYPE_FILESYSTEM = (1 << 0), + ZFS_TYPE_SNAPSHOT = (1 << 1), + ZFS_TYPE_VOLUME = (1 << 2), + ZFS_TYPE_POOL = (1 << 3), + ZFS_TYPE_BOOKMARK = (1 << 4) +} zfs_type_t; + +/* + * NB: lzc_dataset_type should be updated whenever a new objset type is added, + * if it represents a real type of a dataset that can be created from userland. + */ +typedef enum dmu_objset_type { + DMU_OST_NONE, + DMU_OST_META, + DMU_OST_ZFS, + DMU_OST_ZVOL, + DMU_OST_OTHER, /* For testing only! */ + DMU_OST_ANY, /* Be careful! */ + DMU_OST_NUMTYPES +} dmu_objset_type_t; + +#define ZFS_TYPE_DATASET \ + (ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME | ZFS_TYPE_SNAPSHOT) + +/* + * All of these include the terminating NUL byte. + */ +#define ZAP_MAXNAMELEN 256 +#define ZAP_MAXVALUELEN (1024 * 8) +#define ZAP_OLDMAXVALUELEN 1024 +#define ZFS_MAX_DATASET_NAME_LEN 256 + +/* + * Dataset properties are identified by these constants and must be added to + * the end of this list to ensure that external consumers are not affected + * by the change. If you make any changes to this list, be sure to update + * the property table in usr/src/common/zfs/zfs_prop.c. 
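Because the zfs_type_t constants above are distinct bits, they can be OR-ed into masks such as ZFS_TYPE_DATASET and tested with a single AND, which is exactly how the header says they are meant to be passed around. A minimal standalone sketch using the definitions copied from the header:

    #include <stdio.h>

    typedef enum {
        ZFS_TYPE_FILESYSTEM = (1 << 0),
        ZFS_TYPE_SNAPSHOT   = (1 << 1),
        ZFS_TYPE_VOLUME     = (1 << 2),
        ZFS_TYPE_POOL       = (1 << 3),
        ZFS_TYPE_BOOKMARK   = (1 << 4)
    } zfs_type_t;

    #define ZFS_TYPE_DATASET \
        (ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME | ZFS_TYPE_SNAPSHOT)

    int
    main(void)
    {
        zfs_type_t t = ZFS_TYPE_SNAPSHOT;

        /* Bit-flag constants let one mask select a whole family of types */
        (void) printf("snapshot %s a dataset; pool %s a dataset\n",
            (t & ZFS_TYPE_DATASET) ? "is" : "is not",
            (ZFS_TYPE_POOL & ZFS_TYPE_DATASET) ? "is" : "is not");
        return (0);
    }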
+ */ +typedef enum { + ZPROP_CONT = -2, + ZPROP_INVAL = -1, + ZFS_PROP_TYPE = 0, + ZFS_PROP_CREATION, + ZFS_PROP_USED, + ZFS_PROP_AVAILABLE, + ZFS_PROP_REFERENCED, + ZFS_PROP_COMPRESSRATIO, + ZFS_PROP_MOUNTED, + ZFS_PROP_ORIGIN, + ZFS_PROP_QUOTA, + ZFS_PROP_RESERVATION, + ZFS_PROP_VOLSIZE, + ZFS_PROP_VOLBLOCKSIZE, + ZFS_PROP_RECORDSIZE, + ZFS_PROP_MOUNTPOINT, + ZFS_PROP_SHARENFS, + ZFS_PROP_CHECKSUM, + ZFS_PROP_COMPRESSION, + ZFS_PROP_ATIME, + ZFS_PROP_DEVICES, + ZFS_PROP_EXEC, + ZFS_PROP_SETUID, + ZFS_PROP_READONLY, + ZFS_PROP_ZONED, + ZFS_PROP_SNAPDIR, + ZFS_PROP_ACLMODE, + ZFS_PROP_ACLINHERIT, + ZFS_PROP_CREATETXG, + ZFS_PROP_NAME, /* not exposed to the user */ + ZFS_PROP_CANMOUNT, + ZFS_PROP_ISCSIOPTIONS, /* not exposed to the user */ + ZFS_PROP_XATTR, + ZFS_PROP_NUMCLONES, /* not exposed to the user */ + ZFS_PROP_COPIES, + ZFS_PROP_VERSION, + ZFS_PROP_UTF8ONLY, + ZFS_PROP_NORMALIZE, + ZFS_PROP_CASE, + ZFS_PROP_VSCAN, + ZFS_PROP_NBMAND, + ZFS_PROP_SHARESMB, + ZFS_PROP_REFQUOTA, + ZFS_PROP_REFRESERVATION, + ZFS_PROP_GUID, + ZFS_PROP_PRIMARYCACHE, + ZFS_PROP_SECONDARYCACHE, + ZFS_PROP_USEDSNAP, + ZFS_PROP_USEDDS, + ZFS_PROP_USEDCHILD, + ZFS_PROP_USEDREFRESERV, + ZFS_PROP_USERACCOUNTING, /* not exposed to the user */ + ZFS_PROP_STMF_SHAREINFO, /* not exposed to the user */ + ZFS_PROP_DEFER_DESTROY, + ZFS_PROP_USERREFS, + ZFS_PROP_LOGBIAS, + ZFS_PROP_UNIQUE, /* not exposed to the user */ + ZFS_PROP_OBJSETID, /* not exposed to the user */ + ZFS_PROP_DEDUP, + ZFS_PROP_MLSLABEL, + ZFS_PROP_SYNC, + ZFS_PROP_DNODESIZE, + ZFS_PROP_REFRATIO, + ZFS_PROP_WRITTEN, + ZFS_PROP_CLONES, + ZFS_PROP_LOGICALUSED, + ZFS_PROP_LOGICALREFERENCED, + ZFS_PROP_INCONSISTENT, /* not exposed to the user */ + ZFS_PROP_VOLMODE, + ZFS_PROP_FILESYSTEM_LIMIT, + ZFS_PROP_SNAPSHOT_LIMIT, + ZFS_PROP_FILESYSTEM_COUNT, + ZFS_PROP_SNAPSHOT_COUNT, + ZFS_PROP_REDUNDANT_METADATA, + ZFS_PROP_PREV_SNAP, + ZFS_PROP_RECEIVE_RESUME_TOKEN, + ZFS_PROP_REMAPTXG, /* not exposed to the user */ + ZFS_NUM_PROPS +} zfs_prop_t; + +typedef enum { + ZFS_PROP_USERUSED, + ZFS_PROP_USERQUOTA, + ZFS_PROP_GROUPUSED, + ZFS_PROP_GROUPQUOTA, + ZFS_NUM_USERQUOTA_PROPS +} zfs_userquota_prop_t; + +extern const char *zfs_userquota_prop_prefixes[ZFS_NUM_USERQUOTA_PROPS]; + +/* + * Pool properties are identified by these constants and must be added to the + * end of this list to ensure that external consumers are not affected + * by the change. If you make any changes to this list, be sure to update + * the property table in usr/src/common/zfs/zpool_prop.c. + */ +typedef enum { + ZPOOL_PROP_INVAL = -1, + ZPOOL_PROP_NAME, + ZPOOL_PROP_SIZE, + ZPOOL_PROP_CAPACITY, + ZPOOL_PROP_ALTROOT, + ZPOOL_PROP_HEALTH, + ZPOOL_PROP_GUID, + ZPOOL_PROP_VERSION, + ZPOOL_PROP_BOOTFS, + ZPOOL_PROP_DELEGATION, + ZPOOL_PROP_AUTOREPLACE, + ZPOOL_PROP_CACHEFILE, + ZPOOL_PROP_FAILUREMODE, + ZPOOL_PROP_LISTSNAPS, + ZPOOL_PROP_AUTOEXPAND, + ZPOOL_PROP_DEDUPDITTO, + ZPOOL_PROP_DEDUPRATIO, + ZPOOL_PROP_FREE, + ZPOOL_PROP_ALLOCATED, + ZPOOL_PROP_READONLY, + ZPOOL_PROP_COMMENT, + ZPOOL_PROP_EXPANDSZ, + ZPOOL_PROP_FREEING, + ZPOOL_PROP_FRAGMENTATION, + ZPOOL_PROP_LEAKED, + ZPOOL_PROP_MAXBLOCKSIZE, + ZPOOL_PROP_BOOTSIZE, + ZPOOL_PROP_CHECKPOINT, + ZPOOL_PROP_TNAME, + ZPOOL_PROP_MAXDNODESIZE, + ZPOOL_NUM_PROPS +} zpool_prop_t; + +/* Small enough to not hog a whole line of printout in zpool(1M). 
*/ +#define ZPROP_MAX_COMMENT 32 + +#define ZPROP_VALUE "value" +#define ZPROP_SOURCE "source" + +typedef enum { + ZPROP_SRC_NONE = 0x1, + ZPROP_SRC_DEFAULT = 0x2, + ZPROP_SRC_TEMPORARY = 0x4, + ZPROP_SRC_LOCAL = 0x8, + ZPROP_SRC_INHERITED = 0x10, + ZPROP_SRC_RECEIVED = 0x20 +} zprop_source_t; + +#define ZPROP_SRC_ALL 0x3f + +#define ZPROP_SOURCE_VAL_RECVD "$recvd" +#define ZPROP_N_MORE_ERRORS "N_MORE_ERRORS" +/* + * Dataset flag implemented as a special entry in the props zap object + * indicating that the dataset has received properties on or after + * SPA_VERSION_RECVD_PROPS. The first such receive blows away local properties + * just as it did in earlier versions, and thereafter, local properties are + * preserved. + */ +#define ZPROP_HAS_RECVD "$hasrecvd" + +typedef enum { + ZPROP_ERR_NOCLEAR = 0x1, /* failure to clear existing props */ + ZPROP_ERR_NORESTORE = 0x2 /* failure to restore props on error */ +} zprop_errflags_t; + +typedef int (*zprop_func)(int, void *); + +/* + * Properties to be set on the root file system of a new pool + * are stuffed into their own nvlist, which is then included in + * the properties nvlist with the pool properties. + */ +#define ZPOOL_ROOTFS_PROPS "root-props-nvl" + +/* + * Length of 'written@' and 'written#' + */ +#define ZFS_WRITTEN_PROP_PREFIX_LEN 8 + +/* + * Dataset property functions shared between libzfs and kernel. + */ +const char *zfs_prop_default_string(zfs_prop_t); +uint64_t zfs_prop_default_numeric(zfs_prop_t); +boolean_t zfs_prop_readonly(zfs_prop_t); +boolean_t zfs_prop_visible(zfs_prop_t prop); +boolean_t zfs_prop_inheritable(zfs_prop_t); +boolean_t zfs_prop_setonce(zfs_prop_t); +const char *zfs_prop_to_name(zfs_prop_t); +zfs_prop_t zfs_name_to_prop(const char *); +boolean_t zfs_prop_user(const char *); +boolean_t zfs_prop_userquota(const char *); +int zfs_prop_index_to_string(zfs_prop_t, uint64_t, const char **); +int zfs_prop_string_to_index(zfs_prop_t, const char *, uint64_t *); +uint64_t zfs_prop_random_value(zfs_prop_t, uint64_t seed); +boolean_t zfs_prop_valid_for_type(int, zfs_type_t); + +/* + * Pool property functions shared between libzfs and kernel. + */ +zpool_prop_t zpool_name_to_prop(const char *); +const char *zpool_prop_to_name(zpool_prop_t); +const char *zpool_prop_default_string(zpool_prop_t); +uint64_t zpool_prop_default_numeric(zpool_prop_t); +boolean_t zpool_prop_readonly(zpool_prop_t); +boolean_t zpool_prop_feature(const char *); +boolean_t zpool_prop_unsupported(const char *name); +int zpool_prop_index_to_string(zpool_prop_t, uint64_t, const char **); +int zpool_prop_string_to_index(zpool_prop_t, const char *, uint64_t *); +uint64_t zpool_prop_random_value(zpool_prop_t, uint64_t seed); + +/* + * Definitions for the Delegation. 
+ */ +typedef enum { + ZFS_DELEG_WHO_UNKNOWN = 0, + ZFS_DELEG_USER = 'u', + ZFS_DELEG_USER_SETS = 'U', + ZFS_DELEG_GROUP = 'g', + ZFS_DELEG_GROUP_SETS = 'G', + ZFS_DELEG_EVERYONE = 'e', + ZFS_DELEG_EVERYONE_SETS = 'E', + ZFS_DELEG_CREATE = 'c', + ZFS_DELEG_CREATE_SETS = 'C', + ZFS_DELEG_NAMED_SET = 's', + ZFS_DELEG_NAMED_SET_SETS = 'S' +} zfs_deleg_who_type_t; + +typedef enum { + ZFS_DELEG_NONE = 0, + ZFS_DELEG_PERM_LOCAL = 1, + ZFS_DELEG_PERM_DESCENDENT = 2, + ZFS_DELEG_PERM_LOCALDESCENDENT = 3, + ZFS_DELEG_PERM_CREATE = 4 +} zfs_deleg_inherit_t; + +#define ZFS_DELEG_PERM_UID "uid" +#define ZFS_DELEG_PERM_GID "gid" +#define ZFS_DELEG_PERM_GROUPS "groups" + +#define ZFS_MLSLABEL_DEFAULT "none" + +#define ZFS_SMB_ACL_SRC "src" +#define ZFS_SMB_ACL_TARGET "target" + +typedef enum { + ZFS_CANMOUNT_OFF = 0, + ZFS_CANMOUNT_ON = 1, + ZFS_CANMOUNT_NOAUTO = 2 +} zfs_canmount_type_t; + +typedef enum { + ZFS_LOGBIAS_LATENCY = 0, + ZFS_LOGBIAS_THROUGHPUT = 1 +} zfs_logbias_op_t; + +typedef enum zfs_share_op { + ZFS_SHARE_NFS = 0, + ZFS_UNSHARE_NFS = 1, + ZFS_SHARE_SMB = 2, + ZFS_UNSHARE_SMB = 3 +} zfs_share_op_t; + +typedef enum zfs_smb_acl_op { + ZFS_SMB_ACL_ADD, + ZFS_SMB_ACL_REMOVE, + ZFS_SMB_ACL_RENAME, + ZFS_SMB_ACL_PURGE +} zfs_smb_acl_op_t; + +typedef enum zfs_cache_type { + ZFS_CACHE_NONE = 0, + ZFS_CACHE_METADATA = 1, + ZFS_CACHE_ALL = 2 +} zfs_cache_type_t; + +typedef enum { + ZFS_SYNC_STANDARD = 0, + ZFS_SYNC_ALWAYS = 1, + ZFS_SYNC_DISABLED = 2 +} zfs_sync_type_t; + +typedef enum { + ZFS_VOLMODE_DEFAULT = 0, + ZFS_VOLMODE_GEOM = 1, + ZFS_VOLMODE_DEV = 2, + ZFS_VOLMODE_NONE = 3 +} zfs_volmode_t; + +typedef enum { + ZFS_DNSIZE_LEGACY = 0, + ZFS_DNSIZE_AUTO = 1, + ZFS_DNSIZE_1K = 1024, + ZFS_DNSIZE_2K = 2048, + ZFS_DNSIZE_4K = 4096, + ZFS_DNSIZE_8K = 8192, + ZFS_DNSIZE_16K = 16384 +} zfs_dnsize_type_t; + +typedef enum { + ZFS_REDUNDANT_METADATA_ALL, + ZFS_REDUNDANT_METADATA_MOST +} zfs_redundant_metadata_type_t; + +/* + * On-disk version number. + */ +#define SPA_VERSION_1 1ULL +#define SPA_VERSION_2 2ULL +#define SPA_VERSION_3 3ULL +#define SPA_VERSION_4 4ULL +#define SPA_VERSION_5 5ULL +#define SPA_VERSION_6 6ULL +#define SPA_VERSION_7 7ULL +#define SPA_VERSION_8 8ULL +#define SPA_VERSION_9 9ULL +#define SPA_VERSION_10 10ULL +#define SPA_VERSION_11 11ULL +#define SPA_VERSION_12 12ULL +#define SPA_VERSION_13 13ULL +#define SPA_VERSION_14 14ULL +#define SPA_VERSION_15 15ULL +#define SPA_VERSION_16 16ULL +#define SPA_VERSION_17 17ULL +#define SPA_VERSION_18 18ULL +#define SPA_VERSION_19 19ULL +#define SPA_VERSION_20 20ULL +#define SPA_VERSION_21 21ULL +#define SPA_VERSION_22 22ULL +#define SPA_VERSION_23 23ULL +#define SPA_VERSION_24 24ULL +#define SPA_VERSION_25 25ULL +#define SPA_VERSION_26 26ULL +#define SPA_VERSION_27 27ULL +#define SPA_VERSION_28 28ULL +#define SPA_VERSION_5000 5000ULL + +/* + * When bumping up SPA_VERSION, make sure GRUB ZFS understands the on-disk + * format change. Go to usr/src/grub/grub-0.97/stage2/{zfs-include/, fsys_zfs*}, + * and do the appropriate changes. Also bump the version number in + * usr/src/grub/capability. + */ +#define SPA_VERSION SPA_VERSION_5000 +#define SPA_VERSION_STRING "5000" + +/* + * Symbolic names for the changes that caused a SPA_VERSION switch. + * Used in the code when checking for presence or absence of a feature. + * Feel free to define multiple symbolic names for each version if there + * were multiple changes to on-disk structures during that version. 
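The versioning scheme above leaves a deliberate hole: legacy versions run 1 through 28, then jump to 5000 once feature flags take over. The SPA_VERSION_IS_SUPPORTED macro a few lines below encodes exactly that, as this standalone sketch (definitions copied from the header) demonstrates:

    #include <stdio.h>
    #include <stdint.h>

    #define SPA_VERSION_INITIAL         1ULL
    #define SPA_VERSION_BEFORE_FEATURES 28ULL
    #define SPA_VERSION_FEATURES        5000ULL
    #define SPA_VERSION                 SPA_VERSION_FEATURES

    #define SPA_VERSION_IS_SUPPORTED(v) \
        (((v) >= SPA_VERSION_INITIAL && (v) <= SPA_VERSION_BEFORE_FEATURES) || \
        ((v) >= SPA_VERSION_FEATURES && (v) <= SPA_VERSION))

    int
    main(void)
    {
        uint64_t v[] = { 1, 28, 29, 4999, 5000, 5001 };
        size_t i;

        /* 29..4999 fall in the gap and are rejected */
        for (i = 0; i < sizeof (v) / sizeof (v[0]); i++)
            (void) printf("version %4llu: %s\n",
                (unsigned long long)v[i],
                SPA_VERSION_IS_SUPPORTED(v[i]) ? "supported" : "not supported");
        return (0);
    }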
+ * + * NOTE: When checking the current SPA_VERSION in your code, be sure + * to use spa_version() since it reports the version of the + * last synced uberblock. Checking the in-flight version can + * be dangerous in some cases. + */ +#define SPA_VERSION_INITIAL SPA_VERSION_1 +#define SPA_VERSION_DITTO_BLOCKS SPA_VERSION_2 +#define SPA_VERSION_SPARES SPA_VERSION_3 +#define SPA_VERSION_RAIDZ2 SPA_VERSION_3 +#define SPA_VERSION_BPOBJ_ACCOUNT SPA_VERSION_3 +#define SPA_VERSION_RAIDZ_DEFLATE SPA_VERSION_3 +#define SPA_VERSION_DNODE_BYTES SPA_VERSION_3 +#define SPA_VERSION_ZPOOL_HISTORY SPA_VERSION_4 +#define SPA_VERSION_GZIP_COMPRESSION SPA_VERSION_5 +#define SPA_VERSION_BOOTFS SPA_VERSION_6 +#define SPA_VERSION_SLOGS SPA_VERSION_7 +#define SPA_VERSION_DELEGATED_PERMS SPA_VERSION_8 +#define SPA_VERSION_FUID SPA_VERSION_9 +#define SPA_VERSION_REFRESERVATION SPA_VERSION_9 +#define SPA_VERSION_REFQUOTA SPA_VERSION_9 +#define SPA_VERSION_UNIQUE_ACCURATE SPA_VERSION_9 +#define SPA_VERSION_L2CACHE SPA_VERSION_10 +#define SPA_VERSION_NEXT_CLONES SPA_VERSION_11 +#define SPA_VERSION_ORIGIN SPA_VERSION_11 +#define SPA_VERSION_DSL_SCRUB SPA_VERSION_11 +#define SPA_VERSION_SNAP_PROPS SPA_VERSION_12 +#define SPA_VERSION_USED_BREAKDOWN SPA_VERSION_13 +#define SPA_VERSION_PASSTHROUGH_X SPA_VERSION_14 +#define SPA_VERSION_USERSPACE SPA_VERSION_15 +#define SPA_VERSION_STMF_PROP SPA_VERSION_16 +#define SPA_VERSION_RAIDZ3 SPA_VERSION_17 +#define SPA_VERSION_USERREFS SPA_VERSION_18 +#define SPA_VERSION_HOLES SPA_VERSION_19 +#define SPA_VERSION_ZLE_COMPRESSION SPA_VERSION_20 +#define SPA_VERSION_DEDUP SPA_VERSION_21 +#define SPA_VERSION_RECVD_PROPS SPA_VERSION_22 +#define SPA_VERSION_SLIM_ZIL SPA_VERSION_23 +#define SPA_VERSION_SA SPA_VERSION_24 +#define SPA_VERSION_SCAN SPA_VERSION_25 +#define SPA_VERSION_DIR_CLONES SPA_VERSION_26 +#define SPA_VERSION_DEADLISTS SPA_VERSION_26 +#define SPA_VERSION_FAST_SNAP SPA_VERSION_27 +#define SPA_VERSION_MULTI_REPLACE SPA_VERSION_28 +#define SPA_VERSION_BEFORE_FEATURES SPA_VERSION_28 +#define SPA_VERSION_FEATURES SPA_VERSION_5000 + +#define SPA_VERSION_IS_SUPPORTED(v) \ + (((v) >= SPA_VERSION_INITIAL && (v) <= SPA_VERSION_BEFORE_FEATURES) || \ + ((v) >= SPA_VERSION_FEATURES && (v) <= SPA_VERSION)) + +/* + * ZPL version - rev'd whenever an incompatible on-disk format change + * occurs. This is independent of SPA/DMU/ZAP versioning. You must + * also update the version_table[] and help message in zfs_prop.c. + * + * When changing, be sure to teach GRUB how to read the new format! 
+ * See usr/src/grub/grub-0.97/stage2/{zfs-include/,fsys_zfs*} + */ +#define ZPL_VERSION_1 1ULL +#define ZPL_VERSION_2 2ULL +#define ZPL_VERSION_3 3ULL +#define ZPL_VERSION_4 4ULL +#define ZPL_VERSION_5 5ULL +#define ZPL_VERSION ZPL_VERSION_5 +#define ZPL_VERSION_STRING "5" + +#define ZPL_VERSION_INITIAL ZPL_VERSION_1 +#define ZPL_VERSION_DIRENT_TYPE ZPL_VERSION_2 +#define ZPL_VERSION_FUID ZPL_VERSION_3 +#define ZPL_VERSION_NORMALIZATION ZPL_VERSION_3 +#define ZPL_VERSION_SYSATTR ZPL_VERSION_3 +#define ZPL_VERSION_USERSPACE ZPL_VERSION_4 +#define ZPL_VERSION_SA ZPL_VERSION_5 + +/* Rewind policy information */ +#define ZPOOL_NO_REWIND 1 /* No policy - default behavior */ +#define ZPOOL_NEVER_REWIND 2 /* Do not search for best txg or rewind */ +#define ZPOOL_TRY_REWIND 4 /* Search for best txg, but do not rewind */ +#define ZPOOL_DO_REWIND 8 /* Rewind to best txg w/in deferred frees */ +#define ZPOOL_EXTREME_REWIND 16 /* Allow extreme measures to find best txg */ +#define ZPOOL_REWIND_MASK 28 /* All the possible rewind bits */ +#define ZPOOL_REWIND_POLICIES 31 /* All the possible policy bits */ + +typedef struct zpool_load_policy { + uint32_t zlp_rewind; /* rewind policy requested */ + uint64_t zlp_maxmeta; /* max acceptable meta-data errors */ + uint64_t zlp_maxdata; /* max acceptable data errors */ + uint64_t zlp_txg; /* specific txg to load */ +} zpool_load_policy_t; + +/* + * The following are configuration names used in the nvlist describing a pool's + * configuration. New on-disk names should be prefixed with "<reverse-DNS>:" + * (e.g. "org.open-zfs:") to avoid conflicting names being developed + * independently. + */ +#define ZPOOL_CONFIG_VERSION "version" +#define ZPOOL_CONFIG_POOL_NAME "name" +#define ZPOOL_CONFIG_POOL_STATE "state" +#define ZPOOL_CONFIG_POOL_TXG "txg" +#define ZPOOL_CONFIG_POOL_GUID "pool_guid" +#define ZPOOL_CONFIG_CREATE_TXG "create_txg" +#define ZPOOL_CONFIG_TOP_GUID "top_guid" +#define ZPOOL_CONFIG_VDEV_TREE "vdev_tree" +#define ZPOOL_CONFIG_TYPE "type" +#define ZPOOL_CONFIG_CHILDREN "children" +#define ZPOOL_CONFIG_ID "id" +#define ZPOOL_CONFIG_GUID "guid" +#define ZPOOL_CONFIG_INDIRECT_OBJECT "com.delphix:indirect_object" +#define ZPOOL_CONFIG_INDIRECT_BIRTHS "com.delphix:indirect_births" +#define ZPOOL_CONFIG_PREV_INDIRECT_VDEV "com.delphix:prev_indirect_vdev" +#define ZPOOL_CONFIG_PATH "path" +#define ZPOOL_CONFIG_DEVID "devid" +#define ZPOOL_CONFIG_METASLAB_ARRAY "metaslab_array" +#define ZPOOL_CONFIG_METASLAB_SHIFT "metaslab_shift" +#define ZPOOL_CONFIG_ASHIFT "ashift" +#define ZPOOL_CONFIG_ASIZE "asize" +#define ZPOOL_CONFIG_DTL "DTL" +#define ZPOOL_CONFIG_SCAN_STATS "scan_stats" /* not stored on disk */ +#define ZPOOL_CONFIG_REMOVAL_STATS "removal_stats" /* not stored on disk */ +#define ZPOOL_CONFIG_CHECKPOINT_STATS "checkpoint_stats" /* not on disk */ +#define ZPOOL_CONFIG_VDEV_STATS "vdev_stats" /* not stored on disk */ +#define ZPOOL_CONFIG_INDIRECT_SIZE "indirect_size" /* not stored on disk */ +#define ZPOOL_CONFIG_WHOLE_DISK "whole_disk" +#define ZPOOL_CONFIG_ERRCOUNT "error_count" +#define ZPOOL_CONFIG_NOT_PRESENT "not_present" +#define ZPOOL_CONFIG_SPARES "spares" +#define ZPOOL_CONFIG_IS_SPARE "is_spare" +#define ZPOOL_CONFIG_NPARITY "nparity" +#define ZPOOL_CONFIG_HOSTID "hostid" +#define ZPOOL_CONFIG_HOSTNAME "hostname" +#define ZPOOL_CONFIG_LOADED_TIME "initial_load_time" +#define ZPOOL_CONFIG_UNSPARE "unspare" +#define ZPOOL_CONFIG_PHYS_PATH "phys_path" +#define ZPOOL_CONFIG_IS_LOG "is_log" +#define ZPOOL_CONFIG_L2CACHE "l2cache" +#define 
ZPOOL_CONFIG_HOLE_ARRAY "hole_array" +#define ZPOOL_CONFIG_VDEV_CHILDREN "vdev_children" +#define ZPOOL_CONFIG_IS_HOLE "is_hole" +#define ZPOOL_CONFIG_DDT_HISTOGRAM "ddt_histogram" +#define ZPOOL_CONFIG_DDT_OBJ_STATS "ddt_object_stats" +#define ZPOOL_CONFIG_DDT_STATS "ddt_stats" +#define ZPOOL_CONFIG_SPLIT "splitcfg" +#define ZPOOL_CONFIG_ORIG_GUID "orig_guid" +#define ZPOOL_CONFIG_SPLIT_GUID "split_guid" +#define ZPOOL_CONFIG_SPLIT_LIST "guid_list" +#define ZPOOL_CONFIG_REMOVING "removing" +#define ZPOOL_CONFIG_RESILVER_TXG "resilver_txg" +#define ZPOOL_CONFIG_COMMENT "comment" +#define ZPOOL_CONFIG_SUSPENDED "suspended" /* not stored on disk */ +#define ZPOOL_CONFIG_TIMESTAMP "timestamp" /* not stored on disk */ +#define ZPOOL_CONFIG_BOOTFS "bootfs" /* not stored on disk */ +#define ZPOOL_CONFIG_MISSING_DEVICES "missing_vdevs" /* not stored on disk */ +#define ZPOOL_CONFIG_LOAD_INFO "load_info" /* not stored on disk */ +#define ZPOOL_CONFIG_REWIND_INFO "rewind_info" /* not stored on disk */ +#define ZPOOL_CONFIG_UNSUP_FEAT "unsup_feat" /* not stored on disk */ +#define ZPOOL_CONFIG_ENABLED_FEAT "enabled_feat" /* not stored on disk */ +#define ZPOOL_CONFIG_CAN_RDONLY "can_rdonly" /* not stored on disk */ +#define ZPOOL_CONFIG_FEATURES_FOR_READ "features_for_read" +#define ZPOOL_CONFIG_FEATURE_STATS "feature_stats" /* not stored on disk */ +#define ZPOOL_CONFIG_VDEV_TOP_ZAP "com.delphix:vdev_zap_top" +#define ZPOOL_CONFIG_VDEV_LEAF_ZAP "com.delphix:vdev_zap_leaf" +#define ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS "com.delphix:has_per_vdev_zaps" +#define ZPOOL_CONFIG_CACHEFILE "cachefile" /* not stored on disk */ +/* + * The persistent vdev state is stored as separate values rather than a single + * 'vdev_state' entry. This is because a device can be in multiple states, such + * as offline and degraded. + */ +#define ZPOOL_CONFIG_OFFLINE "offline" +#define ZPOOL_CONFIG_FAULTED "faulted" +#define ZPOOL_CONFIG_DEGRADED "degraded" +#define ZPOOL_CONFIG_REMOVED "removed" +#define ZPOOL_CONFIG_FRU "fru" +#define ZPOOL_CONFIG_AUX_STATE "aux_state" + +/* Pool load policy parameters */ +#define ZPOOL_LOAD_POLICY "load-policy" +#define ZPOOL_LOAD_REWIND_POLICY "load-rewind-policy" +#define ZPOOL_LOAD_REQUEST_TXG "load-request-txg" +#define ZPOOL_LOAD_META_THRESH "load-meta-thresh" +#define ZPOOL_LOAD_DATA_THRESH "load-data-thresh" + +/* Rewind data discovered */ +#define ZPOOL_CONFIG_LOAD_TIME "rewind_txg_ts" +#define ZPOOL_CONFIG_LOAD_DATA_ERRORS "verify_data_errors" +#define ZPOOL_CONFIG_REWIND_TIME "seconds_of_rewind" + +#define VDEV_TYPE_ROOT "root" +#define VDEV_TYPE_MIRROR "mirror" +#define VDEV_TYPE_REPLACING "replacing" +#define VDEV_TYPE_RAIDZ "raidz" +#define VDEV_TYPE_DISK "disk" +#define VDEV_TYPE_FILE "file" +#define VDEV_TYPE_MISSING "missing" +#define VDEV_TYPE_HOLE "hole" +#define VDEV_TYPE_SPARE "spare" +#define VDEV_TYPE_LOG "log" +#define VDEV_TYPE_L2CACHE "l2cache" +#define VDEV_TYPE_INDIRECT "indirect" + +/* VDEV_TOP_ZAP_* are used in top-level vdev ZAP objects. 
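The ZPOOL_CONFIG_* constants above are nvlist key strings, so a pool configuration is just name/value pairs. A hedged sketch of building such an nvlist in userland, assuming the illumos/FreeBSD libnvpair is available (link with -lnvpair); the pool name and GUID are made up:

    #include <stdio.h>
    #include <libnvpair.h>

    #define ZPOOL_CONFIG_VERSION    "version"
    #define ZPOOL_CONFIG_POOL_NAME  "name"
    #define ZPOOL_CONFIG_POOL_GUID  "pool_guid"

    int
    main(void)
    {
        nvlist_t *config;

        if (nvlist_alloc(&config, NV_UNIQUE_NAME, 0) != 0)
            return (1);
        (void) nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, 5000);
        (void) nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, "tank");
        (void) nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
            0x1234567890abcdefULL);
        nvlist_print(stdout, config);   /* dump the pairs for inspection */
        nvlist_free(config);
        return (0);
    }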
*/ +#define VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM \ + "com.delphix:indirect_obsolete_sm" +#define VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE \ + "com.delphix:obsolete_counts_are_precise" +#define VDEV_TOP_ZAP_POOL_CHECKPOINT_SM \ + "com.delphix:pool_checkpoint_sm" + +#define VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET \ + "com.delphix:next_offset_to_initialize" +#define VDEV_LEAF_ZAP_INITIALIZE_STATE \ + "com.delphix:vdev_initialize_state" +#define VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME \ + "com.delphix:vdev_initialize_action_time" + +/* + * This is needed in userland to report the minimum necessary device size. + * + * Note that the zfs test suite uses 64MB vdevs. + */ +#define SPA_MINDEVSIZE (64ULL << 20) + +/* + * Set if the fragmentation has not yet been calculated. This can happen + * because the space maps have not been upgraded or the histogram feature + * is not enabled. + */ +#define ZFS_FRAG_INVALID UINT64_MAX + +/* + * The location of the pool configuration repository, shared between kernel and + * userland. + */ +#define ZPOOL_CACHE "/boot/zfs/zpool.cache" + +/* + * vdev states are ordered from least to most healthy. + * A vdev that's CANT_OPEN or below is considered unusable. + */ +typedef enum vdev_state { + VDEV_STATE_UNKNOWN = 0, /* Uninitialized vdev */ + VDEV_STATE_CLOSED, /* Not currently open */ + VDEV_STATE_OFFLINE, /* Not allowed to open */ + VDEV_STATE_REMOVED, /* Explicitly removed from system */ + VDEV_STATE_CANT_OPEN, /* Tried to open, but failed */ + VDEV_STATE_FAULTED, /* External request to fault device */ + VDEV_STATE_DEGRADED, /* Replicated vdev with unhealthy kids */ + VDEV_STATE_HEALTHY /* Presumed good */ +} vdev_state_t; + +#define VDEV_STATE_ONLINE VDEV_STATE_HEALTHY + +/* + * vdev aux states. When a vdev is in the CANT_OPEN state, the aux field + * of the vdev stats structure uses these constants to distinguish why. + */ +typedef enum vdev_aux { + VDEV_AUX_NONE, /* no error */ + VDEV_AUX_OPEN_FAILED, /* ldi_open_*() or vn_open() failed */ + VDEV_AUX_CORRUPT_DATA, /* bad label or disk contents */ + VDEV_AUX_NO_REPLICAS, /* insufficient number of replicas */ + VDEV_AUX_BAD_GUID_SUM, /* vdev guid sum doesn't match */ + VDEV_AUX_TOO_SMALL, /* vdev size is too small */ + VDEV_AUX_BAD_LABEL, /* the label is OK but invalid */ + VDEV_AUX_VERSION_NEWER, /* on-disk version is too new */ + VDEV_AUX_VERSION_OLDER, /* on-disk version is too old */ + VDEV_AUX_UNSUP_FEAT, /* unsupported features */ + VDEV_AUX_SPARED, /* hot spare used in another pool */ + VDEV_AUX_ERR_EXCEEDED, /* too many errors */ + VDEV_AUX_IO_FAILURE, /* experienced I/O failure */ + VDEV_AUX_BAD_LOG, /* cannot read log chain(s) */ + VDEV_AUX_EXTERNAL, /* external diagnosis */ + VDEV_AUX_SPLIT_POOL, /* vdev was split off into another pool */ + VDEV_AUX_ASHIFT_TOO_BIG, /* vdev's min block size is too large */ + VDEV_AUX_CHILDREN_OFFLINE /* all children are offline */ +} vdev_aux_t; + +/* + * pool state. The following states are written to disk as part of the normal + * SPA lifecycle: ACTIVE, EXPORTED, DESTROYED, SPARE, L2CACHE. The remaining + * states are software abstractions used at various levels to communicate + * pool state. 
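+ *
+ * As a hedged example of consumption (all names are from this header,
+ * except "config", which is assumed to be a pool config nvlist):
+ *
+ *	uint64_t state;
+ *	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
+ *	    &state) == 0 && state == POOL_STATE_EXPORTED)
+ *		(the pool was cleanly exported)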
+ */
+typedef enum pool_state {
+	POOL_STATE_ACTIVE = 0,		/* In active use */
+	POOL_STATE_EXPORTED,		/* Explicitly exported */
+	POOL_STATE_DESTROYED,		/* Explicitly destroyed */
+	POOL_STATE_SPARE,		/* Reserved for hot spare use */
+	POOL_STATE_L2CACHE,		/* Level 2 ARC device */
+	POOL_STATE_UNINITIALIZED,	/* Internal spa_t state */
+	POOL_STATE_UNAVAIL,		/* Internal libzfs state */
+	POOL_STATE_POTENTIALLY_ACTIVE	/* Internal libzfs state */
+} pool_state_t;
+
+/*
+ * Scan Functions.
+ */
+typedef enum pool_scan_func {
+	POOL_SCAN_NONE,
+	POOL_SCAN_SCRUB,
+	POOL_SCAN_RESILVER,
+	POOL_SCAN_FUNCS
+} pool_scan_func_t;
+
+/*
+ * Used to control scrub pause and resume.
+ */
+typedef enum pool_scrub_cmd {
+	POOL_SCRUB_NORMAL = 0,
+	POOL_SCRUB_PAUSE,
+	POOL_SCRUB_FLAGS_END
+} pool_scrub_cmd_t;
+
+/*
+ * Initialize functions.
+ */
+typedef enum pool_initialize_func {
+	POOL_INITIALIZE_DO,
+	POOL_INITIALIZE_CANCEL,
+	POOL_INITIALIZE_SUSPEND,
+	POOL_INITIALIZE_FUNCS
+} pool_initialize_func_t;
+
+/*
+ * ZIO types. Needed to interpret vdev statistics below.
+ */
+typedef enum zio_type {
+	ZIO_TYPE_NULL = 0,
+	ZIO_TYPE_READ,
+	ZIO_TYPE_WRITE,
+	ZIO_TYPE_FREE,
+	ZIO_TYPE_CLAIM,
+	ZIO_TYPE_IOCTL,
+	ZIO_TYPES
+} zio_type_t;
+
+/*
+ * Pool statistics. Note: all fields should be 64-bit because this
+ * is passed between kernel and userland as an nvlist uint64 array.
+ */
+typedef struct pool_scan_stat {
+	/* values stored on disk */
+	uint64_t	pss_func;	/* pool_scan_func_t */
+	uint64_t	pss_state;	/* dsl_scan_state_t */
+	uint64_t	pss_start_time;	/* scan start time */
+	uint64_t	pss_end_time;	/* scan end time */
+	uint64_t	pss_to_examine;	/* total bytes to scan */
+	uint64_t	pss_examined;	/* total bytes located by scanner */
+	uint64_t	pss_to_process;	/* total bytes to process */
+	uint64_t	pss_processed;	/* total processed bytes */
+	uint64_t	pss_errors;	/* scan errors */
+
+	/* values not stored on disk */
+	uint64_t	pss_pass_exam;	/* examined bytes per scan pass */
+	uint64_t	pss_pass_start;	/* start time of a scan pass */
+	uint64_t	pss_pass_scrub_pause; /* pause time of a scrub pass */
+	/* cumulative time scrub spent paused, needed for rate calculation */
+	uint64_t	pss_pass_scrub_spent_paused;
+
+	/* Sorted scrubbing new fields */
+	/* Stored on disk */
+	uint64_t	pss_issued;	/* total bytes checked by scanner */
+	/* Not stored on disk */
+	uint64_t	pss_pass_issued; /* issued bytes per scan pass */
+} pool_scan_stat_t;
+
+typedef struct pool_removal_stat {
+	uint64_t	prs_state;	/* dsl_scan_state_t */
+	uint64_t	prs_removing_vdev;
+	uint64_t	prs_start_time;
+	uint64_t	prs_end_time;
+	uint64_t	prs_to_copy;	/* bytes that need to be copied */
+	uint64_t	prs_copied;	/* bytes copied so far */
+	/*
+	 * bytes of memory used for indirect mappings.
+	 * This includes all removed vdevs.
+	 */
+	uint64_t	prs_mapping_memory;
+} pool_removal_stat_t;
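+
+/*
+ * The scan statistics above travel between kernel and userland as a flat
+ * uint64 array in the config nvlist. A minimal, illustrative sketch
+ * (assuming "nvroot" is the root vdev nvlist from a pool config; error
+ * handling elided):
+ *
+ *	uint64_t *ps;
+ *	uint_t c;
+ *
+ *	if (nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_SCAN_STATS,
+ *	    &ps, &c) == 0 &&
+ *	    c * sizeof (uint64_t) >= sizeof (pool_scan_stat_t)) {
+ *		pool_scan_stat_t *pss = (pool_scan_stat_t *)ps;
+ *		(pss->pss_state == DSS_SCANNING means a scan is in progress)
+ *	}
+ */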
+
+typedef enum dsl_scan_state {
+	DSS_NONE,
+	DSS_SCANNING,
+	DSS_FINISHED,
+	DSS_CANCELED,
+	DSS_NUM_STATES
+} dsl_scan_state_t;
+
+typedef enum {
+	CS_NONE,
+	CS_CHECKPOINT_EXISTS,
+	CS_CHECKPOINT_DISCARDING,
+	CS_NUM_STATES
+} checkpoint_state_t;
+
+typedef struct pool_checkpoint_stat {
+	uint64_t	pcs_state;	/* checkpoint_state_t */
+	uint64_t	pcs_start_time;	/* time checkpoint/discard started */
+	uint64_t	pcs_space;	/* checkpointed space */
+} pool_checkpoint_stat_t;
+
+typedef enum {
+	VDEV_INITIALIZE_NONE,
+	VDEV_INITIALIZE_ACTIVE,
+	VDEV_INITIALIZE_CANCELED,
+	VDEV_INITIALIZE_SUSPENDED,
+	VDEV_INITIALIZE_COMPLETE
+} vdev_initializing_state_t;
+
+/*
+ * Vdev statistics. Note: all fields should be 64-bit because this
+ * is passed between kernel and userland as an nvlist uint64 array.
+ */
+typedef struct vdev_stat {
+	hrtime_t	vs_timestamp;		/* time since vdev load */
+	uint64_t	vs_state;		/* vdev state */
+	uint64_t	vs_aux;			/* see vdev_aux_t */
+	uint64_t	vs_alloc;		/* space allocated */
+	uint64_t	vs_space;		/* total capacity */
+	uint64_t	vs_dspace;		/* deflated capacity */
+	uint64_t	vs_rsize;		/* replaceable dev size */
+	uint64_t	vs_esize;		/* expandable dev size */
+	uint64_t	vs_ops[ZIO_TYPES];	/* operation count */
+	uint64_t	vs_bytes[ZIO_TYPES];	/* bytes read/written */
+	uint64_t	vs_read_errors;		/* read errors */
+	uint64_t	vs_write_errors;	/* write errors */
+	uint64_t	vs_checksum_errors;	/* checksum errors */
+	uint64_t	vs_self_healed;		/* self-healed bytes */
+	uint64_t	vs_scan_removing;	/* removing? */
+	uint64_t	vs_scan_processed;	/* scan processed bytes */
+	uint64_t	vs_configured_ashift;	/* TLV vdev_ashift */
+	uint64_t	vs_logical_ashift;	/* vdev_logical_ashift */
+	uint64_t	vs_physical_ashift;	/* vdev_physical_ashift */
+	uint64_t	vs_fragmentation;	/* device fragmentation */
+	uint64_t	vs_checkpoint_space;	/* checkpoint-consumed space */
+	uint64_t	vs_initialize_errors;	/* initializing errors */
+	uint64_t	vs_initialize_bytes_done; /* bytes initialized */
+	uint64_t	vs_initialize_bytes_est; /* total bytes to initialize */
+	uint64_t	vs_initialize_state;	/* vdev_initializing_state_t */
+	uint64_t	vs_initialize_action_time; /* time_t */
+} vdev_stat_t;
+#define	VDEV_STAT_VALID(field, uint64_t_field_count) \
+	((uint64_t_field_count * sizeof(uint64_t)) >= \
+	(offsetof(vdev_stat_t, field) + sizeof(((vdev_stat_t *)NULL)->field)))
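+
+/*
+ * Illustrative use of VDEV_STAT_VALID() (a sketch only; "nv" is assumed
+ * to be a per-vdev nvlist): because a sender may ship a shorter
+ * vdev_stat_t than the receiver was compiled with, consumers guard the
+ * newer fields by array length:
+ *
+ *	vdev_stat_t *vs;
+ *	uint_t c;
+ *
+ *	if (nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
+ *	    (uint64_t **)&vs, &c) == 0 &&
+ *	    VDEV_STAT_VALID(vs_fragmentation, c))
+ *		(vs->vs_fragmentation is safe to read)
+ */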
+
+/*
+ * DDT statistics. Note: all fields should be 64-bit because this
+ * is passed between kernel and userland as an nvlist uint64 array.
+ */
+typedef struct ddt_object {
+	uint64_t	ddo_count;	/* number of elements in ddt */
+	uint64_t	ddo_dspace;	/* size of ddt on disk */
+	uint64_t	ddo_mspace;	/* size of ddt in-core */
+} ddt_object_t;
+
+typedef struct ddt_stat {
+	uint64_t	dds_blocks;	/* blocks */
+	uint64_t	dds_lsize;	/* logical size */
+	uint64_t	dds_psize;	/* physical size */
+	uint64_t	dds_dsize;	/* deflated allocated size */
+	uint64_t	dds_ref_blocks;	/* referenced blocks */
+	uint64_t	dds_ref_lsize;	/* referenced lsize * refcnt */
+	uint64_t	dds_ref_psize;	/* referenced psize * refcnt */
+	uint64_t	dds_ref_dsize;	/* referenced dsize * refcnt */
+} ddt_stat_t;
+
+typedef struct ddt_histogram {
+	ddt_stat_t	ddh_stat[64];	/* power-of-two histogram buckets */
+} ddt_histogram_t;
+
+#define	ZVOL_DRIVER	"zvol"
+#define	ZFS_DRIVER	"zfs"
+#define	ZFS_DEV_NAME	"zfs"
+#define	ZFS_DEV		"/dev/" ZFS_DEV_NAME
+#define	ZFS_DISK_ROOT	"/dev/dsk"
+#define	ZFS_DISK_ROOTD	ZFS_DISK_ROOT "/"
+#define	ZFS_RDISK_ROOT	"/dev/rdsk"
+#define	ZFS_RDISK_ROOTD	ZFS_RDISK_ROOT "/"
+
+/* general zvol path */
+#define	ZVOL_DIR	"/dev/zvol"
+/* expansion */
+#define	ZVOL_PSEUDO_DEV	"/devices/pseudo/zfs@0:"
+/* for dump and swap */
+#define	ZVOL_FULL_DEV_DIR	ZVOL_DIR "/dsk/"
+#define	ZVOL_FULL_RDEV_DIR	ZVOL_DIR "/rdsk/"
+
+#define	ZVOL_PROP_NAME		"name"
+#define	ZVOL_DEFAULT_BLOCKSIZE	8192
+
+/*
+ * /dev/zfs ioctl numbers.
+ */
+typedef enum zfs_ioc {
+	ZFS_IOC_FIRST = 0,
+	ZFS_IOC_POOL_CREATE = ZFS_IOC_FIRST,
+	ZFS_IOC_POOL_DESTROY,
+	ZFS_IOC_POOL_IMPORT,
+	ZFS_IOC_POOL_EXPORT,
+	ZFS_IOC_POOL_CONFIGS,
+	ZFS_IOC_POOL_STATS,
+	ZFS_IOC_POOL_TRYIMPORT,
+	ZFS_IOC_POOL_SCAN,
+	ZFS_IOC_POOL_FREEZE,
+	ZFS_IOC_POOL_UPGRADE,
+	ZFS_IOC_POOL_GET_HISTORY,
+	ZFS_IOC_VDEV_ADD,
+	ZFS_IOC_VDEV_REMOVE,
+	ZFS_IOC_VDEV_SET_STATE,
+	ZFS_IOC_VDEV_ATTACH,
+	ZFS_IOC_VDEV_DETACH,
+	ZFS_IOC_VDEV_SETPATH,
+	ZFS_IOC_VDEV_SETFRU,
+	ZFS_IOC_OBJSET_STATS,
+	ZFS_IOC_OBJSET_ZPLPROPS,
+	ZFS_IOC_DATASET_LIST_NEXT,
+	ZFS_IOC_SNAPSHOT_LIST_NEXT,
+	ZFS_IOC_SET_PROP,
+	ZFS_IOC_CREATE,
+	ZFS_IOC_DESTROY,
+	ZFS_IOC_ROLLBACK,
+	ZFS_IOC_RENAME,
+	ZFS_IOC_RECV,
+	ZFS_IOC_SEND,
+	ZFS_IOC_INJECT_FAULT,
+	ZFS_IOC_CLEAR_FAULT,
+	ZFS_IOC_INJECT_LIST_NEXT,
+	ZFS_IOC_ERROR_LOG,
+	ZFS_IOC_CLEAR,
+	ZFS_IOC_PROMOTE,
+	ZFS_IOC_DESTROY_SNAPS,
+	ZFS_IOC_SNAPSHOT,
+	ZFS_IOC_DSOBJ_TO_DSNAME,
+	ZFS_IOC_OBJ_TO_PATH,
+	ZFS_IOC_POOL_SET_PROPS,
+	ZFS_IOC_POOL_GET_PROPS,
+	ZFS_IOC_SET_FSACL,
+	ZFS_IOC_GET_FSACL,
+	ZFS_IOC_SHARE,
+	ZFS_IOC_INHERIT_PROP,
+	ZFS_IOC_SMB_ACL,
+	ZFS_IOC_USERSPACE_ONE,
+	ZFS_IOC_USERSPACE_MANY,
+	ZFS_IOC_USERSPACE_UPGRADE,
+	ZFS_IOC_HOLD,
+	ZFS_IOC_RELEASE,
+	ZFS_IOC_GET_HOLDS,
+	ZFS_IOC_OBJSET_RECVD_PROPS,
+	ZFS_IOC_VDEV_SPLIT,
+	ZFS_IOC_NEXT_OBJ,
+	ZFS_IOC_DIFF,
+	ZFS_IOC_TMP_SNAPSHOT,
+	ZFS_IOC_OBJ_TO_STATS,
+	ZFS_IOC_JAIL,
+	ZFS_IOC_UNJAIL,
+	ZFS_IOC_POOL_REGUID,
+	ZFS_IOC_SPACE_WRITTEN,
+	ZFS_IOC_SPACE_SNAPS,
+	ZFS_IOC_SEND_PROGRESS,
+	ZFS_IOC_POOL_REOPEN,
+	ZFS_IOC_LOG_HISTORY,
+	ZFS_IOC_SEND_NEW,
+	ZFS_IOC_SEND_SPACE,
+	ZFS_IOC_CLONE,
+	ZFS_IOC_BOOKMARK,
+	ZFS_IOC_GET_BOOKMARKS,
+	ZFS_IOC_DESTROY_BOOKMARKS,
+#ifdef __FreeBSD__
+	ZFS_IOC_NEXTBOOT,
+#endif
+	ZFS_IOC_CHANNEL_PROGRAM,
+	ZFS_IOC_REMAP,
+	ZFS_IOC_POOL_CHECKPOINT,
+	ZFS_IOC_POOL_DISCARD_CHECKPOINT,
+	ZFS_IOC_POOL_INITIALIZE,
+	ZFS_IOC_LAST
+} zfs_ioc_t;
+
+/*
+ * ZFS-specific error codes used for returning descriptive errors
+ * to the userland through zfs ioctls.
+ *
+ * The enum implicitly includes all the error codes from errno.h.
+ * New code should use and extend this enum for errors that are + * not described precisely by generic errno codes. + */ +typedef enum { + ZFS_ERR_CHECKPOINT_EXISTS = 1024, + ZFS_ERR_DISCARDING_CHECKPOINT, + ZFS_ERR_NO_CHECKPOINT, + ZFS_ERR_DEVRM_IN_PROGRESS, + ZFS_ERR_VDEV_TOO_BIG +} zfs_errno_t; + +/* + * Internal SPA load state. Used by FMA diagnosis engine. + */ +typedef enum { + SPA_LOAD_NONE, /* no load in progress */ + SPA_LOAD_OPEN, /* normal open */ + SPA_LOAD_IMPORT, /* import in progress */ + SPA_LOAD_TRYIMPORT, /* tryimport in progress */ + SPA_LOAD_RECOVER, /* recovery requested */ + SPA_LOAD_ERROR, /* load failed */ + SPA_LOAD_CREATE /* creation in progress */ +} spa_load_state_t; + +/* + * Bookmark name values. + */ +#define ZPOOL_ERR_LIST "error list" +#define ZPOOL_ERR_DATASET "dataset" +#define ZPOOL_ERR_OBJECT "object" + +#define HIS_MAX_RECORD_LEN (MAXPATHLEN + MAXPATHLEN + 1) + +/* + * The following are names used in the nvlist describing + * the pool's history log. + */ +#define ZPOOL_HIST_RECORD "history record" +#define ZPOOL_HIST_TIME "history time" +#define ZPOOL_HIST_CMD "history command" +#define ZPOOL_HIST_WHO "history who" +#define ZPOOL_HIST_ZONE "history zone" +#define ZPOOL_HIST_HOST "history hostname" +#define ZPOOL_HIST_TXG "history txg" +#define ZPOOL_HIST_INT_EVENT "history internal event" +#define ZPOOL_HIST_INT_STR "history internal str" +#define ZPOOL_HIST_INT_NAME "internal_name" +#define ZPOOL_HIST_IOCTL "ioctl" +#define ZPOOL_HIST_INPUT_NVL "in_nvl" +#define ZPOOL_HIST_OUTPUT_NVL "out_nvl" +#define ZPOOL_HIST_DSNAME "dsname" +#define ZPOOL_HIST_DSID "dsid" +#define ZPOOL_HIST_ERRNO "errno" + +/* + * The following are names used when invoking ZFS_IOC_POOL_INITIALIZE. + */ +#define ZPOOL_INITIALIZE_COMMAND "initialize_command" +#define ZPOOL_INITIALIZE_VDEVS "initialize_vdevs" + +/* + * Flags for ZFS_IOC_VDEV_SET_STATE + */ +#define ZFS_ONLINE_CHECKREMOVE 0x1 +#define ZFS_ONLINE_UNSPARE 0x2 +#define ZFS_ONLINE_FORCEFAULT 0x4 +#define ZFS_ONLINE_EXPAND 0x8 +#define ZFS_OFFLINE_TEMPORARY 0x1 + +/* + * Flags for ZFS_IOC_POOL_IMPORT + */ +#define ZFS_IMPORT_NORMAL 0x0 +#define ZFS_IMPORT_VERBATIM 0x1 +#define ZFS_IMPORT_ANY_HOST 0x2 +#define ZFS_IMPORT_MISSING_LOG 0x4 +#define ZFS_IMPORT_ONLY 0x8 +#define ZFS_IMPORT_CHECKPOINT 0x10 +#define ZFS_IMPORT_TEMP_NAME 0x20 + +/* + * Channel program argument/return nvlist keys and defaults. + */ +#define ZCP_ARG_PROGRAM "program" +#define ZCP_ARG_ARGLIST "arg" +#define ZCP_ARG_SYNC "sync" +#define ZCP_ARG_INSTRLIMIT "instrlimit" +#define ZCP_ARG_MEMLIMIT "memlimit" + +#define ZCP_ARG_CLIARGV "argv" + +#define ZCP_RET_ERROR "error" +#define ZCP_RET_RETURN "return" + +#define ZCP_DEFAULT_INSTRLIMIT (10 * 1000 * 1000) +#define ZCP_MAX_INSTRLIMIT (10 * ZCP_DEFAULT_INSTRLIMIT) +#define ZCP_DEFAULT_MEMLIMIT (10 * 1024 * 1024) +#define ZCP_MAX_MEMLIMIT (10 * ZCP_DEFAULT_MEMLIMIT) + +/* + * Sysevent payload members. 
ZFS will generate the following sysevents with the + * given payloads: + * + * ESC_ZFS_RESILVER_START + * ESC_ZFS_RESILVER_END + * ESC_ZFS_POOL_DESTROY + * ESC_ZFS_POOL_REGUID + * + * ZFS_EV_POOL_NAME DATA_TYPE_STRING + * ZFS_EV_POOL_GUID DATA_TYPE_UINT64 + * + * ESC_ZFS_VDEV_REMOVE + * ESC_ZFS_VDEV_CLEAR + * ESC_ZFS_VDEV_CHECK + * + * ZFS_EV_POOL_NAME DATA_TYPE_STRING + * ZFS_EV_POOL_GUID DATA_TYPE_UINT64 + * ZFS_EV_VDEV_PATH DATA_TYPE_STRING (optional) + * ZFS_EV_VDEV_GUID DATA_TYPE_UINT64 + * + * ESC_ZFS_HISTORY_EVENT + * + * ZFS_EV_POOL_NAME DATA_TYPE_STRING + * ZFS_EV_POOL_GUID DATA_TYPE_UINT64 + * ZFS_EV_HIST_TIME DATA_TYPE_UINT64 (optional) + * ZFS_EV_HIST_CMD DATA_TYPE_STRING (optional) + * ZFS_EV_HIST_WHO DATA_TYPE_UINT64 (optional) + * ZFS_EV_HIST_ZONE DATA_TYPE_STRING (optional) + * ZFS_EV_HIST_HOST DATA_TYPE_STRING (optional) + * ZFS_EV_HIST_TXG DATA_TYPE_UINT64 (optional) + * ZFS_EV_HIST_INT_EVENT DATA_TYPE_UINT64 (optional) + * ZFS_EV_HIST_INT_STR DATA_TYPE_STRING (optional) + * ZFS_EV_HIST_INT_NAME DATA_TYPE_STRING (optional) + * ZFS_EV_HIST_IOCTL DATA_TYPE_STRING (optional) + * ZFS_EV_HIST_DSNAME DATA_TYPE_STRING (optional) + * ZFS_EV_HIST_DSID DATA_TYPE_UINT64 (optional) + * + * The ZFS_EV_HIST_* members will correspond to the ZPOOL_HIST_* members in the + * history log nvlist. The keynames will be free of any spaces or other + * characters that could be potentially unexpected to consumers of the + * sysevents. + */ +#define ZFS_EV_POOL_NAME "pool_name" +#define ZFS_EV_POOL_GUID "pool_guid" +#define ZFS_EV_VDEV_PATH "vdev_path" +#define ZFS_EV_VDEV_GUID "vdev_guid" +#define ZFS_EV_HIST_TIME "history_time" +#define ZFS_EV_HIST_CMD "history_command" +#define ZFS_EV_HIST_WHO "history_who" +#define ZFS_EV_HIST_ZONE "history_zone" +#define ZFS_EV_HIST_HOST "history_hostname" +#define ZFS_EV_HIST_TXG "history_txg" +#define ZFS_EV_HIST_INT_EVENT "history_internal_event" +#define ZFS_EV_HIST_INT_STR "history_internal_str" +#define ZFS_EV_HIST_INT_NAME "history_internal_name" +#define ZFS_EV_HIST_IOCTL "history_ioctl" +#define ZFS_EV_HIST_DSNAME "history_dsname" +#define ZFS_EV_HIST_DSID "history_dsid" + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FS_ZFS_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zut.h b/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zut.h new file mode 100644 index 000000000000..36c9eaa7f18e --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zut.h @@ -0,0 +1,93 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _ZUT_H +#define _ZUT_H + +/* + * IOCTLs for the zfs unit test driver + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/stat.h> + +#define ZUT_DRIVER "zut" +#define ZUT_DEV "/dev/zut" + +#define ZUT_VERSION_STRING "1" + +/* + * /dev/zut ioctl numbers. + */ +#define ZUT_IOC ('U' << 8) + +/* Request flags */ +#define ZUT_IGNORECASE 0x01 +#define ZUT_ACCFILTER 0x02 +#define ZUT_XATTR 0x04 +#define ZUT_EXTRDDIR 0x08 +#define ZUT_GETSTAT 0x10 + +typedef struct zut_lookup { + int zl_reqflags; + int zl_deflags; /* output */ + int zl_retcode; /* output */ + char zl_dir[MAXPATHLEN]; + char zl_file[MAXNAMELEN]; + char zl_xfile[MAXNAMELEN]; + char zl_real[MAXPATHLEN]; /* output */ + uint64_t zl_xvattrs; /* output */ + struct stat64 zl_statbuf; /* output */ +} zut_lookup_t; + +typedef struct zut_readdir { + uint64_t zr_buf; /* pointer to output buffer */ + uint64_t zr_loffset; /* output */ + char zr_dir[MAXPATHLEN]; + char zr_file[MAXNAMELEN]; + int zr_reqflags; + int zr_retcode; /* output */ + int zr_eof; /* output */ + uint_t zr_bytes; /* output */ + uint_t zr_buflen; +} zut_readdir_t; + +typedef enum zut_ioc { + ZUT_IOC_MIN_CMD = ZUT_IOC - 1, + ZUT_IOC_LOOKUP = ZUT_IOC, + ZUT_IOC_READDIR, + ZUT_IOC_MAX_CMD +} zut_ioc_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _ZUT_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/idmap.h b/sys/cddl/contrib/opensolaris/uts/common/sys/idmap.h new file mode 100644 index 000000000000..39eeb905c72b --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/sys/idmap.h @@ -0,0 +1,97 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _SYS_IDMAP_H +#define _SYS_IDMAP_H + + +/* Idmap status codes */ +#define IDMAP_SUCCESS 0 +#define IDMAP_NEXT 1 +#define IDMAP_ERR_OTHER -10000 +#define IDMAP_ERR_INTERNAL -9999 +#define IDMAP_ERR_MEMORY -9998 +#define IDMAP_ERR_NORESULT -9997 +#define IDMAP_ERR_NOTUSER -9996 +#define IDMAP_ERR_NOTGROUP -9995 +#define IDMAP_ERR_NOTSUPPORTED -9994 +#define IDMAP_ERR_W2U_NAMERULE -9993 +#define IDMAP_ERR_U2W_NAMERULE -9992 +#define IDMAP_ERR_CACHE -9991 +#define IDMAP_ERR_DB -9990 +#define IDMAP_ERR_ARG -9989 +#define IDMAP_ERR_SID -9988 +#define IDMAP_ERR_IDTYPE -9987 +#define IDMAP_ERR_RPC_HANDLE -9986 +#define IDMAP_ERR_RPC -9985 +#define IDMAP_ERR_CLIENT_HANDLE -9984 +#define IDMAP_ERR_BUSY -9983 +#define IDMAP_ERR_PERMISSION_DENIED -9982 +#define IDMAP_ERR_NOMAPPING -9981 +#define IDMAP_ERR_NEW_ID_ALLOC_REQD -9980 +#define IDMAP_ERR_DOMAIN -9979 +#define IDMAP_ERR_SECURITY -9978 +#define IDMAP_ERR_NOTFOUND -9977 +#define IDMAP_ERR_DOMAIN_NOTFOUND -9976 +#define IDMAP_ERR_UPDATE_NOTALLOWED -9975 +#define IDMAP_ERR_CFG -9974 +#define IDMAP_ERR_CFG_CHANGE -9973 +#define IDMAP_ERR_NOTMAPPED_WELLKNOWN -9972 +#define IDMAP_ERR_RETRIABLE_NET_ERR -9971 +#define IDMAP_ERR_W2U_NAMERULE_CONFLICT -9970 +#define IDMAP_ERR_U2W_NAMERULE_CONFLICT -9969 +#define IDMAP_ERR_BAD_UTF8 -9968 +#define IDMAP_ERR_NONE_GENERATED -9967 +#define IDMAP_ERR_PROP_UNKNOWN -9966 +#define IDMAP_ERR_NS_LDAP_OP_FAILED -9965 +#define IDMAP_ERR_NS_LDAP_PARTIAL -9964 +#define IDMAP_ERR_NS_LDAP_CFG -9963 +#define IDMAP_ERR_NS_LDAP_BAD_WINNAME -9962 +#define IDMAP_ERR_NO_ACTIVEDIRECTORY -9961 + +/* Reserved GIDs for some well-known SIDs */ +#define IDMAP_WK_LOCAL_SYSTEM_GID 2147483648U /* 0x80000000 */ +#define IDMAP_WK_CREATOR_GROUP_GID 2147483649U +#define IDMAP_WK__MAX_GID 2147483649U + +/* Reserved UIDs for some well-known SIDs */ +#define IDMAP_WK_CREATOR_OWNER_UID 2147483648U +#define IDMAP_WK__MAX_UID 2147483648U + +/* Reserved SIDs */ +#define IDMAP_WK_CREATOR_SID_AUTHORITY "S-1-3" + +/* + * Max door RPC size for ID mapping (can't be too large relative to the + * default user-land thread stack size, since clnt_door_call() + * alloca()s). See libidmap:idmap_init(). + */ +#define IDMAP_MAX_DOOR_RPC (256 * 1024) + +#define IDMAP_SENTINEL_PID UINT32_MAX +#define IDMAP_ID_IS_EPHEMERAL(pid) \ + (((pid) > INT32_MAX) && ((pid) != IDMAP_SENTINEL_PID)) + +#endif /* _SYS_IDMAP_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/isa_defs.h b/sys/cddl/contrib/opensolaris/uts/common/sys/isa_defs.h new file mode 100644 index 000000000000..93f1855b3908 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/sys/isa_defs.h @@ -0,0 +1,697 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_ISA_DEFS_H +#define _SYS_ISA_DEFS_H + +/* + * This header file serves to group a set of well known defines and to + * set these for each instruction set architecture. These defines may + * be divided into two groups; characteristics of the processor and + * implementation choices for Solaris on a processor. + * + * Processor Characteristics: + * + * _LITTLE_ENDIAN / _BIG_ENDIAN: + * The natural byte order of the processor. A pointer to an int points + * to the least/most significant byte of that int. + * + * _STACK_GROWS_UPWARD / _STACK_GROWS_DOWNWARD: + * The processor specific direction of stack growth. A push onto the + * stack increases/decreases the stack pointer, so it stores data at + * successively higher/lower addresses. (Stackless machines ignored + * without regrets). + * + * _LONG_LONG_HTOL / _LONG_LONG_LTOH: + * A pointer to a long long points to the most/least significant long + * within that long long. + * + * _BIT_FIELDS_HTOL / _BIT_FIELDS_LTOH: + * The C compiler assigns bit fields from the high/low to the low/high end + * of an int (most to least significant vs. least to most significant). + * + * _IEEE_754: + * The processor (or supported implementations of the processor) + * supports the ieee-754 floating point standard. No other floating + * point standards are supported (or significant). Any other supported + * floating point formats are expected to be cased on the ISA processor + * symbol. + * + * _CHAR_IS_UNSIGNED / _CHAR_IS_SIGNED: + * The C Compiler implements objects of type `char' as `unsigned' or + * `signed' respectively. This is really an implementation choice of + * the compiler writer, but it is specified in the ABI and tends to + * be uniform across compilers for an instruction set architecture. + * Hence, it has the properties of a processor characteristic. + * + * _CHAR_ALIGNMENT / _SHORT_ALIGNMENT / _INT_ALIGNMENT / _LONG_ALIGNMENT / + * _LONG_LONG_ALIGNMENT / _DOUBLE_ALIGNMENT / _LONG_DOUBLE_ALIGNMENT / + * _POINTER_ALIGNMENT / _FLOAT_ALIGNMENT: + * The ABI defines alignment requirements of each of the primitive + * object types. Some, if not all, may be hardware requirements as + * well. The values are expressed in "byte-alignment" units. + * + * _MAX_ALIGNMENT: + * The most stringent alignment requirement as specified by the ABI. + * Equal to the maximum of all the above _XXX_ALIGNMENT values. + * + * _ALIGNMENT_REQUIRED: + * True or false (1 or 0) whether or not the hardware requires the ABI + * alignment. + * + * _LONG_LONG_ALIGNMENT_32 + * The 32-bit ABI supported by a 64-bit kernel may have different + * alignment requirements for primitive object types. The value of this + * identifier is expressed in "byte-alignment" units. + * + * _HAVE_CPUID_INSN + * This indicates that the architecture supports the 'cpuid' + * instruction as defined by Intel. (Intel allows other vendors + * to extend the instruction for their own purposes.) + * + * + * Implementation Choices: + * + * _ILP32 / _LP64: + * This specifies the compiler data type implementation as specified in + * the relevant ABI. 
The choice between these is strongly influenced + * by the underlying hardware, but is not absolutely tied to it. + * Currently only two data type models are supported: + * + * _ILP32: + * Int/Long/Pointer are 32 bits. This is the historical UNIX + * and Solaris implementation. Due to its historical standing, + * this is the default case. + * + * _LP64: + * Long/Pointer are 64 bits, Int is 32 bits. This is the chosen + * implementation for 64-bit ABIs such as SPARC V9. + * + * _I32LPx: + * A compilation environment where 'int' is 32-bit, and + * longs and pointers are simply the same size. + * + * In all cases, Char is 8 bits and Short is 16 bits. + * + * _SUNOS_VTOC_8 / _SUNOS_VTOC_16 / _SVR4_VTOC_16: + * This specifies the form of the disk VTOC (or label): + * + * _SUNOS_VTOC_8: + * This is a VTOC form which is upwardly compatible with the + * SunOS 4.x disk label and allows 8 partitions per disk. + * + * _SUNOS_VTOC_16: + * In this format the incore vtoc image matches the ondisk + * version. It allows 16 slices per disk, and is not + * compatible with the SunOS 4.x disk label. + * + * Note that these are not the only two VTOC forms possible and + * additional forms may be added. One possible form would be the + * SVr4 VTOC form. The symbol for that is reserved now, although + * it is not implemented. + * + * _SVR4_VTOC_16: + * This VTOC form is compatible with the System V Release 4 + * VTOC (as implemented on the SVr4 Intel and 3b ports) with + * 16 partitions per disk. + * + * + * _DMA_USES_PHYSADDR / _DMA_USES_VIRTADDR + * This describes the type of addresses used by system DMA: + * + * _DMA_USES_PHYSADDR: + * This type of DMA, used in the x86 implementation, + * requires physical addresses for DMA buffers. The 24-bit + * addresses used by some legacy boards is the source of the + * "low-memory" (<16MB) requirement for some devices using DMA. + * + * _DMA_USES_VIRTADDR: + * This method of DMA allows the use of virtual addresses for + * DMA transfers. + * + * _FIRMWARE_NEEDS_FDISK / _NO_FDISK_PRESENT + * This indicates the presence/absence of an fdisk table. + * + * _FIRMWARE_NEEDS_FDISK + * The fdisk table is required by system firmware. If present, + * it allows a disk to be subdivided into multiple fdisk + * partitions, each of which is equivalent to a separate, + * virtual disk. This enables the co-existence of multiple + * operating systems on a shared hard disk. + * + * _NO_FDISK_PRESENT + * If the fdisk table is absent, it is assumed that the entire + * media is allocated for a single operating system. + * + * _HAVE_TEM_FIRMWARE + * Defined if this architecture has the (fallback) option of + * using prom_* calls for doing I/O if a suitable kernel driver + * is not available to do it. + * + * _DONT_USE_1275_GENERIC_NAMES + * Controls whether or not device tree node names should + * comply with the IEEE 1275 "Generic Names" Recommended + * Practice. With _DONT_USE_GENERIC_NAMES, device-specific + * names identifying the particular device will be used. + * + * __i386_COMPAT + * This indicates whether the i386 ABI is supported as a *non-native* + * mode for the platform. When this symbol is defined: + * - 32-bit xstat-style system calls are enabled + * - 32-bit xmknod-style system calls are enabled + * - 32-bit system calls use i386 sizes -and- alignments + * + * Note that this is NOT defined for the i386 native environment! 
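+ *
+ *	As a hedged illustration (the typedef name below is hypothetical,
+ *	not defined by this header), a source file that must vary with the
+ *	data model would typically test these macros directly:
+ *
+ *		#if defined(_LP64)
+ *		typedef uint64_t machreg_t;
+ *		#else
+ *		typedef uint32_t machreg_t;
+ *		#endif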
+ * + * __x86 + * This is ONLY a synonym for defined(__i386) || defined(__amd64) + * which is useful only insofar as these two architectures share + * common attributes. Analogous to __sparc. + * + * _PSM_MODULES + * This indicates whether or not the implementation uses PSM + * modules for processor support, reading /etc/mach from inside + * the kernel to extract a list. + * + * _RTC_CONFIG + * This indicates whether or not the implementation uses /etc/rtc_config + * to configure the real-time clock in the kernel. + * + * _UNIX_KRTLD + * This indicates that the implementation uses a dynamically + * linked unix + krtld to form the core kernel image at boot + * time, or (in the absence of this symbol) a prelinked kernel image. + * + * _OBP + * This indicates the firmware interface is OBP. + * + * _SOFT_HOSTID + * This indicates that the implementation obtains the hostid + * from the file /etc/hostid, rather than from hardware. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * The following set of definitions characterize Solaris on AMD's + * 64-bit systems. + */ +#if defined(__x86_64) || defined(__amd64) + +#if !defined(__amd64) +#define __amd64 /* preferred guard */ +#endif + +#if !defined(__x86) +#define __x86 +#endif + +/* + * Define the appropriate "processor characteristics" + */ +#ifdef illumos +#define _LITTLE_ENDIAN +#endif +#define _STACK_GROWS_DOWNWARD +#define _LONG_LONG_LTOH +#define _BIT_FIELDS_LTOH +#define _IEEE_754 +#define _CHAR_IS_SIGNED +#define _BOOL_ALIGNMENT 1 +#define _CHAR_ALIGNMENT 1 +#define _SHORT_ALIGNMENT 2 +#define _INT_ALIGNMENT 4 +#define _FLOAT_ALIGNMENT 4 +#define _FLOAT_COMPLEX_ALIGNMENT 4 +#define _LONG_ALIGNMENT 8 +#define _LONG_LONG_ALIGNMENT 8 +#define _DOUBLE_ALIGNMENT 8 +#define _DOUBLE_COMPLEX_ALIGNMENT 8 +#define _LONG_DOUBLE_ALIGNMENT 16 +#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 16 +#define _POINTER_ALIGNMENT 8 +#define _MAX_ALIGNMENT 16 +#define _ALIGNMENT_REQUIRED 1 + +/* + * Different alignment constraints for the i386 ABI in compatibility mode + */ +#define _LONG_LONG_ALIGNMENT_32 4 + +/* + * Define the appropriate "implementation choices". + */ +#if !defined(_LP64) +#define _LP64 +#endif +#if !defined(_I32LPx) && defined(_KERNEL) +#define _I32LPx +#endif +#define _MULTI_DATAMODEL +#define _SUNOS_VTOC_16 +#define _DMA_USES_PHYSADDR +#define _FIRMWARE_NEEDS_FDISK +#define __i386_COMPAT +#define _PSM_MODULES +#define _RTC_CONFIG +#define _SOFT_HOSTID +#define _DONT_USE_1275_GENERIC_NAMES +#define _HAVE_CPUID_INSN + +/* + * The feature test macro __i386 is generic for all processors implementing + * the Intel 386 instruction set or a superset of it. Specifically, this + * includes all members of the 386, 486, and Pentium family of processors. 
+ */ +#elif defined(__i386) || defined(__i386__) + +#if !defined(__i386) +#define __i386 +#endif + +#if !defined(__x86) +#define __x86 +#endif + +/* + * Define the appropriate "processor characteristics" + */ +#ifdef illumos +#define _LITTLE_ENDIAN +#endif +#define _STACK_GROWS_DOWNWARD +#define _LONG_LONG_LTOH +#define _BIT_FIELDS_LTOH +#define _IEEE_754 +#define _CHAR_IS_SIGNED +#define _BOOL_ALIGNMENT 1 +#define _CHAR_ALIGNMENT 1 +#define _SHORT_ALIGNMENT 2 +#define _INT_ALIGNMENT 4 +#define _FLOAT_ALIGNMENT 4 +#define _FLOAT_COMPLEX_ALIGNMENT 4 +#define _LONG_ALIGNMENT 4 +#define _LONG_LONG_ALIGNMENT 4 +#define _DOUBLE_ALIGNMENT 4 +#define _DOUBLE_COMPLEX_ALIGNMENT 4 +#define _LONG_DOUBLE_ALIGNMENT 4 +#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 4 +#define _POINTER_ALIGNMENT 4 +#define _MAX_ALIGNMENT 4 +#define _ALIGNMENT_REQUIRED 0 + +#define _LONG_LONG_ALIGNMENT_32 _LONG_LONG_ALIGNMENT + +/* + * Define the appropriate "implementation choices". + */ +#if !defined(_ILP32) +#define _ILP32 +#endif +#if !defined(_I32LPx) && defined(_KERNEL) +#define _I32LPx +#endif +#define _SUNOS_VTOC_16 +#define _DMA_USES_PHYSADDR +#define _FIRMWARE_NEEDS_FDISK +#define _PSM_MODULES +#define _RTC_CONFIG +#define _SOFT_HOSTID +#define _DONT_USE_1275_GENERIC_NAMES +#define _HAVE_CPUID_INSN + +#elif defined(__aarch64__) + +/* + * Define the appropriate "processor characteristics" + */ +#define _STACK_GROWS_DOWNWARD +#define _LONG_LONG_LTOH +#define _BIT_FIELDS_LTOH +#define _IEEE_754 +#define _CHAR_IS_UNSIGNED +#define _BOOL_ALIGNMENT 1 +#define _CHAR_ALIGNMENT 1 +#define _SHORT_ALIGNMENT 2 +#define _INT_ALIGNMENT 4 +#define _FLOAT_ALIGNMENT 4 +#define _FLOAT_COMPLEX_ALIGNMENT 4 +#define _LONG_ALIGNMENT 8 +#define _LONG_LONG_ALIGNMENT 8 +#define _DOUBLE_ALIGNMENT 8 +#define _DOUBLE_COMPLEX_ALIGNMENT 8 +#define _LONG_DOUBLE_ALIGNMENT 16 +#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 16 +#define _POINTER_ALIGNMENT 8 +#define _MAX_ALIGNMENT 16 +#define _ALIGNMENT_REQUIRED 1 + +#define _LONG_LONG_ALIGNMENT_32 _LONG_LONG_ALIGNMENT + +/* + * Define the appropriate "implementation choices" + */ +#if !defined(_LP64) +#define _LP64 +#endif +#define _SUNOS_VTOC_16 +#define _DMA_USES_PHYSADDR +#define _FIRMWARE_NEEDS_FDISK +#define _PSM_MODULES +#define _RTC_CONFIG +#define _DONT_USE_1275_GENERIC_NAMES +#define _HAVE_CPUID_INSN + +#elif defined(__riscv) + +/* + * Define the appropriate "processor characteristics" + */ +#define _STACK_GROWS_DOWNWARD +#define _LONG_LONG_LTOH +#define _BIT_FIELDS_LTOH +#define _IEEE_754 +#define _CHAR_IS_UNSIGNED +#define _BOOL_ALIGNMENT 1 +#define _CHAR_ALIGNMENT 1 +#define _SHORT_ALIGNMENT 2 +#define _INT_ALIGNMENT 4 +#define _FLOAT_ALIGNMENT 4 +#define _FLOAT_COMPLEX_ALIGNMENT 4 +#define _LONG_ALIGNMENT 8 +#define _LONG_LONG_ALIGNMENT 8 +#define _DOUBLE_ALIGNMENT 8 +#define _DOUBLE_COMPLEX_ALIGNMENT 8 +#define _LONG_DOUBLE_ALIGNMENT 16 +#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 16 +#define _POINTER_ALIGNMENT 8 +#define _MAX_ALIGNMENT 16 +#define _ALIGNMENT_REQUIRED 1 + +#define _LONG_LONG_ALIGNMENT_32 _LONG_LONG_ALIGNMENT + +/* + * Define the appropriate "implementation choices" + */ +#if !defined(_LP64) +#define _LP64 +#endif +#define _SUNOS_VTOC_16 +#define _DMA_USES_PHYSADDR +#define _FIRMWARE_NEEDS_FDISK +#define _PSM_MODULES +#define _RTC_CONFIG +#define _DONT_USE_1275_GENERIC_NAMES +#define _HAVE_CPUID_INSN + +#elif defined(__arm__) + +/* + * Define the appropriate "processor characteristics" + */ +#define _STACK_GROWS_DOWNWARD +#define _LONG_LONG_LTOH +#define _BIT_FIELDS_LTOH +#define 
_IEEE_754 +#define _CHAR_IS_SIGNED +#define _BOOL_ALIGNMENT 1 +#define _CHAR_ALIGNMENT 1 +#define _SHORT_ALIGNMENT 2 +#define _INT_ALIGNMENT 4 +#define _FLOAT_ALIGNMENT 4 +#define _FLOAT_COMPLEX_ALIGNMENT 4 +#define _LONG_ALIGNMENT 4 +#define _LONG_LONG_ALIGNMENT 4 +#define _DOUBLE_ALIGNMENT 4 +#define _DOUBLE_COMPLEX_ALIGNMENT 4 +#define _LONG_DOUBLE_ALIGNMENT 4 +#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 4 +#define _POINTER_ALIGNMENT 4 +#define _MAX_ALIGNMENT 4 +#define _ALIGNMENT_REQUIRED 0 + +#define _LONG_LONG_ALIGNMENT_32 _LONG_LONG_ALIGNMENT + +/* + * Define the appropriate "implementation choices". + */ +#if !defined(_ILP32) +#define _ILP32 +#endif +#if !defined(_I32LPx) && defined(_KERNEL) +#define _I32LPx +#endif +#define _SUNOS_VTOC_16 +#define _DMA_USES_PHYSADDR +#define _FIRMWARE_NEEDS_FDISK +#define _PSM_MODULES +#define _RTC_CONFIG +#define _DONT_USE_1275_GENERIC_NAMES +#define _HAVE_CPUID_INSN + +#elif defined(__mips__) + +/* + * Define the appropriate "processor characteristics" + */ +#define _STACK_GROWS_DOWNWARD +#define _LONG_LONG_LTOH +#define _BIT_FIELDS_LTOH +#define _IEEE_754 +#define _CHAR_IS_SIGNED +#define _BOOL_ALIGNMENT 1 +#define _CHAR_ALIGNMENT 1 +#define _SHORT_ALIGNMENT 2 +#define _INT_ALIGNMENT 4 +#define _FLOAT_ALIGNMENT 4 +#define _FLOAT_COMPLEX_ALIGNMENT 4 +#if defined(__mips_n64) +#define _LONG_ALIGNMENT 8 +#define _LONG_LONG_ALIGNMENT 8 +#define _DOUBLE_ALIGNMENT 8 +#define _DOUBLE_COMPLEX_ALIGNMENT 8 +#define _LONG_DOUBLE_ALIGNMENT 8 +#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 8 +#define _POINTER_ALIGNMENT 8 +#define _MAX_ALIGNMENT 8 +#define _ALIGNMENT_REQUIRED 0 + +#define _LONG_LONG_ALIGNMENT_32 _INT_ALIGNMENT +/* + * Define the appropriate "implementation choices". + */ +#if !defined(_LP64) +#define _LP64 +#endif +#else +#define _LONG_ALIGNMENT 4 +#define _LONG_LONG_ALIGNMENT 4 +#define _DOUBLE_ALIGNMENT 4 +#define _DOUBLE_COMPLEX_ALIGNMENT 4 +#define _LONG_DOUBLE_ALIGNMENT 4 +#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 4 +#define _POINTER_ALIGNMENT 4 +#define _MAX_ALIGNMENT 4 +#define _ALIGNMENT_REQUIRED 0 + +#define _LONG_LONG_ALIGNMENT_32 _LONG_LONG_ALIGNMENT + +/* + * Define the appropriate "implementation choices". + */ +#if !defined(_ILP32) +#define _ILP32 +#endif +#if !defined(_I32LPx) && defined(_KERNEL) +#define _I32LPx +#endif +#endif +#define _SUNOS_VTOC_16 +#define _DMA_USES_PHYSADDR +#define _FIRMWARE_NEEDS_FDISK +#define _PSM_MODULES +#define _RTC_CONFIG +#define _DONT_USE_1275_GENERIC_NAMES +#define _HAVE_CPUID_INSN + +#elif defined(__powerpc__) + +#if defined(__BIG_ENDIAN__) +#define _BIT_FIELDS_HTOL +#else +#define _BIT_FIELDS_LTOH +#endif + +/* + * The following set of definitions characterize the Solaris on SPARC systems. + * + * The symbol __sparc indicates any of the SPARC family of processor + * architectures. This includes SPARC V7, SPARC V8 and SPARC V9. + * + * The symbol __sparcv8 indicates the 32-bit SPARC V8 architecture as defined + * by Version 8 of the SPARC Architecture Manual. (SPARC V7 is close enough + * to SPARC V8 for the former to be subsumed into the latter definition.) + * + * The symbol __sparcv9 indicates the 64-bit SPARC V9 architecture as defined + * by Version 9 of the SPARC Architecture Manual. + * + * The symbols __sparcv8 and __sparcv9 are mutually exclusive, and are only + * relevant when the symbol __sparc is defined. + */ +/* + * XXX Due to the existence of 5110166, "defined(__sparcv9)" needs to be added + * to support backwards builds. This workaround should be removed in s10_71. 
+ */ +#elif defined(__sparc) || defined(__sparcv9) || defined(__sparc__) +#if !defined(__sparc) +#define __sparc +#endif + +/* + * You can be 32-bit or 64-bit, but not both at the same time. + */ +#if defined(__sparcv8) && defined(__sparcv9) +#error "SPARC Versions 8 and 9 are mutually exclusive choices" +#endif + +/* + * Existing compilers do not set __sparcv8. Years will transpire before + * the compilers can be depended on to set the feature test macro. In + * the interim, we'll set it here on the basis of historical behaviour; + * if you haven't asked for SPARC V9, then you must've meant SPARC V8. + */ +#if !defined(__sparcv9) && !defined(__sparcv8) +#define __sparcv8 +#endif + +/* + * Define the appropriate "processor characteristics" shared between + * all Solaris on SPARC systems. + */ +#ifdef illumos +#define _BIG_ENDIAN +#endif +#define _STACK_GROWS_DOWNWARD +#define _LONG_LONG_HTOL +#define _BIT_FIELDS_HTOL +#define _IEEE_754 +#define _CHAR_IS_SIGNED +#define _BOOL_ALIGNMENT 1 +#define _CHAR_ALIGNMENT 1 +#define _SHORT_ALIGNMENT 2 +#define _INT_ALIGNMENT 4 +#define _FLOAT_ALIGNMENT 4 +#define _FLOAT_COMPLEX_ALIGNMENT 4 +#define _LONG_LONG_ALIGNMENT 8 +#define _DOUBLE_ALIGNMENT 8 +#define _DOUBLE_COMPLEX_ALIGNMENT 8 +#define _ALIGNMENT_REQUIRED 1 + +/* + * Define the appropriate "implementation choices" shared between versions. + */ +#define _SUNOS_VTOC_8 +#define _DMA_USES_VIRTADDR +#define _NO_FDISK_PRESENT +#define _HAVE_TEM_FIRMWARE +#define _OBP + +/* + * The following set of definitions characterize the implementation of + * 32-bit Solaris on SPARC V8 systems. + */ +#if defined(__sparcv8) + +/* + * Define the appropriate "processor characteristics" + */ +#define _LONG_ALIGNMENT 4 +#define _LONG_DOUBLE_ALIGNMENT 8 +#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 8 +#define _POINTER_ALIGNMENT 4 +#define _MAX_ALIGNMENT 8 + +#define _LONG_LONG_ALIGNMENT_32 _LONG_LONG_ALIGNMENT + +/* + * Define the appropriate "implementation choices" + */ +#define _ILP32 +#if !defined(_I32LPx) && defined(_KERNEL) +#define _I32LPx +#endif + +/* + * The following set of definitions characterize the implementation of + * 64-bit Solaris on SPARC V9 systems. + */ +#elif defined(__sparcv9) + +/* + * Define the appropriate "processor characteristics" + */ +#define _LONG_ALIGNMENT 8 +#define _LONG_DOUBLE_ALIGNMENT 16 +#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 16 +#define _POINTER_ALIGNMENT 8 +#define _MAX_ALIGNMENT 16 + +#define _LONG_LONG_ALIGNMENT_32 _LONG_LONG_ALIGNMENT + +/* + * Define the appropriate "implementation choices" + */ +#if !defined(_LP64) +#define _LP64 +#endif +#if !defined(_I32LPx) +#define _I32LPx +#endif +#define _MULTI_DATAMODEL + +#else +#error "unknown SPARC version" +#endif + +/* + * #error is strictly ansi-C, but works as well as anything for K&R systems. + */ +#else +#error "ISA not supported" +#endif + +#if defined(_ILP32) && defined(_LP64) +#error "Both _ILP32 and _LP64 are defined" +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ISA_DEFS_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/list.h b/sys/cddl/contrib/opensolaris/uts/common/sys/list.h new file mode 100644 index 000000000000..8339b6226d11 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/sys/list.h @@ -0,0 +1,67 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_LIST_H +#define _SYS_LIST_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/list_impl.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct list_node list_node_t; +typedef struct list list_t; + +void list_create(list_t *, size_t, size_t); +void list_destroy(list_t *); + +void list_insert_after(list_t *, void *, void *); +void list_insert_before(list_t *, void *, void *); +void list_insert_head(list_t *, void *); +void list_insert_tail(list_t *, void *); +void list_remove(list_t *, void *); +void *list_remove_head(list_t *); +void *list_remove_tail(list_t *); +void list_move_tail(list_t *, list_t *); + +void *list_head(list_t *); +void *list_tail(list_t *); +void *list_next(list_t *, void *); +void *list_prev(list_t *, void *); +int list_is_empty(list_t *); + +void list_link_init(list_node_t *); +void list_link_replace(list_node_t *, list_node_t *); + +int list_link_active(list_node_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_LIST_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/list_impl.h b/sys/cddl/contrib/opensolaris/uts/common/sys/list_impl.h new file mode 100644 index 000000000000..9c42f8832023 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/sys/list_impl.h @@ -0,0 +1,53 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2003 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _SYS_LIST_IMPL_H +#define _SYS_LIST_IMPL_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct list_node { + struct list_node *list_next; + struct list_node *list_prev; +}; + +struct list { + size_t list_size; + size_t list_offset; + struct list_node list_head; +}; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_LIST_IMPL_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/note.h b/sys/cddl/contrib/opensolaris/uts/common/sys/note.h new file mode 100644 index 000000000000..2cb7fd89b7dd --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/sys/note.h @@ -0,0 +1,56 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 1994 by Sun Microsystems, Inc. + */ + +/* + * sys/note.h: interface for annotating source with info for tools + * + * This is the underlying interface; NOTE (/usr/include/note.h) is the + * preferred interface, but all exported header files should include this + * file directly and use _NOTE so as not to take "NOTE" from the user's + * namespace. For consistency, *all* kernel source should use _NOTE. + * + * By default, annotations expand to nothing. This file implements + * that. Tools using annotations will interpose a different version + * of this file that will expand annotations as needed. + */ + +#ifndef _SYS_NOTE_H +#define _SYS_NOTE_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef _NOTE +#define _NOTE(s) +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_NOTE_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair.h b/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair.h new file mode 100644 index 000000000000..52d6aea0a364 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair.h @@ -0,0 +1,351 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + */ + +#ifndef _SYS_NVPAIR_H +#define _SYS_NVPAIR_H + +#include <sys/types.h> +#include <sys/time.h> +#include <sys/errno.h> + +#if defined(_KERNEL) && !defined(_BOOT) +#include <sys/kmem.h> +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum { + DATA_TYPE_DONTCARE = -1, + DATA_TYPE_UNKNOWN = 0, + DATA_TYPE_BOOLEAN, + DATA_TYPE_BYTE, + DATA_TYPE_INT16, + DATA_TYPE_UINT16, + DATA_TYPE_INT32, + DATA_TYPE_UINT32, + DATA_TYPE_INT64, + DATA_TYPE_UINT64, + DATA_TYPE_STRING, + DATA_TYPE_BYTE_ARRAY, + DATA_TYPE_INT16_ARRAY, + DATA_TYPE_UINT16_ARRAY, + DATA_TYPE_INT32_ARRAY, + DATA_TYPE_UINT32_ARRAY, + DATA_TYPE_INT64_ARRAY, + DATA_TYPE_UINT64_ARRAY, + DATA_TYPE_STRING_ARRAY, + DATA_TYPE_HRTIME, + DATA_TYPE_NVLIST, + DATA_TYPE_NVLIST_ARRAY, + DATA_TYPE_BOOLEAN_VALUE, + DATA_TYPE_INT8, + DATA_TYPE_UINT8, + DATA_TYPE_BOOLEAN_ARRAY, + DATA_TYPE_INT8_ARRAY, +#if !defined(_KERNEL) + DATA_TYPE_UINT8_ARRAY, + DATA_TYPE_DOUBLE +#else + DATA_TYPE_UINT8_ARRAY +#endif +} data_type_t; + +typedef struct nvpair { + int32_t nvp_size; /* size of this nvpair */ + int16_t nvp_name_sz; /* length of name string */ + int16_t nvp_reserve; /* not used */ + int32_t nvp_value_elem; /* number of elements for array types */ + data_type_t nvp_type; /* type of value */ + /* name string */ + /* aligned ptr array for string arrays */ + /* aligned array of data for value */ +} nvpair_t; + +/* nvlist header */ +typedef struct nvlist { + int32_t nvl_version; + uint32_t nvl_nvflag; /* persistent flags */ + uint64_t nvl_priv; /* ptr to private data if not packed */ + uint32_t nvl_flag; + int32_t nvl_pad; /* currently not used, for alignment */ +} nvlist_t; + +/* nvp implementation version */ +#define NV_VERSION 0 + +/* nvlist pack encoding */ +#define NV_ENCODE_NATIVE 0 +#define NV_ENCODE_XDR 1 + +/* nvlist persistent unique name flags, stored in nvl_nvflags */ +#define NV_UNIQUE_NAME 0x1 +#define NV_UNIQUE_NAME_TYPE 0x2 + +/* nvlist lookup pairs related flags */ +#define NV_FLAG_NOENTOK 0x1 + +/* convenience macros */ +#define NV_ALIGN(x) (((ulong_t)(x) + 7ul) & ~7ul) +#define NV_ALIGN4(x) (((x) + 3) & ~3) + +#define NVP_SIZE(nvp) ((nvp)->nvp_size) +#define NVP_NAME(nvp) ((char *)(nvp) + sizeof (nvpair_t)) +#define NVP_TYPE(nvp) ((nvp)->nvp_type) +#define NVP_NELEM(nvp) ((nvp)->nvp_value_elem) +#define NVP_VALUE(nvp) ((char *)(nvp) + NV_ALIGN(sizeof (nvpair_t) \ + + (nvp)->nvp_name_sz)) + +#define NVL_VERSION(nvl) ((nvl)->nvl_version) +#define NVL_SIZE(nvl) ((nvl)->nvl_size) +#define NVL_FLAG(nvl) ((nvl)->nvl_flag) + +/* NV allocator framework */ +typedef struct nv_alloc_ops nv_alloc_ops_t; + +typedef struct nv_alloc { + const nv_alloc_ops_t *nva_ops; + void *nva_arg; +} nv_alloc_t; + +struct nv_alloc_ops { + int (*nv_ao_init)(nv_alloc_t *, __va_list); + void (*nv_ao_fini)(nv_alloc_t *); + void *(*nv_ao_alloc)(nv_alloc_t *, size_t); + void (*nv_ao_free)(nv_alloc_t *, void *, size_t); + void (*nv_ao_reset)(nv_alloc_t *); +}; + +extern const nv_alloc_ops_t *nv_fixed_ops; +extern nv_alloc_t *nv_alloc_nosleep; + +#if defined(_KERNEL) && !defined(_BOOT) +extern nv_alloc_t *nv_alloc_sleep; +#endif + +int nv_alloc_init(nv_alloc_t *, const 
+    nv_alloc_ops_t *, /* args */ ...);
+void nv_alloc_reset(nv_alloc_t *);
+void nv_alloc_fini(nv_alloc_t *);
+
+/* list management */
+int nvlist_alloc(nvlist_t **, uint_t, int);
+void nvlist_free(nvlist_t *);
+int nvlist_size(nvlist_t *, size_t *, int);
+int nvlist_pack(nvlist_t *, char **, size_t *, int, int);
+int nvlist_unpack(char *, size_t, nvlist_t **, int);
+int nvlist_dup(nvlist_t *, nvlist_t **, int);
+int nvlist_merge(nvlist_t *, nvlist_t *, int);
+
+uint_t nvlist_nvflag(nvlist_t *);
+
+int nvlist_xalloc(nvlist_t **, uint_t, nv_alloc_t *);
+int nvlist_xpack(nvlist_t *, char **, size_t *, int, nv_alloc_t *);
+int nvlist_xunpack(char *, size_t, nvlist_t **, nv_alloc_t *);
+int nvlist_xdup(nvlist_t *, nvlist_t **, nv_alloc_t *);
+nv_alloc_t *nvlist_lookup_nv_alloc(nvlist_t *);
+
+int nvlist_add_nvpair(nvlist_t *, nvpair_t *);
+int nvlist_add_boolean(nvlist_t *, const char *);
+int nvlist_add_boolean_value(nvlist_t *, const char *, boolean_t);
+int nvlist_add_byte(nvlist_t *, const char *, uchar_t);
+int nvlist_add_int8(nvlist_t *, const char *, int8_t);
+int nvlist_add_uint8(nvlist_t *, const char *, uint8_t);
+int nvlist_add_int16(nvlist_t *, const char *, int16_t);
+int nvlist_add_uint16(nvlist_t *, const char *, uint16_t);
+int nvlist_add_int32(nvlist_t *, const char *, int32_t);
+int nvlist_add_uint32(nvlist_t *, const char *, uint32_t);
+int nvlist_add_int64(nvlist_t *, const char *, int64_t);
+int nvlist_add_uint64(nvlist_t *, const char *, uint64_t);
+int nvlist_add_string(nvlist_t *, const char *, const char *);
+int nvlist_add_nvlist(nvlist_t *, const char *, nvlist_t *);
+int nvlist_add_boolean_array(nvlist_t *, const char *, boolean_t *, uint_t);
+int nvlist_add_byte_array(nvlist_t *, const char *, uchar_t *, uint_t);
+int nvlist_add_int8_array(nvlist_t *, const char *, int8_t *, uint_t);
+int nvlist_add_uint8_array(nvlist_t *, const char *, uint8_t *, uint_t);
+int nvlist_add_int16_array(nvlist_t *, const char *, int16_t *, uint_t);
+int nvlist_add_uint16_array(nvlist_t *, const char *, uint16_t *, uint_t);
+int nvlist_add_int32_array(nvlist_t *, const char *, int32_t *, uint_t);
+int nvlist_add_uint32_array(nvlist_t *, const char *, uint32_t *, uint_t);
+int nvlist_add_int64_array(nvlist_t *, const char *, int64_t *, uint_t);
+int nvlist_add_uint64_array(nvlist_t *, const char *, uint64_t *, uint_t);
+int nvlist_add_string_array(nvlist_t *, const char *, char *const *, uint_t);
+int nvlist_add_nvlist_array(nvlist_t *, const char *, nvlist_t **, uint_t);
+int nvlist_add_hrtime(nvlist_t *, const char *, hrtime_t);
+#if !defined(_KERNEL)
+int nvlist_add_double(nvlist_t *, const char *, double);
+#endif
+
+int nvlist_remove(nvlist_t *, const char *, data_type_t);
+int nvlist_remove_all(nvlist_t *, const char *);
+int nvlist_remove_nvpair(nvlist_t *, nvpair_t *);
+
+int nvlist_lookup_boolean(nvlist_t *, const char *);
+int nvlist_lookup_boolean_value(nvlist_t *, const char *, boolean_t *);
+int nvlist_lookup_byte(nvlist_t *, const char *, uchar_t *);
+int nvlist_lookup_int8(nvlist_t *, const char *, int8_t *);
+int nvlist_lookup_uint8(nvlist_t *, const char *, uint8_t *);
+int nvlist_lookup_int16(nvlist_t *, const char *, int16_t *);
+int nvlist_lookup_uint16(nvlist_t *, const char *, uint16_t *);
+int nvlist_lookup_int32(nvlist_t *, const char *, int32_t *);
+int nvlist_lookup_uint32(nvlist_t *, const char *, uint32_t *);
+int nvlist_lookup_int64(nvlist_t *, const char *, int64_t *);
+int nvlist_lookup_uint64(nvlist_t *, const char *, uint64_t *);
+int nvlist_lookup_string(nvlist_t *, const char *, char **);
+int nvlist_lookup_nvlist(nvlist_t *, const char *, nvlist_t **);
+int nvlist_lookup_boolean_array(nvlist_t *, const char *,
+    boolean_t **, uint_t *);
+int nvlist_lookup_byte_array(nvlist_t *, const char *, uchar_t **, uint_t *);
+int nvlist_lookup_int8_array(nvlist_t *, const char *, int8_t **, uint_t *);
+int nvlist_lookup_uint8_array(nvlist_t *, const char *, uint8_t **, uint_t *);
+int nvlist_lookup_int16_array(nvlist_t *, const char *, int16_t **, uint_t *);
+int nvlist_lookup_uint16_array(nvlist_t *, const char *, uint16_t **, uint_t *);
+int nvlist_lookup_int32_array(nvlist_t *, const char *, int32_t **, uint_t *);
+int nvlist_lookup_uint32_array(nvlist_t *, const char *, uint32_t **, uint_t *);
+int nvlist_lookup_int64_array(nvlist_t *, const char *, int64_t **, uint_t *);
+int nvlist_lookup_uint64_array(nvlist_t *, const char *, uint64_t **, uint_t *);
+int nvlist_lookup_string_array(nvlist_t *, const char *, char ***, uint_t *);
+int nvlist_lookup_nvlist_array(nvlist_t *, const char *,
+    nvlist_t ***, uint_t *);
+int nvlist_lookup_hrtime(nvlist_t *, const char *, hrtime_t *);
+int nvlist_lookup_pairs(nvlist_t *, int, ...);
+#if !defined(_KERNEL)
+int nvlist_lookup_double(nvlist_t *, const char *, double *);
+#endif
+
+int nvlist_lookup_nvpair(nvlist_t *, const char *, nvpair_t **);
+int nvlist_lookup_nvpair_embedded_index(nvlist_t *, const char *, nvpair_t **,
+    int *, char **);
+boolean_t nvlist_exists(nvlist_t *, const char *);
+boolean_t nvlist_empty(nvlist_t *);
+
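+/*
+ * A minimal usage sketch for the add/lookup interfaces above (illustrative
+ * only; the pair name "answer" is arbitrary and error handling is elided):
+ *
+ *	nvlist_t *nvl;
+ *	uint64_t val;
+ *
+ *	(void) nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
+ *	(void) nvlist_add_uint64(nvl, "answer", 42);
+ *	if (nvlist_lookup_uint64(nvl, "answer", &val) == 0)
+ *		(val is now 42)
+ *	nvlist_free(nvl);
+ */
+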
nvlist_lookup_string(nvlist_t *, const char *, char **); +int nvlist_lookup_nvlist(nvlist_t *, const char *, nvlist_t **); +int nvlist_lookup_boolean_array(nvlist_t *, const char *, + boolean_t **, uint_t *); +int nvlist_lookup_byte_array(nvlist_t *, const char *, uchar_t **, uint_t *); +int nvlist_lookup_int8_array(nvlist_t *, const char *, int8_t **, uint_t *); +int nvlist_lookup_uint8_array(nvlist_t *, const char *, uint8_t **, uint_t *); +int nvlist_lookup_int16_array(nvlist_t *, const char *, int16_t **, uint_t *); +int nvlist_lookup_uint16_array(nvlist_t *, const char *, uint16_t **, uint_t *); +int nvlist_lookup_int32_array(nvlist_t *, const char *, int32_t **, uint_t *); +int nvlist_lookup_uint32_array(nvlist_t *, const char *, uint32_t **, uint_t *); +int nvlist_lookup_int64_array(nvlist_t *, const char *, int64_t **, uint_t *); +int nvlist_lookup_uint64_array(nvlist_t *, const char *, uint64_t **, uint_t *); +int nvlist_lookup_string_array(nvlist_t *, const char *, char ***, uint_t *); +int nvlist_lookup_nvlist_array(nvlist_t *, const char *, + nvlist_t ***, uint_t *); +int nvlist_lookup_hrtime(nvlist_t *, const char *, hrtime_t *); +int nvlist_lookup_pairs(nvlist_t *, int, ...); +#if !defined(_KERNEL) +int nvlist_lookup_double(nvlist_t *, const char *, double *); +#endif + +int nvlist_lookup_nvpair(nvlist_t *, const char *, nvpair_t **); +int nvlist_lookup_nvpair_embedded_index(nvlist_t *, const char *, nvpair_t **, + int *, char **); +boolean_t nvlist_exists(nvlist_t *, const char *); +boolean_t nvlist_empty(nvlist_t *); + +/* processing nvpair */ +nvpair_t *nvlist_next_nvpair(nvlist_t *, nvpair_t *); +nvpair_t *nvlist_prev_nvpair(nvlist_t *, nvpair_t *); +char *nvpair_name(nvpair_t *); +data_type_t nvpair_type(nvpair_t *); +int nvpair_type_is_array(nvpair_t *); +int nvpair_value_boolean_value(nvpair_t *, boolean_t *); +int nvpair_value_byte(nvpair_t *, uchar_t *); +int nvpair_value_int8(nvpair_t *, int8_t *); +int nvpair_value_uint8(nvpair_t *, uint8_t *); +int nvpair_value_int16(nvpair_t *, int16_t *); +int nvpair_value_uint16(nvpair_t *, uint16_t *); +int nvpair_value_int32(nvpair_t *, int32_t *); +int nvpair_value_uint32(nvpair_t *, uint32_t *); +int nvpair_value_int64(nvpair_t *, int64_t *); +int nvpair_value_uint64(nvpair_t *, uint64_t *); +int nvpair_value_string(nvpair_t *, char **); +int nvpair_value_nvlist(nvpair_t *, nvlist_t **); +int nvpair_value_boolean_array(nvpair_t *, boolean_t **, uint_t *); +int nvpair_value_byte_array(nvpair_t *, uchar_t **, uint_t *); +int nvpair_value_int8_array(nvpair_t *, int8_t **, uint_t *); +int nvpair_value_uint8_array(nvpair_t *, uint8_t **, uint_t *); +int nvpair_value_int16_array(nvpair_t *, int16_t **, uint_t *); +int nvpair_value_uint16_array(nvpair_t *, uint16_t **, uint_t *); +int nvpair_value_int32_array(nvpair_t *, int32_t **, uint_t *); +int nvpair_value_uint32_array(nvpair_t *, uint32_t **, uint_t *); +int nvpair_value_int64_array(nvpair_t *, int64_t **, uint_t *); +int nvpair_value_uint64_array(nvpair_t *, uint64_t **, uint_t *); +int nvpair_value_string_array(nvpair_t *, char ***, uint_t *); +int nvpair_value_nvlist_array(nvpair_t *, nvlist_t ***, uint_t *); +int nvpair_value_hrtime(nvpair_t *, hrtime_t *); +#if !defined(_KERNEL) +int nvpair_value_double(nvpair_t *, double *); +#endif + +nvlist_t *fnvlist_alloc(void); +void fnvlist_free(nvlist_t *); +size_t fnvlist_size(nvlist_t *); +char *fnvlist_pack(nvlist_t *, size_t *); +void fnvlist_pack_free(char *, size_t); +nvlist_t *fnvlist_unpack(char *, size_t); +nvlist_t 
*fnvlist_dup(nvlist_t *); +void fnvlist_merge(nvlist_t *, nvlist_t *); +size_t fnvlist_num_pairs(nvlist_t *); + +void fnvlist_add_boolean(nvlist_t *, const char *); +void fnvlist_add_boolean_value(nvlist_t *, const char *, boolean_t); +void fnvlist_add_byte(nvlist_t *, const char *, uchar_t); +void fnvlist_add_int8(nvlist_t *, const char *, int8_t); +void fnvlist_add_uint8(nvlist_t *, const char *, uint8_t); +void fnvlist_add_int16(nvlist_t *, const char *, int16_t); +void fnvlist_add_uint16(nvlist_t *, const char *, uint16_t); +void fnvlist_add_int32(nvlist_t *, const char *, int32_t); +void fnvlist_add_uint32(nvlist_t *, const char *, uint32_t); +void fnvlist_add_int64(nvlist_t *, const char *, int64_t); +void fnvlist_add_uint64(nvlist_t *, const char *, uint64_t); +void fnvlist_add_string(nvlist_t *, const char *, const char *); +void fnvlist_add_nvlist(nvlist_t *, const char *, nvlist_t *); +void fnvlist_add_nvpair(nvlist_t *, nvpair_t *); +void fnvlist_add_boolean_array(nvlist_t *, const char *, boolean_t *, uint_t); +void fnvlist_add_byte_array(nvlist_t *, const char *, uchar_t *, uint_t); +void fnvlist_add_int8_array(nvlist_t *, const char *, int8_t *, uint_t); +void fnvlist_add_uint8_array(nvlist_t *, const char *, uint8_t *, uint_t); +void fnvlist_add_int16_array(nvlist_t *, const char *, int16_t *, uint_t); +void fnvlist_add_uint16_array(nvlist_t *, const char *, uint16_t *, uint_t); +void fnvlist_add_int32_array(nvlist_t *, const char *, int32_t *, uint_t); +void fnvlist_add_uint32_array(nvlist_t *, const char *, uint32_t *, uint_t); +void fnvlist_add_int64_array(nvlist_t *, const char *, int64_t *, uint_t); +void fnvlist_add_uint64_array(nvlist_t *, const char *, uint64_t *, uint_t); +void fnvlist_add_string_array(nvlist_t *, const char *, char * const *, uint_t); +void fnvlist_add_nvlist_array(nvlist_t *, const char *, nvlist_t **, uint_t); + +void fnvlist_remove(nvlist_t *, const char *); +void fnvlist_remove_nvpair(nvlist_t *, nvpair_t *); + +nvpair_t *fnvlist_lookup_nvpair(nvlist_t *nvl, const char *name); +boolean_t fnvlist_lookup_boolean(nvlist_t *nvl, const char *name); +boolean_t fnvlist_lookup_boolean_value(nvlist_t *nvl, const char *name); +uchar_t fnvlist_lookup_byte(nvlist_t *nvl, const char *name); +int8_t fnvlist_lookup_int8(nvlist_t *nvl, const char *name); +int16_t fnvlist_lookup_int16(nvlist_t *nvl, const char *name); +int32_t fnvlist_lookup_int32(nvlist_t *nvl, const char *name); +int64_t fnvlist_lookup_int64(nvlist_t *nvl, const char *name); +uint8_t fnvlist_lookup_uint8_t(nvlist_t *nvl, const char *name); +uint16_t fnvlist_lookup_uint16(nvlist_t *nvl, const char *name); +uint32_t fnvlist_lookup_uint32(nvlist_t *nvl, const char *name); +uint64_t fnvlist_lookup_uint64(nvlist_t *nvl, const char *name); +char *fnvlist_lookup_string(nvlist_t *nvl, const char *name); +nvlist_t *fnvlist_lookup_nvlist(nvlist_t *nvl, const char *name); + +boolean_t fnvpair_value_boolean_value(nvpair_t *nvp); +uchar_t fnvpair_value_byte(nvpair_t *nvp); +int8_t fnvpair_value_int8(nvpair_t *nvp); +int16_t fnvpair_value_int16(nvpair_t *nvp); +int32_t fnvpair_value_int32(nvpair_t *nvp); +int64_t fnvpair_value_int64(nvpair_t *nvp); +uint8_t fnvpair_value_uint8_t(nvpair_t *nvp); +uint16_t fnvpair_value_uint16(nvpair_t *nvp); +uint32_t fnvpair_value_uint32(nvpair_t *nvp); +uint64_t fnvpair_value_uint64(nvpair_t *nvp); +char *fnvpair_value_string(nvpair_t *nvp); +nvlist_t *fnvpair_value_nvlist(nvpair_t *nvp); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_NVPAIR_H */ diff --git 
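A minimal usage sketch of the list-management and typed add/lookup interfaces declared above; the attribute names are invented for illustration and error handling is abbreviated.

#include <sys/nvpair.h>

static int
nvlist_demo(void)
{
	nvlist_t *nvl;
	uint64_t guid;
	char *name;
	int err;

	/* NV_UNIQUE_NAME: adding a pair with an existing name replaces it */
	if ((err = nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0)) != 0)
		return (err);
	if ((err = nvlist_add_uint64(nvl, "guid", 0x1234)) == 0)
		err = nvlist_add_string(nvl, "name", "tank");

	/* typed lookups return 0 on success, ENOENT when absent */
	if (err == 0 && nvlist_lookup_uint64(nvl, "guid", &guid) == 0)
		err = nvlist_lookup_string(nvl, "name", &name);

	nvlist_free(nvl);
	return (err);
}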
a/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair_impl.h b/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair_impl.h new file mode 100644 index 000000000000..c9874b3e4db7 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair_impl.h @@ -0,0 +1,90 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 2017 by Delphix. All rights reserved. + */ + +#ifndef _NVPAIR_IMPL_H +#define _NVPAIR_IMPL_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/nvpair.h> + +/* + * The structures here are provided for information and debugging purposes + * only and may be changed in the future. + */ + +/* + * implementation linked list for pre-packed data + */ +typedef struct i_nvp i_nvp_t; + +struct i_nvp { + union { + /* ensure alignment */ + uint64_t _nvi_align; + + struct { + /* pointer to next nvpair */ + i_nvp_t *_nvi_next; + + /* pointer to prev nvpair */ + i_nvp_t *_nvi_prev; + + /* next pair in table bucket */ + i_nvp_t *_nvi_hashtable_next; + } _nvi; + } _nvi_un; + + /* nvpair */ + nvpair_t nvi_nvp; }; +#define nvi_next _nvi_un._nvi._nvi_next +#define nvi_prev _nvi_un._nvi._nvi_prev +#define nvi_hashtable_next _nvi_un._nvi._nvi_hashtable_next + +typedef struct { + i_nvp_t *nvp_list; /* linked list of nvpairs */ + i_nvp_t *nvp_last; /* last nvpair */ + i_nvp_t *nvp_curr; /* current walker nvpair */ + nv_alloc_t *nvp_nva; /* pluggable allocator */ + uint32_t nvp_stat; /* internal state */ + + i_nvp_t **nvp_hashtable; /* table of entries used for lookup */ + uint32_t nvp_nbuckets; /* # of buckets in hash table */ + uint32_t nvp_nentries; /* # of entries in hash table */ +} nvpriv_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _NVPAIR_IMPL_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/processor.h b/sys/cddl/contrib/opensolaris/uts/common/sys/processor.h new file mode 100644 index 000000000000..ec4b7471e50c --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/sys/processor.h @@ -0,0 +1,140 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
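The doubly linked i_nvp list above is what keeps iteration in insertion order, while nvp_hashtable accelerates lookups by name; a sketch of a walker built on the public iteration interface from sys/nvpair.h (the function name is invented):

#include <sys/nvpair.h>

/* Visit every pair in insertion order (illustrative only). */
static void
nvlist_walk(nvlist_t *nvl)
{
	nvpair_t *nvp = NULL;

	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
		char *name = nvpair_name(nvp);
		data_type_t type = nvpair_type(nvp);

		/* dispatch on type here, e.g. nvpair_value_uint64(nvp, ...) */
		(void) name;
		(void) type;
	}
}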
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T + * All Rights Reserved + * + */ + +/* + * Copyright 2014 Garrett D'Amore <garrett@damore.org> + * + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_PROCESSOR_H +#define _SYS_PROCESSOR_H + +#include <sys/types.h> +#include <sys/procset.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Definitions for p_online, processor_info & lgrp system calls. + */ + +/* + * Type for an lgrpid + */ +typedef uint16_t lgrpid_t; + +/* + * Type for processor name (CPU number). + */ +typedef int processorid_t; +typedef int chipid_t; + +/* + * Flags and return values for p_online(2), and pi_state for processor_info(2). + * These flags are *not* for in-kernel examination of CPU states. + * See <sys/cpuvar.h> for appropriate informational functions. + */ +#define P_OFFLINE 0x0001 /* processor is offline, as quiet as possible */ +#define P_ONLINE 0x0002 /* processor is online */ +#define P_STATUS 0x0003 /* value passed to p_online to request status */ +#define P_FAULTED 0x0004 /* processor is offline, in faulted state */ +#define P_POWEROFF 0x0005 /* processor is powered off */ +#define P_NOINTR 0x0006 /* processor is online, but no I/O interrupts */ +#define P_SPARE 0x0007 /* processor is offline, can be reactivated */ +#define P_BAD P_FAULTED /* unused but defined by USL */ +#define P_FORCED 0x10000000 /* force processor offline */ + +/* + * String names for processor states defined above. + */ +#define PS_OFFLINE "off-line" +#define PS_ONLINE "on-line" +#define PS_FAULTED "faulted" +#define PS_POWEROFF "powered-off" +#define PS_NOINTR "no-intr" +#define PS_SPARE "spare" + +/* + * Structure filled in by processor_info(2). This structure + * SHOULD NOT BE MODIFIED. Changes to the structure would + * negate ABI compatibility. + * + * The string fields are guaranteed to contain a NULL. + * + * The pi_fputypes field contains a (possibly empty) comma-separated + * list of floating point identifier strings. 
+ */ +#define PI_TYPELEN 16 /* max size of CPU type string */ +#define PI_FPUTYPE 32 /* max size of FPU types string */ + +typedef struct { + int pi_state; /* processor state, see above */ + char pi_processor_type[PI_TYPELEN]; /* ASCII CPU type */ + char pi_fputypes[PI_FPUTYPE]; /* ASCII FPU types */ + int pi_clock; /* CPU clock freq in MHz */ +} processor_info_t; + +/* + * Binding values for processor_bind(2) + */ +#define PBIND_NONE -1 /* LWP/thread is not bound */ +#define PBIND_QUERY -2 /* don't set, just return the binding */ +#define PBIND_HARD -3 /* prevents offlining CPU (default) */ +#define PBIND_SOFT -4 /* allows offlining CPU */ +#define PBIND_QUERY_TYPE -5 /* Return binding type */ + +/* + * User-level system call interface prototypes + */ +#ifndef _KERNEL + +extern int p_online(processorid_t processorid, int flag); +extern int processor_info(processorid_t processorid, + processor_info_t *infop); +extern int processor_bind(idtype_t idtype, id_t id, + processorid_t processorid, processorid_t *obind); +extern processorid_t getcpuid(void); +extern lgrpid_t gethomelgroup(void); + +#else /* _KERNEL */ + +/* + * Internal interface prototypes + */ +extern int p_online_internal(processorid_t, int, int *); +extern int p_online_internal_locked(processorid_t, int, int *); + +#endif /* !_KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_PROCESSOR_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/procset.h b/sys/cddl/contrib/opensolaris/uts/common/sys/procset.h new file mode 100644 index 000000000000..a7d58e52534c --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/sys/procset.h @@ -0,0 +1,166 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + + +#ifndef _SYS_PROCSET_H +#define _SYS_PROCSET_H + +#pragma ident "%Z%%M% %I% %E% SMI" /* SVr4.0 1.6 */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/feature_tests.h> +#include <sys/types.h> +#include <sys/signal.h> + +/* + * This file defines the data needed to specify a set of + * processes. These types are used by the sigsend, sigsendset, + * priocntl, priocntlset, waitid, evexit, and evexitset system + * calls. + */ +#define P_INITPID 1 +#define P_INITUID 0 +#define P_INITPGID 0 + +#ifndef _IDTYPE_T_DECLARED + +/* + * The following defines the values for an identifier type. It + * specifies the interpretation of an id value. An idtype and + * id together define a simple set of processes. 
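A userland sketch of the processor interfaces declared above: query a CPU's state with processor_info(), then bind the calling process to it with processor_bind(). The target CPU number is illustrative and error handling is abbreviated.

#include <sys/types.h>
#include <sys/processor.h>
#include <sys/procset.h>
#include <unistd.h>
#include <stdio.h>

int
main(void)
{
	processor_info_t pi;
	processorid_t obind;
	processorid_t cpu = 0;	/* illustrative target CPU */

	if (processor_info(cpu, &pi) == 0 && pi.pi_state == P_ONLINE &&
	    processor_bind(P_PID, getpid(), cpu, &obind) == 0)
		(void) printf("bound; previous binding was %d\n", (int)obind);
	return (0);
}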
+ */ +typedef enum +#if !defined(_XPG4_2) || defined(__EXTENSIONS__) + idtype /* pollutes XPG4.2 namespace */ +#endif + { + P_PID, /* A process identifier. */ + P_PPID, /* A parent process identifier. */ + P_PGID, /* A process group (job control group) */ + /* identifier. */ + P_SID, /* A session identifier. */ + P_CID, /* A scheduling class identifier. */ + P_UID, /* A user identifier. */ + P_GID, /* A group identifier. */ + P_ALL, /* All processes. */ + P_LWPID, /* An LWP identifier. */ + P_TASKID, /* A task identifier. */ + P_PROJID, /* A project identifier. */ + P_POOLID, /* A pool identifier. */ + P_ZONEID, /* A zone identifier. */ + P_CTID, /* A (process) contract identifier. */ + P_CPUID, /* CPU identifier. */ + P_PSETID /* Processor set identifier */ +} idtype_t; + +#define _IDTYPE_T_DECLARED + +#endif + +/* + * The following defines the operations which can be performed to + * combine two simple sets of processes to form another set of + * processes. + */ +#if !defined(_XPG4_2) || defined(__EXTENSIONS__) +typedef enum idop { + POP_DIFF, /* Set difference. The processes which */ + /* are in the left operand set and not */ + /* in the right operand set. */ + POP_AND, /* Set conjunction. The processes */ + /* which are in both the left and right */ + /* operand sets. */ + POP_OR, /* Set disjunction. The processes */ + /* which are in either the left or the */ + /* right operand sets (or both). */ + POP_XOR /* Set exclusive or. The processes */ + /* which are in either the left or */ + /* right operand sets but not in both. */ +} idop_t; + + +/* + * The following structure is used to define a set of processes. + * The set is defined in terms of two simple sets of processes + * and an operator which operates on these two operand sets. + */ +typedef struct procset { + idop_t p_op; /* The operator connecting the */ + /* following two operands each */ + /* of which is a simple set of */ + /* processes. */ + + idtype_t p_lidtype; + /* The type of the left operand */ + /* simple set. */ + id_t p_lid; /* The id of the left operand. */ + + idtype_t p_ridtype; + /* The type of the right */ + /* operand simple set. */ + id_t p_rid; /* The id of the right operand. */ +} procset_t; + +/* + * The following macro can be used to initialize a procset_t + * structure. + */ +#define setprocset(psp, op, ltype, lid, rtype, rid) \ + (psp)->p_op = (op); \ + (psp)->p_lidtype = (ltype); \ + (psp)->p_lid = (lid); \ + (psp)->p_ridtype = (rtype); \ + (psp)->p_rid = (rid); + +#endif /* !defined(_XPG4_2) || defined(__EXTENSIONS__) */ + +#ifdef illumos +#ifdef _KERNEL + +struct proc; + +extern int dotoprocs(procset_t *, int (*)(), char *); +extern int dotolwp(procset_t *, int (*)(), char *); +extern int procinset(struct proc *, procset_t *); +extern int sigsendproc(struct proc *, sigsend_t *); +extern int sigsendset(procset_t *, sigsend_t *); +extern boolean_t cur_inset_only(procset_t *); +extern id_t getmyid(idtype_t); + +#endif /* _KERNEL */ +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_PROCSET_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/synch.h b/sys/cddl/contrib/opensolaris/uts/common/sys/synch.h new file mode 100644 index 000000000000..6431bf22bca0 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/sys/synch.h @@ -0,0 +1,162 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License.
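A sketch of composing a set with the setprocset() macro above; the ids are invented. This set names the processes owned by uid 100 that are not in process group 42.

#include <sys/procset.h>

static void
procset_demo(procset_t *psp)
{
	/* left operand: uid 100; right operand: pgid 42; keep the difference */
	setprocset(psp, POP_DIFF, P_UID, 100, P_PGID, 42);
}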
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_SYNCH_H +#define _SYS_SYNCH_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifndef _ASM +#include <sys/types.h> +#include <sys/int_types.h> +#endif /* _ASM */ + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef _ASM +/* + * Thread and LWP mutexes have the same type + * definitions. + * + * NOTE: + * + * POSIX requires that <pthread.h> define the structures pthread_mutex_t + * and pthread_cond_t. Although these structures are identical to mutex_t + * (lwp_mutex_t) and cond_t (lwp_cond_t), defined here, a typedef of these + * types would require including <synch.h> in <pthread.h>, pulling in + * non-posix symbols/constants, violating POSIX namespace restrictions. Hence, + * pthread_mutex_t/pthread_cond_t have been redefined (in <sys/types.h>). + * Any modifications done to mutex_t/lwp_mutex_t or cond_t/lwp_cond_t must + * also be done to pthread_mutex_t/pthread_cond_t. + */ +typedef struct _lwp_mutex { + struct { + uint16_t flag1; + uint8_t flag2; + uint8_t ceiling; + union { + uint16_t bcptype; + struct { + uint8_t count_type1; + uint8_t count_type2; + } mtype_rcount; + } mbcp_type_un; + uint16_t magic; + } flags; + union { + struct { + uint8_t pad[8]; + } lock64; + struct { + uint32_t ownerpid; + uint32_t lockword; + } lock32; + upad64_t owner64; + } lock; + upad64_t data; +} lwp_mutex_t; + +/* + * Thread and LWP condition variables have the same + * type definition. + * NOTE: + * The layout of the following structure should be kept in sync with the + * layout of pthread_cond_t in sys/types.h. See NOTE above for lwp_mutex_t. + */ +typedef struct _lwp_cond { + struct { + uint8_t flag[4]; + uint16_t type; + uint16_t magic; + } flags; + upad64_t data; +} lwp_cond_t; + +/* + * LWP semaphores + */ +typedef struct _lwp_sema { + uint32_t count; /* semaphore count */ + uint16_t type; + uint16_t magic; + uint8_t flags[8]; /* last byte reserved for waiters */ + upad64_t data; /* optional data */ +} lwp_sema_t; + +/* + * Thread and LWP rwlocks have the same type definition. + * NOTE: The layout of this structure should be kept in sync with the layout + * of the corresponding structure of pthread_rwlock_t in sys/types.h. + * Also, because we have to deal with C++, there is an identical structure + * for rwlock_t in head/sync.h that we cannot change. + */ +typedef struct _lwp_rwlock { + int32_t readers; /* rwstate word */ + uint16_t type; + uint16_t magic; + lwp_mutex_t mutex; /* used with process-shared rwlocks */ + lwp_cond_t readercv; /* used only to indicate ownership */ + lwp_cond_t writercv; /* used only to indicate ownership */ +} lwp_rwlock_t; + +#endif /* _ASM */ +/* + * Definitions of synchronization types.
+ */ +#define USYNC_THREAD 0x00 /* private to a process */ +#define USYNC_PROCESS 0x01 /* shared by processes */ + +/* Keep the following values in sync with pthread.h */ +#define LOCK_NORMAL 0x00 /* same as USYNC_THREAD */ +#define LOCK_SHARED 0x01 /* same as USYNC_PROCESS */ +#define LOCK_ERRORCHECK 0x02 /* error check lock */ +#define LOCK_RECURSIVE 0x04 /* recursive lock */ +#define LOCK_PRIO_INHERIT 0x10 /* priority inheritance lock */ +#define LOCK_PRIO_PROTECT 0x20 /* priority ceiling lock */ +#define LOCK_ROBUST 0x40 /* robust lock */ + +/* + * USYNC_PROCESS_ROBUST is a deprecated historical type. It is mapped + * into (USYNC_PROCESS | LOCK_ROBUST) by mutex_init(). Application code + * should be revised to use (USYNC_PROCESS | LOCK_ROBUST) rather than this. + */ +#define USYNC_PROCESS_ROBUST 0x08 + +/* + * lwp_mutex_t flags + */ +#define LOCK_OWNERDEAD 0x1 +#define LOCK_NOTRECOVERABLE 0x2 +#define LOCK_INITED 0x4 +#define LOCK_UNMAPPED 0x8 + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SYNCH_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent.h b/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent.h new file mode 100644 index 000000000000..9a259828fc1f --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent.h @@ -0,0 +1,289 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. 
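A hedged userland sketch of how these type bits are combined; mutex_init() here is the Solaris threads interface from the userland <synch.h>, not something declared in this file.

#include <synch.h>

static mutex_t shared_mx;

static int
init_shared_mutex(void)
{
	/*
	 * Process-shared and robust: a later locker sees EOWNERDEAD if a
	 * holder dies, the modern spelling of the deprecated
	 * USYNC_PROCESS_ROBUST type described above.
	 */
	return (mutex_init(&shared_mx, USYNC_PROCESS | LOCK_ROBUST, NULL));
}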
+ */ + +#ifndef _SYS_SYSEVENT_H +#define _SYS_SYSEVENT_H + +#include <sys/nvpair.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef NULL +#if defined(_LP64) && !defined(__cplusplus) +#define NULL 0L +#else +#define NULL 0 +#endif +#endif + +/* Internal registration class and subclass */ +#define EC_ALL "register_all_classes" +#define EC_SUB_ALL "register_all_subclasses" + +/* + * Event allocation/enqueuing sleep/nosleep flags + */ +#define SE_SLEEP 0 +#define SE_NOSLEEP 1 + +/* Framework error codes */ +#define SE_EINVAL 1 /* Invalid argument */ +#define SE_ENOMEM 2 /* Unable to allocate memory */ +#define SE_EQSIZE 3 /* Maximum event q size exceeded */ +#define SE_EFAULT 4 /* Copy fault */ +#define SE_NOTFOUND 5 /* Attribute not found */ +#define SE_NO_TRANSPORT 6 /* sysevent transport down */ + +/* Internal data types */ + +#define SE_DATA_TYPE_BYTE DATA_TYPE_BYTE +#define SE_DATA_TYPE_INT16 DATA_TYPE_INT16 +#define SE_DATA_TYPE_UINT16 DATA_TYPE_UINT16 +#define SE_DATA_TYPE_INT32 DATA_TYPE_INT32 +#define SE_DATA_TYPE_UINT32 DATA_TYPE_UINT32 +#define SE_DATA_TYPE_INT64 DATA_TYPE_INT64 +#define SE_DATA_TYPE_UINT64 DATA_TYPE_UINT64 +#define SE_DATA_TYPE_STRING DATA_TYPE_STRING +#define SE_DATA_TYPE_BYTES DATA_TYPE_BYTE_ARRAY +#define SE_DATA_TYPE_TIME DATA_TYPE_HRTIME + +#define SE_KERN_PID 0 + +#define SUNW_VENDOR "SUNW" +#define SE_USR_PUB "usr:" +#define SE_KERN_PUB "kern:" +#define SUNW_KERN_PUB SUNW_VENDOR ":" SE_KERN_PUB +#define SUNW_USR_PUB SUNW_VENDOR ":" SE_USR_PUB + +/* + * Event header and attribute value limits + */ +#define MAX_ATTR_NAME 1024 +#define MAX_STRING_SZ 1024 +#define MAX_BYTE_ARRAY 1024 + +#define MAX_CLASS_LEN 64 +#define MAX_SUBCLASS_LEN 64 +#define MAX_PUB_LEN 128 +#define MAX_CHNAME_LEN 128 +#define MAX_SUBID_LEN 16 + +/* + * Limit for the event payload size + */ +#define MAX_EV_SIZE_LEN (SHRT_MAX/4) + +/* Opaque sysevent_t data type */ +typedef void *sysevent_t; + +/* Opaque channel bind data type */ +typedef void evchan_t; + +/* sysevent attribute list */ +typedef nvlist_t sysevent_attr_list_t; + +/* sysevent attribute name-value pair */ +typedef nvpair_t sysevent_attr_t; + +/* Unique event identifier */ +typedef struct sysevent_id { + uint64_t eid_seq; + hrtime_t eid_ts; +} sysevent_id_t; + +/* Event attribute value structures */ +typedef struct sysevent_bytes { + int32_t size; + uchar_t *data; +} sysevent_bytes_t; + +typedef struct sysevent_value { + int32_t value_type; /* data type */ + union { + uchar_t sv_byte; + int16_t sv_int16; + uint16_t sv_uint16; + int32_t sv_int32; + uint32_t sv_uint32; + int64_t sv_int64; + uint64_t sv_uint64; + hrtime_t sv_time; + char *sv_string; + sysevent_bytes_t sv_bytes; + } value; +} sysevent_value_t; + +/* + * The following flags determine the memory allocation semantics to use for + * kernel event buffer allocation by userland and kernel versions of + * sysevent_evc_publish(). + * + * EVCH_SLEEP and EVCH_NOSLEEP respectively map to KM_SLEEP and KM_NOSLEEP. + * EVCH_TRYHARD is a kernel-only publish flag that allows event allocation + * routines to use alternate kmem caches in situations where free memory + * may be low. Kernel callers of sysevent_evc_publish() must set flags to + * one of EVCH_SLEEP, EVCH_NOSLEEP or EVCH_TRYHARD. Userland callers of + * sysevent_evc_publish() must set flags to one of EVCH_SLEEP or EVCH_NOSLEEP. + * + * EVCH_QWAIT determines whether or not we should wait for slots in the event + * queue at publication time.
EVCH_QWAIT may be used by kernel and userland + * publishers and must be used in conjunction with any one of EVCH_SLEEP, + * EVCH_NOSLEEP or EVCH_TRYHARD (kernel-only). + */ + +#define EVCH_NOSLEEP 0x0001 /* No sleep on kmem_alloc() */ +#define EVCH_SLEEP 0x0002 /* Sleep on kmem_alloc() */ +#define EVCH_TRYHARD 0x0004 /* May use alternate kmem cache for alloc */ +#define EVCH_QWAIT 0x0008 /* Wait for slot in event queue */ + +/* + * Meaning of flags for subscribe. Bits 8 to 15 are dedicated to + * the consolidation private interface, so flags defined here are restricted + * to the LSB. + * + * EVCH_SUB_KEEP indicates that this subscription should persist even if + * this subscriber id should die unexpectedly; matching events will be + * queued (up to a limit) and will be delivered if/when we restart again + * with the same subscriber id. + */ +#define EVCH_SUB_KEEP 0x01 + +/* + * Subscriptions may be wildcarded, but we limit the number of + * wildcards permitted. + */ +#define EVCH_WILDCARD_MAX 10 + +/* + * Used in unsubscribe to indicate all subscriber ids for a channel. + */ +#define EVCH_ALLSUB "all_subs" + +/* + * Meaning of flags parameter of channel bind function + * + * EVCH_CREAT indicates to create a channel if not already present. + * + * EVCH_HOLD_PEND indicates that events should be published to this + * channel even if there are no matching subscribers present; when + * a subscriber belatedly binds to the channel and registers their + * subscriptions they will receive events that predate their bind. + * If the channel is closed, however, with no remaining bindings then + * the channel is destroyed. + * + * EVCH_HOLD_PEND_INDEF is a stronger version of EVCH_HOLD_PEND - + * even if the channel has no remaining bindings it will not be + * destroyed so long as events remain unconsumed. This is suitable for + * use with short-lived event producers that may bind to (create) the + * channel and exit before the intended consumer has started.
+ */ +#define EVCH_CREAT 0x0001 +#define EVCH_HOLD_PEND 0x0002 +#define EVCH_HOLD_PEND_INDEF 0x0004 +#define EVCH_B_FLAGS 0x0007 /* All valid bits */ + +/* + * Meaning of commands of evc_control function + */ +#define EVCH_GET_CHAN_LEN_MAX 1 /* Get event queue length limit */ +#define EVCH_GET_CHAN_LEN 2 /* Get event queue length */ +#define EVCH_SET_CHAN_LEN 3 /* Set event queue length */ +#define EVCH_CMD_LAST EVCH_SET_CHAN_LEN /* Last command */ + +#ifdef illumos +/* + * Shared user/kernel event channel interface definitions + */ +extern int sysevent_evc_bind(const char *, evchan_t **, uint32_t); +extern int sysevent_evc_unbind(evchan_t *); +extern int sysevent_evc_subscribe(evchan_t *, const char *, const char *, + int (*)(sysevent_t *, void *), void *, uint32_t); +extern int sysevent_evc_unsubscribe(evchan_t *, const char *); +extern int sysevent_evc_publish(evchan_t *, const char *, const char *, + const char *, const char *, nvlist_t *, uint32_t); +extern int sysevent_evc_control(evchan_t *, int, ...); +extern int sysevent_evc_setpropnvl(evchan_t *, nvlist_t *); +extern int sysevent_evc_getpropnvl(evchan_t *, nvlist_t **); +#endif /* illumos */ + +#ifndef _KERNEL + +#ifdef illumos +/* + * Userland-only event channel interfaces + */ + +#include <door.h> + +typedef struct sysevent_subattr sysevent_subattr_t; + +extern sysevent_subattr_t *sysevent_subattr_alloc(void); +extern void sysevent_subattr_free(sysevent_subattr_t *); + +extern void sysevent_subattr_thrattr(sysevent_subattr_t *, pthread_attr_t *); +extern void sysevent_subattr_sigmask(sysevent_subattr_t *, sigset_t *); + +extern void sysevent_subattr_thrcreate(sysevent_subattr_t *, + door_xcreate_server_func_t *, void *); +extern void sysevent_subattr_thrsetup(sysevent_subattr_t *, + door_xcreate_thrsetup_func_t *, void *); + +extern int sysevent_evc_xsubscribe(evchan_t *, const char *, const char *, + int (*)(sysevent_t *, void *), void *, uint32_t, sysevent_subattr_t *); +#endif /* illumos */ + +#else + +/* + * Kernel log_event interfaces. + */ +extern int log_sysevent(sysevent_t *, int, sysevent_id_t *); + +extern sysevent_t *sysevent_alloc(char *, char *, char *, int); +extern void sysevent_free(sysevent_t *); +extern int sysevent_add_attr(sysevent_attr_list_t **, char *, + sysevent_value_t *, int); +extern void sysevent_free_attr(sysevent_attr_list_t *); +extern int sysevent_attach_attributes(sysevent_t *, sysevent_attr_list_t *); +extern void sysevent_detach_attributes(sysevent_t *); +#ifdef illumos +extern char *sysevent_get_class_name(sysevent_t *); +extern char *sysevent_get_subclass_name(sysevent_t *); +extern uint64_t sysevent_get_seq(sysevent_t *); +extern void sysevent_get_time(sysevent_t *, hrtime_t *); +extern size_t sysevent_get_size(sysevent_t *); +extern char *sysevent_get_pub(sysevent_t *); +extern int sysevent_get_attr_list(sysevent_t *, nvlist_t **); +#endif /* illumos */ + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SYSEVENT_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent/dev.h b/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent/dev.h new file mode 100644 index 000000000000..9d3107d09011 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent/dev.h @@ -0,0 +1,256 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
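A sketch built on the illumos-only channel interfaces declared above; the channel name, subscriber id, and publisher name are invented, and the callback semantics are abbreviated.

#include <sys/sysevent.h>
#include <sys/sysevent/eventdefs.h>

static int
ev_handler(sysevent_t *ev, void *cookie)
{
	/* consume the event here */
	(void) ev;
	(void) cookie;
	return (0);
}

static int
channel_demo(void)
{
	evchan_t *ch;
	int err;

	/* create the channel if needed; queue events for late subscribers */
	if ((err = sysevent_evc_bind("com.example:demo", &ch,
	    EVCH_CREAT | EVCH_HOLD_PEND)) != 0)
		return (err);
	(void) sysevent_evc_subscribe(ch, "demo_sub", EC_PLATFORM,
	    ev_handler, NULL, 0);
	err = sysevent_evc_publish(ch, EC_PLATFORM, ESC_PLATFORM_SP_RESET,
	    SUNW_VENDOR, "demo_pub", NULL, EVCH_SLEEP);
	(void) sysevent_evc_unbind(ch);
	return (err);
}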
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_SYSEVENT_DEV_H +#define _SYS_SYSEVENT_DEV_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/sysevent/eventdefs.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Event schema for EC_DEV_ADD/ESC_DISK + * + * Event Class - EC_DEV_ADD + * Event Sub-Class - ESC_DISK + * + * Attribute Name - EV_VERSION + * Attribute Type - DATA_TYPE_INT32 + * Attribute Value - event version number + * + * Attribute Name - DEV_NAME + * Attribute Type - DATA_TYPE_STRING + * Attribute Value - /dev name to the raw device. + * The name does not include the slice number component. + * + * Attribute Name - DEV_PHYS_PATH + * Attribute Type - DATA_TYPE_STRING + * Attribute Value - physical path of the device without the "/devices" + * prefix. + * + * Attribute Name - DEV_DRIVER_NAME + * Attribute Type - DATA_TYPE_STRING + * Attribute Value - driver name + * + * Attribute Name - DEV_INSTANCE + * Attribute Type - DATA_TYPE_INT32 + * Attribute Value - driver instance number + * + * Attribute Name - DEV_PROP_PREFIX<devinfo_node_property> + * Attribute Type - data type of the devinfo_node_property + * Attribute Value - value of the devinfo_node_property + * + * + * Event schema for EC_DEV_ADD/ESC_NETWORK + * + * Event Class - EC_DEV_ADD + * Event Sub-Class - ESC_NETWORK + * + * Attribute Name - EV_VERSION + * Attribute Type - DATA_TYPE_INT32 + * Attribute Value - event version number + * + * Attribute Name - DEV_NAME + * Attribute Type - DATA_TYPE_STRING + * Attribute Value - /dev name associated with the device if it exists. + * /dev name associated with the driver for DLPI + * Style-2 only drivers. + * + * Attribute Name - DEV_PHYS_PATH + * Attribute Type - DATA_TYPE_STRING + * Attribute Value - physical path of the device without the "/devices" + * prefix. + * + * Attribute Name - DEV_DRIVER_NAME + * Attribute Type - DATA_TYPE_STRING + * Attribute Value - driver name + * + * Attribute Name - DEV_INSTANCE + * Attribute Type - DATA_TYPE_INT32 + * Attribute Value - driver instance number + * + * Attribute Name - DEV_PROP_PREFIX<devinfo_node_property> + * Attribute Type - data type of the devinfo_node_property + * Attribute Value - value of the devinfo_node_property + * + * + * Event schema for EC_DEV_ADD/ESC_PRINTER + * + * Event Class - EC_DEV_ADD + * Event Sub-Class - ESC_PRINTER + * + * Attribute Name - EV_VERSION + * Attribute Type - DATA_TYPE_INT32 + * Attribute Value - event version number + * + * Attribute Name - DEV_NAME + * Attribute Type - DATA_TYPE_STRING + * Attribute Value - /dev/printers name associated with the device + * if it exists. + * /dev name associated with the device if it exists + * + * Attribute Name - DEV_PHYS_PATH + * Attribute Type - DATA_TYPE_STRING + * Attribute Value - physical path of the device without the "/devices" + * prefix.
+ * + * Attribute Name - DEV_DRIVER_NAME + * Attribute Type - DATA_TYPE_STRING + * Attribute Value - driver name + * + * Attribute Name - DEV_INSTANCE + * Attribute Type - DATA_TYPE_INT32 + * Attribute Value - driver instance number + * + * Attribute Name - DEV_PROP_PREFIX<devinfo_node_property> + * Attribute Type - data type of the devinfo_node_property + * Attribute Value - value of the devinfo_node_property + * + * + * Event schema for EC_DEV_REMOVE/ESC_DISK + * + * Event Class - EC_DEV_REMOVE + * Event Sub-Class - ESC_DISK + * + * Attribute Name - EV_VERSION + * Attribute Type - DATA_TYPE_INT32 + * Attribute Value - event version number + * + * Attribute Name - DEV_NAME + * Attribute Type - DATA_TYPE_STRING + * Attribute Value - /dev name to the raw device. + * The name does not include the slice number component. + * + * Attribute Name - DEV_PHYS_PATH + * Attribute Type - DATA_TYPE_STRING + * Attribute Value - physical path of the device without the "/devices" + * prefix. + * + * Attribute Name - DEV_DRIVER_NAME + * Attribute Type - DATA_TYPE_STRING + * Attribute Value - driver name + * + * Attribute Name - DEV_INSTANCE + * Attribute Type - DATA_TYPE_INT32 + * Attribute Value - driver instance number + * + * + * Event schema for EC_DEV_REMOVE/ESC_NETWORK + * + * Event Class - EC_DEV_REMOVE + * Event Sub-Class - ESC_NETWORK + * + * Attribute Name - EV_VERSION + * Attribute Type - DATA_TYPE_INT32 + * Attribute Value - event version number + * + * Attribute Name - DEV_NAME + * Attribute Type - DATA_TYPE_STRING + * Attribute Value - /dev name associated with the device if it exists. + * /dev name associated with the driver for DLPI + * Style-2 only drivers. + * + * Attribute Name - DEV_PHYS_PATH + * Attribute Type - DATA_TYPE_STRING + * Attribute Value - physical path of the device without the "/devices" + * prefix. + * + * Attribute Name - DEV_DRIVER_NAME + * Attribute Type - DATA_TYPE_STRING + * Attribute Value - driver name + * + * Attribute Name - DEV_INSTANCE + * Attribute Type - DATA_TYPE_INT32 + * Attribute Value - driver instance number + * + * + * Event schema for EC_DEV_REMOVE/ESC_PRINTER + * + * Event Class - EC_DEV_REMOVE + * Event Sub-Class - ESC_PRINTER + * + * Attribute Name - EV_VERSION + * Attribute Type - DATA_TYPE_INT32 + * Attribute Value - event version number + * + * Attribute Name - DEV_NAME + * Attribute Type - DATA_TYPE_STRING + * Attribute Value - /dev/printers name associated with the device + * if it exists. + * /dev name associated with the device if it exists + * + * Attribute Name - DEV_PHYS_PATH + * Attribute Type - DATA_TYPE_STRING + * Attribute Value - physical path of the device without the "/devices" + * prefix. + * + * Attribute Name - DEV_DRIVER_NAME + * Attribute Type - DATA_TYPE_STRING + * Attribute Value - driver name + * + * Attribute Name - DEV_INSTANCE + * Attribute Type - DATA_TYPE_INT32 + * Attribute Value - driver instance number + * + * + * Event schema for EC_DEV_BRANCH/ESC_DEV_BRANCH_ADD or ESC_DEV_BRANCH_REMOVE + * + * Event Class - EC_DEV_BRANCH + * Event Sub-Class - ESC_DEV_BRANCH_ADD or ESC_DEV_BRANCH_REMOVE + * + * Attribute Name - EV_VERSION + * Attribute Type - DATA_TYPE_INT32 + * Attribute Value - event version number + * + * Attribute Name - DEV_PHYS_PATH + * Attribute Type - DATA_TYPE_STRING + * Attribute Value - physical path to the root node of the device subtree + * without the "/devices" prefix.
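Given the schema above, a consumer holding the event's attribute list as an nvlist_t (for example, obtained via sysevent_get_attr_list()) could unpack the common fields as in this sketch; the output format is illustrative.

#include <stdio.h>
#include <sys/sysevent.h>
#include <sys/sysevent/dev.h>

static void
print_dev_attrs(nvlist_t *attrs)
{
	int32_t version, instance;
	char *phys, *driver;

	if (nvlist_lookup_int32(attrs, EV_VERSION, &version) != 0 ||
	    version != EV_V1)
		return;	/* unrecognized schema version */
	if (nvlist_lookup_string(attrs, DEV_PHYS_PATH, &phys) == 0)
		(void) printf("device: /devices%s\n", phys);
	if (nvlist_lookup_string(attrs, DEV_DRIVER_NAME, &driver) == 0 &&
	    nvlist_lookup_int32(attrs, DEV_INSTANCE, &instance) == 0)
		(void) printf("driver: %s%d\n", driver, (int)instance);
}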
+ */ + +#define EV_VERSION "version" +#define DEV_PHYS_PATH "phys_path" +#define DEV_NAME "dev_name" +#define DEV_DRIVER_NAME "driver_name" +#define DEV_INSTANCE "instance" +#define DEV_PROP_PREFIX "prop-" + +#define EV_V1 1 + +/* maximum number of devinfo node properties added to the event */ +#define MAX_PROP_COUNT 100 + +/* only properties with size less than PROP_LEN_LIMIT are added to the event */ +#define PROP_LEN_LIMIT 1024 + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SYSEVENT_DEV_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent/eventdefs.h b/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent/eventdefs.h new file mode 100644 index 000000000000..f9f81e0213cf --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent/eventdefs.h @@ -0,0 +1,230 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2016 Nexenta Systems, Inc. + * Copyright 2017 Joyent, Inc. + */ + +#ifndef _SYS_SYSEVENT_EVENTDEFS_H +#define _SYS_SYSEVENT_EVENTDEFS_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * eventdefs.h contains public definitions for sysevent types (classes + * and subclasses). All additions/removal/changes are subject + * to PSARC approval. + */ + +/* Sysevent Class definitions */ +#define EC_NONE "EC_none" +#define EC_PRIV "EC_priv" +#define EC_PLATFORM "EC_platform" /* events private to platform */ +#define EC_DR "EC_dr" /* Dynamic reconfiguration event class */ +#define EC_ENV "EC_env" /* Environmental monitor event class */ +#define EC_DOMAIN "EC_domain" /* Domain event class */ +#define EC_AP_DRIVER "EC_ap_driver" /* Alternate Pathing event class */ +#define EC_IPMP "EC_ipmp" /* IP Multipathing event class */ +#define EC_DEV_ADD "EC_dev_add" /* device add event class */ +#define EC_DEV_REMOVE "EC_dev_remove" /* device remove event class */ +#define EC_DEV_BRANCH "EC_dev_branch" /* device tree branch event class */ +#define EC_DEV_STATUS "EC_dev_status" /* device status event class */ +#define EC_FM "EC_fm" /* FMA error report event */ +#define EC_ZFS "EC_zfs" /* ZFS event */ +#define EC_DATALINK "EC_datalink" /* datalink event */ +#define EC_VRRP "EC_vrrp" /* VRRP event */ + +/* + * The following event class is reserved for exclusive use + * by Sun Cluster software. 
+ */ +#define EC_CLUSTER "EC_Cluster" + +/* + * EC_DR subclass definitions - supporting attributes (name/value pairs) + * are found in sys/sysevent/dr.h + */ + +/* Attachment point state change */ +#define ESC_DR_AP_STATE_CHANGE "ESC_dr_ap_state_change" +#define ESC_DR_REQ "ESC_dr_req" /* Request DR */ +#define ESC_DR_TARGET_STATE_CHANGE "ESC_dr_target_state_change" + +/* + * EC_ENV subclass definitions - supporting attributes (name/value pairs) + * are found in sys/sysevent/env.h + */ +#define ESC_ENV_TEMP "ESC_env_temp" /* Temperature change event subclass */ +#define ESC_ENV_FAN "ESC_env_fan" /* Fan status change event subclass */ +#define ESC_ENV_POWER "ESC_env_power" /* Power supply change event subclass */ +#define ESC_ENV_LED "ESC_env_led" /* LED change event subclass */ + +/* + * EC_DOMAIN subclass definitions - supporting attributes (name/value pairs) + * are found in sys/sysevent/domain.h + */ + +/* Domain state change */ +#define ESC_DOMAIN_STATE_CHANGE "ESC_domain_state_change" +/* Domain loghost name change */ +#define ESC_DOMAIN_LOGHOST_CHANGE "ESC_domain_loghost_change" + +/* + * EC_AP_DRIVER subclass definitions - supporting attributes (name/value pairs) + * are found in sys/sysevent/ap_driver.h + */ + +/* Alternate Pathing path switch */ +#define ESC_AP_DRIVER_PATHSWITCH "ESC_ap_driver_pathswitch" +/* Alternate Pathing database commit */ +#define ESC_AP_DRIVER_COMMIT "ESC_ap_driver_commit" +/* Alternate Pathing physical path status change */ +#define ESC_AP_DRIVER_PHYS_PATH_STATUS_CHANGE \ + "ESC_ap_driver_phys_path_status_change" + +/* + * EC_IPMP subclass definitions - supporting attributes (name/value pairs) + * are found in sys/sysevent/ipmp.h + */ + +/* IPMP group has changed state */ +#define ESC_IPMP_GROUP_STATE "ESC_ipmp_group_state" + +/* IPMP group has been created or removed */ +#define ESC_IPMP_GROUP_CHANGE "ESC_ipmp_group_change" + +/* IPMP group has had an interface added or removed */ +#define ESC_IPMP_GROUP_MEMBER_CHANGE "ESC_ipmp_group_member_change" + +/* Interface within an IPMP group has changed state or type */ +#define ESC_IPMP_IF_CHANGE "ESC_ipmp_if_change" + +/* IPMP probe has changed state */ +#define ESC_IPMP_PROBE_STATE "ESC_ipmp_probe_state" + +/* + * EC_DEV_ADD and EC_DEV_REMOVE subclass definitions - supporting attributes + * (name/value pairs) are found in sys/sysevent/dev.h + */ +#define ESC_DISK "disk" /* disk device */ +#define ESC_NETWORK "network" /* network interface */ +#define ESC_PRINTER "printer" /* printer device */ +#define ESC_LOFI "lofi" /* lofi device */ + +/* + * EC_DEV_BRANCH subclass definitions - supporting attributes (name/value pairs) + * are found in sys/sysevent/dev.h + */ + +/* device tree branch added */ +#define ESC_DEV_BRANCH_ADD "ESC_dev_branch_add" + +/* device tree branch removed */ +#define ESC_DEV_BRANCH_REMOVE "ESC_dev_branch_remove" + +/* + * EC_DEV_STATUS subclass definitions + * + * device capacity dynamically changed + */ +#define ESC_DEV_DLE "ESC_dev_dle" + +/* LUN has received an eject request from the user */ +#define ESC_DEV_EJECT_REQUEST "ESC_dev_eject_request" + +/* FMA Fault and Error event protocol subclass */ +#define ESC_FM_ERROR "ESC_FM_error" +#define ESC_FM_ERROR_REPLAY "ESC_FM_error_replay" + +/* Service processor subclass definitions */ +#define ESC_PLATFORM_SP_RESET "ESC_platform_sp_reset" + +/* + * EC_PWRCTL subclass definitions + */ +#define EC_PWRCTL "EC_pwrctl" +#define ESC_PWRCTL_ADD "ESC_pwrctl_add" +#define ESC_PWRCTL_REMOVE "ESC_pwrctl_remove" +#define ESC_PWRCTL_WARN 
"ESC_pwrctl_warn" +#define ESC_PWRCTL_LOW "ESC_pwrctl_low" +#define ESC_PWRCTL_STATE_CHANGE "ESC_pwrctl_state_change" +#define ESC_PWRCTL_POWER_BUTTON "ESC_pwrctl_power_button" +#define ESC_PWRCTL_BRIGHTNESS_UP "ESC_pwrctl_brightness_up" +#define ESC_PWRCTL_BRIGHTNESS_DOWN "ESC_pwrctl_brightness_down" + +/* EC_ACPIEV subclass definitions */ +#define EC_ACPIEV "EC_acpiev" +#define ESC_ACPIEV_DISPLAY_SWITCH "ESC_acpiev_display_switch" +#define ESC_ACPIEV_SCREEN_LOCK "ESC_acpiev_screen_lock" +#define ESC_ACPIEV_SLEEP "ESC_acpiev_sleep" +#define ESC_ACPIEV_AUDIO_MUTE "ESC_acpiev_audio_mute" +#define ESC_ACPIEV_WIFI "ESC_acpiev_wifi" +#define ESC_ACPIEV_TOUCHPAD "ESC_acpiev_touchpad" + +/* + * ZFS subclass definitions. supporting attributes (name/value paris) are found + * in sys/fs/zfs.h + */ +#define ESC_ZFS_RESILVER_START "ESC_ZFS_resilver_start" +#define ESC_ZFS_RESILVER_FINISH "ESC_ZFS_resilver_finish" +#define ESC_ZFS_VDEV_REMOVE "ESC_ZFS_vdev_remove" +#define ESC_ZFS_VDEV_REMOVE_AUX "ESC_ZFS_vdev_remove_aux" +#define ESC_ZFS_VDEV_REMOVE_DEV "ESC_ZFS_vdev_remove_dev" +#define ESC_ZFS_POOL_CREATE "ESC_ZFS_pool_create" +#define ESC_ZFS_POOL_DESTROY "ESC_ZFS_pool_destroy" +#define ESC_ZFS_POOL_IMPORT "ESC_ZFS_pool_import" +#define ESC_ZFS_VDEV_ADD "ESC_ZFS_vdev_add" +#define ESC_ZFS_VDEV_ATTACH "ESC_ZFS_vdev_attach" +#define ESC_ZFS_VDEV_CLEAR "ESC_ZFS_vdev_clear" +#define ESC_ZFS_VDEV_CHECK "ESC_ZFS_vdev_check" +#define ESC_ZFS_VDEV_ONLINE "ESC_ZFS_vdev_online" +#define ESC_ZFS_CONFIG_SYNC "ESC_ZFS_config_sync" +#define ESC_ZFS_SCRUB_START "ESC_ZFS_scrub_start" +#define ESC_ZFS_SCRUB_FINISH "ESC_ZFS_scrub_finish" +#define ESC_ZFS_SCRUB_ABORT "ESC_ZFS_scrub_abort" +#define ESC_ZFS_SCRUB_RESUME "ESC_ZFS_scrub_resume" +#define ESC_ZFS_SCRUB_PAUSED "ESC_ZFS_scrub_paused" +#define ESC_ZFS_VDEV_SPARE "ESC_ZFS_vdev_spare" +#define ESC_ZFS_BOOTFS_VDEV_ATTACH "ESC_ZFS_bootfs_vdev_attach" +#define ESC_ZFS_POOL_REGUID "ESC_ZFS_pool_reguid" +#define ESC_ZFS_HISTORY_EVENT "ESC_ZFS_history_event" +#define ESC_ZFS_VDEV_AUTOEXPAND "ESC_ZFS_vdev_autoexpand" + +/* + * datalink subclass definitions. + */ +#define ESC_DATALINK_PHYS_ADD "ESC_datalink_phys_add" /* new physical link */ + +/* + * VRRP subclass definitions. Supporting attributes (name/value paris) are + * found in sys/sysevent/vrrp.h + */ +#define ESC_VRRP_STATE_CHANGE "ESC_vrrp_state_change" + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SYSEVENT_EVENTDEFS_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/sysmacros.h b/sys/cddl/contrib/opensolaris/uts/common/sys/sysmacros.h new file mode 100644 index 000000000000..ee6e55214ec7 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/sys/sysmacros.h @@ -0,0 +1,459 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_SYSMACROS_H +#define _SYS_SYSMACROS_H + +#include <sys/param.h> +#include <sys/isa_defs.h> +#if defined(__FreeBSD__) && defined(_KERNEL) +#include <sys/libkern.h> +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Some macros for units conversion + */ +/* + * Disk blocks (sectors) and bytes. + */ +#define dtob(DD) ((DD) << DEV_BSHIFT) +#define btod(BB) (((BB) + DEV_BSIZE - 1) >> DEV_BSHIFT) +#define btodt(BB) ((BB) >> DEV_BSHIFT) +#define lbtod(BB) (((offset_t)(BB) + DEV_BSIZE - 1) >> DEV_BSHIFT) + +/* common macros */ +#ifndef MIN +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#endif +#ifndef MAX +#define MAX(a, b) ((a) < (b) ? (b) : (a)) +#endif +#ifndef ABS +#define ABS(a) ((a) < 0 ? -(a) : (a)) +#endif +#ifndef SIGNOF +#define SIGNOF(a) ((a) < 0 ? -1 : (a) > 0) +#endif + +#ifdef _KERNEL + +/* + * Convert a single byte to/from binary-coded decimal (BCD). + */ +extern unsigned char byte_to_bcd[256]; +extern unsigned char bcd_to_byte[256]; + +#define BYTE_TO_BCD(x) byte_to_bcd[(x) & 0xff] +#define BCD_TO_BYTE(x) bcd_to_byte[(x) & 0xff] + +#endif /* _KERNEL */ + +/* + * WARNING: The device number macros defined here should not be used by device + * drivers or user software. Device drivers should use the device functions + * defined in the DDI/DKI interface (see also ddi.h). Application software + * should make use of the library routines available in makedev(3). A set of + * new device macros is provided to operate on the expanded device number + * format supported in SVR4. Macro versions of the DDI device functions are + * provided for use by kernel proper routines only. Macro routines bmajor(), + * major(), minor(), emajor(), eminor(), and makedev() will be removed or + * their definitions changed at the next major release following SVR4. + */ + +#define O_BITSMAJOR 7 /* # of SVR3 major device bits */ +#define O_BITSMINOR 8 /* # of SVR3 minor device bits */ +#define O_MAXMAJ 0x7f /* SVR3 max major value */ +#define O_MAXMIN 0xff /* SVR3 max minor value */ + + +#define L_BITSMAJOR32 14 /* # of SVR4 major device bits */ +#define L_BITSMINOR32 18 /* # of SVR4 minor device bits */ +#define L_MAXMAJ32 0x3fff /* SVR4 max major value */ +#define L_MAXMIN32 0x3ffff /* MAX minor for 3b2 software drivers.
*/ + /* For 3b2 hardware devices the minor is */ + /* restricted to 256 (0-255) */ + +#ifdef _LP64 +#define L_BITSMAJOR 32 /* # of major device bits in 64-bit Solaris */ +#define L_BITSMINOR 32 /* # of minor device bits in 64-bit Solaris */ +#define L_MAXMAJ 0xfffffffful /* max major value */ +#define L_MAXMIN 0xfffffffful /* max minor value */ +#else +#define L_BITSMAJOR L_BITSMAJOR32 +#define L_BITSMINOR L_BITSMINOR32 +#define L_MAXMAJ L_MAXMAJ32 +#define L_MAXMIN L_MAXMIN32 +#endif + +#ifdef illumos +#ifdef _KERNEL + +/* major part of a device internal to the kernel */ + +#define major(x) (major_t)((((unsigned)(x)) >> O_BITSMINOR) & O_MAXMAJ) +#define bmajor(x) (major_t)((((unsigned)(x)) >> O_BITSMINOR) & O_MAXMAJ) + +/* get internal major part of expanded device number */ + +#define getmajor(x) (major_t)((((dev_t)(x)) >> L_BITSMINOR) & L_MAXMAJ) + +/* minor part of a device internal to the kernel */ + +#define minor(x) (minor_t)((x) & O_MAXMIN) + +/* get internal minor part of expanded device number */ + +#define getminor(x) (minor_t)((x) & L_MAXMIN) + +#else + +/* major part of a device external from the kernel (same as emajor below) */ + +#define major(x) (major_t)((((unsigned)(x)) >> O_BITSMINOR) & O_MAXMAJ) + +/* minor part of a device external from the kernel (same as eminor below) */ + +#define minor(x) (minor_t)((x) & O_MAXMIN) + +#endif /* _KERNEL */ + +/* create old device number */ + +#define makedev(x, y) (unsigned short)(((x) << O_BITSMINOR) | ((y) & O_MAXMIN)) + +/* make a new device number */ + +#define makedevice(x, y) (dev_t)(((dev_t)(x) << L_BITSMINOR) | ((y) & L_MAXMIN)) + + +/* + * emajor() allows kernel/driver code to print external major numbers + * eminor() allows kernel/driver code to print external minor numbers + */ + +#define emajor(x) \ + (major_t)(((unsigned int)(x) >> O_BITSMINOR) > O_MAXMAJ) ? \ + NODEV : (((unsigned int)(x) >> O_BITSMINOR) & O_MAXMAJ) + +#define eminor(x) \ + (minor_t)((x) & O_MAXMIN) + +/* + * get external major and minor device + * components from expanded device number + */ +#define getemajor(x) (major_t)((((dev_t)(x) >> L_BITSMINOR) > L_MAXMAJ) ? \ + NODEV : (((dev_t)(x) >> L_BITSMINOR) & L_MAXMAJ)) +#define geteminor(x) (minor_t)((x) & L_MAXMIN) +#endif /* illumos */ + +/* + * These are versions of the kernel routines for compressing and + * expanding long device numbers that don't return errors. + */ +#if (L_BITSMAJOR32 == L_BITSMAJOR) && (L_BITSMINOR32 == L_BITSMINOR) + +#define DEVCMPL(x) (x) +#define DEVEXPL(x) (x) + +#else + +#define DEVCMPL(x) \ + (dev32_t)((((x) >> L_BITSMINOR) > L_MAXMAJ32 || \ + ((x) & L_MAXMIN) > L_MAXMIN32) ? NODEV32 : \ + ((((x) >> L_BITSMINOR) << L_BITSMINOR32) | ((x) & L_MAXMIN32))) + +#define DEVEXPL(x) \ + (((x) == NODEV32) ? NODEV : \ + makedevice(((x) >> L_BITSMINOR32) & L_MAXMAJ32, (x) & L_MAXMIN32)) + +#endif /* L_BITSMAJOR32 ... */ + +/* convert to old (SVR3.2) dev format */ + +#define cmpdev(x) \ + (o_dev_t)((((x) >> L_BITSMINOR) > O_MAXMAJ || \ + ((x) & L_MAXMIN) > O_MAXMIN) ? NODEV : \ + ((((x) >> L_BITSMINOR) << O_BITSMINOR) | ((x) & O_MAXMIN))) + +/* convert to new (SVR4) dev format */ + +#define expdev(x) \ + (dev_t)(((dev_t)(((x) >> O_BITSMINOR) & O_MAXMAJ) << L_BITSMINOR) | \ + ((x) & O_MAXMIN)) + +/* + * Macro for checking power of 2 address alignment. + */ +#define IS_P2ALIGNED(v, a) ((((uintptr_t)(v)) & ((uintptr_t)(a) - 1)) == 0) + +/* + * Macros for counting and rounding.
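A round-trip sketch of the expanded-format macros above, assuming the illumos-only kernel definitions are visible; with the 32-bit layout this packs major 3, minor 5 as (3 << 18) | 5.

#include <sys/types.h>
#include <sys/sysmacros.h>

static int
devno_roundtrip(major_t maj, minor_t min)
{
	dev_t dev = makedevice(maj, min);

	/* both getters mask the packed fields back out */
	return (getmajor(dev) == maj && getminor(dev) == min);
}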
+ */ +#define howmany(x, y) (((x)+((y)-1))/(y)) +#define roundup(x, y) ((((x)+((y)-1))/(y))*(y)) + +/* + * Macro to determine if value is a power of 2 + */ +#define ISP2(x) (((x) & ((x) - 1)) == 0) + +/* + * Macros for various sorts of alignment and rounding. The "align" must + * be a power of 2. Often times it is a block, sector, or page. + */ + +/* + * return x rounded down to an align boundary + * eg, P2ALIGN(1200, 1024) == 1024 (1*align) + * eg, P2ALIGN(1024, 1024) == 1024 (1*align) + * eg, P2ALIGN(0x1234, 0x100) == 0x1200 (0x12*align) + * eg, P2ALIGN(0x5600, 0x100) == 0x5600 (0x56*align) + */ +#define P2ALIGN(x, align) ((x) & -(align)) + +/* + * return x % (mod) align + * eg, P2PHASE(0x1234, 0x100) == 0x34 (x-0x12*align) + * eg, P2PHASE(0x5600, 0x100) == 0x00 (x-0x56*align) + */ +#define P2PHASE(x, align) ((x) & ((align) - 1)) + +/* + * return how much space is left in this block (but if it's perfectly + * aligned, return 0). + * eg, P2NPHASE(0x1234, 0x100) == 0xcc (0x13*align-x) + * eg, P2NPHASE(0x5600, 0x100) == 0x00 (0x56*align-x) + */ +#define P2NPHASE(x, align) (-(x) & ((align) - 1)) + +/* + * return x rounded up to an align boundary + * eg, P2ROUNDUP(0x1234, 0x100) == 0x1300 (0x13*align) + * eg, P2ROUNDUP(0x5600, 0x100) == 0x5600 (0x56*align) + */ +#define P2ROUNDUP(x, align) (-(-(x) & -(align))) + +/* + * return the ending address of the block that x is in + * eg, P2END(0x1234, 0x100) == 0x12ff (0x13*align - 1) + * eg, P2END(0x5600, 0x100) == 0x56ff (0x57*align - 1) + */ +#define P2END(x, align) (-(~(x) & -(align))) + +/* + * return x rounded up to the next phase (offset) within align. + * phase should be < align. + * eg, P2PHASEUP(0x1234, 0x100, 0x10) == 0x1310 (0x13*align + phase) + * eg, P2PHASEUP(0x5600, 0x100, 0x10) == 0x5610 (0x56*align + phase) + */ +#define P2PHASEUP(x, align, phase) ((phase) - (((phase) - (x)) & -(align))) + +/* + * return TRUE if adding len to off would cause it to cross an align + * boundary. + * eg, P2BOUNDARY(0x1234, 0xe0, 0x100) == TRUE (0x1234 + 0xe0 == 0x1314) + * eg, P2BOUNDARY(0x1234, 0x50, 0x100) == FALSE (0x1234 + 0x50 == 0x1284) + */ +#define P2BOUNDARY(off, len, align) \ + (((off) ^ ((off) + (len) - 1)) > (align) - 1) + +/* + * Return TRUE if they have the same highest bit set. + * eg, P2SAMEHIGHBIT(0x1234, 0x1001) == TRUE (the high bit is 0x1000) + * eg, P2SAMEHIGHBIT(0x1234, 0x3010) == FALSE (high bit of 0x3010 is 0x2000) + */ +#define P2SAMEHIGHBIT(x, y) (((x) ^ (y)) < ((x) & (y))) + +/* + * Typed version of the P2* macros. These macros should be used to ensure + * that the result is correctly calculated based on the data type of (x), + * which is passed in as the last argument, regardless of the data + * type of the alignment. 
For example, if (x) is of type uint64_t, + * and we want to round it up to a page boundary using "PAGESIZE" as + * the alignment, we can do either + * P2ROUNDUP(x, (uint64_t)PAGESIZE) + * or + * P2ROUNDUP_TYPED(x, PAGESIZE, uint64_t) + */ +#define P2ALIGN_TYPED(x, align, type) \ + ((type)(x) & -(type)(align)) +#define P2PHASE_TYPED(x, align, type) \ + ((type)(x) & ((type)(align) - 1)) +#define P2NPHASE_TYPED(x, align, type) \ + (-(type)(x) & ((type)(align) - 1)) +#define P2ROUNDUP_TYPED(x, align, type) \ + (-(-(type)(x) & -(type)(align))) +#define P2END_TYPED(x, align, type) \ + (-(~(type)(x) & -(type)(align))) +#define P2PHASEUP_TYPED(x, align, phase, type) \ + ((type)(phase) - (((type)(phase) - (type)(x)) & -(type)(align))) +#define P2CROSS_TYPED(x, y, align, type) \ + (((type)(x) ^ (type)(y)) > (type)(align) - 1) +#define P2SAMEHIGHBIT_TYPED(x, y, type) \ + (((type)(x) ^ (type)(y)) < ((type)(x) & (type)(y))) + +/* + * Macros to atomically increment/decrement a variable. mutex and var + * must be pointers. + */ +#define INCR_COUNT(var, mutex) mutex_enter(mutex), (*(var))++, mutex_exit(mutex) +#define DECR_COUNT(var, mutex) mutex_enter(mutex), (*(var))--, mutex_exit(mutex) + +/* + * Macros to declare bitfields - the order in the parameter list is + * Low to High - that is, declare bit 0 first. We only support 8-bit bitfields + * because if a field crosses a byte boundary it's not likely to be meaningful + * without reassembly in its nonnative endianness. + */ +#if defined(_BIT_FIELDS_LTOH) +#define DECL_BITFIELD2(_a, _b) \ + uint8_t _a, _b +#define DECL_BITFIELD3(_a, _b, _c) \ + uint8_t _a, _b, _c +#define DECL_BITFIELD4(_a, _b, _c, _d) \ + uint8_t _a, _b, _c, _d +#define DECL_BITFIELD5(_a, _b, _c, _d, _e) \ + uint8_t _a, _b, _c, _d, _e +#define DECL_BITFIELD6(_a, _b, _c, _d, _e, _f) \ + uint8_t _a, _b, _c, _d, _e, _f +#define DECL_BITFIELD7(_a, _b, _c, _d, _e, _f, _g) \ + uint8_t _a, _b, _c, _d, _e, _f, _g +#define DECL_BITFIELD8(_a, _b, _c, _d, _e, _f, _g, _h) \ + uint8_t _a, _b, _c, _d, _e, _f, _g, _h +#elif defined(_BIT_FIELDS_HTOL) +#define DECL_BITFIELD2(_a, _b) \ + uint8_t _b, _a +#define DECL_BITFIELD3(_a, _b, _c) \ + uint8_t _c, _b, _a +#define DECL_BITFIELD4(_a, _b, _c, _d) \ + uint8_t _d, _c, _b, _a +#define DECL_BITFIELD5(_a, _b, _c, _d, _e) \ + uint8_t _e, _d, _c, _b, _a +#define DECL_BITFIELD6(_a, _b, _c, _d, _e, _f) \ + uint8_t _f, _e, _d, _c, _b, _a +#define DECL_BITFIELD7(_a, _b, _c, _d, _e, _f, _g) \ + uint8_t _g, _f, _e, _d, _c, _b, _a +#define DECL_BITFIELD8(_a, _b, _c, _d, _e, _f, _g, _h) \ + uint8_t _h, _g, _f, _e, _d, _c, _b, _a +#else +#error One of _BIT_FIELDS_LTOH or _BIT_FIELDS_HTOL must be defined +#endif /* _BIT_FIELDS_LTOH */ + +#if defined(_KERNEL) && !defined(_KMEMUSER) && !defined(offsetof) + +/* avoid any possibility of clashing with <stddef.h> version */ + +#define offsetof(s, m) ((size_t)(&(((s *)0)->m))) +#endif + +/* + * Find highest one bit set. + * Returns bit number + 1 of highest bit that is set, otherwise returns 0. + * High order bit is 31 (or 63 in _LP64 kernel). 
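+ * For illustration: highbit(0) == 0, highbit(1) == 1, and
+ * highbit(0x10) == 5, so for nonzero i, highbit(i) - 1 yields
+ * floor(log2(i)).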
+ */ +static __inline int +highbit(ulong_t i) +{ +#if defined(__FreeBSD__) && defined(_KERNEL) && defined(HAVE_INLINE_FLSL) + return (flsl(i)); +#else + int h = 1; + + if (i == 0) + return (0); +#ifdef _LP64 + if (i & 0xffffffff00000000ul) { + h += 32; i >>= 32; + } +#endif + if (i & 0xffff0000) { + h += 16; i >>= 16; + } + if (i & 0xff00) { + h += 8; i >>= 8; + } + if (i & 0xf0) { + h += 4; i >>= 4; + } + if (i & 0xc) { + h += 2; i >>= 2; + } + if (i & 0x2) { + h += 1; + } + return (h); +#endif +} + +/* + * Find highest one bit set. + * Returns bit number + 1 of highest bit that is set, otherwise returns 0. + */ +static __inline int +highbit64(uint64_t i) +{ +#if defined(__FreeBSD__) && defined(_KERNEL) && defined(HAVE_INLINE_FLSLL) + return (flsll(i)); +#else + int h = 1; + + if (i == 0) + return (0); + if (i & 0xffffffff00000000ULL) { + h += 32; i >>= 32; + } + if (i & 0xffff0000) { + h += 16; i >>= 16; + } + if (i & 0xff00) { + h += 8; i >>= 8; + } + if (i & 0xf0) { + h += 4; i >>= 4; + } + if (i & 0xc) { + h += 2; i >>= 2; + } + if (i & 0x2) { + h += 1; + } + return (h); +#endif +} + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SYSMACROS_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/taskq.h b/sys/cddl/contrib/opensolaris/uts/common/sys/taskq.h new file mode 100644 index 000000000000..beff02c5a9cf --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/sys/taskq.h @@ -0,0 +1,108 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_TASKQ_H +#define _SYS_TASKQ_H + +#include <sys/types.h> +#include <sys/proc.h> +#include <sys/taskqueue.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define TASKQ_NAMELEN 31 + +struct taskqueue; +struct taskq { + struct taskqueue *tq_queue; +}; + +typedef struct taskq taskq_t; +typedef uintptr_t taskqid_t; +typedef void (task_func_t)(void *); + +typedef struct taskq_ent { + struct task tqent_task; + task_func_t *tqent_func; + void *tqent_arg; +} taskq_ent_t; + +struct proc; + +/* + * Public flags for taskq_create(): bit range 0-15 + */ +#define TASKQ_PREPOPULATE 0x0001 /* Prepopulate with threads and data */ +#define TASKQ_CPR_SAFE 0x0002 /* Use CPR safe protocol */ +#define TASKQ_DYNAMIC 0x0004 /* Use dynamic thread scheduling */ +#define TASKQ_THREADS_CPU_PCT 0x0008 /* number of threads as % of ncpu */ +#define TASKQ_DC_BATCH 0x0010 /* Taskq uses SDC in batch mode */ + +/* + * Flags for taskq_dispatch. TQ_SLEEP/TQ_NOSLEEP should be same as + * KM_SLEEP/KM_NOSLEEP. 
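+ *
+ * A minimal dispatch sketch (the callback my_func and its argument
+ * arg are hypothetical, not part of this header):
+ *
+ *	taskqid_t id;
+ *
+ *	id = taskq_dispatch(tq, my_func, arg, TQ_NOSLEEP);
+ *	if (id == TASKQID_INVALID)
+ *		... out of memory; with TQ_SLEEP the call would have
+ *		blocked for memory instead of failing ...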
+ */ +#define TQ_SLEEP 0x00 /* Can block for memory */ +#define TQ_NOSLEEP 0x01 /* cannot block for memory; may fail */ +#define TQ_NOQUEUE 0x02 /* Do not enqueue if can't dispatch */ +#define TQ_NOALLOC 0x04 /* cannot allocate memory; may fail */ +#define TQ_FRONT 0x08 /* Put task at the front of the queue */ + +#define TASKQID_INVALID ((taskqid_t)0) + +#ifdef _KERNEL + +extern taskq_t *system_taskq; + +void taskq_init(void); +void taskq_mp_init(void); + +taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t); +taskq_t *taskq_create_instance(const char *, int, int, pri_t, int, int, uint_t); +taskq_t *taskq_create_proc(const char *, int, pri_t, int, int, + struct proc *, uint_t); +taskq_t *taskq_create_sysdc(const char *, int, int, int, + struct proc *, uint_t, uint_t); +taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t); +void taskq_dispatch_ent(taskq_t *, task_func_t, void *, uint_t, + taskq_ent_t *); +void nulltask(void *); +void taskq_destroy(taskq_t *); +void taskq_wait(taskq_t *); +void taskq_wait_id(taskq_t *, taskqid_t); +void taskq_suspend(taskq_t *); +int taskq_suspended(taskq_t *); +void taskq_resume(taskq_t *); +int taskq_member(taskq_t *, kthread_t *); + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_TASKQ_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/u8_textprep.h b/sys/cddl/contrib/opensolaris/uts/common/sys/u8_textprep.h new file mode 100644 index 000000000000..1496fa356835 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/sys/u8_textprep.h @@ -0,0 +1,115 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_U8_TEXTPREP_H +#define _SYS_U8_TEXTPREP_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/isa_defs.h> +#include <sys/types.h> +#include <sys/errno.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef illumos +/* + * Unicode encoding conversion functions and their macros. 
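+ *
+ * Each converter takes the source buffer, a pointer to the source
+ * length, the target buffer, a pointer to the target buffer size
+ * (lengths counted in units of the respective element types), and a
+ * flag word, and returns 0 on success or an error number. A rough
+ * usage sketch (the buffer and length names are hypothetical):
+ *
+ *	size_t u8len = ..., u16len = ...;
+ *
+ *	error = uconv_u8tou16(u8buf, &u8len, u16buf, &u16len,
+ *	    UCONV_OUT_SYSTEM_ENDIAN);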
+ */ +#define UCONV_IN_BIG_ENDIAN 0x0001 +#define UCONV_OUT_BIG_ENDIAN 0x0002 +#define UCONV_IN_SYSTEM_ENDIAN 0x0004 +#define UCONV_OUT_SYSTEM_ENDIAN 0x0008 +#define UCONV_IN_LITTLE_ENDIAN 0x0010 +#define UCONV_OUT_LITTLE_ENDIAN 0x0020 +#define UCONV_IGNORE_NULL 0x0040 +#define UCONV_IN_ACCEPT_BOM 0x0080 +#define UCONV_OUT_EMIT_BOM 0x0100 + +extern int uconv_u16tou32(const uint16_t *, size_t *, uint32_t *, size_t *, + int); +extern int uconv_u16tou8(const uint16_t *, size_t *, uchar_t *, size_t *, int); +extern int uconv_u32tou16(const uint32_t *, size_t *, uint16_t *, size_t *, + int); +extern int uconv_u32tou8(const uint32_t *, size_t *, uchar_t *, size_t *, int); +extern int uconv_u8tou16(const uchar_t *, size_t *, uint16_t *, size_t *, int); +extern int uconv_u8tou32(const uchar_t *, size_t *, uint32_t *, size_t *, int); +#endif /* illumos */ + +/* + * UTF-8 text preparation functions and their macros. + * + * Among the macros defined, U8_CANON_DECOMP, U8_COMPAT_DECOMP, and + * U8_CANON_COMP are not public interfaces and must not be used directly + * at the flag input argument. + */ +#define U8_STRCMP_CS (0x00000001) +#define U8_STRCMP_CI_UPPER (0x00000002) +#define U8_STRCMP_CI_LOWER (0x00000004) + +#define U8_CANON_DECOMP (0x00000010) +#define U8_COMPAT_DECOMP (0x00000020) +#define U8_CANON_COMP (0x00000040) + +#define U8_STRCMP_NFD (U8_CANON_DECOMP) +#define U8_STRCMP_NFC (U8_CANON_DECOMP | U8_CANON_COMP) +#define U8_STRCMP_NFKD (U8_COMPAT_DECOMP) +#define U8_STRCMP_NFKC (U8_COMPAT_DECOMP | U8_CANON_COMP) + +#define U8_TEXTPREP_TOUPPER (U8_STRCMP_CI_UPPER) +#define U8_TEXTPREP_TOLOWER (U8_STRCMP_CI_LOWER) + +#define U8_TEXTPREP_NFD (U8_STRCMP_NFD) +#define U8_TEXTPREP_NFC (U8_STRCMP_NFC) +#define U8_TEXTPREP_NFKD (U8_STRCMP_NFKD) +#define U8_TEXTPREP_NFKC (U8_STRCMP_NFKC) + +#define U8_TEXTPREP_IGNORE_NULL (0x00010000) +#define U8_TEXTPREP_IGNORE_INVALID (0x00020000) +#define U8_TEXTPREP_NOWAIT (0x00040000) + +#define U8_UNICODE_320 (0) +#define U8_UNICODE_500 (1) +#define U8_UNICODE_LATEST (U8_UNICODE_500) + +#define U8_VALIDATE_ENTIRE (0x00100000) +#define U8_VALIDATE_CHECK_ADDITIONAL (0x00200000) +#define U8_VALIDATE_UCS2_RANGE (0x00400000) + +#define U8_ILLEGAL_CHAR (-1) +#define U8_OUT_OF_RANGE_CHAR (-2) + +extern int u8_validate(char *, size_t, char **, int, int *); +extern int u8_strcmp(const char *, const char *, size_t, int, size_t, int *); +extern size_t u8_textprep_str(char *, size_t *, char *, size_t *, int, size_t, + int *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_U8_TEXTPREP_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/u8_textprep_data.h b/sys/cddl/contrib/opensolaris/uts/common/sys/u8_textprep_data.h new file mode 100644 index 000000000000..de6866096160 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/sys/u8_textprep_data.h @@ -0,0 +1,35376 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * COPYRIGHT AND PERMISSION NOTICE + * + * Copyright (c) 1991-2006 Unicode, Inc. All rights reserved. Distributed under + * the Terms of Use in http://www.unicode.org/copyright.html. + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of the Unicode data files and any associated documentation (the + * "Data Files") or Unicode software and any associated documentation (the + * "Software") to deal in the Data Files or Software without restriction, + * including without limitation the rights to use, copy, modify, merge, + * publish, distribute, and/or sell copies of the Data Files or Software, and + * to permit persons to whom the Data Files or Software are furnished to do so, + * provided that (a) the above copyright notice(s) and this permission notice + * appear with all copies of the Data Files or Software, (b) both the above + * copyright notice(s) and this permission notice appear in associated + * documentation, and (c) there is clear notice in each modified Data File or + * in the Software as well as in the documentation associated with the Data + * File(s) or Software that the data or software has been modified. + * + * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY + * KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF + * THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS + * INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR + * CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, + * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER + * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THE DATA FILES OR SOFTWARE. + * + * Except as contained in this notice, the name of a copyright holder shall not + * be used in advertising or otherwise to promote the sale, use or other + * dealings in these Data Files or Software without prior written authorization + * of the copyright holder. + * + * Unicode and the Unicode logo are trademarks of Unicode, Inc., and may be + * registered in some jurisdictions. All other trademarks and registered + * trademarks mentioned herein are the property of their respective owners. + */ +/* + * This file has been modified by Sun Microsystems, Inc. + */ + +#ifndef _SYS_U8_TEXTPREP_DATA_H +#define _SYS_U8_TEXTPREP_DATA_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * To get to the combining class data, composition mappings, decomposition + * mappings, and case conversion mappings of Unicode, the data structures + * formulated and their meanings are like the following: + * + * Each UTF-8 character is seen as a 4-byte entity so that U+0061 (or 0x61 in + * UTF-8) would be seen as 0x00 0x00 0x00 0x61. Similarly, U+1D15E would be + * 0xF0 0x9D 0x85 0x9E in UTF-8. + * + * The first byte (MSB) value is an index to the b1_tbl, such as + * u8_common_b1_tbl and u8_composition_b1_tbl tables. A b1_tbl has + * indices to b2_tbl tables that have indices to b3_tbl. 
Each b3_tbl has + * either indices to b4_tbl or indices to b4_tbl and base values for + * displacement calculations later by using the u8_displacement_t type at + * below. Each b4_tbl table then has indices to the final tables. + * + * As an example, if we have a character with code value of U+1D15E which is + * 0xF0 0x9D 0x85 0x9E in UTF-8, the target decomposition character bytes + * that will be mapped by the mapping procedure would be the ones between + * the start_index and the end_index computed as like the following: + * + * b2_tbl_id = u8_common_b1_tbl[0][0xF0]; + * b3_tbl_id = u8_decomp_b2_tbl[0][b2_tbl_id][0x9D]; + * b4_tbl_id = u8_decomp_b3_tbl[0][b3_tbl_id][0x85].tbl_id; + * b4_base = u8_decomp_b3_tbl[0][b3_tbl_id][0x85].base; + * if (b4_tbl_id >= 0x8000) { + * b4_tbl_id -= 0x8000; + * start_index = u8_decomp_b4_16bit_tbl[0][b4_tbl_id][0x9E]; + * end_index = u8_decomp_b4_16bit_tbl[0][b4_tbl_id][0x9E + 1]; + * } else { + * start_index = u8_decomp_b4_tbl[0][b4_tbl_id][0x9E]; + * end_index = u8_decomp_b4_tbl[0][b4_tbl_id][0x9E + 1]; + * } + * + * The start_index and the end_index can be used to retrieve the bytes + * possibly of multiple UTF-8 characters from the final tables. + * + * The "[0]" at the above indicates this is for Unicode Version 3.2.0 data + * as of today. Consequently, the "[1]" indicates another Unicode version + * data and it is Unicode 5.0.0 as of today. + * + * The mapping procedures and the data structures are more or less similar or + * alike among different mappings. You might want to read the u8_textprep.c + * for specific details. + * + * The tool programs created and used to generate the tables in this file are + * saved at PSARC/2007/149/materials/ as tools.tar.gz file. + */ + +/* The following is a component type for the b4_tbl vectors. */ +typedef struct { + uint16_t tbl_id; + uint16_t base; +} u8_displacement_t; + +/* + * The U8_TBL_ELEMENT_NOT_DEF macro indicates a byte that is not defined or + * used. The U8_TBL_ELEMENT_FILLER indicates the end of a UTF-8 character at + * the final tables. + */ +#define U8_TBL_ELEMENT_NOT_DEF (0xff) +#define N_ U8_TBL_ELEMENT_NOT_DEF + +#define U8_TBL_ELEMENT_FILLER (0xf7) +#define FIL_ U8_TBL_ELEMENT_FILLER + +/* + * The common b1_tbl for combining class, decompositions, tolower, and + * toupper case conversion mappings. 
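+ *
+ * Putting the levels together for the combining class data, the class
+ * of a character whose UTF-8 bytes are b1 b2 b3 b4, under the Unicode
+ * version index uv, would be fetched roughly as follows (a sketch
+ * following the procedure above; the local names are hypothetical):
+ *
+ *	b2_id = u8_common_b1_tbl[uv][b1];
+ *	b3_id = u8_combining_class_b2_tbl[uv][b2_id][b2];
+ *	b4_id = u8_combining_class_b3_tbl[uv][b3_id][b3];
+ *	ccc = u8_combining_class_b4_tbl[uv][b4_id][b4];
+ *
+ * with an N_ (0xff) at any level meaning the sequence has no entry,
+ * ie, combining class 0.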
+ */ +static const uchar_t u8_common_b1_tbl[2][256] = { + { + 0, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + 1, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + }, + { + 0, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + 1, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + }, +}; + +static const uchar_t u8_combining_class_b2_tbl[2][2][256] = { + { + { + 0, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + 1, 2, 3, 4, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, 5, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + }, + { + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + 
N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, 6, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + }, + + }, + { + { + 0, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + 1, 2, 3, 4, N_, N_, N_, N_, + N_, N_, 5, N_, N_, N_, N_, 6, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + }, + { + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + 7, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, 8, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + }, + + }, + +}; + +static const uchar_t u8_combining_class_b3_tbl[2][9][256] = { + { + { /* Third byte table 0. 
*/ + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, 0, 1, N_, N_, + N_, N_, 2, N_, N_, N_, 3, 4, + N_, 5, N_, 6, 7, 8, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + }, + { /* Third byte table 1. */ + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, 9, 10, 11, 12, + 13, 14, 15, 16, 17, 18, N_, 19, + N_, 20, N_, 21, N_, 22, N_, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + }, + { /* Third byte table 2. */ + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + 32, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, 33, N_, N_, 34, + N_, N_, 35, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + }, + { /* Third byte table 3. 
*/ + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, 36, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + }, + { /* Third byte table 4. */ + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + 37, N_, 38, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + }, + { /* Third byte table 5. */ + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, 39, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + 40, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + }, + { /* Third byte table 6. 
*/ + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, 41, 42, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + }, + { /* Third byte table 7. */ + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + }, + { /* Third byte table 8. */ + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + }, + }, + { + { /* Third byte table 0. 
*/ + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, 0, 1, N_, N_, + N_, N_, 2, N_, N_, N_, 3, 4, + 5, 6, N_, 7, 8, 9, N_, 10, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + }, + { /* Third byte table 1. */ + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, N_, 21, + N_, 22, 23, 24, N_, 25, N_, 26, + 27, 28, 29, 30, 31, 32, 33, 34, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + }, + { /* Third byte table 2. */ + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + 35, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, 36, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, 37, N_, N_, 38, + N_, N_, 39, N_, 40, N_, N_, N_, + 41, N_, N_, N_, 42, 43, N_, N_, + N_, N_, N_, N_, N_, N_, N_, 44, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + }, + { /* Third byte table 3. 
*/ + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, 45, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + }, + { /* Third byte table 4. */ + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + 46, N_, 47, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + }, + { /* Third byte table 5. */ + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + 48, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + }, + { /* Third byte table 6. 
*/ + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, 49, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + 50, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + }, + { /* Third byte table 7. */ + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + 51, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + }, + { /* Third byte table 8. */ + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, 52, 53, N_, + N_, 54, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + }, + }, +}; + +/* + * Unlike other b4_tbl, the b4_tbl for combining class data has + * the combining class values not indices to the final tables. 
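+ * That is, u8_combining_class_b4_tbl[uv][b4_id][b4] is itself the
+ * Unicode canonical combining class of the character: 0 for starters,
+ * 230 for most above-base marks, 220 for most below-base marks, and
+ * so on.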
+ */ +static const uchar_t u8_combining_class_b4_tbl[2][55][256] = { + { + { /* Fourth byte table 0. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 230, 230, 230, 230, 230, 230, 230, 230, + 230, 230, 230, 230, 230, 230, 230, 230, + 230, 230, 230, 230, 230, 232, 220, 220, + 220, 220, 232, 216, 220, 220, 220, 220, + 220, 202, 202, 220, 220, 220, 220, 202, + 202, 220, 220, 220, 220, 220, 220, 220, + 220, 220, 220, 220, 1, 1, 1, 1, + 1, 220, 220, 220, 220, 230, 230, 230, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 1. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 230, 230, 230, 230, 230, 240, 230, 220, + 220, 220, 230, 230, 230, 220, 220, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 234, 234, 233, 230, 230, 230, 230, 230, + 230, 230, 230, 230, 230, 230, 230, 230, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 2. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 230, 230, 230, 230, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 3. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 220, 230, 230, 230, 230, 220, 230, + 230, 230, 222, 220, 230, 230, 230, 230, + 230, 230, 0, 220, 220, 220, 220, 220, + 230, 230, 220, 230, 230, 222, 228, 230, + 10, 11, 12, 13, 14, 15, 16, 17, + 18, 19, 0, 20, 21, 22, 0, 23, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 4. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 24, 25, 0, 230, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 5. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 27, 28, 29, 30, 31, + 32, 33, 34, 230, 230, 220, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 35, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 6. 
*/
[... auto-generated lookup-table data elided for length: the remainder of this group of 256-entry "Fourth byte table" byte arrays (tables 6 through 54), followed by the opening of the next group (tables 0 through 33). Each array holds one byte per possible trailing-byte value, 8 values per row; almost every entry is 0, with scattered nonzero byte values (e.g. 1, 7, 9, 26, 36, 84, 91, 103, 107, 118, 122, 129, 130, 132, 216, 218, 220, 222, 224, 226, 228, 230, 232, 233, 234, 240) at specific offsets ...]
{ /* Fourth byte table 34.
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 220, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 35. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 7, + 0, 9, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 36. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 230, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 37. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 9, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 9, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 38. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 9, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 230, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 39. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 228, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 40. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 222, 230, 220, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 41. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 230, + 220, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 42. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 7, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 43. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 9, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 230, 220, 230, 230, 230, + 230, 230, 230, 230, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 44. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 230, 230, 220, 230, 230, 230, 230, 230, + 230, 230, 220, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 230, 220, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 45. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 230, 230, 1, 1, 230, 230, 230, 230, + 1, 1, 1, 230, 230, 0, 0, 0, + 0, 230, 0, 0, 0, 1, 1, 230, + 220, 230, 1, 1, 220, 220, 220, 220, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 46. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 218, 228, 232, 222, 224, 224, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 47. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 8, 8, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 48. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 9, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 49. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 26, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 50. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 230, 230, 230, 230, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 51. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 220, 0, 230, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 230, 1, 220, 0, 0, 0, 0, 9, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 52. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 216, 216, 1, + 1, 1, 0, 0, 0, 226, 216, 216, + 216, 216, 216, 0, 0, 0, 0, 0, + 0, 0, 0, 220, 220, 220, 220, 220, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 53. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 220, 220, 220, 0, 0, 230, 230, 230, + 230, 230, 220, 220, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 230, 230, 230, 230, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 54. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 230, 230, 230, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + }, +}; + +static const uchar_t u8_composition_b1_tbl[2][256] = { + { + 0, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + }, + { + 0, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + }, +}; + +static const uchar_t u8_composition_b2_tbl[2][1][256] = { + { + { + 0, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + 
N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + 1, 2, 3, 4, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + }, + + }, + { + { + 0, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + 1, 2, 3, 4, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + }, + + }, + +}; + +static const u8_displacement_t u8_composition_b3_tbl[2][5][256] = { + { + { /* Third byte table 0. 
*/ + { 0x8000, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { 0, 2470 }, + { 0x8001, 2491 }, { 1, 2871 }, { 2, 2959 }, + { 3, 3061 }, { 4, 3212 }, { 5, 3226 }, + { N_, 0 }, { 6, 3270 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { 0x8002, 3277 }, + { 7, 3774 }, { 8, 3949 }, { 9, 4198 }, + { N_, 0 }, { 10, 4265 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { 11, 4293 }, { 12, 4312 }, { N_, 0 }, + { 13, 4326 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, + }, + { /* Third byte table 1. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { 14, 4347 }, + { N_, 0 }, { N_, 0 }, { 15, 4374 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { 16, 4391 }, + { 17, 4416 }, { 18, 4425 }, { N_, 0 }, + { 19, 4451 }, { 20, 4460 }, { 21, 4469 }, + { N_, 0 }, { 22, 4503 }, { N_, 0 }, + { 23, 4529 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, + }, + { /* Third byte table 2. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { 24, 4563 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { 25, 4572 }, { 26, 4588 }, + { 27, 4620 }, { 28, 4666 }, { 0x8003, 4682 }, + { 0x8004, 5254 }, { 29, 5616 }, { 30, 5646 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, + }, + { /* Third byte table 3. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { 31, 5684 }, + { 32, 5708 }, { 33, 5732 }, { 34, 5780 }, + { 35, 5900 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, + }, + { /* Third byte table 4. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { 36, 6012 }, { 37, 6241 }, { 38, 6358 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, + }, + }, + { + { /* Third byte table 0. 
*/ + { 0x8000, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { 0, 2470 }, + { 0x8001, 2491 }, { 1, 2871 }, { 2, 2959 }, + { 3, 3061 }, { 4, 3212 }, { 5, 3226 }, + { N_, 0 }, { 6, 3270 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { 0x8002, 3277 }, + { 7, 3774 }, { 8, 3949 }, { 9, 4198 }, + { N_, 0 }, { 10, 4265 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { 11, 4293 }, { 12, 4312 }, { N_, 0 }, + { 13, 4326 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, + }, + { /* Third byte table 1. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { 14, 4347 }, + { N_, 0 }, { N_, 0 }, { 15, 4374 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { 16, 4391 }, + { 17, 4416 }, { 18, 4425 }, { N_, 0 }, + { 19, 4451 }, { 20, 4460 }, { 21, 4469 }, + { N_, 0 }, { 22, 4503 }, { N_, 0 }, + { 23, 4529 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, + }, + { /* Third byte table 2. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { 24, 4563 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { 25, 4572 }, { 26, 4662 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { 27, 4671 }, { 28, 4687 }, + { 29, 4719 }, { 30, 4765 }, { 0x8003, 4781 }, + { 0x8004, 5353 }, { 31, 5715 }, { 32, 5745 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, + }, + { /* Third byte table 3. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { 33, 5783 }, + { 34, 5807 }, { 35, 5831 }, { 36, 5879 }, + { 37, 5999 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, + }, + { /* Third byte table 4. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { 38, 6111 }, { 39, 6340 }, { 40, 6457 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, + }, + }, +}; + +static const uchar_t u8_composition_b4_tbl[2][41][257] = { + { + { /* Fourth byte table 0. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, + }, + { /* Fourth byte table 1. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 29, 58, 58, 58, 58, + 58, 58, 58, 58, 58, 58, 58, 58, + 58, 58, 58, 73, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, + }, + { /* Fourth byte table 2. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 15, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 38, 46, 46, 46, 46, + 46, 54, 62, 62, 62, 62, 62, 62, + 62, 70, 78, 86, 94, 94, 94, 94, + 94, 94, 94, 94, 94, 94, 94, 94, + 94, 94, 94, 94, 94, 94, 94, 94, + 102, 102, 102, 102, 102, 102, 102, 102, + 102, 102, 102, 102, 102, 102, 102, 102, + 102, 102, 102, 102, 102, 102, 102, 102, + 102, 102, 102, 102, 102, 102, 102, 102, + 102, 102, 102, 102, 102, 102, 102, 102, + 102, 102, 102, 102, 102, 102, 102, 102, + 102, 102, 102, 102, 102, 102, 102, 102, + 102, 102, 102, 102, 102, 102, 102, 102, + 102, + }, + { /* Fourth byte table 3. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 36, 72, 72, 72, 72, 72, 72, + 72, 72, 72, 72, 72, 72, 72, 72, + 108, 144, 144, 144, 144, 144, 144, 144, + 151, 151, 151, 151, 151, 151, 151, 151, + 151, 151, 151, 151, 151, 151, 151, 151, + 151, 151, 151, 151, 151, 151, 151, 151, + 151, 151, 151, 151, 151, 151, 151, 151, + 151, 151, 151, 151, 151, 151, 151, 151, + 151, 151, 151, 151, 151, 151, 151, 151, + 151, 151, 151, 151, 151, 151, 151, 151, + 151, 151, 151, 151, 151, 151, 151, 151, + 151, 151, 151, 151, 151, 151, 151, 151, + 151, + }, + { /* Fourth byte table 4. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 7, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, + }, + { /* Fourth byte table 5. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 7, + 14, 22, 30, 30, 30, 30, 30, 37, + 44, 44, 44, 44, 44, 44, 44, 44, + 44, 44, 44, 44, 44, 44, 44, 44, + 44, 44, 44, 44, 44, 44, 44, 44, + 44, 44, 44, 44, 44, 44, 44, 44, + 44, 44, 44, 44, 44, 44, 44, 44, + 44, 44, 44, 44, 44, 44, 44, 44, + 44, 44, 44, 44, 44, 44, 44, 44, + 44, 44, 44, 44, 44, 44, 44, 44, + 44, 44, 44, 44, 44, 44, 44, 44, + 44, 44, 44, 44, 44, 44, 44, 44, + 44, + }, + { /* Fourth byte table 6. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, + }, + { /* Fourth byte table 7. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 15, 15, 15, 15, 70, 70, + 70, 70, 112, 133, 154, 154, 154, 162, + 162, 162, 162, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, + }, + { /* Fourth byte table 8. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 20, 20, 20, 27, 27, 46, 59, + 66, 91, 91, 98, 98, 98, 98, 105, + 105, 105, 105, 105, 130, 130, 130, 130, + 137, 137, 137, 137, 144, 144, 151, 151, + 151, 164, 164, 164, 171, 171, 190, 203, + 210, 235, 235, 242, 242, 242, 242, 249, + 249, 249, 249, 249, 249, 249, 249, 249, + 249, 249, 249, 249, 249, 249, 249, 249, + 249, 249, 249, 249, 249, 249, 249, 249, + 249, 249, 249, 249, 249, 249, 249, 249, + 249, 249, 249, 249, 249, 249, 249, 249, + 249, 249, 249, 249, 249, 249, 249, 249, + 249, 249, 249, 249, 249, 249, 249, 249, + 249, 249, 249, 249, 249, 249, 249, 249, + 249, + }, + { /* Fourth byte table 9. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 25, 25, 25, 25, + 32, 32, 32, 32, 39, 39, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 53, + 53, 53, 53, 53, 53, 53, 53, 53, + 53, 53, 53, 53, 53, 53, 53, 53, + 53, 53, 53, 53, 53, 53, 53, 53, + 53, 53, 53, 53, 53, 60, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, + 67, + }, + { /* Fourth byte table 10. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 7, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 21, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, + }, + { /* Fourth byte table 11. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, + 19, + }, + { /* Fourth byte table 12. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 7, 7, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, + }, + { /* Fourth byte table 13. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 14, 14, 14, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, + }, + { /* Fourth byte table 14. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 9, 9, 9, 9, 9, 9, 9, + 9, 18, 18, 18, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 27, + 27, + }, + { /* Fourth byte table 15. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, + 17, + }, + { /* Fourth byte table 16. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, + }, + { /* Fourth byte table 17. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, + }, + { /* Fourth byte table 18. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 17, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, + }, + { /* Fourth byte table 19. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, + }, + { /* Fourth byte table 20. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, + }, + { /* Fourth byte table 21. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 25, + 25, 25, 25, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, + }, + { /* Fourth byte table 22. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 17, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, + }, + { /* Fourth byte table 23. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 25, 25, 25, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, + }, + { /* Fourth byte table 24. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, + }, + { /* Fourth byte table 25. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 8, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, + }, + { /* Fourth byte table 26. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 8, 16, 16, 16, 16, + 16, 16, 16, 24, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, + }, + { /* Fourth byte table 27. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 15, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 38, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, + }, + { /* Fourth byte table 28. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 8, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, + }, + { /* Fourth byte table 29. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, + }, + { /* Fourth byte table 30. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 16, + 16, 16, 16, 16, 16, 16, 16, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, + }, + { /* Fourth byte table 31. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 8, 8, 16, 16, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, + }, + { /* Fourth byte table 32. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 8, 8, 16, 16, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, + }, + { /* Fourth byte table 33. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 8, 8, 8, 8, + 8, 16, 16, 16, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 32, 32, 40, 40, + 40, 40, 40, 40, 40, 40, 40, 40, + 40, 40, 40, 40, 40, 40, 40, 40, + 40, 40, 40, 40, 40, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, + }, + { /* Fourth byte table 34. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 8, 8, 16, 16, + 16, 24, 24, 24, 24, 24, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 40, 40, 40, 48, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 64, 72, 72, 72, 80, + 88, 88, 88, 96, 104, 112, 120, 120, + 120, 120, 120, 120, 120, 120, 120, 120, + 120, 120, 120, 120, 120, 120, 120, 120, + 120, 120, 120, 120, 120, 120, 120, 120, + 120, 120, 120, 120, 120, 120, 120, 120, + 120, 120, 120, 120, 120, 120, 120, 120, + 120, 120, 120, 120, 120, 120, 120, 120, + 120, 120, 120, 120, 120, 120, 120, 120, + 120, 120, 120, 120, 120, 120, 120, 120, + 120, + }, + { /* Fourth byte table 35. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 8, 16, 16, 16, 24, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 40, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 56, 56, 56, 56, 56, + 56, 64, 72, 72, 80, 80, 80, 80, + 80, 80, 80, 88, 96, 104, 112, 112, + 112, 112, 112, 112, 112, 112, 112, 112, + 112, 112, 112, 112, 112, 112, 112, 112, + 112, 112, 112, 112, 112, 112, 112, 112, + 112, 112, 112, 112, 112, 112, 112, 112, + 112, 112, 112, 112, 112, 112, 112, 112, + 112, 112, 112, 112, 112, 112, 112, 112, + 112, 112, 112, 112, 112, 112, 112, 112, + 112, 112, 112, 112, 112, 112, 112, 112, + 112, 112, 112, 112, 112, 112, 112, 112, + 112, + }, + { /* Fourth byte table 36. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 9, + 9, 9, 9, 9, 18, 18, 27, 27, + 36, 36, 45, 45, 54, 54, 63, 63, + 72, 72, 81, 81, 90, 90, 99, 99, + 108, 108, 117, 117, 117, 126, 126, 135, + 135, 144, 144, 144, 144, 144, 144, 144, + 161, 161, 161, 178, 178, 178, 195, 195, + 195, 212, 212, 212, 229, 229, 229, 229, + 229, 229, 229, 229, 229, 229, 229, 229, + 229, 229, 229, 229, 229, 229, 229, 229, + 229, 229, 229, 229, 229, 229, 229, 229, + 229, 229, 229, 229, 229, 229, 229, 229, + 229, 229, 229, 229, 229, 229, 229, 229, + 229, 229, 229, 229, 229, 229, 229, 229, + 229, 229, 229, 229, 229, 229, 229, 229, + 229, 229, 229, 229, 229, 229, 229, 229, + 229, + }, + { /* Fourth byte table 37. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 18, + 18, 18, 18, 18, 27, 27, 36, 36, + 45, 45, 54, 54, 63, 63, 72, 72, + 81, 81, 90, 90, 99, 99, 108, 108, + 117, 117, 117, 117, 117, 117, 117, 117, + 117, 117, 117, 117, 117, 117, 117, 117, + 117, 117, 117, 117, 117, 117, 117, 117, + 117, 117, 117, 117, 117, 117, 117, 117, + 117, 117, 117, 117, 117, 117, 117, 117, + 117, 117, 117, 117, 117, 117, 117, 117, + 117, 117, 117, 117, 117, 117, 117, 117, + 117, 117, 117, 117, 117, 117, 117, 117, + 117, + }, + { /* Fourth byte table 38. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 9, 9, 9, 18, 18, 27, + 27, 36, 36, 36, 36, 36, 36, 36, + 53, 53, 53, 70, 70, 70, 87, 87, + 87, 104, 104, 104, 121, 121, 121, 121, + 121, 121, 121, 121, 121, 121, 121, 121, + 121, 121, 121, 121, 121, 121, 121, 121, + 130, 139, 148, 157, 157, 157, 157, 157, + 157, 157, 157, 157, 157, 157, 166, 166, + 166, 166, 166, 166, 166, 166, 166, 166, + 166, 166, 166, 166, 166, 166, 166, 166, + 166, 166, 166, 166, 166, 166, 166, 166, + 166, 166, 166, 166, 166, 166, 166, 166, + 166, 166, 166, 166, 166, 166, 166, 166, + 166, 166, 166, 166, 166, 166, 166, 166, + 166, 166, 166, 166, 166, 166, 166, 166, + 166, 166, 166, 166, 166, 166, 166, 166, + 166, + }, + { /* Fourth byte table 39. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, + }, + { /* Fourth byte table 40. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, + }, + }, + { + { /* Fourth byte table 0. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, + }, + { /* Fourth byte table 1. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 29, 58, 58, 58, 58, + 58, 58, 58, 58, 58, 58, 58, 58, + 58, 58, 58, 73, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, + }, + { /* Fourth byte table 2. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 15, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 38, 46, 46, 46, 46, + 46, 54, 62, 62, 62, 62, 62, 62, + 62, 70, 78, 86, 94, 94, 94, 94, + 94, 94, 94, 94, 94, 94, 94, 94, + 94, 94, 94, 94, 94, 94, 94, 94, + 102, 102, 102, 102, 102, 102, 102, 102, + 102, 102, 102, 102, 102, 102, 102, 102, + 102, 102, 102, 102, 102, 102, 102, 102, + 102, 102, 102, 102, 102, 102, 102, 102, + 102, 102, 102, 102, 102, 102, 102, 102, + 102, 102, 102, 102, 102, 102, 102, 102, + 102, 102, 102, 102, 102, 102, 102, 102, + 102, 102, 102, 102, 102, 102, 102, 102, + 102, + }, + { /* Fourth byte table 3. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 36, 72, 72, 72, 72, 72, 72, + 72, 72, 72, 72, 72, 72, 72, 72, + 108, 144, 144, 144, 144, 144, 144, 144, + 151, 151, 151, 151, 151, 151, 151, 151, + 151, 151, 151, 151, 151, 151, 151, 151, + 151, 151, 151, 151, 151, 151, 151, 151, + 151, 151, 151, 151, 151, 151, 151, 151, + 151, 151, 151, 151, 151, 151, 151, 151, + 151, 151, 151, 151, 151, 151, 151, 151, + 151, 151, 151, 151, 151, 151, 151, 151, + 151, 151, 151, 151, 151, 151, 151, 151, + 151, 151, 151, 151, 151, 151, 151, 151, + 151, + }, + { /* Fourth byte table 4. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 7, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, + }, + { /* Fourth byte table 5. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 7, + 14, 22, 30, 30, 30, 30, 30, 37, + 44, 44, 44, 44, 44, 44, 44, 44, + 44, 44, 44, 44, 44, 44, 44, 44, + 44, 44, 44, 44, 44, 44, 44, 44, + 44, 44, 44, 44, 44, 44, 44, 44, + 44, 44, 44, 44, 44, 44, 44, 44, + 44, 44, 44, 44, 44, 44, 44, 44, + 44, 44, 44, 44, 44, 44, 44, 44, + 44, 44, 44, 44, 44, 44, 44, 44, + 44, 44, 44, 44, 44, 44, 44, 44, + 44, 44, 44, 44, 44, 44, 44, 44, + 44, + }, + { /* Fourth byte table 6. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, + }, + { /* Fourth byte table 7. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 15, 15, 15, 15, 70, 70, + 70, 70, 112, 133, 154, 154, 154, 162, + 162, 162, 162, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, + }, + { /* Fourth byte table 8. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 20, 20, 20, 27, 27, 46, 59, + 66, 91, 91, 98, 98, 98, 98, 105, + 105, 105, 105, 105, 130, 130, 130, 130, + 137, 137, 137, 137, 144, 144, 151, 151, + 151, 164, 164, 164, 171, 171, 190, 203, + 210, 235, 235, 242, 242, 242, 242, 249, + 249, 249, 249, 249, 249, 249, 249, 249, + 249, 249, 249, 249, 249, 249, 249, 249, + 249, 249, 249, 249, 249, 249, 249, 249, + 249, 249, 249, 249, 249, 249, 249, 249, + 249, 249, 249, 249, 249, 249, 249, 249, + 249, 249, 249, 249, 249, 249, 249, 249, + 249, 249, 249, 249, 249, 249, 249, 249, + 249, 249, 249, 249, 249, 249, 249, 249, + 249, + }, + { /* Fourth byte table 9. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 25, 25, 25, 25, + 32, 32, 32, 32, 39, 39, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 53, + 53, 53, 53, 53, 53, 53, 53, 53, + 53, 53, 53, 53, 53, 53, 53, 53, + 53, 53, 53, 53, 53, 53, 53, 53, + 53, 53, 53, 53, 53, 60, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, + 67, + }, + { /* Fourth byte table 10. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 7, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 21, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, + }, + { /* Fourth byte table 11. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, + 19, + }, + { /* Fourth byte table 12. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 7, 7, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, + }, + { /* Fourth byte table 13. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 14, 14, 14, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, + }, + { /* Fourth byte table 14. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 9, 9, 9, 9, 9, 9, 9, + 9, 18, 18, 18, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 27, + 27, + }, + { /* Fourth byte table 15. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, + 17, + }, + { /* Fourth byte table 16. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, + }, + { /* Fourth byte table 17. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, + }, + { /* Fourth byte table 18. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 17, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, + }, + { /* Fourth byte table 19. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, + }, + { /* Fourth byte table 20. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, + }, + { /* Fourth byte table 21. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 25, + 25, 25, 25, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, + }, + { /* Fourth byte table 22. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 17, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, + }, + { /* Fourth byte table 23. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 25, 25, 25, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, + }, + { /* Fourth byte table 24. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, + }, + { /* Fourth byte table 25. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 9, 9, + 18, 18, 27, 27, 36, 36, 45, 45, + 45, 45, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 63, 63, 72, 72, 81, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, + }, + { /* Fourth byte table 26. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, + }, + { /* Fourth byte table 27. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 8, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, + }, + { /* Fourth byte table 28. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 8, 16, 16, 16, 16, + 16, 16, 16, 24, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, + }, + { /* Fourth byte table 29. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 15, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 38, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, + }, + { /* Fourth byte table 30. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 8, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, + }, + { /* Fourth byte table 31. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, + }, + { /* Fourth byte table 32. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 16, + 16, 16, 16, 16, 16, 16, 16, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, + }, + { /* Fourth byte table 33. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 8, 8, 16, 16, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, + }, + { /* Fourth byte table 34. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 8, 8, 16, 16, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, + }, + { /* Fourth byte table 35. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 8, 8, 8, 8, + 8, 16, 16, 16, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 32, 32, 40, 40, + 40, 40, 40, 40, 40, 40, 40, 40, + 40, 40, 40, 40, 40, 40, 40, 40, + 40, 40, 40, 40, 40, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, + }, + { /* Fourth byte table 36. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 8, 8, 16, 16, + 16, 24, 24, 24, 24, 24, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 40, 40, 40, 48, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 64, 72, 72, 72, 80, + 88, 88, 88, 96, 104, 112, 120, 120, + 120, 120, 120, 120, 120, 120, 120, 120, + 120, 120, 120, 120, 120, 120, 120, 120, + 120, 120, 120, 120, 120, 120, 120, 120, + 120, 120, 120, 120, 120, 120, 120, 120, + 120, 120, 120, 120, 120, 120, 120, 120, + 120, 120, 120, 120, 120, 120, 120, 120, + 120, 120, 120, 120, 120, 120, 120, 120, + 120, 120, 120, 120, 120, 120, 120, 120, + 120, + }, + { /* Fourth byte table 37. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 8, 16, 16, 16, 24, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 40, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 56, 56, 56, 56, 56, + 56, 64, 72, 72, 80, 80, 80, 80, + 80, 80, 80, 88, 96, 104, 112, 112, + 112, 112, 112, 112, 112, 112, 112, 112, + 112, 112, 112, 112, 112, 112, 112, 112, + 112, 112, 112, 112, 112, 112, 112, 112, + 112, 112, 112, 112, 112, 112, 112, 112, + 112, 112, 112, 112, 112, 112, 112, 112, + 112, 112, 112, 112, 112, 112, 112, 112, + 112, 112, 112, 112, 112, 112, 112, 112, + 112, 112, 112, 112, 112, 112, 112, 112, + 112, 112, 112, 112, 112, 112, 112, 112, + 112, + }, + { /* Fourth byte table 38. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 9, + 9, 9, 9, 9, 18, 18, 27, 27, + 36, 36, 45, 45, 54, 54, 63, 63, + 72, 72, 81, 81, 90, 90, 99, 99, + 108, 108, 117, 117, 117, 126, 126, 135, + 135, 144, 144, 144, 144, 144, 144, 144, + 161, 161, 161, 178, 178, 178, 195, 195, + 195, 212, 212, 212, 229, 229, 229, 229, + 229, 229, 229, 229, 229, 229, 229, 229, + 229, 229, 229, 229, 229, 229, 229, 229, + 229, 229, 229, 229, 229, 229, 229, 229, + 229, 229, 229, 229, 229, 229, 229, 229, + 229, 229, 229, 229, 229, 229, 229, 229, + 229, 229, 229, 229, 229, 229, 229, 229, + 229, 229, 229, 229, 229, 229, 229, 229, + 229, 229, 229, 229, 229, 229, 229, 229, + 229, + }, + { /* Fourth byte table 39. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 18, + 18, 18, 18, 18, 27, 27, 36, 36, + 45, 45, 54, 54, 63, 63, 72, 72, + 81, 81, 90, 90, 99, 99, 108, 108, + 117, 117, 117, 117, 117, 117, 117, 117, + 117, 117, 117, 117, 117, 117, 117, 117, + 117, 117, 117, 117, 117, 117, 117, 117, + 117, 117, 117, 117, 117, 117, 117, 117, + 117, 117, 117, 117, 117, 117, 117, 117, + 117, 117, 117, 117, 117, 117, 117, 117, + 117, 117, 117, 117, 117, 117, 117, 117, + 117, 117, 117, 117, 117, 117, 117, 117, + 117, + }, + { /* Fourth byte table 40. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 9, 9, 9, 18, 18, 27, + 27, 36, 36, 36, 36, 36, 36, 36, + 53, 53, 53, 70, 70, 70, 87, 87, + 87, 104, 104, 104, 121, 121, 121, 121, + 121, 121, 121, 121, 121, 121, 121, 121, + 121, 121, 121, 121, 121, 121, 121, 121, + 130, 139, 148, 157, 157, 157, 157, 157, + 157, 157, 157, 157, 157, 157, 166, 166, + 166, 166, 166, 166, 166, 166, 166, 166, + 166, 166, 166, 166, 166, 166, 166, 166, + 166, 166, 166, 166, 166, 166, 166, 166, + 166, 166, 166, 166, 166, 166, 166, 166, + 166, 166, 166, 166, 166, 166, 166, 166, + 166, 166, 166, 166, 166, 166, 166, 166, + 166, 166, 166, 166, 166, 166, 166, 166, + 166, 166, 166, 166, 166, 166, 166, 166, + 166, + }, + }, +}; + +static const uint16_t u8_composition_b4_16bit_tbl[2][5][257] = { + { + { /* Fourth byte 16-bit table 0. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 8, 16, 24, + 24, 24, 124, 146, 177, 219, 327, 335, + 379, 427, 521, 528, 562, 602, 624, 683, + 782, 797, 797, 849, 894, 941, 1061, 1076, + 1118, 1133, 1193, 1233, 1233, 1233, 1233, 1233, + 1233, 1233, 1333, 1355, 1386, 1428, 1536, 1544, + 1588, 1643, 1731, 1744, 1778, 1818, 1840, 1899, + 1998, 2013, 2013, 2065, 2110, 2164, 2284, 2299, + 2348, 2363, 2430, 2470, 2470, 2470, 2470, 2470, + 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, + 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, + 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, + 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, + 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, + 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, + 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, + 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, + 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, + 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, + 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, + 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, + 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, + 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, + 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, + 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, + 2470, + }, + { /* Fourth byte 16-bit table 1. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 29, 29, 36, 43, 56, + 64, 64, 64, 93, 93, 93, 93, 93, + 101, 101, 101, 101, 101, 130, 151, 158, + 158, 165, 165, 165, 165, 190, 190, 190, + 190, 190, 190, 219, 219, 226, 233, 246, + 254, 254, 254, 283, 283, 283, 283, 283, + 291, 291, 291, 291, 291, 320, 341, 348, + 348, 355, 355, 355, 355, 380, 380, 380, + 380, 380, 380, 380, 380, 380, 380, 380, + 380, 380, 380, 380, 380, 380, 380, 380, + 380, 380, 380, 380, 380, 380, 380, 380, + 380, 380, 380, 380, 380, 380, 380, 380, + 380, 380, 380, 380, 380, 380, 380, 380, + 380, 380, 380, 380, 380, 380, 380, 380, + 380, 380, 380, 380, 380, 380, 380, 380, + 380, 380, 380, 380, 380, 380, 380, 380, + 380, + }, + { /* Fourth byte 16-bit table 2. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 49, 49, 49, 49, 77, 77, + 112, 112, 160, 160, 160, 160, 160, 160, + 188, 188, 196, 196, 196, 196, 237, 237, + 237, 237, 272, 272, 272, 280, 280, 288, + 288, 288, 344, 344, 344, 344, 372, 372, + 414, 414, 469, 469, 469, 469, 469, 469, + 497, 497, 497, 497, 497, 497, 497, 497, + 497, 497, 497, 497, 497, 497, 497, 497, + 497, 497, 497, 497, 497, 497, 497, 497, + 497, 497, 497, 497, 497, 497, 497, 497, + 497, 497, 497, 497, 497, 497, 497, 497, + 497, 497, 497, 497, 497, 497, 497, 497, + 497, 497, 497, 497, 497, 497, 497, 497, + 497, 497, 497, 497, 497, 497, 497, 497, + 497, + }, + { /* Fourth byte 16-bit table 3. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 29, 58, 66, 74, 82, 90, 98, + 106, 135, 164, 172, 180, 188, 196, 204, + 212, 227, 242, 242, 242, 242, 242, 242, + 242, 257, 272, 272, 272, 272, 272, 272, + 272, 301, 330, 338, 346, 354, 362, 370, + 378, 407, 436, 444, 452, 460, 468, 476, + 484, 506, 528, 528, 528, 528, 528, 528, + 528, 550, 572, 572, 572, 572, 572, 572, + 572, 572, 572, 572, 572, 572, 572, 572, + 572, 572, 572, 572, 572, 572, 572, 572, + 572, 572, 572, 572, 572, 572, 572, 572, + 572, 572, 572, 572, 572, 572, 572, 572, + 572, 572, 572, 572, 572, 572, 572, 572, + 572, 572, 572, 572, 572, 572, 572, 572, + 572, 572, 572, 572, 572, 572, 572, 572, + 572, 572, 572, 572, 572, 572, 572, 572, + 572, + }, + { /* Fourth byte 16-bit table 4. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 15, 30, 30, 30, 30, 30, 30, + 30, 45, 60, 60, 60, 60, 60, 60, + 60, 82, 104, 104, 104, 104, 104, 104, + 104, 104, 126, 126, 126, 126, 126, 126, + 126, 155, 184, 192, 200, 208, 216, 224, + 232, 261, 290, 298, 306, 314, 322, 330, + 338, 346, 346, 346, 346, 354, 354, 354, + 354, 354, 354, 354, 354, 362, 362, 362, + 362, 362, 362, 362, 362, 362, 362, 362, + 362, 362, 362, 362, 362, 362, 362, 362, + 362, 362, 362, 362, 362, 362, 362, 362, + 362, 362, 362, 362, 362, 362, 362, 362, + 362, 362, 362, 362, 362, 362, 362, 362, + 362, 362, 362, 362, 362, 362, 362, 362, + 362, 362, 362, 362, 362, 362, 362, 362, + 362, 362, 362, 362, 362, 362, 362, 362, + 362, + }, + }, + { + { /* Fourth byte 16-bit table 0. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 8, 16, 24, + 24, 24, 124, 146, 177, 219, 327, 335, + 379, 427, 521, 528, 562, 602, 624, 683, + 782, 797, 797, 849, 894, 941, 1061, 1076, + 1118, 1133, 1193, 1233, 1233, 1233, 1233, 1233, + 1233, 1233, 1333, 1355, 1386, 1428, 1536, 1544, + 1588, 1643, 1731, 1744, 1778, 1818, 1840, 1899, + 1998, 2013, 2013, 2065, 2110, 2164, 2284, 2299, + 2348, 2363, 2430, 2470, 2470, 2470, 2470, 2470, + 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, + 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, + 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, + 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, + 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, + 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, + 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, + 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, + 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, + 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, + 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, + 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, + 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, + 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, + 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, + 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, + 2470, + }, + { /* Fourth byte 16-bit table 1. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 29, 29, 36, 43, 56, + 64, 64, 64, 93, 93, 93, 93, 93, + 101, 101, 101, 101, 101, 130, 151, 158, + 158, 165, 165, 165, 165, 190, 190, 190, + 190, 190, 190, 219, 219, 226, 233, 246, + 254, 254, 254, 283, 283, 283, 283, 283, + 291, 291, 291, 291, 291, 320, 341, 348, + 348, 355, 355, 355, 355, 380, 380, 380, + 380, 380, 380, 380, 380, 380, 380, 380, + 380, 380, 380, 380, 380, 380, 380, 380, + 380, 380, 380, 380, 380, 380, 380, 380, + 380, 380, 380, 380, 380, 380, 380, 380, + 380, 380, 380, 380, 380, 380, 380, 380, + 380, 380, 380, 380, 380, 380, 380, 380, + 380, 380, 380, 380, 380, 380, 380, 380, + 380, 380, 380, 380, 380, 380, 380, 380, + 380, + }, + { /* Fourth byte 16-bit table 2. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 49, 49, 49, 49, 77, 77, + 112, 112, 160, 160, 160, 160, 160, 160, + 188, 188, 196, 196, 196, 196, 237, 237, + 237, 237, 272, 272, 272, 280, 280, 288, + 288, 288, 344, 344, 344, 344, 372, 372, + 414, 414, 469, 469, 469, 469, 469, 469, + 497, 497, 497, 497, 497, 497, 497, 497, + 497, 497, 497, 497, 497, 497, 497, 497, + 497, 497, 497, 497, 497, 497, 497, 497, + 497, 497, 497, 497, 497, 497, 497, 497, + 497, 497, 497, 497, 497, 497, 497, 497, + 497, 497, 497, 497, 497, 497, 497, 497, + 497, 497, 497, 497, 497, 497, 497, 497, + 497, 497, 497, 497, 497, 497, 497, 497, + 497, + }, + { /* Fourth byte 16-bit table 3. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 29, 58, 66, 74, 82, 90, 98, + 106, 135, 164, 172, 180, 188, 196, 204, + 212, 227, 242, 242, 242, 242, 242, 242, + 242, 257, 272, 272, 272, 272, 272, 272, + 272, 301, 330, 338, 346, 354, 362, 370, + 378, 407, 436, 444, 452, 460, 468, 476, + 484, 506, 528, 528, 528, 528, 528, 528, + 528, 550, 572, 572, 572, 572, 572, 572, + 572, 572, 572, 572, 572, 572, 572, 572, + 572, 572, 572, 572, 572, 572, 572, 572, + 572, 572, 572, 572, 572, 572, 572, 572, + 572, 572, 572, 572, 572, 572, 572, 572, + 572, 572, 572, 572, 572, 572, 572, 572, + 572, 572, 572, 572, 572, 572, 572, 572, + 572, 572, 572, 572, 572, 572, 572, 572, + 572, 572, 572, 572, 572, 572, 572, 572, + 572, + }, + { /* Fourth byte 16-bit table 4. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 15, 30, 30, 30, 30, 30, 30, + 30, 45, 60, 60, 60, 60, 60, 60, + 60, 82, 104, 104, 104, 104, 104, 104, + 104, 104, 126, 126, 126, 126, 126, 126, + 126, 155, 184, 192, 200, 208, 216, 224, + 232, 261, 290, 298, 306, 314, 322, 330, + 338, 346, 346, 346, 346, 354, 354, 354, + 354, 354, 354, 354, 354, 362, 362, 362, + 362, 362, 362, 362, 362, 362, 362, 362, + 362, 362, 362, 362, 362, 362, 362, 362, + 362, 362, 362, 362, 362, 362, 362, 362, + 362, 362, 362, 362, 362, 362, 362, 362, + 362, 362, 362, 362, 362, 362, 362, 362, + 362, 362, 362, 362, 362, 362, 362, 362, + 362, 362, 362, 362, 362, 362, 362, 362, + 362, 362, 362, 362, 362, 362, 362, 362, + 362, + }, + }, +}; + +static const uchar_t u8_composition_final_tbl[2][6623] = { + { + 0x01, 0xCC, 0xB8, FIL_, 0xE2, 0x89, 0xAE, FIL_, + 0x01, 0xCC, 0xB8, FIL_, 0xE2, 0x89, 0xA0, FIL_, + 0x01, 0xCC, 0xB8, FIL_, 0xE2, 0x89, 0xAF, FIL_, + 0x10, 0xCC, 0x86, FIL_, 0xC4, 0x82, FIL_, 0xCC, + 0x87, FIL_, 0xC8, 0xA6, FIL_, 0xCC, 0x8F, FIL_, + 0xC8, 0x80, FIL_, 0xCC, 0x82, FIL_, 0xC3, 0x82, + FIL_, 0xCC, 0x81, FIL_, 0xC3, 0x81, FIL_, 0xCC, + 0x80, FIL_, 0xC3, 0x80, FIL_, 0xCC, 0x83, FIL_, + 0xC3, 0x83, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xBA, + 0xA0, FIL_, 0xCC, 0xA5, FIL_, 0xE1, 0xB8, 0x80, + FIL_, 0xCC, 0x91, FIL_, 0xC8, 0x82, FIL_, 0xCC, + 0x84, FIL_, 0xC4, 0x80, FIL_, 0xCC, 0x88, FIL_, + 0xC3, 0x84, FIL_, 0xCC, 0x8A, FIL_, 0xC3, 0x85, + FIL_, 0xCC, 0xA8, FIL_, 0xC4, 0x84, FIL_, 0xCC, + 0x89, FIL_, 0xE1, 0xBA, 0xA2, FIL_, 0xCC, 0x8C, + FIL_, 0xC7, 0x8D, FIL_, 0x03, 0xCC, 0x87, FIL_, + 0xE1, 0xB8, 0x82, FIL_, 0xCC, 0xB1, FIL_, 0xE1, + 0xB8, 0x86, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xB8, + 0x84, FIL_, 0x05, 0xCC, 0xA7, FIL_, 0xC3, 0x87, + FIL_, 0xCC, 0x81, FIL_, 0xC4, 0x86, FIL_, 0xCC, + 0x8C, FIL_, 0xC4, 0x8C, FIL_, 0xCC, 0x87, FIL_, + 0xC4, 0x8A, FIL_, 0xCC, 0x82, FIL_, 0xC4, 0x88, + FIL_, 0x06, 0xCC, 0xB1, FIL_, 0xE1, 0xB8, 0x8E, + FIL_, 0xCC, 0xA7, FIL_, 0xE1, 0xB8, 0x90, FIL_, + 0xCC, 0xAD, FIL_, 0xE1, 0xB8, 0x92, FIL_, 0xCC, + 0x87, FIL_, 0xE1, 0xB8, 0x8A, FIL_, 0xCC, 0x8C, + FIL_, 0xC4, 0x8E, FIL_, 0xCC, 0xA3, FIL_, 0xE1, + 0xB8, 0x8C, FIL_, 0x11, 0xCC, 0x80, FIL_, 0xC3, + 0x88, FIL_, 0xCC, 0x81, FIL_, 0xC3, 0x89, FIL_, + 0xCC, 0x82, FIL_, 0xC3, 0x8A, FIL_, 0xCC, 0x88, + FIL_, 0xC3, 0x8B, FIL_, 0xCC, 0xA7, FIL_, 0xC8, + 0xA8, FIL_, 0xCC, 0x91, FIL_, 0xC8, 0x86, FIL_, + 0xCC, 0x8F, FIL_, 0xC8, 0x84, FIL_, 0xCC, 0x89, + FIL_, 0xE1, 0xBA, 0xBA, FIL_, 0xCC, 0xB0, FIL_, + 0xE1, 0xB8, 0x9A, FIL_, 0xCC, 0xAD, FIL_, 0xE1, + 0xB8, 0x98, FIL_, 0xCC, 0x83, FIL_, 0xE1, 0xBA, + 0xBC, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xBA, 0xB8, + FIL_, 0xCC, 0x84, FIL_, 0xC4, 0x92, FIL_, 0xCC, + 0x86, FIL_, 0xC4, 0x94, FIL_, 0xCC, 0x87, FIL_, + 0xC4, 0x96, FIL_, 0xCC, 0xA8, FIL_, 0xC4, 0x98, + FIL_, 0xCC, 0x8C, FIL_, 0xC4, 0x9A, FIL_, 0x01, + 0xCC, 0x87, FIL_, 0xE1, 0xB8, 0x9E, FIL_, 0x07, + 0xCC, 0x8C, FIL_, 0xC7, 0xA6, FIL_, 0xCC, 0x87, + FIL_, 0xC4, 0xA0, FIL_, 0xCC, 0x84, FIL_, 0xE1, + 0xB8, 0xA0, FIL_, 0xCC, 0x82, FIL_, 0xC4, 0x9C, + FIL_, 0xCC, 0x81, FIL_, 0xC7, 0xB4, FIL_, 0xCC, + 0xA7, FIL_, 0xC4, 0xA2, FIL_, 0xCC, 0x86, FIL_, + 0xC4, 0x9E, FIL_, 0x07, 0xCC, 0xAE, FIL_, 
0xE1, + 0xB8, 0xAA, FIL_, 0xCC, 0x87, FIL_, 0xE1, 0xB8, + 0xA2, FIL_, 0xCC, 0x88, FIL_, 0xE1, 0xB8, 0xA6, + FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xB8, 0xA4, FIL_, + 0xCC, 0xA7, FIL_, 0xE1, 0xB8, 0xA8, FIL_, 0xCC, + 0x8C, FIL_, 0xC8, 0x9E, FIL_, 0xCC, 0x82, FIL_, + 0xC4, 0xA4, FIL_, 0x0F, 0xCC, 0x84, FIL_, 0xC4, + 0xAA, FIL_, 0xCC, 0x80, FIL_, 0xC3, 0x8C, FIL_, + 0xCC, 0xA8, FIL_, 0xC4, 0xAE, FIL_, 0xCC, 0x83, + FIL_, 0xC4, 0xA8, FIL_, 0xCC, 0x88, FIL_, 0xC3, + 0x8F, FIL_, 0xCC, 0x81, FIL_, 0xC3, 0x8D, FIL_, + 0xCC, 0x8F, FIL_, 0xC8, 0x88, FIL_, 0xCC, 0x86, + FIL_, 0xC4, 0xAC, FIL_, 0xCC, 0x91, FIL_, 0xC8, + 0x8A, FIL_, 0xCC, 0x8C, FIL_, 0xC7, 0x8F, FIL_, + 0xCC, 0x89, FIL_, 0xE1, 0xBB, 0x88, FIL_, 0xCC, + 0x87, FIL_, 0xC4, 0xB0, FIL_, 0xCC, 0xA3, FIL_, + 0xE1, 0xBB, 0x8A, FIL_, 0xCC, 0xB0, FIL_, 0xE1, + 0xB8, 0xAC, FIL_, 0xCC, 0x82, FIL_, 0xC3, 0x8E, + FIL_, 0x01, 0xCC, 0x82, FIL_, 0xC4, 0xB4, FIL_, + 0x05, 0xCC, 0x8C, FIL_, 0xC7, 0xA8, FIL_, 0xCC, + 0xB1, FIL_, 0xE1, 0xB8, 0xB4, FIL_, 0xCC, 0x81, + FIL_, 0xE1, 0xB8, 0xB0, FIL_, 0xCC, 0xA7, FIL_, + 0xC4, 0xB6, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xB8, + 0xB2, FIL_, 0x06, 0xCC, 0xA7, FIL_, 0xC4, 0xBB, + FIL_, 0xCC, 0x8C, FIL_, 0xC4, 0xBD, FIL_, 0xCC, + 0xB1, FIL_, 0xE1, 0xB8, 0xBA, FIL_, 0xCC, 0xA3, + FIL_, 0xE1, 0xB8, 0xB6, FIL_, 0xCC, 0xAD, FIL_, + 0xE1, 0xB8, 0xBC, FIL_, 0xCC, 0x81, FIL_, 0xC4, + 0xB9, FIL_, 0x03, 0xCC, 0x81, FIL_, 0xE1, 0xB8, + 0xBE, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xB9, 0x82, + FIL_, 0xCC, 0x87, FIL_, 0xE1, 0xB9, 0x80, FIL_, + 0x09, 0xCC, 0x80, FIL_, 0xC7, 0xB8, FIL_, 0xCC, + 0xAD, FIL_, 0xE1, 0xB9, 0x8A, FIL_, 0xCC, 0x87, + FIL_, 0xE1, 0xB9, 0x84, FIL_, 0xCC, 0xB1, FIL_, + 0xE1, 0xB9, 0x88, FIL_, 0xCC, 0x83, FIL_, 0xC3, + 0x91, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xB9, 0x86, + FIL_, 0xCC, 0x81, FIL_, 0xC5, 0x83, FIL_, 0xCC, + 0xA7, FIL_, 0xC5, 0x85, FIL_, 0xCC, 0x8C, FIL_, + 0xC5, 0x87, FIL_, 0x10, 0xCC, 0xA8, FIL_, 0xC7, + 0xAA, FIL_, 0xCC, 0x91, FIL_, 0xC8, 0x8E, FIL_, + 0xCC, 0x80, FIL_, 0xC3, 0x92, FIL_, 0xCC, 0x9B, + FIL_, 0xC6, 0xA0, FIL_, 0xCC, 0x8F, FIL_, 0xC8, + 0x8C, FIL_, 0xCC, 0x81, FIL_, 0xC3, 0x93, FIL_, + 0xCC, 0x87, FIL_, 0xC8, 0xAE, FIL_, 0xCC, 0x8C, + FIL_, 0xC7, 0x91, FIL_, 0xCC, 0xA3, FIL_, 0xE1, + 0xBB, 0x8C, FIL_, 0xCC, 0x82, FIL_, 0xC3, 0x94, + FIL_, 0xCC, 0x84, FIL_, 0xC5, 0x8C, FIL_, 0xCC, + 0x83, FIL_, 0xC3, 0x95, FIL_, 0xCC, 0x86, FIL_, + 0xC5, 0x8E, FIL_, 0xCC, 0x88, FIL_, 0xC3, 0x96, + FIL_, 0xCC, 0x8B, FIL_, 0xC5, 0x90, FIL_, 0xCC, + 0x89, FIL_, 0xE1, 0xBB, 0x8E, FIL_, 0x02, 0xCC, + 0x87, FIL_, 0xE1, 0xB9, 0x96, FIL_, 0xCC, 0x81, + FIL_, 0xE1, 0xB9, 0x94, FIL_, 0x08, 0xCC, 0x91, + FIL_, 0xC8, 0x92, FIL_, 0xCC, 0xA7, FIL_, 0xC5, + 0x96, FIL_, 0xCC, 0x8C, FIL_, 0xC5, 0x98, FIL_, + 0xCC, 0xB1, FIL_, 0xE1, 0xB9, 0x9E, FIL_, 0xCC, + 0xA3, FIL_, 0xE1, 0xB9, 0x9A, FIL_, 0xCC, 0x87, + FIL_, 0xE1, 0xB9, 0x98, FIL_, 0xCC, 0x81, FIL_, + 0xC5, 0x94, FIL_, 0xCC, 0x8F, FIL_, 0xC8, 0x90, + FIL_, 0x07, 0xCC, 0x81, FIL_, 0xC5, 0x9A, FIL_, + 0xCC, 0x82, FIL_, 0xC5, 0x9C, FIL_, 0xCC, 0xA7, + FIL_, 0xC5, 0x9E, FIL_, 0xCC, 0x8C, FIL_, 0xC5, + 0xA0, FIL_, 0xCC, 0xA6, FIL_, 0xC8, 0x98, FIL_, + 0xCC, 0x87, FIL_, 0xE1, 0xB9, 0xA0, FIL_, 0xCC, + 0xA3, FIL_, 0xE1, 0xB9, 0xA2, FIL_, 0x07, 0xCC, + 0x8C, FIL_, 0xC5, 0xA4, FIL_, 0xCC, 0xB1, FIL_, + 0xE1, 0xB9, 0xAE, FIL_, 0xCC, 0xA6, FIL_, 0xC8, + 0x9A, FIL_, 0xCC, 0xA7, FIL_, 0xC5, 0xA2, FIL_, + 0xCC, 0x87, FIL_, 0xE1, 0xB9, 0xAA, FIL_, 0xCC, + 0xAD, FIL_, 0xE1, 0xB9, 0xB0, FIL_, 0xCC, 0xA3, + FIL_, 0xE1, 0xB9, 0xAC, FIL_, 0x13, 0xCC, 0xA8, + FIL_, 0xC5, 0xB2, FIL_, 0xCC, 0x83, FIL_, 
0xC5, + 0xA8, FIL_, 0xCC, 0x84, FIL_, 0xC5, 0xAA, FIL_, + 0xCC, 0x81, FIL_, 0xC3, 0x9A, FIL_, 0xCC, 0x86, + FIL_, 0xC5, 0xAC, FIL_, 0xCC, 0x8A, FIL_, 0xC5, + 0xAE, FIL_, 0xCC, 0x80, FIL_, 0xC3, 0x99, FIL_, + 0xCC, 0x91, FIL_, 0xC8, 0x96, FIL_, 0xCC, 0x8B, + FIL_, 0xC5, 0xB0, FIL_, 0xCC, 0xA4, FIL_, 0xE1, + 0xB9, 0xB2, FIL_, 0xCC, 0xB0, FIL_, 0xE1, 0xB9, + 0xB4, FIL_, 0xCC, 0x8F, FIL_, 0xC8, 0x94, FIL_, + 0xCC, 0xAD, FIL_, 0xE1, 0xB9, 0xB6, FIL_, 0xCC, + 0x9B, FIL_, 0xC6, 0xAF, FIL_, 0xCC, 0x82, FIL_, + 0xC3, 0x9B, FIL_, 0xCC, 0x88, FIL_, 0xC3, 0x9C, + FIL_, 0xCC, 0x8C, FIL_, 0xC7, 0x93, FIL_, 0xCC, + 0xA3, FIL_, 0xE1, 0xBB, 0xA4, FIL_, 0xCC, 0x89, + FIL_, 0xE1, 0xBB, 0xA6, FIL_, 0x02, 0xCC, 0x83, + FIL_, 0xE1, 0xB9, 0xBC, FIL_, 0xCC, 0xA3, FIL_, + 0xE1, 0xB9, 0xBE, FIL_, 0x06, 0xCC, 0x82, FIL_, + 0xC5, 0xB4, FIL_, 0xCC, 0x88, FIL_, 0xE1, 0xBA, + 0x84, FIL_, 0xCC, 0x87, FIL_, 0xE1, 0xBA, 0x86, + FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xBA, 0x88, FIL_, + 0xCC, 0x81, FIL_, 0xE1, 0xBA, 0x82, FIL_, 0xCC, + 0x80, FIL_, 0xE1, 0xBA, 0x80, FIL_, 0x02, 0xCC, + 0x87, FIL_, 0xE1, 0xBA, 0x8A, FIL_, 0xCC, 0x88, + FIL_, 0xE1, 0xBA, 0x8C, FIL_, 0x09, 0xCC, 0x89, + FIL_, 0xE1, 0xBB, 0xB6, FIL_, 0xCC, 0x87, FIL_, + 0xE1, 0xBA, 0x8E, FIL_, 0xCC, 0xA3, FIL_, 0xE1, + 0xBB, 0xB4, FIL_, 0xCC, 0x81, FIL_, 0xC3, 0x9D, + FIL_, 0xCC, 0x84, FIL_, 0xC8, 0xB2, FIL_, 0xCC, + 0x82, FIL_, 0xC5, 0xB6, FIL_, 0xCC, 0x88, FIL_, + 0xC5, 0xB8, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBB, + 0xB2, FIL_, 0xCC, 0x83, FIL_, 0xE1, 0xBB, 0xB8, + FIL_, 0x06, 0xCC, 0x87, FIL_, 0xC5, 0xBB, FIL_, + 0xCC, 0xA3, FIL_, 0xE1, 0xBA, 0x92, FIL_, 0xCC, + 0x8C, FIL_, 0xC5, 0xBD, FIL_, 0xCC, 0xB1, FIL_, + 0xE1, 0xBA, 0x94, FIL_, 0xCC, 0x82, FIL_, 0xE1, + 0xBA, 0x90, FIL_, 0xCC, 0x81, FIL_, 0xC5, 0xB9, + FIL_, 0x10, 0xCC, 0x8C, FIL_, 0xC7, 0x8E, FIL_, + 0xCC, 0x8F, FIL_, 0xC8, 0x81, FIL_, 0xCC, 0xA8, + FIL_, 0xC4, 0x85, FIL_, 0xCC, 0xA3, FIL_, 0xE1, + 0xBA, 0xA1, FIL_, 0xCC, 0x86, FIL_, 0xC4, 0x83, + FIL_, 0xCC, 0x89, FIL_, 0xE1, 0xBA, 0xA3, FIL_, + 0xCC, 0x84, FIL_, 0xC4, 0x81, FIL_, 0xCC, 0x91, + FIL_, 0xC8, 0x83, FIL_, 0xCC, 0x8A, FIL_, 0xC3, + 0xA5, FIL_, 0xCC, 0x88, FIL_, 0xC3, 0xA4, FIL_, + 0xCC, 0x83, FIL_, 0xC3, 0xA3, FIL_, 0xCC, 0x82, + FIL_, 0xC3, 0xA2, FIL_, 0xCC, 0x81, FIL_, 0xC3, + 0xA1, FIL_, 0xCC, 0x80, FIL_, 0xC3, 0xA0, FIL_, + 0xCC, 0x87, FIL_, 0xC8, 0xA7, FIL_, 0xCC, 0xA5, + FIL_, 0xE1, 0xB8, 0x81, FIL_, 0x03, 0xCC, 0xB1, + FIL_, 0xE1, 0xB8, 0x87, FIL_, 0xCC, 0xA3, FIL_, + 0xE1, 0xB8, 0x85, FIL_, 0xCC, 0x87, FIL_, 0xE1, + 0xB8, 0x83, FIL_, 0x05, 0xCC, 0x87, FIL_, 0xC4, + 0x8B, FIL_, 0xCC, 0xA7, FIL_, 0xC3, 0xA7, FIL_, + 0xCC, 0x82, FIL_, 0xC4, 0x89, FIL_, 0xCC, 0x8C, + FIL_, 0xC4, 0x8D, FIL_, 0xCC, 0x81, FIL_, 0xC4, + 0x87, FIL_, 0x06, 0xCC, 0xAD, FIL_, 0xE1, 0xB8, + 0x93, FIL_, 0xCC, 0x87, FIL_, 0xE1, 0xB8, 0x8B, + FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xB8, 0x8D, FIL_, + 0xCC, 0xB1, FIL_, 0xE1, 0xB8, 0x8F, FIL_, 0xCC, + 0xA7, FIL_, 0xE1, 0xB8, 0x91, FIL_, 0xCC, 0x8C, + FIL_, 0xC4, 0x8F, FIL_, 0x11, 0xCC, 0xA8, FIL_, + 0xC4, 0x99, FIL_, 0xCC, 0x8C, FIL_, 0xC4, 0x9B, + FIL_, 0xCC, 0x87, FIL_, 0xC4, 0x97, FIL_, 0xCC, + 0x88, FIL_, 0xC3, 0xAB, FIL_, 0xCC, 0xA3, FIL_, + 0xE1, 0xBA, 0xB9, FIL_, 0xCC, 0xB0, FIL_, 0xE1, + 0xB8, 0x9B, FIL_, 0xCC, 0x84, FIL_, 0xC4, 0x93, + FIL_, 0xCC, 0xAD, FIL_, 0xE1, 0xB8, 0x99, FIL_, + 0xCC, 0x83, FIL_, 0xE1, 0xBA, 0xBD, FIL_, 0xCC, + 0x86, FIL_, 0xC4, 0x95, FIL_, 0xCC, 0xA7, FIL_, + 0xC8, 0xA9, FIL_, 0xCC, 0x89, FIL_, 0xE1, 0xBA, + 0xBB, FIL_, 0xCC, 0x8F, FIL_, 0xC8, 0x85, FIL_, + 0xCC, 0x81, FIL_, 0xC3, 0xA9, FIL_, 0xCC, 
0x91, + FIL_, 0xC8, 0x87, FIL_, 0xCC, 0x80, FIL_, 0xC3, + 0xA8, FIL_, 0xCC, 0x82, FIL_, 0xC3, 0xAA, FIL_, + 0x01, 0xCC, 0x87, FIL_, 0xE1, 0xB8, 0x9F, FIL_, + 0x07, 0xCC, 0x86, FIL_, 0xC4, 0x9F, FIL_, 0xCC, + 0xA7, FIL_, 0xC4, 0xA3, FIL_, 0xCC, 0x81, FIL_, + 0xC7, 0xB5, FIL_, 0xCC, 0x82, FIL_, 0xC4, 0x9D, + FIL_, 0xCC, 0x87, FIL_, 0xC4, 0xA1, FIL_, 0xCC, + 0x8C, FIL_, 0xC7, 0xA7, FIL_, 0xCC, 0x84, FIL_, + 0xE1, 0xB8, 0xA1, FIL_, 0x08, 0xCC, 0x8C, FIL_, + 0xC8, 0x9F, FIL_, 0xCC, 0x82, FIL_, 0xC4, 0xA5, + FIL_, 0xCC, 0x88, FIL_, 0xE1, 0xB8, 0xA7, FIL_, + 0xCC, 0x87, FIL_, 0xE1, 0xB8, 0xA3, FIL_, 0xCC, + 0xB1, FIL_, 0xE1, 0xBA, 0x96, FIL_, 0xCC, 0xA3, + FIL_, 0xE1, 0xB8, 0xA5, FIL_, 0xCC, 0xA7, FIL_, + 0xE1, 0xB8, 0xA9, FIL_, 0xCC, 0xAE, FIL_, 0xE1, + 0xB8, 0xAB, FIL_, 0x0E, 0xCC, 0x81, FIL_, 0xC3, + 0xAD, FIL_, 0xCC, 0x80, FIL_, 0xC3, 0xAC, FIL_, + 0xCC, 0xA3, FIL_, 0xE1, 0xBB, 0x8B, FIL_, 0xCC, + 0x8C, FIL_, 0xC7, 0x90, FIL_, 0xCC, 0x89, FIL_, + 0xE1, 0xBB, 0x89, FIL_, 0xCC, 0x91, FIL_, 0xC8, + 0x8B, FIL_, 0xCC, 0x8F, FIL_, 0xC8, 0x89, FIL_, + 0xCC, 0x82, FIL_, 0xC3, 0xAE, FIL_, 0xCC, 0xB0, + FIL_, 0xE1, 0xB8, 0xAD, FIL_, 0xCC, 0xA8, FIL_, + 0xC4, 0xAF, FIL_, 0xCC, 0x86, FIL_, 0xC4, 0xAD, + FIL_, 0xCC, 0x84, FIL_, 0xC4, 0xAB, FIL_, 0xCC, + 0x83, FIL_, 0xC4, 0xA9, FIL_, 0xCC, 0x88, FIL_, + 0xC3, 0xAF, FIL_, 0x02, 0xCC, 0x82, FIL_, 0xC4, + 0xB5, FIL_, 0xCC, 0x8C, FIL_, 0xC7, 0xB0, FIL_, + 0x05, 0xCC, 0xA3, FIL_, 0xE1, 0xB8, 0xB3, FIL_, + 0xCC, 0x81, FIL_, 0xE1, 0xB8, 0xB1, FIL_, 0xCC, + 0xA7, FIL_, 0xC4, 0xB7, FIL_, 0xCC, 0x8C, FIL_, + 0xC7, 0xA9, FIL_, 0xCC, 0xB1, FIL_, 0xE1, 0xB8, + 0xB5, FIL_, 0x06, 0xCC, 0xA3, FIL_, 0xE1, 0xB8, + 0xB7, FIL_, 0xCC, 0x81, FIL_, 0xC4, 0xBA, FIL_, + 0xCC, 0xA7, FIL_, 0xC4, 0xBC, FIL_, 0xCC, 0x8C, + FIL_, 0xC4, 0xBE, FIL_, 0xCC, 0xB1, FIL_, 0xE1, + 0xB8, 0xBB, FIL_, 0xCC, 0xAD, FIL_, 0xE1, 0xB8, + 0xBD, FIL_, 0x03, 0xCC, 0xA3, FIL_, 0xE1, 0xB9, + 0x83, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xB8, 0xBF, + FIL_, 0xCC, 0x87, FIL_, 0xE1, 0xB9, 0x81, FIL_, + 0x09, 0xCC, 0xA3, FIL_, 0xE1, 0xB9, 0x87, FIL_, + 0xCC, 0x83, FIL_, 0xC3, 0xB1, FIL_, 0xCC, 0x87, + FIL_, 0xE1, 0xB9, 0x85, FIL_, 0xCC, 0xB1, FIL_, + 0xE1, 0xB9, 0x89, FIL_, 0xCC, 0x81, FIL_, 0xC5, + 0x84, FIL_, 0xCC, 0xA7, FIL_, 0xC5, 0x86, FIL_, + 0xCC, 0xAD, FIL_, 0xE1, 0xB9, 0x8B, FIL_, 0xCC, + 0x8C, FIL_, 0xC5, 0x88, FIL_, 0xCC, 0x80, FIL_, + 0xC7, 0xB9, FIL_, 0x10, 0xCC, 0x89, FIL_, 0xE1, + 0xBB, 0x8F, FIL_, 0xCC, 0x81, FIL_, 0xC3, 0xB3, + FIL_, 0xCC, 0x80, FIL_, 0xC3, 0xB2, FIL_, 0xCC, + 0x87, FIL_, 0xC8, 0xAF, FIL_, 0xCC, 0x8F, FIL_, + 0xC8, 0x8D, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xBB, + 0x8D, FIL_, 0xCC, 0x84, FIL_, 0xC5, 0x8D, FIL_, + 0xCC, 0x8C, FIL_, 0xC7, 0x92, FIL_, 0xCC, 0x86, + FIL_, 0xC5, 0x8F, FIL_, 0xCC, 0x8B, FIL_, 0xC5, + 0x91, FIL_, 0xCC, 0x9B, FIL_, 0xC6, 0xA1, FIL_, + 0xCC, 0x91, FIL_, 0xC8, 0x8F, FIL_, 0xCC, 0xA8, + FIL_, 0xC7, 0xAB, FIL_, 0xCC, 0x88, FIL_, 0xC3, + 0xB6, FIL_, 0xCC, 0x83, FIL_, 0xC3, 0xB5, FIL_, + 0xCC, 0x82, FIL_, 0xC3, 0xB4, FIL_, 0x02, 0xCC, + 0x87, FIL_, 0xE1, 0xB9, 0x97, FIL_, 0xCC, 0x81, + FIL_, 0xE1, 0xB9, 0x95, FIL_, 0x08, 0xCC, 0xB1, + FIL_, 0xE1, 0xB9, 0x9F, FIL_, 0xCC, 0x87, FIL_, + 0xE1, 0xB9, 0x99, FIL_, 0xCC, 0x81, FIL_, 0xC5, + 0x95, FIL_, 0xCC, 0x8F, FIL_, 0xC8, 0x91, FIL_, + 0xCC, 0xA3, FIL_, 0xE1, 0xB9, 0x9B, FIL_, 0xCC, + 0x8C, FIL_, 0xC5, 0x99, FIL_, 0xCC, 0x91, FIL_, + 0xC8, 0x93, FIL_, 0xCC, 0xA7, FIL_, 0xC5, 0x97, + FIL_, 0x07, 0xCC, 0xA6, FIL_, 0xC8, 0x99, FIL_, + 0xCC, 0x8C, FIL_, 0xC5, 0xA1, FIL_, 0xCC, 0x81, + FIL_, 0xC5, 0x9B, FIL_, 0xCC, 0x87, FIL_, 
0xE1, + 0xB9, 0xA1, FIL_, 0xCC, 0x82, FIL_, 0xC5, 0x9D, + FIL_, 0xCC, 0xA7, FIL_, 0xC5, 0x9F, FIL_, 0xCC, + 0xA3, FIL_, 0xE1, 0xB9, 0xA3, FIL_, 0x08, 0xCC, + 0x88, FIL_, 0xE1, 0xBA, 0x97, FIL_, 0xCC, 0xAD, + FIL_, 0xE1, 0xB9, 0xB1, FIL_, 0xCC, 0xB1, FIL_, + 0xE1, 0xB9, 0xAF, FIL_, 0xCC, 0xA3, FIL_, 0xE1, + 0xB9, 0xAD, FIL_, 0xCC, 0x8C, FIL_, 0xC5, 0xA5, + FIL_, 0xCC, 0xA7, FIL_, 0xC5, 0xA3, FIL_, 0xCC, + 0x87, FIL_, 0xE1, 0xB9, 0xAB, FIL_, 0xCC, 0xA6, + FIL_, 0xC8, 0x9B, FIL_, 0x13, 0xCC, 0x81, FIL_, + 0xC3, 0xBA, FIL_, 0xCC, 0x91, FIL_, 0xC8, 0x97, + FIL_, 0xCC, 0x83, FIL_, 0xC5, 0xA9, FIL_, 0xCC, + 0x8F, FIL_, 0xC8, 0x95, FIL_, 0xCC, 0xA8, FIL_, + 0xC5, 0xB3, FIL_, 0xCC, 0x82, FIL_, 0xC3, 0xBB, + FIL_, 0xCC, 0x88, FIL_, 0xC3, 0xBC, FIL_, 0xCC, + 0x80, FIL_, 0xC3, 0xB9, FIL_, 0xCC, 0xA3, FIL_, + 0xE1, 0xBB, 0xA5, FIL_, 0xCC, 0xA4, FIL_, 0xE1, + 0xB9, 0xB3, FIL_, 0xCC, 0x89, FIL_, 0xE1, 0xBB, + 0xA7, FIL_, 0xCC, 0xB0, FIL_, 0xE1, 0xB9, 0xB5, + FIL_, 0xCC, 0xAD, FIL_, 0xE1, 0xB9, 0xB7, FIL_, + 0xCC, 0x9B, FIL_, 0xC6, 0xB0, FIL_, 0xCC, 0x84, + FIL_, 0xC5, 0xAB, FIL_, 0xCC, 0x8B, FIL_, 0xC5, + 0xB1, FIL_, 0xCC, 0x86, FIL_, 0xC5, 0xAD, FIL_, + 0xCC, 0x8C, FIL_, 0xC7, 0x94, FIL_, 0xCC, 0x8A, + FIL_, 0xC5, 0xAF, FIL_, 0x02, 0xCC, 0x83, FIL_, + 0xE1, 0xB9, 0xBD, FIL_, 0xCC, 0xA3, FIL_, 0xE1, + 0xB9, 0xBF, FIL_, 0x07, 0xCC, 0x82, FIL_, 0xC5, + 0xB5, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBA, 0x81, + FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBA, 0x83, FIL_, + 0xCC, 0x88, FIL_, 0xE1, 0xBA, 0x85, FIL_, 0xCC, + 0xA3, FIL_, 0xE1, 0xBA, 0x89, FIL_, 0xCC, 0x87, + FIL_, 0xE1, 0xBA, 0x87, FIL_, 0xCC, 0x8A, FIL_, + 0xE1, 0xBA, 0x98, FIL_, 0x02, 0xCC, 0x87, FIL_, + 0xE1, 0xBA, 0x8B, FIL_, 0xCC, 0x88, FIL_, 0xE1, + 0xBA, 0x8D, FIL_, 0x0A, 0xCC, 0x87, FIL_, 0xE1, + 0xBA, 0x8F, FIL_, 0xCC, 0x83, FIL_, 0xE1, 0xBB, + 0xB9, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBB, 0xB3, + FIL_, 0xCC, 0x89, FIL_, 0xE1, 0xBB, 0xB7, FIL_, + 0xCC, 0xA3, FIL_, 0xE1, 0xBB, 0xB5, FIL_, 0xCC, + 0x82, FIL_, 0xC5, 0xB7, FIL_, 0xCC, 0x84, FIL_, + 0xC8, 0xB3, FIL_, 0xCC, 0x8A, FIL_, 0xE1, 0xBA, + 0x99, FIL_, 0xCC, 0x88, FIL_, 0xC3, 0xBF, FIL_, + 0xCC, 0x81, FIL_, 0xC3, 0xBD, FIL_, 0x06, 0xCC, + 0x8C, FIL_, 0xC5, 0xBE, FIL_, 0xCC, 0x87, FIL_, + 0xC5, 0xBC, FIL_, 0xCC, 0xB1, FIL_, 0xE1, 0xBA, + 0x95, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xBA, 0x93, + FIL_, 0xCC, 0x81, FIL_, 0xC5, 0xBA, FIL_, 0xCC, + 0x82, FIL_, 0xE1, 0xBA, 0x91, FIL_, 0x03, 0xCC, + 0x80, FIL_, 0xE1, 0xBF, 0xAD, FIL_, 0xCD, 0x82, + FIL_, 0xE1, 0xBF, 0x81, FIL_, 0xCC, 0x81, FIL_, + 0xCE, 0x85, FIL_, 0x04, 0xCC, 0x89, FIL_, 0xE1, + 0xBA, 0xA8, FIL_, 0xCC, 0x83, FIL_, 0xE1, 0xBA, + 0xAA, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBA, 0xA4, + FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBA, 0xA6, FIL_, + 0x01, 0xCC, 0x84, FIL_, 0xC7, 0x9E, FIL_, 0x01, + 0xCC, 0x81, FIL_, 0xC7, 0xBA, FIL_, 0x02, 0xCC, + 0x84, FIL_, 0xC7, 0xA2, FIL_, 0xCC, 0x81, FIL_, + 0xC7, 0xBC, FIL_, 0x01, 0xCC, 0x81, FIL_, 0xE1, + 0xB8, 0x88, FIL_, 0x04, 0xCC, 0x81, FIL_, 0xE1, + 0xBA, 0xBE, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBB, + 0x80, FIL_, 0xCC, 0x83, FIL_, 0xE1, 0xBB, 0x84, + FIL_, 0xCC, 0x89, FIL_, 0xE1, 0xBB, 0x82, FIL_, + 0x01, 0xCC, 0x81, FIL_, 0xE1, 0xB8, 0xAE, FIL_, + 0x04, 0xCC, 0x83, FIL_, 0xE1, 0xBB, 0x96, FIL_, + 0xCC, 0x81, FIL_, 0xE1, 0xBB, 0x90, FIL_, 0xCC, + 0x80, FIL_, 0xE1, 0xBB, 0x92, FIL_, 0xCC, 0x89, + FIL_, 0xE1, 0xBB, 0x94, FIL_, 0x03, 0xCC, 0x84, + FIL_, 0xC8, 0xAC, FIL_, 0xCC, 0x81, FIL_, 0xE1, + 0xB9, 0x8C, FIL_, 0xCC, 0x88, FIL_, 0xE1, 0xB9, + 0x8E, FIL_, 0x01, 0xCC, 0x84, FIL_, 0xC8, 0xAA, + FIL_, 0x01, 0xCC, 0x81, FIL_, 0xC7, 0xBE, 
FIL_, + 0x04, 0xCC, 0x80, FIL_, 0xC7, 0x9B, FIL_, 0xCC, + 0x84, FIL_, 0xC7, 0x95, FIL_, 0xCC, 0x8C, FIL_, + 0xC7, 0x99, FIL_, 0xCC, 0x81, FIL_, 0xC7, 0x97, + FIL_, 0x04, 0xCC, 0x89, FIL_, 0xE1, 0xBA, 0xA9, + FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBA, 0xA7, FIL_, + 0xCC, 0x81, FIL_, 0xE1, 0xBA, 0xA5, FIL_, 0xCC, + 0x83, FIL_, 0xE1, 0xBA, 0xAB, FIL_, 0x01, 0xCC, + 0x84, FIL_, 0xC7, 0x9F, FIL_, 0x01, 0xCC, 0x81, + FIL_, 0xC7, 0xBB, FIL_, 0x02, 0xCC, 0x84, FIL_, + 0xC7, 0xA3, FIL_, 0xCC, 0x81, FIL_, 0xC7, 0xBD, + FIL_, 0x01, 0xCC, 0x81, FIL_, 0xE1, 0xB8, 0x89, + FIL_, 0x04, 0xCC, 0x89, FIL_, 0xE1, 0xBB, 0x83, + FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBA, 0xBF, FIL_, + 0xCC, 0x80, FIL_, 0xE1, 0xBB, 0x81, FIL_, 0xCC, + 0x83, FIL_, 0xE1, 0xBB, 0x85, FIL_, 0x01, 0xCC, + 0x81, FIL_, 0xE1, 0xB8, 0xAF, FIL_, 0x04, 0xCC, + 0x83, FIL_, 0xE1, 0xBB, 0x97, FIL_, 0xCC, 0x89, + FIL_, 0xE1, 0xBB, 0x95, FIL_, 0xCC, 0x80, FIL_, + 0xE1, 0xBB, 0x93, FIL_, 0xCC, 0x81, FIL_, 0xE1, + 0xBB, 0x91, FIL_, 0x03, 0xCC, 0x81, FIL_, 0xE1, + 0xB9, 0x8D, FIL_, 0xCC, 0x84, FIL_, 0xC8, 0xAD, + FIL_, 0xCC, 0x88, FIL_, 0xE1, 0xB9, 0x8F, FIL_, + 0x01, 0xCC, 0x84, FIL_, 0xC8, 0xAB, FIL_, 0x01, + 0xCC, 0x81, FIL_, 0xC7, 0xBF, FIL_, 0x04, 0xCC, + 0x81, FIL_, 0xC7, 0x98, FIL_, 0xCC, 0x84, FIL_, + 0xC7, 0x96, FIL_, 0xCC, 0x8C, FIL_, 0xC7, 0x9A, + FIL_, 0xCC, 0x80, FIL_, 0xC7, 0x9C, FIL_, 0x04, + 0xCC, 0x80, FIL_, 0xE1, 0xBA, 0xB0, FIL_, 0xCC, + 0x81, FIL_, 0xE1, 0xBA, 0xAE, FIL_, 0xCC, 0x83, + FIL_, 0xE1, 0xBA, 0xB4, FIL_, 0xCC, 0x89, FIL_, + 0xE1, 0xBA, 0xB2, FIL_, 0x04, 0xCC, 0x80, FIL_, + 0xE1, 0xBA, 0xB1, FIL_, 0xCC, 0x83, FIL_, 0xE1, + 0xBA, 0xB5, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBA, + 0xAF, FIL_, 0xCC, 0x89, FIL_, 0xE1, 0xBA, 0xB3, + FIL_, 0x02, 0xCC, 0x81, FIL_, 0xE1, 0xB8, 0x96, + FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xB8, 0x94, FIL_, + 0x02, 0xCC, 0x80, FIL_, 0xE1, 0xB8, 0x95, FIL_, + 0xCC, 0x81, FIL_, 0xE1, 0xB8, 0x97, FIL_, 0x02, + 0xCC, 0x80, FIL_, 0xE1, 0xB9, 0x90, FIL_, 0xCC, + 0x81, FIL_, 0xE1, 0xB9, 0x92, FIL_, 0x02, 0xCC, + 0x80, FIL_, 0xE1, 0xB9, 0x91, FIL_, 0xCC, 0x81, + FIL_, 0xE1, 0xB9, 0x93, FIL_, 0x01, 0xCC, 0x87, + FIL_, 0xE1, 0xB9, 0xA4, FIL_, 0x01, 0xCC, 0x87, + FIL_, 0xE1, 0xB9, 0xA5, FIL_, 0x01, 0xCC, 0x87, + FIL_, 0xE1, 0xB9, 0xA6, FIL_, 0x01, 0xCC, 0x87, + FIL_, 0xE1, 0xB9, 0xA7, FIL_, 0x01, 0xCC, 0x81, + FIL_, 0xE1, 0xB9, 0xB8, FIL_, 0x01, 0xCC, 0x81, + FIL_, 0xE1, 0xB9, 0xB9, FIL_, 0x01, 0xCC, 0x88, + FIL_, 0xE1, 0xB9, 0xBA, FIL_, 0x01, 0xCC, 0x88, + FIL_, 0xE1, 0xB9, 0xBB, FIL_, 0x01, 0xCC, 0x87, + FIL_, 0xE1, 0xBA, 0x9B, FIL_, 0x05, 0xCC, 0x80, + FIL_, 0xE1, 0xBB, 0x9C, FIL_, 0xCC, 0x81, FIL_, + 0xE1, 0xBB, 0x9A, FIL_, 0xCC, 0xA3, FIL_, 0xE1, + 0xBB, 0xA2, FIL_, 0xCC, 0x83, FIL_, 0xE1, 0xBB, + 0xA0, FIL_, 0xCC, 0x89, FIL_, 0xE1, 0xBB, 0x9E, + FIL_, 0x05, 0xCC, 0x83, FIL_, 0xE1, 0xBB, 0xA1, + FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBB, 0x9B, FIL_, + 0xCC, 0xA3, FIL_, 0xE1, 0xBB, 0xA3, FIL_, 0xCC, + 0x89, FIL_, 0xE1, 0xBB, 0x9F, FIL_, 0xCC, 0x80, + FIL_, 0xE1, 0xBB, 0x9D, FIL_, 0x05, 0xCC, 0x83, + FIL_, 0xE1, 0xBB, 0xAE, FIL_, 0xCC, 0xA3, FIL_, + 0xE1, 0xBB, 0xB0, FIL_, 0xCC, 0x89, FIL_, 0xE1, + 0xBB, 0xAC, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBB, + 0xA8, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBB, 0xAA, + FIL_, 0x05, 0xCC, 0xA3, FIL_, 0xE1, 0xBB, 0xB1, + FIL_, 0xCC, 0x83, FIL_, 0xE1, 0xBB, 0xAF, FIL_, + 0xCC, 0x89, FIL_, 0xE1, 0xBB, 0xAD, FIL_, 0xCC, + 0x81, FIL_, 0xE1, 0xBB, 0xA9, FIL_, 0xCC, 0x80, + FIL_, 0xE1, 0xBB, 0xAB, FIL_, 0x01, 0xCC, 0x8C, + FIL_, 0xC7, 0xAE, FIL_, 0x01, 0xCC, 0x84, FIL_, + 0xC7, 0xAC, FIL_, 0x01, 0xCC, 0x84, FIL_, 
0xC7, + 0xAD, FIL_, 0x01, 0xCC, 0x84, FIL_, 0xC7, 0xA0, + FIL_, 0x01, 0xCC, 0x84, FIL_, 0xC7, 0xA1, FIL_, + 0x01, 0xCC, 0x86, FIL_, 0xE1, 0xB8, 0x9C, FIL_, + 0x01, 0xCC, 0x86, FIL_, 0xE1, 0xB8, 0x9D, FIL_, + 0x01, 0xCC, 0x84, FIL_, 0xC8, 0xB0, FIL_, 0x01, + 0xCC, 0x84, FIL_, 0xC8, 0xB1, FIL_, 0x01, 0xCC, + 0x8C, FIL_, 0xC7, 0xAF, FIL_, 0x07, 0xCC, 0x93, + FIL_, 0xE1, 0xBC, 0x88, FIL_, 0xCC, 0x94, FIL_, + 0xE1, 0xBC, 0x89, FIL_, 0xCC, 0x81, FIL_, 0xCE, + 0x86, FIL_, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0xBC, + FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBE, 0xBA, FIL_, + 0xCC, 0x84, FIL_, 0xE1, 0xBE, 0xB9, FIL_, 0xCC, + 0x86, FIL_, 0xE1, 0xBE, 0xB8, FIL_, 0x04, 0xCC, + 0x81, FIL_, 0xCE, 0x88, FIL_, 0xCC, 0x94, FIL_, + 0xE1, 0xBC, 0x99, FIL_, 0xCC, 0x93, FIL_, 0xE1, + 0xBC, 0x98, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBF, + 0x88, FIL_, 0x05, 0xCC, 0x94, FIL_, 0xE1, 0xBC, + 0xA9, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBF, 0x8A, + FIL_, 0xCC, 0x81, FIL_, 0xCE, 0x89, FIL_, 0xCD, + 0x85, FIL_, 0xE1, 0xBF, 0x8C, FIL_, 0xCC, 0x93, + FIL_, 0xE1, 0xBC, 0xA8, FIL_, 0x07, 0xCC, 0x81, + FIL_, 0xCE, 0x8A, FIL_, 0xCC, 0x88, FIL_, 0xCE, + 0xAA, FIL_, 0xCC, 0x86, FIL_, 0xE1, 0xBF, 0x98, + FIL_, 0xCC, 0x84, FIL_, 0xE1, 0xBF, 0x99, FIL_, + 0xCC, 0x93, FIL_, 0xE1, 0xBC, 0xB8, FIL_, 0xCC, + 0x94, FIL_, 0xE1, 0xBC, 0xB9, FIL_, 0xCC, 0x80, + FIL_, 0xE1, 0xBF, 0x9A, FIL_, 0x04, 0xCC, 0x94, + FIL_, 0xE1, 0xBD, 0x89, FIL_, 0xCC, 0x80, FIL_, + 0xE1, 0xBF, 0xB8, FIL_, 0xCC, 0x81, FIL_, 0xCE, + 0x8C, FIL_, 0xCC, 0x93, FIL_, 0xE1, 0xBD, 0x88, + FIL_, 0x01, 0xCC, 0x94, FIL_, 0xE1, 0xBF, 0xAC, + FIL_, 0x06, 0xCC, 0x81, FIL_, 0xCE, 0x8E, FIL_, + 0xCC, 0x86, FIL_, 0xE1, 0xBF, 0xA8, FIL_, 0xCC, + 0x94, FIL_, 0xE1, 0xBD, 0x99, FIL_, 0xCC, 0x80, + FIL_, 0xE1, 0xBF, 0xAA, FIL_, 0xCC, 0x84, FIL_, + 0xE1, 0xBF, 0xA9, FIL_, 0xCC, 0x88, FIL_, 0xCE, + 0xAB, FIL_, 0x05, 0xCC, 0x80, FIL_, 0xE1, 0xBF, + 0xBA, FIL_, 0xCC, 0x81, FIL_, 0xCE, 0x8F, FIL_, + 0xCD, 0x85, FIL_, 0xE1, 0xBF, 0xBC, FIL_, 0xCC, + 0x94, FIL_, 0xE1, 0xBD, 0xA9, FIL_, 0xCC, 0x93, + FIL_, 0xE1, 0xBD, 0xA8, FIL_, 0x01, 0xCD, 0x85, + FIL_, 0xE1, 0xBE, 0xB4, FIL_, 0x01, 0xCD, 0x85, + FIL_, 0xE1, 0xBF, 0x84, FIL_, 0x08, 0xCC, 0x81, + FIL_, 0xCE, 0xAC, FIL_, 0xCC, 0x80, FIL_, 0xE1, + 0xBD, 0xB0, FIL_, 0xCC, 0x93, FIL_, 0xE1, 0xBC, + 0x80, FIL_, 0xCC, 0x94, FIL_, 0xE1, 0xBC, 0x81, + FIL_, 0xCD, 0x82, FIL_, 0xE1, 0xBE, 0xB6, FIL_, + 0xCC, 0x86, FIL_, 0xE1, 0xBE, 0xB0, FIL_, 0xCD, + 0x85, FIL_, 0xE1, 0xBE, 0xB3, FIL_, 0xCC, 0x84, + FIL_, 0xE1, 0xBE, 0xB1, FIL_, 0x04, 0xCC, 0x81, + FIL_, 0xCE, 0xAD, FIL_, 0xCC, 0x94, FIL_, 0xE1, + 0xBC, 0x91, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBD, + 0xB2, FIL_, 0xCC, 0x93, FIL_, 0xE1, 0xBC, 0x90, + FIL_, 0x06, 0xCC, 0x81, FIL_, 0xCE, 0xAE, FIL_, + 0xCC, 0x80, FIL_, 0xE1, 0xBD, 0xB4, FIL_, 0xCD, + 0x85, FIL_, 0xE1, 0xBF, 0x83, FIL_, 0xCD, 0x82, + FIL_, 0xE1, 0xBF, 0x86, FIL_, 0xCC, 0x94, FIL_, + 0xE1, 0xBC, 0xA1, FIL_, 0xCC, 0x93, FIL_, 0xE1, + 0xBC, 0xA0, FIL_, 0x08, 0xCD, 0x82, FIL_, 0xE1, + 0xBF, 0x96, FIL_, 0xCC, 0x86, FIL_, 0xE1, 0xBF, + 0x90, FIL_, 0xCC, 0x93, FIL_, 0xE1, 0xBC, 0xB0, + FIL_, 0xCC, 0x81, FIL_, 0xCE, 0xAF, FIL_, 0xCC, + 0x94, FIL_, 0xE1, 0xBC, 0xB1, FIL_, 0xCC, 0x84, + FIL_, 0xE1, 0xBF, 0x91, FIL_, 0xCC, 0x88, FIL_, + 0xCF, 0x8A, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBD, + 0xB6, FIL_, 0x04, 0xCC, 0x81, FIL_, 0xCF, 0x8C, + FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBD, 0xB8, FIL_, + 0xCC, 0x93, FIL_, 0xE1, 0xBD, 0x80, FIL_, 0xCC, + 0x94, FIL_, 0xE1, 0xBD, 0x81, FIL_, 0x02, 0xCC, + 0x93, FIL_, 0xE1, 0xBF, 0xA4, FIL_, 0xCC, 0x94, + FIL_, 0xE1, 0xBF, 0xA5, FIL_, 0x08, 0xCC, 
0x93, + FIL_, 0xE1, 0xBD, 0x90, FIL_, 0xCC, 0x94, FIL_, + 0xE1, 0xBD, 0x91, FIL_, 0xCC, 0x86, FIL_, 0xE1, + 0xBF, 0xA0, FIL_, 0xCD, 0x82, FIL_, 0xE1, 0xBF, + 0xA6, FIL_, 0xCC, 0x84, FIL_, 0xE1, 0xBF, 0xA1, + FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBD, 0xBA, FIL_, + 0xCC, 0x81, FIL_, 0xCF, 0x8D, FIL_, 0xCC, 0x88, + FIL_, 0xCF, 0x8B, FIL_, 0x06, 0xCC, 0x94, FIL_, + 0xE1, 0xBD, 0xA1, FIL_, 0xCD, 0x85, FIL_, 0xE1, + 0xBF, 0xB3, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBD, + 0xBC, FIL_, 0xCD, 0x82, FIL_, 0xE1, 0xBF, 0xB6, + FIL_, 0xCC, 0x93, FIL_, 0xE1, 0xBD, 0xA0, FIL_, + 0xCC, 0x81, FIL_, 0xCF, 0x8E, FIL_, 0x03, 0xCD, + 0x82, FIL_, 0xE1, 0xBF, 0x97, FIL_, 0xCC, 0x80, + FIL_, 0xE1, 0xBF, 0x92, FIL_, 0xCC, 0x81, FIL_, + 0xCE, 0x90, FIL_, 0x03, 0xCC, 0x80, FIL_, 0xE1, + 0xBF, 0xA2, FIL_, 0xCC, 0x81, FIL_, 0xCE, 0xB0, + FIL_, 0xCD, 0x82, FIL_, 0xE1, 0xBF, 0xA7, FIL_, + 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBF, 0xB4, FIL_, + 0x02, 0xCC, 0x88, FIL_, 0xCF, 0x94, FIL_, 0xCC, + 0x81, FIL_, 0xCF, 0x93, FIL_, 0x01, 0xCC, 0x88, + FIL_, 0xD0, 0x87, FIL_, 0x02, 0xCC, 0x86, FIL_, + 0xD3, 0x90, FIL_, 0xCC, 0x88, FIL_, 0xD3, 0x92, + FIL_, 0x01, 0xCC, 0x81, FIL_, 0xD0, 0x83, FIL_, + 0x03, 0xCC, 0x86, FIL_, 0xD3, 0x96, FIL_, 0xCC, + 0x80, FIL_, 0xD0, 0x80, FIL_, 0xCC, 0x88, FIL_, + 0xD0, 0x81, FIL_, 0x02, 0xCC, 0x88, FIL_, 0xD3, + 0x9C, FIL_, 0xCC, 0x86, FIL_, 0xD3, 0x81, FIL_, + 0x01, 0xCC, 0x88, FIL_, 0xD3, 0x9E, FIL_, 0x04, + 0xCC, 0x80, FIL_, 0xD0, 0x8D, FIL_, 0xCC, 0x88, + FIL_, 0xD3, 0xA4, FIL_, 0xCC, 0x86, FIL_, 0xD0, + 0x99, FIL_, 0xCC, 0x84, FIL_, 0xD3, 0xA2, FIL_, + 0x01, 0xCC, 0x81, FIL_, 0xD0, 0x8C, FIL_, 0x01, + 0xCC, 0x88, FIL_, 0xD3, 0xA6, FIL_, 0x04, 0xCC, + 0x86, FIL_, 0xD0, 0x8E, FIL_, 0xCC, 0x8B, FIL_, + 0xD3, 0xB2, FIL_, 0xCC, 0x88, FIL_, 0xD3, 0xB0, + FIL_, 0xCC, 0x84, FIL_, 0xD3, 0xAE, FIL_, 0x01, + 0xCC, 0x88, FIL_, 0xD3, 0xB4, FIL_, 0x01, 0xCC, + 0x88, FIL_, 0xD3, 0xB8, FIL_, 0x01, 0xCC, 0x88, + FIL_, 0xD3, 0xAC, FIL_, 0x02, 0xCC, 0x86, FIL_, + 0xD3, 0x91, FIL_, 0xCC, 0x88, FIL_, 0xD3, 0x93, + FIL_, 0x01, 0xCC, 0x81, FIL_, 0xD1, 0x93, FIL_, + 0x03, 0xCC, 0x80, FIL_, 0xD1, 0x90, FIL_, 0xCC, + 0x88, FIL_, 0xD1, 0x91, FIL_, 0xCC, 0x86, FIL_, + 0xD3, 0x97, FIL_, 0x02, 0xCC, 0x88, FIL_, 0xD3, + 0x9D, FIL_, 0xCC, 0x86, FIL_, 0xD3, 0x82, FIL_, + 0x01, 0xCC, 0x88, FIL_, 0xD3, 0x9F, FIL_, 0x04, + 0xCC, 0x88, FIL_, 0xD3, 0xA5, FIL_, 0xCC, 0x86, + FIL_, 0xD0, 0xB9, FIL_, 0xCC, 0x80, FIL_, 0xD1, + 0x9D, FIL_, 0xCC, 0x84, FIL_, 0xD3, 0xA3, FIL_, + 0x01, 0xCC, 0x81, FIL_, 0xD1, 0x9C, FIL_, 0x01, + 0xCC, 0x88, FIL_, 0xD3, 0xA7, FIL_, 0x04, 0xCC, + 0x84, FIL_, 0xD3, 0xAF, FIL_, 0xCC, 0x86, FIL_, + 0xD1, 0x9E, FIL_, 0xCC, 0x8B, FIL_, 0xD3, 0xB3, + FIL_, 0xCC, 0x88, FIL_, 0xD3, 0xB1, FIL_, 0x01, + 0xCC, 0x88, FIL_, 0xD3, 0xB5, FIL_, 0x01, 0xCC, + 0x88, FIL_, 0xD3, 0xB9, FIL_, 0x01, 0xCC, 0x88, + FIL_, 0xD3, 0xAD, FIL_, 0x01, 0xCC, 0x88, FIL_, + 0xD1, 0x97, FIL_, 0x01, 0xCC, 0x8F, FIL_, 0xD1, + 0xB6, FIL_, 0x01, 0xCC, 0x8F, FIL_, 0xD1, 0xB7, + FIL_, 0x01, 0xCC, 0x88, FIL_, 0xD3, 0x9A, FIL_, + 0x01, 0xCC, 0x88, FIL_, 0xD3, 0x9B, FIL_, 0x01, + 0xCC, 0x88, FIL_, 0xD3, 0xAA, FIL_, 0x01, 0xCC, + 0x88, FIL_, 0xD3, 0xAB, FIL_, 0x03, 0xD9, 0x94, + FIL_, 0xD8, 0xA3, FIL_, 0xD9, 0x93, FIL_, 0xD8, + 0xA2, FIL_, 0xD9, 0x95, FIL_, 0xD8, 0xA5, FIL_, + 0x01, 0xD9, 0x94, FIL_, 0xD8, 0xA4, FIL_, 0x01, + 0xD9, 0x94, FIL_, 0xD8, 0xA6, FIL_, 0x01, 0xD9, + 0x94, FIL_, 0xDB, 0x82, FIL_, 0x01, 0xD9, 0x94, + FIL_, 0xDB, 0x93, FIL_, 0x01, 0xD9, 0x94, FIL_, + 0xDB, 0x80, FIL_, 0x01, 0xE0, 0xA4, 0xBC, FIL_, + 0xE0, 0xA4, 0xA9, FIL_, 0x01, 0xE0, 0xA4, 
0xBC, + FIL_, 0xE0, 0xA4, 0xB1, FIL_, 0x01, 0xE0, 0xA4, + 0xBC, FIL_, 0xE0, 0xA4, 0xB4, FIL_, 0x02, 0xE0, + 0xA6, 0xBE, FIL_, 0xE0, 0xA7, 0x8B, FIL_, 0xE0, + 0xA7, 0x97, FIL_, 0xE0, 0xA7, 0x8C, FIL_, 0x03, + 0xE0, 0xAD, 0x97, FIL_, 0xE0, 0xAD, 0x8C, FIL_, + 0xE0, 0xAC, 0xBE, FIL_, 0xE0, 0xAD, 0x8B, FIL_, + 0xE0, 0xAD, 0x96, FIL_, 0xE0, 0xAD, 0x88, FIL_, + 0x01, 0xE0, 0xAF, 0x97, FIL_, 0xE0, 0xAE, 0x94, + FIL_, 0x02, 0xE0, 0xAE, 0xBE, FIL_, 0xE0, 0xAF, + 0x8A, FIL_, 0xE0, 0xAF, 0x97, FIL_, 0xE0, 0xAF, + 0x8C, FIL_, 0x01, 0xE0, 0xAE, 0xBE, FIL_, 0xE0, + 0xAF, 0x8B, FIL_, 0x01, 0xE0, 0xB1, 0x96, FIL_, + 0xE0, 0xB1, 0x88, FIL_, 0x01, 0xE0, 0xB3, 0x95, + FIL_, 0xE0, 0xB3, 0x80, FIL_, 0x03, 0xE0, 0xB3, + 0x95, FIL_, 0xE0, 0xB3, 0x87, FIL_, 0xE0, 0xB3, + 0x82, FIL_, 0xE0, 0xB3, 0x8A, FIL_, 0xE0, 0xB3, + 0x96, FIL_, 0xE0, 0xB3, 0x88, FIL_, 0x01, 0xE0, + 0xB3, 0x95, FIL_, 0xE0, 0xB3, 0x8B, FIL_, 0x02, + 0xE0, 0xB4, 0xBE, FIL_, 0xE0, 0xB5, 0x8A, FIL_, + 0xE0, 0xB5, 0x97, FIL_, 0xE0, 0xB5, 0x8C, FIL_, + 0x01, 0xE0, 0xB4, 0xBE, FIL_, 0xE0, 0xB5, 0x8B, + FIL_, 0x03, 0xE0, 0xB7, 0x8F, FIL_, 0xE0, 0xB7, + 0x9C, FIL_, 0xE0, 0xB7, 0x8A, FIL_, 0xE0, 0xB7, + 0x9A, FIL_, 0xE0, 0xB7, 0x9F, FIL_, 0xE0, 0xB7, + 0x9E, FIL_, 0x01, 0xE0, 0xB7, 0x8A, FIL_, 0xE0, + 0xB7, 0x9D, FIL_, 0x01, 0xE1, 0x80, 0xAE, FIL_, + 0xE1, 0x80, 0xA6, FIL_, 0x01, 0xCC, 0x84, FIL_, + 0xE1, 0xB8, 0xB8, FIL_, 0x01, 0xCC, 0x84, FIL_, + 0xE1, 0xB8, 0xB9, FIL_, 0x01, 0xCC, 0x84, FIL_, + 0xE1, 0xB9, 0x9C, FIL_, 0x01, 0xCC, 0x84, FIL_, + 0xE1, 0xB9, 0x9D, FIL_, 0x01, 0xCC, 0x87, FIL_, + 0xE1, 0xB9, 0xA8, FIL_, 0x01, 0xCC, 0x87, FIL_, + 0xE1, 0xB9, 0xA9, FIL_, 0x02, 0xCC, 0x86, FIL_, + 0xE1, 0xBA, 0xB6, FIL_, 0xCC, 0x82, FIL_, 0xE1, + 0xBA, 0xAC, FIL_, 0x02, 0xCC, 0x86, FIL_, 0xE1, + 0xBA, 0xB7, FIL_, 0xCC, 0x82, FIL_, 0xE1, 0xBA, + 0xAD, FIL_, 0x01, 0xCC, 0x82, FIL_, 0xE1, 0xBB, + 0x86, FIL_, 0x01, 0xCC, 0x82, FIL_, 0xE1, 0xBB, + 0x87, FIL_, 0x01, 0xCC, 0x82, FIL_, 0xE1, 0xBB, + 0x98, FIL_, 0x01, 0xCC, 0x82, FIL_, 0xE1, 0xBB, + 0x99, FIL_, 0x04, 0xCC, 0x80, FIL_, 0xE1, 0xBC, + 0x82, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBC, 0x84, + FIL_, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x80, FIL_, + 0xCD, 0x82, FIL_, 0xE1, 0xBC, 0x86, FIL_, 0x04, + 0xCD, 0x82, FIL_, 0xE1, 0xBC, 0x87, FIL_, 0xCC, + 0x80, FIL_, 0xE1, 0xBC, 0x83, FIL_, 0xCC, 0x81, + FIL_, 0xE1, 0xBC, 0x85, FIL_, 0xCD, 0x85, FIL_, + 0xE1, 0xBE, 0x81, FIL_, 0x01, 0xCD, 0x85, FIL_, + 0xE1, 0xBE, 0x82, FIL_, 0x01, 0xCD, 0x85, FIL_, + 0xE1, 0xBE, 0x83, FIL_, 0x01, 0xCD, 0x85, FIL_, + 0xE1, 0xBE, 0x84, FIL_, 0x01, 0xCD, 0x85, FIL_, + 0xE1, 0xBE, 0x85, FIL_, 0x01, 0xCD, 0x85, FIL_, + 0xE1, 0xBE, 0x86, FIL_, 0x01, 0xCD, 0x85, FIL_, + 0xE1, 0xBE, 0x87, FIL_, 0x04, 0xCD, 0x85, FIL_, + 0xE1, 0xBE, 0x88, FIL_, 0xCC, 0x80, FIL_, 0xE1, + 0xBC, 0x8A, FIL_, 0xCD, 0x82, FIL_, 0xE1, 0xBC, + 0x8E, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBC, 0x8C, + FIL_, 0x04, 0xCC, 0x81, FIL_, 0xE1, 0xBC, 0x8D, + FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBC, 0x8B, FIL_, + 0xCD, 0x82, FIL_, 0xE1, 0xBC, 0x8F, FIL_, 0xCD, + 0x85, FIL_, 0xE1, 0xBE, 0x89, FIL_, 0x01, 0xCD, + 0x85, FIL_, 0xE1, 0xBE, 0x8A, FIL_, 0x01, 0xCD, + 0x85, FIL_, 0xE1, 0xBE, 0x8B, FIL_, 0x01, 0xCD, + 0x85, FIL_, 0xE1, 0xBE, 0x8C, FIL_, 0x01, 0xCD, + 0x85, FIL_, 0xE1, 0xBE, 0x8D, FIL_, 0x01, 0xCD, + 0x85, FIL_, 0xE1, 0xBE, 0x8E, FIL_, 0x01, 0xCD, + 0x85, FIL_, 0xE1, 0xBE, 0x8F, FIL_, 0x02, 0xCC, + 0x80, FIL_, 0xE1, 0xBC, 0x92, FIL_, 0xCC, 0x81, + FIL_, 0xE1, 0xBC, 0x94, FIL_, 0x02, 0xCC, 0x80, + FIL_, 0xE1, 0xBC, 0x93, FIL_, 0xCC, 0x81, FIL_, + 0xE1, 0xBC, 0x95, FIL_, 0x02, 0xCC, 0x80, 
FIL_, + 0xE1, 0xBC, 0x9A, FIL_, 0xCC, 0x81, FIL_, 0xE1, + 0xBC, 0x9C, FIL_, 0x02, 0xCC, 0x80, FIL_, 0xE1, + 0xBC, 0x9B, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBC, + 0x9D, FIL_, 0x04, 0xCD, 0x82, FIL_, 0xE1, 0xBC, + 0xA6, FIL_, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x90, + FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBC, 0xA4, FIL_, + 0xCC, 0x80, FIL_, 0xE1, 0xBC, 0xA2, FIL_, 0x04, + 0xCC, 0x80, FIL_, 0xE1, 0xBC, 0xA3, FIL_, 0xCC, + 0x81, FIL_, 0xE1, 0xBC, 0xA5, FIL_, 0xCD, 0x82, + FIL_, 0xE1, 0xBC, 0xA7, FIL_, 0xCD, 0x85, FIL_, + 0xE1, 0xBE, 0x91, FIL_, 0x01, 0xCD, 0x85, FIL_, + 0xE1, 0xBE, 0x92, FIL_, 0x01, 0xCD, 0x85, FIL_, + 0xE1, 0xBE, 0x93, FIL_, 0x01, 0xCD, 0x85, FIL_, + 0xE1, 0xBE, 0x94, FIL_, 0x01, 0xCD, 0x85, FIL_, + 0xE1, 0xBE, 0x95, FIL_, 0x01, 0xCD, 0x85, FIL_, + 0xE1, 0xBE, 0x96, FIL_, 0x01, 0xCD, 0x85, FIL_, + 0xE1, 0xBE, 0x97, FIL_, 0x04, 0xCD, 0x82, FIL_, + 0xE1, 0xBC, 0xAE, FIL_, 0xCC, 0x81, FIL_, 0xE1, + 0xBC, 0xAC, FIL_, 0xCD, 0x85, FIL_, 0xE1, 0xBE, + 0x98, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBC, 0xAA, + FIL_, 0x04, 0xCD, 0x82, FIL_, 0xE1, 0xBC, 0xAF, + FIL_, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x99, FIL_, + 0xCC, 0x81, FIL_, 0xE1, 0xBC, 0xAD, FIL_, 0xCC, + 0x80, FIL_, 0xE1, 0xBC, 0xAB, FIL_, 0x01, 0xCD, + 0x85, FIL_, 0xE1, 0xBE, 0x9A, FIL_, 0x01, 0xCD, + 0x85, FIL_, 0xE1, 0xBE, 0x9B, FIL_, 0x01, 0xCD, + 0x85, FIL_, 0xE1, 0xBE, 0x9C, FIL_, 0x01, 0xCD, + 0x85, FIL_, 0xE1, 0xBE, 0x9D, FIL_, 0x01, 0xCD, + 0x85, FIL_, 0xE1, 0xBE, 0x9E, FIL_, 0x01, 0xCD, + 0x85, FIL_, 0xE1, 0xBE, 0x9F, FIL_, 0x03, 0xCC, + 0x81, FIL_, 0xE1, 0xBC, 0xB4, FIL_, 0xCD, 0x82, + FIL_, 0xE1, 0xBC, 0xB6, FIL_, 0xCC, 0x80, FIL_, + 0xE1, 0xBC, 0xB2, FIL_, 0x03, 0xCC, 0x81, FIL_, + 0xE1, 0xBC, 0xB5, FIL_, 0xCD, 0x82, FIL_, 0xE1, + 0xBC, 0xB7, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBC, + 0xB3, FIL_, 0x03, 0xCC, 0x81, FIL_, 0xE1, 0xBC, + 0xBC, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBC, 0xBA, + FIL_, 0xCD, 0x82, FIL_, 0xE1, 0xBC, 0xBE, FIL_, + 0x03, 0xCC, 0x80, FIL_, 0xE1, 0xBC, 0xBB, FIL_, + 0xCD, 0x82, FIL_, 0xE1, 0xBC, 0xBF, FIL_, 0xCC, + 0x81, FIL_, 0xE1, 0xBC, 0xBD, FIL_, 0x02, 0xCC, + 0x80, FIL_, 0xE1, 0xBD, 0x82, FIL_, 0xCC, 0x81, + FIL_, 0xE1, 0xBD, 0x84, FIL_, 0x02, 0xCC, 0x80, + FIL_, 0xE1, 0xBD, 0x83, FIL_, 0xCC, 0x81, FIL_, + 0xE1, 0xBD, 0x85, FIL_, 0x02, 0xCC, 0x81, FIL_, + 0xE1, 0xBD, 0x8C, FIL_, 0xCC, 0x80, FIL_, 0xE1, + 0xBD, 0x8A, FIL_, 0x02, 0xCC, 0x81, FIL_, 0xE1, + 0xBD, 0x8D, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBD, + 0x8B, FIL_, 0x03, 0xCC, 0x81, FIL_, 0xE1, 0xBD, + 0x94, FIL_, 0xCD, 0x82, FIL_, 0xE1, 0xBD, 0x96, + FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBD, 0x92, FIL_, + 0x03, 0xCD, 0x82, FIL_, 0xE1, 0xBD, 0x97, FIL_, + 0xCC, 0x81, FIL_, 0xE1, 0xBD, 0x95, FIL_, 0xCC, + 0x80, FIL_, 0xE1, 0xBD, 0x93, FIL_, 0x03, 0xCC, + 0x81, FIL_, 0xE1, 0xBD, 0x9D, FIL_, 0xCD, 0x82, + FIL_, 0xE1, 0xBD, 0x9F, FIL_, 0xCC, 0x80, FIL_, + 0xE1, 0xBD, 0x9B, FIL_, 0x04, 0xCC, 0x81, FIL_, + 0xE1, 0xBD, 0xA4, FIL_, 0xCC, 0x80, FIL_, 0xE1, + 0xBD, 0xA2, FIL_, 0xCD, 0x82, FIL_, 0xE1, 0xBD, + 0xA6, FIL_, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0xA0, + FIL_, 0x04, 0xCD, 0x82, FIL_, 0xE1, 0xBD, 0xA7, + FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBD, 0xA5, FIL_, + 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0xA1, FIL_, 0xCC, + 0x80, FIL_, 0xE1, 0xBD, 0xA3, FIL_, 0x01, 0xCD, + 0x85, FIL_, 0xE1, 0xBE, 0xA2, FIL_, 0x01, 0xCD, + 0x85, FIL_, 0xE1, 0xBE, 0xA3, FIL_, 0x01, 0xCD, + 0x85, FIL_, 0xE1, 0xBE, 0xA4, FIL_, 0x01, 0xCD, + 0x85, FIL_, 0xE1, 0xBE, 0xA5, FIL_, 0x01, 0xCD, + 0x85, FIL_, 0xE1, 0xBE, 0xA6, FIL_, 0x01, 0xCD, + 0x85, FIL_, 0xE1, 0xBE, 0xA7, FIL_, 0x04, 0xCC, + 0x81, FIL_, 0xE1, 0xBD, 0xAC, FIL_, 0xCC, 
0x80, + FIL_, 0xE1, 0xBD, 0xAA, FIL_, 0xCD, 0x82, FIL_, + 0xE1, 0xBD, 0xAE, FIL_, 0xCD, 0x85, FIL_, 0xE1, + 0xBE, 0xA8, FIL_, 0x04, 0xCC, 0x81, FIL_, 0xE1, + 0xBD, 0xAD, FIL_, 0xCD, 0x85, FIL_, 0xE1, 0xBE, + 0xA9, FIL_, 0xCD, 0x82, FIL_, 0xE1, 0xBD, 0xAF, + FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBD, 0xAB, FIL_, + 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0xAA, FIL_, + 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0xAB, FIL_, + 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0xAC, FIL_, + 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0xAD, FIL_, + 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0xAE, FIL_, + 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0xAF, FIL_, + 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0xB2, FIL_, + 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBF, 0x82, FIL_, + 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBF, 0xB2, FIL_, + 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0xB7, FIL_, + 0x03, 0xCD, 0x82, FIL_, 0xE1, 0xBF, 0x8F, FIL_, + 0xCC, 0x80, FIL_, 0xE1, 0xBF, 0x8D, FIL_, 0xCC, + 0x81, FIL_, 0xE1, 0xBF, 0x8E, FIL_, 0x01, 0xCD, + 0x85, FIL_, 0xE1, 0xBF, 0x87, FIL_, 0x01, 0xCD, + 0x85, FIL_, 0xE1, 0xBF, 0xB7, FIL_, 0x03, 0xCC, + 0x80, FIL_, 0xE1, 0xBF, 0x9D, FIL_, 0xCD, 0x82, + FIL_, 0xE1, 0xBF, 0x9F, FIL_, 0xCC, 0x81, FIL_, + 0xE1, 0xBF, 0x9E, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x86, 0x9A, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x86, 0x9B, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x86, 0xAE, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x87, 0x8D, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x87, 0x8F, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x87, 0x8E, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x88, 0x84, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x88, 0x89, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x88, 0x8C, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x88, 0xA4, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x88, 0xA6, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x89, 0x81, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x89, 0x84, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x89, 0x87, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x89, 0x89, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x89, 0xAD, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x89, 0xA2, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x89, 0xB0, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x89, 0xB1, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x89, 0xB4, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x89, 0xB5, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x89, 0xB8, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x89, 0xB9, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x8A, 0x80, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x8A, 0x81, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x8B, 0xA0, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x8B, 0xA1, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x8A, 0x84, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x8A, 0x85, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x8A, 0x88, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x8A, 0x89, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x8B, 0xA2, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x8B, 0xA3, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x8A, 0xAC, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x8A, 0xAD, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x8A, 0xAE, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x8A, 0xAF, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x8B, 0xAA, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x8B, 0xAB, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x8B, 0xAC, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x8B, 0xAD, FIL_, 0x01, 0xE3, 0x82, 0x99, + FIL_, 0xE3, 0x82, 0x94, FIL_, 0x01, 0xE3, 0x82, + 0x99, FIL_, 0xE3, 0x81, 0x8C, FIL_, 0x01, 0xE3, + 0x82, 0x99, FIL_, 0xE3, 0x81, 0x8E, FIL_, 0x01, + 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x81, 0x90, FIL_, + 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x81, 0x92, + FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 
0x81, + 0x94, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, + 0x81, 0x96, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, + 0xE3, 0x81, 0x98, FIL_, 0x01, 0xE3, 0x82, 0x99, + FIL_, 0xE3, 0x81, 0x9A, FIL_, 0x01, 0xE3, 0x82, + 0x99, FIL_, 0xE3, 0x81, 0x9C, FIL_, 0x01, 0xE3, + 0x82, 0x99, FIL_, 0xE3, 0x81, 0x9E, FIL_, 0x01, + 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x81, 0xA0, FIL_, + 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x81, 0xA2, + FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x81, + 0xA5, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, + 0x81, 0xA7, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, + 0xE3, 0x81, 0xA9, FIL_, 0x02, 0xE3, 0x82, 0x9A, + FIL_, 0xE3, 0x81, 0xB1, FIL_, 0xE3, 0x82, 0x99, + FIL_, 0xE3, 0x81, 0xB0, FIL_, 0x02, 0xE3, 0x82, + 0x9A, FIL_, 0xE3, 0x81, 0xB4, FIL_, 0xE3, 0x82, + 0x99, FIL_, 0xE3, 0x81, 0xB3, FIL_, 0x02, 0xE3, + 0x82, 0x9A, FIL_, 0xE3, 0x81, 0xB7, FIL_, 0xE3, + 0x82, 0x99, FIL_, 0xE3, 0x81, 0xB6, FIL_, 0x02, + 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x81, 0xB9, FIL_, + 0xE3, 0x82, 0x9A, FIL_, 0xE3, 0x81, 0xBA, FIL_, + 0x02, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x81, 0xBC, + FIL_, 0xE3, 0x82, 0x9A, FIL_, 0xE3, 0x81, 0xBD, + FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x82, + 0x9E, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, + 0x83, 0xB4, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, + 0xE3, 0x82, 0xAC, FIL_, 0x01, 0xE3, 0x82, 0x99, + FIL_, 0xE3, 0x82, 0xAE, FIL_, 0x01, 0xE3, 0x82, + 0x99, FIL_, 0xE3, 0x82, 0xB0, FIL_, 0x01, 0xE3, + 0x82, 0x99, FIL_, 0xE3, 0x82, 0xB2, FIL_, 0x01, + 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x82, 0xB4, FIL_, + 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x82, 0xB6, + FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x82, + 0xB8, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, + 0x82, 0xBA, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, + 0xE3, 0x82, 0xBC, FIL_, 0x01, 0xE3, 0x82, 0x99, + FIL_, 0xE3, 0x82, 0xBE, FIL_, 0x01, 0xE3, 0x82, + 0x99, FIL_, 0xE3, 0x83, 0x80, FIL_, 0x01, 0xE3, + 0x82, 0x99, FIL_, 0xE3, 0x83, 0x82, FIL_, 0x01, + 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x83, 0x85, FIL_, + 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x83, 0x87, + FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x83, + 0x89, FIL_, 0x02, 0xE3, 0x82, 0x99, FIL_, 0xE3, + 0x83, 0x90, FIL_, 0xE3, 0x82, 0x9A, FIL_, 0xE3, + 0x83, 0x91, FIL_, 0x02, 0xE3, 0x82, 0x99, FIL_, + 0xE3, 0x83, 0x93, FIL_, 0xE3, 0x82, 0x9A, FIL_, + 0xE3, 0x83, 0x94, FIL_, 0x02, 0xE3, 0x82, 0x99, + FIL_, 0xE3, 0x83, 0x96, FIL_, 0xE3, 0x82, 0x9A, + FIL_, 0xE3, 0x83, 0x97, FIL_, 0x02, 0xE3, 0x82, + 0x9A, FIL_, 0xE3, 0x83, 0x9A, FIL_, 0xE3, 0x82, + 0x99, FIL_, 0xE3, 0x83, 0x99, FIL_, 0x02, 0xE3, + 0x82, 0x9A, FIL_, 0xE3, 0x83, 0x9D, FIL_, 0xE3, + 0x82, 0x99, FIL_, 0xE3, 0x83, 0x9C, FIL_, 0x01, + 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x83, 0xB7, FIL_, + 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x83, 0xB8, + FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x83, + 0xB9, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, + 0x83, 0xBA, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, + 0xE3, 0x83, 0xBE, FIL_, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, + }, + { + 0x01, 0xCC, 0xB8, FIL_, 0xE2, 0x89, 0xAE, FIL_, + 0x01, 0xCC, 0xB8, FIL_, 0xE2, 0x89, 0xA0, FIL_, + 0x01, 0xCC, 0xB8, FIL_, 0xE2, 0x89, 0xAF, FIL_, + 0x10, 0xCC, 0xA5, FIL_, 0xE1, 0xB8, 0x80, FIL_, + 0xCC, 0x87, FIL_, 0xC8, 0xA6, FIL_, 0xCC, 0x83, + FIL_, 0xC3, 0x83, FIL_, 0xCC, 0x91, FIL_, 0xC8, + 0x82, FIL_, 0xCC, 0x8F, FIL_, 0xC8, 
0x80, FIL_, + 0xCC, 0x8A, FIL_, 0xC3, 0x85, FIL_, 0xCC, 0x88, + FIL_, 0xC3, 0x84, FIL_, 0xCC, 0x89, FIL_, 0xE1, + 0xBA, 0xA2, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xBA, + 0xA0, FIL_, 0xCC, 0x8C, FIL_, 0xC7, 0x8D, FIL_, + 0xCC, 0x80, FIL_, 0xC3, 0x80, FIL_, 0xCC, 0x81, + FIL_, 0xC3, 0x81, FIL_, 0xCC, 0x82, FIL_, 0xC3, + 0x82, FIL_, 0xCC, 0xA8, FIL_, 0xC4, 0x84, FIL_, + 0xCC, 0x86, FIL_, 0xC4, 0x82, FIL_, 0xCC, 0x84, + FIL_, 0xC4, 0x80, FIL_, 0x03, 0xCC, 0xB1, FIL_, + 0xE1, 0xB8, 0x86, FIL_, 0xCC, 0x87, FIL_, 0xE1, + 0xB8, 0x82, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xB8, + 0x84, FIL_, 0x05, 0xCC, 0xA7, FIL_, 0xC3, 0x87, + FIL_, 0xCC, 0x8C, FIL_, 0xC4, 0x8C, FIL_, 0xCC, + 0x81, FIL_, 0xC4, 0x86, FIL_, 0xCC, 0x82, FIL_, + 0xC4, 0x88, FIL_, 0xCC, 0x87, FIL_, 0xC4, 0x8A, + FIL_, 0x06, 0xCC, 0xA7, FIL_, 0xE1, 0xB8, 0x90, + FIL_, 0xCC, 0x8C, FIL_, 0xC4, 0x8E, FIL_, 0xCC, + 0xB1, FIL_, 0xE1, 0xB8, 0x8E, FIL_, 0xCC, 0xAD, + FIL_, 0xE1, 0xB8, 0x92, FIL_, 0xCC, 0xA3, FIL_, + 0xE1, 0xB8, 0x8C, FIL_, 0xCC, 0x87, FIL_, 0xE1, + 0xB8, 0x8A, FIL_, 0x11, 0xCC, 0x84, FIL_, 0xC4, + 0x92, FIL_, 0xCC, 0x86, FIL_, 0xC4, 0x94, FIL_, + 0xCC, 0xA3, FIL_, 0xE1, 0xBA, 0xB8, FIL_, 0xCC, + 0x91, FIL_, 0xC8, 0x86, FIL_, 0xCC, 0x82, FIL_, + 0xC3, 0x8A, FIL_, 0xCC, 0x8F, FIL_, 0xC8, 0x84, + FIL_, 0xCC, 0xAD, FIL_, 0xE1, 0xB8, 0x98, FIL_, + 0xCC, 0x89, FIL_, 0xE1, 0xBA, 0xBA, FIL_, 0xCC, + 0xA7, FIL_, 0xC8, 0xA8, FIL_, 0xCC, 0x8C, FIL_, + 0xC4, 0x9A, FIL_, 0xCC, 0x80, FIL_, 0xC3, 0x88, + FIL_, 0xCC, 0xA8, FIL_, 0xC4, 0x98, FIL_, 0xCC, + 0x83, FIL_, 0xE1, 0xBA, 0xBC, FIL_, 0xCC, 0x87, + FIL_, 0xC4, 0x96, FIL_, 0xCC, 0x81, FIL_, 0xC3, + 0x89, FIL_, 0xCC, 0x88, FIL_, 0xC3, 0x8B, FIL_, + 0xCC, 0xB0, FIL_, 0xE1, 0xB8, 0x9A, FIL_, 0x01, + 0xCC, 0x87, FIL_, 0xE1, 0xB8, 0x9E, FIL_, 0x07, + 0xCC, 0x8C, FIL_, 0xC7, 0xA6, FIL_, 0xCC, 0x86, + FIL_, 0xC4, 0x9E, FIL_, 0xCC, 0x82, FIL_, 0xC4, + 0x9C, FIL_, 0xCC, 0xA7, FIL_, 0xC4, 0xA2, FIL_, + 0xCC, 0x84, FIL_, 0xE1, 0xB8, 0xA0, FIL_, 0xCC, + 0x81, FIL_, 0xC7, 0xB4, FIL_, 0xCC, 0x87, FIL_, + 0xC4, 0xA0, FIL_, 0x07, 0xCC, 0x87, FIL_, 0xE1, + 0xB8, 0xA2, FIL_, 0xCC, 0xA7, FIL_, 0xE1, 0xB8, + 0xA8, FIL_, 0xCC, 0x82, FIL_, 0xC4, 0xA4, FIL_, + 0xCC, 0x88, FIL_, 0xE1, 0xB8, 0xA6, FIL_, 0xCC, + 0x8C, FIL_, 0xC8, 0x9E, FIL_, 0xCC, 0xAE, FIL_, + 0xE1, 0xB8, 0xAA, FIL_, 0xCC, 0xA3, FIL_, 0xE1, + 0xB8, 0xA4, FIL_, 0x0F, 0xCC, 0xB0, FIL_, 0xE1, + 0xB8, 0xAC, FIL_, 0xCC, 0x8C, FIL_, 0xC7, 0x8F, + FIL_, 0xCC, 0x80, FIL_, 0xC3, 0x8C, FIL_, 0xCC, + 0x89, FIL_, 0xE1, 0xBB, 0x88, FIL_, 0xCC, 0xA3, + FIL_, 0xE1, 0xBB, 0x8A, FIL_, 0xCC, 0x91, FIL_, + 0xC8, 0x8A, FIL_, 0xCC, 0x88, FIL_, 0xC3, 0x8F, + FIL_, 0xCC, 0x82, FIL_, 0xC3, 0x8E, FIL_, 0xCC, + 0x81, FIL_, 0xC3, 0x8D, FIL_, 0xCC, 0x83, FIL_, + 0xC4, 0xA8, FIL_, 0xCC, 0x87, FIL_, 0xC4, 0xB0, + FIL_, 0xCC, 0x8F, FIL_, 0xC8, 0x88, FIL_, 0xCC, + 0xA8, FIL_, 0xC4, 0xAE, FIL_, 0xCC, 0x86, FIL_, + 0xC4, 0xAC, FIL_, 0xCC, 0x84, FIL_, 0xC4, 0xAA, + FIL_, 0x01, 0xCC, 0x82, FIL_, 0xC4, 0xB4, FIL_, + 0x05, 0xCC, 0x81, FIL_, 0xE1, 0xB8, 0xB0, FIL_, + 0xCC, 0x8C, FIL_, 0xC7, 0xA8, FIL_, 0xCC, 0xB1, + FIL_, 0xE1, 0xB8, 0xB4, FIL_, 0xCC, 0xA7, FIL_, + 0xC4, 0xB6, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xB8, + 0xB2, FIL_, 0x06, 0xCC, 0xA3, FIL_, 0xE1, 0xB8, + 0xB6, FIL_, 0xCC, 0x8C, FIL_, 0xC4, 0xBD, FIL_, + 0xCC, 0xAD, FIL_, 0xE1, 0xB8, 0xBC, FIL_, 0xCC, + 0xB1, FIL_, 0xE1, 0xB8, 0xBA, FIL_, 0xCC, 0xA7, + FIL_, 0xC4, 0xBB, FIL_, 0xCC, 0x81, FIL_, 0xC4, + 0xB9, FIL_, 0x03, 0xCC, 0x81, FIL_, 0xE1, 0xB8, + 0xBE, FIL_, 0xCC, 0x87, FIL_, 0xE1, 0xB9, 0x80, + FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xB9, 
0x82, FIL_, + 0x09, 0xCC, 0x83, FIL_, 0xC3, 0x91, FIL_, 0xCC, + 0x81, FIL_, 0xC5, 0x83, FIL_, 0xCC, 0xA7, FIL_, + 0xC5, 0x85, FIL_, 0xCC, 0x8C, FIL_, 0xC5, 0x87, + FIL_, 0xCC, 0x87, FIL_, 0xE1, 0xB9, 0x84, FIL_, + 0xCC, 0xA3, FIL_, 0xE1, 0xB9, 0x86, FIL_, 0xCC, + 0xB1, FIL_, 0xE1, 0xB9, 0x88, FIL_, 0xCC, 0xAD, + FIL_, 0xE1, 0xB9, 0x8A, FIL_, 0xCC, 0x80, FIL_, + 0xC7, 0xB8, FIL_, 0x10, 0xCC, 0x89, FIL_, 0xE1, + 0xBB, 0x8E, FIL_, 0xCC, 0x84, FIL_, 0xC5, 0x8C, + FIL_, 0xCC, 0x82, FIL_, 0xC3, 0x94, FIL_, 0xCC, + 0x86, FIL_, 0xC5, 0x8E, FIL_, 0xCC, 0x83, FIL_, + 0xC3, 0x95, FIL_, 0xCC, 0x8B, FIL_, 0xC5, 0x90, + FIL_, 0xCC, 0x88, FIL_, 0xC3, 0x96, FIL_, 0xCC, + 0x9B, FIL_, 0xC6, 0xA0, FIL_, 0xCC, 0x91, FIL_, + 0xC8, 0x8E, FIL_, 0xCC, 0x8C, FIL_, 0xC7, 0x91, + FIL_, 0xCC, 0x8F, FIL_, 0xC8, 0x8C, FIL_, 0xCC, + 0xA3, FIL_, 0xE1, 0xBB, 0x8C, FIL_, 0xCC, 0x80, + FIL_, 0xC3, 0x92, FIL_, 0xCC, 0xA8, FIL_, 0xC7, + 0xAA, FIL_, 0xCC, 0x87, FIL_, 0xC8, 0xAE, FIL_, + 0xCC, 0x81, FIL_, 0xC3, 0x93, FIL_, 0x02, 0xCC, + 0x87, FIL_, 0xE1, 0xB9, 0x96, FIL_, 0xCC, 0x81, + FIL_, 0xE1, 0xB9, 0x94, FIL_, 0x08, 0xCC, 0xA7, + FIL_, 0xC5, 0x96, FIL_, 0xCC, 0x8C, FIL_, 0xC5, + 0x98, FIL_, 0xCC, 0x91, FIL_, 0xC8, 0x92, FIL_, + 0xCC, 0x8F, FIL_, 0xC8, 0x90, FIL_, 0xCC, 0x81, + FIL_, 0xC5, 0x94, FIL_, 0xCC, 0x87, FIL_, 0xE1, + 0xB9, 0x98, FIL_, 0xCC, 0xB1, FIL_, 0xE1, 0xB9, + 0x9E, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xB9, 0x9A, + FIL_, 0x07, 0xCC, 0xA6, FIL_, 0xC8, 0x98, FIL_, + 0xCC, 0x81, FIL_, 0xC5, 0x9A, FIL_, 0xCC, 0x82, + FIL_, 0xC5, 0x9C, FIL_, 0xCC, 0xA7, FIL_, 0xC5, + 0x9E, FIL_, 0xCC, 0x8C, FIL_, 0xC5, 0xA0, FIL_, + 0xCC, 0x87, FIL_, 0xE1, 0xB9, 0xA0, FIL_, 0xCC, + 0xA3, FIL_, 0xE1, 0xB9, 0xA2, FIL_, 0x07, 0xCC, + 0xA6, FIL_, 0xC8, 0x9A, FIL_, 0xCC, 0x87, FIL_, + 0xE1, 0xB9, 0xAA, FIL_, 0xCC, 0xA3, FIL_, 0xE1, + 0xB9, 0xAC, FIL_, 0xCC, 0xB1, FIL_, 0xE1, 0xB9, + 0xAE, FIL_, 0xCC, 0xAD, FIL_, 0xE1, 0xB9, 0xB0, + FIL_, 0xCC, 0xA7, FIL_, 0xC5, 0xA2, FIL_, 0xCC, + 0x8C, FIL_, 0xC5, 0xA4, FIL_, 0x13, 0xCC, 0x8A, + FIL_, 0xC5, 0xAE, FIL_, 0xCC, 0x88, FIL_, 0xC3, + 0x9C, FIL_, 0xCC, 0x8B, FIL_, 0xC5, 0xB0, FIL_, + 0xCC, 0xAD, FIL_, 0xE1, 0xB9, 0xB6, FIL_, 0xCC, + 0xA8, FIL_, 0xC5, 0xB2, FIL_, 0xCC, 0x8C, FIL_, + 0xC7, 0x93, FIL_, 0xCC, 0x80, FIL_, 0xC3, 0x99, + FIL_, 0xCC, 0x8F, FIL_, 0xC8, 0x94, FIL_, 0xCC, + 0xA3, FIL_, 0xE1, 0xBB, 0xA4, FIL_, 0xCC, 0xA4, + FIL_, 0xE1, 0xB9, 0xB2, FIL_, 0xCC, 0x81, FIL_, + 0xC3, 0x9A, FIL_, 0xCC, 0x82, FIL_, 0xC3, 0x9B, + FIL_, 0xCC, 0xB0, FIL_, 0xE1, 0xB9, 0xB4, FIL_, + 0xCC, 0x83, FIL_, 0xC5, 0xA8, FIL_, 0xCC, 0x89, + FIL_, 0xE1, 0xBB, 0xA6, FIL_, 0xCC, 0x84, FIL_, + 0xC5, 0xAA, FIL_, 0xCC, 0x91, FIL_, 0xC8, 0x96, + FIL_, 0xCC, 0x86, FIL_, 0xC5, 0xAC, FIL_, 0xCC, + 0x9B, FIL_, 0xC6, 0xAF, FIL_, 0x02, 0xCC, 0xA3, + FIL_, 0xE1, 0xB9, 0xBE, FIL_, 0xCC, 0x83, FIL_, + 0xE1, 0xB9, 0xBC, FIL_, 0x06, 0xCC, 0x88, FIL_, + 0xE1, 0xBA, 0x84, FIL_, 0xCC, 0x81, FIL_, 0xE1, + 0xBA, 0x82, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBA, + 0x80, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xBA, 0x88, + FIL_, 0xCC, 0x82, FIL_, 0xC5, 0xB4, FIL_, 0xCC, + 0x87, FIL_, 0xE1, 0xBA, 0x86, FIL_, 0x02, 0xCC, + 0x88, FIL_, 0xE1, 0xBA, 0x8C, FIL_, 0xCC, 0x87, + FIL_, 0xE1, 0xBA, 0x8A, FIL_, 0x09, 0xCC, 0x89, + FIL_, 0xE1, 0xBB, 0xB6, FIL_, 0xCC, 0xA3, FIL_, + 0xE1, 0xBB, 0xB4, FIL_, 0xCC, 0x80, FIL_, 0xE1, + 0xBB, 0xB2, FIL_, 0xCC, 0x88, FIL_, 0xC5, 0xB8, + FIL_, 0xCC, 0x81, FIL_, 0xC3, 0x9D, FIL_, 0xCC, + 0x83, FIL_, 0xE1, 0xBB, 0xB8, FIL_, 0xCC, 0x87, + FIL_, 0xE1, 0xBA, 0x8E, FIL_, 0xCC, 0x84, FIL_, + 0xC8, 0xB2, FIL_, 0xCC, 0x82, FIL_, 
0xC5, 0xB6, + FIL_, 0x06, 0xCC, 0x82, FIL_, 0xE1, 0xBA, 0x90, + FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xBA, 0x92, FIL_, + 0xCC, 0xB1, FIL_, 0xE1, 0xBA, 0x94, FIL_, 0xCC, + 0x8C, FIL_, 0xC5, 0xBD, FIL_, 0xCC, 0x87, FIL_, + 0xC5, 0xBB, FIL_, 0xCC, 0x81, FIL_, 0xC5, 0xB9, + FIL_, 0x10, 0xCC, 0xA3, FIL_, 0xE1, 0xBA, 0xA1, + FIL_, 0xCC, 0xA8, FIL_, 0xC4, 0x85, FIL_, 0xCC, + 0x81, FIL_, 0xC3, 0xA1, FIL_, 0xCC, 0x82, FIL_, + 0xC3, 0xA2, FIL_, 0xCC, 0x89, FIL_, 0xE1, 0xBA, + 0xA3, FIL_, 0xCC, 0x83, FIL_, 0xC3, 0xA3, FIL_, + 0xCC, 0x8C, FIL_, 0xC7, 0x8E, FIL_, 0xCC, 0x8A, + FIL_, 0xC3, 0xA5, FIL_, 0xCC, 0x88, FIL_, 0xC3, + 0xA4, FIL_, 0xCC, 0x87, FIL_, 0xC8, 0xA7, FIL_, + 0xCC, 0x91, FIL_, 0xC8, 0x83, FIL_, 0xCC, 0xA5, + FIL_, 0xE1, 0xB8, 0x81, FIL_, 0xCC, 0x84, FIL_, + 0xC4, 0x81, FIL_, 0xCC, 0x8F, FIL_, 0xC8, 0x81, + FIL_, 0xCC, 0x86, FIL_, 0xC4, 0x83, FIL_, 0xCC, + 0x80, FIL_, 0xC3, 0xA0, FIL_, 0x03, 0xCC, 0xA3, + FIL_, 0xE1, 0xB8, 0x85, FIL_, 0xCC, 0x87, FIL_, + 0xE1, 0xB8, 0x83, FIL_, 0xCC, 0xB1, FIL_, 0xE1, + 0xB8, 0x87, FIL_, 0x05, 0xCC, 0x87, FIL_, 0xC4, + 0x8B, FIL_, 0xCC, 0x8C, FIL_, 0xC4, 0x8D, FIL_, + 0xCC, 0x82, FIL_, 0xC4, 0x89, FIL_, 0xCC, 0x81, + FIL_, 0xC4, 0x87, FIL_, 0xCC, 0xA7, FIL_, 0xC3, + 0xA7, FIL_, 0x06, 0xCC, 0x87, FIL_, 0xE1, 0xB8, + 0x8B, FIL_, 0xCC, 0xA7, FIL_, 0xE1, 0xB8, 0x91, + FIL_, 0xCC, 0xB1, FIL_, 0xE1, 0xB8, 0x8F, FIL_, + 0xCC, 0xA3, FIL_, 0xE1, 0xB8, 0x8D, FIL_, 0xCC, + 0x8C, FIL_, 0xC4, 0x8F, FIL_, 0xCC, 0xAD, FIL_, + 0xE1, 0xB8, 0x93, FIL_, 0x11, 0xCC, 0x80, FIL_, + 0xC3, 0xA8, FIL_, 0xCC, 0x81, FIL_, 0xC3, 0xA9, + FIL_, 0xCC, 0x82, FIL_, 0xC3, 0xAA, FIL_, 0xCC, + 0x88, FIL_, 0xC3, 0xAB, FIL_, 0xCC, 0x84, FIL_, + 0xC4, 0x93, FIL_, 0xCC, 0x86, FIL_, 0xC4, 0x95, + FIL_, 0xCC, 0x87, FIL_, 0xC4, 0x97, FIL_, 0xCC, + 0xA8, FIL_, 0xC4, 0x99, FIL_, 0xCC, 0x8C, FIL_, + 0xC4, 0x9B, FIL_, 0xCC, 0x8F, FIL_, 0xC8, 0x85, + FIL_, 0xCC, 0x91, FIL_, 0xC8, 0x87, FIL_, 0xCC, + 0xA3, FIL_, 0xE1, 0xBA, 0xB9, FIL_, 0xCC, 0xA7, + FIL_, 0xC8, 0xA9, FIL_, 0xCC, 0x83, FIL_, 0xE1, + 0xBA, 0xBD, FIL_, 0xCC, 0x89, FIL_, 0xE1, 0xBA, + 0xBB, FIL_, 0xCC, 0xAD, FIL_, 0xE1, 0xB8, 0x99, + FIL_, 0xCC, 0xB0, FIL_, 0xE1, 0xB8, 0x9B, FIL_, + 0x01, 0xCC, 0x87, FIL_, 0xE1, 0xB8, 0x9F, FIL_, + 0x07, 0xCC, 0x86, FIL_, 0xC4, 0x9F, FIL_, 0xCC, + 0x87, FIL_, 0xC4, 0xA1, FIL_, 0xCC, 0x82, FIL_, + 0xC4, 0x9D, FIL_, 0xCC, 0x84, FIL_, 0xE1, 0xB8, + 0xA1, FIL_, 0xCC, 0x8C, FIL_, 0xC7, 0xA7, FIL_, + 0xCC, 0xA7, FIL_, 0xC4, 0xA3, FIL_, 0xCC, 0x81, + FIL_, 0xC7, 0xB5, FIL_, 0x08, 0xCC, 0xA7, FIL_, + 0xE1, 0xB8, 0xA9, FIL_, 0xCC, 0xB1, FIL_, 0xE1, + 0xBA, 0x96, FIL_, 0xCC, 0x8C, FIL_, 0xC8, 0x9F, + FIL_, 0xCC, 0xAE, FIL_, 0xE1, 0xB8, 0xAB, FIL_, + 0xCC, 0x88, FIL_, 0xE1, 0xB8, 0xA7, FIL_, 0xCC, + 0xA3, FIL_, 0xE1, 0xB8, 0xA5, FIL_, 0xCC, 0x87, + FIL_, 0xE1, 0xB8, 0xA3, FIL_, 0xCC, 0x82, FIL_, + 0xC4, 0xA5, FIL_, 0x0E, 0xCC, 0x88, FIL_, 0xC3, + 0xAF, FIL_, 0xCC, 0x89, FIL_, 0xE1, 0xBB, 0x89, + FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xBB, 0x8B, FIL_, + 0xCC, 0x82, FIL_, 0xC3, 0xAE, FIL_, 0xCC, 0x81, + FIL_, 0xC3, 0xAD, FIL_, 0xCC, 0x80, FIL_, 0xC3, + 0xAC, FIL_, 0xCC, 0x83, FIL_, 0xC4, 0xA9, FIL_, + 0xCC, 0x84, FIL_, 0xC4, 0xAB, FIL_, 0xCC, 0x86, + FIL_, 0xC4, 0xAD, FIL_, 0xCC, 0xA8, FIL_, 0xC4, + 0xAF, FIL_, 0xCC, 0xB0, FIL_, 0xE1, 0xB8, 0xAD, + FIL_, 0xCC, 0x8C, FIL_, 0xC7, 0x90, FIL_, 0xCC, + 0x91, FIL_, 0xC8, 0x8B, FIL_, 0xCC, 0x8F, FIL_, + 0xC8, 0x89, FIL_, 0x02, 0xCC, 0x8C, FIL_, 0xC7, + 0xB0, FIL_, 0xCC, 0x82, FIL_, 0xC4, 0xB5, FIL_, + 0x05, 0xCC, 0xB1, FIL_, 0xE1, 0xB8, 0xB5, FIL_, + 0xCC, 0xA7, FIL_, 0xC4, 0xB7, FIL_, 
0xCC, 0x8C, + FIL_, 0xC7, 0xA9, FIL_, 0xCC, 0x81, FIL_, 0xE1, + 0xB8, 0xB1, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xB8, + 0xB3, FIL_, 0x06, 0xCC, 0xA3, FIL_, 0xE1, 0xB8, + 0xB7, FIL_, 0xCC, 0xAD, FIL_, 0xE1, 0xB8, 0xBD, + FIL_, 0xCC, 0xB1, FIL_, 0xE1, 0xB8, 0xBB, FIL_, + 0xCC, 0xA7, FIL_, 0xC4, 0xBC, FIL_, 0xCC, 0x81, + FIL_, 0xC4, 0xBA, FIL_, 0xCC, 0x8C, FIL_, 0xC4, + 0xBE, FIL_, 0x03, 0xCC, 0x87, FIL_, 0xE1, 0xB9, + 0x81, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xB9, 0x83, + FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xB8, 0xBF, FIL_, + 0x09, 0xCC, 0x80, FIL_, 0xC7, 0xB9, FIL_, 0xCC, + 0xAD, FIL_, 0xE1, 0xB9, 0x8B, FIL_, 0xCC, 0x83, + FIL_, 0xC3, 0xB1, FIL_, 0xCC, 0x81, FIL_, 0xC5, + 0x84, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xB9, 0x87, + FIL_, 0xCC, 0xB1, FIL_, 0xE1, 0xB9, 0x89, FIL_, + 0xCC, 0x87, FIL_, 0xE1, 0xB9, 0x85, FIL_, 0xCC, + 0xA7, FIL_, 0xC5, 0x86, FIL_, 0xCC, 0x8C, FIL_, + 0xC5, 0x88, FIL_, 0x10, 0xCC, 0xA3, FIL_, 0xE1, + 0xBB, 0x8D, FIL_, 0xCC, 0x87, FIL_, 0xC8, 0xAF, + FIL_, 0xCC, 0x80, FIL_, 0xC3, 0xB2, FIL_, 0xCC, + 0x91, FIL_, 0xC8, 0x8F, FIL_, 0xCC, 0x89, FIL_, + 0xE1, 0xBB, 0x8F, FIL_, 0xCC, 0x88, FIL_, 0xC3, + 0xB6, FIL_, 0xCC, 0x83, FIL_, 0xC3, 0xB5, FIL_, + 0xCC, 0x81, FIL_, 0xC3, 0xB3, FIL_, 0xCC, 0x8C, + FIL_, 0xC7, 0x92, FIL_, 0xCC, 0xA8, FIL_, 0xC7, + 0xAB, FIL_, 0xCC, 0x9B, FIL_, 0xC6, 0xA1, FIL_, + 0xCC, 0x84, FIL_, 0xC5, 0x8D, FIL_, 0xCC, 0x86, + FIL_, 0xC5, 0x8F, FIL_, 0xCC, 0x8B, FIL_, 0xC5, + 0x91, FIL_, 0xCC, 0x82, FIL_, 0xC3, 0xB4, FIL_, + 0xCC, 0x8F, FIL_, 0xC8, 0x8D, FIL_, 0x02, 0xCC, + 0x87, FIL_, 0xE1, 0xB9, 0x97, FIL_, 0xCC, 0x81, + FIL_, 0xE1, 0xB9, 0x95, FIL_, 0x08, 0xCC, 0x8C, + FIL_, 0xC5, 0x99, FIL_, 0xCC, 0xA3, FIL_, 0xE1, + 0xB9, 0x9B, FIL_, 0xCC, 0x81, FIL_, 0xC5, 0x95, + FIL_, 0xCC, 0xA7, FIL_, 0xC5, 0x97, FIL_, 0xCC, + 0xB1, FIL_, 0xE1, 0xB9, 0x9F, FIL_, 0xCC, 0x87, + FIL_, 0xE1, 0xB9, 0x99, FIL_, 0xCC, 0x91, FIL_, + 0xC8, 0x93, FIL_, 0xCC, 0x8F, FIL_, 0xC8, 0x91, + FIL_, 0x07, 0xCC, 0xA7, FIL_, 0xC5, 0x9F, FIL_, + 0xCC, 0x82, FIL_, 0xC5, 0x9D, FIL_, 0xCC, 0x87, + FIL_, 0xE1, 0xB9, 0xA1, FIL_, 0xCC, 0xA6, FIL_, + 0xC8, 0x99, FIL_, 0xCC, 0x81, FIL_, 0xC5, 0x9B, + FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xB9, 0xA3, FIL_, + 0xCC, 0x8C, FIL_, 0xC5, 0xA1, FIL_, 0x08, 0xCC, + 0xA6, FIL_, 0xC8, 0x9B, FIL_, 0xCC, 0xAD, FIL_, + 0xE1, 0xB9, 0xB1, FIL_, 0xCC, 0xB1, FIL_, 0xE1, + 0xB9, 0xAF, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xB9, + 0xAD, FIL_, 0xCC, 0x87, FIL_, 0xE1, 0xB9, 0xAB, + FIL_, 0xCC, 0x8C, FIL_, 0xC5, 0xA5, FIL_, 0xCC, + 0xA7, FIL_, 0xC5, 0xA3, FIL_, 0xCC, 0x88, FIL_, + 0xE1, 0xBA, 0x97, FIL_, 0x13, 0xCC, 0x8A, FIL_, + 0xC5, 0xAF, FIL_, 0xCC, 0x8F, FIL_, 0xC8, 0x95, + FIL_, 0xCC, 0x8C, FIL_, 0xC7, 0x94, FIL_, 0xCC, + 0x80, FIL_, 0xC3, 0xB9, FIL_, 0xCC, 0x9B, FIL_, + 0xC6, 0xB0, FIL_, 0xCC, 0x82, FIL_, 0xC3, 0xBB, + FIL_, 0xCC, 0x81, FIL_, 0xC3, 0xBA, FIL_, 0xCC, + 0x88, FIL_, 0xC3, 0xBC, FIL_, 0xCC, 0x83, FIL_, + 0xC5, 0xA9, FIL_, 0xCC, 0x89, FIL_, 0xE1, 0xBB, + 0xA7, FIL_, 0xCC, 0x84, FIL_, 0xC5, 0xAB, FIL_, + 0xCC, 0x86, FIL_, 0xC5, 0xAD, FIL_, 0xCC, 0xAD, + FIL_, 0xE1, 0xB9, 0xB7, FIL_, 0xCC, 0x8B, FIL_, + 0xC5, 0xB1, FIL_, 0xCC, 0xA8, FIL_, 0xC5, 0xB3, + FIL_, 0xCC, 0x91, FIL_, 0xC8, 0x97, FIL_, 0xCC, + 0xA4, FIL_, 0xE1, 0xB9, 0xB3, FIL_, 0xCC, 0xA3, + FIL_, 0xE1, 0xBB, 0xA5, FIL_, 0xCC, 0xB0, FIL_, + 0xE1, 0xB9, 0xB5, FIL_, 0x02, 0xCC, 0x83, FIL_, + 0xE1, 0xB9, 0xBD, FIL_, 0xCC, 0xA3, FIL_, 0xE1, + 0xB9, 0xBF, FIL_, 0x07, 0xCC, 0x8A, FIL_, 0xE1, + 0xBA, 0x98, FIL_, 0xCC, 0x87, FIL_, 0xE1, 0xBA, + 0x87, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBA, 0x83, + FIL_, 0xCC, 0x82, FIL_, 0xC5, 0xB5, 
FIL_, 0xCC, + 0x80, FIL_, 0xE1, 0xBA, 0x81, FIL_, 0xCC, 0xA3, + FIL_, 0xE1, 0xBA, 0x89, FIL_, 0xCC, 0x88, FIL_, + 0xE1, 0xBA, 0x85, FIL_, 0x02, 0xCC, 0x87, FIL_, + 0xE1, 0xBA, 0x8B, FIL_, 0xCC, 0x88, FIL_, 0xE1, + 0xBA, 0x8D, FIL_, 0x0A, 0xCC, 0x87, FIL_, 0xE1, + 0xBA, 0x8F, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xBB, + 0xB5, FIL_, 0xCC, 0x89, FIL_, 0xE1, 0xBB, 0xB7, + FIL_, 0xCC, 0x8A, FIL_, 0xE1, 0xBA, 0x99, FIL_, + 0xCC, 0x80, FIL_, 0xE1, 0xBB, 0xB3, FIL_, 0xCC, + 0x83, FIL_, 0xE1, 0xBB, 0xB9, FIL_, 0xCC, 0x88, + FIL_, 0xC3, 0xBF, FIL_, 0xCC, 0x81, FIL_, 0xC3, + 0xBD, FIL_, 0xCC, 0x84, FIL_, 0xC8, 0xB3, FIL_, + 0xCC, 0x82, FIL_, 0xC5, 0xB7, FIL_, 0x06, 0xCC, + 0xB1, FIL_, 0xE1, 0xBA, 0x95, FIL_, 0xCC, 0xA3, + FIL_, 0xE1, 0xBA, 0x93, FIL_, 0xCC, 0x82, FIL_, + 0xE1, 0xBA, 0x91, FIL_, 0xCC, 0x81, FIL_, 0xC5, + 0xBA, FIL_, 0xCC, 0x87, FIL_, 0xC5, 0xBC, FIL_, + 0xCC, 0x8C, FIL_, 0xC5, 0xBE, FIL_, 0x03, 0xCC, + 0x80, FIL_, 0xE1, 0xBF, 0xAD, FIL_, 0xCD, 0x82, + FIL_, 0xE1, 0xBF, 0x81, FIL_, 0xCC, 0x81, FIL_, + 0xCE, 0x85, FIL_, 0x04, 0xCC, 0x83, FIL_, 0xE1, + 0xBA, 0xAA, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBA, + 0xA4, FIL_, 0xCC, 0x89, FIL_, 0xE1, 0xBA, 0xA8, + FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBA, 0xA6, FIL_, + 0x01, 0xCC, 0x84, FIL_, 0xC7, 0x9E, FIL_, 0x01, + 0xCC, 0x81, FIL_, 0xC7, 0xBA, FIL_, 0x02, 0xCC, + 0x84, FIL_, 0xC7, 0xA2, FIL_, 0xCC, 0x81, FIL_, + 0xC7, 0xBC, FIL_, 0x01, 0xCC, 0x81, FIL_, 0xE1, + 0xB8, 0x88, FIL_, 0x04, 0xCC, 0x83, FIL_, 0xE1, + 0xBB, 0x84, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBB, + 0x80, FIL_, 0xCC, 0x89, FIL_, 0xE1, 0xBB, 0x82, + FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBA, 0xBE, FIL_, + 0x01, 0xCC, 0x81, FIL_, 0xE1, 0xB8, 0xAE, FIL_, + 0x04, 0xCC, 0x81, FIL_, 0xE1, 0xBB, 0x90, FIL_, + 0xCC, 0x80, FIL_, 0xE1, 0xBB, 0x92, FIL_, 0xCC, + 0x89, FIL_, 0xE1, 0xBB, 0x94, FIL_, 0xCC, 0x83, + FIL_, 0xE1, 0xBB, 0x96, FIL_, 0x03, 0xCC, 0x84, + FIL_, 0xC8, 0xAC, FIL_, 0xCC, 0x88, FIL_, 0xE1, + 0xB9, 0x8E, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xB9, + 0x8C, FIL_, 0x01, 0xCC, 0x84, FIL_, 0xC8, 0xAA, + FIL_, 0x01, 0xCC, 0x81, FIL_, 0xC7, 0xBE, FIL_, + 0x04, 0xCC, 0x80, FIL_, 0xC7, 0x9B, FIL_, 0xCC, + 0x84, FIL_, 0xC7, 0x95, FIL_, 0xCC, 0x8C, FIL_, + 0xC7, 0x99, FIL_, 0xCC, 0x81, FIL_, 0xC7, 0x97, + FIL_, 0x04, 0xCC, 0x81, FIL_, 0xE1, 0xBA, 0xA5, + FIL_, 0xCC, 0x83, FIL_, 0xE1, 0xBA, 0xAB, FIL_, + 0xCC, 0x89, FIL_, 0xE1, 0xBA, 0xA9, FIL_, 0xCC, + 0x80, FIL_, 0xE1, 0xBA, 0xA7, FIL_, 0x01, 0xCC, + 0x84, FIL_, 0xC7, 0x9F, FIL_, 0x01, 0xCC, 0x81, + FIL_, 0xC7, 0xBB, FIL_, 0x02, 0xCC, 0x81, FIL_, + 0xC7, 0xBD, FIL_, 0xCC, 0x84, FIL_, 0xC7, 0xA3, + FIL_, 0x01, 0xCC, 0x81, FIL_, 0xE1, 0xB8, 0x89, + FIL_, 0x04, 0xCC, 0x89, FIL_, 0xE1, 0xBB, 0x83, + FIL_, 0xCC, 0x83, FIL_, 0xE1, 0xBB, 0x85, FIL_, + 0xCC, 0x80, FIL_, 0xE1, 0xBB, 0x81, FIL_, 0xCC, + 0x81, FIL_, 0xE1, 0xBA, 0xBF, FIL_, 0x01, 0xCC, + 0x81, FIL_, 0xE1, 0xB8, 0xAF, FIL_, 0x04, 0xCC, + 0x80, FIL_, 0xE1, 0xBB, 0x93, FIL_, 0xCC, 0x81, + FIL_, 0xE1, 0xBB, 0x91, FIL_, 0xCC, 0x83, FIL_, + 0xE1, 0xBB, 0x97, FIL_, 0xCC, 0x89, FIL_, 0xE1, + 0xBB, 0x95, FIL_, 0x03, 0xCC, 0x81, FIL_, 0xE1, + 0xB9, 0x8D, FIL_, 0xCC, 0x88, FIL_, 0xE1, 0xB9, + 0x8F, FIL_, 0xCC, 0x84, FIL_, 0xC8, 0xAD, FIL_, + 0x01, 0xCC, 0x84, FIL_, 0xC8, 0xAB, FIL_, 0x01, + 0xCC, 0x81, FIL_, 0xC7, 0xBF, FIL_, 0x04, 0xCC, + 0x8C, FIL_, 0xC7, 0x9A, FIL_, 0xCC, 0x84, FIL_, + 0xC7, 0x96, FIL_, 0xCC, 0x80, FIL_, 0xC7, 0x9C, + FIL_, 0xCC, 0x81, FIL_, 0xC7, 0x98, FIL_, 0x04, + 0xCC, 0x81, FIL_, 0xE1, 0xBA, 0xAE, FIL_, 0xCC, + 0x83, FIL_, 0xE1, 0xBA, 0xB4, FIL_, 0xCC, 0x89, + FIL_, 0xE1, 0xBA, 0xB2, FIL_, 0xCC, 
0x80, FIL_, + 0xE1, 0xBA, 0xB0, FIL_, 0x04, 0xCC, 0x83, FIL_, + 0xE1, 0xBA, 0xB5, FIL_, 0xCC, 0x80, FIL_, 0xE1, + 0xBA, 0xB1, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBA, + 0xAF, FIL_, 0xCC, 0x89, FIL_, 0xE1, 0xBA, 0xB3, + FIL_, 0x02, 0xCC, 0x81, FIL_, 0xE1, 0xB8, 0x96, + FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xB8, 0x94, FIL_, + 0x02, 0xCC, 0x80, FIL_, 0xE1, 0xB8, 0x95, FIL_, + 0xCC, 0x81, FIL_, 0xE1, 0xB8, 0x97, FIL_, 0x02, + 0xCC, 0x80, FIL_, 0xE1, 0xB9, 0x90, FIL_, 0xCC, + 0x81, FIL_, 0xE1, 0xB9, 0x92, FIL_, 0x02, 0xCC, + 0x81, FIL_, 0xE1, 0xB9, 0x93, FIL_, 0xCC, 0x80, + FIL_, 0xE1, 0xB9, 0x91, FIL_, 0x01, 0xCC, 0x87, + FIL_, 0xE1, 0xB9, 0xA4, FIL_, 0x01, 0xCC, 0x87, + FIL_, 0xE1, 0xB9, 0xA5, FIL_, 0x01, 0xCC, 0x87, + FIL_, 0xE1, 0xB9, 0xA6, FIL_, 0x01, 0xCC, 0x87, + FIL_, 0xE1, 0xB9, 0xA7, FIL_, 0x01, 0xCC, 0x81, + FIL_, 0xE1, 0xB9, 0xB8, FIL_, 0x01, 0xCC, 0x81, + FIL_, 0xE1, 0xB9, 0xB9, FIL_, 0x01, 0xCC, 0x88, + FIL_, 0xE1, 0xB9, 0xBA, FIL_, 0x01, 0xCC, 0x88, + FIL_, 0xE1, 0xB9, 0xBB, FIL_, 0x01, 0xCC, 0x87, + FIL_, 0xE1, 0xBA, 0x9B, FIL_, 0x05, 0xCC, 0x80, + FIL_, 0xE1, 0xBB, 0x9C, FIL_, 0xCC, 0x89, FIL_, + 0xE1, 0xBB, 0x9E, FIL_, 0xCC, 0x83, FIL_, 0xE1, + 0xBB, 0xA0, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBB, + 0x9A, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xBB, 0xA2, + FIL_, 0x05, 0xCC, 0x83, FIL_, 0xE1, 0xBB, 0xA1, + FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xBB, 0xA3, FIL_, + 0xCC, 0x81, FIL_, 0xE1, 0xBB, 0x9B, FIL_, 0xCC, + 0x80, FIL_, 0xE1, 0xBB, 0x9D, FIL_, 0xCC, 0x89, + FIL_, 0xE1, 0xBB, 0x9F, FIL_, 0x05, 0xCC, 0x81, + FIL_, 0xE1, 0xBB, 0xA8, FIL_, 0xCC, 0x80, FIL_, + 0xE1, 0xBB, 0xAA, FIL_, 0xCC, 0x89, FIL_, 0xE1, + 0xBB, 0xAC, FIL_, 0xCC, 0x83, FIL_, 0xE1, 0xBB, + 0xAE, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xBB, 0xB0, + FIL_, 0x05, 0xCC, 0x80, FIL_, 0xE1, 0xBB, 0xAB, + FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBB, 0xA9, FIL_, + 0xCC, 0x83, FIL_, 0xE1, 0xBB, 0xAF, FIL_, 0xCC, + 0xA3, FIL_, 0xE1, 0xBB, 0xB1, FIL_, 0xCC, 0x89, + FIL_, 0xE1, 0xBB, 0xAD, FIL_, 0x01, 0xCC, 0x8C, + FIL_, 0xC7, 0xAE, FIL_, 0x01, 0xCC, 0x84, FIL_, + 0xC7, 0xAC, FIL_, 0x01, 0xCC, 0x84, FIL_, 0xC7, + 0xAD, FIL_, 0x01, 0xCC, 0x84, FIL_, 0xC7, 0xA0, + FIL_, 0x01, 0xCC, 0x84, FIL_, 0xC7, 0xA1, FIL_, + 0x01, 0xCC, 0x86, FIL_, 0xE1, 0xB8, 0x9C, FIL_, + 0x01, 0xCC, 0x86, FIL_, 0xE1, 0xB8, 0x9D, FIL_, + 0x01, 0xCC, 0x84, FIL_, 0xC8, 0xB0, FIL_, 0x01, + 0xCC, 0x84, FIL_, 0xC8, 0xB1, FIL_, 0x01, 0xCC, + 0x8C, FIL_, 0xC7, 0xAF, FIL_, 0x07, 0xCC, 0x93, + FIL_, 0xE1, 0xBC, 0x88, FIL_, 0xCC, 0x81, FIL_, + 0xCE, 0x86, FIL_, 0xCC, 0x86, FIL_, 0xE1, 0xBE, + 0xB8, FIL_, 0xCC, 0x84, FIL_, 0xE1, 0xBE, 0xB9, + FIL_, 0xCC, 0x94, FIL_, 0xE1, 0xBC, 0x89, FIL_, + 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0xBC, FIL_, 0xCC, + 0x80, FIL_, 0xE1, 0xBE, 0xBA, FIL_, 0x04, 0xCC, + 0x94, FIL_, 0xE1, 0xBC, 0x99, FIL_, 0xCC, 0x80, + FIL_, 0xE1, 0xBF, 0x88, FIL_, 0xCC, 0x81, FIL_, + 0xCE, 0x88, FIL_, 0xCC, 0x93, FIL_, 0xE1, 0xBC, + 0x98, FIL_, 0x05, 0xCD, 0x85, FIL_, 0xE1, 0xBF, + 0x8C, FIL_, 0xCC, 0x81, FIL_, 0xCE, 0x89, FIL_, + 0xCC, 0x80, FIL_, 0xE1, 0xBF, 0x8A, FIL_, 0xCC, + 0x93, FIL_, 0xE1, 0xBC, 0xA8, FIL_, 0xCC, 0x94, + FIL_, 0xE1, 0xBC, 0xA9, FIL_, 0x07, 0xCC, 0x80, + FIL_, 0xE1, 0xBF, 0x9A, FIL_, 0xCC, 0x84, FIL_, + 0xE1, 0xBF, 0x99, FIL_, 0xCC, 0x93, FIL_, 0xE1, + 0xBC, 0xB8, FIL_, 0xCC, 0x94, FIL_, 0xE1, 0xBC, + 0xB9, FIL_, 0xCC, 0x86, FIL_, 0xE1, 0xBF, 0x98, + FIL_, 0xCC, 0x81, FIL_, 0xCE, 0x8A, FIL_, 0xCC, + 0x88, FIL_, 0xCE, 0xAA, FIL_, 0x04, 0xCC, 0x81, + FIL_, 0xCE, 0x8C, FIL_, 0xCC, 0x94, FIL_, 0xE1, + 0xBD, 0x89, FIL_, 0xCC, 0x93, FIL_, 0xE1, 0xBD, + 0x88, FIL_, 0xCC, 0x80, FIL_, 0xE1, 
0xBF, 0xB8, + FIL_, 0x01, 0xCC, 0x94, FIL_, 0xE1, 0xBF, 0xAC, + FIL_, 0x06, 0xCC, 0x94, FIL_, 0xE1, 0xBD, 0x99, + FIL_, 0xCC, 0x86, FIL_, 0xE1, 0xBF, 0xA8, FIL_, + 0xCC, 0x88, FIL_, 0xCE, 0xAB, FIL_, 0xCC, 0x84, + FIL_, 0xE1, 0xBF, 0xA9, FIL_, 0xCC, 0x81, FIL_, + 0xCE, 0x8E, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBF, + 0xAA, FIL_, 0x05, 0xCC, 0x93, FIL_, 0xE1, 0xBD, + 0xA8, FIL_, 0xCD, 0x85, FIL_, 0xE1, 0xBF, 0xBC, + FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBF, 0xBA, FIL_, + 0xCC, 0x94, FIL_, 0xE1, 0xBD, 0xA9, FIL_, 0xCC, + 0x81, FIL_, 0xCE, 0x8F, FIL_, 0x01, 0xCD, 0x85, + FIL_, 0xE1, 0xBE, 0xB4, FIL_, 0x01, 0xCD, 0x85, + FIL_, 0xE1, 0xBF, 0x84, FIL_, 0x08, 0xCD, 0x85, + FIL_, 0xE1, 0xBE, 0xB3, FIL_, 0xCC, 0x84, FIL_, + 0xE1, 0xBE, 0xB1, FIL_, 0xCC, 0x86, FIL_, 0xE1, + 0xBE, 0xB0, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBD, + 0xB0, FIL_, 0xCC, 0x81, FIL_, 0xCE, 0xAC, FIL_, + 0xCC, 0x94, FIL_, 0xE1, 0xBC, 0x81, FIL_, 0xCC, + 0x93, FIL_, 0xE1, 0xBC, 0x80, FIL_, 0xCD, 0x82, + FIL_, 0xE1, 0xBE, 0xB6, FIL_, 0x04, 0xCC, 0x93, + FIL_, 0xE1, 0xBC, 0x90, FIL_, 0xCC, 0x80, FIL_, + 0xE1, 0xBD, 0xB2, FIL_, 0xCC, 0x94, FIL_, 0xE1, + 0xBC, 0x91, FIL_, 0xCC, 0x81, FIL_, 0xCE, 0xAD, + FIL_, 0x06, 0xCC, 0x94, FIL_, 0xE1, 0xBC, 0xA1, + FIL_, 0xCC, 0x81, FIL_, 0xCE, 0xAE, FIL_, 0xCD, + 0x85, FIL_, 0xE1, 0xBF, 0x83, FIL_, 0xCD, 0x82, + FIL_, 0xE1, 0xBF, 0x86, FIL_, 0xCC, 0x93, FIL_, + 0xE1, 0xBC, 0xA0, FIL_, 0xCC, 0x80, FIL_, 0xE1, + 0xBD, 0xB4, FIL_, 0x08, 0xCC, 0x88, FIL_, 0xCF, + 0x8A, FIL_, 0xCC, 0x81, FIL_, 0xCE, 0xAF, FIL_, + 0xCC, 0x93, FIL_, 0xE1, 0xBC, 0xB0, FIL_, 0xCC, + 0x94, FIL_, 0xE1, 0xBC, 0xB1, FIL_, 0xCC, 0x80, + FIL_, 0xE1, 0xBD, 0xB6, FIL_, 0xCC, 0x86, FIL_, + 0xE1, 0xBF, 0x90, FIL_, 0xCC, 0x84, FIL_, 0xE1, + 0xBF, 0x91, FIL_, 0xCD, 0x82, FIL_, 0xE1, 0xBF, + 0x96, FIL_, 0x04, 0xCC, 0x93, FIL_, 0xE1, 0xBD, + 0x80, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBD, 0xB8, + FIL_, 0xCC, 0x94, FIL_, 0xE1, 0xBD, 0x81, FIL_, + 0xCC, 0x81, FIL_, 0xCF, 0x8C, FIL_, 0x02, 0xCC, + 0x93, FIL_, 0xE1, 0xBF, 0xA4, FIL_, 0xCC, 0x94, + FIL_, 0xE1, 0xBF, 0xA5, FIL_, 0x08, 0xCC, 0x81, + FIL_, 0xCF, 0x8D, FIL_, 0xCC, 0x94, FIL_, 0xE1, + 0xBD, 0x91, FIL_, 0xCD, 0x82, FIL_, 0xE1, 0xBF, + 0xA6, FIL_, 0xCC, 0x88, FIL_, 0xCF, 0x8B, FIL_, + 0xCC, 0x84, FIL_, 0xE1, 0xBF, 0xA1, FIL_, 0xCC, + 0x80, FIL_, 0xE1, 0xBD, 0xBA, FIL_, 0xCC, 0x93, + FIL_, 0xE1, 0xBD, 0x90, FIL_, 0xCC, 0x86, FIL_, + 0xE1, 0xBF, 0xA0, FIL_, 0x06, 0xCC, 0x80, FIL_, + 0xE1, 0xBD, 0xBC, FIL_, 0xCC, 0x94, FIL_, 0xE1, + 0xBD, 0xA1, FIL_, 0xCC, 0x93, FIL_, 0xE1, 0xBD, + 0xA0, FIL_, 0xCC, 0x81, FIL_, 0xCF, 0x8E, FIL_, + 0xCD, 0x85, FIL_, 0xE1, 0xBF, 0xB3, FIL_, 0xCD, + 0x82, FIL_, 0xE1, 0xBF, 0xB6, FIL_, 0x03, 0xCC, + 0x80, FIL_, 0xE1, 0xBF, 0x92, FIL_, 0xCD, 0x82, + FIL_, 0xE1, 0xBF, 0x97, FIL_, 0xCC, 0x81, FIL_, + 0xCE, 0x90, FIL_, 0x03, 0xCD, 0x82, FIL_, 0xE1, + 0xBF, 0xA7, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBF, + 0xA2, FIL_, 0xCC, 0x81, FIL_, 0xCE, 0xB0, FIL_, + 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBF, 0xB4, FIL_, + 0x02, 0xCC, 0x88, FIL_, 0xCF, 0x94, FIL_, 0xCC, + 0x81, FIL_, 0xCF, 0x93, FIL_, 0x01, 0xCC, 0x88, + FIL_, 0xD0, 0x87, FIL_, 0x02, 0xCC, 0x88, FIL_, + 0xD3, 0x92, FIL_, 0xCC, 0x86, FIL_, 0xD3, 0x90, + FIL_, 0x01, 0xCC, 0x81, FIL_, 0xD0, 0x83, FIL_, + 0x03, 0xCC, 0x88, FIL_, 0xD0, 0x81, FIL_, 0xCC, + 0x80, FIL_, 0xD0, 0x80, FIL_, 0xCC, 0x86, FIL_, + 0xD3, 0x96, FIL_, 0x02, 0xCC, 0x86, FIL_, 0xD3, + 0x81, FIL_, 0xCC, 0x88, FIL_, 0xD3, 0x9C, FIL_, + 0x01, 0xCC, 0x88, FIL_, 0xD3, 0x9E, FIL_, 0x04, + 0xCC, 0x84, FIL_, 0xD3, 0xA2, FIL_, 0xCC, 0x88, + FIL_, 0xD3, 0xA4, FIL_, 0xCC, 0x86, 
FIL_, 0xD0, + 0x99, FIL_, 0xCC, 0x80, FIL_, 0xD0, 0x8D, FIL_, + 0x01, 0xCC, 0x81, FIL_, 0xD0, 0x8C, FIL_, 0x01, + 0xCC, 0x88, FIL_, 0xD3, 0xA6, FIL_, 0x04, 0xCC, + 0x8B, FIL_, 0xD3, 0xB2, FIL_, 0xCC, 0x88, FIL_, + 0xD3, 0xB0, FIL_, 0xCC, 0x86, FIL_, 0xD0, 0x8E, + FIL_, 0xCC, 0x84, FIL_, 0xD3, 0xAE, FIL_, 0x01, + 0xCC, 0x88, FIL_, 0xD3, 0xB4, FIL_, 0x01, 0xCC, + 0x88, FIL_, 0xD3, 0xB8, FIL_, 0x01, 0xCC, 0x88, + FIL_, 0xD3, 0xAC, FIL_, 0x02, 0xCC, 0x86, FIL_, + 0xD3, 0x91, FIL_, 0xCC, 0x88, FIL_, 0xD3, 0x93, + FIL_, 0x01, 0xCC, 0x81, FIL_, 0xD1, 0x93, FIL_, + 0x03, 0xCC, 0x80, FIL_, 0xD1, 0x90, FIL_, 0xCC, + 0x86, FIL_, 0xD3, 0x97, FIL_, 0xCC, 0x88, FIL_, + 0xD1, 0x91, FIL_, 0x02, 0xCC, 0x86, FIL_, 0xD3, + 0x82, FIL_, 0xCC, 0x88, FIL_, 0xD3, 0x9D, FIL_, + 0x01, 0xCC, 0x88, FIL_, 0xD3, 0x9F, FIL_, 0x04, + 0xCC, 0x86, FIL_, 0xD0, 0xB9, FIL_, 0xCC, 0x88, + FIL_, 0xD3, 0xA5, FIL_, 0xCC, 0x84, FIL_, 0xD3, + 0xA3, FIL_, 0xCC, 0x80, FIL_, 0xD1, 0x9D, FIL_, + 0x01, 0xCC, 0x81, FIL_, 0xD1, 0x9C, FIL_, 0x01, + 0xCC, 0x88, FIL_, 0xD3, 0xA7, FIL_, 0x04, 0xCC, + 0x8B, FIL_, 0xD3, 0xB3, FIL_, 0xCC, 0x84, FIL_, + 0xD3, 0xAF, FIL_, 0xCC, 0x86, FIL_, 0xD1, 0x9E, + FIL_, 0xCC, 0x88, FIL_, 0xD3, 0xB1, FIL_, 0x01, + 0xCC, 0x88, FIL_, 0xD3, 0xB5, FIL_, 0x01, 0xCC, + 0x88, FIL_, 0xD3, 0xB9, FIL_, 0x01, 0xCC, 0x88, + FIL_, 0xD3, 0xAD, FIL_, 0x01, 0xCC, 0x88, FIL_, + 0xD1, 0x97, FIL_, 0x01, 0xCC, 0x8F, FIL_, 0xD1, + 0xB6, FIL_, 0x01, 0xCC, 0x8F, FIL_, 0xD1, 0xB7, + FIL_, 0x01, 0xCC, 0x88, FIL_, 0xD3, 0x9A, FIL_, + 0x01, 0xCC, 0x88, FIL_, 0xD3, 0x9B, FIL_, 0x01, + 0xCC, 0x88, FIL_, 0xD3, 0xAA, FIL_, 0x01, 0xCC, + 0x88, FIL_, 0xD3, 0xAB, FIL_, 0x03, 0xD9, 0x94, + FIL_, 0xD8, 0xA3, FIL_, 0xD9, 0x95, FIL_, 0xD8, + 0xA5, FIL_, 0xD9, 0x93, FIL_, 0xD8, 0xA2, FIL_, + 0x01, 0xD9, 0x94, FIL_, 0xD8, 0xA4, FIL_, 0x01, + 0xD9, 0x94, FIL_, 0xD8, 0xA6, FIL_, 0x01, 0xD9, + 0x94, FIL_, 0xDB, 0x82, FIL_, 0x01, 0xD9, 0x94, + FIL_, 0xDB, 0x93, FIL_, 0x01, 0xD9, 0x94, FIL_, + 0xDB, 0x80, FIL_, 0x01, 0xE0, 0xA4, 0xBC, FIL_, + 0xE0, 0xA4, 0xA9, FIL_, 0x01, 0xE0, 0xA4, 0xBC, + FIL_, 0xE0, 0xA4, 0xB1, FIL_, 0x01, 0xE0, 0xA4, + 0xBC, FIL_, 0xE0, 0xA4, 0xB4, FIL_, 0x02, 0xE0, + 0xA6, 0xBE, FIL_, 0xE0, 0xA7, 0x8B, FIL_, 0xE0, + 0xA7, 0x97, FIL_, 0xE0, 0xA7, 0x8C, FIL_, 0x03, + 0xE0, 0xAD, 0x96, FIL_, 0xE0, 0xAD, 0x88, FIL_, + 0xE0, 0xAC, 0xBE, FIL_, 0xE0, 0xAD, 0x8B, FIL_, + 0xE0, 0xAD, 0x97, FIL_, 0xE0, 0xAD, 0x8C, FIL_, + 0x01, 0xE0, 0xAF, 0x97, FIL_, 0xE0, 0xAE, 0x94, + FIL_, 0x02, 0xE0, 0xAF, 0x97, FIL_, 0xE0, 0xAF, + 0x8C, FIL_, 0xE0, 0xAE, 0xBE, FIL_, 0xE0, 0xAF, + 0x8A, FIL_, 0x01, 0xE0, 0xAE, 0xBE, FIL_, 0xE0, + 0xAF, 0x8B, FIL_, 0x01, 0xE0, 0xB1, 0x96, FIL_, + 0xE0, 0xB1, 0x88, FIL_, 0x01, 0xE0, 0xB3, 0x95, + FIL_, 0xE0, 0xB3, 0x80, FIL_, 0x03, 0xE0, 0xB3, + 0x82, FIL_, 0xE0, 0xB3, 0x8A, FIL_, 0xE0, 0xB3, + 0x96, FIL_, 0xE0, 0xB3, 0x88, FIL_, 0xE0, 0xB3, + 0x95, FIL_, 0xE0, 0xB3, 0x87, FIL_, 0x01, 0xE0, + 0xB3, 0x95, FIL_, 0xE0, 0xB3, 0x8B, FIL_, 0x02, + 0xE0, 0xB4, 0xBE, FIL_, 0xE0, 0xB5, 0x8A, FIL_, + 0xE0, 0xB5, 0x97, FIL_, 0xE0, 0xB5, 0x8C, FIL_, + 0x01, 0xE0, 0xB4, 0xBE, FIL_, 0xE0, 0xB5, 0x8B, + FIL_, 0x03, 0xE0, 0xB7, 0x9F, FIL_, 0xE0, 0xB7, + 0x9E, FIL_, 0xE0, 0xB7, 0x8A, FIL_, 0xE0, 0xB7, + 0x9A, FIL_, 0xE0, 0xB7, 0x8F, FIL_, 0xE0, 0xB7, + 0x9C, FIL_, 0x01, 0xE0, 0xB7, 0x8A, FIL_, 0xE0, + 0xB7, 0x9D, FIL_, 0x01, 0xE1, 0x80, 0xAE, FIL_, + 0xE1, 0x80, 0xA6, FIL_, 0x01, 0xE1, 0xAC, 0xB5, + FIL_, 0xE1, 0xAC, 0x86, FIL_, 0x01, 0xE1, 0xAC, + 0xB5, FIL_, 0xE1, 0xAC, 0x88, FIL_, 0x01, 0xE1, + 0xAC, 0xB5, FIL_, 0xE1, 0xAC, 0x8A, 
FIL_, 0x01, + 0xE1, 0xAC, 0xB5, FIL_, 0xE1, 0xAC, 0x8C, FIL_, + 0x01, 0xE1, 0xAC, 0xB5, FIL_, 0xE1, 0xAC, 0x8E, + FIL_, 0x01, 0xE1, 0xAC, 0xB5, FIL_, 0xE1, 0xAC, + 0x92, FIL_, 0x01, 0xE1, 0xAC, 0xB5, FIL_, 0xE1, + 0xAC, 0xBB, FIL_, 0x01, 0xE1, 0xAC, 0xB5, FIL_, + 0xE1, 0xAC, 0xBD, FIL_, 0x01, 0xE1, 0xAC, 0xB5, + FIL_, 0xE1, 0xAD, 0x80, FIL_, 0x01, 0xE1, 0xAC, + 0xB5, FIL_, 0xE1, 0xAD, 0x81, FIL_, 0x01, 0xE1, + 0xAC, 0xB5, FIL_, 0xE1, 0xAD, 0x83, FIL_, 0x01, + 0xCC, 0x84, FIL_, 0xE1, 0xB8, 0xB8, FIL_, 0x01, + 0xCC, 0x84, FIL_, 0xE1, 0xB8, 0xB9, FIL_, 0x01, + 0xCC, 0x84, FIL_, 0xE1, 0xB9, 0x9C, FIL_, 0x01, + 0xCC, 0x84, FIL_, 0xE1, 0xB9, 0x9D, FIL_, 0x01, + 0xCC, 0x87, FIL_, 0xE1, 0xB9, 0xA8, FIL_, 0x01, + 0xCC, 0x87, FIL_, 0xE1, 0xB9, 0xA9, FIL_, 0x02, + 0xCC, 0x86, FIL_, 0xE1, 0xBA, 0xB6, FIL_, 0xCC, + 0x82, FIL_, 0xE1, 0xBA, 0xAC, FIL_, 0x02, 0xCC, + 0x82, FIL_, 0xE1, 0xBA, 0xAD, FIL_, 0xCC, 0x86, + FIL_, 0xE1, 0xBA, 0xB7, FIL_, 0x01, 0xCC, 0x82, + FIL_, 0xE1, 0xBB, 0x86, FIL_, 0x01, 0xCC, 0x82, + FIL_, 0xE1, 0xBB, 0x87, FIL_, 0x01, 0xCC, 0x82, + FIL_, 0xE1, 0xBB, 0x98, FIL_, 0x01, 0xCC, 0x82, + FIL_, 0xE1, 0xBB, 0x99, FIL_, 0x04, 0xCD, 0x85, + FIL_, 0xE1, 0xBE, 0x80, FIL_, 0xCD, 0x82, FIL_, + 0xE1, 0xBC, 0x86, FIL_, 0xCC, 0x80, FIL_, 0xE1, + 0xBC, 0x82, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBC, + 0x84, FIL_, 0x04, 0xCD, 0x82, FIL_, 0xE1, 0xBC, + 0x87, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBC, 0x85, + FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBC, 0x83, FIL_, + 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x81, FIL_, 0x01, + 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x82, FIL_, 0x01, + 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x83, FIL_, 0x01, + 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x84, FIL_, 0x01, + 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x85, FIL_, 0x01, + 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x86, FIL_, 0x01, + 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x87, FIL_, 0x04, + 0xCC, 0x81, FIL_, 0xE1, 0xBC, 0x8C, FIL_, 0xCC, + 0x80, FIL_, 0xE1, 0xBC, 0x8A, FIL_, 0xCD, 0x85, + FIL_, 0xE1, 0xBE, 0x88, FIL_, 0xCD, 0x82, FIL_, + 0xE1, 0xBC, 0x8E, FIL_, 0x04, 0xCC, 0x80, FIL_, + 0xE1, 0xBC, 0x8B, FIL_, 0xCD, 0x82, FIL_, 0xE1, + 0xBC, 0x8F, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBC, + 0x8D, FIL_, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x89, + FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x8A, + FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x8B, + FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x8C, + FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x8D, + FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x8E, + FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x8F, + FIL_, 0x02, 0xCC, 0x80, FIL_, 0xE1, 0xBC, 0x92, + FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBC, 0x94, FIL_, + 0x02, 0xCC, 0x80, FIL_, 0xE1, 0xBC, 0x93, FIL_, + 0xCC, 0x81, FIL_, 0xE1, 0xBC, 0x95, FIL_, 0x02, + 0xCC, 0x80, FIL_, 0xE1, 0xBC, 0x9A, FIL_, 0xCC, + 0x81, FIL_, 0xE1, 0xBC, 0x9C, FIL_, 0x02, 0xCC, + 0x80, FIL_, 0xE1, 0xBC, 0x9B, FIL_, 0xCC, 0x81, + FIL_, 0xE1, 0xBC, 0x9D, FIL_, 0x04, 0xCC, 0x80, + FIL_, 0xE1, 0xBC, 0xA2, FIL_, 0xCC, 0x81, FIL_, + 0xE1, 0xBC, 0xA4, FIL_, 0xCD, 0x82, FIL_, 0xE1, + 0xBC, 0xA6, FIL_, 0xCD, 0x85, FIL_, 0xE1, 0xBE, + 0x90, FIL_, 0x04, 0xCD, 0x85, FIL_, 0xE1, 0xBE, + 0x91, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBC, 0xA5, + FIL_, 0xCD, 0x82, FIL_, 0xE1, 0xBC, 0xA7, FIL_, + 0xCC, 0x80, FIL_, 0xE1, 0xBC, 0xA3, FIL_, 0x01, + 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x92, FIL_, 0x01, + 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x93, FIL_, 0x01, + 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x94, FIL_, 0x01, + 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x95, FIL_, 0x01, + 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x96, FIL_, 0x01, + 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x97, FIL_, 0x04, + 0xCC, 0x81, FIL_, 0xE1, 0xBC, 0xAC, 
FIL_, 0xCC, + 0x80, FIL_, 0xE1, 0xBC, 0xAA, FIL_, 0xCD, 0x85, + FIL_, 0xE1, 0xBE, 0x98, FIL_, 0xCD, 0x82, FIL_, + 0xE1, 0xBC, 0xAE, FIL_, 0x04, 0xCD, 0x82, FIL_, + 0xE1, 0xBC, 0xAF, FIL_, 0xCD, 0x85, FIL_, 0xE1, + 0xBE, 0x99, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBC, + 0xAD, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBC, 0xAB, + FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x9A, + FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x9B, + FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x9C, + FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x9D, + FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x9E, + FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x9F, + FIL_, 0x03, 0xCC, 0x81, FIL_, 0xE1, 0xBC, 0xB4, + FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBC, 0xB2, FIL_, + 0xCD, 0x82, FIL_, 0xE1, 0xBC, 0xB6, FIL_, 0x03, + 0xCC, 0x80, FIL_, 0xE1, 0xBC, 0xB3, FIL_, 0xCD, + 0x82, FIL_, 0xE1, 0xBC, 0xB7, FIL_, 0xCC, 0x81, + FIL_, 0xE1, 0xBC, 0xB5, FIL_, 0x03, 0xCC, 0x81, + FIL_, 0xE1, 0xBC, 0xBC, FIL_, 0xCC, 0x80, FIL_, + 0xE1, 0xBC, 0xBA, FIL_, 0xCD, 0x82, FIL_, 0xE1, + 0xBC, 0xBE, FIL_, 0x03, 0xCC, 0x80, FIL_, 0xE1, + 0xBC, 0xBB, FIL_, 0xCD, 0x82, FIL_, 0xE1, 0xBC, + 0xBF, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBC, 0xBD, + FIL_, 0x02, 0xCC, 0x80, FIL_, 0xE1, 0xBD, 0x82, + FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBD, 0x84, FIL_, + 0x02, 0xCC, 0x81, FIL_, 0xE1, 0xBD, 0x85, FIL_, + 0xCC, 0x80, FIL_, 0xE1, 0xBD, 0x83, FIL_, 0x02, + 0xCC, 0x80, FIL_, 0xE1, 0xBD, 0x8A, FIL_, 0xCC, + 0x81, FIL_, 0xE1, 0xBD, 0x8C, FIL_, 0x02, 0xCC, + 0x80, FIL_, 0xE1, 0xBD, 0x8B, FIL_, 0xCC, 0x81, + FIL_, 0xE1, 0xBD, 0x8D, FIL_, 0x03, 0xCD, 0x82, + FIL_, 0xE1, 0xBD, 0x96, FIL_, 0xCC, 0x80, FIL_, + 0xE1, 0xBD, 0x92, FIL_, 0xCC, 0x81, FIL_, 0xE1, + 0xBD, 0x94, FIL_, 0x03, 0xCC, 0x80, FIL_, 0xE1, + 0xBD, 0x93, FIL_, 0xCD, 0x82, FIL_, 0xE1, 0xBD, + 0x97, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBD, 0x95, + FIL_, 0x03, 0xCC, 0x80, FIL_, 0xE1, 0xBD, 0x9B, + FIL_, 0xCD, 0x82, FIL_, 0xE1, 0xBD, 0x9F, FIL_, + 0xCC, 0x81, FIL_, 0xE1, 0xBD, 0x9D, FIL_, 0x04, + 0xCD, 0x82, FIL_, 0xE1, 0xBD, 0xA6, FIL_, 0xCD, + 0x85, FIL_, 0xE1, 0xBE, 0xA0, FIL_, 0xCC, 0x80, + FIL_, 0xE1, 0xBD, 0xA2, FIL_, 0xCC, 0x81, FIL_, + 0xE1, 0xBD, 0xA4, FIL_, 0x04, 0xCD, 0x85, FIL_, + 0xE1, 0xBE, 0xA1, FIL_, 0xCD, 0x82, FIL_, 0xE1, + 0xBD, 0xA7, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBD, + 0xA5, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBD, 0xA3, + FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0xA2, + FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0xA3, + FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0xA4, + FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0xA5, + FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0xA6, + FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0xA7, + FIL_, 0x04, 0xCC, 0x80, FIL_, 0xE1, 0xBD, 0xAA, + FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBD, 0xAC, FIL_, + 0xCD, 0x82, FIL_, 0xE1, 0xBD, 0xAE, FIL_, 0xCD, + 0x85, FIL_, 0xE1, 0xBE, 0xA8, FIL_, 0x04, 0xCD, + 0x82, FIL_, 0xE1, 0xBD, 0xAF, FIL_, 0xCC, 0x80, + FIL_, 0xE1, 0xBD, 0xAB, FIL_, 0xCD, 0x85, FIL_, + 0xE1, 0xBE, 0xA9, FIL_, 0xCC, 0x81, FIL_, 0xE1, + 0xBD, 0xAD, FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, + 0xBE, 0xAA, FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, + 0xBE, 0xAB, FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, + 0xBE, 0xAC, FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, + 0xBE, 0xAD, FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, + 0xBE, 0xAE, FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, + 0xBE, 0xAF, FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, + 0xBE, 0xB2, FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, + 0xBF, 0x82, FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, + 0xBF, 0xB2, FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, + 0xBE, 0xB7, FIL_, 0x03, 0xCC, 0x81, FIL_, 0xE1, + 0xBF, 0x8E, FIL_, 0xCC, 0x80, FIL_, 
0xE1, 0xBF, + 0x8D, FIL_, 0xCD, 0x82, FIL_, 0xE1, 0xBF, 0x8F, + FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBF, 0x87, + FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBF, 0xB7, + FIL_, 0x03, 0xCC, 0x80, FIL_, 0xE1, 0xBF, 0x9D, + FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBF, 0x9E, FIL_, + 0xCD, 0x82, FIL_, 0xE1, 0xBF, 0x9F, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x86, 0x9A, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x86, 0x9B, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x86, 0xAE, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x87, 0x8D, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x87, 0x8F, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x87, 0x8E, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x88, 0x84, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x88, 0x89, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x88, 0x8C, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x88, 0xA4, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x88, 0xA6, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x89, 0x81, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x89, 0x84, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x89, 0x87, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x89, 0x89, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x89, 0xAD, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x89, 0xA2, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x89, 0xB0, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x89, 0xB1, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x89, 0xB4, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x89, 0xB5, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x89, 0xB8, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x89, 0xB9, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x8A, 0x80, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x8A, 0x81, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x8B, 0xA0, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x8B, 0xA1, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x8A, 0x84, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x8A, 0x85, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x8A, 0x88, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x8A, 0x89, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x8B, 0xA2, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x8B, 0xA3, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x8A, 0xAC, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x8A, 0xAD, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x8A, 0xAE, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x8A, 0xAF, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x8B, 0xAA, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x8B, 0xAB, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x8B, 0xAC, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x8B, 0xAD, FIL_, 0x01, + 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x82, 0x94, FIL_, + 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x81, 0x8C, + FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x81, + 0x8E, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, + 0x81, 0x90, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, + 0xE3, 0x81, 0x92, FIL_, 0x01, 0xE3, 0x82, 0x99, + FIL_, 0xE3, 0x81, 0x94, FIL_, 0x01, 0xE3, 0x82, + 0x99, FIL_, 0xE3, 0x81, 0x96, FIL_, 0x01, 0xE3, + 0x82, 0x99, FIL_, 0xE3, 0x81, 0x98, FIL_, 0x01, + 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x81, 0x9A, FIL_, + 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x81, 0x9C, + FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x81, + 0x9E, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, + 0x81, 0xA0, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, + 0xE3, 0x81, 0xA2, FIL_, 0x01, 0xE3, 0x82, 0x99, + FIL_, 0xE3, 0x81, 0xA5, FIL_, 0x01, 0xE3, 0x82, + 0x99, FIL_, 0xE3, 0x81, 0xA7, FIL_, 0x01, 0xE3, + 0x82, 0x99, FIL_, 0xE3, 0x81, 0xA9, FIL_, 0x02, + 0xE3, 0x82, 0x9A, FIL_, 0xE3, 0x81, 0xB1, FIL_, + 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x81, 0xB0, FIL_, + 0x02, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x81, 0xB3, + FIL_, 0xE3, 0x82, 0x9A, FIL_, 0xE3, 0x81, 0xB4, + FIL_, 0x02, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x81, + 0xB6, FIL_, 0xE3, 0x82, 0x9A, FIL_, 
0xE3, 0x81, + 0xB7, FIL_, 0x02, 0xE3, 0x82, 0x9A, FIL_, 0xE3, + 0x81, 0xBA, FIL_, 0xE3, 0x82, 0x99, FIL_, 0xE3, + 0x81, 0xB9, FIL_, 0x02, 0xE3, 0x82, 0x9A, FIL_, + 0xE3, 0x81, 0xBD, FIL_, 0xE3, 0x82, 0x99, FIL_, + 0xE3, 0x81, 0xBC, FIL_, 0x01, 0xE3, 0x82, 0x99, + FIL_, 0xE3, 0x82, 0x9E, FIL_, 0x01, 0xE3, 0x82, + 0x99, FIL_, 0xE3, 0x83, 0xB4, FIL_, 0x01, 0xE3, + 0x82, 0x99, FIL_, 0xE3, 0x82, 0xAC, FIL_, 0x01, + 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x82, 0xAE, FIL_, + 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x82, 0xB0, + FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x82, + 0xB2, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, + 0x82, 0xB4, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, + 0xE3, 0x82, 0xB6, FIL_, 0x01, 0xE3, 0x82, 0x99, + FIL_, 0xE3, 0x82, 0xB8, FIL_, 0x01, 0xE3, 0x82, + 0x99, FIL_, 0xE3, 0x82, 0xBA, FIL_, 0x01, 0xE3, + 0x82, 0x99, FIL_, 0xE3, 0x82, 0xBC, FIL_, 0x01, + 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x82, 0xBE, FIL_, + 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x83, 0x80, + FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x83, + 0x82, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, + 0x83, 0x85, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, + 0xE3, 0x83, 0x87, FIL_, 0x01, 0xE3, 0x82, 0x99, + FIL_, 0xE3, 0x83, 0x89, FIL_, 0x02, 0xE3, 0x82, + 0x99, FIL_, 0xE3, 0x83, 0x90, FIL_, 0xE3, 0x82, + 0x9A, FIL_, 0xE3, 0x83, 0x91, FIL_, 0x02, 0xE3, + 0x82, 0x99, FIL_, 0xE3, 0x83, 0x93, FIL_, 0xE3, + 0x82, 0x9A, FIL_, 0xE3, 0x83, 0x94, FIL_, 0x02, + 0xE3, 0x82, 0x9A, FIL_, 0xE3, 0x83, 0x97, FIL_, + 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x83, 0x96, FIL_, + 0x02, 0xE3, 0x82, 0x9A, FIL_, 0xE3, 0x83, 0x9A, + FIL_, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x83, 0x99, + FIL_, 0x02, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x83, + 0x9C, FIL_, 0xE3, 0x82, 0x9A, FIL_, 0xE3, 0x83, + 0x9D, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, + 0x83, 0xB7, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, + 0xE3, 0x83, 0xB8, FIL_, 0x01, 0xE3, 0x82, 0x99, + FIL_, 0xE3, 0x83, 0xB9, FIL_, 0x01, 0xE3, 0x82, + 0x99, FIL_, 0xE3, 0x83, 0xBA, FIL_, 0x01, 0xE3, + 0x82, 0x99, FIL_, 0xE3, 0x83, 0xBE, FIL_, + }, +}; + +static const uchar_t u8_decomp_b2_tbl[2][2][256] = { + { + { + 0, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + 1, 2, 3, 4, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, 5, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + }, + { + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, 
N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, 6, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, 7, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + }, + + }, + { + { + 0, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + 1, 2, 3, 4, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, 5, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + }, + { + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, 6, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, 7, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + }, + + }, + +}; + +static const u8_displacement_t u8_decomp_b3_tbl[2][8][256] = { + { + { /* Third byte table 0. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { 0, 0 }, + { 1, 35 }, { 2, 247 }, { 3, 474 }, + { 4, 693 }, { 5, 709 }, { 6, 951 }, + { N_, 0 }, { 7, 1139 }, { 8, 1152 }, + { N_, 0 }, { 9, 1177 }, { 10, 1199 }, + { 11, 1295 }, { 12, 1360 }, { 13, 1405 }, + { N_, 0 }, { 14, 1450 }, { N_, 0 }, + { N_, 0 }, { 15, 1620 }, { N_, 0 }, + { 16, 1624 }, { 17, 1649 }, { N_, 0 }, + { 18, 1665 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, + }, + { /* Third byte table 1. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { 19, 1680 }, + { 20, 1701 }, { N_, 0 }, { 21, 1757 }, + { 22, 1792 }, { 23, 1806 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { 24, 1834 }, + { 25, 1869 }, { 26, 1876 }, { N_, 0 }, + { 27, 1897 }, { N_, 0 }, { 28, 1904 }, + { N_, 0 }, { 29, 1942 }, { N_, 0 }, + { 30, 1963 }, { 31, 1994 }, { N_, 0 }, + { 32, 2000 }, { 33, 2006 }, { 34, 2018 }, + { 35, 2021 }, { 36, 2109 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, + }, + { /* Third byte table 2. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { 37, 2158 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { 0x8000, 2165 }, { 0x8001, 2445 }, + { 0x8002, 2741 }, { 0x8003, 3029 }, { 0x8004, 3337 }, + { 0x8005, 3725 }, { 0x8006, 4053 }, { 0x8007, 4536 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, + }, + { /* Third byte table 3. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { 38, 4895 }, + { 39, 4964 }, { 40, 4999 }, { N_, 0 }, + { 41, 5018 }, { 42, 5098 }, { 43, 5230 }, + { 44, 5248 }, { 45, 5266 }, { 46, 5326 }, + { 47, 5410 }, { 48, 5470 }, { 49, 5518 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { 50, 5526 }, { 51, 5596 }, + { 52, 5767 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { 53, 5810 }, { 54, 5822 }, { N_, 0 }, + { 55, 5830 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { 56, 5836 }, { 57, 5839 }, { 58, 5842 }, + { 59, 6034 }, { 60, 6226 }, { 61, 6418 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, + }, + { /* Third byte table 4. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { 62, 6484 }, + { 63, 6497 }, { 64, 6672 }, { 65, 6770 }, + { 66, 6923 }, { 67, 6968 }, { 68, 7160 }, + { N_, 0 }, { 0x8008, 7247 }, { 69, 7597 }, + { 70, 7773 }, { 71, 7950 }, { 0x8009, 8142 }, + { 0x800A, 8919 }, { 72, 9351 }, { 73, 9522 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, + }, + { /* Third byte table 5. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { 0x800B, 9743 }, + { 0x800C, 9999 }, { 0x800D, 10255 }, { 0x800E, 10511 }, + { 74, 10767 }, { 75, 10967 }, { N_, 0 }, + { N_, 0 }, { 76, 11139 }, { 77, 11303 }, + { 78, 11468 }, { 79, 11576 }, { 0x800F, 11740 }, + { 0x8010, 12006 }, { 0x8011, 12280 }, { 0x8012, 12546 }, + { 80, 12812 }, { 0x8013, 13060 }, { 0x8014, 13348 }, + { 81, 13720 }, { 82, 13898 }, { 83, 13933 }, + { 84, 14045 }, { 85, 14197 }, { 86, 14347 }, + { 87, 14410 }, { 88, 14540 }, { 89, 14729 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, + }, + { /* Third byte table 6. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { 90, 14829 }, { 91, 14912 }, + { 92, 14969 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { 93, 14982 }, { 94, 15046 }, { 95, 15109 }, + { 96, 15163 }, { 97, 15225 }, { 98, 15282 }, + { 99, 15341 }, { 100, 15405 }, { 101, 15469 }, + { 102, 15533 }, { 103, 15597 }, { 104, 15681 }, + { 105, 15812 }, { 106, 15942 }, { 107, 16072 }, + { 108, 16202 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, + }, + { /* Third byte table 7. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { 0x8015, 16273 }, { 0x8016, 16536 }, + { 0x8017, 16799 }, { 0x8018, 17064 }, { 0x8019, 17329 }, + { 0x801A, 17601 }, { 0x801B, 17878 }, { 0x801C, 18147 }, + { 109, 18419 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, + }, + }, + { + { /* Third byte table 0. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { 0, 0 }, + { 1, 35 }, { 2, 247 }, { 3, 474 }, + { 4, 693 }, { 5, 709 }, { 6, 951 }, + { N_, 0 }, { 7, 1139 }, { 8, 1152 }, + { N_, 0 }, { 9, 1177 }, { 10, 1199 }, + { 11, 1295 }, { 12, 1362 }, { 13, 1407 }, + { N_, 0 }, { 14, 1452 }, { N_, 0 }, + { N_, 0 }, { 15, 1622 }, { N_, 0 }, + { 16, 1626 }, { 17, 1651 }, { N_, 0 }, + { 18, 1667 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, + }, + { /* Third byte table 1. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { 19, 1682 }, + { 20, 1703 }, { N_, 0 }, { 21, 1759 }, + { 22, 1794 }, { 23, 1808 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { 24, 1836 }, + { 25, 1871 }, { 26, 1878 }, { N_, 0 }, + { 27, 1899 }, { N_, 0 }, { 28, 1906 }, + { N_, 0 }, { 29, 1944 }, { N_, 0 }, + { 30, 1965 }, { 31, 1996 }, { N_, 0 }, + { 32, 2002 }, { 33, 2008 }, { 34, 2020 }, + { 35, 2023 }, { 36, 2111 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, + }, + { /* Third byte table 2. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { 37, 2160 }, + { N_, 0 }, { N_, 0 }, { 38, 2167 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { 39, 2170 }, { 40, 2226 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { 41, 2247 }, { 42, 2268 }, { 43, 2340 }, + { N_, 0 }, { 0x8000, 2414 }, { 0x8001, 2694 }, + { 0x8002, 2990 }, { 0x8003, 3278 }, { 0x8004, 3586 }, + { 0x8005, 3974 }, { 0x8006, 4302 }, { 0x8007, 4785 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, + }, + { /* Third byte table 3. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { 44, 5144 }, + { 45, 5213 }, { 46, 5248 }, { N_, 0 }, + { 47, 5273 }, { 48, 5358 }, { 49, 5490 }, + { 50, 5508 }, { 51, 5526 }, { 52, 5586 }, + { 53, 5670 }, { 54, 5730 }, { 55, 5778 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { 56, 5786 }, { 57, 5856 }, + { 58, 6027 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { 59, 6070 }, { 60, 6082 }, { N_, 0 }, + { 61, 6090 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { 62, 6096 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { 63, 6099 }, { 64, 6102 }, { 65, 6105 }, + { 66, 6297 }, { 67, 6489 }, { 68, 6681 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, + }, + { /* Third byte table 4. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { 69, 6747 }, + { 70, 6760 }, { 71, 6935 }, { 72, 7033 }, + { 73, 7186 }, { 74, 7231 }, { 75, 7423 }, + { N_, 0 }, { 0x8008, 7510 }, { 76, 7891 }, + { 77, 8103 }, { 78, 8280 }, { 0x8009, 8482 }, + { 0x800A, 9259 }, { 79, 9701 }, { 80, 9872 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, + }, + { /* Third byte table 5. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { 0x800B, 10106 }, + { 0x800C, 10362 }, { 0x800D, 10618 }, { 0x800E, 10874 }, + { 81, 11130 }, { 82, 11330 }, { 0x800F, 11566 }, + { 83, 11822 }, { 84, 11932 }, { 85, 12096 }, + { 86, 12261 }, { 87, 12369 }, { 0x8010, 12533 }, + { 0x8011, 12799 }, { 0x8012, 13073 }, { 0x8013, 13339 }, + { 88, 13605 }, { 0x8014, 13853 }, { 0x8015, 14141 }, + { 89, 14513 }, { 90, 14691 }, { 91, 14746 }, + { 92, 14860 }, { 93, 15012 }, { 94, 15162 }, + { 95, 15225 }, { 96, 15355 }, { 97, 15544 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, + }, + { /* Third byte table 6. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { 98, 15644 }, { 99, 15727 }, + { 100, 15784 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { 101, 15797 }, { 102, 15861 }, { 103, 15924 }, + { 104, 15978 }, { 105, 16041 }, { 106, 16098 }, + { 107, 16157 }, { 108, 16221 }, { 109, 16285 }, + { 110, 16349 }, { 111, 16413 }, { 112, 16501 }, + { 113, 16632 }, { 114, 16762 }, { 115, 16892 }, + { 116, 17022 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, + }, + { /* Third byte table 7. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { 0x8016, 17097 }, { 0x8017, 17360 }, + { 0x8018, 17623 }, { 0x8019, 17888 }, { 0x801A, 18153 }, + { 0x801B, 18425 }, { 0x801C, 18702 }, { 0x801D, 18971 }, + { 117, 19243 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, + }, + }, +}; + +static const uchar_t u8_decomp_b4_tbl[2][118][257] = { + { + { /* Fourth byte table 0. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 1, 1, 1, 1, 1, 1, + 1, 4, 4, 5, 5, 5, 5, 5, + 8, 8, 8, 9, 10, 13, 15, 15, + 15, 18, 19, 20, 20, 25, 30, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, + }, + { /* Fourth byte table 1. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 24, + 28, 32, 36, 40, 44, 48, 52, 56, + 60, 60, 64, 68, 72, 76, 80, 84, + 84, 84, 88, 92, 96, 100, 104, 104, + 104, 108, 112, 116, 120, 124, 128, 128, + 132, 136, 140, 144, 148, 152, 156, 160, + 164, 164, 168, 172, 176, 180, 184, 188, + 188, 188, 192, 196, 200, 204, 208, 208, + 212, 212, 212, 212, 212, 212, 212, 212, + 212, 212, 212, 212, 212, 212, 212, 212, + 212, 212, 212, 212, 212, 212, 212, 212, + 212, 212, 212, 212, 212, 212, 212, 212, + 212, 212, 212, 212, 212, 212, 212, 212, + 212, 212, 212, 212, 212, 212, 212, 212, + 212, 212, 212, 212, 212, 212, 212, 212, + 212, 212, 212, 212, 212, 212, 212, 212, + 212, + }, + { /* Fourth byte table 2. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 52, 56, 60, + 64, 64, 64, 68, 72, 76, 80, 84, + 88, 92, 96, 100, 104, 108, 112, 116, + 120, 124, 128, 132, 136, 140, 144, 144, + 144, 148, 152, 156, 160, 164, 168, 172, + 176, 180, 180, 182, 184, 188, 192, 196, + 200, 200, 204, 208, 212, 216, 220, 224, + 227, 227, 227, 227, 227, 227, 227, 227, + 227, 227, 227, 227, 227, 227, 227, 227, + 227, 227, 227, 227, 227, 227, 227, 227, + 227, 227, 227, 227, 227, 227, 227, 227, + 227, 227, 227, 227, 227, 227, 227, 227, + 227, 227, 227, 227, 227, 227, 227, 227, + 227, 227, 227, 227, 227, 227, 227, 227, + 227, 227, 227, 227, 227, 227, 227, 227, + 227, + }, + { /* Fourth byte table 3. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 3, 3, 7, 11, 15, 19, + 23, 27, 30, 30, 30, 34, 38, 42, + 46, 50, 54, 54, 54, 58, 62, 66, + 70, 74, 78, 82, 86, 90, 94, 98, + 102, 106, 110, 114, 118, 122, 126, 126, + 126, 130, 134, 138, 142, 146, 150, 154, + 158, 162, 166, 170, 174, 178, 182, 186, + 190, 194, 198, 202, 206, 210, 214, 218, + 219, 219, 219, 219, 219, 219, 219, 219, + 219, 219, 219, 219, 219, 219, 219, 219, + 219, 219, 219, 219, 219, 219, 219, 219, + 219, 219, 219, 219, 219, 219, 219, 219, + 219, 219, 219, 219, 219, 219, 219, 219, + 219, 219, 219, 219, 219, 219, 219, 219, + 219, 219, 219, 219, 219, 219, 219, 219, + 219, 219, 219, 219, 219, 219, 219, 219, + 219, + }, + { /* Fourth byte table 4. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 12, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, + }, + { /* Fourth byte table 5. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 4, 8, 12, + 14, 16, 18, 20, 22, 24, 28, 32, + 36, 40, 44, 48, 52, 56, 62, 68, + 74, 80, 86, 92, 98, 104, 104, 110, + 116, 122, 128, 133, 138, 138, 138, 142, + 146, 150, 154, 158, 162, 168, 174, 179, + 184, 188, 190, 192, 194, 198, 202, 202, + 202, 206, 210, 216, 222, 227, 232, 237, + 242, 242, 242, 242, 242, 242, 242, 242, + 242, 242, 242, 242, 242, 242, 242, 242, + 242, 242, 242, 242, 242, 242, 242, 242, + 242, 242, 242, 242, 242, 242, 242, 242, + 242, 242, 242, 242, 242, 242, 242, 242, + 242, 242, 242, 242, 242, 242, 242, 242, + 242, 242, 242, 242, 242, 242, 242, 242, + 242, 242, 242, 242, 242, 242, 242, 242, + 242, + }, + { /* Fourth byte table 6. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 52, 56, 60, + 64, 68, 72, 76, 80, 84, 88, 92, + 96, 100, 104, 108, 112, 112, 112, 116, + 120, 120, 120, 120, 120, 120, 120, 124, + 128, 132, 136, 142, 148, 154, 160, 164, + 168, 174, 180, 184, 188, 188, 188, 188, + 188, 188, 188, 188, 188, 188, 188, 188, + 188, 188, 188, 188, 188, 188, 188, 188, + 188, 188, 188, 188, 188, 188, 188, 188, + 188, 188, 188, 188, 188, 188, 188, 188, + 188, 188, 188, 188, 188, 188, 188, 188, + 188, 188, 188, 188, 188, 188, 188, 188, + 188, 188, 188, 188, 188, 188, 188, 188, + 188, 188, 188, 188, 188, 188, 188, 188, + 188, 188, 188, 188, 188, 188, 188, 188, + 188, + }, + { /* Fourth byte table 7. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 3, 4, 5, 7, 9, 11, + 12, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, + }, + { /* Fourth byte table 8. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 18, + 18, 20, 21, 22, 23, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, + }, + { /* Fourth byte table 9. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 6, 9, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 17, 17, 17, + 17, 17, 17, 20, 20, 20, 20, 22, + 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, + 22, + }, + { /* Fourth byte table 10. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 3, 14, 19, + 22, 27, 32, 37, 37, 42, 42, 47, + 52, 59, 59, 59, 59, 59, 59, 59, + 59, 59, 59, 59, 59, 59, 59, 59, + 59, 59, 59, 59, 59, 59, 59, 59, + 59, 59, 59, 64, 69, 74, 79, 84, + 89, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, + }, + { /* Fourth byte table 11. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 5, 10, 15, 20, 25, + 25, 27, 29, 31, 41, 51, 53, 55, + 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, + 55, 57, 59, 61, 61, 63, 65, 65, + 65, 65, 65, 65, 65, 65, 65, 65, + 65, 65, 65, 65, 65, 65, 65, 65, + 65, 65, 65, 65, 65, 65, 65, 65, + 65, 65, 65, 65, 65, 65, 65, 65, + 65, 65, 65, 65, 65, 65, 65, 65, + 65, 65, 65, 65, 65, 65, 65, 65, + 65, 65, 65, 65, 65, 65, 65, 65, + 65, 65, 65, 65, 65, 65, 65, 65, + 65, 65, 65, 65, 65, 65, 65, 65, + 65, + }, + { /* Fourth byte table 12. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 5, 10, 10, 15, 15, 15, 15, + 20, 20, 20, 20, 20, 25, 30, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 40, 40, 40, 40, 40, 40, + 40, 40, 40, 40, 40, 40, 40, 40, + 40, 40, 40, 40, 40, 40, 40, 40, + 40, 40, 40, 40, 40, 40, 40, 40, + 40, 40, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, + }, + { /* Fourth byte table 13. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 5, 10, 10, 15, 15, 15, 15, + 20, 20, 20, 20, 20, 25, 30, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 40, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, + }, + { /* Fourth byte table 14. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 5, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, + 10, 15, 20, 25, 30, 30, 30, 35, + 40, 40, 40, 45, 50, 55, 60, 65, + 70, 70, 70, 75, 80, 85, 90, 95, + 100, 100, 100, 105, 110, 115, 120, 125, + 130, 135, 140, 145, 150, 155, 160, 160, + 160, 165, 170, 170, 170, 170, 170, 170, + 170, 170, 170, 170, 170, 170, 170, 170, + 170, 170, 170, 170, 170, 170, 170, 170, + 170, 170, 170, 170, 170, 170, 170, 170, + 170, 170, 170, 170, 170, 170, 170, 170, + 170, 170, 170, 170, 170, 170, 170, 170, + 170, 170, 170, 170, 170, 170, 170, 170, + 170, 170, 170, 170, 170, 170, 170, 170, + 170, 170, 170, 170, 170, 170, 170, 170, + 170, + }, + { /* Fourth byte table 15. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, + }, + { /* Fourth byte table 16. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 5, 10, 15, 20, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, + }, + { /* Fourth byte table 17. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 4, 8, + 12, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, + }, + { /* Fourth byte table 18. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 5, 5, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, + 15, + }, + { /* Fourth byte table 19. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 7, 7, 7, 7, 7, 7, + 7, 7, 14, 14, 14, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, + }, + { /* Fourth byte table 20. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 7, 14, 21, 28, 35, 42, 49, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, + }, + { /* Fourth byte table 21. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 7, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 21, 28, 28, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, + }, + { /* Fourth byte table 22. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 7, 7, 7, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, + }, + { /* Fourth byte table 23. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 7, 14, 21, 21, 21, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, + }, + { /* Fourth byte table 24. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 7, 7, 7, 14, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 28, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, + }, + { /* Fourth byte table 25. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, + }, + { /* Fourth byte table 26. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 7, 14, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, + }, + { /* Fourth byte table 27. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, + }, + { /* Fourth byte table 28. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 7, 7, 7, 7, 7, 7, 7, + 14, 21, 21, 28, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, + }, + { /* Fourth byte table 29. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 7, 14, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, + }, + { /* Fourth byte table 30. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 7, 7, 14, 24, 31, + 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, + 31, + }, + { /* Fourth byte table 31. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, + }, + { /* Fourth byte table 32. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, + }, + { /* Fourth byte table 33. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 6, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, + }, + { /* Fourth byte table 34. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, + }, + { /* Fourth byte table 35. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 14, 14, + 14, 14, 14, 21, 21, 21, 21, 21, + 28, 28, 28, 28, 28, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 49, 49, 56, 63, + 72, 79, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, + }, + { /* Fourth byte table 36. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 21, 21, + 21, 21, 21, 28, 28, 28, 28, 28, + 35, 35, 35, 35, 35, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 49, 49, 49, 49, 49, 49, + 49, 49, 49, 49, 49, 49, 49, 49, + 49, 49, 49, 49, 49, 49, 49, 49, + 49, 49, 49, 49, 49, 49, 49, 49, + 49, 49, 49, 49, 49, 49, 49, 49, + 49, 49, 49, 49, 49, 49, 49, 49, + 49, 49, 49, 49, 49, 49, 49, 49, + 49, 49, 49, 49, 49, 49, 49, 49, + 49, 49, 49, 49, 49, 49, 49, 49, + 49, + }, + { /* Fourth byte table 37. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, + }, + { /* Fourth byte table 38. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 6, 12, 13, 14, 15, 16, 17, + 18, 19, 20, 21, 21, 21, 21, 21, + 21, 21, 24, 24, 24, 24, 24, 24, + 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 28, 30, 33, + 33, 33, 33, 33, 33, 33, 33, 33, + 34, 34, 34, 34, 40, 49, 49, 55, + 64, 64, 64, 64, 64, 66, 66, 69, + 69, 69, 69, 69, 69, 69, 69, 69, + 69, 69, 69, 69, 69, 69, 69, 69, + 69, 69, 69, 69, 69, 69, 69, 69, + 69, 69, 69, 69, 69, 69, 69, 69, + 69, 69, 69, 69, 69, 69, 69, 69, + 69, 69, 69, 69, 69, 69, 69, 69, + 69, 69, 69, 69, 69, 69, 69, 69, + 69, 69, 69, 69, 69, 69, 69, 69, + 69, + }, + { /* Fourth byte table 39. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 2, 4, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 18, 18, 18, 18, 18, 18, 18, 18, + 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, + 19, 20, 21, 21, 21, 22, 23, 24, + 25, 26, 27, 28, 31, 32, 33, 34, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, + }, + { /* Fourth byte table 40. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 14, 15, 16, 17, + 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, + 17, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, + 19, + }, + { /* Fourth byte table 41. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 7, 10, 10, 13, 16, + 18, 18, 21, 22, 23, 24, 25, 26, + 28, 29, 30, 31, 32, 32, 33, 35, + 35, 35, 36, 37, 38, 39, 40, 40, + 40, 42, 45, 47, 47, 48, 48, 51, + 51, 52, 52, 54, 58, 59, 60, 60, + 61, 62, 63, 63, 64, 65, 67, 69, + 71, 73, 74, 74, 74, 74, 76, 78, + 80, 80, 80, 80, 80, 80, 80, 80, + 80, 80, 80, 80, 80, 80, 80, 80, + 80, 80, 80, 80, 80, 80, 80, 80, + 80, 80, 80, 80, 80, 80, 80, 80, + 80, 80, 80, 80, 80, 80, 80, 80, + 80, 80, 80, 80, 80, 80, 80, 80, + 80, 80, 80, 80, 80, 80, 80, 80, + 80, 80, 80, 80, 80, 80, 80, 80, + 80, + }, + { /* Fourth byte table 42. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 3, 3, 3, 3, 4, 5, + 6, 7, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 13, 18, 23, 28, + 33, 38, 43, 48, 53, 58, 63, 68, + 72, 73, 75, 78, 80, 81, 83, 86, + 90, 92, 93, 95, 98, 99, 100, 101, + 102, 103, 105, 108, 110, 111, 113, 116, + 120, 122, 123, 125, 128, 129, 130, 131, + 132, 132, 132, 132, 132, 132, 132, 132, + 132, 132, 132, 132, 132, 132, 132, 132, + 132, 132, 132, 132, 132, 132, 132, 132, + 132, 132, 132, 132, 132, 132, 132, 132, + 132, 132, 132, 132, 132, 132, 132, 132, + 132, 132, 132, 132, 132, 132, 132, 132, + 132, 132, 132, 132, 132, 132, 132, 132, + 132, 132, 132, 132, 132, 132, 132, 132, + 132, + }, + { /* Fourth byte table 43. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 6, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, + }, + { /* Fourth byte table 44. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 6, 12, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, + }, + { /* Fourth byte table 45. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 6, 6, 6, + 6, 6, 12, 12, 12, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 24, 24, 30, + 30, 30, 30, 30, 30, 36, 45, 45, + 51, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, + }, + { /* Fourth byte table 46. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 6, 6, 6, 12, 12, 12, + 18, 18, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 28, 28, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 40, 44, + 48, 54, 60, 60, 60, 66, 72, 72, + 72, 78, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, + 84, + }, + { /* Fourth byte table 47. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 6, 12, 12, 12, 18, 24, 24, + 24, 30, 36, 36, 36, 36, 36, 36, + 36, 36, 36, 36, 36, 36, 36, 36, + 36, 36, 36, 36, 36, 36, 36, 36, + 36, 36, 36, 36, 36, 36, 36, 36, + 36, 36, 36, 36, 36, 42, 48, 54, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, + }, + { /* Fourth byte table 48. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 6, 12, 18, 24, 24, 24, 24, + 24, 24, 24, 30, 36, 42, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, + }, + { /* Fourth byte table 49. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 4, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, + }, + { /* Fourth byte table 50. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 11, 13, 15, 17, 19, 21, + 23, 25, 27, 29, 31, 34, 37, 40, + 43, 46, 49, 52, 55, 58, 62, 66, + 70, 70, 70, 70, 70, 70, 70, 70, + 70, 70, 70, 70, 70, 70, 70, 70, + 70, 70, 70, 70, 70, 70, 70, 70, + 70, 70, 70, 70, 70, 70, 70, 70, + 70, 70, 70, 70, 70, 70, 70, 70, + 70, 70, 70, 70, 70, 70, 70, 70, + 70, 70, 70, 70, 70, 70, 70, 70, + 70, 70, 70, 70, 70, 70, 70, 70, + 70, + }, + { /* Fourth byte table 51. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 34, 36, 38, 40, 42, 44, 46, + 48, 50, 53, 56, 59, 62, 65, 68, + 71, 74, 77, 80, 83, 86, 89, 92, + 95, 98, 101, 104, 107, 110, 113, 116, + 119, 122, 125, 128, 131, 134, 137, 140, + 143, 146, 149, 152, 155, 158, 161, 162, + 163, 164, 165, 166, 167, 168, 169, 170, + 171, 171, 171, 171, 171, 171, 171, 171, + 171, 171, 171, 171, 171, 171, 171, 171, + 171, 171, 171, 171, 171, 171, 171, 171, + 171, 171, 171, 171, 171, 171, 171, 171, + 171, 171, 171, 171, 171, 171, 171, 171, + 171, 171, 171, 171, 171, 171, 171, 171, + 171, 171, 171, 171, 171, 171, 171, 171, + 171, 171, 171, 171, 171, 171, 171, 171, + 171, + }, + { /* Fourth byte table 52. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 43, 43, 43, 43, + 43, 43, 43, 43, 43, 43, 43, 43, + 43, 43, 43, 43, 43, 43, 43, 43, + 43, 43, 43, 43, 43, 43, 43, 43, + 43, 43, 43, 43, 43, 43, 43, 43, + 43, 43, 43, 43, 43, 43, 43, 43, + 43, 43, 43, 43, 43, 43, 43, 43, + 43, 43, 43, 43, 43, 43, 43, 43, + 43, 43, 43, 43, 43, 43, 43, 43, + 43, 43, 43, 43, 43, 43, 43, 43, + 43, 43, 43, 43, 43, 43, 43, 43, + 43, + }, + { /* Fourth byte table 53. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, + }, + { /* Fourth byte table 54. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 3, 5, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, + }, + { /* Fourth byte table 55. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, + }, + { /* Fourth byte table 56. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, + }, + { /* Fourth byte table 57. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, + }, + { /* Fourth byte table 58. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 21, + 24, 27, 30, 33, 36, 39, 42, 45, + 48, 51, 54, 57, 60, 63, 66, 69, + 72, 75, 78, 81, 84, 87, 90, 93, + 96, 99, 102, 105, 108, 111, 114, 117, + 120, 123, 126, 129, 132, 135, 138, 141, + 144, 147, 150, 153, 156, 159, 162, 165, + 168, 171, 174, 177, 180, 183, 186, 189, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, + }, + { /* Fourth byte table 59. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 21, + 24, 27, 30, 33, 36, 39, 42, 45, + 48, 51, 54, 57, 60, 63, 66, 69, + 72, 75, 78, 81, 84, 87, 90, 93, + 96, 99, 102, 105, 108, 111, 114, 117, + 120, 123, 126, 129, 132, 135, 138, 141, + 144, 147, 150, 153, 156, 159, 162, 165, + 168, 171, 174, 177, 180, 183, 186, 189, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, + }, + { /* Fourth byte table 60. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 21, + 24, 27, 30, 33, 36, 39, 42, 45, + 48, 51, 54, 57, 60, 63, 66, 69, + 72, 75, 78, 81, 84, 87, 90, 93, + 96, 99, 102, 105, 108, 111, 114, 117, + 120, 123, 126, 129, 132, 135, 138, 141, + 144, 147, 150, 153, 156, 159, 162, 165, + 168, 171, 174, 177, 180, 183, 186, 189, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, + }, + { /* Fourth byte table 61. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 21, + 24, 27, 30, 33, 36, 39, 42, 45, + 48, 51, 54, 57, 60, 63, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, + }, + { /* Fourth byte table 62. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 4, + 4, 7, 10, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, + }, + { /* Fourth byte table 63. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 7, 7, 14, + 14, 21, 21, 28, 28, 35, 35, 42, + 42, 49, 49, 56, 56, 63, 63, 70, + 70, 77, 77, 84, 84, 84, 91, 91, + 98, 98, 105, 105, 105, 105, 105, 105, + 105, 112, 119, 119, 126, 133, 133, 140, + 147, 147, 154, 161, 161, 168, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, + }, + { /* Fourth byte table 64. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 7, 7, 7, + 7, 7, 7, 7, 11, 15, 15, 22, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 35, 35, 42, + 42, 49, 49, 56, 56, 63, 63, 70, + 70, 77, 77, 84, 84, 91, 91, 98, + 98, 98, 98, 98, 98, 98, 98, 98, + 98, 98, 98, 98, 98, 98, 98, 98, + 98, 98, 98, 98, 98, 98, 98, 98, + 98, 98, 98, 98, 98, 98, 98, 98, + 98, 98, 98, 98, 98, 98, 98, 98, + 98, 98, 98, 98, 98, 98, 98, 98, + 98, 98, 98, 98, 98, 98, 98, 98, + 98, 98, 98, 98, 98, 98, 98, 98, + 98, + }, + { /* Fourth byte table 65. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 7, 7, 14, 14, 14, 21, 21, + 28, 28, 35, 35, 35, 35, 35, 35, + 35, 42, 49, 49, 56, 63, 63, 70, + 77, 77, 84, 91, 91, 98, 105, 105, + 105, 105, 105, 105, 105, 105, 105, 105, + 105, 105, 105, 105, 105, 105, 105, 105, + 105, 105, 105, 105, 105, 112, 112, 112, + 119, 126, 133, 140, 140, 140, 140, 147, + 153, 153, 153, 153, 153, 153, 153, 153, + 153, 153, 153, 153, 153, 153, 153, 153, + 153, 153, 153, 153, 153, 153, 153, 153, + 153, 153, 153, 153, 153, 153, 153, 153, + 153, 153, 153, 153, 153, 153, 153, 153, + 153, 153, 153, 153, 153, 153, 153, 153, + 153, 153, 153, 153, 153, 153, 153, 153, + 153, 153, 153, 153, 153, 153, 153, 153, + 153, + }, + { /* Fourth byte table 66. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 3, 6, 9, 12, 15, 18, + 21, 24, 27, 30, 33, 36, 39, 42, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, + }, + { /* Fourth byte table 67. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 21, + 24, 27, 30, 33, 36, 39, 42, 45, + 48, 51, 54, 57, 60, 63, 66, 69, + 72, 75, 78, 81, 84, 87, 90, 93, + 96, 99, 102, 105, 108, 111, 114, 117, + 120, 123, 126, 129, 132, 135, 138, 141, + 144, 147, 150, 153, 156, 159, 162, 165, + 168, 171, 174, 177, 180, 183, 186, 189, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, + }, + { /* Fourth byte table 68. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 21, + 24, 27, 30, 33, 36, 39, 42, 45, + 45, 45, 45, 48, 51, 54, 57, 60, + 63, 66, 69, 72, 75, 78, 81, 84, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, + }, + { /* Fourth byte table 69. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 5, 10, 15, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 22, 24, 26, 28, 30, 32, + 34, 36, 38, 40, 42, 44, 46, 48, + 50, 53, 56, 59, 62, 65, 68, 71, + 74, 77, 80, 83, 86, 89, 92, 98, + 104, 110, 116, 122, 128, 134, 140, 146, + 152, 158, 164, 170, 176, 176, 176, 176, + 176, 176, 176, 176, 176, 176, 176, 176, + 176, 176, 176, 176, 176, 176, 176, 176, + 176, 176, 176, 176, 176, 176, 176, 176, + 176, 176, 176, 176, 176, 176, 176, 176, + 176, 176, 176, 176, 176, 176, 176, 176, + 176, 176, 176, 176, 176, 176, 176, 176, + 176, 176, 176, 176, 176, 176, 176, 176, + 176, 176, 176, 176, 176, 176, 176, 176, + 176, + }, + { /* Fourth byte table 70. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 21, + 24, 27, 30, 33, 36, 39, 42, 45, + 48, 51, 54, 57, 60, 63, 66, 69, + 72, 75, 78, 81, 84, 87, 90, 93, + 96, 99, 102, 105, 108, 111, 114, 117, + 120, 123, 126, 129, 132, 135, 138, 141, + 144, 147, 149, 151, 153, 155, 157, 159, + 161, 163, 165, 167, 169, 171, 173, 175, + 177, 177, 177, 177, 177, 177, 177, 177, + 177, 177, 177, 177, 177, 177, 177, 177, + 177, 177, 177, 177, 177, 177, 177, 177, + 177, 177, 177, 177, 177, 177, 177, 177, + 177, 177, 177, 177, 177, 177, 177, 177, + 177, 177, 177, 177, 177, 177, 177, 177, + 177, 177, 177, 177, 177, 177, 177, 177, + 177, 177, 177, 177, 177, 177, 177, 177, + 177, + }, + { /* Fourth byte table 71. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 41, 46, 51, 51, 51, 51, + 51, 54, 57, 60, 63, 66, 69, 72, + 75, 78, 81, 84, 87, 90, 93, 96, + 99, 102, 105, 108, 111, 114, 117, 120, + 123, 126, 129, 132, 135, 138, 141, 144, + 147, 150, 153, 156, 159, 162, 165, 168, + 171, 174, 177, 180, 183, 186, 189, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, + }, + { /* Fourth byte table 72. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 4, 7, 9, 11, 13, 15, + 17, 20, 24, 26, 28, 31, 34, 36, + 38, 40, 43, 46, 49, 52, 55, 57, + 59, 61, 63, 65, 68, 70, 72, 74, + 77, 80, 82, 85, 88, 91, 93, 96, + 101, 107, 109, 112, 115, 118, 121, 128, + 136, 138, 140, 143, 145, 147, 149, 152, + 154, 156, 158, 160, 162, 165, 167, 169, + 171, 171, 171, 171, 171, 171, 171, 171, + 171, 171, 171, 171, 171, 171, 171, 171, + 171, 171, 171, 171, 171, 171, 171, 171, + 171, 171, 171, 171, 171, 171, 171, 171, + 171, 171, 171, 171, 171, 171, 171, 171, + 171, 171, 171, 171, 171, 171, 171, 171, + 171, 171, 171, 171, 171, 171, 171, 171, + 171, 171, 171, 171, 171, 171, 171, 171, + 171, + }, + { /* Fourth byte table 73. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 10, 12, 14, 16, 22, + 25, 27, 29, 31, 33, 35, 37, 39, + 41, 43, 45, 48, 50, 52, 55, 58, + 60, 64, 67, 69, 71, 73, 75, 75, + 75, 79, 83, 87, 91, 95, 99, 103, + 107, 111, 116, 121, 126, 131, 136, 141, + 146, 151, 156, 161, 166, 171, 176, 181, + 186, 191, 196, 201, 206, 211, 216, 221, + 221, 221, 221, 221, 221, 221, 221, 221, + 221, 221, 221, 221, 221, 221, 221, 221, + 221, 221, 221, 221, 221, 221, 221, 221, + 221, 221, 221, 221, 221, 221, 221, 221, + 221, 221, 221, 221, 221, 221, 221, 221, + 221, 221, 221, 221, 221, 221, 221, 221, + 221, 221, 221, 221, 221, 221, 221, 221, + 221, 221, 221, 221, 221, 221, 221, 221, + 221, + }, + { /* Fourth byte table 74. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 52, 56, 56, + 56, 60, 60, 64, 64, 64, 68, 72, + 76, 80, 84, 88, 92, 96, 100, 104, + 104, 108, 108, 112, 112, 112, 116, 120, + 120, 120, 120, 124, 128, 132, 136, 136, + 136, 140, 144, 148, 152, 156, 160, 164, + 168, 172, 176, 180, 184, 188, 192, 196, + 200, 200, 200, 200, 200, 200, 200, 200, + 200, 200, 200, 200, 200, 200, 200, 200, + 200, 200, 200, 200, 200, 200, 200, 200, + 200, 200, 200, 200, 200, 200, 200, 200, + 200, 200, 200, 200, 200, 200, 200, 200, + 200, 200, 200, 200, 200, 200, 200, 200, + 200, 200, 200, 200, 200, 200, 200, 200, + 200, 200, 200, 200, 200, 200, 200, 200, + 200, + }, + { /* Fourth byte table 75. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 52, 56, 60, + 64, 68, 72, 76, 80, 84, 88, 92, + 96, 100, 104, 108, 112, 116, 120, 124, + 128, 132, 136, 140, 144, 148, 152, 156, + 160, 164, 168, 172, 172, 172, 172, 172, + 172, 172, 172, 172, 172, 172, 172, 172, + 172, 172, 172, 172, 172, 172, 172, 172, + 172, 172, 172, 172, 172, 172, 172, 172, + 172, 172, 172, 172, 172, 172, 172, 172, + 172, 172, 172, 172, 172, 172, 172, 172, + 172, 172, 172, 172, 172, 172, 172, 172, + 172, 172, 172, 172, 172, 172, 172, 172, + 172, 172, 172, 172, 172, 172, 172, 172, + 172, 172, 172, 172, 172, 172, 172, 172, + 172, 172, 172, 172, 172, 172, 172, 172, + 172, + }, + { /* Fourth byte table 76. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 4, 6, 9, 12, 14, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 20, 24, 28, 32, + 36, 36, 36, 36, 36, 36, 41, 41, + 46, 48, 50, 52, 54, 56, 58, 60, + 62, 64, 65, 70, 75, 82, 89, 94, + 99, 104, 109, 114, 119, 124, 129, 134, + 134, 139, 144, 149, 154, 159, 159, 164, + 164, 164, 164, 164, 164, 164, 164, 164, + 164, 164, 164, 164, 164, 164, 164, 164, + 164, 164, 164, 164, 164, 164, 164, 164, + 164, 164, 164, 164, 164, 164, 164, 164, + 164, 164, 164, 164, 164, 164, 164, 164, + 164, 164, 164, 164, 164, 164, 164, 164, + 164, 164, 164, 164, 164, 164, 164, 164, + 164, 164, 164, 164, 164, 164, 164, 164, + 164, + }, + { /* Fourth byte table 77. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 5, 10, 10, 15, 20, 20, 25, + 30, 35, 40, 45, 50, 55, 60, 65, + 69, 71, 73, 75, 77, 79, 81, 83, + 85, 87, 89, 91, 93, 95, 97, 99, + 101, 103, 105, 107, 109, 111, 113, 115, + 117, 119, 121, 123, 125, 127, 129, 131, + 133, 135, 137, 139, 141, 143, 145, 147, + 149, 151, 153, 155, 157, 159, 161, 163, + 165, 165, 165, 165, 165, 165, 165, 165, + 165, 165, 165, 165, 165, 165, 165, 165, + 165, 165, 165, 165, 165, 165, 165, 165, + 165, 165, 165, 165, 165, 165, 165, 165, + 165, 165, 165, 165, 165, 165, 165, 165, + 165, 165, 165, 165, 165, 165, 165, 165, + 165, 165, 165, 165, 165, 165, 165, 165, + 165, 165, 165, 165, 165, 165, 165, 165, + 165, + }, + { /* Fourth byte table 78. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30, + 32, 34, 36, 38, 40, 42, 44, 46, + 48, 50, 52, 54, 56, 58, 60, 62, + 64, 66, 68, 70, 72, 76, 80, 82, + 84, 86, 88, 90, 92, 94, 96, 98, + 100, 104, 108, 108, 108, 108, 108, 108, + 108, 108, 108, 108, 108, 108, 108, 108, + 108, 108, 108, 108, 108, 108, 108, 108, + 108, 108, 108, 108, 108, 108, 108, 108, + 108, 108, 108, 108, 108, 108, 108, 108, + 108, 108, 108, 108, 108, 108, 108, 108, + 108, 108, 108, 108, 108, 108, 108, 108, + 108, 108, 108, 108, 108, 108, 108, 108, + 108, 108, 108, 108, 108, 108, 108, 108, + 108, 108, 108, 108, 108, 108, 108, 108, + 108, + }, + { /* Fourth byte table 79. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 2, 4, 6, 8, + 10, 12, 14, 16, 18, 20, 24, 26, + 28, 30, 32, 34, 36, 38, 40, 42, + 44, 46, 48, 54, 60, 66, 72, 78, + 84, 90, 96, 102, 108, 114, 120, 126, + 132, 138, 144, 150, 156, 158, 160, 162, + 164, 164, 164, 164, 164, 164, 164, 164, + 164, 164, 164, 164, 164, 164, 164, 164, + 164, 164, 164, 164, 164, 164, 164, 164, + 164, 164, 164, 164, 164, 164, 164, 164, + 164, 164, 164, 164, 164, 164, 164, 164, + 164, 164, 164, 164, 164, 164, 164, 164, + 164, 164, 164, 164, 164, 164, 164, 164, + 164, 164, 164, 164, 164, 164, 164, 164, + 164, + }, + { /* Fourth byte table 80. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 52, 56, 60, + 64, 68, 72, 76, 80, 84, 88, 92, + 96, 100, 104, 108, 112, 116, 120, 124, + 128, 132, 136, 140, 144, 148, 152, 156, + 160, 164, 168, 172, 176, 180, 184, 188, + 192, 196, 200, 204, 208, 212, 216, 220, + 224, 228, 232, 236, 240, 244, 248, 248, + 248, 248, 248, 248, 248, 248, 248, 248, + 248, 248, 248, 248, 248, 248, 248, 248, + 248, 248, 248, 248, 248, 248, 248, 248, + 248, 248, 248, 248, 248, 248, 248, 248, + 248, 248, 248, 248, 248, 248, 248, 248, + 248, 248, 248, 248, 248, 248, 248, 248, + 248, 248, 248, 248, 248, 248, 248, 248, + 248, 248, 248, 248, 248, 248, 248, 248, + 248, + }, + { /* Fourth byte table 81. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 6, 12, 18, 24, 30, 36, 42, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 54, 60, 68, 76, 84, 92, 100, + 108, 116, 122, 155, 170, 178, 178, 178, + 178, 178, 178, 178, 178, 178, 178, 178, + 178, 178, 178, 178, 178, 178, 178, 178, + 178, 178, 178, 178, 178, 178, 178, 178, + 178, 178, 178, 178, 178, 178, 178, 178, + 178, 178, 178, 178, 178, 178, 178, 178, + 178, 178, 178, 178, 178, 178, 178, 178, + 178, 178, 178, 178, 178, 178, 178, 178, + 178, 178, 178, 178, 178, 178, 178, 178, + 178, + }, + { /* Fourth byte table 82. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 5, 8, 9, 10, 11, 12, + 13, 14, 17, 20, 23, 26, 29, 32, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, + }, + { /* Fourth byte table 83. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 15, 15, + 15, 15, 18, 21, 24, 27, 28, 29, + 30, 31, 34, 35, 35, 36, 37, 38, + 39, 42, 43, 44, 45, 46, 49, 52, + 53, 54, 55, 56, 57, 58, 59, 60, + 60, 61, 62, 63, 64, 64, 64, 64, + 64, 67, 71, 74, 74, 77, 77, 80, + 84, 87, 91, 94, 98, 101, 105, 108, + 112, 112, 112, 112, 112, 112, 112, 112, + 112, 112, 112, 112, 112, 112, 112, 112, + 112, 112, 112, 112, 112, 112, 112, 112, + 112, 112, 112, 112, 112, 112, 112, 112, + 112, 112, 112, 112, 112, 112, 112, 112, + 112, 112, 112, 112, 112, 112, 112, 112, + 112, 112, 112, 112, 112, 112, 112, 112, + 112, 112, 112, 112, 112, 112, 112, 112, + 112, + }, + { /* Fourth byte table 84. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 6, 10, 14, 18, 22, 26, + 30, 34, 38, 42, 46, 50, 52, 54, + 56, 58, 60, 62, 64, 66, 68, 70, + 72, 74, 76, 78, 80, 82, 84, 86, + 88, 90, 92, 94, 96, 98, 100, 102, + 104, 106, 108, 110, 112, 114, 116, 118, + 120, 122, 124, 126, 128, 130, 132, 134, + 136, 138, 140, 142, 144, 146, 148, 150, + 152, 152, 152, 152, 152, 152, 152, 152, + 152, 152, 152, 152, 152, 152, 152, 152, + 152, 152, 152, 152, 152, 152, 152, 152, + 152, 152, 152, 152, 152, 152, 152, 152, + 152, 152, 152, 152, 152, 152, 152, 152, + 152, 152, 152, 152, 152, 152, 152, 152, + 152, 152, 152, 152, 152, 152, 152, 152, + 152, 152, 152, 152, 152, 152, 152, 152, + 152, + }, + { /* Fourth byte table 85. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30, + 32, 34, 36, 38, 40, 42, 44, 46, + 48, 50, 52, 54, 56, 58, 60, 62, + 64, 66, 68, 70, 72, 74, 76, 78, + 80, 82, 84, 86, 88, 90, 92, 94, + 96, 98, 100, 102, 104, 106, 112, 118, + 124, 130, 136, 142, 146, 150, 150, 150, + 150, 150, 150, 150, 150, 150, 150, 150, + 150, 150, 150, 150, 150, 150, 150, 150, + 150, 150, 150, 150, 150, 150, 150, 150, + 150, 150, 150, 150, 150, 150, 150, 150, + 150, 150, 150, 150, 150, 150, 150, 150, + 150, 150, 150, 150, 150, 150, 150, 150, + 150, 150, 150, 150, 150, 150, 150, 150, + 150, 150, 150, 150, 150, 150, 150, 150, + 150, + }, + { /* Fourth byte table 86. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 1, 2, 3, 4, 5, 6, + 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, + 23, 24, 25, 26, 27, 28, 29, 30, + 31, 32, 33, 34, 35, 36, 37, 38, + 39, 40, 41, 42, 43, 44, 45, 46, + 47, 48, 49, 50, 51, 52, 53, 54, + 55, 56, 57, 58, 59, 60, 61, 62, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, + }, + { /* Fourth byte table 87. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 34, 37, 40, 43, 46, 49, 52, 55, + 58, 61, 64, 67, 70, 73, 76, 79, + 82, 85, 88, 91, 94, 97, 100, 103, + 106, 109, 112, 115, 118, 121, 124, 127, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, + }, + { /* Fourth byte table 88. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 21, + 24, 27, 30, 33, 36, 39, 42, 45, + 48, 51, 54, 57, 60, 63, 66, 69, + 72, 75, 78, 81, 84, 87, 90, 93, + 96, 99, 102, 105, 108, 111, 114, 117, + 120, 123, 126, 129, 132, 135, 138, 141, + 144, 147, 150, 153, 156, 159, 162, 165, + 168, 171, 174, 177, 180, 183, 186, 189, + 189, 189, 189, 189, 189, 189, 189, 189, + 189, 189, 189, 189, 189, 189, 189, 189, + 189, 189, 189, 189, 189, 189, 189, 189, + 189, 189, 189, 189, 189, 189, 189, 189, + 189, 189, 189, 189, 189, 189, 189, 189, + 189, 189, 189, 189, 189, 189, 189, 189, + 189, 189, 189, 189, 189, 189, 189, 189, + 189, 189, 189, 189, 189, 189, 189, 189, + 189, + }, + { /* Fourth byte table 89. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 3, 6, 9, 12, 15, + 18, 18, 18, 21, 24, 27, 30, 33, + 36, 36, 36, 39, 42, 45, 48, 51, + 54, 54, 54, 57, 60, 63, 63, 63, + 63, 65, 67, 69, 72, 74, 76, 79, + 79, 82, 85, 88, 91, 94, 97, 100, + 100, 100, 100, 100, 100, 100, 100, 100, + 100, 100, 100, 100, 100, 100, 100, 100, + 100, 100, 100, 100, 100, 100, 100, 100, + 100, 100, 100, 100, 100, 100, 100, 100, + 100, 100, 100, 100, 100, 100, 100, 100, + 100, 100, 100, 100, 100, 100, 100, 100, + 100, 100, 100, 100, 100, 100, 100, 100, + 100, 100, 100, 100, 100, 100, 100, 100, + 100, 100, 100, 100, 100, 100, 100, 100, + 100, 100, 100, 100, 100, 100, 100, 100, + 100, + }, + { /* Fourth byte table 90. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 9, + 18, 31, 44, 57, 70, 83, 83, 83, + 83, 83, 83, 83, 83, 83, 83, 83, + 83, 83, 83, 83, 83, 83, 83, 83, + 83, 83, 83, 83, 83, 83, 83, 83, + 83, 83, 83, 83, 83, 83, 83, 83, + 83, 83, 83, 83, 83, 83, 83, 83, + 83, 83, 83, 83, 83, 83, 83, 83, + 83, 83, 83, 83, 83, 83, 83, 83, + 83, 83, 83, 83, 83, 83, 83, 83, + 83, 83, 83, 83, 83, 83, 83, 83, + 83, 83, 83, 83, 83, 83, 83, 83, + 83, 83, 83, 83, 83, 83, 83, 83, + 83, + }, + { /* Fourth byte table 91. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 9, 18, 31, 44, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, + }, + { /* Fourth byte table 92. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, + }, + { /* Fourth byte table 93. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, + }, + { /* Fourth byte table 94. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 21, 22, + 23, 24, 25, 26, 27, 28, 29, 30, + 31, 32, 33, 34, 35, 36, 37, 38, + 39, 40, 41, 42, 43, 44, 45, 46, + 47, 48, 49, 50, 51, 52, 53, 54, + 55, 56, 57, 58, 59, 60, 61, 62, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, + }, + { /* Fourth byte table 95. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 29, 30, + 31, 31, 31, 32, 32, 32, 33, 34, + 34, 34, 35, 36, 37, 38, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 50, 51, 51, 52, 53, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, + }, + { /* Fourth byte table 96. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 1, 2, 3, 3, 4, 5, + 6, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 16, 17, 18, 19, 20, 21, + 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, + 38, 39, 40, 41, 42, 43, 44, 45, + 46, 47, 48, 49, 50, 51, 52, 53, + 54, 55, 56, 57, 58, 59, 60, 61, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, + }, + { /* Fourth byte table 97. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 6, + 7, 8, 9, 10, 10, 10, 11, 12, + 13, 14, 15, 16, 17, 18, 18, 19, + 20, 21, 22, 23, 24, 25, 25, 26, + 27, 28, 29, 30, 31, 32, 33, 34, + 35, 36, 37, 38, 39, 40, 41, 42, + 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 53, 54, 55, 56, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, + }, + { /* Fourth byte table 98. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 5, 6, + 6, 6, 6, 7, 8, 9, 10, 11, + 12, 13, 13, 14, 15, 16, 17, 18, + 19, 20, 21, 22, 23, 24, 25, 26, + 27, 28, 29, 30, 31, 32, 33, 34, + 35, 36, 37, 38, 39, 40, 41, 42, + 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, + 59, 59, 59, 59, 59, 59, 59, 59, + 59, 59, 59, 59, 59, 59, 59, 59, + 59, 59, 59, 59, 59, 59, 59, 59, + 59, 59, 59, 59, 59, 59, 59, 59, + 59, 59, 59, 59, 59, 59, 59, 59, + 59, 59, 59, 59, 59, 59, 59, 59, + 59, 59, 59, 59, 59, 59, 59, 59, + 59, 59, 59, 59, 59, 59, 59, 59, + 59, + }, + { /* Fourth byte table 99. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, + }, + { /* Fourth byte table 100. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, + }, + { /* Fourth byte table 101. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, + }, + { /* Fourth byte table 102. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, + }, + { /* Fourth byte table 103. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 36, 36, 36, + 36, 38, 40, 42, 44, 46, 48, 50, + 52, 54, 56, 58, 60, 62, 64, 66, + 68, 70, 72, 74, 76, 78, 80, 82, + 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, + 84, + }, + { /* Fourth byte table 104. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 5, 7, 9, 11, 13, 15, + 17, 19, 21, 23, 25, 27, 29, 31, + 33, 35, 37, 39, 41, 43, 45, 47, + 49, 51, 53, 55, 58, 60, 62, 64, + 66, 68, 70, 72, 74, 76, 78, 80, + 82, 84, 86, 88, 90, 92, 94, 96, + 98, 100, 102, 104, 106, 108, 110, 112, + 114, 116, 118, 120, 123, 125, 127, 129, + 131, 131, 131, 131, 131, 131, 131, 131, + 131, 131, 131, 131, 131, 131, 131, 131, + 131, 131, 131, 131, 131, 131, 131, 131, + 131, 131, 131, 131, 131, 131, 131, 131, + 131, 131, 131, 131, 131, 131, 131, 131, + 131, 131, 131, 131, 131, 131, 131, 131, + 131, 131, 131, 131, 131, 131, 131, 131, + 131, 131, 131, 131, 131, 131, 131, 131, + 131, + }, + { /* Fourth byte table 105. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30, + 32, 34, 36, 38, 40, 42, 45, 47, + 49, 51, 53, 55, 57, 59, 61, 63, + 65, 67, 69, 71, 73, 75, 77, 79, + 81, 83, 85, 87, 89, 91, 93, 95, + 97, 99, 101, 103, 105, 107, 110, 112, + 114, 116, 118, 120, 122, 124, 126, 128, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, + }, + { /* Fourth byte table 106. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30, + 33, 35, 37, 39, 41, 43, 45, 47, + 49, 51, 53, 55, 57, 59, 61, 63, + 65, 67, 69, 71, 73, 75, 77, 79, + 81, 83, 85, 87, 89, 91, 93, 95, + 98, 100, 102, 104, 106, 108, 110, 112, + 114, 116, 118, 120, 122, 124, 126, 128, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, + }, + { /* Fourth byte table 107. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 21, 23, 25, 27, 29, 31, + 33, 35, 37, 39, 41, 43, 45, 47, + 49, 51, 53, 55, 57, 59, 61, 63, + 65, 67, 69, 71, 73, 75, 77, 79, + 81, 83, 86, 88, 90, 92, 94, 96, + 98, 100, 102, 104, 106, 108, 110, 112, + 114, 116, 118, 120, 122, 124, 126, 128, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, + }, + { /* Fourth byte table 108. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 4, 6, 9, 11, 13, 15, + 17, 19, 21, 21, 21, 21, 21, 22, + 23, 24, 25, 26, 27, 28, 29, 30, + 31, 32, 33, 34, 35, 36, 37, 38, + 39, 40, 41, 42, 43, 44, 45, 46, + 47, 48, 49, 50, 51, 52, 53, 54, + 55, 56, 57, 58, 59, 60, 61, 62, + 63, 64, 65, 66, 67, 68, 69, 70, + 71, 71, 71, 71, 71, 71, 71, 71, + 71, 71, 71, 71, 71, 71, 71, 71, + 71, 71, 71, 71, 71, 71, 71, 71, + 71, 71, 71, 71, 71, 71, 71, 71, + 71, 71, 71, 71, 71, 71, 71, 71, + 71, 71, 71, 71, 71, 71, 71, 71, + 71, 71, 71, 71, 71, 71, 71, 71, + 71, 71, 71, 71, 71, 71, 71, 71, + 71, + }, + { /* Fourth byte table 109. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 9, 13, 17, 21, 25, 29, + 33, 37, 42, 46, 50, 54, 58, 62, + 66, 71, 75, 80, 85, 90, 94, 98, + 102, 106, 110, 114, 118, 122, 127, 127, + 127, 127, 127, 127, 127, 127, 127, 127, + 127, 127, 127, 127, 127, 127, 127, 127, + 127, 127, 127, 127, 127, 127, 127, 127, + 127, 127, 127, 127, 127, 127, 127, 127, + 127, 127, 127, 127, 127, 127, 127, 127, + 127, 127, 127, 127, 127, 127, 127, 127, + 127, 127, 127, 127, 127, 127, 127, 127, + 127, 127, 127, 127, 127, 127, 127, 127, + 127, 127, 127, 127, 127, 127, 127, 127, + 127, 127, 127, 127, 127, 127, 127, 127, + 127, 127, 127, 127, 127, 127, 127, 127, + 127, 127, 127, 127, 127, 127, 127, 127, + 127, + }, + { /* Fourth byte table 110. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, + }, + { /* Fourth byte table 111. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, + }, + { /* Fourth byte table 112. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, + }, + { /* Fourth byte table 113. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, + }, + { /* Fourth byte table 114. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, + }, + { /* Fourth byte table 115. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, + }, + { /* Fourth byte table 116. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, + }, + { /* Fourth byte table 117. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, + }, + }, + { + { /* Fourth byte table 0. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 1, 1, 1, 1, 1, 1, + 1, 4, 4, 5, 5, 5, 5, 5, + 8, 8, 8, 9, 10, 13, 15, 15, + 15, 18, 19, 20, 20, 25, 30, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, + }, + { /* Fourth byte table 1. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 24, + 28, 32, 36, 40, 44, 48, 52, 56, + 60, 60, 64, 68, 72, 76, 80, 84, + 84, 84, 88, 92, 96, 100, 104, 104, + 104, 108, 112, 116, 120, 124, 128, 128, + 132, 136, 140, 144, 148, 152, 156, 160, + 164, 164, 168, 172, 176, 180, 184, 188, + 188, 188, 192, 196, 200, 204, 208, 208, + 212, 212, 212, 212, 212, 212, 212, 212, + 212, 212, 212, 212, 212, 212, 212, 212, + 212, 212, 212, 212, 212, 212, 212, 212, + 212, 212, 212, 212, 212, 212, 212, 212, + 212, 212, 212, 212, 212, 212, 212, 212, + 212, 212, 212, 212, 212, 212, 212, 212, + 212, 212, 212, 212, 212, 212, 212, 212, + 212, 212, 212, 212, 212, 212, 212, 212, + 212, + }, + { /* Fourth byte table 2. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 52, 56, 60, + 64, 64, 64, 68, 72, 76, 80, 84, + 88, 92, 96, 100, 104, 108, 112, 116, + 120, 124, 128, 132, 136, 140, 144, 144, + 144, 148, 152, 156, 160, 164, 168, 172, + 176, 180, 180, 182, 184, 188, 192, 196, + 200, 200, 204, 208, 212, 216, 220, 224, + 227, 227, 227, 227, 227, 227, 227, 227, + 227, 227, 227, 227, 227, 227, 227, 227, + 227, 227, 227, 227, 227, 227, 227, 227, + 227, 227, 227, 227, 227, 227, 227, 227, + 227, 227, 227, 227, 227, 227, 227, 227, + 227, 227, 227, 227, 227, 227, 227, 227, + 227, 227, 227, 227, 227, 227, 227, 227, + 227, 227, 227, 227, 227, 227, 227, 227, + 227, + }, + { /* Fourth byte table 3. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 3, 3, 7, 11, 15, 19, + 23, 27, 30, 30, 30, 34, 38, 42, + 46, 50, 54, 54, 54, 58, 62, 66, + 70, 74, 78, 82, 86, 90, 94, 98, + 102, 106, 110, 114, 118, 122, 126, 126, + 126, 130, 134, 138, 142, 146, 150, 154, + 158, 162, 166, 170, 174, 178, 182, 186, + 190, 194, 198, 202, 206, 210, 214, 218, + 219, 219, 219, 219, 219, 219, 219, 219, + 219, 219, 219, 219, 219, 219, 219, 219, + 219, 219, 219, 219, 219, 219, 219, 219, + 219, 219, 219, 219, 219, 219, 219, 219, + 219, 219, 219, 219, 219, 219, 219, 219, + 219, 219, 219, 219, 219, 219, 219, 219, + 219, 219, 219, 219, 219, 219, 219, 219, + 219, 219, 219, 219, 219, 219, 219, 219, + 219, + }, + { /* Fourth byte table 4. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 12, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, + }, + { /* Fourth byte table 5. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 4, 8, 12, + 14, 16, 18, 20, 22, 24, 28, 32, + 36, 40, 44, 48, 52, 56, 62, 68, + 74, 80, 86, 92, 98, 104, 104, 110, + 116, 122, 128, 133, 138, 138, 138, 142, + 146, 150, 154, 158, 162, 168, 174, 179, + 184, 188, 190, 192, 194, 198, 202, 202, + 202, 206, 210, 216, 222, 227, 232, 237, + 242, 242, 242, 242, 242, 242, 242, 242, + 242, 242, 242, 242, 242, 242, 242, 242, + 242, 242, 242, 242, 242, 242, 242, 242, + 242, 242, 242, 242, 242, 242, 242, 242, + 242, 242, 242, 242, 242, 242, 242, 242, + 242, 242, 242, 242, 242, 242, 242, 242, + 242, 242, 242, 242, 242, 242, 242, 242, + 242, 242, 242, 242, 242, 242, 242, 242, + 242, + }, + { /* Fourth byte table 6. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 52, 56, 60, + 64, 68, 72, 76, 80, 84, 88, 92, + 96, 100, 104, 108, 112, 112, 112, 116, + 120, 120, 120, 120, 120, 120, 120, 124, + 128, 132, 136, 142, 148, 154, 160, 164, + 168, 174, 180, 184, 188, 188, 188, 188, + 188, 188, 188, 188, 188, 188, 188, 188, + 188, 188, 188, 188, 188, 188, 188, 188, + 188, 188, 188, 188, 188, 188, 188, 188, + 188, 188, 188, 188, 188, 188, 188, 188, + 188, 188, 188, 188, 188, 188, 188, 188, + 188, 188, 188, 188, 188, 188, 188, 188, + 188, 188, 188, 188, 188, 188, 188, 188, + 188, 188, 188, 188, 188, 188, 188, 188, + 188, 188, 188, 188, 188, 188, 188, 188, + 188, + }, + { /* Fourth byte table 7. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 3, 4, 5, 7, 9, 11, + 12, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, + }, + { /* Fourth byte table 8. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 18, + 18, 20, 21, 22, 23, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, + }, + { /* Fourth byte table 9. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 6, 9, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 17, 17, 17, + 17, 17, 17, 20, 20, 20, 20, 22, + 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, + 22, + }, + { /* Fourth byte table 10. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 3, 14, 19, + 22, 27, 32, 37, 37, 42, 42, 47, + 52, 59, 59, 59, 59, 59, 59, 59, + 59, 59, 59, 59, 59, 59, 59, 59, + 59, 59, 59, 59, 59, 59, 59, 59, + 59, 59, 59, 64, 69, 74, 79, 84, + 89, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, + }, + { /* Fourth byte table 11. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 5, 10, 15, 20, 25, + 25, 27, 29, 31, 41, 51, 53, 55, + 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, + 55, 57, 59, 61, 61, 63, 65, 65, + 65, 65, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, + 67, + }, + { /* Fourth byte table 12. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 5, 10, 10, 15, 15, 15, 15, + 20, 20, 20, 20, 20, 25, 30, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 40, 40, 40, 40, 40, 40, + 40, 40, 40, 40, 40, 40, 40, 40, + 40, 40, 40, 40, 40, 40, 40, 40, + 40, 40, 40, 40, 40, 40, 40, 40, + 40, 40, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, + }, + { /* Fourth byte table 13. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 5, 10, 10, 15, 15, 15, 15, + 20, 20, 20, 20, 20, 25, 30, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 40, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, + }, + { /* Fourth byte table 14. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 5, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, + 10, 15, 20, 25, 30, 30, 30, 35, + 40, 40, 40, 45, 50, 55, 60, 65, + 70, 70, 70, 75, 80, 85, 90, 95, + 100, 100, 100, 105, 110, 115, 120, 125, + 130, 135, 140, 145, 150, 155, 160, 160, + 160, 165, 170, 170, 170, 170, 170, 170, + 170, 170, 170, 170, 170, 170, 170, 170, + 170, 170, 170, 170, 170, 170, 170, 170, + 170, 170, 170, 170, 170, 170, 170, 170, + 170, 170, 170, 170, 170, 170, 170, 170, + 170, 170, 170, 170, 170, 170, 170, 170, + 170, 170, 170, 170, 170, 170, 170, 170, + 170, 170, 170, 170, 170, 170, 170, 170, + 170, 170, 170, 170, 170, 170, 170, 170, + 170, + }, + { /* Fourth byte table 15. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, + }, + { /* Fourth byte table 16. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 5, 10, 15, 20, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, + }, + { /* Fourth byte table 17. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 4, 8, + 12, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, + }, + { /* Fourth byte table 18. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 5, 5, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, + 15, + }, + { /* Fourth byte table 19. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 7, 7, 7, 7, 7, 7, + 7, 7, 14, 14, 14, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, + }, + { /* Fourth byte table 20. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 7, 14, 21, 28, 35, 42, 49, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, + }, + { /* Fourth byte table 21. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 7, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 21, 28, 28, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, + }, + { /* Fourth byte table 22. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 7, 7, 7, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, + }, + { /* Fourth byte table 23. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 7, 14, 21, 21, 21, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, + }, + { /* Fourth byte table 24. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 7, 7, 7, 14, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 28, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, + }, + { /* Fourth byte table 25. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, + }, + { /* Fourth byte table 26. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 7, 14, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, + }, + { /* Fourth byte table 27. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, + }, + { /* Fourth byte table 28. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 7, 7, 7, 7, 7, 7, 7, + 14, 21, 21, 28, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, + }, + { /* Fourth byte table 29. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 7, 14, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, + }, + { /* Fourth byte table 30. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 7, 7, 14, 24, 31, + 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, + 31, + }, + { /* Fourth byte table 31. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, + }, + { /* Fourth byte table 32. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, + }, + { /* Fourth byte table 33. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 6, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, + }, + { /* Fourth byte table 34. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, + }, + { /* Fourth byte table 35. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 14, 14, + 14, 14, 14, 21, 21, 21, 21, 21, + 28, 28, 28, 28, 28, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 49, 49, 56, 63, + 72, 79, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, + }, + { /* Fourth byte table 36. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 21, 21, + 21, 21, 21, 28, 28, 28, 28, 28, + 35, 35, 35, 35, 35, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 49, 49, 49, 49, 49, 49, + 49, 49, 49, 49, 49, 49, 49, 49, + 49, 49, 49, 49, 49, 49, 49, 49, + 49, 49, 49, 49, 49, 49, 49, 49, + 49, 49, 49, 49, 49, 49, 49, 49, + 49, 49, 49, 49, 49, 49, 49, 49, + 49, 49, 49, 49, 49, 49, 49, 49, + 49, 49, 49, 49, 49, 49, 49, 49, + 49, 49, 49, 49, 49, 49, 49, 49, + 49, + }, + { /* Fourth byte table 37. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, + }, + { /* Fourth byte table 38. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, + }, + { /* Fourth byte table 39. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 7, + 7, 14, 14, 21, 21, 28, 28, 35, + 35, 35, 35, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 49, 49, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, + }, + { /* Fourth byte table 40. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 7, 14, 14, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, + }, + { /* Fourth byte table 41. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 1, 3, 4, + 4, 5, 6, 8, 9, 10, 11, 12, + 13, 14, 15, 16, 16, 17, 19, 20, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, + }, + { /* Fourth byte table 42. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 6, 8, 11, + 12, 13, 14, 16, 18, 20, 21, 21, + 22, 23, 25, 26, 28, 31, 34, 35, + 36, 37, 40, 42, 43, 46, 48, 50, + 52, 54, 56, 57, 58, 59, 60, 62, + 64, 66, 68, 70, 70, 70, 70, 70, + 70, 70, 70, 70, 70, 70, 70, 70, + 70, 72, 72, 72, 72, 72, 72, 72, + 72, 72, 72, 72, 72, 72, 72, 72, + 72, 72, 72, 72, 72, 72, 72, 72, + 72, 72, 72, 72, 72, 72, 72, 72, + 72, 72, 72, 72, 72, 72, 72, 72, + 72, 72, 72, 72, 72, 72, 72, 72, + 72, 72, 72, 72, 72, 72, 72, 72, + 72, 72, 72, 72, 72, 72, 72, 72, + 72, 72, 72, 72, 72, 72, 72, 72, + 72, + }, + { /* Fourth byte table 43. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 2, 3, 5, 7, + 9, 10, 12, 14, 16, 18, 20, 22, + 25, 27, 29, 32, 34, 36, 38, 40, + 42, 44, 46, 48, 50, 52, 54, 56, + 58, 61, 63, 65, 66, 68, 70, 72, + 74, 74, 74, 74, 74, 74, 74, 74, + 74, 74, 74, 74, 74, 74, 74, 74, + 74, 74, 74, 74, 74, 74, 74, 74, + 74, 74, 74, 74, 74, 74, 74, 74, + 74, 74, 74, 74, 74, 74, 74, 74, + 74, 74, 74, 74, 74, 74, 74, 74, + 74, 74, 74, 74, 74, 74, 74, 74, + 74, 74, 74, 74, 74, 74, 74, 74, + 74, + }, + { /* Fourth byte table 44. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 6, 12, 13, 14, 15, 16, 17, + 18, 19, 20, 21, 21, 21, 21, 21, + 21, 21, 24, 24, 24, 24, 24, 24, + 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 28, 30, 33, + 33, 33, 33, 33, 33, 33, 33, 33, + 34, 34, 34, 34, 40, 49, 49, 55, + 64, 64, 64, 64, 64, 66, 66, 69, + 69, 69, 69, 69, 69, 69, 69, 69, + 69, 69, 69, 69, 69, 69, 69, 69, + 69, 69, 69, 69, 69, 69, 69, 69, + 69, 69, 69, 69, 69, 69, 69, 69, + 69, 69, 69, 69, 69, 69, 69, 69, + 69, 69, 69, 69, 69, 69, 69, 69, + 69, 69, 69, 69, 69, 69, 69, 69, + 69, 69, 69, 69, 69, 69, 69, 69, + 69, + }, + { /* Fourth byte table 45. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 2, 4, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 18, 18, 18, 18, 18, 18, 18, 18, + 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, + 19, 20, 21, 21, 21, 22, 23, 24, + 25, 26, 27, 28, 31, 32, 33, 34, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, + }, + { /* Fourth byte table 46. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 14, 15, 16, 17, + 17, 18, 19, 20, 21, 23, 23, 23, + 23, 23, 23, 23, 23, 23, 23, 23, + 23, 23, 23, 23, 23, 23, 23, 23, + 23, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, + }, + { /* Fourth byte table 47. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 7, 10, 10, 13, 16, + 18, 18, 21, 22, 23, 24, 25, 26, + 28, 29, 30, 31, 32, 32, 33, 35, + 35, 35, 36, 37, 38, 39, 40, 40, + 40, 42, 45, 47, 47, 48, 48, 51, + 51, 52, 52, 54, 58, 59, 60, 60, + 61, 62, 63, 63, 64, 65, 67, 69, + 71, 73, 74, 74, 77, 79, 81, 83, + 85, 85, 85, 85, 85, 85, 85, 85, + 85, 85, 85, 85, 85, 85, 85, 85, + 85, 85, 85, 85, 85, 85, 85, 85, + 85, 85, 85, 85, 85, 85, 85, 85, + 85, 85, 85, 85, 85, 85, 85, 85, + 85, 85, 85, 85, 85, 85, 85, 85, + 85, 85, 85, 85, 85, 85, 85, 85, + 85, 85, 85, 85, 85, 85, 85, 85, + 85, + }, + { /* Fourth byte table 48. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 3, 3, 3, 3, 4, 5, + 6, 7, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 13, 18, 23, 28, + 33, 38, 43, 48, 53, 58, 63, 68, + 72, 73, 75, 78, 80, 81, 83, 86, + 90, 92, 93, 95, 98, 99, 100, 101, + 102, 103, 105, 108, 110, 111, 113, 116, + 120, 122, 123, 125, 128, 129, 130, 131, + 132, 132, 132, 132, 132, 132, 132, 132, + 132, 132, 132, 132, 132, 132, 132, 132, + 132, 132, 132, 132, 132, 132, 132, 132, + 132, 132, 132, 132, 132, 132, 132, 132, + 132, 132, 132, 132, 132, 132, 132, 132, + 132, 132, 132, 132, 132, 132, 132, 132, + 132, 132, 132, 132, 132, 132, 132, 132, + 132, 132, 132, 132, 132, 132, 132, 132, + 132, + }, + { /* Fourth byte table 49. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 6, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, + }, + { /* Fourth byte table 50. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 6, 12, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, + }, + { /* Fourth byte table 51. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 6, 6, 6, + 6, 6, 12, 12, 12, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 24, 24, 30, + 30, 30, 30, 30, 30, 36, 45, 45, + 51, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, + }, + { /* Fourth byte table 52. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 6, 6, 6, 12, 12, 12, + 18, 18, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 28, 28, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 40, 44, + 48, 54, 60, 60, 60, 66, 72, 72, + 72, 78, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, + 84, + }, + { /* Fourth byte table 53. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 6, 12, 12, 12, 18, 24, 24, + 24, 30, 36, 36, 36, 36, 36, 36, + 36, 36, 36, 36, 36, 36, 36, 36, + 36, 36, 36, 36, 36, 36, 36, 36, + 36, 36, 36, 36, 36, 36, 36, 36, + 36, 36, 36, 36, 36, 42, 48, 54, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, + }, + { /* Fourth byte table 54. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 6, 12, 18, 24, 24, 24, 24, + 24, 24, 24, 30, 36, 42, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, + }, + { /* Fourth byte table 55. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 4, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, + }, + { /* Fourth byte table 56. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 11, 13, 15, 17, 19, 21, + 23, 25, 27, 29, 31, 34, 37, 40, + 43, 46, 49, 52, 55, 58, 62, 66, + 70, 70, 70, 70, 70, 70, 70, 70, + 70, 70, 70, 70, 70, 70, 70, 70, + 70, 70, 70, 70, 70, 70, 70, 70, + 70, 70, 70, 70, 70, 70, 70, 70, + 70, 70, 70, 70, 70, 70, 70, 70, + 70, 70, 70, 70, 70, 70, 70, 70, + 70, 70, 70, 70, 70, 70, 70, 70, + 70, 70, 70, 70, 70, 70, 70, 70, + 70, + }, + { /* Fourth byte table 57. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 34, 36, 38, 40, 42, 44, 46, + 48, 50, 53, 56, 59, 62, 65, 68, + 71, 74, 77, 80, 83, 86, 89, 92, + 95, 98, 101, 104, 107, 110, 113, 116, + 119, 122, 125, 128, 131, 134, 137, 140, + 143, 146, 149, 152, 155, 158, 161, 162, + 163, 164, 165, 166, 167, 168, 169, 170, + 171, 171, 171, 171, 171, 171, 171, 171, + 171, 171, 171, 171, 171, 171, 171, 171, + 171, 171, 171, 171, 171, 171, 171, 171, + 171, 171, 171, 171, 171, 171, 171, 171, + 171, 171, 171, 171, 171, 171, 171, 171, + 171, 171, 171, 171, 171, 171, 171, 171, + 171, 171, 171, 171, 171, 171, 171, 171, + 171, 171, 171, 171, 171, 171, 171, 171, + 171, + }, + { /* Fourth byte table 58. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 43, 43, 43, 43, + 43, 43, 43, 43, 43, 43, 43, 43, + 43, 43, 43, 43, 43, 43, 43, 43, + 43, 43, 43, 43, 43, 43, 43, 43, + 43, 43, 43, 43, 43, 43, 43, 43, + 43, 43, 43, 43, 43, 43, 43, 43, + 43, 43, 43, 43, 43, 43, 43, 43, + 43, 43, 43, 43, 43, 43, 43, 43, + 43, 43, 43, 43, 43, 43, 43, 43, + 43, 43, 43, 43, 43, 43, 43, 43, + 43, 43, 43, 43, 43, 43, 43, 43, + 43, + }, + { /* Fourth byte table 59. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, + }, + { /* Fourth byte table 60. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 3, 5, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, + }, + { /* Fourth byte table 61. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, + }, + { /* Fourth byte table 62. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, + }, + { /* Fourth byte table 63. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, + }, + { /* Fourth byte table 64. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, + }, + { /* Fourth byte table 65. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 21, + 24, 27, 30, 33, 36, 39, 42, 45, + 48, 51, 54, 57, 60, 63, 66, 69, + 72, 75, 78, 81, 84, 87, 90, 93, + 96, 99, 102, 105, 108, 111, 114, 117, + 120, 123, 126, 129, 132, 135, 138, 141, + 144, 147, 150, 153, 156, 159, 162, 165, + 168, 171, 174, 177, 180, 183, 186, 189, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, + }, + { /* Fourth byte table 66. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 21, + 24, 27, 30, 33, 36, 39, 42, 45, + 48, 51, 54, 57, 60, 63, 66, 69, + 72, 75, 78, 81, 84, 87, 90, 93, + 96, 99, 102, 105, 108, 111, 114, 117, + 120, 123, 126, 129, 132, 135, 138, 141, + 144, 147, 150, 153, 156, 159, 162, 165, + 168, 171, 174, 177, 180, 183, 186, 189, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, + }, + { /* Fourth byte table 67. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 21, + 24, 27, 30, 33, 36, 39, 42, 45, + 48, 51, 54, 57, 60, 63, 66, 69, + 72, 75, 78, 81, 84, 87, 90, 93, + 96, 99, 102, 105, 108, 111, 114, 117, + 120, 123, 126, 129, 132, 135, 138, 141, + 144, 147, 150, 153, 156, 159, 162, 165, + 168, 171, 174, 177, 180, 183, 186, 189, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, + }, + { /* Fourth byte table 68. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 21, + 24, 27, 30, 33, 36, 39, 42, 45, + 48, 51, 54, 57, 60, 63, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, + }, + { /* Fourth byte table 69. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 4, + 4, 7, 10, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, + }, + { /* Fourth byte table 70. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 7, 7, 14, + 14, 21, 21, 28, 28, 35, 35, 42, + 42, 49, 49, 56, 56, 63, 63, 70, + 70, 77, 77, 84, 84, 84, 91, 91, + 98, 98, 105, 105, 105, 105, 105, 105, + 105, 112, 119, 119, 126, 133, 133, 140, + 147, 147, 154, 161, 161, 168, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, + }, + { /* Fourth byte table 71. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 7, 7, 7, + 7, 7, 7, 7, 11, 15, 15, 22, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 35, 35, 42, + 42, 49, 49, 56, 56, 63, 63, 70, + 70, 77, 77, 84, 84, 91, 91, 98, + 98, 98, 98, 98, 98, 98, 98, 98, + 98, 98, 98, 98, 98, 98, 98, 98, + 98, 98, 98, 98, 98, 98, 98, 98, + 98, 98, 98, 98, 98, 98, 98, 98, + 98, 98, 98, 98, 98, 98, 98, 98, + 98, 98, 98, 98, 98, 98, 98, 98, + 98, 98, 98, 98, 98, 98, 98, 98, + 98, 98, 98, 98, 98, 98, 98, 98, + 98, + }, + { /* Fourth byte table 72. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 7, 7, 14, 14, 14, 21, 21, + 28, 28, 35, 35, 35, 35, 35, 35, + 35, 42, 49, 49, 56, 63, 63, 70, + 77, 77, 84, 91, 91, 98, 105, 105, + 105, 105, 105, 105, 105, 105, 105, 105, + 105, 105, 105, 105, 105, 105, 105, 105, + 105, 105, 105, 105, 105, 112, 112, 112, + 119, 126, 133, 140, 140, 140, 140, 147, + 153, 153, 153, 153, 153, 153, 153, 153, + 153, 153, 153, 153, 153, 153, 153, 153, + 153, 153, 153, 153, 153, 153, 153, 153, + 153, 153, 153, 153, 153, 153, 153, 153, + 153, 153, 153, 153, 153, 153, 153, 153, + 153, 153, 153, 153, 153, 153, 153, 153, + 153, 153, 153, 153, 153, 153, 153, 153, + 153, 153, 153, 153, 153, 153, 153, 153, + 153, + }, + { /* Fourth byte table 73. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 3, 6, 9, 12, 15, 18, + 21, 24, 27, 30, 33, 36, 39, 42, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, + }, + { /* Fourth byte table 74. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 21, + 24, 27, 30, 33, 36, 39, 42, 45, + 48, 51, 54, 57, 60, 63, 66, 69, + 72, 75, 78, 81, 84, 87, 90, 93, + 96, 99, 102, 105, 108, 111, 114, 117, + 120, 123, 126, 129, 132, 135, 138, 141, + 144, 147, 150, 153, 156, 159, 162, 165, + 168, 171, 174, 177, 180, 183, 186, 189, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, + }, + { /* Fourth byte table 75. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 21, + 24, 27, 30, 33, 36, 39, 42, 45, + 45, 45, 45, 48, 51, 54, 57, 60, + 63, 66, 69, 72, 75, 78, 81, 84, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, + }, + { /* Fourth byte table 76. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 5, 10, 15, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, + 20, 23, 25, 27, 29, 31, 33, 35, + 37, 39, 41, 43, 45, 47, 49, 51, + 53, 56, 59, 62, 65, 68, 71, 74, + 77, 80, 83, 86, 89, 92, 95, 101, + 107, 113, 119, 125, 131, 137, 143, 149, + 155, 161, 167, 173, 179, 194, 206, 212, + 212, 212, 212, 212, 212, 212, 212, 212, + 212, 212, 212, 212, 212, 212, 212, 212, + 212, 212, 212, 212, 212, 212, 212, 212, + 212, 212, 212, 212, 212, 212, 212, 212, + 212, 212, 212, 212, 212, 212, 212, 212, + 212, 212, 212, 212, 212, 212, 212, 212, + 212, 212, 212, 212, 212, 212, 212, 212, + 212, 212, 212, 212, 212, 212, 212, 212, + 212, + }, + { /* Fourth byte table 77. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 21, + 24, 27, 30, 33, 36, 39, 42, 45, + 48, 51, 54, 57, 60, 63, 66, 69, + 72, 75, 78, 81, 84, 87, 90, 93, + 96, 99, 102, 105, 108, 111, 114, 117, + 120, 123, 126, 129, 132, 135, 138, 141, + 144, 147, 149, 151, 153, 155, 157, 159, + 161, 163, 165, 167, 169, 171, 173, 175, + 177, 177, 177, 177, 177, 177, 177, 177, + 177, 177, 177, 177, 177, 177, 177, 177, + 177, 177, 177, 177, 177, 177, 177, 177, + 177, 177, 177, 177, 177, 177, 177, 177, + 177, 177, 177, 177, 177, 177, 177, 177, + 177, 177, 177, 177, 177, 177, 177, 177, + 177, 177, 177, 177, 177, 177, 177, 177, + 177, 177, 177, 177, 177, 177, 177, 177, + 177, + }, + { /* Fourth byte table 78. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 41, 46, 51, 53, 56, 58, + 61, 64, 67, 70, 73, 76, 79, 82, + 85, 88, 91, 94, 97, 100, 103, 106, + 109, 112, 115, 118, 121, 124, 127, 130, + 133, 136, 139, 142, 145, 148, 151, 154, + 157, 160, 163, 166, 169, 172, 175, 178, + 181, 184, 187, 190, 193, 196, 199, 202, + 202, 202, 202, 202, 202, 202, 202, 202, + 202, 202, 202, 202, 202, 202, 202, 202, + 202, 202, 202, 202, 202, 202, 202, 202, + 202, 202, 202, 202, 202, 202, 202, 202, + 202, 202, 202, 202, 202, 202, 202, 202, + 202, 202, 202, 202, 202, 202, 202, 202, + 202, 202, 202, 202, 202, 202, 202, 202, + 202, 202, 202, 202, 202, 202, 202, 202, + 202, + }, + { /* Fourth byte table 79. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 4, 7, 9, 11, 13, 15, + 17, 20, 24, 26, 28, 31, 34, 36, + 38, 40, 43, 46, 49, 52, 55, 57, + 59, 61, 63, 65, 68, 70, 72, 74, + 77, 80, 82, 85, 88, 91, 93, 96, + 101, 107, 109, 112, 115, 118, 121, 128, + 136, 138, 140, 143, 145, 147, 149, 152, + 154, 156, 158, 160, 162, 165, 167, 169, + 171, 171, 171, 171, 171, 171, 171, 171, + 171, 171, 171, 171, 171, 171, 171, 171, + 171, 171, 171, 171, 171, 171, 171, 171, + 171, 171, 171, 171, 171, 171, 171, 171, + 171, 171, 171, 171, 171, 171, 171, 171, + 171, 171, 171, 171, 171, 171, 171, 171, + 171, 171, 171, 171, 171, 171, 171, 171, + 171, 171, 171, 171, 171, 171, 171, 171, + 171, + }, + { /* Fourth byte table 80. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 10, 12, 14, 16, 22, + 25, 27, 29, 31, 33, 35, 37, 39, + 41, 43, 45, 48, 50, 52, 55, 58, + 60, 64, 67, 69, 71, 73, 75, 80, + 85, 89, 93, 97, 101, 105, 109, 113, + 117, 121, 126, 131, 136, 141, 146, 151, + 156, 161, 166, 171, 176, 181, 186, 191, + 196, 201, 206, 211, 216, 221, 226, 231, + 234, 234, 234, 234, 234, 234, 234, 234, + 234, 234, 234, 234, 234, 234, 234, 234, + 234, 234, 234, 234, 234, 234, 234, 234, + 234, 234, 234, 234, 234, 234, 234, 234, + 234, 234, 234, 234, 234, 234, 234, 234, + 234, 234, 234, 234, 234, 234, 234, 234, + 234, 234, 234, 234, 234, 234, 234, 234, + 234, 234, 234, 234, 234, 234, 234, 234, + 234, + }, + { /* Fourth byte table 81. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 52, 56, 56, + 56, 60, 60, 64, 64, 64, 68, 72, + 76, 80, 84, 88, 92, 96, 100, 104, + 104, 108, 108, 112, 112, 112, 116, 120, + 120, 120, 120, 124, 128, 132, 136, 136, + 136, 140, 144, 148, 152, 156, 160, 164, + 168, 172, 176, 180, 184, 188, 192, 196, + 200, 200, 200, 200, 200, 200, 200, 200, + 200, 200, 200, 200, 200, 200, 200, 200, + 200, 200, 200, 200, 200, 200, 200, 200, + 200, 200, 200, 200, 200, 200, 200, 200, + 200, 200, 200, 200, 200, 200, 200, 200, + 200, 200, 200, 200, 200, 200, 200, 200, + 200, 200, 200, 200, 200, 200, 200, 200, + 200, 200, 200, 200, 200, 200, 200, 200, + 200, + }, + { /* Fourth byte table 82. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 52, 56, 60, + 64, 68, 72, 76, 80, 84, 88, 92, + 96, 100, 104, 108, 112, 116, 120, 124, + 128, 132, 136, 140, 144, 148, 152, 156, + 160, 164, 168, 172, 172, 172, 172, 172, + 172, 176, 180, 184, 188, 192, 196, 200, + 204, 208, 212, 216, 220, 224, 228, 232, + 236, 236, 236, 236, 236, 236, 236, 236, + 236, 236, 236, 236, 236, 236, 236, 236, + 236, 236, 236, 236, 236, 236, 236, 236, + 236, 236, 236, 236, 236, 236, 236, 236, + 236, 236, 236, 236, 236, 236, 236, 236, + 236, 236, 236, 236, 236, 236, 236, 236, + 236, 236, 236, 236, 236, 236, 236, 236, + 236, 236, 236, 236, 236, 236, 236, 236, + 236, + }, + { /* Fourth byte table 83. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 52, 56, 60, + 65, 70, 75, 79, 83, 87, 92, 97, + 102, 106, 110, 110, 110, 110, 110, 110, + 110, 110, 110, 110, 110, 110, 110, 110, + 110, 110, 110, 110, 110, 110, 110, 110, + 110, 110, 110, 110, 110, 110, 110, 110, + 110, 110, 110, 110, 110, 110, 110, 110, + 110, 110, 110, 110, 110, 110, 110, 110, + 110, 110, 110, 110, 110, 110, 110, 110, + 110, 110, 110, 110, 110, 110, 110, 110, + 110, 110, 110, 110, 110, 110, 110, 110, + 110, 110, 110, 110, 110, 110, 110, 110, + 110, 110, 110, 110, 110, 110, 110, 110, + 110, 110, 110, 110, 110, 110, 110, 110, + 110, 110, 110, 110, 110, 110, 110, 110, + 110, + }, + { /* Fourth byte table 84. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 4, 6, 9, 12, 14, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 20, 24, 28, 32, + 36, 36, 36, 36, 36, 36, 41, 41, + 46, 48, 50, 52, 54, 56, 58, 60, + 62, 64, 65, 70, 75, 82, 89, 94, + 99, 104, 109, 114, 119, 124, 129, 134, + 134, 139, 144, 149, 154, 159, 159, 164, + 164, 164, 164, 164, 164, 164, 164, 164, + 164, 164, 164, 164, 164, 164, 164, 164, + 164, 164, 164, 164, 164, 164, 164, 164, + 164, 164, 164, 164, 164, 164, 164, 164, + 164, 164, 164, 164, 164, 164, 164, 164, + 164, 164, 164, 164, 164, 164, 164, 164, + 164, 164, 164, 164, 164, 164, 164, 164, + 164, 164, 164, 164, 164, 164, 164, 164, + 164, + }, + { /* Fourth byte table 85. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 5, 10, 10, 15, 20, 20, 25, + 30, 35, 40, 45, 50, 55, 60, 65, + 69, 71, 73, 75, 77, 79, 81, 83, + 85, 87, 89, 91, 93, 95, 97, 99, + 101, 103, 105, 107, 109, 111, 113, 115, + 117, 119, 121, 123, 125, 127, 129, 131, + 133, 135, 137, 139, 141, 143, 145, 147, + 149, 151, 153, 155, 157, 159, 161, 163, + 165, 165, 165, 165, 165, 165, 165, 165, + 165, 165, 165, 165, 165, 165, 165, 165, + 165, 165, 165, 165, 165, 165, 165, 165, + 165, 165, 165, 165, 165, 165, 165, 165, + 165, 165, 165, 165, 165, 165, 165, 165, + 165, 165, 165, 165, 165, 165, 165, 165, + 165, 165, 165, 165, 165, 165, 165, 165, + 165, 165, 165, 165, 165, 165, 165, 165, + 165, + }, + { /* Fourth byte table 86. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30, + 32, 34, 36, 38, 40, 42, 44, 46, + 48, 50, 52, 54, 56, 58, 60, 62, + 64, 66, 68, 70, 72, 76, 80, 82, + 84, 86, 88, 90, 92, 94, 96, 98, + 100, 104, 108, 108, 108, 108, 108, 108, + 108, 108, 108, 108, 108, 108, 108, 108, + 108, 108, 108, 108, 108, 108, 108, 108, + 108, 108, 108, 108, 108, 108, 108, 108, + 108, 108, 108, 108, 108, 108, 108, 108, + 108, 108, 108, 108, 108, 108, 108, 108, + 108, 108, 108, 108, 108, 108, 108, 108, + 108, 108, 108, 108, 108, 108, 108, 108, + 108, 108, 108, 108, 108, 108, 108, 108, + 108, 108, 108, 108, 108, 108, 108, 108, + 108, + }, + { /* Fourth byte table 87. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 2, 4, 6, 8, + 10, 12, 14, 16, 18, 20, 24, 26, + 28, 30, 32, 34, 36, 38, 40, 42, + 44, 46, 48, 54, 60, 66, 72, 78, + 84, 90, 96, 102, 108, 114, 120, 126, + 132, 138, 144, 150, 156, 158, 160, 162, + 164, 164, 164, 164, 164, 164, 164, 164, + 164, 164, 164, 164, 164, 164, 164, 164, + 164, 164, 164, 164, 164, 164, 164, 164, + 164, 164, 164, 164, 164, 164, 164, 164, + 164, 164, 164, 164, 164, 164, 164, 164, + 164, 164, 164, 164, 164, 164, 164, 164, + 164, 164, 164, 164, 164, 164, 164, 164, + 164, 164, 164, 164, 164, 164, 164, 164, + 164, + }, + { /* Fourth byte table 88. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 52, 56, 60, + 64, 68, 72, 76, 80, 84, 88, 92, + 96, 100, 104, 108, 112, 116, 120, 124, + 128, 132, 136, 140, 144, 148, 152, 156, + 160, 164, 168, 172, 176, 180, 184, 188, + 192, 196, 200, 204, 208, 212, 216, 220, + 224, 228, 232, 236, 240, 244, 248, 248, + 248, 248, 248, 248, 248, 248, 248, 248, + 248, 248, 248, 248, 248, 248, 248, 248, + 248, 248, 248, 248, 248, 248, 248, 248, + 248, 248, 248, 248, 248, 248, 248, 248, + 248, 248, 248, 248, 248, 248, 248, 248, + 248, 248, 248, 248, 248, 248, 248, 248, + 248, 248, 248, 248, 248, 248, 248, 248, + 248, 248, 248, 248, 248, 248, 248, 248, + 248, + }, + { /* Fourth byte table 89. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 6, 12, 18, 24, 30, 36, 42, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 54, 60, 68, 76, 84, 92, 100, + 108, 116, 122, 155, 170, 178, 178, 178, + 178, 178, 178, 178, 178, 178, 178, 178, + 178, 178, 178, 178, 178, 178, 178, 178, + 178, 178, 178, 178, 178, 178, 178, 178, + 178, 178, 178, 178, 178, 178, 178, 178, + 178, 178, 178, 178, 178, 178, 178, 178, + 178, 178, 178, 178, 178, 178, 178, 178, + 178, 178, 178, 178, 178, 178, 178, 178, + 178, 178, 178, 178, 178, 178, 178, 178, + 178, + }, + { /* Fourth byte table 90. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 4, 7, 8, 9, 10, 11, + 14, 17, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, + 20, 22, 25, 28, 29, 30, 31, 32, + 33, 34, 37, 40, 43, 46, 49, 52, + 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, + 55, + }, + { /* Fourth byte table 91. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 15, 15, + 16, 17, 20, 23, 26, 29, 30, 31, + 32, 33, 36, 37, 37, 38, 39, 40, + 41, 44, 45, 46, 47, 48, 51, 54, + 55, 56, 57, 58, 59, 60, 61, 62, + 62, 63, 64, 65, 66, 66, 66, 66, + 66, 69, 73, 76, 76, 79, 79, 82, + 86, 89, 93, 96, 100, 103, 107, 110, + 114, 114, 114, 114, 114, 114, 114, 114, + 114, 114, 114, 114, 114, 114, 114, 114, + 114, 114, 114, 114, 114, 114, 114, 114, + 114, 114, 114, 114, 114, 114, 114, 114, + 114, 114, 114, 114, 114, 114, 114, 114, + 114, 114, 114, 114, 114, 114, 114, 114, + 114, 114, 114, 114, 114, 114, 114, 114, + 114, 114, 114, 114, 114, 114, 114, 114, + 114, + }, + { /* Fourth byte table 92. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 6, 10, 14, 18, 22, 26, + 30, 34, 38, 42, 46, 50, 52, 54, + 56, 58, 60, 62, 64, 66, 68, 70, + 72, 74, 76, 78, 80, 82, 84, 86, + 88, 90, 92, 94, 96, 98, 100, 102, + 104, 106, 108, 110, 112, 114, 116, 118, + 120, 122, 124, 126, 128, 130, 132, 134, + 136, 138, 140, 142, 144, 146, 148, 150, + 152, 152, 152, 152, 152, 152, 152, 152, + 152, 152, 152, 152, 152, 152, 152, 152, + 152, 152, 152, 152, 152, 152, 152, 152, + 152, 152, 152, 152, 152, 152, 152, 152, + 152, 152, 152, 152, 152, 152, 152, 152, + 152, 152, 152, 152, 152, 152, 152, 152, + 152, 152, 152, 152, 152, 152, 152, 152, + 152, 152, 152, 152, 152, 152, 152, 152, + 152, + }, + { /* Fourth byte table 93. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30, + 32, 34, 36, 38, 40, 42, 44, 46, + 48, 50, 52, 54, 56, 58, 60, 62, + 64, 66, 68, 70, 72, 74, 76, 78, + 80, 82, 84, 86, 88, 90, 92, 94, + 96, 98, 100, 102, 104, 106, 112, 118, + 124, 130, 136, 142, 146, 150, 150, 150, + 150, 150, 150, 150, 150, 150, 150, 150, + 150, 150, 150, 150, 150, 150, 150, 150, + 150, 150, 150, 150, 150, 150, 150, 150, + 150, 150, 150, 150, 150, 150, 150, 150, + 150, 150, 150, 150, 150, 150, 150, 150, + 150, 150, 150, 150, 150, 150, 150, 150, + 150, 150, 150, 150, 150, 150, 150, 150, + 150, 150, 150, 150, 150, 150, 150, 150, + 150, + }, + { /* Fourth byte table 94. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 1, 2, 3, 4, 5, 6, + 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, + 23, 24, 25, 26, 27, 28, 29, 30, + 31, 32, 33, 34, 35, 36, 37, 38, + 39, 40, 41, 42, 43, 44, 45, 46, + 47, 48, 49, 50, 51, 52, 53, 54, + 55, 56, 57, 58, 59, 60, 61, 62, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, + }, + { /* Fourth byte table 95. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 34, 37, 40, 43, 46, 49, 52, 55, + 58, 61, 64, 67, 70, 73, 76, 79, + 82, 85, 88, 91, 94, 97, 100, 103, + 106, 109, 112, 115, 118, 121, 124, 127, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, + }, + { /* Fourth byte table 96. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 21, + 24, 27, 30, 33, 36, 39, 42, 45, + 48, 51, 54, 57, 60, 63, 66, 69, + 72, 75, 78, 81, 84, 87, 90, 93, + 96, 99, 102, 105, 108, 111, 114, 117, + 120, 123, 126, 129, 132, 135, 138, 141, + 144, 147, 150, 153, 156, 159, 162, 165, + 168, 171, 174, 177, 180, 183, 186, 189, + 189, 189, 189, 189, 189, 189, 189, 189, + 189, 189, 189, 189, 189, 189, 189, 189, + 189, 189, 189, 189, 189, 189, 189, 189, + 189, 189, 189, 189, 189, 189, 189, 189, + 189, 189, 189, 189, 189, 189, 189, 189, + 189, 189, 189, 189, 189, 189, 189, 189, + 189, 189, 189, 189, 189, 189, 189, 189, + 189, 189, 189, 189, 189, 189, 189, 189, + 189, + }, + { /* Fourth byte table 97. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 3, 6, 9, 12, 15, + 18, 18, 18, 21, 24, 27, 30, 33, + 36, 36, 36, 39, 42, 45, 48, 51, + 54, 54, 54, 57, 60, 63, 63, 63, + 63, 65, 67, 69, 72, 74, 76, 79, + 79, 82, 85, 88, 91, 94, 97, 100, + 100, 100, 100, 100, 100, 100, 100, 100, + 100, 100, 100, 100, 100, 100, 100, 100, + 100, 100, 100, 100, 100, 100, 100, 100, + 100, 100, 100, 100, 100, 100, 100, 100, + 100, 100, 100, 100, 100, 100, 100, 100, + 100, 100, 100, 100, 100, 100, 100, 100, + 100, 100, 100, 100, 100, 100, 100, 100, + 100, 100, 100, 100, 100, 100, 100, 100, + 100, 100, 100, 100, 100, 100, 100, 100, + 100, 100, 100, 100, 100, 100, 100, 100, + 100, + }, + { /* Fourth byte table 98. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 9, + 18, 31, 44, 57, 70, 83, 83, 83, + 83, 83, 83, 83, 83, 83, 83, 83, + 83, 83, 83, 83, 83, 83, 83, 83, + 83, 83, 83, 83, 83, 83, 83, 83, + 83, 83, 83, 83, 83, 83, 83, 83, + 83, 83, 83, 83, 83, 83, 83, 83, + 83, 83, 83, 83, 83, 83, 83, 83, + 83, 83, 83, 83, 83, 83, 83, 83, + 83, 83, 83, 83, 83, 83, 83, 83, + 83, 83, 83, 83, 83, 83, 83, 83, + 83, 83, 83, 83, 83, 83, 83, 83, + 83, 83, 83, 83, 83, 83, 83, 83, + 83, + }, + { /* Fourth byte table 99. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 9, 18, 31, 44, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, + }, + { /* Fourth byte table 100. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, + }, + { /* Fourth byte table 101. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, + }, + { /* Fourth byte table 102. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 21, 22, + 23, 24, 25, 26, 27, 28, 29, 30, + 31, 32, 33, 34, 35, 36, 37, 38, + 39, 40, 41, 42, 43, 44, 45, 46, + 47, 48, 49, 50, 51, 52, 53, 54, + 55, 56, 57, 58, 59, 60, 61, 62, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, + }, + { /* Fourth byte table 103. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 29, 30, + 31, 31, 31, 32, 32, 32, 33, 34, + 34, 34, 35, 36, 37, 38, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 50, 51, 51, 52, 53, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, + }, + { /* Fourth byte table 104. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 4, 5, 6, + 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, + 23, 24, 25, 26, 27, 28, 29, 30, + 31, 32, 33, 34, 35, 36, 37, 38, + 39, 40, 41, 42, 43, 44, 45, 46, + 47, 48, 49, 50, 51, 52, 53, 54, + 55, 56, 57, 58, 59, 60, 61, 62, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, + }, + { /* Fourth byte table 105. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 6, + 7, 8, 9, 10, 10, 10, 11, 12, + 13, 14, 15, 16, 17, 18, 18, 19, + 20, 21, 22, 23, 24, 25, 25, 26, + 27, 28, 29, 30, 31, 32, 33, 34, + 35, 36, 37, 38, 39, 40, 41, 42, + 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 53, 54, 55, 56, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, + }, + { /* Fourth byte table 106. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 5, 6, + 6, 6, 6, 7, 8, 9, 10, 11, + 12, 13, 13, 14, 15, 16, 17, 18, + 19, 20, 21, 22, 23, 24, 25, 26, + 27, 28, 29, 30, 31, 32, 33, 34, + 35, 36, 37, 38, 39, 40, 41, 42, + 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, + 59, 59, 59, 59, 59, 59, 59, 59, + 59, 59, 59, 59, 59, 59, 59, 59, + 59, 59, 59, 59, 59, 59, 59, 59, + 59, 59, 59, 59, 59, 59, 59, 59, + 59, 59, 59, 59, 59, 59, 59, 59, + 59, 59, 59, 59, 59, 59, 59, 59, + 59, 59, 59, 59, 59, 59, 59, 59, + 59, 59, 59, 59, 59, 59, 59, 59, + 59, + }, + { /* Fourth byte table 107. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, + }, + { /* Fourth byte table 108. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, + }, + { /* Fourth byte table 109. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, + }, + { /* Fourth byte table 110. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, + }, + { /* Fourth byte table 111. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 38, 40, 40, + 40, 42, 44, 46, 48, 50, 52, 54, + 56, 58, 60, 62, 64, 66, 68, 70, + 72, 74, 76, 78, 80, 82, 84, 86, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, + }, + { /* Fourth byte table 112. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 5, 7, 9, 11, 13, 15, + 17, 19, 21, 23, 25, 27, 29, 31, + 33, 35, 37, 39, 41, 43, 45, 47, + 49, 51, 53, 55, 58, 60, 62, 64, + 66, 68, 70, 72, 74, 76, 78, 80, + 82, 84, 86, 88, 90, 92, 94, 96, + 98, 100, 102, 104, 106, 108, 110, 112, + 114, 116, 118, 120, 123, 125, 127, 129, + 131, 131, 131, 131, 131, 131, 131, 131, + 131, 131, 131, 131, 131, 131, 131, 131, + 131, 131, 131, 131, 131, 131, 131, 131, + 131, 131, 131, 131, 131, 131, 131, 131, + 131, 131, 131, 131, 131, 131, 131, 131, + 131, 131, 131, 131, 131, 131, 131, 131, + 131, 131, 131, 131, 131, 131, 131, 131, + 131, 131, 131, 131, 131, 131, 131, 131, + 131, + }, + { /* Fourth byte table 113. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30, + 32, 34, 36, 38, 40, 42, 45, 47, + 49, 51, 53, 55, 57, 59, 61, 63, + 65, 67, 69, 71, 73, 75, 77, 79, + 81, 83, 85, 87, 89, 91, 93, 95, + 97, 99, 101, 103, 105, 107, 110, 112, + 114, 116, 118, 120, 122, 124, 126, 128, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, + }, + { /* Fourth byte table 114. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30, + 33, 35, 37, 39, 41, 43, 45, 47, + 49, 51, 53, 55, 57, 59, 61, 63, + 65, 67, 69, 71, 73, 75, 77, 79, + 81, 83, 85, 87, 89, 91, 93, 95, + 98, 100, 102, 104, 106, 108, 110, 112, + 114, 116, 118, 120, 122, 124, 126, 128, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, + }, + { /* Fourth byte table 115. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 21, 23, 25, 27, 29, 31, + 33, 35, 37, 39, 41, 43, 45, 47, + 49, 51, 53, 55, 57, 59, 61, 63, + 65, 67, 69, 71, 73, 75, 77, 79, + 81, 83, 86, 88, 90, 92, 94, 96, + 98, 100, 102, 104, 106, 108, 110, 112, + 114, 116, 118, 120, 122, 124, 126, 128, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, + }, + { /* Fourth byte table 116. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 4, 6, 9, 11, 13, 15, + 17, 19, 21, 23, 25, 25, 25, 26, + 27, 28, 29, 30, 31, 32, 33, 34, + 35, 36, 37, 38, 39, 40, 41, 42, + 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, + 59, 60, 61, 62, 63, 64, 65, 66, + 67, 68, 69, 70, 71, 72, 73, 74, + 75, 75, 75, 75, 75, 75, 75, 75, + 75, 75, 75, 75, 75, 75, 75, 75, + 75, 75, 75, 75, 75, 75, 75, 75, + 75, 75, 75, 75, 75, 75, 75, 75, + 75, 75, 75, 75, 75, 75, 75, 75, + 75, 75, 75, 75, 75, 75, 75, 75, + 75, 75, 75, 75, 75, 75, 75, 75, + 75, 75, 75, 75, 75, 75, 75, 75, + 75, + }, + { /* Fourth byte table 117. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 9, 13, 17, 21, 25, 29, + 33, 37, 42, 46, 50, 54, 58, 62, + 66, 71, 75, 80, 85, 90, 94, 98, + 102, 106, 110, 114, 118, 122, 127, 127, + 127, 127, 127, 127, 127, 127, 127, 127, + 127, 127, 127, 127, 127, 127, 127, 127, + 127, 127, 127, 127, 127, 127, 127, 127, + 127, 127, 127, 127, 127, 127, 127, 127, + 127, 127, 127, 127, 127, 127, 127, 127, + 127, 127, 127, 127, 127, 127, 127, 127, + 127, 127, 127, 127, 127, 127, 127, 127, + 127, 127, 127, 127, 127, 127, 127, 127, + 127, 127, 127, 127, 127, 127, 127, 127, + 127, 127, 127, 127, 127, 127, 127, 127, + 127, 127, 127, 127, 127, 127, 127, 127, + 127, 127, 127, 127, 127, 127, 127, 127, + 127, + }, + }, +}; + +static const uint16_t u8_decomp_b4_16bit_tbl[2][30][257] = { + { + { /* Fourth byte 16-bit table 0. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 38, 44, 48, 52, 56, 60, 64, + 68, 72, 76, 80, 84, 90, 96, 102, + 108, 112, 116, 120, 124, 130, 136, 140, + 144, 148, 152, 156, 160, 164, 168, 172, + 176, 180, 184, 188, 192, 196, 200, 206, + 212, 216, 220, 224, 228, 232, 236, 240, + 244, 250, 256, 260, 264, 268, 272, 276, + 280, 280, 280, 280, 280, 280, 280, 280, + 280, 280, 280, 280, 280, 280, 280, 280, + 280, 280, 280, 280, 280, 280, 280, 280, + 280, 280, 280, 280, 280, 280, 280, 280, + 280, 280, 280, 280, 280, 280, 280, 280, + 280, 280, 280, 280, 280, 280, 280, 280, + 280, 280, 280, 280, 280, 280, 280, 280, + 280, 280, 280, 280, 280, 280, 280, 280, + 280, + }, + { /* Fourth byte 16-bit table 1. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 54, 60, 66, + 72, 78, 84, 90, 96, 100, 104, 108, + 112, 116, 120, 124, 128, 134, 140, 144, + 148, 152, 156, 160, 164, 170, 176, 182, + 188, 194, 200, 204, 208, 212, 216, 220, + 224, 228, 232, 236, 240, 244, 248, 252, + 256, 262, 268, 274, 280, 284, 288, 292, + 296, 296, 296, 296, 296, 296, 296, 296, + 296, 296, 296, 296, 296, 296, 296, 296, + 296, 296, 296, 296, 296, 296, 296, 296, + 296, 296, 296, 296, 296, 296, 296, 296, + 296, 296, 296, 296, 296, 296, 296, 296, + 296, 296, 296, 296, 296, 296, 296, 296, + 296, 296, 296, 296, 296, 296, 296, 296, + 296, 296, 296, 296, 296, 296, 296, 296, + 296, + }, + { /* Fourth byte 16-bit table 2. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 52, 56, 60, + 64, 68, 72, 76, 80, 84, 88, 92, + 96, 100, 104, 107, 116, 116, 116, 116, + 116, 120, 124, 128, 132, 138, 144, 150, + 156, 162, 168, 174, 180, 186, 192, 198, + 204, 210, 216, 222, 228, 234, 240, 246, + 252, 256, 260, 264, 268, 272, 276, 282, + 288, 288, 288, 288, 288, 288, 288, 288, + 288, 288, 288, 288, 288, 288, 288, 288, + 288, 288, 288, 288, 288, 288, 288, 288, + 288, 288, 288, 288, 288, 288, 288, 288, + 288, 288, 288, 288, 288, 288, 288, 288, + 288, 288, 288, 288, 288, 288, 288, 288, + 288, 288, 288, 288, 288, 288, 288, 288, + 288, 288, 288, 288, 288, 288, 288, 288, + 288, + }, + { /* Fourth byte 16-bit table 3. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 6, 12, 18, 24, 30, 36, 42, + 48, 52, 56, 60, 64, 68, 72, 76, + 80, 86, 92, 98, 104, 110, 116, 122, + 128, 134, 140, 146, 152, 158, 164, 170, + 176, 182, 188, 194, 200, 204, 208, 212, + 216, 222, 228, 234, 240, 246, 252, 258, + 264, 270, 276, 280, 284, 288, 292, 296, + 300, 304, 308, 308, 308, 308, 308, 308, + 308, 308, 308, 308, 308, 308, 308, 308, + 308, 308, 308, 308, 308, 308, 308, 308, + 308, 308, 308, 308, 308, 308, 308, 308, + 308, 308, 308, 308, 308, 308, 308, 308, + 308, 308, 308, 308, 308, 308, 308, 308, + 308, 308, 308, 308, 308, 308, 308, 308, + 308, 308, 308, 308, 308, 308, 308, 308, + 308, 308, 308, 308, 308, 308, 308, 308, + 308, + }, + { /* Fourth byte 16-bit table 4. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 5, 10, 17, 24, 31, 38, 45, + 52, 57, 62, 69, 76, 83, 90, 97, + 104, 109, 114, 121, 128, 135, 142, 142, + 142, 147, 152, 159, 166, 173, 180, 180, + 180, 185, 190, 197, 204, 211, 218, 225, + 232, 237, 242, 249, 256, 263, 270, 277, + 284, 289, 294, 301, 308, 315, 322, 329, + 336, 341, 346, 353, 360, 367, 374, 381, + 388, 388, 388, 388, 388, 388, 388, 388, + 388, 388, 388, 388, 388, 388, 388, 388, + 388, 388, 388, 388, 388, 388, 388, 388, + 388, 388, 388, 388, 388, 388, 388, 388, + 388, 388, 388, 388, 388, 388, 388, 388, + 388, 388, 388, 388, 388, 388, 388, 388, + 388, 388, 388, 388, 388, 388, 388, 388, + 388, 388, 388, 388, 388, 388, 388, 388, + 388, + }, + { /* Fourth byte 16-bit table 5. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 5, 10, 17, 24, 31, 38, 38, + 38, 43, 48, 55, 62, 69, 76, 76, + 76, 81, 86, 93, 100, 107, 114, 121, + 128, 128, 133, 133, 140, 140, 147, 147, + 154, 159, 164, 171, 178, 185, 192, 199, + 206, 211, 216, 223, 230, 237, 244, 251, + 258, 263, 268, 273, 278, 283, 288, 293, + 298, 303, 308, 313, 318, 323, 328, 328, + 328, 328, 328, 328, 328, 328, 328, 328, + 328, 328, 328, 328, 328, 328, 328, 328, + 328, 328, 328, 328, 328, 328, 328, 328, + 328, 328, 328, 328, 328, 328, 328, 328, + 328, 328, 328, 328, 328, 328, 328, 328, + 328, 328, 328, 328, 328, 328, 328, 328, + 328, 328, 328, 328, 328, 328, 328, 328, + 328, 328, 328, 328, 328, 328, 328, 328, + 328, + }, + { /* Fourth byte 16-bit table 6. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 7, 14, 23, 32, 41, 50, 59, + 68, 75, 82, 91, 100, 109, 118, 127, + 136, 143, 150, 159, 168, 177, 186, 195, + 204, 211, 218, 227, 236, 245, 254, 263, + 272, 279, 286, 295, 304, 313, 322, 331, + 340, 347, 354, 363, 372, 381, 390, 399, + 408, 413, 418, 425, 430, 437, 437, 442, + 449, 454, 459, 464, 469, 474, 477, 480, + 483, 483, 483, 483, 483, 483, 483, 483, + 483, 483, 483, 483, 483, 483, 483, 483, + 483, 483, 483, 483, 483, 483, 483, 483, + 483, 483, 483, 483, 483, 483, 483, 483, + 483, 483, 483, 483, 483, 483, 483, 483, + 483, 483, 483, 483, 483, 483, 483, 483, + 483, 483, 483, 483, 483, 483, 483, 483, + 483, 483, 483, 483, 483, 483, 483, 483, + 483, + }, + { /* Fourth byte 16-bit table 7. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 14, 21, 26, 33, 33, 38, + 45, 50, 55, 60, 65, 70, 82, 94, + 106, 111, 116, 123, 130, 130, 130, 135, + 142, 147, 152, 157, 162, 162, 174, 186, + 198, 203, 208, 215, 222, 227, 232, 237, + 244, 249, 254, 259, 264, 269, 280, 291, + 293, 293, 293, 300, 305, 312, 312, 317, + 324, 329, 334, 339, 344, 349, 356, 359, + 359, 359, 359, 359, 359, 359, 359, 359, + 359, 359, 359, 359, 359, 359, 359, 359, + 359, 359, 359, 359, 359, 359, 359, 359, + 359, 359, 359, 359, 359, 359, 359, 359, + 359, 359, 359, 359, 359, 359, 359, 359, + 359, 359, 359, 359, 359, 359, 359, 359, + 359, 359, 359, 359, 359, 359, 359, 359, + 359, 359, 359, 359, 359, 359, 359, 359, + 359, + }, + { /* Fourth byte 16-bit table 8. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 5, 10, 15, 20, 25, 30, 35, + 40, 45, 50, 55, 60, 65, 70, 78, + 86, 94, 102, 110, 118, 126, 134, 142, + 150, 158, 166, 174, 182, 190, 190, 190, + 190, 195, 200, 205, 210, 215, 220, 225, + 230, 235, 240, 245, 250, 255, 260, 265, + 270, 275, 280, 285, 290, 295, 300, 305, + 310, 315, 320, 325, 330, 335, 340, 345, + 350, 350, 350, 350, 350, 350, 350, 350, + 350, 350, 350, 350, 350, 350, 350, 350, + 350, 350, 350, 350, 350, 350, 350, 350, + 350, 350, 350, 350, 350, 350, 350, 350, + 350, 350, 350, 350, 350, 350, 350, 350, + 350, 350, 350, 350, 350, 350, 350, 350, + 350, 350, 350, 350, 350, 350, 350, 350, + 350, 350, 350, 350, 350, 350, 350, 350, + 350, + }, + { /* Fourth byte 16-bit table 9. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 15, 27, 42, 51, 66, 75, 84, + 102, 114, 123, 132, 141, 153, 165, 177, + 189, 201, 213, 225, 243, 249, 267, 285, + 300, 312, 330, 348, 360, 369, 378, 390, + 402, 417, 432, 441, 450, 462, 471, 480, + 486, 492, 501, 510, 528, 540, 555, 573, + 585, 594, 603, 621, 633, 651, 660, 675, + 684, 696, 705, 717, 732, 744, 759, 771, + 777, 777, 777, 777, 777, 777, 777, 777, + 777, 777, 777, 777, 777, 777, 777, 777, + 777, 777, 777, 777, 777, 777, 777, 777, + 777, 777, 777, 777, 777, 777, 777, 777, + 777, 777, 777, 777, 777, 777, 777, 777, + 777, 777, 777, 777, 777, 777, 777, 777, + 777, 777, 777, 777, 777, 777, 777, 777, + 777, 777, 777, 777, 777, 777, 777, 777, + 777, + }, + { /* Fourth byte 16-bit table 10. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 15, 24, 33, 45, 54, 63, 72, + 87, 99, 105, 123, 132, 147, 159, 171, + 180, 189, 201, 207, 219, 234, 240, 258, + 267, 271, 275, 279, 283, 287, 291, 295, + 299, 303, 307, 312, 317, 322, 327, 332, + 337, 342, 347, 352, 357, 362, 367, 372, + 377, 382, 385, 387, 389, 392, 394, 396, + 396, 396, 396, 396, 402, 408, 414, 420, + 432, 432, 432, 432, 432, 432, 432, 432, + 432, 432, 432, 432, 432, 432, 432, 432, + 432, 432, 432, 432, 432, 432, 432, 432, + 432, 432, 432, 432, 432, 432, 432, 432, + 432, 432, 432, 432, 432, 432, 432, 432, + 432, 432, 432, 432, 432, 432, 432, 432, + 432, 432, 432, 432, 432, 432, 432, 432, + 432, 432, 432, 432, 432, 432, 432, 432, + 432, + }, + { /* Fourth byte 16-bit table 11. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 52, 56, 60, + 64, 68, 72, 76, 80, 84, 88, 92, + 96, 100, 104, 108, 112, 116, 120, 124, + 128, 132, 136, 140, 144, 148, 152, 156, + 160, 164, 168, 172, 176, 180, 184, 188, + 192, 196, 200, 204, 208, 212, 216, 220, + 224, 228, 232, 236, 240, 244, 248, 252, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, + }, + { /* Fourth byte 16-bit table 12. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 52, 56, 60, + 64, 68, 72, 76, 80, 84, 88, 92, + 96, 100, 104, 108, 112, 116, 120, 124, + 128, 132, 136, 140, 144, 148, 152, 156, + 160, 164, 168, 172, 176, 180, 184, 188, + 192, 196, 200, 204, 208, 212, 216, 220, + 224, 228, 232, 236, 240, 244, 248, 252, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, + }, + { /* Fourth byte 16-bit table 13. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 52, 56, 60, + 64, 68, 72, 76, 80, 84, 88, 92, + 96, 100, 104, 108, 112, 116, 120, 124, + 128, 132, 136, 140, 144, 148, 152, 156, + 160, 164, 168, 172, 176, 180, 184, 188, + 192, 196, 200, 204, 208, 212, 216, 220, + 224, 228, 232, 236, 240, 244, 248, 252, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, + }, + { /* Fourth byte 16-bit table 14. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 52, 56, 60, + 64, 68, 72, 76, 80, 84, 88, 92, + 96, 100, 104, 108, 112, 116, 120, 124, + 128, 132, 136, 140, 144, 148, 152, 156, + 160, 164, 168, 172, 176, 180, 184, 188, + 192, 196, 200, 204, 208, 212, 216, 220, + 224, 228, 232, 236, 240, 244, 248, 252, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, + }, + { /* Fourth byte 16-bit table 15. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 6, 12, 18, 24, 30, 34, 38, + 42, 46, 50, 54, 58, 62, 66, 70, + 74, 78, 82, 86, 90, 94, 98, 102, + 106, 110, 114, 118, 122, 126, 130, 134, + 138, 142, 146, 150, 154, 158, 162, 166, + 170, 174, 178, 182, 186, 190, 194, 198, + 202, 206, 210, 214, 218, 222, 226, 230, + 234, 238, 242, 246, 250, 254, 258, 262, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, + }, + { /* Fourth byte 16-bit table 16. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 52, 56, 60, + 64, 68, 72, 76, 80, 84, 88, 92, + 96, 100, 104, 108, 112, 116, 120, 125, + 130, 135, 140, 145, 150, 156, 162, 168, + 174, 180, 186, 190, 194, 198, 202, 206, + 210, 214, 218, 222, 226, 230, 234, 238, + 242, 246, 250, 254, 258, 262, 266, 270, + 274, 274, 274, 274, 274, 274, 274, 274, + 274, 274, 274, 274, 274, 274, 274, 274, + 274, 274, 274, 274, 274, 274, 274, 274, + 274, 274, 274, 274, 274, 274, 274, 274, + 274, 274, 274, 274, 274, 274, 274, 274, + 274, 274, 274, 274, 274, 274, 274, 274, + 274, 274, 274, 274, 274, 274, 274, 274, + 274, 274, 274, 274, 274, 274, 274, 274, + 274, + }, + { /* Fourth byte 16-bit table 17. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 52, 56, 60, + 64, 68, 72, 76, 80, 84, 88, 92, + 98, 104, 110, 116, 122, 126, 130, 134, + 138, 142, 146, 150, 154, 158, 162, 166, + 170, 174, 178, 182, 186, 190, 194, 198, + 202, 206, 210, 214, 218, 222, 226, 230, + 234, 238, 242, 246, 250, 254, 258, 262, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, + }, + { /* Fourth byte 16-bit table 18. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 52, 56, 60, + 64, 68, 72, 76, 80, 84, 88, 92, + 96, 100, 104, 108, 112, 116, 120, 124, + 130, 136, 140, 144, 148, 152, 156, 160, + 164, 168, 172, 176, 180, 184, 188, 192, + 196, 200, 204, 210, 216, 222, 226, 230, + 234, 238, 242, 246, 250, 254, 258, 262, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, + }, + { /* Fourth byte 16-bit table 19. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 6, 12, 18, 24, 30, 36, 42, + 48, 54, 60, 66, 72, 78, 84, 90, + 96, 102, 108, 114, 120, 126, 132, 138, + 144, 150, 156, 162, 168, 174, 180, 186, + 192, 198, 204, 210, 216, 222, 228, 234, + 240, 246, 252, 258, 264, 270, 276, 282, + 288, 288, 288, 288, 288, 288, 288, 288, + 288, 288, 288, 288, 288, 288, 288, 288, + 288, 288, 288, 288, 288, 288, 288, 288, + 288, 288, 288, 288, 288, 288, 288, 288, + 288, 288, 288, 288, 288, 288, 288, 288, + 288, 288, 288, 288, 288, 288, 288, 288, + 288, 288, 288, 288, 288, 288, 288, 288, + 288, 288, 288, 288, 288, 288, 288, 288, + 288, + }, + { /* Fourth byte 16-bit table 20. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 6, 12, 18, 24, 30, 36, 42, + 48, 54, 60, 66, 72, 78, 84, 90, + 96, 96, 96, 102, 108, 114, 120, 126, + 132, 138, 144, 150, 156, 162, 168, 174, + 180, 186, 192, 198, 204, 210, 216, 222, + 228, 234, 240, 246, 252, 258, 264, 270, + 276, 282, 288, 294, 300, 306, 312, 318, + 324, 330, 336, 342, 348, 354, 360, 366, + 372, 372, 372, 372, 372, 372, 372, 372, + 372, 372, 372, 372, 372, 372, 372, 372, + 372, 372, 372, 372, 372, 372, 372, 372, + 372, 372, 372, 372, 372, 372, 372, 372, + 372, 372, 372, 372, 372, 372, 372, 372, + 372, 372, 372, 372, 372, 372, 372, 372, + 372, 372, 372, 372, 372, 372, 372, 372, + 372, 372, 372, 372, 372, 372, 372, 372, + 372, + }, + { /* Fourth byte 16-bit table 21. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 17, 21, 25, 29, + 33, 37, 41, 45, 49, 53, 58, 62, + 66, 70, 74, 79, 83, 87, 91, 96, + 100, 104, 108, 112, 116, 121, 125, 129, + 133, 137, 141, 145, 149, 153, 157, 161, + 165, 169, 173, 177, 181, 185, 189, 193, + 197, 201, 205, 209, 213, 218, 222, 226, + 230, 235, 239, 243, 247, 251, 255, 259, + 263, 263, 263, 263, 263, 263, 263, 263, + 263, 263, 263, 263, 263, 263, 263, 263, + 263, 263, 263, 263, 263, 263, 263, 263, + 263, 263, 263, 263, 263, 263, 263, 263, + 263, 263, 263, 263, 263, 263, 263, 263, + 263, 263, 263, 263, 263, 263, 263, 263, + 263, 263, 263, 263, 263, 263, 263, 263, + 263, 263, 263, 263, 263, 263, 263, 263, + 263, + }, + { /* Fourth byte 16-bit table 22. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 52, 56, 60, + 64, 68, 72, 76, 80, 84, 88, 92, + 96, 100, 105, 109, 113, 117, 121, 125, + 129, 134, 139, 143, 147, 151, 155, 159, + 163, 167, 171, 175, 179, 184, 188, 192, + 196, 200, 205, 209, 213, 217, 221, 225, + 229, 233, 237, 241, 246, 250, 255, 259, + 263, 263, 263, 263, 263, 263, 263, 263, + 263, 263, 263, 263, 263, 263, 263, 263, + 263, 263, 263, 263, 263, 263, 263, 263, + 263, 263, 263, 263, 263, 263, 263, 263, + 263, 263, 263, 263, 263, 263, 263, 263, + 263, 263, 263, 263, 263, 263, 263, 263, + 263, 263, 263, 263, 263, 263, 263, 263, + 263, 263, 263, 263, 263, 263, 263, 263, + 263, + }, + { /* Fourth byte 16-bit table 23. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 41, 45, 49, 53, 57, 61, + 66, 70, 75, 80, 84, 88, 92, 96, + 101, 106, 110, 114, 118, 122, 126, 130, + 134, 138, 142, 146, 150, 155, 159, 163, + 167, 171, 175, 179, 183, 187, 191, 195, + 199, 203, 207, 211, 215, 219, 223, 227, + 231, 236, 240, 244, 248, 252, 256, 261, + 265, 265, 265, 265, 265, 265, 265, 265, + 265, 265, 265, 265, 265, 265, 265, 265, + 265, 265, 265, 265, 265, 265, 265, 265, + 265, 265, 265, 265, 265, 265, 265, 265, + 265, 265, 265, 265, 265, 265, 265, 265, + 265, 265, 265, 265, 265, 265, 265, 265, + 265, 265, 265, 265, 265, 265, 265, 265, + 265, 265, 265, 265, 265, 265, 265, 265, + 265, + }, + { /* Fourth byte 16-bit table 24. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 45, 49, 53, 57, 61, + 65, 69, 73, 77, 81, 85, 89, 93, + 97, 101, 105, 109, 113, 117, 122, 126, + 130, 134, 138, 142, 147, 151, 155, 159, + 163, 167, 171, 175, 179, 184, 188, 192, + 196, 201, 205, 209, 213, 217, 221, 225, + 230, 235, 240, 244, 249, 253, 257, 261, + 265, 265, 265, 265, 265, 265, 265, 265, + 265, 265, 265, 265, 265, 265, 265, 265, + 265, 265, 265, 265, 265, 265, 265, 265, + 265, 265, 265, 265, 265, 265, 265, 265, + 265, 265, 265, 265, 265, 265, 265, 265, + 265, 265, 265, 265, 265, 265, 265, 265, + 265, 265, 265, 265, 265, 265, 265, 265, + 265, 265, 265, 265, 265, 265, 265, 265, + 265, + }, + { /* Fourth byte 16-bit table 25. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 29, + 33, 37, 41, 45, 49, 53, 58, 62, + 66, 71, 76, 80, 84, 88, 92, 96, + 100, 104, 108, 112, 117, 121, 126, 130, + 135, 139, 143, 147, 152, 156, 160, 165, + 170, 174, 178, 182, 186, 190, 194, 198, + 202, 206, 210, 214, 218, 222, 227, 231, + 236, 240, 245, 249, 254, 259, 264, 268, + 272, 272, 272, 272, 272, 272, 272, 272, + 272, 272, 272, 272, 272, 272, 272, 272, + 272, 272, 272, 272, 272, 272, 272, 272, + 272, 272, 272, 272, 272, 272, 272, 272, + 272, 272, 272, 272, 272, 272, 272, 272, + 272, 272, 272, 272, 272, 272, 272, 272, + 272, 272, 272, 272, 272, 272, 272, 272, + 272, 272, 272, 272, 272, 272, 272, 272, + 272, + }, + { /* Fourth byte 16-bit table 26. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 9, 14, 19, 24, 28, 32, + 36, 40, 44, 48, 52, 56, 61, 65, + 69, 73, 77, 82, 86, 91, 96, 100, + 104, 108, 112, 116, 120, 125, 130, 135, + 139, 143, 148, 152, 156, 160, 165, 169, + 173, 177, 181, 185, 190, 194, 198, 202, + 206, 210, 214, 219, 224, 228, 233, 237, + 242, 246, 250, 254, 259, 264, 268, 273, + 277, 277, 277, 277, 277, 277, 277, 277, + 277, 277, 277, 277, 277, 277, 277, 277, + 277, 277, 277, 277, 277, 277, 277, 277, + 277, 277, 277, 277, 277, 277, 277, 277, + 277, 277, 277, 277, 277, 277, 277, 277, + 277, 277, 277, 277, 277, 277, 277, 277, + 277, 277, 277, 277, 277, 277, 277, 277, + 277, 277, 277, 277, 277, 277, 277, 277, + 277, + }, + { /* Fourth byte 16-bit table 27. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 5, 9, 13, 17, 21, 25, 29, + 34, 39, 44, 49, 53, 57, 61, 65, + 69, 73, 77, 81, 85, 89, 93, 97, + 102, 106, 110, 114, 118, 122, 126, 130, + 134, 138, 142, 146, 150, 155, 160, 165, + 169, 173, 177, 181, 186, 190, 195, 199, + 203, 208, 213, 217, 221, 225, 229, 233, + 237, 241, 245, 249, 253, 257, 261, 265, + 269, 269, 269, 269, 269, 269, 269, 269, + 269, 269, 269, 269, 269, 269, 269, 269, + 269, 269, 269, 269, 269, 269, 269, 269, + 269, 269, 269, 269, 269, 269, 269, 269, + 269, 269, 269, 269, 269, 269, 269, 269, + 269, 269, 269, 269, 269, 269, 269, 269, + 269, 269, 269, 269, 269, 269, 269, 269, + 269, 269, 269, 269, 269, 269, 269, 269, + 269, + }, + { /* Fourth byte 16-bit table 28. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 25, 29, + 33, 37, 41, 45, 50, 55, 59, 63, + 67, 71, 75, 79, 84, 88, 92, 96, + 100, 105, 110, 114, 118, 122, 127, 131, + 135, 140, 145, 149, 153, 157, 162, 166, + 170, 174, 178, 182, 186, 190, 195, 199, + 203, 207, 212, 216, 220, 224, 228, 233, + 238, 242, 246, 250, 255, 259, 264, 268, + 272, 272, 272, 272, 272, 272, 272, 272, + 272, 272, 272, 272, 272, 272, 272, 272, + 272, 272, 272, 272, 272, 272, 272, 272, + 272, 272, 272, 272, 272, 272, 272, 272, + 272, 272, 272, 272, 272, 272, 272, 272, + 272, 272, 272, 272, 272, 272, 272, 272, + 272, 272, 272, 272, 272, 272, 272, 272, + 272, 272, 272, 272, 272, 272, 272, 272, + 272, + }, + { /* Fourth byte 16-bit table 29. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, + }, + }, + { + { /* Fourth byte 16-bit table 0. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 38, 44, 48, 52, 56, 60, 64, + 68, 72, 76, 80, 84, 90, 96, 102, + 108, 112, 116, 120, 124, 130, 136, 140, + 144, 148, 152, 156, 160, 164, 168, 172, + 176, 180, 184, 188, 192, 196, 200, 206, + 212, 216, 220, 224, 228, 232, 236, 240, + 244, 250, 256, 260, 264, 268, 272, 276, + 280, 280, 280, 280, 280, 280, 280, 280, + 280, 280, 280, 280, 280, 280, 280, 280, + 280, 280, 280, 280, 280, 280, 280, 280, + 280, 280, 280, 280, 280, 280, 280, 280, + 280, 280, 280, 280, 280, 280, 280, 280, + 280, 280, 280, 280, 280, 280, 280, 280, + 280, 280, 280, 280, 280, 280, 280, 280, + 280, 280, 280, 280, 280, 280, 280, 280, + 280, + }, + { /* Fourth byte 16-bit table 1. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 54, 60, 66, + 72, 78, 84, 90, 96, 100, 104, 108, + 112, 116, 120, 124, 128, 134, 140, 144, + 148, 152, 156, 160, 164, 170, 176, 182, + 188, 194, 200, 204, 208, 212, 216, 220, + 224, 228, 232, 236, 240, 244, 248, 252, + 256, 262, 268, 274, 280, 284, 288, 292, + 296, 296, 296, 296, 296, 296, 296, 296, + 296, 296, 296, 296, 296, 296, 296, 296, + 296, 296, 296, 296, 296, 296, 296, 296, + 296, 296, 296, 296, 296, 296, 296, 296, + 296, 296, 296, 296, 296, 296, 296, 296, + 296, 296, 296, 296, 296, 296, 296, 296, + 296, 296, 296, 296, 296, 296, 296, 296, + 296, 296, 296, 296, 296, 296, 296, 296, + 296, + }, + { /* Fourth byte 16-bit table 2. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 52, 56, 60, + 64, 68, 72, 76, 80, 84, 88, 92, + 96, 100, 104, 107, 116, 116, 116, 116, + 116, 120, 124, 128, 132, 138, 144, 150, + 156, 162, 168, 174, 180, 186, 192, 198, + 204, 210, 216, 222, 228, 234, 240, 246, + 252, 256, 260, 264, 268, 272, 276, 282, + 288, 288, 288, 288, 288, 288, 288, 288, + 288, 288, 288, 288, 288, 288, 288, 288, + 288, 288, 288, 288, 288, 288, 288, 288, + 288, 288, 288, 288, 288, 288, 288, 288, + 288, 288, 288, 288, 288, 288, 288, 288, + 288, 288, 288, 288, 288, 288, 288, 288, + 288, 288, 288, 288, 288, 288, 288, 288, + 288, 288, 288, 288, 288, 288, 288, 288, + 288, + }, + { /* Fourth byte 16-bit table 3. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 6, 12, 18, 24, 30, 36, 42, + 48, 52, 56, 60, 64, 68, 72, 76, + 80, 86, 92, 98, 104, 110, 116, 122, + 128, 134, 140, 146, 152, 158, 164, 170, + 176, 182, 188, 194, 200, 204, 208, 212, + 216, 222, 228, 234, 240, 246, 252, 258, + 264, 270, 276, 280, 284, 288, 292, 296, + 300, 304, 308, 308, 308, 308, 308, 308, + 308, 308, 308, 308, 308, 308, 308, 308, + 308, 308, 308, 308, 308, 308, 308, 308, + 308, 308, 308, 308, 308, 308, 308, 308, + 308, 308, 308, 308, 308, 308, 308, 308, + 308, 308, 308, 308, 308, 308, 308, 308, + 308, 308, 308, 308, 308, 308, 308, 308, + 308, 308, 308, 308, 308, 308, 308, 308, + 308, 308, 308, 308, 308, 308, 308, 308, + 308, + }, + { /* Fourth byte 16-bit table 4. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 5, 10, 17, 24, 31, 38, 45, + 52, 57, 62, 69, 76, 83, 90, 97, + 104, 109, 114, 121, 128, 135, 142, 142, + 142, 147, 152, 159, 166, 173, 180, 180, + 180, 185, 190, 197, 204, 211, 218, 225, + 232, 237, 242, 249, 256, 263, 270, 277, + 284, 289, 294, 301, 308, 315, 322, 329, + 336, 341, 346, 353, 360, 367, 374, 381, + 388, 388, 388, 388, 388, 388, 388, 388, + 388, 388, 388, 388, 388, 388, 388, 388, + 388, 388, 388, 388, 388, 388, 388, 388, + 388, 388, 388, 388, 388, 388, 388, 388, + 388, 388, 388, 388, 388, 388, 388, 388, + 388, 388, 388, 388, 388, 388, 388, 388, + 388, 388, 388, 388, 388, 388, 388, 388, + 388, 388, 388, 388, 388, 388, 388, 388, + 388, + }, + { /* Fourth byte 16-bit table 5. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 5, 10, 17, 24, 31, 38, 38, + 38, 43, 48, 55, 62, 69, 76, 76, + 76, 81, 86, 93, 100, 107, 114, 121, + 128, 128, 133, 133, 140, 140, 147, 147, + 154, 159, 164, 171, 178, 185, 192, 199, + 206, 211, 216, 223, 230, 237, 244, 251, + 258, 263, 268, 273, 278, 283, 288, 293, + 298, 303, 308, 313, 318, 323, 328, 328, + 328, 328, 328, 328, 328, 328, 328, 328, + 328, 328, 328, 328, 328, 328, 328, 328, + 328, 328, 328, 328, 328, 328, 328, 328, + 328, 328, 328, 328, 328, 328, 328, 328, + 328, 328, 328, 328, 328, 328, 328, 328, + 328, 328, 328, 328, 328, 328, 328, 328, + 328, 328, 328, 328, 328, 328, 328, 328, + 328, 328, 328, 328, 328, 328, 328, 328, + 328, + }, + { /* Fourth byte 16-bit table 6. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 7, 14, 23, 32, 41, 50, 59, + 68, 75, 82, 91, 100, 109, 118, 127, + 136, 143, 150, 159, 168, 177, 186, 195, + 204, 211, 218, 227, 236, 245, 254, 263, + 272, 279, 286, 295, 304, 313, 322, 331, + 340, 347, 354, 363, 372, 381, 390, 399, + 408, 413, 418, 425, 430, 437, 437, 442, + 449, 454, 459, 464, 469, 474, 477, 480, + 483, 483, 483, 483, 483, 483, 483, 483, + 483, 483, 483, 483, 483, 483, 483, 483, + 483, 483, 483, 483, 483, 483, 483, 483, + 483, 483, 483, 483, 483, 483, 483, 483, + 483, 483, 483, 483, 483, 483, 483, 483, + 483, 483, 483, 483, 483, 483, 483, 483, + 483, 483, 483, 483, 483, 483, 483, 483, + 483, 483, 483, 483, 483, 483, 483, 483, + 483, + }, + { /* Fourth byte 16-bit table 7. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 14, 21, 26, 33, 33, 38, + 45, 50, 55, 60, 65, 70, 82, 94, + 106, 111, 116, 123, 130, 130, 130, 135, + 142, 147, 152, 157, 162, 162, 174, 186, + 198, 203, 208, 215, 222, 227, 232, 237, + 244, 249, 254, 259, 264, 269, 280, 291, + 293, 293, 293, 300, 305, 312, 312, 317, + 324, 329, 334, 339, 344, 349, 356, 359, + 359, 359, 359, 359, 359, 359, 359, 359, + 359, 359, 359, 359, 359, 359, 359, 359, + 359, 359, 359, 359, 359, 359, 359, 359, + 359, 359, 359, 359, 359, 359, 359, 359, + 359, 359, 359, 359, 359, 359, 359, 359, + 359, 359, 359, 359, 359, 359, 359, 359, + 359, 359, 359, 359, 359, 359, 359, 359, + 359, 359, 359, 359, 359, 359, 359, 359, + 359, + }, + { /* Fourth byte 16-bit table 8. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 5, 10, 15, 20, 25, 30, 35, + 40, 45, 50, 55, 60, 65, 70, 78, + 86, 94, 102, 110, 118, 126, 134, 142, + 150, 158, 166, 174, 182, 190, 207, 221, + 221, 226, 231, 236, 241, 246, 251, 256, + 261, 266, 271, 276, 281, 286, 291, 296, + 301, 306, 311, 316, 321, 326, 331, 336, + 341, 346, 351, 356, 361, 366, 371, 376, + 381, 381, 381, 381, 381, 381, 381, 381, + 381, 381, 381, 381, 381, 381, 381, 381, + 381, 381, 381, 381, 381, 381, 381, 381, + 381, 381, 381, 381, 381, 381, 381, 381, + 381, 381, 381, 381, 381, 381, 381, 381, + 381, 381, 381, 381, 381, 381, 381, 381, + 381, 381, 381, 381, 381, 381, 381, 381, + 381, 381, 381, 381, 381, 381, 381, 381, + 381, + }, + { /* Fourth byte 16-bit table 9. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 15, 27, 42, 51, 66, 75, 84, + 102, 114, 123, 132, 141, 153, 165, 177, + 189, 201, 213, 225, 243, 249, 267, 285, + 300, 312, 330, 348, 360, 369, 378, 390, + 402, 417, 432, 441, 450, 462, 471, 480, + 486, 492, 501, 510, 528, 540, 555, 573, + 585, 594, 603, 621, 633, 651, 660, 675, + 684, 696, 705, 717, 732, 744, 759, 771, + 777, 777, 777, 777, 777, 777, 777, 777, + 777, 777, 777, 777, 777, 777, 777, 777, + 777, 777, 777, 777, 777, 777, 777, 777, + 777, 777, 777, 777, 777, 777, 777, 777, + 777, 777, 777, 777, 777, 777, 777, 777, + 777, 777, 777, 777, 777, 777, 777, 777, + 777, 777, 777, 777, 777, 777, 777, 777, + 777, 777, 777, 777, 777, 777, 777, 777, + 777, + }, + { /* Fourth byte 16-bit table 10. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 15, 24, 33, 45, 54, 63, 72, + 87, 99, 105, 123, 132, 147, 159, 171, + 180, 189, 201, 207, 219, 234, 240, 258, + 267, 271, 275, 279, 283, 287, 291, 295, + 299, 303, 307, 312, 317, 322, 327, 332, + 337, 342, 347, 352, 357, 362, 367, 372, + 377, 382, 385, 387, 389, 392, 394, 396, + 398, 401, 404, 406, 412, 418, 424, 430, + 442, 442, 442, 442, 442, 442, 442, 442, + 442, 442, 442, 442, 442, 442, 442, 442, + 442, 442, 442, 442, 442, 442, 442, 442, + 442, 442, 442, 442, 442, 442, 442, 442, + 442, 442, 442, 442, 442, 442, 442, 442, + 442, 442, 442, 442, 442, 442, 442, 442, + 442, 442, 442, 442, 442, 442, 442, 442, + 442, 442, 442, 442, 442, 442, 442, 442, + 442, + }, + { /* Fourth byte 16-bit table 11. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 52, 56, 60, + 64, 68, 72, 76, 80, 84, 88, 92, + 96, 100, 104, 108, 112, 116, 120, 124, + 128, 132, 136, 140, 144, 148, 152, 156, + 160, 164, 168, 172, 176, 180, 184, 188, + 192, 196, 200, 204, 208, 212, 216, 220, + 224, 228, 232, 236, 240, 244, 248, 252, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, + }, + { /* Fourth byte 16-bit table 12. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 52, 56, 60, + 64, 68, 72, 76, 80, 84, 88, 92, + 96, 100, 104, 108, 112, 116, 120, 124, + 128, 132, 136, 140, 144, 148, 152, 156, + 160, 164, 168, 172, 176, 180, 184, 188, + 192, 196, 200, 204, 208, 212, 216, 220, + 224, 228, 232, 236, 240, 244, 248, 252, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, + }, + { /* Fourth byte 16-bit table 13. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 52, 56, 60, + 64, 68, 72, 76, 80, 84, 88, 92, + 96, 100, 104, 108, 112, 116, 120, 124, + 128, 132, 136, 140, 144, 148, 152, 156, + 160, 164, 168, 172, 176, 180, 184, 188, + 192, 196, 200, 204, 208, 212, 216, 220, + 224, 228, 232, 236, 240, 244, 248, 252, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, + }, + { /* Fourth byte 16-bit table 14. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 52, 56, 60, + 64, 68, 72, 76, 80, 84, 88, 92, + 96, 100, 104, 108, 112, 116, 120, 124, + 128, 132, 136, 140, 144, 148, 152, 156, + 160, 164, 168, 172, 176, 180, 184, 188, + 192, 196, 200, 204, 208, 212, 216, 220, + 224, 228, 232, 236, 240, 244, 248, 252, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, + }, + { /* Fourth byte 16-bit table 15. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 52, 56, 60, + 64, 68, 72, 76, 80, 84, 88, 92, + 96, 100, 104, 108, 112, 116, 120, 124, + 128, 132, 136, 140, 144, 148, 152, 156, + 160, 164, 168, 172, 176, 180, 184, 188, + 192, 196, 200, 204, 208, 212, 216, 220, + 224, 228, 232, 236, 240, 244, 248, 252, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, + }, + { /* Fourth byte 16-bit table 16. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 6, 12, 18, 24, 30, 34, 38, + 42, 46, 50, 54, 58, 62, 66, 70, + 74, 78, 82, 86, 90, 94, 98, 102, + 106, 110, 114, 118, 122, 126, 130, 134, + 138, 142, 146, 150, 154, 158, 162, 166, + 170, 174, 178, 182, 186, 190, 194, 198, + 202, 206, 210, 214, 218, 222, 226, 230, + 234, 238, 242, 246, 250, 254, 258, 262, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, + }, + { /* Fourth byte 16-bit table 17. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 52, 56, 60, + 64, 68, 72, 76, 80, 84, 88, 92, + 96, 100, 104, 108, 112, 116, 120, 125, + 130, 135, 140, 145, 150, 156, 162, 168, + 174, 180, 186, 190, 194, 198, 202, 206, + 210, 214, 218, 222, 226, 230, 234, 238, + 242, 246, 250, 254, 258, 262, 266, 270, + 274, 274, 274, 274, 274, 274, 274, 274, + 274, 274, 274, 274, 274, 274, 274, 274, + 274, 274, 274, 274, 274, 274, 274, 274, + 274, 274, 274, 274, 274, 274, 274, 274, + 274, 274, 274, 274, 274, 274, 274, 274, + 274, 274, 274, 274, 274, 274, 274, 274, + 274, 274, 274, 274, 274, 274, 274, 274, + 274, 274, 274, 274, 274, 274, 274, 274, + 274, + }, + { /* Fourth byte 16-bit table 18. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 52, 56, 60, + 64, 68, 72, 76, 80, 84, 88, 92, + 98, 104, 110, 116, 122, 126, 130, 134, + 138, 142, 146, 150, 154, 158, 162, 166, + 170, 174, 178, 182, 186, 190, 194, 198, + 202, 206, 210, 214, 218, 222, 226, 230, + 234, 238, 242, 246, 250, 254, 258, 262, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, + }, + { /* Fourth byte 16-bit table 19. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 52, 56, 60, + 64, 68, 72, 76, 80, 84, 88, 92, + 96, 100, 104, 108, 112, 116, 120, 124, + 130, 136, 140, 144, 148, 152, 156, 160, + 164, 168, 172, 176, 180, 184, 188, 192, + 196, 200, 204, 210, 216, 222, 226, 230, + 234, 238, 242, 246, 250, 254, 258, 262, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, + }, + { /* Fourth byte 16-bit table 20. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 6, 12, 18, 24, 30, 36, 42, + 48, 54, 60, 66, 72, 78, 84, 90, + 96, 102, 108, 114, 120, 126, 132, 138, + 144, 150, 156, 162, 168, 174, 180, 186, + 192, 198, 204, 210, 216, 222, 228, 234, + 240, 246, 252, 258, 264, 270, 276, 282, + 288, 288, 288, 288, 288, 288, 288, 288, + 288, 288, 288, 288, 288, 288, 288, 288, + 288, 288, 288, 288, 288, 288, 288, 288, + 288, 288, 288, 288, 288, 288, 288, 288, + 288, 288, 288, 288, 288, 288, 288, 288, + 288, 288, 288, 288, 288, 288, 288, 288, + 288, 288, 288, 288, 288, 288, 288, 288, + 288, 288, 288, 288, 288, 288, 288, 288, + 288, + }, + { /* Fourth byte 16-bit table 21. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 6, 12, 18, 24, 30, 36, 42, + 48, 54, 60, 66, 72, 78, 84, 90, + 96, 96, 96, 102, 108, 114, 120, 126, + 132, 138, 144, 150, 156, 162, 168, 174, + 180, 186, 192, 198, 204, 210, 216, 222, + 228, 234, 240, 246, 252, 258, 264, 270, + 276, 282, 288, 294, 300, 306, 312, 318, + 324, 330, 336, 342, 348, 354, 360, 366, + 372, 372, 372, 372, 372, 372, 372, 372, + 372, 372, 372, 372, 372, 372, 372, 372, + 372, 372, 372, 372, 372, 372, 372, 372, + 372, 372, 372, 372, 372, 372, 372, 372, + 372, 372, 372, 372, 372, 372, 372, 372, + 372, 372, 372, 372, 372, 372, 372, 372, + 372, 372, 372, 372, 372, 372, 372, 372, + 372, 372, 372, 372, 372, 372, 372, 372, + 372, + }, + { /* Fourth byte 16-bit table 22. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 17, 21, 25, 29, + 33, 37, 41, 45, 49, 53, 58, 62, + 66, 70, 74, 79, 83, 87, 91, 96, + 100, 104, 108, 112, 116, 121, 125, 129, + 133, 137, 141, 145, 149, 153, 157, 161, + 165, 169, 173, 177, 181, 185, 189, 193, + 197, 201, 205, 209, 213, 218, 222, 226, + 230, 235, 239, 243, 247, 251, 255, 259, + 263, 263, 263, 263, 263, 263, 263, 263, + 263, 263, 263, 263, 263, 263, 263, 263, + 263, 263, 263, 263, 263, 263, 263, 263, + 263, 263, 263, 263, 263, 263, 263, 263, + 263, 263, 263, 263, 263, 263, 263, 263, + 263, 263, 263, 263, 263, 263, 263, 263, + 263, 263, 263, 263, 263, 263, 263, 263, + 263, 263, 263, 263, 263, 263, 263, 263, + 263, + }, + { /* Fourth byte 16-bit table 23. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 52, 56, 60, + 64, 68, 72, 76, 80, 84, 88, 92, + 96, 100, 105, 109, 113, 117, 121, 125, + 129, 134, 139, 143, 147, 151, 155, 159, + 163, 167, 171, 175, 179, 184, 188, 192, + 196, 200, 205, 209, 213, 217, 221, 225, + 229, 233, 237, 241, 246, 250, 255, 259, + 263, 263, 263, 263, 263, 263, 263, 263, + 263, 263, 263, 263, 263, 263, 263, 263, + 263, 263, 263, 263, 263, 263, 263, 263, + 263, 263, 263, 263, 263, 263, 263, 263, + 263, 263, 263, 263, 263, 263, 263, 263, + 263, 263, 263, 263, 263, 263, 263, 263, + 263, 263, 263, 263, 263, 263, 263, 263, + 263, 263, 263, 263, 263, 263, 263, 263, + 263, + }, + { /* Fourth byte 16-bit table 24. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 41, 45, 49, 53, 57, 61, + 66, 70, 75, 80, 84, 88, 92, 96, + 101, 106, 110, 114, 118, 122, 126, 130, + 134, 138, 142, 146, 150, 155, 159, 163, + 167, 171, 175, 179, 183, 187, 191, 195, + 199, 203, 207, 211, 215, 219, 223, 227, + 231, 236, 240, 244, 248, 252, 256, 261, + 265, 265, 265, 265, 265, 265, 265, 265, + 265, 265, 265, 265, 265, 265, 265, 265, + 265, 265, 265, 265, 265, 265, 265, 265, + 265, 265, 265, 265, 265, 265, 265, 265, + 265, 265, 265, 265, 265, 265, 265, 265, + 265, 265, 265, 265, 265, 265, 265, 265, + 265, 265, 265, 265, 265, 265, 265, 265, + 265, 265, 265, 265, 265, 265, 265, 265, + 265, + }, + { /* Fourth byte 16-bit table 25. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 45, 49, 53, 57, 61, + 65, 69, 73, 77, 81, 85, 89, 93, + 97, 101, 105, 109, 113, 117, 122, 126, + 130, 134, 138, 142, 147, 151, 155, 159, + 163, 167, 171, 175, 179, 184, 188, 192, + 196, 201, 205, 209, 213, 217, 221, 225, + 230, 235, 240, 244, 249, 253, 257, 261, + 265, 265, 265, 265, 265, 265, 265, 265, + 265, 265, 265, 265, 265, 265, 265, 265, + 265, 265, 265, 265, 265, 265, 265, 265, + 265, 265, 265, 265, 265, 265, 265, 265, + 265, 265, 265, 265, 265, 265, 265, 265, + 265, 265, 265, 265, 265, 265, 265, 265, + 265, 265, 265, 265, 265, 265, 265, 265, + 265, 265, 265, 265, 265, 265, 265, 265, + 265, + }, + { /* Fourth byte 16-bit table 26. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 29, + 33, 37, 41, 45, 49, 53, 58, 62, + 66, 71, 76, 80, 84, 88, 92, 96, + 100, 104, 108, 112, 117, 121, 126, 130, + 135, 139, 143, 147, 152, 156, 160, 165, + 170, 174, 178, 182, 186, 190, 194, 198, + 202, 206, 210, 214, 218, 222, 227, 231, + 236, 240, 245, 249, 254, 259, 264, 268, + 272, 272, 272, 272, 272, 272, 272, 272, + 272, 272, 272, 272, 272, 272, 272, 272, + 272, 272, 272, 272, 272, 272, 272, 272, + 272, 272, 272, 272, 272, 272, 272, 272, + 272, 272, 272, 272, 272, 272, 272, 272, + 272, 272, 272, 272, 272, 272, 272, 272, + 272, 272, 272, 272, 272, 272, 272, 272, + 272, 272, 272, 272, 272, 272, 272, 272, + 272, + }, + { /* Fourth byte 16-bit table 27. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 9, 14, 19, 24, 28, 32, + 36, 40, 44, 48, 52, 56, 61, 65, + 69, 73, 77, 82, 86, 91, 96, 100, + 104, 108, 112, 116, 120, 125, 130, 135, + 139, 143, 148, 152, 156, 160, 165, 169, + 173, 177, 181, 185, 190, 194, 198, 202, + 206, 210, 214, 219, 224, 228, 233, 237, + 242, 246, 250, 254, 259, 264, 268, 273, + 277, 277, 277, 277, 277, 277, 277, 277, + 277, 277, 277, 277, 277, 277, 277, 277, + 277, 277, 277, 277, 277, 277, 277, 277, + 277, 277, 277, 277, 277, 277, 277, 277, + 277, 277, 277, 277, 277, 277, 277, 277, + 277, 277, 277, 277, 277, 277, 277, 277, + 277, 277, 277, 277, 277, 277, 277, 277, + 277, 277, 277, 277, 277, 277, 277, 277, + 277, + }, + { /* Fourth byte 16-bit table 28. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 5, 9, 13, 17, 21, 25, 29, + 34, 39, 44, 49, 53, 57, 61, 65, + 69, 73, 77, 81, 85, 89, 93, 97, + 102, 106, 110, 114, 118, 122, 126, 130, + 134, 138, 142, 146, 150, 155, 160, 165, + 169, 173, 177, 181, 186, 190, 195, 199, + 203, 208, 213, 217, 221, 225, 229, 233, + 237, 241, 245, 249, 253, 257, 261, 265, + 269, 269, 269, 269, 269, 269, 269, 269, + 269, 269, 269, 269, 269, 269, 269, 269, + 269, 269, 269, 269, 269, 269, 269, 269, + 269, 269, 269, 269, 269, 269, 269, 269, + 269, 269, 269, 269, 269, 269, 269, 269, + 269, 269, 269, 269, 269, 269, 269, 269, + 269, 269, 269, 269, 269, 269, 269, 269, + 269, 269, 269, 269, 269, 269, 269, 269, + 269, + }, + { /* Fourth byte 16-bit table 29. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 25, 29, + 33, 37, 41, 45, 50, 55, 59, 63, + 67, 71, 75, 79, 84, 88, 92, 96, + 100, 105, 110, 114, 118, 122, 127, 131, + 135, 140, 145, 149, 153, 157, 162, 166, + 170, 174, 178, 182, 186, 190, 195, 199, + 203, 207, 212, 216, 220, 224, 228, 233, + 238, 242, 246, 250, 255, 259, 264, 268, + 272, 272, 272, 272, 272, 272, 272, 272, + 272, 272, 272, 272, 272, 272, 272, 272, + 272, 272, 272, 272, 272, 272, 272, 272, + 272, 272, 272, 272, 272, 272, 272, 272, + 272, 272, 272, 272, 272, 272, 272, 272, + 272, 272, 272, 272, 272, 272, 272, 272, + 272, 272, 272, 272, 272, 272, 272, 272, + 272, 272, 272, 272, 272, 272, 272, 272, + 272, + }, + }, +}; + +static const uchar_t u8_decomp_final_tbl[2][19370] = { + { + 0x20, 0x20, 0xCC, 0x88, 0x61, 0x20, 0xCC, 0x84, + 0x32, 0x33, 0x20, 0xCC, 0x81, 0xCE, 0xBC, 0x20, + 0xCC, 0xA7, 0x31, 0x6F, 0x31, 0xE2, 0x81, 0x84, + 0x34, 0x31, 0xE2, 0x81, 0x84, 0x32, 0x33, 0xE2, + 0x81, 0x84, 0x34, 0xF6, 0x41, 0xCC, 0x80, 0xF6, + 0x41, 0xCC, 0x81, 0xF6, 0x41, 0xCC, 0x82, 0xF6, + 0x41, 0xCC, 0x83, 0xF6, 0x41, 0xCC, 0x88, 0xF6, + 0x41, 0xCC, 0x8A, 0xF6, 0x43, 0xCC, 0xA7, 0xF6, + 0x45, 0xCC, 0x80, 0xF6, 0x45, 0xCC, 0x81, 0xF6, + 0x45, 0xCC, 0x82, 0xF6, 0x45, 0xCC, 0x88, 0xF6, + 0x49, 0xCC, 0x80, 0xF6, 0x49, 0xCC, 0x81, 0xF6, + 0x49, 0xCC, 0x82, 0xF6, 0x49, 0xCC, 0x88, 0xF6, + 0x4E, 0xCC, 0x83, 0xF6, 0x4F, 0xCC, 0x80, 0xF6, + 0x4F, 0xCC, 0x81, 0xF6, 0x4F, 0xCC, 0x82, 0xF6, + 0x4F, 0xCC, 0x83, 0xF6, 0x4F, 0xCC, 0x88, 0xF6, + 0x55, 0xCC, 0x80, 0xF6, 0x55, 0xCC, 0x81, 0xF6, + 0x55, 0xCC, 0x82, 0xF6, 0x55, 0xCC, 0x88, 0xF6, + 0x59, 0xCC, 0x81, 0xF6, 0x61, 0xCC, 0x80, 0xF6, + 0x61, 0xCC, 0x81, 0xF6, 0x61, 0xCC, 0x82, 0xF6, + 0x61, 0xCC, 0x83, 0xF6, 0x61, 0xCC, 0x88, 0xF6, + 0x61, 0xCC, 0x8A, 0xF6, 0x63, 0xCC, 0xA7, 0xF6, + 0x65, 0xCC, 0x80, 0xF6, 0x65, 0xCC, 0x81, 0xF6, + 0x65, 0xCC, 0x82, 0xF6, 0x65, 0xCC, 0x88, 0xF6, + 0x69, 0xCC, 0x80, 0xF6, 0x69, 0xCC, 0x81, 0xF6, + 0x69, 0xCC, 0x82, 0xF6, 0x69, 0xCC, 0x88, 0xF6, + 0x6E, 0xCC, 0x83, 0xF6, 0x6F, 0xCC, 0x80, 
0xF6, + 0x6F, 0xCC, 0x81, 0xF6, 0x6F, 0xCC, 0x82, 0xF6, + 0x6F, 0xCC, 0x83, 0xF6, 0x6F, 0xCC, 0x88, 0xF6, + 0x75, 0xCC, 0x80, 0xF6, 0x75, 0xCC, 0x81, 0xF6, + 0x75, 0xCC, 0x82, 0xF6, 0x75, 0xCC, 0x88, 0xF6, + 0x79, 0xCC, 0x81, 0xF6, 0x79, 0xCC, 0x88, 0xF6, + 0x41, 0xCC, 0x84, 0xF6, 0x61, 0xCC, 0x84, 0xF6, + 0x41, 0xCC, 0x86, 0xF6, 0x61, 0xCC, 0x86, 0xF6, + 0x41, 0xCC, 0xA8, 0xF6, 0x61, 0xCC, 0xA8, 0xF6, + 0x43, 0xCC, 0x81, 0xF6, 0x63, 0xCC, 0x81, 0xF6, + 0x43, 0xCC, 0x82, 0xF6, 0x63, 0xCC, 0x82, 0xF6, + 0x43, 0xCC, 0x87, 0xF6, 0x63, 0xCC, 0x87, 0xF6, + 0x43, 0xCC, 0x8C, 0xF6, 0x63, 0xCC, 0x8C, 0xF6, + 0x44, 0xCC, 0x8C, 0xF6, 0x64, 0xCC, 0x8C, 0xF6, + 0x45, 0xCC, 0x84, 0xF6, 0x65, 0xCC, 0x84, 0xF6, + 0x45, 0xCC, 0x86, 0xF6, 0x65, 0xCC, 0x86, 0xF6, + 0x45, 0xCC, 0x87, 0xF6, 0x65, 0xCC, 0x87, 0xF6, + 0x45, 0xCC, 0xA8, 0xF6, 0x65, 0xCC, 0xA8, 0xF6, + 0x45, 0xCC, 0x8C, 0xF6, 0x65, 0xCC, 0x8C, 0xF6, + 0x47, 0xCC, 0x82, 0xF6, 0x67, 0xCC, 0x82, 0xF6, + 0x47, 0xCC, 0x86, 0xF6, 0x67, 0xCC, 0x86, 0xF6, + 0x47, 0xCC, 0x87, 0xF6, 0x67, 0xCC, 0x87, 0xF6, + 0x47, 0xCC, 0xA7, 0xF6, 0x67, 0xCC, 0xA7, 0xF6, + 0x48, 0xCC, 0x82, 0xF6, 0x68, 0xCC, 0x82, 0xF6, + 0x49, 0xCC, 0x83, 0xF6, 0x69, 0xCC, 0x83, 0xF6, + 0x49, 0xCC, 0x84, 0xF6, 0x69, 0xCC, 0x84, 0xF6, + 0x49, 0xCC, 0x86, 0xF6, 0x69, 0xCC, 0x86, 0xF6, + 0x49, 0xCC, 0xA8, 0xF6, 0x69, 0xCC, 0xA8, 0xF6, + 0x49, 0xCC, 0x87, 0x49, 0x4A, 0x69, 0x6A, 0xF6, + 0x4A, 0xCC, 0x82, 0xF6, 0x6A, 0xCC, 0x82, 0xF6, + 0x4B, 0xCC, 0xA7, 0xF6, 0x6B, 0xCC, 0xA7, 0xF6, + 0x4C, 0xCC, 0x81, 0xF6, 0x6C, 0xCC, 0x81, 0xF6, + 0x4C, 0xCC, 0xA7, 0xF6, 0x6C, 0xCC, 0xA7, 0xF6, + 0x4C, 0xCC, 0x8C, 0xF6, 0x6C, 0xCC, 0x8C, 0x4C, + 0xC2, 0xB7, 0x6C, 0xC2, 0xB7, 0xF6, 0x4E, 0xCC, + 0x81, 0xF6, 0x6E, 0xCC, 0x81, 0xF6, 0x4E, 0xCC, + 0xA7, 0xF6, 0x6E, 0xCC, 0xA7, 0xF6, 0x4E, 0xCC, + 0x8C, 0xF6, 0x6E, 0xCC, 0x8C, 0xCA, 0xBC, 0x6E, + 0xF6, 0x4F, 0xCC, 0x84, 0xF6, 0x6F, 0xCC, 0x84, + 0xF6, 0x4F, 0xCC, 0x86, 0xF6, 0x6F, 0xCC, 0x86, + 0xF6, 0x4F, 0xCC, 0x8B, 0xF6, 0x6F, 0xCC, 0x8B, + 0xF6, 0x52, 0xCC, 0x81, 0xF6, 0x72, 0xCC, 0x81, + 0xF6, 0x52, 0xCC, 0xA7, 0xF6, 0x72, 0xCC, 0xA7, + 0xF6, 0x52, 0xCC, 0x8C, 0xF6, 0x72, 0xCC, 0x8C, + 0xF6, 0x53, 0xCC, 0x81, 0xF6, 0x73, 0xCC, 0x81, + 0xF6, 0x53, 0xCC, 0x82, 0xF6, 0x73, 0xCC, 0x82, + 0xF6, 0x53, 0xCC, 0xA7, 0xF6, 0x73, 0xCC, 0xA7, + 0xF6, 0x53, 0xCC, 0x8C, 0xF6, 0x73, 0xCC, 0x8C, + 0xF6, 0x54, 0xCC, 0xA7, 0xF6, 0x74, 0xCC, 0xA7, + 0xF6, 0x54, 0xCC, 0x8C, 0xF6, 0x74, 0xCC, 0x8C, + 0xF6, 0x55, 0xCC, 0x83, 0xF6, 0x75, 0xCC, 0x83, + 0xF6, 0x55, 0xCC, 0x84, 0xF6, 0x75, 0xCC, 0x84, + 0xF6, 0x55, 0xCC, 0x86, 0xF6, 0x75, 0xCC, 0x86, + 0xF6, 0x55, 0xCC, 0x8A, 0xF6, 0x75, 0xCC, 0x8A, + 0xF6, 0x55, 0xCC, 0x8B, 0xF6, 0x75, 0xCC, 0x8B, + 0xF6, 0x55, 0xCC, 0xA8, 0xF6, 0x75, 0xCC, 0xA8, + 0xF6, 0x57, 0xCC, 0x82, 0xF6, 0x77, 0xCC, 0x82, + 0xF6, 0x59, 0xCC, 0x82, 0xF6, 0x79, 0xCC, 0x82, + 0xF6, 0x59, 0xCC, 0x88, 0xF6, 0x5A, 0xCC, 0x81, + 0xF6, 0x7A, 0xCC, 0x81, 0xF6, 0x5A, 0xCC, 0x87, + 0xF6, 0x7A, 0xCC, 0x87, 0xF6, 0x5A, 0xCC, 0x8C, + 0xF6, 0x7A, 0xCC, 0x8C, 0x73, 0xF6, 0x4F, 0xCC, + 0x9B, 0xF6, 0x6F, 0xCC, 0x9B, 0xF6, 0x55, 0xCC, + 0x9B, 0xF6, 0x75, 0xCC, 0x9B, 0x44, 0x5A, 0xCC, + 0x8C, 0x44, 0x7A, 0xCC, 0x8C, 0x64, 0x7A, 0xCC, + 0x8C, 0x4C, 0x4A, 0x4C, 0x6A, 0x6C, 0x6A, 0x4E, + 0x4A, 0x4E, 0x6A, 0x6E, 0x6A, 0xF6, 0x41, 0xCC, + 0x8C, 0xF6, 0x61, 0xCC, 0x8C, 0xF6, 0x49, 0xCC, + 0x8C, 0xF6, 0x69, 0xCC, 0x8C, 0xF6, 0x4F, 0xCC, + 0x8C, 0xF6, 0x6F, 0xCC, 0x8C, 0xF6, 0x55, 0xCC, + 0x8C, 0xF6, 0x75, 0xCC, 0x8C, 0xF6, 0x55, 0xCC, + 0x88, 0xCC, 0x84, 0xF6, 0x75, 0xCC, 0x88, 
0xCC, + 0x84, 0xF6, 0x55, 0xCC, 0x88, 0xCC, 0x81, 0xF6, + 0x75, 0xCC, 0x88, 0xCC, 0x81, 0xF6, 0x55, 0xCC, + 0x88, 0xCC, 0x8C, 0xF6, 0x75, 0xCC, 0x88, 0xCC, + 0x8C, 0xF6, 0x55, 0xCC, 0x88, 0xCC, 0x80, 0xF6, + 0x75, 0xCC, 0x88, 0xCC, 0x80, 0xF6, 0x41, 0xCC, + 0x88, 0xCC, 0x84, 0xF6, 0x61, 0xCC, 0x88, 0xCC, + 0x84, 0xF6, 0x41, 0xCC, 0x87, 0xCC, 0x84, 0xF6, + 0x61, 0xCC, 0x87, 0xCC, 0x84, 0xF6, 0xC3, 0x86, + 0xCC, 0x84, 0xF6, 0xC3, 0xA6, 0xCC, 0x84, 0xF6, + 0x47, 0xCC, 0x8C, 0xF6, 0x67, 0xCC, 0x8C, 0xF6, + 0x4B, 0xCC, 0x8C, 0xF6, 0x6B, 0xCC, 0x8C, 0xF6, + 0x4F, 0xCC, 0xA8, 0xF6, 0x6F, 0xCC, 0xA8, 0xF6, + 0x4F, 0xCC, 0xA8, 0xCC, 0x84, 0xF6, 0x6F, 0xCC, + 0xA8, 0xCC, 0x84, 0xF6, 0xC6, 0xB7, 0xCC, 0x8C, + 0xF6, 0xCA, 0x92, 0xCC, 0x8C, 0xF6, 0x6A, 0xCC, + 0x8C, 0x44, 0x5A, 0x44, 0x7A, 0x64, 0x7A, 0xF6, + 0x47, 0xCC, 0x81, 0xF6, 0x67, 0xCC, 0x81, 0xF6, + 0x4E, 0xCC, 0x80, 0xF6, 0x6E, 0xCC, 0x80, 0xF6, + 0x41, 0xCC, 0x8A, 0xCC, 0x81, 0xF6, 0x61, 0xCC, + 0x8A, 0xCC, 0x81, 0xF6, 0xC3, 0x86, 0xCC, 0x81, + 0xF6, 0xC3, 0xA6, 0xCC, 0x81, 0xF6, 0xC3, 0x98, + 0xCC, 0x81, 0xF6, 0xC3, 0xB8, 0xCC, 0x81, 0xF6, + 0x41, 0xCC, 0x8F, 0xF6, 0x61, 0xCC, 0x8F, 0xF6, + 0x41, 0xCC, 0x91, 0xF6, 0x61, 0xCC, 0x91, 0xF6, + 0x45, 0xCC, 0x8F, 0xF6, 0x65, 0xCC, 0x8F, 0xF6, + 0x45, 0xCC, 0x91, 0xF6, 0x65, 0xCC, 0x91, 0xF6, + 0x49, 0xCC, 0x8F, 0xF6, 0x69, 0xCC, 0x8F, 0xF6, + 0x49, 0xCC, 0x91, 0xF6, 0x69, 0xCC, 0x91, 0xF6, + 0x4F, 0xCC, 0x8F, 0xF6, 0x6F, 0xCC, 0x8F, 0xF6, + 0x4F, 0xCC, 0x91, 0xF6, 0x6F, 0xCC, 0x91, 0xF6, + 0x52, 0xCC, 0x8F, 0xF6, 0x72, 0xCC, 0x8F, 0xF6, + 0x52, 0xCC, 0x91, 0xF6, 0x72, 0xCC, 0x91, 0xF6, + 0x55, 0xCC, 0x8F, 0xF6, 0x75, 0xCC, 0x8F, 0xF6, + 0x55, 0xCC, 0x91, 0xF6, 0x75, 0xCC, 0x91, 0xF6, + 0x53, 0xCC, 0xA6, 0xF6, 0x73, 0xCC, 0xA6, 0xF6, + 0x54, 0xCC, 0xA6, 0xF6, 0x74, 0xCC, 0xA6, 0xF6, + 0x48, 0xCC, 0x8C, 0xF6, 0x68, 0xCC, 0x8C, 0xF6, + 0x41, 0xCC, 0x87, 0xF6, 0x61, 0xCC, 0x87, 0xF6, + 0x45, 0xCC, 0xA7, 0xF6, 0x65, 0xCC, 0xA7, 0xF6, + 0x4F, 0xCC, 0x88, 0xCC, 0x84, 0xF6, 0x6F, 0xCC, + 0x88, 0xCC, 0x84, 0xF6, 0x4F, 0xCC, 0x83, 0xCC, + 0x84, 0xF6, 0x6F, 0xCC, 0x83, 0xCC, 0x84, 0xF6, + 0x4F, 0xCC, 0x87, 0xF6, 0x6F, 0xCC, 0x87, 0xF6, + 0x4F, 0xCC, 0x87, 0xCC, 0x84, 0xF6, 0x6F, 0xCC, + 0x87, 0xCC, 0x84, 0xF6, 0x59, 0xCC, 0x84, 0xF6, + 0x79, 0xCC, 0x84, 0x68, 0xC9, 0xA6, 0x6A, 0x72, + 0xC9, 0xB9, 0xC9, 0xBB, 0xCA, 0x81, 0x77, 0x79, + 0x20, 0xCC, 0x86, 0x20, 0xCC, 0x87, 0x20, 0xCC, + 0x8A, 0x20, 0xCC, 0xA8, 0x20, 0xCC, 0x83, 0x20, + 0xCC, 0x8B, 0xC9, 0xA3, 0x6C, 0x73, 0x78, 0xCA, + 0x95, 0xF6, 0xCC, 0x80, 0xF6, 0xCC, 0x81, 0xF6, + 0xCC, 0x93, 0xF6, 0xCC, 0x88, 0xCC, 0x81, 0xF6, + 0xCA, 0xB9, 0x20, 0xCD, 0x85, 0xF6, 0x3B, 0x20, + 0xCC, 0x81, 0xF5, 0x05, 0xC2, 0xA8, 0xCC, 0x81, + 0x20, 0xCC, 0x88, 0xCC, 0x81, 0xF6, 0xCE, 0x91, + 0xCC, 0x81, 0xF6, 0xC2, 0xB7, 0xF6, 0xCE, 0x95, + 0xCC, 0x81, 0xF6, 0xCE, 0x97, 0xCC, 0x81, 0xF6, + 0xCE, 0x99, 0xCC, 0x81, 0xF6, 0xCE, 0x9F, 0xCC, + 0x81, 0xF6, 0xCE, 0xA5, 0xCC, 0x81, 0xF6, 0xCE, + 0xA9, 0xCC, 0x81, 0xF6, 0xCE, 0xB9, 0xCC, 0x88, + 0xCC, 0x81, 0xF6, 0xCE, 0x99, 0xCC, 0x88, 0xF6, + 0xCE, 0xA5, 0xCC, 0x88, 0xF6, 0xCE, 0xB1, 0xCC, + 0x81, 0xF6, 0xCE, 0xB5, 0xCC, 0x81, 0xF6, 0xCE, + 0xB7, 0xCC, 0x81, 0xF6, 0xCE, 0xB9, 0xCC, 0x81, + 0xF6, 0xCF, 0x85, 0xCC, 0x88, 0xCC, 0x81, 0xF6, + 0xCE, 0xB9, 0xCC, 0x88, 0xF6, 0xCF, 0x85, 0xCC, + 0x88, 0xF6, 0xCE, 0xBF, 0xCC, 0x81, 0xF6, 0xCF, + 0x85, 0xCC, 0x81, 0xF6, 0xCF, 0x89, 0xCC, 0x81, + 0xCE, 0xB2, 0xCE, 0xB8, 0xCE, 0xA5, 0xF5, 0x05, + 0xCF, 0x92, 0xCC, 0x81, 0xCE, 0xA5, 0xCC, 0x81, + 0xF5, 0x05, 0xCF, 0x92, 0xCC, 0x88, 0xCE, 
0xA5, + 0xCC, 0x88, 0xCF, 0x86, 0xCF, 0x80, 0xCE, 0xBA, + 0xCF, 0x81, 0xCF, 0x82, 0xCE, 0x98, 0xCE, 0xB5, + 0xF6, 0xD0, 0x95, 0xCC, 0x80, 0xF6, 0xD0, 0x95, + 0xCC, 0x88, 0xF6, 0xD0, 0x93, 0xCC, 0x81, 0xF6, + 0xD0, 0x86, 0xCC, 0x88, 0xF6, 0xD0, 0x9A, 0xCC, + 0x81, 0xF6, 0xD0, 0x98, 0xCC, 0x80, 0xF6, 0xD0, + 0xA3, 0xCC, 0x86, 0xF6, 0xD0, 0x98, 0xCC, 0x86, + 0xF6, 0xD0, 0xB8, 0xCC, 0x86, 0xF6, 0xD0, 0xB5, + 0xCC, 0x80, 0xF6, 0xD0, 0xB5, 0xCC, 0x88, 0xF6, + 0xD0, 0xB3, 0xCC, 0x81, 0xF6, 0xD1, 0x96, 0xCC, + 0x88, 0xF6, 0xD0, 0xBA, 0xCC, 0x81, 0xF6, 0xD0, + 0xB8, 0xCC, 0x80, 0xF6, 0xD1, 0x83, 0xCC, 0x86, + 0xF6, 0xD1, 0xB4, 0xCC, 0x8F, 0xF6, 0xD1, 0xB5, + 0xCC, 0x8F, 0xF6, 0xD0, 0x96, 0xCC, 0x86, 0xF6, + 0xD0, 0xB6, 0xCC, 0x86, 0xF6, 0xD0, 0x90, 0xCC, + 0x86, 0xF6, 0xD0, 0xB0, 0xCC, 0x86, 0xF6, 0xD0, + 0x90, 0xCC, 0x88, 0xF6, 0xD0, 0xB0, 0xCC, 0x88, + 0xF6, 0xD0, 0x95, 0xCC, 0x86, 0xF6, 0xD0, 0xB5, + 0xCC, 0x86, 0xF6, 0xD3, 0x98, 0xCC, 0x88, 0xF6, + 0xD3, 0x99, 0xCC, 0x88, 0xF6, 0xD0, 0x96, 0xCC, + 0x88, 0xF6, 0xD0, 0xB6, 0xCC, 0x88, 0xF6, 0xD0, + 0x97, 0xCC, 0x88, 0xF6, 0xD0, 0xB7, 0xCC, 0x88, + 0xF6, 0xD0, 0x98, 0xCC, 0x84, 0xF6, 0xD0, 0xB8, + 0xCC, 0x84, 0xF6, 0xD0, 0x98, 0xCC, 0x88, 0xF6, + 0xD0, 0xB8, 0xCC, 0x88, 0xF6, 0xD0, 0x9E, 0xCC, + 0x88, 0xF6, 0xD0, 0xBE, 0xCC, 0x88, 0xF6, 0xD3, + 0xA8, 0xCC, 0x88, 0xF6, 0xD3, 0xA9, 0xCC, 0x88, + 0xF6, 0xD0, 0xAD, 0xCC, 0x88, 0xF6, 0xD1, 0x8D, + 0xCC, 0x88, 0xF6, 0xD0, 0xA3, 0xCC, 0x84, 0xF6, + 0xD1, 0x83, 0xCC, 0x84, 0xF6, 0xD0, 0xA3, 0xCC, + 0x88, 0xF6, 0xD1, 0x83, 0xCC, 0x88, 0xF6, 0xD0, + 0xA3, 0xCC, 0x8B, 0xF6, 0xD1, 0x83, 0xCC, 0x8B, + 0xF6, 0xD0, 0xA7, 0xCC, 0x88, 0xF6, 0xD1, 0x87, + 0xCC, 0x88, 0xF6, 0xD0, 0xAB, 0xCC, 0x88, 0xF6, + 0xD1, 0x8B, 0xCC, 0x88, 0xD5, 0xA5, 0xD6, 0x82, + 0xF6, 0xD8, 0xA7, 0xD9, 0x93, 0xF6, 0xD8, 0xA7, + 0xD9, 0x94, 0xF6, 0xD9, 0x88, 0xD9, 0x94, 0xF6, + 0xD8, 0xA7, 0xD9, 0x95, 0xF6, 0xD9, 0x8A, 0xD9, + 0x94, 0xD8, 0xA7, 0xD9, 0xB4, 0xD9, 0x88, 0xD9, + 0xB4, 0xDB, 0x87, 0xD9, 0xB4, 0xD9, 0x8A, 0xD9, + 0xB4, 0xF6, 0xDB, 0x95, 0xD9, 0x94, 0xF6, 0xDB, + 0x81, 0xD9, 0x94, 0xF6, 0xDB, 0x92, 0xD9, 0x94, + 0xF6, 0xE0, 0xA4, 0xA8, 0xE0, 0xA4, 0xBC, 0xF6, + 0xE0, 0xA4, 0xB0, 0xE0, 0xA4, 0xBC, 0xF6, 0xE0, + 0xA4, 0xB3, 0xE0, 0xA4, 0xBC, 0xF6, 0xE0, 0xA4, + 0x95, 0xE0, 0xA4, 0xBC, 0xF6, 0xE0, 0xA4, 0x96, + 0xE0, 0xA4, 0xBC, 0xF6, 0xE0, 0xA4, 0x97, 0xE0, + 0xA4, 0xBC, 0xF6, 0xE0, 0xA4, 0x9C, 0xE0, 0xA4, + 0xBC, 0xF6, 0xE0, 0xA4, 0xA1, 0xE0, 0xA4, 0xBC, + 0xF6, 0xE0, 0xA4, 0xA2, 0xE0, 0xA4, 0xBC, 0xF6, + 0xE0, 0xA4, 0xAB, 0xE0, 0xA4, 0xBC, 0xF6, 0xE0, + 0xA4, 0xAF, 0xE0, 0xA4, 0xBC, 0xF6, 0xE0, 0xA7, + 0x87, 0xE0, 0xA6, 0xBE, 0xF6, 0xE0, 0xA7, 0x87, + 0xE0, 0xA7, 0x97, 0xF6, 0xE0, 0xA6, 0xA1, 0xE0, + 0xA6, 0xBC, 0xF6, 0xE0, 0xA6, 0xA2, 0xE0, 0xA6, + 0xBC, 0xF6, 0xE0, 0xA6, 0xAF, 0xE0, 0xA6, 0xBC, + 0xF6, 0xE0, 0xA8, 0xB2, 0xE0, 0xA8, 0xBC, 0xF6, + 0xE0, 0xA8, 0xB8, 0xE0, 0xA8, 0xBC, 0xF6, 0xE0, + 0xA8, 0x96, 0xE0, 0xA8, 0xBC, 0xF6, 0xE0, 0xA8, + 0x97, 0xE0, 0xA8, 0xBC, 0xF6, 0xE0, 0xA8, 0x9C, + 0xE0, 0xA8, 0xBC, 0xF6, 0xE0, 0xA8, 0xAB, 0xE0, + 0xA8, 0xBC, 0xF6, 0xE0, 0xAD, 0x87, 0xE0, 0xAD, + 0x96, 0xF6, 0xE0, 0xAD, 0x87, 0xE0, 0xAC, 0xBE, + 0xF6, 0xE0, 0xAD, 0x87, 0xE0, 0xAD, 0x97, 0xF6, + 0xE0, 0xAC, 0xA1, 0xE0, 0xAC, 0xBC, 0xF6, 0xE0, + 0xAC, 0xA2, 0xE0, 0xAC, 0xBC, 0xF6, 0xE0, 0xAE, + 0x92, 0xE0, 0xAF, 0x97, 0xF6, 0xE0, 0xAF, 0x86, + 0xE0, 0xAE, 0xBE, 0xF6, 0xE0, 0xAF, 0x87, 0xE0, + 0xAE, 0xBE, 0xF6, 0xE0, 0xAF, 0x86, 0xE0, 0xAF, + 0x97, 0xF6, 0xE0, 0xB1, 0x86, 0xE0, 0xB1, 0x96, + 0xF6, 0xE0, 0xB2, 0xBF, 0xE0, 0xB3, 0x95, 
0xF6, + 0xE0, 0xB3, 0x86, 0xE0, 0xB3, 0x95, 0xF6, 0xE0, + 0xB3, 0x86, 0xE0, 0xB3, 0x96, 0xF6, 0xE0, 0xB3, + 0x86, 0xE0, 0xB3, 0x82, 0xF6, 0xE0, 0xB3, 0x86, + 0xE0, 0xB3, 0x82, 0xE0, 0xB3, 0x95, 0xF6, 0xE0, + 0xB5, 0x86, 0xE0, 0xB4, 0xBE, 0xF6, 0xE0, 0xB5, + 0x87, 0xE0, 0xB4, 0xBE, 0xF6, 0xE0, 0xB5, 0x86, + 0xE0, 0xB5, 0x97, 0xF6, 0xE0, 0xB7, 0x99, 0xE0, + 0xB7, 0x8A, 0xF6, 0xE0, 0xB7, 0x99, 0xE0, 0xB7, + 0x8F, 0xF6, 0xE0, 0xB7, 0x99, 0xE0, 0xB7, 0x8F, + 0xE0, 0xB7, 0x8A, 0xF6, 0xE0, 0xB7, 0x99, 0xE0, + 0xB7, 0x9F, 0xE0, 0xB9, 0x8D, 0xE0, 0xB8, 0xB2, + 0xE0, 0xBB, 0x8D, 0xE0, 0xBA, 0xB2, 0xE0, 0xBA, + 0xAB, 0xE0, 0xBA, 0x99, 0xE0, 0xBA, 0xAB, 0xE0, + 0xBA, 0xA1, 0xE0, 0xBC, 0x8B, 0xF6, 0xE0, 0xBD, + 0x82, 0xE0, 0xBE, 0xB7, 0xF6, 0xE0, 0xBD, 0x8C, + 0xE0, 0xBE, 0xB7, 0xF6, 0xE0, 0xBD, 0x91, 0xE0, + 0xBE, 0xB7, 0xF6, 0xE0, 0xBD, 0x96, 0xE0, 0xBE, + 0xB7, 0xF6, 0xE0, 0xBD, 0x9B, 0xE0, 0xBE, 0xB7, + 0xF6, 0xE0, 0xBD, 0x80, 0xE0, 0xBE, 0xB5, 0xF6, + 0xE0, 0xBD, 0xB1, 0xE0, 0xBD, 0xB2, 0xF6, 0xE0, + 0xBD, 0xB1, 0xE0, 0xBD, 0xB4, 0xF6, 0xE0, 0xBE, + 0xB2, 0xE0, 0xBE, 0x80, 0xE0, 0xBE, 0xB2, 0xE0, + 0xBD, 0xB1, 0xE0, 0xBE, 0x80, 0xF6, 0xE0, 0xBE, + 0xB3, 0xE0, 0xBE, 0x80, 0xE0, 0xBE, 0xB3, 0xE0, + 0xBD, 0xB1, 0xE0, 0xBE, 0x80, 0xF6, 0xE0, 0xBD, + 0xB1, 0xE0, 0xBE, 0x80, 0xF6, 0xE0, 0xBE, 0x92, + 0xE0, 0xBE, 0xB7, 0xF6, 0xE0, 0xBE, 0x9C, 0xE0, + 0xBE, 0xB7, 0xF6, 0xE0, 0xBE, 0xA1, 0xE0, 0xBE, + 0xB7, 0xF6, 0xE0, 0xBE, 0xA6, 0xE0, 0xBE, 0xB7, + 0xF6, 0xE0, 0xBE, 0xAB, 0xE0, 0xBE, 0xB7, 0xF6, + 0xE0, 0xBE, 0x90, 0xE0, 0xBE, 0xB5, 0xF6, 0xE1, + 0x80, 0xA5, 0xE1, 0x80, 0xAE, 0xF6, 0x41, 0xCC, + 0xA5, 0xF6, 0x61, 0xCC, 0xA5, 0xF6, 0x42, 0xCC, + 0x87, 0xF6, 0x62, 0xCC, 0x87, 0xF6, 0x42, 0xCC, + 0xA3, 0xF6, 0x62, 0xCC, 0xA3, 0xF6, 0x42, 0xCC, + 0xB1, 0xF6, 0x62, 0xCC, 0xB1, 0xF6, 0x43, 0xCC, + 0xA7, 0xCC, 0x81, 0xF6, 0x63, 0xCC, 0xA7, 0xCC, + 0x81, 0xF6, 0x44, 0xCC, 0x87, 0xF6, 0x64, 0xCC, + 0x87, 0xF6, 0x44, 0xCC, 0xA3, 0xF6, 0x64, 0xCC, + 0xA3, 0xF6, 0x44, 0xCC, 0xB1, 0xF6, 0x64, 0xCC, + 0xB1, 0xF6, 0x44, 0xCC, 0xA7, 0xF6, 0x64, 0xCC, + 0xA7, 0xF6, 0x44, 0xCC, 0xAD, 0xF6, 0x64, 0xCC, + 0xAD, 0xF6, 0x45, 0xCC, 0x84, 0xCC, 0x80, 0xF6, + 0x65, 0xCC, 0x84, 0xCC, 0x80, 0xF6, 0x45, 0xCC, + 0x84, 0xCC, 0x81, 0xF6, 0x65, 0xCC, 0x84, 0xCC, + 0x81, 0xF6, 0x45, 0xCC, 0xAD, 0xF6, 0x65, 0xCC, + 0xAD, 0xF6, 0x45, 0xCC, 0xB0, 0xF6, 0x65, 0xCC, + 0xB0, 0xF6, 0x45, 0xCC, 0xA7, 0xCC, 0x86, 0xF6, + 0x65, 0xCC, 0xA7, 0xCC, 0x86, 0xF6, 0x46, 0xCC, + 0x87, 0xF6, 0x66, 0xCC, 0x87, 0xF6, 0x47, 0xCC, + 0x84, 0xF6, 0x67, 0xCC, 0x84, 0xF6, 0x48, 0xCC, + 0x87, 0xF6, 0x68, 0xCC, 0x87, 0xF6, 0x48, 0xCC, + 0xA3, 0xF6, 0x68, 0xCC, 0xA3, 0xF6, 0x48, 0xCC, + 0x88, 0xF6, 0x68, 0xCC, 0x88, 0xF6, 0x48, 0xCC, + 0xA7, 0xF6, 0x68, 0xCC, 0xA7, 0xF6, 0x48, 0xCC, + 0xAE, 0xF6, 0x68, 0xCC, 0xAE, 0xF6, 0x49, 0xCC, + 0xB0, 0xF6, 0x69, 0xCC, 0xB0, 0xF6, 0x49, 0xCC, + 0x88, 0xCC, 0x81, 0xF6, 0x69, 0xCC, 0x88, 0xCC, + 0x81, 0xF6, 0x4B, 0xCC, 0x81, 0xF6, 0x6B, 0xCC, + 0x81, 0xF6, 0x4B, 0xCC, 0xA3, 0xF6, 0x6B, 0xCC, + 0xA3, 0xF6, 0x4B, 0xCC, 0xB1, 0xF6, 0x6B, 0xCC, + 0xB1, 0xF6, 0x4C, 0xCC, 0xA3, 0xF6, 0x6C, 0xCC, + 0xA3, 0xF6, 0x4C, 0xCC, 0xA3, 0xCC, 0x84, 0xF6, + 0x6C, 0xCC, 0xA3, 0xCC, 0x84, 0xF6, 0x4C, 0xCC, + 0xB1, 0xF6, 0x6C, 0xCC, 0xB1, 0xF6, 0x4C, 0xCC, + 0xAD, 0xF6, 0x6C, 0xCC, 0xAD, 0xF6, 0x4D, 0xCC, + 0x81, 0xF6, 0x6D, 0xCC, 0x81, 0xF6, 0x4D, 0xCC, + 0x87, 0xF6, 0x6D, 0xCC, 0x87, 0xF6, 0x4D, 0xCC, + 0xA3, 0xF6, 0x6D, 0xCC, 0xA3, 0xF6, 0x4E, 0xCC, + 0x87, 0xF6, 0x6E, 0xCC, 0x87, 0xF6, 0x4E, 0xCC, + 0xA3, 0xF6, 0x6E, 0xCC, 0xA3, 0xF6, 0x4E, 
0xCC, + 0xB1, 0xF6, 0x6E, 0xCC, 0xB1, 0xF6, 0x4E, 0xCC, + 0xAD, 0xF6, 0x6E, 0xCC, 0xAD, 0xF6, 0x4F, 0xCC, + 0x83, 0xCC, 0x81, 0xF6, 0x6F, 0xCC, 0x83, 0xCC, + 0x81, 0xF6, 0x4F, 0xCC, 0x83, 0xCC, 0x88, 0xF6, + 0x6F, 0xCC, 0x83, 0xCC, 0x88, 0xF6, 0x4F, 0xCC, + 0x84, 0xCC, 0x80, 0xF6, 0x6F, 0xCC, 0x84, 0xCC, + 0x80, 0xF6, 0x4F, 0xCC, 0x84, 0xCC, 0x81, 0xF6, + 0x6F, 0xCC, 0x84, 0xCC, 0x81, 0xF6, 0x50, 0xCC, + 0x81, 0xF6, 0x70, 0xCC, 0x81, 0xF6, 0x50, 0xCC, + 0x87, 0xF6, 0x70, 0xCC, 0x87, 0xF6, 0x52, 0xCC, + 0x87, 0xF6, 0x72, 0xCC, 0x87, 0xF6, 0x52, 0xCC, + 0xA3, 0xF6, 0x72, 0xCC, 0xA3, 0xF6, 0x52, 0xCC, + 0xA3, 0xCC, 0x84, 0xF6, 0x72, 0xCC, 0xA3, 0xCC, + 0x84, 0xF6, 0x52, 0xCC, 0xB1, 0xF6, 0x72, 0xCC, + 0xB1, 0xF6, 0x53, 0xCC, 0x87, 0xF6, 0x73, 0xCC, + 0x87, 0xF6, 0x53, 0xCC, 0xA3, 0xF6, 0x73, 0xCC, + 0xA3, 0xF6, 0x53, 0xCC, 0x81, 0xCC, 0x87, 0xF6, + 0x73, 0xCC, 0x81, 0xCC, 0x87, 0xF6, 0x53, 0xCC, + 0x8C, 0xCC, 0x87, 0xF6, 0x73, 0xCC, 0x8C, 0xCC, + 0x87, 0xF6, 0x53, 0xCC, 0xA3, 0xCC, 0x87, 0xF6, + 0x73, 0xCC, 0xA3, 0xCC, 0x87, 0xF6, 0x54, 0xCC, + 0x87, 0xF6, 0x74, 0xCC, 0x87, 0xF6, 0x54, 0xCC, + 0xA3, 0xF6, 0x74, 0xCC, 0xA3, 0xF6, 0x54, 0xCC, + 0xB1, 0xF6, 0x74, 0xCC, 0xB1, 0xF6, 0x54, 0xCC, + 0xAD, 0xF6, 0x74, 0xCC, 0xAD, 0xF6, 0x55, 0xCC, + 0xA4, 0xF6, 0x75, 0xCC, 0xA4, 0xF6, 0x55, 0xCC, + 0xB0, 0xF6, 0x75, 0xCC, 0xB0, 0xF6, 0x55, 0xCC, + 0xAD, 0xF6, 0x75, 0xCC, 0xAD, 0xF6, 0x55, 0xCC, + 0x83, 0xCC, 0x81, 0xF6, 0x75, 0xCC, 0x83, 0xCC, + 0x81, 0xF6, 0x55, 0xCC, 0x84, 0xCC, 0x88, 0xF6, + 0x75, 0xCC, 0x84, 0xCC, 0x88, 0xF6, 0x56, 0xCC, + 0x83, 0xF6, 0x76, 0xCC, 0x83, 0xF6, 0x56, 0xCC, + 0xA3, 0xF6, 0x76, 0xCC, 0xA3, 0xF6, 0x57, 0xCC, + 0x80, 0xF6, 0x77, 0xCC, 0x80, 0xF6, 0x57, 0xCC, + 0x81, 0xF6, 0x77, 0xCC, 0x81, 0xF6, 0x57, 0xCC, + 0x88, 0xF6, 0x77, 0xCC, 0x88, 0xF6, 0x57, 0xCC, + 0x87, 0xF6, 0x77, 0xCC, 0x87, 0xF6, 0x57, 0xCC, + 0xA3, 0xF6, 0x77, 0xCC, 0xA3, 0xF6, 0x58, 0xCC, + 0x87, 0xF6, 0x78, 0xCC, 0x87, 0xF6, 0x58, 0xCC, + 0x88, 0xF6, 0x78, 0xCC, 0x88, 0xF6, 0x59, 0xCC, + 0x87, 0xF6, 0x79, 0xCC, 0x87, 0xF6, 0x5A, 0xCC, + 0x82, 0xF6, 0x7A, 0xCC, 0x82, 0xF6, 0x5A, 0xCC, + 0xA3, 0xF6, 0x7A, 0xCC, 0xA3, 0xF6, 0x5A, 0xCC, + 0xB1, 0xF6, 0x7A, 0xCC, 0xB1, 0xF6, 0x68, 0xCC, + 0xB1, 0xF6, 0x74, 0xCC, 0x88, 0xF6, 0x77, 0xCC, + 0x8A, 0xF6, 0x79, 0xCC, 0x8A, 0x61, 0xCA, 0xBE, + 0xF5, 0x05, 0xC5, 0xBF, 0xCC, 0x87, 0x73, 0xCC, + 0x87, 0xF6, 0x41, 0xCC, 0xA3, 0xF6, 0x61, 0xCC, + 0xA3, 0xF6, 0x41, 0xCC, 0x89, 0xF6, 0x61, 0xCC, + 0x89, 0xF6, 0x41, 0xCC, 0x82, 0xCC, 0x81, 0xF6, + 0x61, 0xCC, 0x82, 0xCC, 0x81, 0xF6, 0x41, 0xCC, + 0x82, 0xCC, 0x80, 0xF6, 0x61, 0xCC, 0x82, 0xCC, + 0x80, 0xF6, 0x41, 0xCC, 0x82, 0xCC, 0x89, 0xF6, + 0x61, 0xCC, 0x82, 0xCC, 0x89, 0xF6, 0x41, 0xCC, + 0x82, 0xCC, 0x83, 0xF6, 0x61, 0xCC, 0x82, 0xCC, + 0x83, 0xF6, 0x41, 0xCC, 0xA3, 0xCC, 0x82, 0xF6, + 0x61, 0xCC, 0xA3, 0xCC, 0x82, 0xF6, 0x41, 0xCC, + 0x86, 0xCC, 0x81, 0xF6, 0x61, 0xCC, 0x86, 0xCC, + 0x81, 0xF6, 0x41, 0xCC, 0x86, 0xCC, 0x80, 0xF6, + 0x61, 0xCC, 0x86, 0xCC, 0x80, 0xF6, 0x41, 0xCC, + 0x86, 0xCC, 0x89, 0xF6, 0x61, 0xCC, 0x86, 0xCC, + 0x89, 0xF6, 0x41, 0xCC, 0x86, 0xCC, 0x83, 0xF6, + 0x61, 0xCC, 0x86, 0xCC, 0x83, 0xF6, 0x41, 0xCC, + 0xA3, 0xCC, 0x86, 0xF6, 0x61, 0xCC, 0xA3, 0xCC, + 0x86, 0xF6, 0x45, 0xCC, 0xA3, 0xF6, 0x65, 0xCC, + 0xA3, 0xF6, 0x45, 0xCC, 0x89, 0xF6, 0x65, 0xCC, + 0x89, 0xF6, 0x45, 0xCC, 0x83, 0xF6, 0x65, 0xCC, + 0x83, 0xF6, 0x45, 0xCC, 0x82, 0xCC, 0x81, 0xF6, + 0x65, 0xCC, 0x82, 0xCC, 0x81, 0xF6, 0x45, 0xCC, + 0x82, 0xCC, 0x80, 0xF6, 0x65, 0xCC, 0x82, 0xCC, + 0x80, 0xF6, 0x45, 0xCC, 0x82, 0xCC, 0x89, 
0xF6, + 0x65, 0xCC, 0x82, 0xCC, 0x89, 0xF6, 0x45, 0xCC, + 0x82, 0xCC, 0x83, 0xF6, 0x65, 0xCC, 0x82, 0xCC, + 0x83, 0xF6, 0x45, 0xCC, 0xA3, 0xCC, 0x82, 0xF6, + 0x65, 0xCC, 0xA3, 0xCC, 0x82, 0xF6, 0x49, 0xCC, + 0x89, 0xF6, 0x69, 0xCC, 0x89, 0xF6, 0x49, 0xCC, + 0xA3, 0xF6, 0x69, 0xCC, 0xA3, 0xF6, 0x4F, 0xCC, + 0xA3, 0xF6, 0x6F, 0xCC, 0xA3, 0xF6, 0x4F, 0xCC, + 0x89, 0xF6, 0x6F, 0xCC, 0x89, 0xF6, 0x4F, 0xCC, + 0x82, 0xCC, 0x81, 0xF6, 0x6F, 0xCC, 0x82, 0xCC, + 0x81, 0xF6, 0x4F, 0xCC, 0x82, 0xCC, 0x80, 0xF6, + 0x6F, 0xCC, 0x82, 0xCC, 0x80, 0xF6, 0x4F, 0xCC, + 0x82, 0xCC, 0x89, 0xF6, 0x6F, 0xCC, 0x82, 0xCC, + 0x89, 0xF6, 0x4F, 0xCC, 0x82, 0xCC, 0x83, 0xF6, + 0x6F, 0xCC, 0x82, 0xCC, 0x83, 0xF6, 0x4F, 0xCC, + 0xA3, 0xCC, 0x82, 0xF6, 0x6F, 0xCC, 0xA3, 0xCC, + 0x82, 0xF6, 0x4F, 0xCC, 0x9B, 0xCC, 0x81, 0xF6, + 0x6F, 0xCC, 0x9B, 0xCC, 0x81, 0xF6, 0x4F, 0xCC, + 0x9B, 0xCC, 0x80, 0xF6, 0x6F, 0xCC, 0x9B, 0xCC, + 0x80, 0xF6, 0x4F, 0xCC, 0x9B, 0xCC, 0x89, 0xF6, + 0x6F, 0xCC, 0x9B, 0xCC, 0x89, 0xF6, 0x4F, 0xCC, + 0x9B, 0xCC, 0x83, 0xF6, 0x6F, 0xCC, 0x9B, 0xCC, + 0x83, 0xF6, 0x4F, 0xCC, 0x9B, 0xCC, 0xA3, 0xF6, + 0x6F, 0xCC, 0x9B, 0xCC, 0xA3, 0xF6, 0x55, 0xCC, + 0xA3, 0xF6, 0x75, 0xCC, 0xA3, 0xF6, 0x55, 0xCC, + 0x89, 0xF6, 0x75, 0xCC, 0x89, 0xF6, 0x55, 0xCC, + 0x9B, 0xCC, 0x81, 0xF6, 0x75, 0xCC, 0x9B, 0xCC, + 0x81, 0xF6, 0x55, 0xCC, 0x9B, 0xCC, 0x80, 0xF6, + 0x75, 0xCC, 0x9B, 0xCC, 0x80, 0xF6, 0x55, 0xCC, + 0x9B, 0xCC, 0x89, 0xF6, 0x75, 0xCC, 0x9B, 0xCC, + 0x89, 0xF6, 0x55, 0xCC, 0x9B, 0xCC, 0x83, 0xF6, + 0x75, 0xCC, 0x9B, 0xCC, 0x83, 0xF6, 0x55, 0xCC, + 0x9B, 0xCC, 0xA3, 0xF6, 0x75, 0xCC, 0x9B, 0xCC, + 0xA3, 0xF6, 0x59, 0xCC, 0x80, 0xF6, 0x79, 0xCC, + 0x80, 0xF6, 0x59, 0xCC, 0xA3, 0xF6, 0x79, 0xCC, + 0xA3, 0xF6, 0x59, 0xCC, 0x89, 0xF6, 0x79, 0xCC, + 0x89, 0xF6, 0x59, 0xCC, 0x83, 0xF6, 0x79, 0xCC, + 0x83, 0xF6, 0xCE, 0xB1, 0xCC, 0x93, 0xF6, 0xCE, + 0xB1, 0xCC, 0x94, 0xF6, 0xCE, 0xB1, 0xCC, 0x93, + 0xCC, 0x80, 0xF6, 0xCE, 0xB1, 0xCC, 0x94, 0xCC, + 0x80, 0xF6, 0xCE, 0xB1, 0xCC, 0x93, 0xCC, 0x81, + 0xF6, 0xCE, 0xB1, 0xCC, 0x94, 0xCC, 0x81, 0xF6, + 0xCE, 0xB1, 0xCC, 0x93, 0xCD, 0x82, 0xF6, 0xCE, + 0xB1, 0xCC, 0x94, 0xCD, 0x82, 0xF6, 0xCE, 0x91, + 0xCC, 0x93, 0xF6, 0xCE, 0x91, 0xCC, 0x94, 0xF6, + 0xCE, 0x91, 0xCC, 0x93, 0xCC, 0x80, 0xF6, 0xCE, + 0x91, 0xCC, 0x94, 0xCC, 0x80, 0xF6, 0xCE, 0x91, + 0xCC, 0x93, 0xCC, 0x81, 0xF6, 0xCE, 0x91, 0xCC, + 0x94, 0xCC, 0x81, 0xF6, 0xCE, 0x91, 0xCC, 0x93, + 0xCD, 0x82, 0xF6, 0xCE, 0x91, 0xCC, 0x94, 0xCD, + 0x82, 0xF6, 0xCE, 0xB5, 0xCC, 0x93, 0xF6, 0xCE, + 0xB5, 0xCC, 0x94, 0xF6, 0xCE, 0xB5, 0xCC, 0x93, + 0xCC, 0x80, 0xF6, 0xCE, 0xB5, 0xCC, 0x94, 0xCC, + 0x80, 0xF6, 0xCE, 0xB5, 0xCC, 0x93, 0xCC, 0x81, + 0xF6, 0xCE, 0xB5, 0xCC, 0x94, 0xCC, 0x81, 0xF6, + 0xCE, 0x95, 0xCC, 0x93, 0xF6, 0xCE, 0x95, 0xCC, + 0x94, 0xF6, 0xCE, 0x95, 0xCC, 0x93, 0xCC, 0x80, + 0xF6, 0xCE, 0x95, 0xCC, 0x94, 0xCC, 0x80, 0xF6, + 0xCE, 0x95, 0xCC, 0x93, 0xCC, 0x81, 0xF6, 0xCE, + 0x95, 0xCC, 0x94, 0xCC, 0x81, 0xF6, 0xCE, 0xB7, + 0xCC, 0x93, 0xF6, 0xCE, 0xB7, 0xCC, 0x94, 0xF6, + 0xCE, 0xB7, 0xCC, 0x93, 0xCC, 0x80, 0xF6, 0xCE, + 0xB7, 0xCC, 0x94, 0xCC, 0x80, 0xF6, 0xCE, 0xB7, + 0xCC, 0x93, 0xCC, 0x81, 0xF6, 0xCE, 0xB7, 0xCC, + 0x94, 0xCC, 0x81, 0xF6, 0xCE, 0xB7, 0xCC, 0x93, + 0xCD, 0x82, 0xF6, 0xCE, 0xB7, 0xCC, 0x94, 0xCD, + 0x82, 0xF6, 0xCE, 0x97, 0xCC, 0x93, 0xF6, 0xCE, + 0x97, 0xCC, 0x94, 0xF6, 0xCE, 0x97, 0xCC, 0x93, + 0xCC, 0x80, 0xF6, 0xCE, 0x97, 0xCC, 0x94, 0xCC, + 0x80, 0xF6, 0xCE, 0x97, 0xCC, 0x93, 0xCC, 0x81, + 0xF6, 0xCE, 0x97, 0xCC, 0x94, 0xCC, 0x81, 0xF6, + 0xCE, 0x97, 0xCC, 0x93, 0xCD, 0x82, 0xF6, 
0xCE, + 0x97, 0xCC, 0x94, 0xCD, 0x82, 0xF6, 0xCE, 0xB9, + 0xCC, 0x93, 0xF6, 0xCE, 0xB9, 0xCC, 0x94, 0xF6, + 0xCE, 0xB9, 0xCC, 0x93, 0xCC, 0x80, 0xF6, 0xCE, + 0xB9, 0xCC, 0x94, 0xCC, 0x80, 0xF6, 0xCE, 0xB9, + 0xCC, 0x93, 0xCC, 0x81, 0xF6, 0xCE, 0xB9, 0xCC, + 0x94, 0xCC, 0x81, 0xF6, 0xCE, 0xB9, 0xCC, 0x93, + 0xCD, 0x82, 0xF6, 0xCE, 0xB9, 0xCC, 0x94, 0xCD, + 0x82, 0xF6, 0xCE, 0x99, 0xCC, 0x93, 0xF6, 0xCE, + 0x99, 0xCC, 0x94, 0xF6, 0xCE, 0x99, 0xCC, 0x93, + 0xCC, 0x80, 0xF6, 0xCE, 0x99, 0xCC, 0x94, 0xCC, + 0x80, 0xF6, 0xCE, 0x99, 0xCC, 0x93, 0xCC, 0x81, + 0xF6, 0xCE, 0x99, 0xCC, 0x94, 0xCC, 0x81, 0xF6, + 0xCE, 0x99, 0xCC, 0x93, 0xCD, 0x82, 0xF6, 0xCE, + 0x99, 0xCC, 0x94, 0xCD, 0x82, 0xF6, 0xCE, 0xBF, + 0xCC, 0x93, 0xF6, 0xCE, 0xBF, 0xCC, 0x94, 0xF6, + 0xCE, 0xBF, 0xCC, 0x93, 0xCC, 0x80, 0xF6, 0xCE, + 0xBF, 0xCC, 0x94, 0xCC, 0x80, 0xF6, 0xCE, 0xBF, + 0xCC, 0x93, 0xCC, 0x81, 0xF6, 0xCE, 0xBF, 0xCC, + 0x94, 0xCC, 0x81, 0xF6, 0xCE, 0x9F, 0xCC, 0x93, + 0xF6, 0xCE, 0x9F, 0xCC, 0x94, 0xF6, 0xCE, 0x9F, + 0xCC, 0x93, 0xCC, 0x80, 0xF6, 0xCE, 0x9F, 0xCC, + 0x94, 0xCC, 0x80, 0xF6, 0xCE, 0x9F, 0xCC, 0x93, + 0xCC, 0x81, 0xF6, 0xCE, 0x9F, 0xCC, 0x94, 0xCC, + 0x81, 0xF6, 0xCF, 0x85, 0xCC, 0x93, 0xF6, 0xCF, + 0x85, 0xCC, 0x94, 0xF6, 0xCF, 0x85, 0xCC, 0x93, + 0xCC, 0x80, 0xF6, 0xCF, 0x85, 0xCC, 0x94, 0xCC, + 0x80, 0xF6, 0xCF, 0x85, 0xCC, 0x93, 0xCC, 0x81, + 0xF6, 0xCF, 0x85, 0xCC, 0x94, 0xCC, 0x81, 0xF6, + 0xCF, 0x85, 0xCC, 0x93, 0xCD, 0x82, 0xF6, 0xCF, + 0x85, 0xCC, 0x94, 0xCD, 0x82, 0xF6, 0xCE, 0xA5, + 0xCC, 0x94, 0xF6, 0xCE, 0xA5, 0xCC, 0x94, 0xCC, + 0x80, 0xF6, 0xCE, 0xA5, 0xCC, 0x94, 0xCC, 0x81, + 0xF6, 0xCE, 0xA5, 0xCC, 0x94, 0xCD, 0x82, 0xF6, + 0xCF, 0x89, 0xCC, 0x93, 0xF6, 0xCF, 0x89, 0xCC, + 0x94, 0xF6, 0xCF, 0x89, 0xCC, 0x93, 0xCC, 0x80, + 0xF6, 0xCF, 0x89, 0xCC, 0x94, 0xCC, 0x80, 0xF6, + 0xCF, 0x89, 0xCC, 0x93, 0xCC, 0x81, 0xF6, 0xCF, + 0x89, 0xCC, 0x94, 0xCC, 0x81, 0xF6, 0xCF, 0x89, + 0xCC, 0x93, 0xCD, 0x82, 0xF6, 0xCF, 0x89, 0xCC, + 0x94, 0xCD, 0x82, 0xF6, 0xCE, 0xA9, 0xCC, 0x93, + 0xF6, 0xCE, 0xA9, 0xCC, 0x94, 0xF6, 0xCE, 0xA9, + 0xCC, 0x93, 0xCC, 0x80, 0xF6, 0xCE, 0xA9, 0xCC, + 0x94, 0xCC, 0x80, 0xF6, 0xCE, 0xA9, 0xCC, 0x93, + 0xCC, 0x81, 0xF6, 0xCE, 0xA9, 0xCC, 0x94, 0xCC, + 0x81, 0xF6, 0xCE, 0xA9, 0xCC, 0x93, 0xCD, 0x82, + 0xF6, 0xCE, 0xA9, 0xCC, 0x94, 0xCD, 0x82, 0xF6, + 0xCE, 0xB1, 0xCC, 0x80, 0xF6, 0xCE, 0xB1, 0xCC, + 0x81, 0xF6, 0xCE, 0xB5, 0xCC, 0x80, 0xF6, 0xCE, + 0xB5, 0xCC, 0x81, 0xF6, 0xCE, 0xB7, 0xCC, 0x80, + 0xF6, 0xCE, 0xB7, 0xCC, 0x81, 0xF6, 0xCE, 0xB9, + 0xCC, 0x80, 0xF6, 0xCE, 0xB9, 0xCC, 0x81, 0xF6, + 0xCE, 0xBF, 0xCC, 0x80, 0xF6, 0xCE, 0xBF, 0xCC, + 0x81, 0xF6, 0xCF, 0x85, 0xCC, 0x80, 0xF6, 0xCF, + 0x85, 0xCC, 0x81, 0xF6, 0xCF, 0x89, 0xCC, 0x80, + 0xF6, 0xCF, 0x89, 0xCC, 0x81, 0xF6, 0xCE, 0xB1, + 0xCC, 0x93, 0xCD, 0x85, 0xF6, 0xCE, 0xB1, 0xCC, + 0x94, 0xCD, 0x85, 0xF6, 0xCE, 0xB1, 0xCC, 0x93, + 0xCC, 0x80, 0xCD, 0x85, 0xF6, 0xCE, 0xB1, 0xCC, + 0x94, 0xCC, 0x80, 0xCD, 0x85, 0xF6, 0xCE, 0xB1, + 0xCC, 0x93, 0xCC, 0x81, 0xCD, 0x85, 0xF6, 0xCE, + 0xB1, 0xCC, 0x94, 0xCC, 0x81, 0xCD, 0x85, 0xF6, + 0xCE, 0xB1, 0xCC, 0x93, 0xCD, 0x82, 0xCD, 0x85, + 0xF6, 0xCE, 0xB1, 0xCC, 0x94, 0xCD, 0x82, 0xCD, + 0x85, 0xF6, 0xCE, 0x91, 0xCC, 0x93, 0xCD, 0x85, + 0xF6, 0xCE, 0x91, 0xCC, 0x94, 0xCD, 0x85, 0xF6, + 0xCE, 0x91, 0xCC, 0x93, 0xCC, 0x80, 0xCD, 0x85, + 0xF6, 0xCE, 0x91, 0xCC, 0x94, 0xCC, 0x80, 0xCD, + 0x85, 0xF6, 0xCE, 0x91, 0xCC, 0x93, 0xCC, 0x81, + 0xCD, 0x85, 0xF6, 0xCE, 0x91, 0xCC, 0x94, 0xCC, + 0x81, 0xCD, 0x85, 0xF6, 0xCE, 0x91, 0xCC, 0x93, + 0xCD, 0x82, 0xCD, 0x85, 0xF6, 0xCE, 0x91, 
0xCC, + 0x94, 0xCD, 0x82, 0xCD, 0x85, 0xF6, 0xCE, 0xB7, + 0xCC, 0x93, 0xCD, 0x85, 0xF6, 0xCE, 0xB7, 0xCC, + 0x94, 0xCD, 0x85, 0xF6, 0xCE, 0xB7, 0xCC, 0x93, + 0xCC, 0x80, 0xCD, 0x85, 0xF6, 0xCE, 0xB7, 0xCC, + 0x94, 0xCC, 0x80, 0xCD, 0x85, 0xF6, 0xCE, 0xB7, + 0xCC, 0x93, 0xCC, 0x81, 0xCD, 0x85, 0xF6, 0xCE, + 0xB7, 0xCC, 0x94, 0xCC, 0x81, 0xCD, 0x85, 0xF6, + 0xCE, 0xB7, 0xCC, 0x93, 0xCD, 0x82, 0xCD, 0x85, + 0xF6, 0xCE, 0xB7, 0xCC, 0x94, 0xCD, 0x82, 0xCD, + 0x85, 0xF6, 0xCE, 0x97, 0xCC, 0x93, 0xCD, 0x85, + 0xF6, 0xCE, 0x97, 0xCC, 0x94, 0xCD, 0x85, 0xF6, + 0xCE, 0x97, 0xCC, 0x93, 0xCC, 0x80, 0xCD, 0x85, + 0xF6, 0xCE, 0x97, 0xCC, 0x94, 0xCC, 0x80, 0xCD, + 0x85, 0xF6, 0xCE, 0x97, 0xCC, 0x93, 0xCC, 0x81, + 0xCD, 0x85, 0xF6, 0xCE, 0x97, 0xCC, 0x94, 0xCC, + 0x81, 0xCD, 0x85, 0xF6, 0xCE, 0x97, 0xCC, 0x93, + 0xCD, 0x82, 0xCD, 0x85, 0xF6, 0xCE, 0x97, 0xCC, + 0x94, 0xCD, 0x82, 0xCD, 0x85, 0xF6, 0xCF, 0x89, + 0xCC, 0x93, 0xCD, 0x85, 0xF6, 0xCF, 0x89, 0xCC, + 0x94, 0xCD, 0x85, 0xF6, 0xCF, 0x89, 0xCC, 0x93, + 0xCC, 0x80, 0xCD, 0x85, 0xF6, 0xCF, 0x89, 0xCC, + 0x94, 0xCC, 0x80, 0xCD, 0x85, 0xF6, 0xCF, 0x89, + 0xCC, 0x93, 0xCC, 0x81, 0xCD, 0x85, 0xF6, 0xCF, + 0x89, 0xCC, 0x94, 0xCC, 0x81, 0xCD, 0x85, 0xF6, + 0xCF, 0x89, 0xCC, 0x93, 0xCD, 0x82, 0xCD, 0x85, + 0xF6, 0xCF, 0x89, 0xCC, 0x94, 0xCD, 0x82, 0xCD, + 0x85, 0xF6, 0xCE, 0xA9, 0xCC, 0x93, 0xCD, 0x85, + 0xF6, 0xCE, 0xA9, 0xCC, 0x94, 0xCD, 0x85, 0xF6, + 0xCE, 0xA9, 0xCC, 0x93, 0xCC, 0x80, 0xCD, 0x85, + 0xF6, 0xCE, 0xA9, 0xCC, 0x94, 0xCC, 0x80, 0xCD, + 0x85, 0xF6, 0xCE, 0xA9, 0xCC, 0x93, 0xCC, 0x81, + 0xCD, 0x85, 0xF6, 0xCE, 0xA9, 0xCC, 0x94, 0xCC, + 0x81, 0xCD, 0x85, 0xF6, 0xCE, 0xA9, 0xCC, 0x93, + 0xCD, 0x82, 0xCD, 0x85, 0xF6, 0xCE, 0xA9, 0xCC, + 0x94, 0xCD, 0x82, 0xCD, 0x85, 0xF6, 0xCE, 0xB1, + 0xCC, 0x86, 0xF6, 0xCE, 0xB1, 0xCC, 0x84, 0xF6, + 0xCE, 0xB1, 0xCC, 0x80, 0xCD, 0x85, 0xF6, 0xCE, + 0xB1, 0xCD, 0x85, 0xF6, 0xCE, 0xB1, 0xCC, 0x81, + 0xCD, 0x85, 0xF6, 0xCE, 0xB1, 0xCD, 0x82, 0xF6, + 0xCE, 0xB1, 0xCD, 0x82, 0xCD, 0x85, 0xF6, 0xCE, + 0x91, 0xCC, 0x86, 0xF6, 0xCE, 0x91, 0xCC, 0x84, + 0xF6, 0xCE, 0x91, 0xCC, 0x80, 0xF6, 0xCE, 0x91, + 0xCC, 0x81, 0xF6, 0xCE, 0x91, 0xCD, 0x85, 0x20, + 0xCC, 0x93, 0xF6, 0xCE, 0xB9, 0x20, 0xCC, 0x93, + 0x20, 0xCD, 0x82, 0xF5, 0x05, 0xC2, 0xA8, 0xCD, + 0x82, 0x20, 0xCC, 0x88, 0xCD, 0x82, 0xF6, 0xCE, + 0xB7, 0xCC, 0x80, 0xCD, 0x85, 0xF6, 0xCE, 0xB7, + 0xCD, 0x85, 0xF6, 0xCE, 0xB7, 0xCC, 0x81, 0xCD, + 0x85, 0xF6, 0xCE, 0xB7, 0xCD, 0x82, 0xF6, 0xCE, + 0xB7, 0xCD, 0x82, 0xCD, 0x85, 0xF6, 0xCE, 0x95, + 0xCC, 0x80, 0xF6, 0xCE, 0x95, 0xCC, 0x81, 0xF6, + 0xCE, 0x97, 0xCC, 0x80, 0xF6, 0xCE, 0x97, 0xCC, + 0x81, 0xF6, 0xCE, 0x97, 0xCD, 0x85, 0xF5, 0x06, + 0xE1, 0xBE, 0xBF, 0xCC, 0x80, 0x20, 0xCC, 0x93, + 0xCC, 0x80, 0xF5, 0x06, 0xE1, 0xBE, 0xBF, 0xCC, + 0x81, 0x20, 0xCC, 0x93, 0xCC, 0x81, 0xF5, 0x06, + 0xE1, 0xBE, 0xBF, 0xCD, 0x82, 0x20, 0xCC, 0x93, + 0xCD, 0x82, 0xF6, 0xCE, 0xB9, 0xCC, 0x86, 0xF6, + 0xCE, 0xB9, 0xCC, 0x84, 0xF6, 0xCE, 0xB9, 0xCC, + 0x88, 0xCC, 0x80, 0xF6, 0xCE, 0xB9, 0xCC, 0x88, + 0xCC, 0x81, 0xF6, 0xCE, 0xB9, 0xCD, 0x82, 0xF6, + 0xCE, 0xB9, 0xCC, 0x88, 0xCD, 0x82, 0xF6, 0xCE, + 0x99, 0xCC, 0x86, 0xF6, 0xCE, 0x99, 0xCC, 0x84, + 0xF6, 0xCE, 0x99, 0xCC, 0x80, 0xF6, 0xCE, 0x99, + 0xCC, 0x81, 0xF5, 0x06, 0xE1, 0xBF, 0xBE, 0xCC, + 0x80, 0x20, 0xCC, 0x94, 0xCC, 0x80, 0xF5, 0x06, + 0xE1, 0xBF, 0xBE, 0xCC, 0x81, 0x20, 0xCC, 0x94, + 0xCC, 0x81, 0xF5, 0x06, 0xE1, 0xBF, 0xBE, 0xCD, + 0x82, 0x20, 0xCC, 0x94, 0xCD, 0x82, 0xF6, 0xCF, + 0x85, 0xCC, 0x86, 0xF6, 0xCF, 0x85, 0xCC, 0x84, + 0xF6, 0xCF, 0x85, 0xCC, 0x88, 0xCC, 0x80, 
0xF6, + 0xCF, 0x85, 0xCC, 0x88, 0xCC, 0x81, 0xF6, 0xCF, + 0x81, 0xCC, 0x93, 0xF6, 0xCF, 0x81, 0xCC, 0x94, + 0xF6, 0xCF, 0x85, 0xCD, 0x82, 0xF6, 0xCF, 0x85, + 0xCC, 0x88, 0xCD, 0x82, 0xF6, 0xCE, 0xA5, 0xCC, + 0x86, 0xF6, 0xCE, 0xA5, 0xCC, 0x84, 0xF6, 0xCE, + 0xA5, 0xCC, 0x80, 0xF6, 0xCE, 0xA5, 0xCC, 0x81, + 0xF6, 0xCE, 0xA1, 0xCC, 0x94, 0xF5, 0x05, 0xC2, + 0xA8, 0xCC, 0x80, 0x20, 0xCC, 0x88, 0xCC, 0x80, + 0xF5, 0x05, 0xC2, 0xA8, 0xCC, 0x81, 0x20, 0xCC, + 0x88, 0xCC, 0x81, 0xF6, 0x60, 0xF6, 0xCF, 0x89, + 0xCC, 0x80, 0xCD, 0x85, 0xF6, 0xCF, 0x89, 0xCD, + 0x85, 0xF6, 0xCF, 0x89, 0xCC, 0x81, 0xCD, 0x85, + 0xF6, 0xCF, 0x89, 0xCD, 0x82, 0xF6, 0xCF, 0x89, + 0xCD, 0x82, 0xCD, 0x85, 0xF6, 0xCE, 0x9F, 0xCC, + 0x80, 0xF6, 0xCE, 0x9F, 0xCC, 0x81, 0xF6, 0xCE, + 0xA9, 0xCC, 0x80, 0xF6, 0xCE, 0xA9, 0xCC, 0x81, + 0xF6, 0xCE, 0xA9, 0xCD, 0x85, 0xF5, 0x03, 0xC2, + 0xB4, 0x20, 0xCC, 0x81, 0x20, 0xCC, 0x94, 0xF5, + 0x04, 0xE2, 0x80, 0x82, 0x20, 0xF5, 0x04, 0xE2, + 0x80, 0x83, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0xE2, 0x80, 0x90, 0x20, + 0xCC, 0xB3, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, + 0x20, 0xE2, 0x80, 0xB2, 0xE2, 0x80, 0xB2, 0xE2, + 0x80, 0xB2, 0xE2, 0x80, 0xB2, 0xE2, 0x80, 0xB2, + 0xE2, 0x80, 0xB5, 0xE2, 0x80, 0xB5, 0xE2, 0x80, + 0xB5, 0xE2, 0x80, 0xB5, 0xE2, 0x80, 0xB5, 0x21, + 0x21, 0x20, 0xCC, 0x85, 0x3F, 0x3F, 0x3F, 0x21, + 0x21, 0x3F, 0xE2, 0x80, 0xB2, 0xE2, 0x80, 0xB2, + 0xE2, 0x80, 0xB2, 0xE2, 0x80, 0xB2, 0x20, 0x30, + 0x69, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x2B, + 0xE2, 0x88, 0x92, 0x3D, 0x28, 0x29, 0x6E, 0x30, + 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, + 0x39, 0x2B, 0xE2, 0x88, 0x92, 0x3D, 0x28, 0x29, + 0x52, 0x73, 0x61, 0x2F, 0x63, 0x61, 0x2F, 0x73, + 0x43, 0xC2, 0xB0, 0x43, 0x63, 0x2F, 0x6F, 0x63, + 0x2F, 0x75, 0xC6, 0x90, 0xC2, 0xB0, 0x46, 0x67, + 0x48, 0x48, 0x48, 0x68, 0xC4, 0xA7, 0x49, 0x49, + 0x4C, 0x6C, 0x4E, 0x4E, 0x6F, 0x50, 0x51, 0x52, + 0x52, 0x52, 0x53, 0x4D, 0x54, 0x45, 0x4C, 0x54, + 0x4D, 0x5A, 0xF6, 0xCE, 0xA9, 0x5A, 0xF6, 0x4B, + 0xF6, 0x41, 0xCC, 0x8A, 0x42, 0x43, 0x65, 0x45, + 0x46, 0x4D, 0x6F, 0xD7, 0x90, 0xD7, 0x91, 0xD7, + 0x92, 0xD7, 0x93, 0x69, 0xCE, 0xB3, 0xCE, 0x93, + 0xCE, 0xA0, 0xE2, 0x88, 0x91, 0x44, 0x64, 0x65, + 0x69, 0x6A, 0x31, 0xE2, 0x81, 0x84, 0x33, 0x32, + 0xE2, 0x81, 0x84, 0x33, 0x31, 0xE2, 0x81, 0x84, + 0x35, 0x32, 0xE2, 0x81, 0x84, 0x35, 0x33, 0xE2, + 0x81, 0x84, 0x35, 0x34, 0xE2, 0x81, 0x84, 0x35, + 0x31, 0xE2, 0x81, 0x84, 0x36, 0x35, 0xE2, 0x81, + 0x84, 0x36, 0x31, 0xE2, 0x81, 0x84, 0x38, 0x33, + 0xE2, 0x81, 0x84, 0x38, 0x35, 0xE2, 0x81, 0x84, + 0x38, 0x37, 0xE2, 0x81, 0x84, 0x38, 0x31, 0xE2, + 0x81, 0x84, 0x49, 0x49, 0x49, 0x49, 0x49, 0x49, + 0x49, 0x56, 0x56, 0x56, 0x49, 0x56, 0x49, 0x49, + 0x56, 0x49, 0x49, 0x49, 0x49, 0x58, 0x58, 0x58, + 0x49, 0x58, 0x49, 0x49, 0x4C, 0x43, 0x44, 0x4D, + 0x69, 0x69, 0x69, 0x69, 0x69, 0x69, 0x69, 0x76, + 0x76, 0x76, 0x69, 0x76, 0x69, 0x69, 0x76, 0x69, + 0x69, 0x69, 0x69, 0x78, 0x78, 0x78, 0x69, 0x78, + 0x69, 0x69, 0x6C, 0x63, 0x64, 0x6D, 0xF6, 0xE2, + 0x86, 0x90, 0xCC, 0xB8, 0xF6, 0xE2, 0x86, 0x92, + 0xCC, 0xB8, 0xF6, 0xE2, 0x86, 0x94, 0xCC, 0xB8, + 0xF6, 0xE2, 0x87, 0x90, 0xCC, 0xB8, 0xF6, 0xE2, + 0x87, 0x94, 0xCC, 0xB8, 0xF6, 0xE2, 0x87, 0x92, + 0xCC, 0xB8, 0xF6, 0xE2, 0x88, 0x83, 0xCC, 0xB8, + 0xF6, 0xE2, 0x88, 0x88, 0xCC, 0xB8, 0xF6, 0xE2, + 0x88, 0x8B, 0xCC, 0xB8, 0xF6, 0xE2, 0x88, 0xA3, + 0xCC, 0xB8, 0xF6, 0xE2, 0x88, 0xA5, 0xCC, 0xB8, + 0xE2, 0x88, 0xAB, 0xE2, 0x88, 0xAB, 0xE2, 0x88, + 0xAB, 0xE2, 0x88, 0xAB, 0xE2, 0x88, 0xAB, 0xE2, + 0x88, 0xAE, 0xE2, 0x88, 0xAE, 0xE2, 0x88, 
0xAE, + 0xE2, 0x88, 0xAE, 0xE2, 0x88, 0xAE, 0xF6, 0xE2, + 0x88, 0xBC, 0xCC, 0xB8, 0xF6, 0xE2, 0x89, 0x83, + 0xCC, 0xB8, 0xF6, 0xE2, 0x89, 0x85, 0xCC, 0xB8, + 0xF6, 0xE2, 0x89, 0x88, 0xCC, 0xB8, 0xF6, 0x3D, + 0xCC, 0xB8, 0xF6, 0xE2, 0x89, 0xA1, 0xCC, 0xB8, + 0xF6, 0xE2, 0x89, 0x8D, 0xCC, 0xB8, 0xF6, 0x3C, + 0xCC, 0xB8, 0xF6, 0x3E, 0xCC, 0xB8, 0xF6, 0xE2, + 0x89, 0xA4, 0xCC, 0xB8, 0xF6, 0xE2, 0x89, 0xA5, + 0xCC, 0xB8, 0xF6, 0xE2, 0x89, 0xB2, 0xCC, 0xB8, + 0xF6, 0xE2, 0x89, 0xB3, 0xCC, 0xB8, 0xF6, 0xE2, + 0x89, 0xB6, 0xCC, 0xB8, 0xF6, 0xE2, 0x89, 0xB7, + 0xCC, 0xB8, 0xF6, 0xE2, 0x89, 0xBA, 0xCC, 0xB8, + 0xF6, 0xE2, 0x89, 0xBB, 0xCC, 0xB8, 0xF6, 0xE2, + 0x8A, 0x82, 0xCC, 0xB8, 0xF6, 0xE2, 0x8A, 0x83, + 0xCC, 0xB8, 0xF6, 0xE2, 0x8A, 0x86, 0xCC, 0xB8, + 0xF6, 0xE2, 0x8A, 0x87, 0xCC, 0xB8, 0xF6, 0xE2, + 0x8A, 0xA2, 0xCC, 0xB8, 0xF6, 0xE2, 0x8A, 0xA8, + 0xCC, 0xB8, 0xF6, 0xE2, 0x8A, 0xA9, 0xCC, 0xB8, + 0xF6, 0xE2, 0x8A, 0xAB, 0xCC, 0xB8, 0xF6, 0xE2, + 0x89, 0xBC, 0xCC, 0xB8, 0xF6, 0xE2, 0x89, 0xBD, + 0xCC, 0xB8, 0xF6, 0xE2, 0x8A, 0x91, 0xCC, 0xB8, + 0xF6, 0xE2, 0x8A, 0x92, 0xCC, 0xB8, 0xF6, 0xE2, + 0x8A, 0xB2, 0xCC, 0xB8, 0xF6, 0xE2, 0x8A, 0xB3, + 0xCC, 0xB8, 0xF6, 0xE2, 0x8A, 0xB4, 0xCC, 0xB8, + 0xF6, 0xE2, 0x8A, 0xB5, 0xCC, 0xB8, 0xF6, 0xE3, + 0x80, 0x88, 0xF6, 0xE3, 0x80, 0x89, 0x31, 0x32, + 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x31, + 0x30, 0x31, 0x31, 0x31, 0x32, 0x31, 0x33, 0x31, + 0x34, 0x31, 0x35, 0x31, 0x36, 0x31, 0x37, 0x31, + 0x38, 0x31, 0x39, 0x32, 0x30, 0x28, 0x31, 0x29, + 0x28, 0x32, 0x29, 0x28, 0x33, 0x29, 0x28, 0x34, + 0x29, 0x28, 0x35, 0x29, 0x28, 0x36, 0x29, 0x28, + 0x37, 0x29, 0x28, 0x38, 0x29, 0x28, 0x39, 0x29, + 0x28, 0x31, 0x30, 0x29, 0x28, 0x31, 0x31, 0x29, + 0x28, 0x31, 0x32, 0x29, 0x28, 0x31, 0x33, 0x29, + 0x28, 0x31, 0x34, 0x29, 0x28, 0x31, 0x35, 0x29, + 0x28, 0x31, 0x36, 0x29, 0x28, 0x31, 0x37, 0x29, + 0x28, 0x31, 0x38, 0x29, 0x28, 0x31, 0x39, 0x29, + 0x28, 0x32, 0x30, 0x29, 0x31, 0x2E, 0x32, 0x2E, + 0x33, 0x2E, 0x34, 0x2E, 0x35, 0x2E, 0x36, 0x2E, + 0x37, 0x2E, 0x38, 0x2E, 0x39, 0x2E, 0x31, 0x30, + 0x2E, 0x31, 0x31, 0x2E, 0x31, 0x32, 0x2E, 0x31, + 0x33, 0x2E, 0x31, 0x34, 0x2E, 0x31, 0x35, 0x2E, + 0x31, 0x36, 0x2E, 0x31, 0x37, 0x2E, 0x31, 0x38, + 0x2E, 0x31, 0x39, 0x2E, 0x32, 0x30, 0x2E, 0x28, + 0x61, 0x29, 0x28, 0x62, 0x29, 0x28, 0x63, 0x29, + 0x28, 0x64, 0x29, 0x28, 0x65, 0x29, 0x28, 0x66, + 0x29, 0x28, 0x67, 0x29, 0x28, 0x68, 0x29, 0x28, + 0x69, 0x29, 0x28, 0x6A, 0x29, 0x28, 0x6B, 0x29, + 0x28, 0x6C, 0x29, 0x28, 0x6D, 0x29, 0x28, 0x6E, + 0x29, 0x28, 0x6F, 0x29, 0x28, 0x70, 0x29, 0x28, + 0x71, 0x29, 0x28, 0x72, 0x29, 0x28, 0x73, 0x29, + 0x28, 0x74, 0x29, 0x28, 0x75, 0x29, 0x28, 0x76, + 0x29, 0x28, 0x77, 0x29, 0x28, 0x78, 0x29, 0x28, + 0x79, 0x29, 0x28, 0x7A, 0x29, 0x41, 0x42, 0x43, + 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B, + 0x4C, 0x4D, 0x4E, 0x4F, 0x50, 0x51, 0x52, 0x53, + 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0x61, + 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, + 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, + 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, + 0x7A, 0x30, 0xE2, 0x88, 0xAB, 0xE2, 0x88, 0xAB, + 0xE2, 0x88, 0xAB, 0xE2, 0x88, 0xAB, 0x3A, 0x3A, + 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0xF6, 0xE2, + 0xAB, 0x9D, 0xCC, 0xB8, 0xE6, 0xAF, 0x8D, 0xE9, + 0xBE, 0x9F, 0xE4, 0xB8, 0x80, 0xE4, 0xB8, 0xA8, + 0xE4, 0xB8, 0xB6, 0xE4, 0xB8, 0xBF, 0xE4, 0xB9, + 0x99, 0xE4, 0xBA, 0x85, 0xE4, 0xBA, 0x8C, 0xE4, + 0xBA, 0xA0, 0xE4, 0xBA, 0xBA, 0xE5, 0x84, 0xBF, + 0xE5, 0x85, 0xA5, 0xE5, 0x85, 0xAB, 0xE5, 0x86, + 0x82, 0xE5, 0x86, 0x96, 0xE5, 0x86, 0xAB, 
0xE5, + 0x87, 0xA0, 0xE5, 0x87, 0xB5, 0xE5, 0x88, 0x80, + 0xE5, 0x8A, 0x9B, 0xE5, 0x8B, 0xB9, 0xE5, 0x8C, + 0x95, 0xE5, 0x8C, 0x9A, 0xE5, 0x8C, 0xB8, 0xE5, + 0x8D, 0x81, 0xE5, 0x8D, 0x9C, 0xE5, 0x8D, 0xA9, + 0xE5, 0x8E, 0x82, 0xE5, 0x8E, 0xB6, 0xE5, 0x8F, + 0x88, 0xE5, 0x8F, 0xA3, 0xE5, 0x9B, 0x97, 0xE5, + 0x9C, 0x9F, 0xE5, 0xA3, 0xAB, 0xE5, 0xA4, 0x82, + 0xE5, 0xA4, 0x8A, 0xE5, 0xA4, 0x95, 0xE5, 0xA4, + 0xA7, 0xE5, 0xA5, 0xB3, 0xE5, 0xAD, 0x90, 0xE5, + 0xAE, 0x80, 0xE5, 0xAF, 0xB8, 0xE5, 0xB0, 0x8F, + 0xE5, 0xB0, 0xA2, 0xE5, 0xB0, 0xB8, 0xE5, 0xB1, + 0xAE, 0xE5, 0xB1, 0xB1, 0xE5, 0xB7, 0x9B, 0xE5, + 0xB7, 0xA5, 0xE5, 0xB7, 0xB1, 0xE5, 0xB7, 0xBE, + 0xE5, 0xB9, 0xB2, 0xE5, 0xB9, 0xBA, 0xE5, 0xB9, + 0xBF, 0xE5, 0xBB, 0xB4, 0xE5, 0xBB, 0xBE, 0xE5, + 0xBC, 0x8B, 0xE5, 0xBC, 0x93, 0xE5, 0xBD, 0x90, + 0xE5, 0xBD, 0xA1, 0xE5, 0xBD, 0xB3, 0xE5, 0xBF, + 0x83, 0xE6, 0x88, 0x88, 0xE6, 0x88, 0xB6, 0xE6, + 0x89, 0x8B, 0xE6, 0x94, 0xAF, 0xE6, 0x94, 0xB4, + 0xE6, 0x96, 0x87, 0xE6, 0x96, 0x97, 0xE6, 0x96, + 0xA4, 0xE6, 0x96, 0xB9, 0xE6, 0x97, 0xA0, 0xE6, + 0x97, 0xA5, 0xE6, 0x9B, 0xB0, 0xE6, 0x9C, 0x88, + 0xE6, 0x9C, 0xA8, 0xE6, 0xAC, 0xA0, 0xE6, 0xAD, + 0xA2, 0xE6, 0xAD, 0xB9, 0xE6, 0xAE, 0xB3, 0xE6, + 0xAF, 0x8B, 0xE6, 0xAF, 0x94, 0xE6, 0xAF, 0x9B, + 0xE6, 0xB0, 0x8F, 0xE6, 0xB0, 0x94, 0xE6, 0xB0, + 0xB4, 0xE7, 0x81, 0xAB, 0xE7, 0x88, 0xAA, 0xE7, + 0x88, 0xB6, 0xE7, 0x88, 0xBB, 0xE7, 0x88, 0xBF, + 0xE7, 0x89, 0x87, 0xE7, 0x89, 0x99, 0xE7, 0x89, + 0x9B, 0xE7, 0x8A, 0xAC, 0xE7, 0x8E, 0x84, 0xE7, + 0x8E, 0x89, 0xE7, 0x93, 0x9C, 0xE7, 0x93, 0xA6, + 0xE7, 0x94, 0x98, 0xE7, 0x94, 0x9F, 0xE7, 0x94, + 0xA8, 0xE7, 0x94, 0xB0, 0xE7, 0x96, 0x8B, 0xE7, + 0x96, 0x92, 0xE7, 0x99, 0xB6, 0xE7, 0x99, 0xBD, + 0xE7, 0x9A, 0xAE, 0xE7, 0x9A, 0xBF, 0xE7, 0x9B, + 0xAE, 0xE7, 0x9F, 0x9B, 0xE7, 0x9F, 0xA2, 0xE7, + 0x9F, 0xB3, 0xE7, 0xA4, 0xBA, 0xE7, 0xA6, 0xB8, + 0xE7, 0xA6, 0xBE, 0xE7, 0xA9, 0xB4, 0xE7, 0xAB, + 0x8B, 0xE7, 0xAB, 0xB9, 0xE7, 0xB1, 0xB3, 0xE7, + 0xB3, 0xB8, 0xE7, 0xBC, 0xB6, 0xE7, 0xBD, 0x91, + 0xE7, 0xBE, 0x8A, 0xE7, 0xBE, 0xBD, 0xE8, 0x80, + 0x81, 0xE8, 0x80, 0x8C, 0xE8, 0x80, 0x92, 0xE8, + 0x80, 0xB3, 0xE8, 0x81, 0xBF, 0xE8, 0x82, 0x89, + 0xE8, 0x87, 0xA3, 0xE8, 0x87, 0xAA, 0xE8, 0x87, + 0xB3, 0xE8, 0x87, 0xBC, 0xE8, 0x88, 0x8C, 0xE8, + 0x88, 0x9B, 0xE8, 0x88, 0x9F, 0xE8, 0x89, 0xAE, + 0xE8, 0x89, 0xB2, 0xE8, 0x89, 0xB8, 0xE8, 0x99, + 0x8D, 0xE8, 0x99, 0xAB, 0xE8, 0xA1, 0x80, 0xE8, + 0xA1, 0x8C, 0xE8, 0xA1, 0xA3, 0xE8, 0xA5, 0xBE, + 0xE8, 0xA6, 0x8B, 0xE8, 0xA7, 0x92, 0xE8, 0xA8, + 0x80, 0xE8, 0xB0, 0xB7, 0xE8, 0xB1, 0x86, 0xE8, + 0xB1, 0x95, 0xE8, 0xB1, 0xB8, 0xE8, 0xB2, 0x9D, + 0xE8, 0xB5, 0xA4, 0xE8, 0xB5, 0xB0, 0xE8, 0xB6, + 0xB3, 0xE8, 0xBA, 0xAB, 0xE8, 0xBB, 0x8A, 0xE8, + 0xBE, 0x9B, 0xE8, 0xBE, 0xB0, 0xE8, 0xBE, 0xB5, + 0xE9, 0x82, 0x91, 0xE9, 0x85, 0x89, 0xE9, 0x87, + 0x86, 0xE9, 0x87, 0x8C, 0xE9, 0x87, 0x91, 0xE9, + 0x95, 0xB7, 0xE9, 0x96, 0x80, 0xE9, 0x98, 0x9C, + 0xE9, 0x9A, 0xB6, 0xE9, 0x9A, 0xB9, 0xE9, 0x9B, + 0xA8, 0xE9, 0x9D, 0x91, 0xE9, 0x9D, 0x9E, 0xE9, + 0x9D, 0xA2, 0xE9, 0x9D, 0xA9, 0xE9, 0x9F, 0x8B, + 0xE9, 0x9F, 0xAD, 0xE9, 0x9F, 0xB3, 0xE9, 0xA0, + 0x81, 0xE9, 0xA2, 0xA8, 0xE9, 0xA3, 0x9B, 0xE9, + 0xA3, 0x9F, 0xE9, 0xA6, 0x96, 0xE9, 0xA6, 0x99, + 0xE9, 0xA6, 0xAC, 0xE9, 0xAA, 0xA8, 0xE9, 0xAB, + 0x98, 0xE9, 0xAB, 0x9F, 0xE9, 0xAC, 0xA5, 0xE9, + 0xAC, 0xAF, 0xE9, 0xAC, 0xB2, 0xE9, 0xAC, 0xBC, + 0xE9, 0xAD, 0x9A, 0xE9, 0xB3, 0xA5, 0xE9, 0xB9, + 0xB5, 0xE9, 0xB9, 0xBF, 0xE9, 0xBA, 0xA5, 0xE9, + 0xBA, 0xBB, 0xE9, 0xBB, 0x83, 0xE9, 0xBB, 0x8D, + 0xE9, 0xBB, 0x91, 0xE9, 0xBB, 0xB9, 0xE9, 
0xBB, + 0xBD, 0xE9, 0xBC, 0x8E, 0xE9, 0xBC, 0x93, 0xE9, + 0xBC, 0xA0, 0xE9, 0xBC, 0xBB, 0xE9, 0xBD, 0x8A, + 0xE9, 0xBD, 0x92, 0xE9, 0xBE, 0x8D, 0xE9, 0xBE, + 0x9C, 0xE9, 0xBE, 0xA0, 0x20, 0xE3, 0x80, 0x92, + 0xE5, 0x8D, 0x81, 0xE5, 0x8D, 0x84, 0xE5, 0x8D, + 0x85, 0xF6, 0xE3, 0x81, 0x8B, 0xE3, 0x82, 0x99, + 0xF6, 0xE3, 0x81, 0x8D, 0xE3, 0x82, 0x99, 0xF6, + 0xE3, 0x81, 0x8F, 0xE3, 0x82, 0x99, 0xF6, 0xE3, + 0x81, 0x91, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x81, + 0x93, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x81, 0x95, + 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x81, 0x97, 0xE3, + 0x82, 0x99, 0xF6, 0xE3, 0x81, 0x99, 0xE3, 0x82, + 0x99, 0xF6, 0xE3, 0x81, 0x9B, 0xE3, 0x82, 0x99, + 0xF6, 0xE3, 0x81, 0x9D, 0xE3, 0x82, 0x99, 0xF6, + 0xE3, 0x81, 0x9F, 0xE3, 0x82, 0x99, 0xF6, 0xE3, + 0x81, 0xA1, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x81, + 0xA4, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x81, 0xA6, + 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x81, 0xA8, 0xE3, + 0x82, 0x99, 0xF6, 0xE3, 0x81, 0xAF, 0xE3, 0x82, + 0x99, 0xF6, 0xE3, 0x81, 0xAF, 0xE3, 0x82, 0x9A, + 0xF6, 0xE3, 0x81, 0xB2, 0xE3, 0x82, 0x99, 0xF6, + 0xE3, 0x81, 0xB2, 0xE3, 0x82, 0x9A, 0xF6, 0xE3, + 0x81, 0xB5, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x81, + 0xB5, 0xE3, 0x82, 0x9A, 0xF6, 0xE3, 0x81, 0xB8, + 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x81, 0xB8, 0xE3, + 0x82, 0x9A, 0xF6, 0xE3, 0x81, 0xBB, 0xE3, 0x82, + 0x99, 0xF6, 0xE3, 0x81, 0xBB, 0xE3, 0x82, 0x9A, + 0xF6, 0xE3, 0x81, 0x86, 0xE3, 0x82, 0x99, 0x20, + 0xE3, 0x82, 0x99, 0x20, 0xE3, 0x82, 0x9A, 0xF6, + 0xE3, 0x82, 0x9D, 0xE3, 0x82, 0x99, 0xE3, 0x82, + 0x88, 0xE3, 0x82, 0x8A, 0xF6, 0xE3, 0x82, 0xAB, + 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x82, 0xAD, 0xE3, + 0x82, 0x99, 0xF6, 0xE3, 0x82, 0xAF, 0xE3, 0x82, + 0x99, 0xF6, 0xE3, 0x82, 0xB1, 0xE3, 0x82, 0x99, + 0xF6, 0xE3, 0x82, 0xB3, 0xE3, 0x82, 0x99, 0xF6, + 0xE3, 0x82, 0xB5, 0xE3, 0x82, 0x99, 0xF6, 0xE3, + 0x82, 0xB7, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x82, + 0xB9, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x82, 0xBB, + 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x82, 0xBD, 0xE3, + 0x82, 0x99, 0xF6, 0xE3, 0x82, 0xBF, 0xE3, 0x82, + 0x99, 0xF6, 0xE3, 0x83, 0x81, 0xE3, 0x82, 0x99, + 0xF6, 0xE3, 0x83, 0x84, 0xE3, 0x82, 0x99, 0xF6, + 0xE3, 0x83, 0x86, 0xE3, 0x82, 0x99, 0xF6, 0xE3, + 0x83, 0x88, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x83, + 0x8F, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x83, 0x8F, + 0xE3, 0x82, 0x9A, 0xF6, 0xE3, 0x83, 0x92, 0xE3, + 0x82, 0x99, 0xF6, 0xE3, 0x83, 0x92, 0xE3, 0x82, + 0x9A, 0xF6, 0xE3, 0x83, 0x95, 0xE3, 0x82, 0x99, + 0xF6, 0xE3, 0x83, 0x95, 0xE3, 0x82, 0x9A, 0xF6, + 0xE3, 0x83, 0x98, 0xE3, 0x82, 0x99, 0xF6, 0xE3, + 0x83, 0x98, 0xE3, 0x82, 0x9A, 0xF6, 0xE3, 0x83, + 0x9B, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x83, 0x9B, + 0xE3, 0x82, 0x9A, 0xF6, 0xE3, 0x82, 0xA6, 0xE3, + 0x82, 0x99, 0xF6, 0xE3, 0x83, 0xAF, 0xE3, 0x82, + 0x99, 0xF6, 0xE3, 0x83, 0xB0, 0xE3, 0x82, 0x99, + 0xF6, 0xE3, 0x83, 0xB1, 0xE3, 0x82, 0x99, 0xF6, + 0xE3, 0x83, 0xB2, 0xE3, 0x82, 0x99, 0xF6, 0xE3, + 0x83, 0xBD, 0xE3, 0x82, 0x99, 0xE3, 0x82, 0xB3, + 0xE3, 0x83, 0x88, 0xE1, 0x84, 0x80, 0xE1, 0x84, + 0x81, 0xE1, 0x86, 0xAA, 0xE1, 0x84, 0x82, 0xE1, + 0x86, 0xAC, 0xE1, 0x86, 0xAD, 0xE1, 0x84, 0x83, + 0xE1, 0x84, 0x84, 0xE1, 0x84, 0x85, 0xE1, 0x86, + 0xB0, 0xE1, 0x86, 0xB1, 0xE1, 0x86, 0xB2, 0xE1, + 0x86, 0xB3, 0xE1, 0x86, 0xB4, 0xE1, 0x86, 0xB5, + 0xE1, 0x84, 0x9A, 0xE1, 0x84, 0x86, 0xE1, 0x84, + 0x87, 0xE1, 0x84, 0x88, 0xE1, 0x84, 0xA1, 0xE1, + 0x84, 0x89, 0xE1, 0x84, 0x8A, 0xE1, 0x84, 0x8B, + 0xE1, 0x84, 0x8C, 0xE1, 0x84, 0x8D, 0xE1, 0x84, + 0x8E, 0xE1, 0x84, 0x8F, 0xE1, 0x84, 0x90, 0xE1, + 0x84, 0x91, 0xE1, 0x84, 0x92, 0xE1, 0x85, 0xA1, + 0xE1, 0x85, 0xA2, 0xE1, 0x85, 0xA3, 0xE1, 
0x85, + 0xA4, 0xE1, 0x85, 0xA5, 0xE1, 0x85, 0xA6, 0xE1, + 0x85, 0xA7, 0xE1, 0x85, 0xA8, 0xE1, 0x85, 0xA9, + 0xE1, 0x85, 0xAA, 0xE1, 0x85, 0xAB, 0xE1, 0x85, + 0xAC, 0xE1, 0x85, 0xAD, 0xE1, 0x85, 0xAE, 0xE1, + 0x85, 0xAF, 0xE1, 0x85, 0xB0, 0xE1, 0x85, 0xB1, + 0xE1, 0x85, 0xB2, 0xE1, 0x85, 0xB3, 0xE1, 0x85, + 0xB4, 0xE1, 0x85, 0xB5, 0xE1, 0x85, 0xA0, 0xE1, + 0x84, 0x94, 0xE1, 0x84, 0x95, 0xE1, 0x87, 0x87, + 0xE1, 0x87, 0x88, 0xE1, 0x87, 0x8C, 0xE1, 0x87, + 0x8E, 0xE1, 0x87, 0x93, 0xE1, 0x87, 0x97, 0xE1, + 0x87, 0x99, 0xE1, 0x84, 0x9C, 0xE1, 0x87, 0x9D, + 0xE1, 0x87, 0x9F, 0xE1, 0x84, 0x9D, 0xE1, 0x84, + 0x9E, 0xE1, 0x84, 0xA0, 0xE1, 0x84, 0xA2, 0xE1, + 0x84, 0xA3, 0xE1, 0x84, 0xA7, 0xE1, 0x84, 0xA9, + 0xE1, 0x84, 0xAB, 0xE1, 0x84, 0xAC, 0xE1, 0x84, + 0xAD, 0xE1, 0x84, 0xAE, 0xE1, 0x84, 0xAF, 0xE1, + 0x84, 0xB2, 0xE1, 0x84, 0xB6, 0xE1, 0x85, 0x80, + 0xE1, 0x85, 0x87, 0xE1, 0x85, 0x8C, 0xE1, 0x87, + 0xB1, 0xE1, 0x87, 0xB2, 0xE1, 0x85, 0x97, 0xE1, + 0x85, 0x98, 0xE1, 0x85, 0x99, 0xE1, 0x86, 0x84, + 0xE1, 0x86, 0x85, 0xE1, 0x86, 0x88, 0xE1, 0x86, + 0x91, 0xE1, 0x86, 0x92, 0xE1, 0x86, 0x94, 0xE1, + 0x86, 0x9E, 0xE1, 0x86, 0xA1, 0xE4, 0xB8, 0x80, + 0xE4, 0xBA, 0x8C, 0xE4, 0xB8, 0x89, 0xE5, 0x9B, + 0x9B, 0xE4, 0xB8, 0x8A, 0xE4, 0xB8, 0xAD, 0xE4, + 0xB8, 0x8B, 0xE7, 0x94, 0xB2, 0xE4, 0xB9, 0x99, + 0xE4, 0xB8, 0x99, 0xE4, 0xB8, 0x81, 0xE5, 0xA4, + 0xA9, 0xE5, 0x9C, 0xB0, 0xE4, 0xBA, 0xBA, 0x28, + 0xE1, 0x84, 0x80, 0x29, 0x28, 0xE1, 0x84, 0x82, + 0x29, 0x28, 0xE1, 0x84, 0x83, 0x29, 0x28, 0xE1, + 0x84, 0x85, 0x29, 0x28, 0xE1, 0x84, 0x86, 0x29, + 0x28, 0xE1, 0x84, 0x87, 0x29, 0x28, 0xE1, 0x84, + 0x89, 0x29, 0x28, 0xE1, 0x84, 0x8B, 0x29, 0x28, + 0xE1, 0x84, 0x8C, 0x29, 0x28, 0xE1, 0x84, 0x8E, + 0x29, 0x28, 0xE1, 0x84, 0x8F, 0x29, 0x28, 0xE1, + 0x84, 0x90, 0x29, 0x28, 0xE1, 0x84, 0x91, 0x29, + 0x28, 0xE1, 0x84, 0x92, 0x29, 0x28, 0xE1, 0x84, + 0x80, 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, + 0x82, 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, + 0x83, 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, + 0x85, 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, + 0x86, 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, + 0x87, 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, + 0x89, 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, + 0x8B, 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, + 0x8C, 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, + 0x8E, 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, + 0x8F, 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, + 0x90, 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, + 0x91, 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, + 0x92, 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, + 0x8C, 0xE1, 0x85, 0xAE, 0x29, 0x28, 0xE4, 0xB8, + 0x80, 0x29, 0x28, 0xE4, 0xBA, 0x8C, 0x29, 0x28, + 0xE4, 0xB8, 0x89, 0x29, 0x28, 0xE5, 0x9B, 0x9B, + 0x29, 0x28, 0xE4, 0xBA, 0x94, 0x29, 0x28, 0xE5, + 0x85, 0xAD, 0x29, 0x28, 0xE4, 0xB8, 0x83, 0x29, + 0x28, 0xE5, 0x85, 0xAB, 0x29, 0x28, 0xE4, 0xB9, + 0x9D, 0x29, 0x28, 0xE5, 0x8D, 0x81, 0x29, 0x28, + 0xE6, 0x9C, 0x88, 0x29, 0x28, 0xE7, 0x81, 0xAB, + 0x29, 0x28, 0xE6, 0xB0, 0xB4, 0x29, 0x28, 0xE6, + 0x9C, 0xA8, 0x29, 0x28, 0xE9, 0x87, 0x91, 0x29, + 0x28, 0xE5, 0x9C, 0x9F, 0x29, 0x28, 0xE6, 0x97, + 0xA5, 0x29, 0x28, 0xE6, 0xA0, 0xAA, 0x29, 0x28, + 0xE6, 0x9C, 0x89, 0x29, 0x28, 0xE7, 0xA4, 0xBE, + 0x29, 0x28, 0xE5, 0x90, 0x8D, 0x29, 0x28, 0xE7, + 0x89, 0xB9, 0x29, 0x28, 0xE8, 0xB2, 0xA1, 0x29, + 0x28, 0xE7, 0xA5, 0x9D, 0x29, 0x28, 0xE5, 0x8A, + 0xB4, 0x29, 0x28, 0xE4, 0xBB, 0xA3, 0x29, 0x28, + 0xE5, 0x91, 0xBC, 0x29, 0x28, 0xE5, 0xAD, 0xA6, + 0x29, 0x28, 0xE7, 0x9B, 0xA3, 0x29, 0x28, 0xE4, + 0xBC, 0x81, 0x29, 0x28, 0xE8, 0xB3, 0x87, 
0x29, + 0x28, 0xE5, 0x8D, 0x94, 0x29, 0x28, 0xE7, 0xA5, + 0xAD, 0x29, 0x28, 0xE4, 0xBC, 0x91, 0x29, 0x28, + 0xE8, 0x87, 0xAA, 0x29, 0x28, 0xE8, 0x87, 0xB3, + 0x29, 0x32, 0x31, 0x32, 0x32, 0x32, 0x33, 0x32, + 0x34, 0x32, 0x35, 0x32, 0x36, 0x32, 0x37, 0x32, + 0x38, 0x32, 0x39, 0x33, 0x30, 0x33, 0x31, 0x33, + 0x32, 0x33, 0x33, 0x33, 0x34, 0x33, 0x35, 0xE1, + 0x84, 0x80, 0xE1, 0x84, 0x82, 0xE1, 0x84, 0x83, + 0xE1, 0x84, 0x85, 0xE1, 0x84, 0x86, 0xE1, 0x84, + 0x87, 0xE1, 0x84, 0x89, 0xE1, 0x84, 0x8B, 0xE1, + 0x84, 0x8C, 0xE1, 0x84, 0x8E, 0xE1, 0x84, 0x8F, + 0xE1, 0x84, 0x90, 0xE1, 0x84, 0x91, 0xE1, 0x84, + 0x92, 0xE1, 0x84, 0x80, 0xE1, 0x85, 0xA1, 0xE1, + 0x84, 0x82, 0xE1, 0x85, 0xA1, 0xE1, 0x84, 0x83, + 0xE1, 0x85, 0xA1, 0xE1, 0x84, 0x85, 0xE1, 0x85, + 0xA1, 0xE1, 0x84, 0x86, 0xE1, 0x85, 0xA1, 0xE1, + 0x84, 0x87, 0xE1, 0x85, 0xA1, 0xE1, 0x84, 0x89, + 0xE1, 0x85, 0xA1, 0xE1, 0x84, 0x8B, 0xE1, 0x85, + 0xA1, 0xE1, 0x84, 0x8C, 0xE1, 0x85, 0xA1, 0xE1, + 0x84, 0x8E, 0xE1, 0x85, 0xA1, 0xE1, 0x84, 0x8F, + 0xE1, 0x85, 0xA1, 0xE1, 0x84, 0x90, 0xE1, 0x85, + 0xA1, 0xE1, 0x84, 0x91, 0xE1, 0x85, 0xA1, 0xE1, + 0x84, 0x92, 0xE1, 0x85, 0xA1, 0xE4, 0xB8, 0x80, + 0xE4, 0xBA, 0x8C, 0xE4, 0xB8, 0x89, 0xE5, 0x9B, + 0x9B, 0xE4, 0xBA, 0x94, 0xE5, 0x85, 0xAD, 0xE4, + 0xB8, 0x83, 0xE5, 0x85, 0xAB, 0xE4, 0xB9, 0x9D, + 0xE5, 0x8D, 0x81, 0xE6, 0x9C, 0x88, 0xE7, 0x81, + 0xAB, 0xE6, 0xB0, 0xB4, 0xE6, 0x9C, 0xA8, 0xE9, + 0x87, 0x91, 0xE5, 0x9C, 0x9F, 0xE6, 0x97, 0xA5, + 0xE6, 0xA0, 0xAA, 0xE6, 0x9C, 0x89, 0xE7, 0xA4, + 0xBE, 0xE5, 0x90, 0x8D, 0xE7, 0x89, 0xB9, 0xE8, + 0xB2, 0xA1, 0xE7, 0xA5, 0x9D, 0xE5, 0x8A, 0xB4, + 0xE7, 0xA7, 0x98, 0xE7, 0x94, 0xB7, 0xE5, 0xA5, + 0xB3, 0xE9, 0x81, 0xA9, 0xE5, 0x84, 0xAA, 0xE5, + 0x8D, 0xB0, 0xE6, 0xB3, 0xA8, 0xE9, 0xA0, 0x85, + 0xE4, 0xBC, 0x91, 0xE5, 0x86, 0x99, 0xE6, 0xAD, + 0xA3, 0xE4, 0xB8, 0x8A, 0xE4, 0xB8, 0xAD, 0xE4, + 0xB8, 0x8B, 0xE5, 0xB7, 0xA6, 0xE5, 0x8F, 0xB3, + 0xE5, 0x8C, 0xBB, 0xE5, 0xAE, 0x97, 0xE5, 0xAD, + 0xA6, 0xE7, 0x9B, 0xA3, 0xE4, 0xBC, 0x81, 0xE8, + 0xB3, 0x87, 0xE5, 0x8D, 0x94, 0xE5, 0xA4, 0x9C, + 0x33, 0x36, 0x33, 0x37, 0x33, 0x38, 0x33, 0x39, + 0x34, 0x30, 0x34, 0x31, 0x34, 0x32, 0x34, 0x33, + 0x34, 0x34, 0x34, 0x35, 0x34, 0x36, 0x34, 0x37, + 0x34, 0x38, 0x34, 0x39, 0x35, 0x30, 0x31, 0xE6, + 0x9C, 0x88, 0x32, 0xE6, 0x9C, 0x88, 0x33, 0xE6, + 0x9C, 0x88, 0x34, 0xE6, 0x9C, 0x88, 0x35, 0xE6, + 0x9C, 0x88, 0x36, 0xE6, 0x9C, 0x88, 0x37, 0xE6, + 0x9C, 0x88, 0x38, 0xE6, 0x9C, 0x88, 0x39, 0xE6, + 0x9C, 0x88, 0x31, 0x30, 0xE6, 0x9C, 0x88, 0x31, + 0x31, 0xE6, 0x9C, 0x88, 0x31, 0x32, 0xE6, 0x9C, + 0x88, 0xE3, 0x82, 0xA2, 0xE3, 0x82, 0xA4, 0xE3, + 0x82, 0xA6, 0xE3, 0x82, 0xA8, 0xE3, 0x82, 0xAA, + 0xE3, 0x82, 0xAB, 0xE3, 0x82, 0xAD, 0xE3, 0x82, + 0xAF, 0xE3, 0x82, 0xB1, 0xE3, 0x82, 0xB3, 0xE3, + 0x82, 0xB5, 0xE3, 0x82, 0xB7, 0xE3, 0x82, 0xB9, + 0xE3, 0x82, 0xBB, 0xE3, 0x82, 0xBD, 0xE3, 0x82, + 0xBF, 0xE3, 0x83, 0x81, 0xE3, 0x83, 0x84, 0xE3, + 0x83, 0x86, 0xE3, 0x83, 0x88, 0xE3, 0x83, 0x8A, + 0xE3, 0x83, 0x8B, 0xE3, 0x83, 0x8C, 0xE3, 0x83, + 0x8D, 0xE3, 0x83, 0x8E, 0xE3, 0x83, 0x8F, 0xE3, + 0x83, 0x92, 0xE3, 0x83, 0x95, 0xE3, 0x83, 0x98, + 0xE3, 0x83, 0x9B, 0xE3, 0x83, 0x9E, 0xE3, 0x83, + 0x9F, 0xE3, 0x83, 0xA0, 0xE3, 0x83, 0xA1, 0xE3, + 0x83, 0xA2, 0xE3, 0x83, 0xA4, 0xE3, 0x83, 0xA6, + 0xE3, 0x83, 0xA8, 0xE3, 0x83, 0xA9, 0xE3, 0x83, + 0xAA, 0xE3, 0x83, 0xAB, 0xE3, 0x83, 0xAC, 0xE3, + 0x83, 0xAD, 0xE3, 0x83, 0xAF, 0xE3, 0x83, 0xB0, + 0xE3, 0x83, 0xB1, 0xE3, 0x83, 0xB2, 0xE3, 0x82, + 0xA2, 0xE3, 0x83, 0x8F, 0xE3, 0x82, 0x9A, 0xE3, + 0x83, 0xBC, 0xE3, 0x83, 0x88, 0xE3, 0x82, 
0xA2, + 0xE3, 0x83, 0xAB, 0xE3, 0x83, 0x95, 0xE3, 0x82, + 0xA1, 0xE3, 0x82, 0xA2, 0xE3, 0x83, 0xB3, 0xE3, + 0x83, 0x98, 0xE3, 0x82, 0x9A, 0xE3, 0x82, 0xA2, + 0xE3, 0x82, 0xA2, 0xE3, 0x83, 0xBC, 0xE3, 0x83, + 0xAB, 0xE3, 0x82, 0xA4, 0xE3, 0x83, 0x8B, 0xE3, + 0x83, 0xB3, 0xE3, 0x82, 0xAF, 0xE3, 0x82, 0x99, + 0xE3, 0x82, 0xA4, 0xE3, 0x83, 0xB3, 0xE3, 0x83, + 0x81, 0xE3, 0x82, 0xA6, 0xE3, 0x82, 0xA9, 0xE3, + 0x83, 0xB3, 0xE3, 0x82, 0xA8, 0xE3, 0x82, 0xB9, + 0xE3, 0x82, 0xAF, 0xE3, 0x83, 0xBC, 0xE3, 0x83, + 0x88, 0xE3, 0x82, 0x99, 0xE3, 0x82, 0xA8, 0xE3, + 0x83, 0xBC, 0xE3, 0x82, 0xAB, 0xE3, 0x83, 0xBC, + 0xE3, 0x82, 0xAA, 0xE3, 0x83, 0xB3, 0xE3, 0x82, + 0xB9, 0xE3, 0x82, 0xAA, 0xE3, 0x83, 0xBC, 0xE3, + 0x83, 0xA0, 0xE3, 0x82, 0xAB, 0xE3, 0x82, 0xA4, + 0xE3, 0x83, 0xAA, 0xE3, 0x82, 0xAB, 0xE3, 0x83, + 0xA9, 0xE3, 0x83, 0x83, 0xE3, 0x83, 0x88, 0xE3, + 0x82, 0xAB, 0xE3, 0x83, 0xAD, 0xE3, 0x83, 0xAA, + 0xE3, 0x83, 0xBC, 0xE3, 0x82, 0xAB, 0xE3, 0x82, + 0x99, 0xE3, 0x83, 0xAD, 0xE3, 0x83, 0xB3, 0xE3, + 0x82, 0xAB, 0xE3, 0x82, 0x99, 0xE3, 0x83, 0xB3, + 0xE3, 0x83, 0x9E, 0xE3, 0x82, 0xAD, 0xE3, 0x82, + 0x99, 0xE3, 0x82, 0xAB, 0xE3, 0x82, 0x99, 0xE3, + 0x82, 0xAD, 0xE3, 0x82, 0x99, 0xE3, 0x83, 0x8B, + 0xE3, 0x83, 0xBC, 0xE3, 0x82, 0xAD, 0xE3, 0x83, + 0xA5, 0xE3, 0x83, 0xAA, 0xE3, 0x83, 0xBC, 0xE3, + 0x82, 0xAD, 0xE3, 0x82, 0x99, 0xE3, 0x83, 0xAB, + 0xE3, 0x82, 0xBF, 0xE3, 0x82, 0x99, 0xE3, 0x83, + 0xBC, 0xE3, 0x82, 0xAD, 0xE3, 0x83, 0xAD, 0xE3, + 0x82, 0xAD, 0xE3, 0x83, 0xAD, 0xE3, 0x82, 0xAF, + 0xE3, 0x82, 0x99, 0xE3, 0x83, 0xA9, 0xE3, 0x83, + 0xA0, 0xE3, 0x82, 0xAD, 0xE3, 0x83, 0xAD, 0xE3, + 0x83, 0xA1, 0xE3, 0x83, 0xBC, 0xE3, 0x83, 0x88, + 0xE3, 0x83, 0xAB, 0xE3, 0x82, 0xAD, 0xE3, 0x83, + 0xAD, 0xE3, 0x83, 0xAF, 0xE3, 0x83, 0x83, 0xE3, + 0x83, 0x88, 0xE3, 0x82, 0xAF, 0xE3, 0x82, 0x99, + 0xE3, 0x83, 0xA9, 0xE3, 0x83, 0xA0, 0xE3, 0x82, + 0xAF, 0xE3, 0x82, 0x99, 0xE3, 0x83, 0xA9, 0xE3, + 0x83, 0xA0, 0xE3, 0x83, 0x88, 0xE3, 0x83, 0xB3, + 0xE3, 0x82, 0xAF, 0xE3, 0x83, 0xAB, 0xE3, 0x82, + 0xBB, 0xE3, 0x82, 0x99, 0xE3, 0x82, 0xA4, 0xE3, + 0x83, 0xAD, 0xE3, 0x82, 0xAF, 0xE3, 0x83, 0xAD, + 0xE3, 0x83, 0xBC, 0xE3, 0x83, 0x8D, 0xE3, 0x82, + 0xB1, 0xE3, 0x83, 0xBC, 0xE3, 0x82, 0xB9, 0xE3, + 0x82, 0xB3, 0xE3, 0x83, 0xAB, 0xE3, 0x83, 0x8A, + 0xE3, 0x82, 0xB3, 0xE3, 0x83, 0xBC, 0xE3, 0x83, + 0x9B, 0xE3, 0x82, 0x9A, 0xE3, 0x82, 0xB5, 0xE3, + 0x82, 0xA4, 0xE3, 0x82, 0xAF, 0xE3, 0x83, 0xAB, + 0xE3, 0x82, 0xB5, 0xE3, 0x83, 0xB3, 0xE3, 0x83, + 0x81, 0xE3, 0x83, 0xBC, 0xE3, 0x83, 0xA0, 0xE3, + 0x82, 0xB7, 0xE3, 0x83, 0xAA, 0xE3, 0x83, 0xB3, + 0xE3, 0x82, 0xAF, 0xE3, 0x82, 0x99, 0xE3, 0x82, + 0xBB, 0xE3, 0x83, 0xB3, 0xE3, 0x83, 0x81, 0xE3, + 0x82, 0xBB, 0xE3, 0x83, 0xB3, 0xE3, 0x83, 0x88, + 0xE3, 0x82, 0xBF, 0xE3, 0x82, 0x99, 0xE3, 0x83, + 0xBC, 0xE3, 0x82, 0xB9, 0xE3, 0x83, 0x86, 0xE3, + 0x82, 0x99, 0xE3, 0x82, 0xB7, 0xE3, 0x83, 0x88, + 0xE3, 0x82, 0x99, 0xE3, 0x83, 0xAB, 0xE3, 0x83, + 0x88, 0xE3, 0x83, 0xB3, 0xE3, 0x83, 0x8A, 0xE3, + 0x83, 0x8E, 0xE3, 0x83, 0x8E, 0xE3, 0x83, 0x83, + 0xE3, 0x83, 0x88, 0xE3, 0x83, 0x8F, 0xE3, 0x82, + 0xA4, 0xE3, 0x83, 0x84, 0xE3, 0x83, 0x8F, 0xE3, + 0x82, 0x9A, 0xE3, 0x83, 0xBC, 0xE3, 0x82, 0xBB, + 0xE3, 0x83, 0xB3, 0xE3, 0x83, 0x88, 0xE3, 0x83, + 0x8F, 0xE3, 0x82, 0x9A, 0xE3, 0x83, 0xBC, 0xE3, + 0x83, 0x84, 0xE3, 0x83, 0x8F, 0xE3, 0x82, 0x99, + 0xE3, 0x83, 0xBC, 0xE3, 0x83, 0xAC, 0xE3, 0x83, + 0xAB, 0xE3, 0x83, 0x92, 0xE3, 0x82, 0x9A, 0xE3, + 0x82, 0xA2, 0xE3, 0x82, 0xB9, 0xE3, 0x83, 0x88, + 0xE3, 0x83, 0xAB, 0xE3, 0x83, 0x92, 0xE3, 0x82, + 0x9A, 0xE3, 0x82, 0xAF, 0xE3, 0x83, 0xAB, 
0xE3, + 0x83, 0x92, 0xE3, 0x82, 0x9A, 0xE3, 0x82, 0xB3, + 0xE3, 0x83, 0x92, 0xE3, 0x82, 0x99, 0xE3, 0x83, + 0xAB, 0xE3, 0x83, 0x95, 0xE3, 0x82, 0xA1, 0xE3, + 0x83, 0xA9, 0xE3, 0x83, 0x83, 0xE3, 0x83, 0x88, + 0xE3, 0x82, 0x99, 0xE3, 0x83, 0x95, 0xE3, 0x82, + 0xA3, 0xE3, 0x83, 0xBC, 0xE3, 0x83, 0x88, 0xE3, + 0x83, 0x95, 0xE3, 0x82, 0x99, 0xE3, 0x83, 0x83, + 0xE3, 0x82, 0xB7, 0xE3, 0x82, 0xA7, 0xE3, 0x83, + 0xAB, 0xE3, 0x83, 0x95, 0xE3, 0x83, 0xA9, 0xE3, + 0x83, 0xB3, 0xE3, 0x83, 0x98, 0xE3, 0x82, 0xAF, + 0xE3, 0x82, 0xBF, 0xE3, 0x83, 0xBC, 0xE3, 0x83, + 0xAB, 0xE3, 0x83, 0x98, 0xE3, 0x82, 0x9A, 0xE3, + 0x82, 0xBD, 0xE3, 0x83, 0x98, 0xE3, 0x82, 0x9A, + 0xE3, 0x83, 0x8B, 0xE3, 0x83, 0x92, 0xE3, 0x83, + 0x98, 0xE3, 0x83, 0xAB, 0xE3, 0x83, 0x84, 0xE3, + 0x83, 0x98, 0xE3, 0x82, 0x9A, 0xE3, 0x83, 0xB3, + 0xE3, 0x82, 0xB9, 0xE3, 0x83, 0x98, 0xE3, 0x82, + 0x9A, 0xE3, 0x83, 0xBC, 0xE3, 0x82, 0xB7, 0xE3, + 0x82, 0x99, 0xE3, 0x83, 0x98, 0xE3, 0x82, 0x99, + 0xE3, 0x83, 0xBC, 0xE3, 0x82, 0xBF, 0xE3, 0x83, + 0x9B, 0xE3, 0x82, 0x9A, 0xE3, 0x82, 0xA4, 0xE3, + 0x83, 0xB3, 0xE3, 0x83, 0x88, 0xE3, 0x83, 0x9B, + 0xE3, 0x82, 0x99, 0xE3, 0x83, 0xAB, 0xE3, 0x83, + 0x88, 0xE3, 0x83, 0x9B, 0xE3, 0x83, 0xB3, 0xE3, + 0x83, 0x9B, 0xE3, 0x82, 0x9A, 0xE3, 0x83, 0xB3, + 0xE3, 0x83, 0x88, 0xE3, 0x82, 0x99, 0xE3, 0x83, + 0x9B, 0xE3, 0x83, 0xBC, 0xE3, 0x83, 0xAB, 0xE3, + 0x83, 0x9B, 0xE3, 0x83, 0xBC, 0xE3, 0x83, 0xB3, + 0xE3, 0x83, 0x9E, 0xE3, 0x82, 0xA4, 0xE3, 0x82, + 0xAF, 0xE3, 0x83, 0xAD, 0xE3, 0x83, 0x9E, 0xE3, + 0x82, 0xA4, 0xE3, 0x83, 0xAB, 0xE3, 0x83, 0x9E, + 0xE3, 0x83, 0x83, 0xE3, 0x83, 0x8F, 0xE3, 0x83, + 0x9E, 0xE3, 0x83, 0xAB, 0xE3, 0x82, 0xAF, 0xE3, + 0x83, 0x9E, 0xE3, 0x83, 0xB3, 0xE3, 0x82, 0xB7, + 0xE3, 0x83, 0xA7, 0xE3, 0x83, 0xB3, 0xE3, 0x83, + 0x9F, 0xE3, 0x82, 0xAF, 0xE3, 0x83, 0xAD, 0xE3, + 0x83, 0xB3, 0xE3, 0x83, 0x9F, 0xE3, 0x83, 0xAA, + 0xE3, 0x83, 0x9F, 0xE3, 0x83, 0xAA, 0xE3, 0x83, + 0x8F, 0xE3, 0x82, 0x99, 0xE3, 0x83, 0xBC, 0xE3, + 0x83, 0xAB, 0xE3, 0x83, 0xA1, 0xE3, 0x82, 0xAB, + 0xE3, 0x82, 0x99, 0xE3, 0x83, 0xA1, 0xE3, 0x82, + 0xAB, 0xE3, 0x82, 0x99, 0xE3, 0x83, 0x88, 0xE3, + 0x83, 0xB3, 0xE3, 0x83, 0xA1, 0xE3, 0x83, 0xBC, + 0xE3, 0x83, 0x88, 0xE3, 0x83, 0xAB, 0xE3, 0x83, + 0xA4, 0xE3, 0x83, 0xBC, 0xE3, 0x83, 0x88, 0xE3, + 0x82, 0x99, 0xE3, 0x83, 0xA4, 0xE3, 0x83, 0xBC, + 0xE3, 0x83, 0xAB, 0xE3, 0x83, 0xA6, 0xE3, 0x82, + 0xA2, 0xE3, 0x83, 0xB3, 0xE3, 0x83, 0xAA, 0xE3, + 0x83, 0x83, 0xE3, 0x83, 0x88, 0xE3, 0x83, 0xAB, + 0xE3, 0x83, 0xAA, 0xE3, 0x83, 0xA9, 0xE3, 0x83, + 0xAB, 0xE3, 0x83, 0x92, 0xE3, 0x82, 0x9A, 0xE3, + 0x83, 0xBC, 0xE3, 0x83, 0xAB, 0xE3, 0x83, 0xBC, + 0xE3, 0x83, 0x95, 0xE3, 0x82, 0x99, 0xE3, 0x83, + 0xAB, 0xE3, 0x83, 0xAC, 0xE3, 0x83, 0xA0, 0xE3, + 0x83, 0xAC, 0xE3, 0x83, 0xB3, 0xE3, 0x83, 0x88, + 0xE3, 0x82, 0xB1, 0xE3, 0x82, 0x99, 0xE3, 0x83, + 0xB3, 0xE3, 0x83, 0xAF, 0xE3, 0x83, 0x83, 0xE3, + 0x83, 0x88, 0x30, 0xE7, 0x82, 0xB9, 0x31, 0xE7, + 0x82, 0xB9, 0x32, 0xE7, 0x82, 0xB9, 0x33, 0xE7, + 0x82, 0xB9, 0x34, 0xE7, 0x82, 0xB9, 0x35, 0xE7, + 0x82, 0xB9, 0x36, 0xE7, 0x82, 0xB9, 0x37, 0xE7, + 0x82, 0xB9, 0x38, 0xE7, 0x82, 0xB9, 0x39, 0xE7, + 0x82, 0xB9, 0x31, 0x30, 0xE7, 0x82, 0xB9, 0x31, + 0x31, 0xE7, 0x82, 0xB9, 0x31, 0x32, 0xE7, 0x82, + 0xB9, 0x31, 0x33, 0xE7, 0x82, 0xB9, 0x31, 0x34, + 0xE7, 0x82, 0xB9, 0x31, 0x35, 0xE7, 0x82, 0xB9, + 0x31, 0x36, 0xE7, 0x82, 0xB9, 0x31, 0x37, 0xE7, + 0x82, 0xB9, 0x31, 0x38, 0xE7, 0x82, 0xB9, 0x31, + 0x39, 0xE7, 0x82, 0xB9, 0x32, 0x30, 0xE7, 0x82, + 0xB9, 0x32, 0x31, 0xE7, 0x82, 0xB9, 0x32, 0x32, + 0xE7, 0x82, 0xB9, 0x32, 0x33, 0xE7, 0x82, 
0xB9, + 0x32, 0x34, 0xE7, 0x82, 0xB9, 0x68, 0x50, 0x61, + 0x64, 0x61, 0x41, 0x55, 0x62, 0x61, 0x72, 0x6F, + 0x56, 0x70, 0x63, 0xE5, 0xB9, 0xB3, 0xE6, 0x88, + 0x90, 0xE6, 0x98, 0xAD, 0xE5, 0x92, 0x8C, 0xE5, + 0xA4, 0xA7, 0xE6, 0xAD, 0xA3, 0xE6, 0x98, 0x8E, + 0xE6, 0xB2, 0xBB, 0xE6, 0xA0, 0xAA, 0xE5, 0xBC, + 0x8F, 0xE4, 0xBC, 0x9A, 0xE7, 0xA4, 0xBE, 0x70, + 0x41, 0x6E, 0x41, 0xCE, 0xBC, 0x41, 0x6D, 0x41, + 0x6B, 0x41, 0x4B, 0x42, 0x4D, 0x42, 0x47, 0x42, + 0x63, 0x61, 0x6C, 0x6B, 0x63, 0x61, 0x6C, 0x70, + 0x46, 0x6E, 0x46, 0xCE, 0xBC, 0x46, 0xCE, 0xBC, + 0x67, 0x6D, 0x67, 0x6B, 0x67, 0x48, 0x7A, 0x6B, + 0x48, 0x7A, 0x4D, 0x48, 0x7A, 0x47, 0x48, 0x7A, + 0x54, 0x48, 0x7A, 0xCE, 0xBC, 0x6C, 0x6D, 0x6C, + 0x64, 0x6C, 0x6B, 0x6C, 0x66, 0x6D, 0x6E, 0x6D, + 0xCE, 0xBC, 0x6D, 0x6D, 0x6D, 0x63, 0x6D, 0x6B, + 0x6D, 0x6D, 0x6D, 0x32, 0x63, 0x6D, 0x32, 0x6D, + 0x32, 0x6B, 0x6D, 0x32, 0x6D, 0x6D, 0x33, 0x63, + 0x6D, 0x33, 0x6D, 0x33, 0x6B, 0x6D, 0x33, 0x6D, + 0xE2, 0x88, 0x95, 0x73, 0x6D, 0xE2, 0x88, 0x95, + 0x73, 0x32, 0x50, 0x61, 0x6B, 0x50, 0x61, 0x4D, + 0x50, 0x61, 0x47, 0x50, 0x61, 0x72, 0x61, 0x64, + 0x72, 0x61, 0x64, 0xE2, 0x88, 0x95, 0x73, 0x72, + 0x61, 0x64, 0xE2, 0x88, 0x95, 0x73, 0x32, 0x70, + 0x73, 0x6E, 0x73, 0xCE, 0xBC, 0x73, 0x6D, 0x73, + 0x70, 0x56, 0x6E, 0x56, 0xCE, 0xBC, 0x56, 0x6D, + 0x56, 0x6B, 0x56, 0x4D, 0x56, 0x70, 0x57, 0x6E, + 0x57, 0xCE, 0xBC, 0x57, 0x6D, 0x57, 0x6B, 0x57, + 0x4D, 0x57, 0x6B, 0xCE, 0xA9, 0x4D, 0xCE, 0xA9, + 0x61, 0x2E, 0x6D, 0x2E, 0x42, 0x71, 0x63, 0x63, + 0x63, 0x64, 0x43, 0xE2, 0x88, 0x95, 0x6B, 0x67, + 0x43, 0x6F, 0x2E, 0x64, 0x42, 0x47, 0x79, 0x68, + 0x61, 0x48, 0x50, 0x69, 0x6E, 0x4B, 0x4B, 0x4B, + 0x4D, 0x6B, 0x74, 0x6C, 0x6D, 0x6C, 0x6E, 0x6C, + 0x6F, 0x67, 0x6C, 0x78, 0x6D, 0x62, 0x6D, 0x69, + 0x6C, 0x6D, 0x6F, 0x6C, 0x50, 0x48, 0x70, 0x2E, + 0x6D, 0x2E, 0x50, 0x50, 0x4D, 0x50, 0x52, 0x73, + 0x72, 0x53, 0x76, 0x57, 0x62, 0x31, 0xE6, 0x97, + 0xA5, 0x32, 0xE6, 0x97, 0xA5, 0x33, 0xE6, 0x97, + 0xA5, 0x34, 0xE6, 0x97, 0xA5, 0x35, 0xE6, 0x97, + 0xA5, 0x36, 0xE6, 0x97, 0xA5, 0x37, 0xE6, 0x97, + 0xA5, 0x38, 0xE6, 0x97, 0xA5, 0x39, 0xE6, 0x97, + 0xA5, 0x31, 0x30, 0xE6, 0x97, 0xA5, 0x31, 0x31, + 0xE6, 0x97, 0xA5, 0x31, 0x32, 0xE6, 0x97, 0xA5, + 0x31, 0x33, 0xE6, 0x97, 0xA5, 0x31, 0x34, 0xE6, + 0x97, 0xA5, 0x31, 0x35, 0xE6, 0x97, 0xA5, 0x31, + 0x36, 0xE6, 0x97, 0xA5, 0x31, 0x37, 0xE6, 0x97, + 0xA5, 0x31, 0x38, 0xE6, 0x97, 0xA5, 0x31, 0x39, + 0xE6, 0x97, 0xA5, 0x32, 0x30, 0xE6, 0x97, 0xA5, + 0x32, 0x31, 0xE6, 0x97, 0xA5, 0x32, 0x32, 0xE6, + 0x97, 0xA5, 0x32, 0x33, 0xE6, 0x97, 0xA5, 0x32, + 0x34, 0xE6, 0x97, 0xA5, 0x32, 0x35, 0xE6, 0x97, + 0xA5, 0x32, 0x36, 0xE6, 0x97, 0xA5, 0x32, 0x37, + 0xE6, 0x97, 0xA5, 0x32, 0x38, 0xE6, 0x97, 0xA5, + 0x32, 0x39, 0xE6, 0x97, 0xA5, 0x33, 0x30, 0xE6, + 0x97, 0xA5, 0x33, 0x31, 0xE6, 0x97, 0xA5, 0xF6, + 0xE8, 0xB1, 0x88, 0xF6, 0xE6, 0x9B, 0xB4, 0xF6, + 0xE8, 0xBB, 0x8A, 0xF6, 0xE8, 0xB3, 0x88, 0xF6, + 0xE6, 0xBB, 0x91, 0xF6, 0xE4, 0xB8, 0xB2, 0xF6, + 0xE5, 0x8F, 0xA5, 0xF6, 0xE9, 0xBE, 0x9C, 0xF6, + 0xE9, 0xBE, 0x9C, 0xF6, 0xE5, 0xA5, 0x91, 0xF6, + 0xE9, 0x87, 0x91, 0xF6, 0xE5, 0x96, 0x87, 0xF6, + 0xE5, 0xA5, 0x88, 0xF6, 0xE6, 0x87, 0xB6, 0xF6, + 0xE7, 0x99, 0xA9, 0xF6, 0xE7, 0xBE, 0x85, 0xF6, + 0xE8, 0x98, 0xBF, 0xF6, 0xE8, 0x9E, 0xBA, 0xF6, + 0xE8, 0xA3, 0xB8, 0xF6, 0xE9, 0x82, 0x8F, 0xF6, + 0xE6, 0xA8, 0x82, 0xF6, 0xE6, 0xB4, 0x9B, 0xF6, + 0xE7, 0x83, 0x99, 0xF6, 0xE7, 0x8F, 0x9E, 0xF6, + 0xE8, 0x90, 0xBD, 0xF6, 0xE9, 0x85, 0xAA, 0xF6, + 0xE9, 0xA7, 0xB1, 0xF6, 0xE4, 0xBA, 0x82, 0xF6, + 0xE5, 0x8D, 0xB5, 0xF6, 0xE6, 0xAC, 0x84, 
0xF6, + 0xE7, 0x88, 0x9B, 0xF6, 0xE8, 0x98, 0xAD, 0xF6, + 0xE9, 0xB8, 0x9E, 0xF6, 0xE5, 0xB5, 0x90, 0xF6, + 0xE6, 0xBF, 0xAB, 0xF6, 0xE8, 0x97, 0x8D, 0xF6, + 0xE8, 0xA5, 0xA4, 0xF6, 0xE6, 0x8B, 0x89, 0xF6, + 0xE8, 0x87, 0x98, 0xF6, 0xE8, 0xA0, 0x9F, 0xF6, + 0xE5, 0xBB, 0x8A, 0xF6, 0xE6, 0x9C, 0x97, 0xF6, + 0xE6, 0xB5, 0xAA, 0xF6, 0xE7, 0x8B, 0xBC, 0xF6, + 0xE9, 0x83, 0x8E, 0xF6, 0xE4, 0xBE, 0x86, 0xF6, + 0xE5, 0x86, 0xB7, 0xF6, 0xE5, 0x8B, 0x9E, 0xF6, + 0xE6, 0x93, 0x84, 0xF6, 0xE6, 0xAB, 0x93, 0xF6, + 0xE7, 0x88, 0x90, 0xF6, 0xE7, 0x9B, 0xA7, 0xF6, + 0xE8, 0x80, 0x81, 0xF6, 0xE8, 0x98, 0x86, 0xF6, + 0xE8, 0x99, 0x9C, 0xF6, 0xE8, 0xB7, 0xAF, 0xF6, + 0xE9, 0x9C, 0xB2, 0xF6, 0xE9, 0xAD, 0xAF, 0xF6, + 0xE9, 0xB7, 0xBA, 0xF6, 0xE7, 0xA2, 0x8C, 0xF6, + 0xE7, 0xA5, 0xBF, 0xF6, 0xE7, 0xB6, 0xA0, 0xF6, + 0xE8, 0x8F, 0x89, 0xF6, 0xE9, 0x8C, 0x84, 0xF6, + 0xE9, 0xB9, 0xBF, 0xF6, 0xE8, 0xAB, 0x96, 0xF6, + 0xE5, 0xA3, 0x9F, 0xF6, 0xE5, 0xBC, 0x84, 0xF6, + 0xE7, 0xB1, 0xA0, 0xF6, 0xE8, 0x81, 0xBE, 0xF6, + 0xE7, 0x89, 0xA2, 0xF6, 0xE7, 0xA3, 0x8A, 0xF6, + 0xE8, 0xB3, 0x82, 0xF6, 0xE9, 0x9B, 0xB7, 0xF6, + 0xE5, 0xA3, 0x98, 0xF6, 0xE5, 0xB1, 0xA2, 0xF6, + 0xE6, 0xA8, 0x93, 0xF6, 0xE6, 0xB7, 0x9A, 0xF6, + 0xE6, 0xBC, 0x8F, 0xF6, 0xE7, 0xB4, 0xAF, 0xF6, + 0xE7, 0xB8, 0xB7, 0xF6, 0xE9, 0x99, 0x8B, 0xF6, + 0xE5, 0x8B, 0x92, 0xF6, 0xE8, 0x82, 0x8B, 0xF6, + 0xE5, 0x87, 0x9C, 0xF6, 0xE5, 0x87, 0x8C, 0xF6, + 0xE7, 0xA8, 0x9C, 0xF6, 0xE7, 0xB6, 0xBE, 0xF6, + 0xE8, 0x8F, 0xB1, 0xF6, 0xE9, 0x99, 0xB5, 0xF6, + 0xE8, 0xAE, 0x80, 0xF6, 0xE6, 0x8B, 0x8F, 0xF6, + 0xE6, 0xA8, 0x82, 0xF6, 0xE8, 0xAB, 0xBE, 0xF6, + 0xE4, 0xB8, 0xB9, 0xF6, 0xE5, 0xAF, 0xA7, 0xF6, + 0xE6, 0x80, 0x92, 0xF6, 0xE7, 0x8E, 0x87, 0xF6, + 0xE7, 0x95, 0xB0, 0xF6, 0xE5, 0x8C, 0x97, 0xF6, + 0xE7, 0xA3, 0xBB, 0xF6, 0xE4, 0xBE, 0xBF, 0xF6, + 0xE5, 0xBE, 0xA9, 0xF6, 0xE4, 0xB8, 0x8D, 0xF6, + 0xE6, 0xB3, 0x8C, 0xF6, 0xE6, 0x95, 0xB8, 0xF6, + 0xE7, 0xB4, 0xA2, 0xF6, 0xE5, 0x8F, 0x83, 0xF6, + 0xE5, 0xA1, 0x9E, 0xF6, 0xE7, 0x9C, 0x81, 0xF6, + 0xE8, 0x91, 0x89, 0xF6, 0xE8, 0xAA, 0xAA, 0xF6, + 0xE6, 0xAE, 0xBA, 0xF6, 0xE8, 0xBE, 0xB0, 0xF6, + 0xE6, 0xB2, 0x88, 0xF6, 0xE6, 0x8B, 0xBE, 0xF6, + 0xE8, 0x8B, 0xA5, 0xF6, 0xE6, 0x8E, 0xA0, 0xF6, + 0xE7, 0x95, 0xA5, 0xF6, 0xE4, 0xBA, 0xAE, 0xF6, + 0xE5, 0x85, 0xA9, 0xF6, 0xE5, 0x87, 0x89, 0xF6, + 0xE6, 0xA2, 0x81, 0xF6, 0xE7, 0xB3, 0xA7, 0xF6, + 0xE8, 0x89, 0xAF, 0xF6, 0xE8, 0xAB, 0x92, 0xF6, + 0xE9, 0x87, 0x8F, 0xF6, 0xE5, 0x8B, 0xB5, 0xF6, + 0xE5, 0x91, 0x82, 0xF6, 0xE5, 0xA5, 0xB3, 0xF6, + 0xE5, 0xBB, 0xAC, 0xF6, 0xE6, 0x97, 0x85, 0xF6, + 0xE6, 0xBF, 0xBE, 0xF6, 0xE7, 0xA4, 0xAA, 0xF6, + 0xE9, 0x96, 0xAD, 0xF6, 0xE9, 0xA9, 0xAA, 0xF6, + 0xE9, 0xBA, 0x97, 0xF6, 0xE9, 0xBB, 0x8E, 0xF6, + 0xE5, 0x8A, 0x9B, 0xF6, 0xE6, 0x9B, 0x86, 0xF6, + 0xE6, 0xAD, 0xB7, 0xF6, 0xE8, 0xBD, 0xA2, 0xF6, + 0xE5, 0xB9, 0xB4, 0xF6, 0xE6, 0x86, 0x90, 0xF6, + 0xE6, 0x88, 0x80, 0xF6, 0xE6, 0x92, 0x9A, 0xF6, + 0xE6, 0xBC, 0xA3, 0xF6, 0xE7, 0x85, 0x89, 0xF6, + 0xE7, 0x92, 0x89, 0xF6, 0xE7, 0xA7, 0x8A, 0xF6, + 0xE7, 0xB7, 0xB4, 0xF6, 0xE8, 0x81, 0xAF, 0xF6, + 0xE8, 0xBC, 0xA6, 0xF6, 0xE8, 0x93, 0xAE, 0xF6, + 0xE9, 0x80, 0xA3, 0xF6, 0xE9, 0x8D, 0x8A, 0xF6, + 0xE5, 0x88, 0x97, 0xF6, 0xE5, 0x8A, 0xA3, 0xF6, + 0xE5, 0x92, 0xBD, 0xF6, 0xE7, 0x83, 0x88, 0xF6, + 0xE8, 0xA3, 0x82, 0xF6, 0xE8, 0xAA, 0xAA, 0xF6, + 0xE5, 0xBB, 0x89, 0xF6, 0xE5, 0xBF, 0xB5, 0xF6, + 0xE6, 0x8D, 0xBB, 0xF6, 0xE6, 0xAE, 0xAE, 0xF6, + 0xE7, 0xB0, 0xBE, 0xF6, 0xE7, 0x8D, 0xB5, 0xF6, + 0xE4, 0xBB, 0xA4, 0xF6, 0xE5, 0x9B, 0xB9, 0xF6, + 0xE5, 0xAF, 0xA7, 0xF6, 0xE5, 0xB6, 0xBA, 
0xF6, + 0xE6, 0x80, 0x9C, 0xF6, 0xE7, 0x8E, 0xB2, 0xF6, + 0xE7, 0x91, 0xA9, 0xF6, 0xE7, 0xBE, 0x9A, 0xF6, + 0xE8, 0x81, 0x86, 0xF6, 0xE9, 0x88, 0xB4, 0xF6, + 0xE9, 0x9B, 0xB6, 0xF6, 0xE9, 0x9D, 0x88, 0xF6, + 0xE9, 0xA0, 0x98, 0xF6, 0xE4, 0xBE, 0x8B, 0xF6, + 0xE7, 0xA6, 0xAE, 0xF6, 0xE9, 0x86, 0xB4, 0xF6, + 0xE9, 0x9A, 0xB8, 0xF6, 0xE6, 0x83, 0xA1, 0xF6, + 0xE4, 0xBA, 0x86, 0xF6, 0xE5, 0x83, 0x9A, 0xF6, + 0xE5, 0xAF, 0xAE, 0xF6, 0xE5, 0xB0, 0xBF, 0xF6, + 0xE6, 0x96, 0x99, 0xF6, 0xE6, 0xA8, 0x82, 0xF6, + 0xE7, 0x87, 0x8E, 0xF6, 0xE7, 0x99, 0x82, 0xF6, + 0xE8, 0x93, 0xBC, 0xF6, 0xE9, 0x81, 0xBC, 0xF6, + 0xE9, 0xBE, 0x8D, 0xF6, 0xE6, 0x9A, 0x88, 0xF6, + 0xE9, 0x98, 0xAE, 0xF6, 0xE5, 0x8A, 0x89, 0xF6, + 0xE6, 0x9D, 0xBB, 0xF6, 0xE6, 0x9F, 0xB3, 0xF6, + 0xE6, 0xB5, 0x81, 0xF6, 0xE6, 0xBA, 0x9C, 0xF6, + 0xE7, 0x90, 0x89, 0xF6, 0xE7, 0x95, 0x99, 0xF6, + 0xE7, 0xA1, 0xAB, 0xF6, 0xE7, 0xB4, 0x90, 0xF6, + 0xE9, 0xA1, 0x9E, 0xF6, 0xE5, 0x85, 0xAD, 0xF6, + 0xE6, 0x88, 0xAE, 0xF6, 0xE9, 0x99, 0xB8, 0xF6, + 0xE5, 0x80, 0xAB, 0xF6, 0xE5, 0xB4, 0x99, 0xF6, + 0xE6, 0xB7, 0xAA, 0xF6, 0xE8, 0xBC, 0xAA, 0xF6, + 0xE5, 0xBE, 0x8B, 0xF6, 0xE6, 0x85, 0x84, 0xF6, + 0xE6, 0xA0, 0x97, 0xF6, 0xE7, 0x8E, 0x87, 0xF6, + 0xE9, 0x9A, 0x86, 0xF6, 0xE5, 0x88, 0xA9, 0xF6, + 0xE5, 0x90, 0x8F, 0xF6, 0xE5, 0xB1, 0xA5, 0xF6, + 0xE6, 0x98, 0x93, 0xF6, 0xE6, 0x9D, 0x8E, 0xF6, + 0xE6, 0xA2, 0xA8, 0xF6, 0xE6, 0xB3, 0xA5, 0xF6, + 0xE7, 0x90, 0x86, 0xF6, 0xE7, 0x97, 0xA2, 0xF6, + 0xE7, 0xBD, 0xB9, 0xF6, 0xE8, 0xA3, 0x8F, 0xF6, + 0xE8, 0xA3, 0xA1, 0xF6, 0xE9, 0x87, 0x8C, 0xF6, + 0xE9, 0x9B, 0xA2, 0xF6, 0xE5, 0x8C, 0xBF, 0xF6, + 0xE6, 0xBA, 0xBA, 0xF6, 0xE5, 0x90, 0x9D, 0xF6, + 0xE7, 0x87, 0x90, 0xF6, 0xE7, 0x92, 0x98, 0xF6, + 0xE8, 0x97, 0xBA, 0xF6, 0xE9, 0x9A, 0xA3, 0xF6, + 0xE9, 0xB1, 0x97, 0xF6, 0xE9, 0xBA, 0x9F, 0xF6, + 0xE6, 0x9E, 0x97, 0xF6, 0xE6, 0xB7, 0x8B, 0xF6, + 0xE8, 0x87, 0xA8, 0xF6, 0xE7, 0xAB, 0x8B, 0xF6, + 0xE7, 0xAC, 0xA0, 0xF6, 0xE7, 0xB2, 0x92, 0xF6, + 0xE7, 0x8B, 0x80, 0xF6, 0xE7, 0x82, 0x99, 0xF6, + 0xE8, 0xAD, 0x98, 0xF6, 0xE4, 0xBB, 0x80, 0xF6, + 0xE8, 0x8C, 0xB6, 0xF6, 0xE5, 0x88, 0xBA, 0xF6, + 0xE5, 0x88, 0x87, 0xF6, 0xE5, 0xBA, 0xA6, 0xF6, + 0xE6, 0x8B, 0x93, 0xF6, 0xE7, 0xB3, 0x96, 0xF6, + 0xE5, 0xAE, 0x85, 0xF6, 0xE6, 0xB4, 0x9E, 0xF6, + 0xE6, 0x9A, 0xB4, 0xF6, 0xE8, 0xBC, 0xBB, 0xF6, + 0xE8, 0xA1, 0x8C, 0xF6, 0xE9, 0x99, 0x8D, 0xF6, + 0xE8, 0xA6, 0x8B, 0xF6, 0xE5, 0xBB, 0x93, 0xF6, + 0xE5, 0x85, 0x80, 0xF6, 0xE5, 0x97, 0x80, 0xF6, + 0xE5, 0xA1, 0x9A, 0xF6, 0xE6, 0x99, 0xB4, 0xF6, + 0xE5, 0x87, 0x9E, 0xF6, 0xE7, 0x8C, 0xAA, 0xF6, + 0xE7, 0x9B, 0x8A, 0xF6, 0xE7, 0xA4, 0xBC, 0xF6, + 0xE7, 0xA5, 0x9E, 0xF6, 0xE7, 0xA5, 0xA5, 0xF6, + 0xE7, 0xA6, 0x8F, 0xF6, 0xE9, 0x9D, 0x96, 0xF6, + 0xE7, 0xB2, 0xBE, 0xF6, 0xE7, 0xBE, 0xBD, 0xF6, + 0xE8, 0x98, 0x92, 0xF6, 0xE8, 0xAB, 0xB8, 0xF6, + 0xE9, 0x80, 0xB8, 0xF6, 0xE9, 0x83, 0xBD, 0xF6, + 0xE9, 0xA3, 0xAF, 0xF6, 0xE9, 0xA3, 0xBC, 0xF6, + 0xE9, 0xA4, 0xA8, 0xF6, 0xE9, 0xB6, 0xB4, 0xF6, + 0xE4, 0xBE, 0xAE, 0xF6, 0xE5, 0x83, 0xA7, 0xF6, + 0xE5, 0x85, 0x8D, 0xF6, 0xE5, 0x8B, 0x89, 0xF6, + 0xE5, 0x8B, 0xA4, 0xF6, 0xE5, 0x8D, 0x91, 0xF6, + 0xE5, 0x96, 0x9D, 0xF6, 0xE5, 0x98, 0x86, 0xF6, + 0xE5, 0x99, 0xA8, 0xF6, 0xE5, 0xA1, 0x80, 0xF6, + 0xE5, 0xA2, 0xA8, 0xF6, 0xE5, 0xB1, 0xA4, 0xF6, + 0xE5, 0xB1, 0xAE, 0xF6, 0xE6, 0x82, 0x94, 0xF6, + 0xE6, 0x85, 0xA8, 0xF6, 0xE6, 0x86, 0x8E, 0xF6, + 0xE6, 0x87, 0xB2, 0xF6, 0xE6, 0x95, 0x8F, 0xF6, + 0xE6, 0x97, 0xA2, 0xF6, 0xE6, 0x9A, 0x91, 0xF6, + 0xE6, 0xA2, 0x85, 0xF6, 0xE6, 0xB5, 0xB7, 0xF6, + 0xE6, 0xB8, 0x9A, 0xF6, 0xE6, 0xBC, 0xA2, 
0xF6, + 0xE7, 0x85, 0xAE, 0xF6, 0xE7, 0x88, 0xAB, 0xF6, + 0xE7, 0x90, 0xA2, 0xF6, 0xE7, 0xA2, 0x91, 0xF6, + 0xE7, 0xA4, 0xBE, 0xF6, 0xE7, 0xA5, 0x89, 0xF6, + 0xE7, 0xA5, 0x88, 0xF6, 0xE7, 0xA5, 0x90, 0xF6, + 0xE7, 0xA5, 0x96, 0xF6, 0xE7, 0xA5, 0x9D, 0xF6, + 0xE7, 0xA6, 0x8D, 0xF6, 0xE7, 0xA6, 0x8E, 0xF6, + 0xE7, 0xA9, 0x80, 0xF6, 0xE7, 0xAA, 0x81, 0xF6, + 0xE7, 0xAF, 0x80, 0xF6, 0xE7, 0xB7, 0xB4, 0xF6, + 0xE7, 0xB8, 0x89, 0xF6, 0xE7, 0xB9, 0x81, 0xF6, + 0xE7, 0xBD, 0xB2, 0xF6, 0xE8, 0x80, 0x85, 0xF6, + 0xE8, 0x87, 0xAD, 0xF6, 0xE8, 0x89, 0xB9, 0xF6, + 0xE8, 0x89, 0xB9, 0xF6, 0xE8, 0x91, 0x97, 0xF6, + 0xE8, 0xA4, 0x90, 0xF6, 0xE8, 0xA6, 0x96, 0xF6, + 0xE8, 0xAC, 0x81, 0xF6, 0xE8, 0xAC, 0xB9, 0xF6, + 0xE8, 0xB3, 0x93, 0xF6, 0xE8, 0xB4, 0x88, 0xF6, + 0xE8, 0xBE, 0xB6, 0xF6, 0xE9, 0x80, 0xB8, 0xF6, + 0xE9, 0x9B, 0xA3, 0xF6, 0xE9, 0x9F, 0xBF, 0xF6, + 0xE9, 0xA0, 0xBB, 0x66, 0x66, 0x66, 0x69, 0x66, + 0x6C, 0x66, 0x66, 0x69, 0x66, 0x66, 0x6C, 0x73, + 0x74, 0x73, 0x74, 0xD5, 0xB4, 0xD5, 0xB6, 0xD5, + 0xB4, 0xD5, 0xA5, 0xD5, 0xB4, 0xD5, 0xAB, 0xD5, + 0xBE, 0xD5, 0xB6, 0xD5, 0xB4, 0xD5, 0xAD, 0xF6, + 0xD7, 0x99, 0xD6, 0xB4, 0xF6, 0xD7, 0xB2, 0xD6, + 0xB7, 0xD7, 0xA2, 0xD7, 0x90, 0xD7, 0x93, 0xD7, + 0x94, 0xD7, 0x9B, 0xD7, 0x9C, 0xD7, 0x9D, 0xD7, + 0xA8, 0xD7, 0xAA, 0x2B, 0xF6, 0xD7, 0xA9, 0xD7, + 0x81, 0xF6, 0xD7, 0xA9, 0xD7, 0x82, 0xF6, 0xD7, + 0xA9, 0xD6, 0xBC, 0xD7, 0x81, 0xF6, 0xD7, 0xA9, + 0xD6, 0xBC, 0xD7, 0x82, 0xF6, 0xD7, 0x90, 0xD6, + 0xB7, 0xF6, 0xD7, 0x90, 0xD6, 0xB8, 0xF6, 0xD7, + 0x90, 0xD6, 0xBC, 0xF6, 0xD7, 0x91, 0xD6, 0xBC, + 0xF6, 0xD7, 0x92, 0xD6, 0xBC, 0xF6, 0xD7, 0x93, + 0xD6, 0xBC, 0xF6, 0xD7, 0x94, 0xD6, 0xBC, 0xF6, + 0xD7, 0x95, 0xD6, 0xBC, 0xF6, 0xD7, 0x96, 0xD6, + 0xBC, 0xF6, 0xD7, 0x98, 0xD6, 0xBC, 0xF6, 0xD7, + 0x99, 0xD6, 0xBC, 0xF6, 0xD7, 0x9A, 0xD6, 0xBC, + 0xF6, 0xD7, 0x9B, 0xD6, 0xBC, 0xF6, 0xD7, 0x9C, + 0xD6, 0xBC, 0xF6, 0xD7, 0x9E, 0xD6, 0xBC, 0xF6, + 0xD7, 0xA0, 0xD6, 0xBC, 0xF6, 0xD7, 0xA1, 0xD6, + 0xBC, 0xF6, 0xD7, 0xA3, 0xD6, 0xBC, 0xF6, 0xD7, + 0xA4, 0xD6, 0xBC, 0xF6, 0xD7, 0xA6, 0xD6, 0xBC, + 0xF6, 0xD7, 0xA7, 0xD6, 0xBC, 0xF6, 0xD7, 0xA8, + 0xD6, 0xBC, 0xF6, 0xD7, 0xA9, 0xD6, 0xBC, 0xF6, + 0xD7, 0xAA, 0xD6, 0xBC, 0xF6, 0xD7, 0x95, 0xD6, + 0xB9, 0xF6, 0xD7, 0x91, 0xD6, 0xBF, 0xF6, 0xD7, + 0x9B, 0xD6, 0xBF, 0xF6, 0xD7, 0xA4, 0xD6, 0xBF, + 0xD7, 0x90, 0xD7, 0x9C, 0xD9, 0xB1, 0xD9, 0xB1, + 0xD9, 0xBB, 0xD9, 0xBB, 0xD9, 0xBB, 0xD9, 0xBB, + 0xD9, 0xBE, 0xD9, 0xBE, 0xD9, 0xBE, 0xD9, 0xBE, + 0xDA, 0x80, 0xDA, 0x80, 0xDA, 0x80, 0xDA, 0x80, + 0xD9, 0xBA, 0xD9, 0xBA, 0xD9, 0xBA, 0xD9, 0xBA, + 0xD9, 0xBF, 0xD9, 0xBF, 0xD9, 0xBF, 0xD9, 0xBF, + 0xD9, 0xB9, 0xD9, 0xB9, 0xD9, 0xB9, 0xD9, 0xB9, + 0xDA, 0xA4, 0xDA, 0xA4, 0xDA, 0xA4, 0xDA, 0xA4, + 0xDA, 0xA6, 0xDA, 0xA6, 0xDA, 0xA6, 0xDA, 0xA6, + 0xDA, 0x84, 0xDA, 0x84, 0xDA, 0x84, 0xDA, 0x84, + 0xDA, 0x83, 0xDA, 0x83, 0xDA, 0x83, 0xDA, 0x83, + 0xDA, 0x86, 0xDA, 0x86, 0xDA, 0x86, 0xDA, 0x86, + 0xDA, 0x87, 0xDA, 0x87, 0xDA, 0x87, 0xDA, 0x87, + 0xDA, 0x8D, 0xDA, 0x8D, 0xDA, 0x8C, 0xDA, 0x8C, + 0xDA, 0x8E, 0xDA, 0x8E, 0xDA, 0x88, 0xDA, 0x88, + 0xDA, 0x98, 0xDA, 0x98, 0xDA, 0x91, 0xDA, 0x91, + 0xDA, 0xA9, 0xDA, 0xA9, 0xDA, 0xA9, 0xDA, 0xA9, + 0xDA, 0xAF, 0xDA, 0xAF, 0xDA, 0xAF, 0xDA, 0xAF, + 0xDA, 0xB3, 0xDA, 0xB3, 0xDA, 0xB3, 0xDA, 0xB3, + 0xDA, 0xB1, 0xDA, 0xB1, 0xDA, 0xB1, 0xDA, 0xB1, + 0xDA, 0xBA, 0xDA, 0xBA, 0xDA, 0xBB, 0xDA, 0xBB, + 0xDA, 0xBB, 0xDA, 0xBB, 0xDB, 0x95, 0xD9, 0x94, + 0xDB, 0x95, 0xD9, 0x94, 0xDB, 0x81, 0xDB, 0x81, + 0xDB, 0x81, 0xDB, 0x81, 0xDA, 0xBE, 0xDA, 0xBE, + 0xDA, 0xBE, 0xDA, 0xBE, 0xDB, 0x92, 0xDB, 
0x92, + 0xDB, 0x92, 0xD9, 0x94, 0xDB, 0x92, 0xD9, 0x94, + 0xDA, 0xAD, 0xDA, 0xAD, 0xDA, 0xAD, 0xDA, 0xAD, + 0xDB, 0x87, 0xDB, 0x87, 0xDB, 0x86, 0xDB, 0x86, + 0xDB, 0x88, 0xDB, 0x88, 0xDB, 0x87, 0xD9, 0xB4, + 0xDB, 0x8B, 0xDB, 0x8B, 0xDB, 0x85, 0xDB, 0x85, + 0xDB, 0x89, 0xDB, 0x89, 0xDB, 0x90, 0xDB, 0x90, + 0xDB, 0x90, 0xDB, 0x90, 0xD9, 0x89, 0xD9, 0x89, + 0xD9, 0x8A, 0xD9, 0x94, 0xD8, 0xA7, 0xD9, 0x8A, + 0xD9, 0x94, 0xD8, 0xA7, 0xD9, 0x8A, 0xD9, 0x94, + 0xDB, 0x95, 0xD9, 0x8A, 0xD9, 0x94, 0xDB, 0x95, + 0xD9, 0x8A, 0xD9, 0x94, 0xD9, 0x88, 0xD9, 0x8A, + 0xD9, 0x94, 0xD9, 0x88, 0xD9, 0x8A, 0xD9, 0x94, + 0xDB, 0x87, 0xD9, 0x8A, 0xD9, 0x94, 0xDB, 0x87, + 0xD9, 0x8A, 0xD9, 0x94, 0xDB, 0x86, 0xD9, 0x8A, + 0xD9, 0x94, 0xDB, 0x86, 0xD9, 0x8A, 0xD9, 0x94, + 0xDB, 0x88, 0xD9, 0x8A, 0xD9, 0x94, 0xDB, 0x88, + 0xD9, 0x8A, 0xD9, 0x94, 0xDB, 0x90, 0xD9, 0x8A, + 0xD9, 0x94, 0xDB, 0x90, 0xD9, 0x8A, 0xD9, 0x94, + 0xDB, 0x90, 0xD9, 0x8A, 0xD9, 0x94, 0xD9, 0x89, + 0xD9, 0x8A, 0xD9, 0x94, 0xD9, 0x89, 0xD9, 0x8A, + 0xD9, 0x94, 0xD9, 0x89, 0xDB, 0x8C, 0xDB, 0x8C, + 0xDB, 0x8C, 0xDB, 0x8C, 0xD9, 0x8A, 0xD9, 0x94, + 0xD8, 0xAC, 0xD9, 0x8A, 0xD9, 0x94, 0xD8, 0xAD, + 0xD9, 0x8A, 0xD9, 0x94, 0xD9, 0x85, 0xD9, 0x8A, + 0xD9, 0x94, 0xD9, 0x89, 0xD9, 0x8A, 0xD9, 0x94, + 0xD9, 0x8A, 0xD8, 0xA8, 0xD8, 0xAC, 0xD8, 0xA8, + 0xD8, 0xAD, 0xD8, 0xA8, 0xD8, 0xAE, 0xD8, 0xA8, + 0xD9, 0x85, 0xD8, 0xA8, 0xD9, 0x89, 0xD8, 0xA8, + 0xD9, 0x8A, 0xD8, 0xAA, 0xD8, 0xAC, 0xD8, 0xAA, + 0xD8, 0xAD, 0xD8, 0xAA, 0xD8, 0xAE, 0xD8, 0xAA, + 0xD9, 0x85, 0xD8, 0xAA, 0xD9, 0x89, 0xD8, 0xAA, + 0xD9, 0x8A, 0xD8, 0xAB, 0xD8, 0xAC, 0xD8, 0xAB, + 0xD9, 0x85, 0xD8, 0xAB, 0xD9, 0x89, 0xD8, 0xAB, + 0xD9, 0x8A, 0xD8, 0xAC, 0xD8, 0xAD, 0xD8, 0xAC, + 0xD9, 0x85, 0xD8, 0xAD, 0xD8, 0xAC, 0xD8, 0xAD, + 0xD9, 0x85, 0xD8, 0xAE, 0xD8, 0xAC, 0xD8, 0xAE, + 0xD8, 0xAD, 0xD8, 0xAE, 0xD9, 0x85, 0xD8, 0xB3, + 0xD8, 0xAC, 0xD8, 0xB3, 0xD8, 0xAD, 0xD8, 0xB3, + 0xD8, 0xAE, 0xD8, 0xB3, 0xD9, 0x85, 0xD8, 0xB5, + 0xD8, 0xAD, 0xD8, 0xB5, 0xD9, 0x85, 0xD8, 0xB6, + 0xD8, 0xAC, 0xD8, 0xB6, 0xD8, 0xAD, 0xD8, 0xB6, + 0xD8, 0xAE, 0xD8, 0xB6, 0xD9, 0x85, 0xD8, 0xB7, + 0xD8, 0xAD, 0xD8, 0xB7, 0xD9, 0x85, 0xD8, 0xB8, + 0xD9, 0x85, 0xD8, 0xB9, 0xD8, 0xAC, 0xD8, 0xB9, + 0xD9, 0x85, 0xD8, 0xBA, 0xD8, 0xAC, 0xD8, 0xBA, + 0xD9, 0x85, 0xD9, 0x81, 0xD8, 0xAC, 0xD9, 0x81, + 0xD8, 0xAD, 0xD9, 0x81, 0xD8, 0xAE, 0xD9, 0x81, + 0xD9, 0x85, 0xD9, 0x81, 0xD9, 0x89, 0xD9, 0x81, + 0xD9, 0x8A, 0xD9, 0x82, 0xD8, 0xAD, 0xD9, 0x82, + 0xD9, 0x85, 0xD9, 0x82, 0xD9, 0x89, 0xD9, 0x82, + 0xD9, 0x8A, 0xD9, 0x83, 0xD8, 0xA7, 0xD9, 0x83, + 0xD8, 0xAC, 0xD9, 0x83, 0xD8, 0xAD, 0xD9, 0x83, + 0xD8, 0xAE, 0xD9, 0x83, 0xD9, 0x84, 0xD9, 0x83, + 0xD9, 0x85, 0xD9, 0x83, 0xD9, 0x89, 0xD9, 0x83, + 0xD9, 0x8A, 0xD9, 0x84, 0xD8, 0xAC, 0xD9, 0x84, + 0xD8, 0xAD, 0xD9, 0x84, 0xD8, 0xAE, 0xD9, 0x84, + 0xD9, 0x85, 0xD9, 0x84, 0xD9, 0x89, 0xD9, 0x84, + 0xD9, 0x8A, 0xD9, 0x85, 0xD8, 0xAC, 0xD9, 0x85, + 0xD8, 0xAD, 0xD9, 0x85, 0xD8, 0xAE, 0xD9, 0x85, + 0xD9, 0x85, 0xD9, 0x85, 0xD9, 0x89, 0xD9, 0x85, + 0xD9, 0x8A, 0xD9, 0x86, 0xD8, 0xAC, 0xD9, 0x86, + 0xD8, 0xAD, 0xD9, 0x86, 0xD8, 0xAE, 0xD9, 0x86, + 0xD9, 0x85, 0xD9, 0x86, 0xD9, 0x89, 0xD9, 0x86, + 0xD9, 0x8A, 0xD9, 0x87, 0xD8, 0xAC, 0xD9, 0x87, + 0xD9, 0x85, 0xD9, 0x87, 0xD9, 0x89, 0xD9, 0x87, + 0xD9, 0x8A, 0xD9, 0x8A, 0xD8, 0xAC, 0xD9, 0x8A, + 0xD8, 0xAD, 0xD9, 0x8A, 0xD8, 0xAE, 0xD9, 0x8A, + 0xD9, 0x85, 0xD9, 0x8A, 0xD9, 0x89, 0xD9, 0x8A, + 0xD9, 0x8A, 0xD8, 0xB0, 0xD9, 0xB0, 0xD8, 0xB1, + 0xD9, 0xB0, 0xD9, 0x89, 0xD9, 0xB0, 0x20, 0xD9, + 0x8C, 0xD9, 0x91, 0x20, 0xD9, 0x8D, 0xD9, 
0x91, + 0x20, 0xD9, 0x8E, 0xD9, 0x91, 0x20, 0xD9, 0x8F, + 0xD9, 0x91, 0x20, 0xD9, 0x90, 0xD9, 0x91, 0x20, + 0xD9, 0x91, 0xD9, 0xB0, 0xD9, 0x8A, 0xD9, 0x94, + 0xD8, 0xB1, 0xD9, 0x8A, 0xD9, 0x94, 0xD8, 0xB2, + 0xD9, 0x8A, 0xD9, 0x94, 0xD9, 0x85, 0xD9, 0x8A, + 0xD9, 0x94, 0xD9, 0x86, 0xD9, 0x8A, 0xD9, 0x94, + 0xD9, 0x89, 0xD9, 0x8A, 0xD9, 0x94, 0xD9, 0x8A, + 0xD8, 0xA8, 0xD8, 0xB1, 0xD8, 0xA8, 0xD8, 0xB2, + 0xD8, 0xA8, 0xD9, 0x85, 0xD8, 0xA8, 0xD9, 0x86, + 0xD8, 0xA8, 0xD9, 0x89, 0xD8, 0xA8, 0xD9, 0x8A, + 0xD8, 0xAA, 0xD8, 0xB1, 0xD8, 0xAA, 0xD8, 0xB2, + 0xD8, 0xAA, 0xD9, 0x85, 0xD8, 0xAA, 0xD9, 0x86, + 0xD8, 0xAA, 0xD9, 0x89, 0xD8, 0xAA, 0xD9, 0x8A, + 0xD8, 0xAB, 0xD8, 0xB1, 0xD8, 0xAB, 0xD8, 0xB2, + 0xD8, 0xAB, 0xD9, 0x85, 0xD8, 0xAB, 0xD9, 0x86, + 0xD8, 0xAB, 0xD9, 0x89, 0xD8, 0xAB, 0xD9, 0x8A, + 0xD9, 0x81, 0xD9, 0x89, 0xD9, 0x81, 0xD9, 0x8A, + 0xD9, 0x82, 0xD9, 0x89, 0xD9, 0x82, 0xD9, 0x8A, + 0xD9, 0x83, 0xD8, 0xA7, 0xD9, 0x83, 0xD9, 0x84, + 0xD9, 0x83, 0xD9, 0x85, 0xD9, 0x83, 0xD9, 0x89, + 0xD9, 0x83, 0xD9, 0x8A, 0xD9, 0x84, 0xD9, 0x85, + 0xD9, 0x84, 0xD9, 0x89, 0xD9, 0x84, 0xD9, 0x8A, + 0xD9, 0x85, 0xD8, 0xA7, 0xD9, 0x85, 0xD9, 0x85, + 0xD9, 0x86, 0xD8, 0xB1, 0xD9, 0x86, 0xD8, 0xB2, + 0xD9, 0x86, 0xD9, 0x85, 0xD9, 0x86, 0xD9, 0x86, + 0xD9, 0x86, 0xD9, 0x89, 0xD9, 0x86, 0xD9, 0x8A, + 0xD9, 0x89, 0xD9, 0xB0, 0xD9, 0x8A, 0xD8, 0xB1, + 0xD9, 0x8A, 0xD8, 0xB2, 0xD9, 0x8A, 0xD9, 0x85, + 0xD9, 0x8A, 0xD9, 0x86, 0xD9, 0x8A, 0xD9, 0x89, + 0xD9, 0x8A, 0xD9, 0x8A, 0xD9, 0x8A, 0xD9, 0x94, + 0xD8, 0xAC, 0xD9, 0x8A, 0xD9, 0x94, 0xD8, 0xAD, + 0xD9, 0x8A, 0xD9, 0x94, 0xD8, 0xAE, 0xD9, 0x8A, + 0xD9, 0x94, 0xD9, 0x85, 0xD9, 0x8A, 0xD9, 0x94, + 0xD9, 0x87, 0xD8, 0xA8, 0xD8, 0xAC, 0xD8, 0xA8, + 0xD8, 0xAD, 0xD8, 0xA8, 0xD8, 0xAE, 0xD8, 0xA8, + 0xD9, 0x85, 0xD8, 0xA8, 0xD9, 0x87, 0xD8, 0xAA, + 0xD8, 0xAC, 0xD8, 0xAA, 0xD8, 0xAD, 0xD8, 0xAA, + 0xD8, 0xAE, 0xD8, 0xAA, 0xD9, 0x85, 0xD8, 0xAA, + 0xD9, 0x87, 0xD8, 0xAB, 0xD9, 0x85, 0xD8, 0xAC, + 0xD8, 0xAD, 0xD8, 0xAC, 0xD9, 0x85, 0xD8, 0xAD, + 0xD8, 0xAC, 0xD8, 0xAD, 0xD9, 0x85, 0xD8, 0xAE, + 0xD8, 0xAC, 0xD8, 0xAE, 0xD9, 0x85, 0xD8, 0xB3, + 0xD8, 0xAC, 0xD8, 0xB3, 0xD8, 0xAD, 0xD8, 0xB3, + 0xD8, 0xAE, 0xD8, 0xB3, 0xD9, 0x85, 0xD8, 0xB5, + 0xD8, 0xAD, 0xD8, 0xB5, 0xD8, 0xAE, 0xD8, 0xB5, + 0xD9, 0x85, 0xD8, 0xB6, 0xD8, 0xAC, 0xD8, 0xB6, + 0xD8, 0xAD, 0xD8, 0xB6, 0xD8, 0xAE, 0xD8, 0xB6, + 0xD9, 0x85, 0xD8, 0xB7, 0xD8, 0xAD, 0xD8, 0xB8, + 0xD9, 0x85, 0xD8, 0xB9, 0xD8, 0xAC, 0xD8, 0xB9, + 0xD9, 0x85, 0xD8, 0xBA, 0xD8, 0xAC, 0xD8, 0xBA, + 0xD9, 0x85, 0xD9, 0x81, 0xD8, 0xAC, 0xD9, 0x81, + 0xD8, 0xAD, 0xD9, 0x81, 0xD8, 0xAE, 0xD9, 0x81, + 0xD9, 0x85, 0xD9, 0x82, 0xD8, 0xAD, 0xD9, 0x82, + 0xD9, 0x85, 0xD9, 0x83, 0xD8, 0xAC, 0xD9, 0x83, + 0xD8, 0xAD, 0xD9, 0x83, 0xD8, 0xAE, 0xD9, 0x83, + 0xD9, 0x84, 0xD9, 0x83, 0xD9, 0x85, 0xD9, 0x84, + 0xD8, 0xAC, 0xD9, 0x84, 0xD8, 0xAD, 0xD9, 0x84, + 0xD8, 0xAE, 0xD9, 0x84, 0xD9, 0x85, 0xD9, 0x84, + 0xD9, 0x87, 0xD9, 0x85, 0xD8, 0xAC, 0xD9, 0x85, + 0xD8, 0xAD, 0xD9, 0x85, 0xD8, 0xAE, 0xD9, 0x85, + 0xD9, 0x85, 0xD9, 0x86, 0xD8, 0xAC, 0xD9, 0x86, + 0xD8, 0xAD, 0xD9, 0x86, 0xD8, 0xAE, 0xD9, 0x86, + 0xD9, 0x85, 0xD9, 0x86, 0xD9, 0x87, 0xD9, 0x87, + 0xD8, 0xAC, 0xD9, 0x87, 0xD9, 0x85, 0xD9, 0x87, + 0xD9, 0xB0, 0xD9, 0x8A, 0xD8, 0xAC, 0xD9, 0x8A, + 0xD8, 0xAD, 0xD9, 0x8A, 0xD8, 0xAE, 0xD9, 0x8A, + 0xD9, 0x85, 0xD9, 0x8A, 0xD9, 0x87, 0xD9, 0x8A, + 0xD9, 0x94, 0xD9, 0x85, 0xD9, 0x8A, 0xD9, 0x94, + 0xD9, 0x87, 0xD8, 0xA8, 0xD9, 0x85, 0xD8, 0xA8, + 0xD9, 0x87, 0xD8, 0xAA, 0xD9, 0x85, 0xD8, 0xAA, + 0xD9, 0x87, 0xD8, 0xAB, 0xD9, 0x85, 0xD8, 
0xAB, + 0xD9, 0x87, 0xD8, 0xB3, 0xD9, 0x85, 0xD8, 0xB3, + 0xD9, 0x87, 0xD8, 0xB4, 0xD9, 0x85, 0xD8, 0xB4, + 0xD9, 0x87, 0xD9, 0x83, 0xD9, 0x84, 0xD9, 0x83, + 0xD9, 0x85, 0xD9, 0x84, 0xD9, 0x85, 0xD9, 0x86, + 0xD9, 0x85, 0xD9, 0x86, 0xD9, 0x87, 0xD9, 0x8A, + 0xD9, 0x85, 0xD9, 0x8A, 0xD9, 0x87, 0xD9, 0x80, + 0xD9, 0x8E, 0xD9, 0x91, 0xD9, 0x80, 0xD9, 0x8F, + 0xD9, 0x91, 0xD9, 0x80, 0xD9, 0x90, 0xD9, 0x91, + 0xD8, 0xB7, 0xD9, 0x89, 0xD8, 0xB7, 0xD9, 0x8A, + 0xD8, 0xB9, 0xD9, 0x89, 0xD8, 0xB9, 0xD9, 0x8A, + 0xD8, 0xBA, 0xD9, 0x89, 0xD8, 0xBA, 0xD9, 0x8A, + 0xD8, 0xB3, 0xD9, 0x89, 0xD8, 0xB3, 0xD9, 0x8A, + 0xD8, 0xB4, 0xD9, 0x89, 0xD8, 0xB4, 0xD9, 0x8A, + 0xD8, 0xAD, 0xD9, 0x89, 0xD8, 0xAD, 0xD9, 0x8A, + 0xD8, 0xAC, 0xD9, 0x89, 0xD8, 0xAC, 0xD9, 0x8A, + 0xD8, 0xAE, 0xD9, 0x89, 0xD8, 0xAE, 0xD9, 0x8A, + 0xD8, 0xB5, 0xD9, 0x89, 0xD8, 0xB5, 0xD9, 0x8A, + 0xD8, 0xB6, 0xD9, 0x89, 0xD8, 0xB6, 0xD9, 0x8A, + 0xD8, 0xB4, 0xD8, 0xAC, 0xD8, 0xB4, 0xD8, 0xAD, + 0xD8, 0xB4, 0xD8, 0xAE, 0xD8, 0xB4, 0xD9, 0x85, + 0xD8, 0xB4, 0xD8, 0xB1, 0xD8, 0xB3, 0xD8, 0xB1, + 0xD8, 0xB5, 0xD8, 0xB1, 0xD8, 0xB6, 0xD8, 0xB1, + 0xD8, 0xB7, 0xD9, 0x89, 0xD8, 0xB7, 0xD9, 0x8A, + 0xD8, 0xB9, 0xD9, 0x89, 0xD8, 0xB9, 0xD9, 0x8A, + 0xD8, 0xBA, 0xD9, 0x89, 0xD8, 0xBA, 0xD9, 0x8A, + 0xD8, 0xB3, 0xD9, 0x89, 0xD8, 0xB3, 0xD9, 0x8A, + 0xD8, 0xB4, 0xD9, 0x89, 0xD8, 0xB4, 0xD9, 0x8A, + 0xD8, 0xAD, 0xD9, 0x89, 0xD8, 0xAD, 0xD9, 0x8A, + 0xD8, 0xAC, 0xD9, 0x89, 0xD8, 0xAC, 0xD9, 0x8A, + 0xD8, 0xAE, 0xD9, 0x89, 0xD8, 0xAE, 0xD9, 0x8A, + 0xD8, 0xB5, 0xD9, 0x89, 0xD8, 0xB5, 0xD9, 0x8A, + 0xD8, 0xB6, 0xD9, 0x89, 0xD8, 0xB6, 0xD9, 0x8A, + 0xD8, 0xB4, 0xD8, 0xAC, 0xD8, 0xB4, 0xD8, 0xAD, + 0xD8, 0xB4, 0xD8, 0xAE, 0xD8, 0xB4, 0xD9, 0x85, + 0xD8, 0xB4, 0xD8, 0xB1, 0xD8, 0xB3, 0xD8, 0xB1, + 0xD8, 0xB5, 0xD8, 0xB1, 0xD8, 0xB6, 0xD8, 0xB1, + 0xD8, 0xB4, 0xD8, 0xAC, 0xD8, 0xB4, 0xD8, 0xAD, + 0xD8, 0xB4, 0xD8, 0xAE, 0xD8, 0xB4, 0xD9, 0x85, + 0xD8, 0xB3, 0xD9, 0x87, 0xD8, 0xB4, 0xD9, 0x87, + 0xD8, 0xB7, 0xD9, 0x85, 0xD8, 0xB3, 0xD8, 0xAC, + 0xD8, 0xB3, 0xD8, 0xAD, 0xD8, 0xB3, 0xD8, 0xAE, + 0xD8, 0xB4, 0xD8, 0xAC, 0xD8, 0xB4, 0xD8, 0xAD, + 0xD8, 0xB4, 0xD8, 0xAE, 0xD8, 0xB7, 0xD9, 0x85, + 0xD8, 0xB8, 0xD9, 0x85, 0xD8, 0xA7, 0xD9, 0x8B, + 0xD8, 0xA7, 0xD9, 0x8B, 0xD8, 0xAA, 0xD8, 0xAC, + 0xD9, 0x85, 0xD8, 0xAA, 0xD8, 0xAD, 0xD8, 0xAC, + 0xD8, 0xAA, 0xD8, 0xAD, 0xD8, 0xAC, 0xD8, 0xAA, + 0xD8, 0xAD, 0xD9, 0x85, 0xD8, 0xAA, 0xD8, 0xAE, + 0xD9, 0x85, 0xD8, 0xAA, 0xD9, 0x85, 0xD8, 0xAC, + 0xD8, 0xAA, 0xD9, 0x85, 0xD8, 0xAD, 0xD8, 0xAA, + 0xD9, 0x85, 0xD8, 0xAE, 0xD8, 0xAC, 0xD9, 0x85, + 0xD8, 0xAD, 0xD8, 0xAC, 0xD9, 0x85, 0xD8, 0xAD, + 0xD8, 0xAD, 0xD9, 0x85, 0xD9, 0x8A, 0xD8, 0xAD, + 0xD9, 0x85, 0xD9, 0x89, 0xD8, 0xB3, 0xD8, 0xAD, + 0xD8, 0xAC, 0xD8, 0xB3, 0xD8, 0xAC, 0xD8, 0xAD, + 0xD8, 0xB3, 0xD8, 0xAC, 0xD9, 0x89, 0xD8, 0xB3, + 0xD9, 0x85, 0xD8, 0xAD, 0xD8, 0xB3, 0xD9, 0x85, + 0xD8, 0xAD, 0xD8, 0xB3, 0xD9, 0x85, 0xD8, 0xAC, + 0xD8, 0xB3, 0xD9, 0x85, 0xD9, 0x85, 0xD8, 0xB3, + 0xD9, 0x85, 0xD9, 0x85, 0xD8, 0xB5, 0xD8, 0xAD, + 0xD8, 0xAD, 0xD8, 0xB5, 0xD8, 0xAD, 0xD8, 0xAD, + 0xD8, 0xB5, 0xD9, 0x85, 0xD9, 0x85, 0xD8, 0xB4, + 0xD8, 0xAD, 0xD9, 0x85, 0xD8, 0xB4, 0xD8, 0xAD, + 0xD9, 0x85, 0xD8, 0xB4, 0xD8, 0xAC, 0xD9, 0x8A, + 0xD8, 0xB4, 0xD9, 0x85, 0xD8, 0xAE, 0xD8, 0xB4, + 0xD9, 0x85, 0xD8, 0xAE, 0xD8, 0xB4, 0xD9, 0x85, + 0xD9, 0x85, 0xD8, 0xB4, 0xD9, 0x85, 0xD9, 0x85, + 0xD8, 0xB6, 0xD8, 0xAD, 0xD9, 0x89, 0xD8, 0xB6, + 0xD8, 0xAE, 0xD9, 0x85, 0xD8, 0xB6, 0xD8, 0xAE, + 0xD9, 0x85, 0xD8, 0xB7, 0xD9, 0x85, 0xD8, 0xAD, + 0xD8, 0xB7, 0xD9, 0x85, 0xD8, 0xAD, 0xD8, 
0xB7, + 0xD9, 0x85, 0xD9, 0x85, 0xD8, 0xB7, 0xD9, 0x85, + 0xD9, 0x8A, 0xD8, 0xB9, 0xD8, 0xAC, 0xD9, 0x85, + 0xD8, 0xB9, 0xD9, 0x85, 0xD9, 0x85, 0xD8, 0xB9, + 0xD9, 0x85, 0xD9, 0x85, 0xD8, 0xB9, 0xD9, 0x85, + 0xD9, 0x89, 0xD8, 0xBA, 0xD9, 0x85, 0xD9, 0x85, + 0xD8, 0xBA, 0xD9, 0x85, 0xD9, 0x8A, 0xD8, 0xBA, + 0xD9, 0x85, 0xD9, 0x89, 0xD9, 0x81, 0xD8, 0xAE, + 0xD9, 0x85, 0xD9, 0x81, 0xD8, 0xAE, 0xD9, 0x85, + 0xD9, 0x82, 0xD9, 0x85, 0xD8, 0xAD, 0xD9, 0x82, + 0xD9, 0x85, 0xD9, 0x85, 0xD9, 0x84, 0xD8, 0xAD, + 0xD9, 0x85, 0xD9, 0x84, 0xD8, 0xAD, 0xD9, 0x8A, + 0xD9, 0x84, 0xD8, 0xAD, 0xD9, 0x89, 0xD9, 0x84, + 0xD8, 0xAC, 0xD8, 0xAC, 0xD9, 0x84, 0xD8, 0xAC, + 0xD8, 0xAC, 0xD9, 0x84, 0xD8, 0xAE, 0xD9, 0x85, + 0xD9, 0x84, 0xD8, 0xAE, 0xD9, 0x85, 0xD9, 0x84, + 0xD9, 0x85, 0xD8, 0xAD, 0xD9, 0x84, 0xD9, 0x85, + 0xD8, 0xAD, 0xD9, 0x85, 0xD8, 0xAD, 0xD8, 0xAC, + 0xD9, 0x85, 0xD8, 0xAD, 0xD9, 0x85, 0xD9, 0x85, + 0xD8, 0xAD, 0xD9, 0x8A, 0xD9, 0x85, 0xD8, 0xAC, + 0xD8, 0xAD, 0xD9, 0x85, 0xD8, 0xAC, 0xD9, 0x85, + 0xD9, 0x85, 0xD8, 0xAE, 0xD8, 0xAC, 0xD9, 0x85, + 0xD8, 0xAE, 0xD9, 0x85, 0xD9, 0x85, 0xD8, 0xAC, + 0xD8, 0xAE, 0xD9, 0x87, 0xD9, 0x85, 0xD8, 0xAC, + 0xD9, 0x87, 0xD9, 0x85, 0xD9, 0x85, 0xD9, 0x86, + 0xD8, 0xAD, 0xD9, 0x85, 0xD9, 0x86, 0xD8, 0xAD, + 0xD9, 0x89, 0xD9, 0x86, 0xD8, 0xAC, 0xD9, 0x85, + 0xD9, 0x86, 0xD8, 0xAC, 0xD9, 0x85, 0xD9, 0x86, + 0xD8, 0xAC, 0xD9, 0x89, 0xD9, 0x86, 0xD9, 0x85, + 0xD9, 0x8A, 0xD9, 0x86, 0xD9, 0x85, 0xD9, 0x89, + 0xD9, 0x8A, 0xD9, 0x85, 0xD9, 0x85, 0xD9, 0x8A, + 0xD9, 0x85, 0xD9, 0x85, 0xD8, 0xA8, 0xD8, 0xAE, + 0xD9, 0x8A, 0xD8, 0xAA, 0xD8, 0xAC, 0xD9, 0x8A, + 0xD8, 0xAA, 0xD8, 0xAC, 0xD9, 0x89, 0xD8, 0xAA, + 0xD8, 0xAE, 0xD9, 0x8A, 0xD8, 0xAA, 0xD8, 0xAE, + 0xD9, 0x89, 0xD8, 0xAA, 0xD9, 0x85, 0xD9, 0x8A, + 0xD8, 0xAA, 0xD9, 0x85, 0xD9, 0x89, 0xD8, 0xAC, + 0xD9, 0x85, 0xD9, 0x8A, 0xD8, 0xAC, 0xD8, 0xAD, + 0xD9, 0x89, 0xD8, 0xAC, 0xD9, 0x85, 0xD9, 0x89, + 0xD8, 0xB3, 0xD8, 0xAE, 0xD9, 0x89, 0xD8, 0xB5, + 0xD8, 0xAD, 0xD9, 0x8A, 0xD8, 0xB4, 0xD8, 0xAD, + 0xD9, 0x8A, 0xD8, 0xB6, 0xD8, 0xAD, 0xD9, 0x8A, + 0xD9, 0x84, 0xD8, 0xAC, 0xD9, 0x8A, 0xD9, 0x84, + 0xD9, 0x85, 0xD9, 0x8A, 0xD9, 0x8A, 0xD8, 0xAD, + 0xD9, 0x8A, 0xD9, 0x8A, 0xD8, 0xAC, 0xD9, 0x8A, + 0xD9, 0x8A, 0xD9, 0x85, 0xD9, 0x8A, 0xD9, 0x85, + 0xD9, 0x85, 0xD9, 0x8A, 0xD9, 0x82, 0xD9, 0x85, + 0xD9, 0x8A, 0xD9, 0x86, 0xD8, 0xAD, 0xD9, 0x8A, + 0xD9, 0x82, 0xD9, 0x85, 0xD8, 0xAD, 0xD9, 0x84, + 0xD8, 0xAD, 0xD9, 0x85, 0xD8, 0xB9, 0xD9, 0x85, + 0xD9, 0x8A, 0xD9, 0x83, 0xD9, 0x85, 0xD9, 0x8A, + 0xD9, 0x86, 0xD8, 0xAC, 0xD8, 0xAD, 0xD9, 0x85, + 0xD8, 0xAE, 0xD9, 0x8A, 0xD9, 0x84, 0xD8, 0xAC, + 0xD9, 0x85, 0xD9, 0x83, 0xD9, 0x85, 0xD9, 0x85, + 0xD9, 0x84, 0xD8, 0xAC, 0xD9, 0x85, 0xD9, 0x86, + 0xD8, 0xAC, 0xD8, 0xAD, 0xD8, 0xAC, 0xD8, 0xAD, + 0xD9, 0x8A, 0xD8, 0xAD, 0xD8, 0xAC, 0xD9, 0x8A, + 0xD9, 0x85, 0xD8, 0xAC, 0xD9, 0x8A, 0xD9, 0x81, + 0xD9, 0x85, 0xD9, 0x8A, 0xD8, 0xA8, 0xD8, 0xAD, + 0xD9, 0x8A, 0xD9, 0x83, 0xD9, 0x85, 0xD9, 0x85, + 0xD8, 0xB9, 0xD8, 0xAC, 0xD9, 0x85, 0xD8, 0xB5, + 0xD9, 0x85, 0xD9, 0x85, 0xD8, 0xB3, 0xD8, 0xAE, + 0xD9, 0x8A, 0xD9, 0x86, 0xD8, 0xAC, 0xD9, 0x8A, + 0xD8, 0xB5, 0xD9, 0x84, 0xDB, 0x92, 0xD9, 0x82, + 0xD9, 0x84, 0xDB, 0x92, 0xD8, 0xA7, 0xD9, 0x84, + 0xD9, 0x84, 0xD9, 0x87, 0xD8, 0xA7, 0xD9, 0x83, + 0xD8, 0xA8, 0xD8, 0xB1, 0xD9, 0x85, 0xD8, 0xAD, + 0xD9, 0x85, 0xD8, 0xAF, 0xD8, 0xB5, 0xD9, 0x84, + 0xD8, 0xB9, 0xD9, 0x85, 0xD8, 0xB1, 0xD8, 0xB3, + 0xD9, 0x88, 0xD9, 0x84, 0xD8, 0xB9, 0xD9, 0x84, + 0xD9, 0x8A, 0xD9, 0x87, 0xD9, 0x88, 0xD8, 0xB3, + 0xD9, 0x84, 0xD9, 0x85, 0xD8, 0xB5, 0xD9, 
0x84, + 0xD9, 0x89, 0xD8, 0xB5, 0xD9, 0x84, 0xD9, 0x89, + 0x20, 0xD8, 0xA7, 0xD9, 0x84, 0xD9, 0x84, 0xD9, + 0x87, 0x20, 0xD8, 0xB9, 0xD9, 0x84, 0xD9, 0x8A, + 0xD9, 0x87, 0x20, 0xD9, 0x88, 0xD8, 0xB3, 0xD9, + 0x84, 0xD9, 0x85, 0xD8, 0xAC, 0xD9, 0x84, 0x20, + 0xD8, 0xAC, 0xD9, 0x84, 0xD8, 0xA7, 0xD9, 0x84, + 0xD9, 0x87, 0xD8, 0xB1, 0xDB, 0x8C, 0xD8, 0xA7, + 0xD9, 0x84, 0x2E, 0x2E, 0xE2, 0x80, 0x94, 0xE2, + 0x80, 0x93, 0x5F, 0x5F, 0x28, 0x29, 0x7B, 0x7D, + 0xE3, 0x80, 0x94, 0xE3, 0x80, 0x95, 0xE3, 0x80, + 0x90, 0xE3, 0x80, 0x91, 0xE3, 0x80, 0x8A, 0xE3, + 0x80, 0x8B, 0xE3, 0x80, 0x88, 0xE3, 0x80, 0x89, + 0xE3, 0x80, 0x8C, 0xE3, 0x80, 0x8D, 0xE3, 0x80, + 0x8E, 0xE3, 0x80, 0x8F, 0x20, 0xCC, 0x85, 0x20, + 0xCC, 0x85, 0x20, 0xCC, 0x85, 0x20, 0xCC, 0x85, + 0x5F, 0x5F, 0x5F, 0x2C, 0xE3, 0x80, 0x81, 0x2E, + 0x3B, 0x3A, 0x3F, 0x21, 0xE2, 0x80, 0x94, 0x28, + 0x29, 0x7B, 0x7D, 0xE3, 0x80, 0x94, 0xE3, 0x80, + 0x95, 0x23, 0x26, 0x2A, 0x2B, 0x2D, 0x3C, 0x3E, + 0x3D, 0x5C, 0x24, 0x25, 0x40, 0x20, 0xD9, 0x8B, + 0xD9, 0x80, 0xD9, 0x8B, 0x20, 0xD9, 0x8C, 0x20, + 0xD9, 0x8D, 0x20, 0xD9, 0x8E, 0xD9, 0x80, 0xD9, + 0x8E, 0x20, 0xD9, 0x8F, 0xD9, 0x80, 0xD9, 0x8F, + 0x20, 0xD9, 0x90, 0xD9, 0x80, 0xD9, 0x90, 0x20, + 0xD9, 0x91, 0xD9, 0x80, 0xD9, 0x91, 0x20, 0xD9, + 0x92, 0xD9, 0x80, 0xD9, 0x92, 0xD8, 0xA1, 0xD8, + 0xA7, 0xD9, 0x93, 0xD8, 0xA7, 0xD9, 0x93, 0xD8, + 0xA7, 0xD9, 0x94, 0xD8, 0xA7, 0xD9, 0x94, 0xD9, + 0x88, 0xD9, 0x94, 0xD9, 0x88, 0xD9, 0x94, 0xD8, + 0xA7, 0xD9, 0x95, 0xD8, 0xA7, 0xD9, 0x95, 0xD9, + 0x8A, 0xD9, 0x94, 0xD9, 0x8A, 0xD9, 0x94, 0xD9, + 0x8A, 0xD9, 0x94, 0xD9, 0x8A, 0xD9, 0x94, 0xD8, + 0xA7, 0xD8, 0xA7, 0xD8, 0xA8, 0xD8, 0xA8, 0xD8, + 0xA8, 0xD8, 0xA8, 0xD8, 0xA9, 0xD8, 0xA9, 0xD8, + 0xAA, 0xD8, 0xAA, 0xD8, 0xAA, 0xD8, 0xAA, 0xD8, + 0xAB, 0xD8, 0xAB, 0xD8, 0xAB, 0xD8, 0xAB, 0xD8, + 0xAC, 0xD8, 0xAC, 0xD8, 0xAC, 0xD8, 0xAC, 0xD8, + 0xAD, 0xD8, 0xAD, 0xD8, 0xAD, 0xD8, 0xAD, 0xD8, + 0xAE, 0xD8, 0xAE, 0xD8, 0xAE, 0xD8, 0xAE, 0xD8, + 0xAF, 0xD8, 0xAF, 0xD8, 0xB0, 0xD8, 0xB0, 0xD8, + 0xB1, 0xD8, 0xB1, 0xD8, 0xB2, 0xD8, 0xB2, 0xD8, + 0xB3, 0xD8, 0xB3, 0xD8, 0xB3, 0xD8, 0xB3, 0xD8, + 0xB4, 0xD8, 0xB4, 0xD8, 0xB4, 0xD8, 0xB4, 0xD8, + 0xB5, 0xD8, 0xB5, 0xD8, 0xB5, 0xD8, 0xB5, 0xD8, + 0xB6, 0xD8, 0xB6, 0xD8, 0xB6, 0xD8, 0xB6, 0xD8, + 0xB7, 0xD8, 0xB7, 0xD8, 0xB7, 0xD8, 0xB7, 0xD8, + 0xB8, 0xD8, 0xB8, 0xD8, 0xB8, 0xD8, 0xB8, 0xD8, + 0xB9, 0xD8, 0xB9, 0xD8, 0xB9, 0xD8, 0xB9, 0xD8, + 0xBA, 0xD8, 0xBA, 0xD8, 0xBA, 0xD8, 0xBA, 0xD9, + 0x81, 0xD9, 0x81, 0xD9, 0x81, 0xD9, 0x81, 0xD9, + 0x82, 0xD9, 0x82, 0xD9, 0x82, 0xD9, 0x82, 0xD9, + 0x83, 0xD9, 0x83, 0xD9, 0x83, 0xD9, 0x83, 0xD9, + 0x84, 0xD9, 0x84, 0xD9, 0x84, 0xD9, 0x84, 0xD9, + 0x85, 0xD9, 0x85, 0xD9, 0x85, 0xD9, 0x85, 0xD9, + 0x86, 0xD9, 0x86, 0xD9, 0x86, 0xD9, 0x86, 0xD9, + 0x87, 0xD9, 0x87, 0xD9, 0x87, 0xD9, 0x87, 0xD9, + 0x88, 0xD9, 0x88, 0xD9, 0x89, 0xD9, 0x89, 0xD9, + 0x8A, 0xD9, 0x8A, 0xD9, 0x8A, 0xD9, 0x8A, 0xD9, + 0x84, 0xD8, 0xA7, 0xD9, 0x93, 0xD9, 0x84, 0xD8, + 0xA7, 0xD9, 0x93, 0xD9, 0x84, 0xD8, 0xA7, 0xD9, + 0x94, 0xD9, 0x84, 0xD8, 0xA7, 0xD9, 0x94, 0xD9, + 0x84, 0xD8, 0xA7, 0xD9, 0x95, 0xD9, 0x84, 0xD8, + 0xA7, 0xD9, 0x95, 0xD9, 0x84, 0xD8, 0xA7, 0xD9, + 0x84, 0xD8, 0xA7, 0x21, 0x22, 0x23, 0x24, 0x25, + 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, + 0x2E, 0x2F, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, + 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, + 0x3E, 0x3F, 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, + 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, + 0x4E, 0x4F, 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, + 0x56, 0x57, 0x58, 0x59, 0x5A, 0x5B, 0x5C, 
0x5D, + 0x5E, 0x5F, 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, + 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, + 0x76, 0x77, 0x78, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, + 0x7E, 0xE2, 0xA6, 0x85, 0xE2, 0xA6, 0x86, 0xE3, + 0x80, 0x82, 0xE3, 0x80, 0x8C, 0xE3, 0x80, 0x8D, + 0xE3, 0x80, 0x81, 0xE3, 0x83, 0xBB, 0xE3, 0x83, + 0xB2, 0xE3, 0x82, 0xA1, 0xE3, 0x82, 0xA3, 0xE3, + 0x82, 0xA5, 0xE3, 0x82, 0xA7, 0xE3, 0x82, 0xA9, + 0xE3, 0x83, 0xA3, 0xE3, 0x83, 0xA5, 0xE3, 0x83, + 0xA7, 0xE3, 0x83, 0x83, 0xE3, 0x83, 0xBC, 0xE3, + 0x82, 0xA2, 0xE3, 0x82, 0xA4, 0xE3, 0x82, 0xA6, + 0xE3, 0x82, 0xA8, 0xE3, 0x82, 0xAA, 0xE3, 0x82, + 0xAB, 0xE3, 0x82, 0xAD, 0xE3, 0x82, 0xAF, 0xE3, + 0x82, 0xB1, 0xE3, 0x82, 0xB3, 0xE3, 0x82, 0xB5, + 0xE3, 0x82, 0xB7, 0xE3, 0x82, 0xB9, 0xE3, 0x82, + 0xBB, 0xE3, 0x82, 0xBD, 0xE3, 0x82, 0xBF, 0xE3, + 0x83, 0x81, 0xE3, 0x83, 0x84, 0xE3, 0x83, 0x86, + 0xE3, 0x83, 0x88, 0xE3, 0x83, 0x8A, 0xE3, 0x83, + 0x8B, 0xE3, 0x83, 0x8C, 0xE3, 0x83, 0x8D, 0xE3, + 0x83, 0x8E, 0xE3, 0x83, 0x8F, 0xE3, 0x83, 0x92, + 0xE3, 0x83, 0x95, 0xE3, 0x83, 0x98, 0xE3, 0x83, + 0x9B, 0xE3, 0x83, 0x9E, 0xE3, 0x83, 0x9F, 0xE3, + 0x83, 0xA0, 0xE3, 0x83, 0xA1, 0xE3, 0x83, 0xA2, + 0xE3, 0x83, 0xA4, 0xE3, 0x83, 0xA6, 0xE3, 0x83, + 0xA8, 0xE3, 0x83, 0xA9, 0xE3, 0x83, 0xAA, 0xE3, + 0x83, 0xAB, 0xE3, 0x83, 0xAC, 0xE3, 0x83, 0xAD, + 0xE3, 0x83, 0xAF, 0xE3, 0x83, 0xB3, 0xE3, 0x82, + 0x99, 0xE3, 0x82, 0x9A, 0xE1, 0x85, 0xA0, 0xE1, + 0x84, 0x80, 0xE1, 0x84, 0x81, 0xE1, 0x86, 0xAA, + 0xE1, 0x84, 0x82, 0xE1, 0x86, 0xAC, 0xE1, 0x86, + 0xAD, 0xE1, 0x84, 0x83, 0xE1, 0x84, 0x84, 0xE1, + 0x84, 0x85, 0xE1, 0x86, 0xB0, 0xE1, 0x86, 0xB1, + 0xE1, 0x86, 0xB2, 0xE1, 0x86, 0xB3, 0xE1, 0x86, + 0xB4, 0xE1, 0x86, 0xB5, 0xE1, 0x84, 0x9A, 0xE1, + 0x84, 0x86, 0xE1, 0x84, 0x87, 0xE1, 0x84, 0x88, + 0xE1, 0x84, 0xA1, 0xE1, 0x84, 0x89, 0xE1, 0x84, + 0x8A, 0xE1, 0x84, 0x8B, 0xE1, 0x84, 0x8C, 0xE1, + 0x84, 0x8D, 0xE1, 0x84, 0x8E, 0xE1, 0x84, 0x8F, + 0xE1, 0x84, 0x90, 0xE1, 0x84, 0x91, 0xE1, 0x84, + 0x92, 0xE1, 0x85, 0xA1, 0xE1, 0x85, 0xA2, 0xE1, + 0x85, 0xA3, 0xE1, 0x85, 0xA4, 0xE1, 0x85, 0xA5, + 0xE1, 0x85, 0xA6, 0xE1, 0x85, 0xA7, 0xE1, 0x85, + 0xA8, 0xE1, 0x85, 0xA9, 0xE1, 0x85, 0xAA, 0xE1, + 0x85, 0xAB, 0xE1, 0x85, 0xAC, 0xE1, 0x85, 0xAD, + 0xE1, 0x85, 0xAE, 0xE1, 0x85, 0xAF, 0xE1, 0x85, + 0xB0, 0xE1, 0x85, 0xB1, 0xE1, 0x85, 0xB2, 0xE1, + 0x85, 0xB3, 0xE1, 0x85, 0xB4, 0xE1, 0x85, 0xB5, + 0xC2, 0xA2, 0xC2, 0xA3, 0xC2, 0xAC, 0x20, 0xCC, + 0x84, 0xC2, 0xA6, 0xC2, 0xA5, 0xE2, 0x82, 0xA9, + 0xE2, 0x94, 0x82, 0xE2, 0x86, 0x90, 0xE2, 0x86, + 0x91, 0xE2, 0x86, 0x92, 0xE2, 0x86, 0x93, 0xE2, + 0x96, 0xA0, 0xE2, 0x97, 0x8B, 0xF6, 0xF0, 0x9D, + 0x85, 0x97, 0xF0, 0x9D, 0x85, 0xA5, 0xF6, 0xF0, + 0x9D, 0x85, 0x98, 0xF0, 0x9D, 0x85, 0xA5, 0xF6, + 0xF0, 0x9D, 0x85, 0x98, 0xF0, 0x9D, 0x85, 0xA5, + 0xF0, 0x9D, 0x85, 0xAE, 0xF6, 0xF0, 0x9D, 0x85, + 0x98, 0xF0, 0x9D, 0x85, 0xA5, 0xF0, 0x9D, 0x85, + 0xAF, 0xF6, 0xF0, 0x9D, 0x85, 0x98, 0xF0, 0x9D, + 0x85, 0xA5, 0xF0, 0x9D, 0x85, 0xB0, 0xF6, 0xF0, + 0x9D, 0x85, 0x98, 0xF0, 0x9D, 0x85, 0xA5, 0xF0, + 0x9D, 0x85, 0xB1, 0xF6, 0xF0, 0x9D, 0x85, 0x98, + 0xF0, 0x9D, 0x85, 0xA5, 0xF0, 0x9D, 0x85, 0xB2, + 0xF6, 0xF0, 0x9D, 0x86, 0xB9, 0xF0, 0x9D, 0x85, + 0xA5, 0xF6, 0xF0, 0x9D, 0x86, 0xBA, 0xF0, 0x9D, + 0x85, 0xA5, 0xF6, 0xF0, 0x9D, 0x86, 0xB9, 0xF0, + 0x9D, 0x85, 0xA5, 0xF0, 0x9D, 0x85, 0xAE, 0xF6, + 0xF0, 0x9D, 0x86, 0xBA, 0xF0, 0x9D, 0x85, 0xA5, + 0xF0, 0x9D, 0x85, 0xAE, 0xF6, 0xF0, 0x9D, 0x86, + 0xB9, 0xF0, 0x9D, 0x85, 0xA5, 0xF0, 0x9D, 0x85, + 0xAF, 0xF6, 0xF0, 0x9D, 0x86, 0xBA, 0xF0, 
0x9D, + 0x85, 0xA5, 0xF0, 0x9D, 0x85, 0xAF, 0x41, 0x42, + 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, + 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, 0x50, 0x51, 0x52, + 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, + 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, + 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, + 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, + 0x79, 0x7A, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, + 0x47, 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, + 0x4F, 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, + 0x57, 0x58, 0x59, 0x5A, 0x61, 0x62, 0x63, 0x64, + 0x65, 0x66, 0x67, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, + 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, + 0x76, 0x77, 0x78, 0x79, 0x7A, 0x41, 0x42, 0x43, + 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B, + 0x4C, 0x4D, 0x4E, 0x4F, 0x50, 0x51, 0x52, 0x53, + 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0x61, + 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, + 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, + 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, + 0x7A, 0x41, 0x43, 0x44, 0x47, 0x4A, 0x4B, 0x4E, + 0x4F, 0x50, 0x51, 0x53, 0x54, 0x55, 0x56, 0x57, + 0x58, 0x59, 0x5A, 0x61, 0x62, 0x63, 0x64, 0x66, + 0x68, 0x69, 0x6A, 0x6B, 0x6D, 0x6E, 0x70, 0x71, + 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, + 0x7A, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, + 0x58, 0x59, 0x5A, 0x61, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, + 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, + 0x76, 0x77, 0x78, 0x79, 0x7A, 0x41, 0x42, 0x44, + 0x45, 0x46, 0x47, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, + 0x4F, 0x50, 0x51, 0x53, 0x54, 0x55, 0x56, 0x57, + 0x58, 0x59, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, + 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, + 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, + 0x77, 0x78, 0x79, 0x7A, 0x41, 0x42, 0x44, 0x45, + 0x46, 0x47, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4F, + 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x61, + 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, + 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, + 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, + 0x7A, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, + 0x58, 0x59, 0x5A, 0x61, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, + 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, + 0x76, 0x77, 0x78, 0x79, 0x7A, 0x41, 0x42, 0x43, + 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B, + 0x4C, 0x4D, 0x4E, 0x4F, 0x50, 0x51, 0x52, 0x53, + 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0x61, + 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, + 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, + 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, + 0x7A, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, + 0x58, 0x59, 0x5A, 0x61, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, + 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, + 0x76, 0x77, 0x78, 0x79, 0x7A, 0x41, 0x42, 0x43, + 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B, + 0x4C, 0x4D, 0x4E, 0x4F, 0x50, 0x51, 0x52, 0x53, + 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0x61, + 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, + 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, + 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, + 0x7A, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 
0x4F, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, + 0x58, 0x59, 0x5A, 0x61, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, + 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, + 0x76, 0x77, 0x78, 0x79, 0x7A, 0x41, 0x42, 0x43, + 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B, + 0x4C, 0x4D, 0x4E, 0x4F, 0x50, 0x51, 0x52, 0x53, + 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0x61, + 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, + 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, + 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, + 0x7A, 0xCE, 0x91, 0xCE, 0x92, 0xCE, 0x93, 0xCE, + 0x94, 0xCE, 0x95, 0xCE, 0x96, 0xCE, 0x97, 0xCE, + 0x98, 0xCE, 0x99, 0xCE, 0x9A, 0xCE, 0x9B, 0xCE, + 0x9C, 0xCE, 0x9D, 0xCE, 0x9E, 0xCE, 0x9F, 0xCE, + 0xA0, 0xCE, 0xA1, 0xCE, 0x98, 0xCE, 0xA3, 0xCE, + 0xA4, 0xCE, 0xA5, 0xCE, 0xA6, 0xCE, 0xA7, 0xCE, + 0xA8, 0xCE, 0xA9, 0xE2, 0x88, 0x87, 0xCE, 0xB1, + 0xCE, 0xB2, 0xCE, 0xB3, 0xCE, 0xB4, 0xCE, 0xB5, + 0xCE, 0xB6, 0xCE, 0xB7, 0xCE, 0xB8, 0xCE, 0xB9, + 0xCE, 0xBA, 0xCE, 0xBB, 0xCE, 0xBC, 0xCE, 0xBD, + 0xCE, 0xBE, 0xCE, 0xBF, 0xCF, 0x80, 0xCF, 0x81, + 0xCF, 0x82, 0xCF, 0x83, 0xCF, 0x84, 0xCF, 0x85, + 0xCF, 0x86, 0xCF, 0x87, 0xCF, 0x88, 0xCF, 0x89, + 0xE2, 0x88, 0x82, 0xCE, 0xB5, 0xCE, 0xB8, 0xCE, + 0xBA, 0xCF, 0x86, 0xCF, 0x81, 0xCF, 0x80, 0xCE, + 0x91, 0xCE, 0x92, 0xCE, 0x93, 0xCE, 0x94, 0xCE, + 0x95, 0xCE, 0x96, 0xCE, 0x97, 0xCE, 0x98, 0xCE, + 0x99, 0xCE, 0x9A, 0xCE, 0x9B, 0xCE, 0x9C, 0xCE, + 0x9D, 0xCE, 0x9E, 0xCE, 0x9F, 0xCE, 0xA0, 0xCE, + 0xA1, 0xCE, 0x98, 0xCE, 0xA3, 0xCE, 0xA4, 0xCE, + 0xA5, 0xCE, 0xA6, 0xCE, 0xA7, 0xCE, 0xA8, 0xCE, + 0xA9, 0xE2, 0x88, 0x87, 0xCE, 0xB1, 0xCE, 0xB2, + 0xCE, 0xB3, 0xCE, 0xB4, 0xCE, 0xB5, 0xCE, 0xB6, + 0xCE, 0xB7, 0xCE, 0xB8, 0xCE, 0xB9, 0xCE, 0xBA, + 0xCE, 0xBB, 0xCE, 0xBC, 0xCE, 0xBD, 0xCE, 0xBE, + 0xCE, 0xBF, 0xCF, 0x80, 0xCF, 0x81, 0xCF, 0x82, + 0xCF, 0x83, 0xCF, 0x84, 0xCF, 0x85, 0xCF, 0x86, + 0xCF, 0x87, 0xCF, 0x88, 0xCF, 0x89, 0xE2, 0x88, + 0x82, 0xCE, 0xB5, 0xCE, 0xB8, 0xCE, 0xBA, 0xCF, + 0x86, 0xCF, 0x81, 0xCF, 0x80, 0xCE, 0x91, 0xCE, + 0x92, 0xCE, 0x93, 0xCE, 0x94, 0xCE, 0x95, 0xCE, + 0x96, 0xCE, 0x97, 0xCE, 0x98, 0xCE, 0x99, 0xCE, + 0x9A, 0xCE, 0x9B, 0xCE, 0x9C, 0xCE, 0x9D, 0xCE, + 0x9E, 0xCE, 0x9F, 0xCE, 0xA0, 0xCE, 0xA1, 0xCE, + 0x98, 0xCE, 0xA3, 0xCE, 0xA4, 0xCE, 0xA5, 0xCE, + 0xA6, 0xCE, 0xA7, 0xCE, 0xA8, 0xCE, 0xA9, 0xE2, + 0x88, 0x87, 0xCE, 0xB1, 0xCE, 0xB2, 0xCE, 0xB3, + 0xCE, 0xB4, 0xCE, 0xB5, 0xCE, 0xB6, 0xCE, 0xB7, + 0xCE, 0xB8, 0xCE, 0xB9, 0xCE, 0xBA, 0xCE, 0xBB, + 0xCE, 0xBC, 0xCE, 0xBD, 0xCE, 0xBE, 0xCE, 0xBF, + 0xCF, 0x80, 0xCF, 0x81, 0xCF, 0x82, 0xCF, 0x83, + 0xCF, 0x84, 0xCF, 0x85, 0xCF, 0x86, 0xCF, 0x87, + 0xCF, 0x88, 0xCF, 0x89, 0xE2, 0x88, 0x82, 0xCE, + 0xB5, 0xCE, 0xB8, 0xCE, 0xBA, 0xCF, 0x86, 0xCF, + 0x81, 0xCF, 0x80, 0xCE, 0x91, 0xCE, 0x92, 0xCE, + 0x93, 0xCE, 0x94, 0xCE, 0x95, 0xCE, 0x96, 0xCE, + 0x97, 0xCE, 0x98, 0xCE, 0x99, 0xCE, 0x9A, 0xCE, + 0x9B, 0xCE, 0x9C, 0xCE, 0x9D, 0xCE, 0x9E, 0xCE, + 0x9F, 0xCE, 0xA0, 0xCE, 0xA1, 0xCE, 0x98, 0xCE, + 0xA3, 0xCE, 0xA4, 0xCE, 0xA5, 0xCE, 0xA6, 0xCE, + 0xA7, 0xCE, 0xA8, 0xCE, 0xA9, 0xE2, 0x88, 0x87, + 0xCE, 0xB1, 0xCE, 0xB2, 0xCE, 0xB3, 0xCE, 0xB4, + 0xCE, 0xB5, 0xCE, 0xB6, 0xCE, 0xB7, 0xCE, 0xB8, + 0xCE, 0xB9, 0xCE, 0xBA, 0xCE, 0xBB, 0xCE, 0xBC, + 0xCE, 0xBD, 0xCE, 0xBE, 0xCE, 0xBF, 0xCF, 0x80, + 0xCF, 0x81, 0xCF, 0x82, 0xCF, 0x83, 0xCF, 0x84, + 0xCF, 0x85, 0xCF, 0x86, 0xCF, 0x87, 0xCF, 0x88, + 0xCF, 0x89, 0xE2, 0x88, 0x82, 0xCE, 0xB5, 0xCE, + 0xB8, 0xCE, 0xBA, 0xCF, 0x86, 0xCF, 0x81, 0xCF, + 0x80, 0xCE, 0x91, 0xCE, 0x92, 0xCE, 0x93, 
0xCE, + 0x94, 0xCE, 0x95, 0xCE, 0x96, 0xCE, 0x97, 0xCE, + 0x98, 0xCE, 0x99, 0xCE, 0x9A, 0xCE, 0x9B, 0xCE, + 0x9C, 0xCE, 0x9D, 0xCE, 0x9E, 0xCE, 0x9F, 0xCE, + 0xA0, 0xCE, 0xA1, 0xCE, 0x98, 0xCE, 0xA3, 0xCE, + 0xA4, 0xCE, 0xA5, 0xCE, 0xA6, 0xCE, 0xA7, 0xCE, + 0xA8, 0xCE, 0xA9, 0xE2, 0x88, 0x87, 0xCE, 0xB1, + 0xCE, 0xB2, 0xCE, 0xB3, 0xCE, 0xB4, 0xCE, 0xB5, + 0xCE, 0xB6, 0xCE, 0xB7, 0xCE, 0xB8, 0xCE, 0xB9, + 0xCE, 0xBA, 0xCE, 0xBB, 0xCE, 0xBC, 0xCE, 0xBD, + 0xCE, 0xBE, 0xCE, 0xBF, 0xCF, 0x80, 0xCF, 0x81, + 0xCF, 0x82, 0xCF, 0x83, 0xCF, 0x84, 0xCF, 0x85, + 0xCF, 0x86, 0xCF, 0x87, 0xCF, 0x88, 0xCF, 0x89, + 0xE2, 0x88, 0x82, 0xCE, 0xB5, 0xCE, 0xB8, 0xCE, + 0xBA, 0xCF, 0x86, 0xCF, 0x81, 0xCF, 0x80, 0x30, + 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, + 0x39, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, + 0x37, 0x38, 0x39, 0x30, 0x31, 0x32, 0x33, 0x34, + 0x35, 0x36, 0x37, 0x38, 0x39, 0x30, 0x31, 0x32, + 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x30, + 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, + 0x39, 0xF6, 0xE4, 0xB8, 0xBD, 0xF6, 0xE4, 0xB8, + 0xB8, 0xF6, 0xE4, 0xB9, 0x81, 0xF6, 0xF0, 0xA0, + 0x84, 0xA2, 0xF6, 0xE4, 0xBD, 0xA0, 0xF6, 0xE4, + 0xBE, 0xAE, 0xF6, 0xE4, 0xBE, 0xBB, 0xF6, 0xE5, + 0x80, 0x82, 0xF6, 0xE5, 0x81, 0xBA, 0xF6, 0xE5, + 0x82, 0x99, 0xF6, 0xE5, 0x83, 0xA7, 0xF6, 0xE5, + 0x83, 0x8F, 0xF6, 0xE3, 0x92, 0x9E, 0xF6, 0xF0, + 0xA0, 0x98, 0xBA, 0xF6, 0xE5, 0x85, 0x8D, 0xF6, + 0xE5, 0x85, 0x94, 0xF6, 0xE5, 0x85, 0xA4, 0xF6, + 0xE5, 0x85, 0xB7, 0xF6, 0xF0, 0xA0, 0x94, 0x9C, + 0xF6, 0xE3, 0x92, 0xB9, 0xF6, 0xE5, 0x85, 0xA7, + 0xF6, 0xE5, 0x86, 0x8D, 0xF6, 0xF0, 0xA0, 0x95, + 0x8B, 0xF6, 0xE5, 0x86, 0x97, 0xF6, 0xE5, 0x86, + 0xA4, 0xF6, 0xE4, 0xBB, 0x8C, 0xF6, 0xE5, 0x86, + 0xAC, 0xF6, 0xE5, 0x86, 0xB5, 0xF6, 0xF0, 0xA9, + 0x87, 0x9F, 0xF6, 0xE5, 0x87, 0xB5, 0xF6, 0xE5, + 0x88, 0x83, 0xF6, 0xE3, 0x93, 0x9F, 0xF6, 0xE5, + 0x88, 0xBB, 0xF6, 0xE5, 0x89, 0x86, 0xF6, 0xE5, + 0x89, 0xB2, 0xF6, 0xE5, 0x89, 0xB7, 0xF6, 0xE3, + 0x94, 0x95, 0xF6, 0xE5, 0x8B, 0x87, 0xF6, 0xE5, + 0x8B, 0x89, 0xF6, 0xE5, 0x8B, 0xA4, 0xF6, 0xE5, + 0x8B, 0xBA, 0xF6, 0xE5, 0x8C, 0x85, 0xF6, 0xE5, + 0x8C, 0x86, 0xF6, 0xE5, 0x8C, 0x97, 0xF6, 0xE5, + 0x8D, 0x89, 0xF6, 0xE5, 0x8D, 0x91, 0xF6, 0xE5, + 0x8D, 0x9A, 0xF6, 0xE5, 0x8D, 0xB3, 0xF6, 0xE5, + 0x8D, 0xBD, 0xF6, 0xE5, 0x8D, 0xBF, 0xF6, 0xE5, + 0x8D, 0xBF, 0xF6, 0xE5, 0x8D, 0xBF, 0xF6, 0xF0, + 0xA0, 0xA8, 0xAC, 0xF6, 0xE7, 0x81, 0xB0, 0xF6, + 0xE5, 0x8F, 0x8A, 0xF6, 0xE5, 0x8F, 0x9F, 0xF6, + 0xF0, 0xA0, 0xAD, 0xA3, 0xF6, 0xE5, 0x8F, 0xAB, + 0xF6, 0xE5, 0x8F, 0xB1, 0xF6, 0xE5, 0x90, 0x86, + 0xF6, 0xE5, 0x92, 0x9E, 0xF6, 0xE5, 0x90, 0xB8, + 0xF6, 0xE5, 0x91, 0x88, 0xF6, 0xE5, 0x91, 0xA8, + 0xF6, 0xE5, 0x92, 0xA2, 0xF6, 0xE5, 0x93, 0xB6, + 0xF6, 0xE5, 0x94, 0x90, 0xF6, 0xE5, 0x95, 0x93, + 0xF6, 0xE5, 0x95, 0xA3, 0xF6, 0xE5, 0x96, 0x84, + 0xF6, 0xE5, 0x96, 0x84, 0xF6, 0xE5, 0x96, 0x99, + 0xF6, 0xE5, 0x96, 0xAB, 0xF6, 0xE5, 0x96, 0xB3, + 0xF6, 0xE5, 0x97, 0x82, 0xF6, 0xE5, 0x9C, 0x96, + 0xF6, 0xE5, 0x98, 0x86, 0xF6, 0xE5, 0x9C, 0x97, + 0xF6, 0xE5, 0x99, 0x91, 0xF6, 0xE5, 0x99, 0xB4, + 0xF6, 0xE5, 0x88, 0x87, 0xF6, 0xE5, 0xA3, 0xAE, + 0xF6, 0xE5, 0x9F, 0x8E, 0xF6, 0xE5, 0x9F, 0xB4, + 0xF6, 0xE5, 0xA0, 0x8D, 0xF6, 0xE5, 0x9E, 0x8B, + 0xF6, 0xE5, 0xA0, 0xB2, 0xF6, 0xE5, 0xA0, 0xB1, + 0xF6, 0xE5, 0xA2, 0xAC, 0xF6, 0xF0, 0xA1, 0x93, + 0xA4, 0xF6, 0xE5, 0xA3, 0xB2, 0xF6, 0xE5, 0xA3, + 0xB7, 0xF6, 0xE5, 0xA4, 0x86, 0xF6, 0xE5, 0xA4, + 0x9A, 0xF6, 0xE5, 0xA4, 0xA2, 0xF6, 0xE5, 0xA5, + 0xA2, 0xF6, 0xF0, 0xA1, 0x9A, 0xA8, 0xF6, 0xF0, + 0xA1, 0x9B, 0xAA, 0xF6, 0xE5, 0xA7, 0xAC, 
0xF6, + 0xE5, 0xA8, 0x9B, 0xF6, 0xE5, 0xA8, 0xA7, 0xF6, + 0xE5, 0xA7, 0x98, 0xF6, 0xE5, 0xA9, 0xA6, 0xF6, + 0xE3, 0x9B, 0xAE, 0xF6, 0xE3, 0x9B, 0xBC, 0xF6, + 0xE5, 0xAC, 0x88, 0xF6, 0xE5, 0xAC, 0xBE, 0xF6, + 0xE5, 0xAC, 0xBE, 0xF6, 0xF0, 0xA1, 0xA7, 0x88, + 0xF6, 0xE5, 0xAF, 0x83, 0xF6, 0xE5, 0xAF, 0x98, + 0xF6, 0xE5, 0xAF, 0xA7, 0xF6, 0xE5, 0xAF, 0xB3, + 0xF6, 0xF0, 0xA1, 0xAC, 0x98, 0xF6, 0xE5, 0xAF, + 0xBF, 0xF6, 0xE5, 0xB0, 0x86, 0xF6, 0xE5, 0xBD, + 0x93, 0xF6, 0xE5, 0xB0, 0xA2, 0xF6, 0xE3, 0x9E, + 0x81, 0xF6, 0xE5, 0xB1, 0xA0, 0xF6, 0xE5, 0xB1, + 0xAE, 0xF6, 0xE5, 0xB3, 0x80, 0xF6, 0xE5, 0xB2, + 0x8D, 0xF6, 0xF0, 0xA1, 0xB7, 0xA4, 0xF6, 0xE5, + 0xB5, 0x83, 0xF6, 0xF0, 0xA1, 0xB7, 0xA6, 0xF6, + 0xE5, 0xB5, 0xAE, 0xF6, 0xE5, 0xB5, 0xAB, 0xF6, + 0xE5, 0xB5, 0xBC, 0xF6, 0xE5, 0xB7, 0xA1, 0xF6, + 0xE5, 0xB7, 0xA2, 0xF6, 0xE3, 0xA0, 0xAF, 0xF6, + 0xE5, 0xB7, 0xBD, 0xF6, 0xE5, 0xB8, 0xA8, 0xF6, + 0xE5, 0xB8, 0xBD, 0xF6, 0xE5, 0xB9, 0xA9, 0xF6, + 0xE3, 0xA1, 0xA2, 0xF6, 0xF0, 0xA2, 0x86, 0x83, + 0xF6, 0xE3, 0xA1, 0xBC, 0xF6, 0xE5, 0xBA, 0xB0, + 0xF6, 0xE5, 0xBA, 0xB3, 0xF6, 0xE5, 0xBA, 0xB6, + 0xF6, 0xE5, 0xBB, 0x8A, 0xF6, 0xF0, 0xAA, 0x8E, + 0x92, 0xF6, 0xE5, 0xBB, 0xBE, 0xF6, 0xF0, 0xA2, + 0x8C, 0xB1, 0xF6, 0xF0, 0xA2, 0x8C, 0xB1, 0xF6, + 0xE8, 0x88, 0x81, 0xF6, 0xE5, 0xBC, 0xA2, 0xF6, + 0xE5, 0xBC, 0xA2, 0xF6, 0xE3, 0xA3, 0x87, 0xF6, + 0xF0, 0xA3, 0x8A, 0xB8, 0xF6, 0xF0, 0xA6, 0x87, + 0x9A, 0xF6, 0xE5, 0xBD, 0xA2, 0xF6, 0xE5, 0xBD, + 0xAB, 0xF6, 0xE3, 0xA3, 0xA3, 0xF6, 0xE5, 0xBE, + 0x9A, 0xF6, 0xE5, 0xBF, 0x8D, 0xF6, 0xE5, 0xBF, + 0x97, 0xF6, 0xE5, 0xBF, 0xB9, 0xF6, 0xE6, 0x82, + 0x81, 0xF6, 0xE3, 0xA4, 0xBA, 0xF6, 0xE3, 0xA4, + 0x9C, 0xF6, 0xE6, 0x82, 0x94, 0xF6, 0xF0, 0xA2, + 0x9B, 0x94, 0xF6, 0xE6, 0x83, 0x87, 0xF6, 0xE6, + 0x85, 0x88, 0xF6, 0xE6, 0x85, 0x8C, 0xF6, 0xE6, + 0x85, 0x8E, 0xF6, 0xE6, 0x85, 0x8C, 0xF6, 0xE6, + 0x85, 0xBA, 0xF6, 0xE6, 0x86, 0x8E, 0xF6, 0xE6, + 0x86, 0xB2, 0xF6, 0xE6, 0x86, 0xA4, 0xF6, 0xE6, + 0x86, 0xAF, 0xF6, 0xE6, 0x87, 0x9E, 0xF6, 0xE6, + 0x87, 0xB2, 0xF6, 0xE6, 0x87, 0xB6, 0xF6, 0xE6, + 0x88, 0x90, 0xF6, 0xE6, 0x88, 0x9B, 0xF6, 0xE6, + 0x89, 0x9D, 0xF6, 0xE6, 0x8A, 0xB1, 0xF6, 0xE6, + 0x8B, 0x94, 0xF6, 0xE6, 0x8D, 0x90, 0xF6, 0xF0, + 0xA2, 0xAC, 0x8C, 0xF6, 0xE6, 0x8C, 0xBD, 0xF6, + 0xE6, 0x8B, 0xBC, 0xF6, 0xE6, 0x8D, 0xA8, 0xF6, + 0xE6, 0x8E, 0x83, 0xF6, 0xE6, 0x8F, 0xA4, 0xF6, + 0xF0, 0xA2, 0xAF, 0xB1, 0xF6, 0xE6, 0x90, 0xA2, + 0xF6, 0xE6, 0x8F, 0x85, 0xF6, 0xE6, 0x8E, 0xA9, + 0xF6, 0xE3, 0xA8, 0xAE, 0xF6, 0xE6, 0x91, 0xA9, + 0xF6, 0xE6, 0x91, 0xBE, 0xF6, 0xE6, 0x92, 0x9D, + 0xF6, 0xE6, 0x91, 0xB7, 0xF6, 0xE3, 0xA9, 0xAC, + 0xF6, 0xE6, 0x95, 0x8F, 0xF6, 0xE6, 0x95, 0xAC, + 0xF6, 0xF0, 0xA3, 0x80, 0x8A, 0xF6, 0xE6, 0x97, + 0xA3, 0xF6, 0xE6, 0x9B, 0xB8, 0xF6, 0xE6, 0x99, + 0x89, 0xF6, 0xE3, 0xAC, 0x99, 0xF6, 0xE6, 0x9A, + 0x91, 0xF6, 0xE3, 0xAC, 0x88, 0xF6, 0xE3, 0xAB, + 0xA4, 0xF6, 0xE5, 0x86, 0x92, 0xF6, 0xE5, 0x86, + 0x95, 0xF6, 0xE6, 0x9C, 0x80, 0xF6, 0xE6, 0x9A, + 0x9C, 0xF6, 0xE8, 0x82, 0xAD, 0xF6, 0xE4, 0x8F, + 0x99, 0xF6, 0xE6, 0x9C, 0x97, 0xF6, 0xE6, 0x9C, + 0x9B, 0xF6, 0xE6, 0x9C, 0xA1, 0xF6, 0xE6, 0x9D, + 0x9E, 0xF6, 0xE6, 0x9D, 0x93, 0xF6, 0xF0, 0xA3, + 0x8F, 0x83, 0xF6, 0xE3, 0xAD, 0x89, 0xF6, 0xE6, + 0x9F, 0xBA, 0xF6, 0xE6, 0x9E, 0x85, 0xF6, 0xE6, + 0xA1, 0x92, 0xF6, 0xE6, 0xA2, 0x85, 0xF6, 0xF0, + 0xA3, 0x91, 0xAD, 0xF6, 0xE6, 0xA2, 0x8E, 0xF6, + 0xE6, 0xA0, 0x9F, 0xF6, 0xE6, 0xA4, 0x94, 0xF6, + 0xE3, 0xAE, 0x9D, 0xF6, 0xE6, 0xA5, 0x82, 0xF6, + 0xE6, 0xA6, 0xA3, 0xF6, 0xE6, 0xA7, 0xAA, 0xF6, + 0xE6, 0xAA, 0xA8, 0xF6, 0xF0, 0xA3, 0x9A, 
0xA3, + 0xF6, 0xE6, 0xAB, 0x9B, 0xF6, 0xE3, 0xB0, 0x98, + 0xF6, 0xE6, 0xAC, 0xA1, 0xF6, 0xF0, 0xA3, 0xA2, + 0xA7, 0xF6, 0xE6, 0xAD, 0x94, 0xF6, 0xE3, 0xB1, + 0x8E, 0xF6, 0xE6, 0xAD, 0xB2, 0xF6, 0xE6, 0xAE, + 0x9F, 0xF6, 0xE6, 0xAE, 0xBA, 0xF6, 0xE6, 0xAE, + 0xBB, 0xF6, 0xF0, 0xA3, 0xAA, 0x8D, 0xF6, 0xF0, + 0xA1, 0xB4, 0x8B, 0xF6, 0xF0, 0xA3, 0xAB, 0xBA, + 0xF6, 0xE6, 0xB1, 0x8E, 0xF6, 0xF0, 0xA3, 0xB2, + 0xBC, 0xF6, 0xE6, 0xB2, 0xBF, 0xF6, 0xE6, 0xB3, + 0x8D, 0xF6, 0xE6, 0xB1, 0xA7, 0xF6, 0xE6, 0xB4, + 0x96, 0xF6, 0xE6, 0xB4, 0xBE, 0xF6, 0xE6, 0xB5, + 0xB7, 0xF6, 0xE6, 0xB5, 0x81, 0xF6, 0xE6, 0xB5, + 0xA9, 0xF6, 0xE6, 0xB5, 0xB8, 0xF6, 0xE6, 0xB6, + 0x85, 0xF6, 0xF0, 0xA3, 0xB4, 0x9E, 0xF6, 0xE6, + 0xB4, 0xB4, 0xF6, 0xE6, 0xB8, 0xAF, 0xF6, 0xE6, + 0xB9, 0xAE, 0xF6, 0xE3, 0xB4, 0xB3, 0xF6, 0xE6, + 0xBB, 0x8B, 0xF6, 0xE6, 0xBB, 0x87, 0xF6, 0xF0, + 0xA3, 0xBB, 0x91, 0xF6, 0xE6, 0xB7, 0xB9, 0xF6, + 0xE6, 0xBD, 0xAE, 0xF6, 0xF0, 0xA3, 0xBD, 0x9E, + 0xF6, 0xF0, 0xA3, 0xBE, 0x8E, 0xF6, 0xE6, 0xBF, + 0x86, 0xF6, 0xE7, 0x80, 0xB9, 0xF6, 0xE7, 0x80, + 0x9E, 0xF6, 0xE7, 0x80, 0x9B, 0xF6, 0xE3, 0xB6, + 0x96, 0xF6, 0xE7, 0x81, 0x8A, 0xF6, 0xE7, 0x81, + 0xBD, 0xF6, 0xE7, 0x81, 0xB7, 0xF6, 0xE7, 0x82, + 0xAD, 0xF6, 0xF0, 0xA0, 0x94, 0xA5, 0xF6, 0xE7, + 0x85, 0x85, 0xF6, 0xF0, 0xA4, 0x89, 0xA3, 0xF6, + 0xE7, 0x86, 0x9C, 0xF6, 0xF0, 0xA4, 0x8E, 0xAB, + 0xF6, 0xE7, 0x88, 0xA8, 0xF6, 0xE7, 0x88, 0xB5, + 0xF6, 0xE7, 0x89, 0x90, 0xF6, 0xF0, 0xA4, 0x98, + 0x88, 0xF6, 0xE7, 0x8A, 0x80, 0xF6, 0xE7, 0x8A, + 0x95, 0xF6, 0xF0, 0xA4, 0x9C, 0xB5, 0xF6, 0xF0, + 0xA4, 0xA0, 0x94, 0xF6, 0xE7, 0x8D, 0xBA, 0xF6, + 0xE7, 0x8E, 0x8B, 0xF6, 0xE3, 0xBA, 0xAC, 0xF6, + 0xE7, 0x8E, 0xA5, 0xF6, 0xE3, 0xBA, 0xB8, 0xF6, + 0xE3, 0xBA, 0xB8, 0xF6, 0xE7, 0x91, 0x87, 0xF6, + 0xE7, 0x91, 0x9C, 0xF6, 0xE7, 0x91, 0xB1, 0xF6, + 0xE7, 0x92, 0x85, 0xF6, 0xE7, 0x93, 0x8A, 0xF6, + 0xE3, 0xBC, 0x9B, 0xF6, 0xE7, 0x94, 0xA4, 0xF6, + 0xF0, 0xA4, 0xB0, 0xB6, 0xF6, 0xE7, 0x94, 0xBE, + 0xF6, 0xF0, 0xA4, 0xB2, 0x92, 0xF6, 0xE7, 0x95, + 0xB0, 0xF6, 0xF0, 0xA2, 0x86, 0x9F, 0xF6, 0xE7, + 0x98, 0x90, 0xF6, 0xF0, 0xA4, 0xBE, 0xA1, 0xF6, + 0xF0, 0xA4, 0xBE, 0xB8, 0xF6, 0xF0, 0xA5, 0x81, + 0x84, 0xF6, 0xE3, 0xBF, 0xBC, 0xF6, 0xE4, 0x80, + 0x88, 0xF6, 0xE7, 0x9B, 0xB4, 0xF6, 0xF0, 0xA5, + 0x83, 0xB3, 0xF6, 0xF0, 0xA5, 0x83, 0xB2, 0xF6, + 0xF0, 0xA5, 0x84, 0x99, 0xF6, 0xF0, 0xA5, 0x84, + 0xB3, 0xF6, 0xE7, 0x9C, 0x9E, 0xF6, 0xE7, 0x9C, + 0x9F, 0xF6, 0xE7, 0x9C, 0x9F, 0xF6, 0xE7, 0x9D, + 0x8A, 0xF6, 0xE4, 0x80, 0xB9, 0xF6, 0xE7, 0x9E, + 0x8B, 0xF6, 0xE4, 0x81, 0x86, 0xF6, 0xE4, 0x82, + 0x96, 0xF6, 0xF0, 0xA5, 0x90, 0x9D, 0xF6, 0xE7, + 0xA1, 0x8E, 0xF6, 0xE7, 0xA2, 0x8C, 0xF6, 0xE7, + 0xA3, 0x8C, 0xF6, 0xE4, 0x83, 0xA3, 0xF6, 0xF0, + 0xA5, 0x98, 0xA6, 0xF6, 0xE7, 0xA5, 0x96, 0xF6, + 0xF0, 0xA5, 0x9A, 0x9A, 0xF6, 0xF0, 0xA5, 0x9B, + 0x85, 0xF6, 0xE7, 0xA6, 0x8F, 0xF6, 0xE7, 0xA7, + 0xAB, 0xF6, 0xE4, 0x84, 0xAF, 0xF6, 0xE7, 0xA9, + 0x80, 0xF6, 0xE7, 0xA9, 0x8A, 0xF6, 0xE7, 0xA9, + 0x8F, 0xF6, 0xF0, 0xA5, 0xA5, 0xBC, 0xF6, 0xF0, + 0xA5, 0xAA, 0xA7, 0xF6, 0xF0, 0xA5, 0xAA, 0xA7, + 0xF6, 0xE7, 0xAB, 0xAE, 0xF6, 0xE4, 0x88, 0x82, + 0xF6, 0xF0, 0xA5, 0xAE, 0xAB, 0xF6, 0xE7, 0xAF, + 0x86, 0xF6, 0xE7, 0xAF, 0x89, 0xF6, 0xE4, 0x88, + 0xA7, 0xF6, 0xF0, 0xA5, 0xB2, 0x80, 0xF6, 0xE7, + 0xB3, 0x92, 0xF6, 0xE4, 0x8A, 0xA0, 0xF6, 0xE7, + 0xB3, 0xA8, 0xF6, 0xE7, 0xB3, 0xA3, 0xF6, 0xE7, + 0xB4, 0x80, 0xF6, 0xF0, 0xA5, 0xBE, 0x86, 0xF6, + 0xE7, 0xB5, 0xA3, 0xF6, 0xE4, 0x8C, 0x81, 0xF6, + 0xE7, 0xB7, 0x87, 0xF6, 0xE7, 0xB8, 0x82, 0xF6, + 0xE7, 0xB9, 0x85, 0xF6, 0xE4, 0x8C, 0xB4, 
0xF6, + 0xF0, 0xA6, 0x88, 0xA8, 0xF6, 0xF0, 0xA6, 0x89, + 0x87, 0xF6, 0xE4, 0x8D, 0x99, 0xF6, 0xF0, 0xA6, + 0x8B, 0x99, 0xF6, 0xE7, 0xBD, 0xBA, 0xF6, 0xF0, + 0xA6, 0x8C, 0xBE, 0xF6, 0xE7, 0xBE, 0x95, 0xF6, + 0xE7, 0xBF, 0xBA, 0xF6, 0xE8, 0x80, 0x85, 0xF6, + 0xF0, 0xA6, 0x93, 0x9A, 0xF6, 0xF0, 0xA6, 0x94, + 0xA3, 0xF6, 0xE8, 0x81, 0xA0, 0xF6, 0xF0, 0xA6, + 0x96, 0xA8, 0xF6, 0xE8, 0x81, 0xB0, 0xF6, 0xF0, + 0xA3, 0x8D, 0x9F, 0xF6, 0xE4, 0x8F, 0x95, 0xF6, + 0xE8, 0x82, 0xB2, 0xF6, 0xE8, 0x84, 0x83, 0xF6, + 0xE4, 0x90, 0x8B, 0xF6, 0xE8, 0x84, 0xBE, 0xF6, + 0xE5, 0xAA, 0xB5, 0xF6, 0xF0, 0xA6, 0x9E, 0xA7, + 0xF6, 0xF0, 0xA6, 0x9E, 0xB5, 0xF6, 0xF0, 0xA3, + 0x8E, 0x93, 0xF6, 0xF0, 0xA3, 0x8E, 0x9C, 0xF6, + 0xE8, 0x88, 0x81, 0xF6, 0xE8, 0x88, 0x84, 0xF6, + 0xE8, 0xBE, 0x9E, 0xF6, 0xE4, 0x91, 0xAB, 0xF6, + 0xE8, 0x8A, 0x91, 0xF6, 0xE8, 0x8A, 0x8B, 0xF6, + 0xE8, 0x8A, 0x9D, 0xF6, 0xE5, 0x8A, 0xB3, 0xF6, + 0xE8, 0x8A, 0xB1, 0xF6, 0xE8, 0x8A, 0xB3, 0xF6, + 0xE8, 0x8A, 0xBD, 0xF6, 0xE8, 0x8B, 0xA6, 0xF6, + 0xF0, 0xA6, 0xAC, 0xBC, 0xF6, 0xE8, 0x8B, 0xA5, + 0xF6, 0xE8, 0x8C, 0x9D, 0xF6, 0xE8, 0x8D, 0xA3, + 0xF6, 0xE8, 0x8E, 0xAD, 0xF6, 0xE8, 0x8C, 0xA3, + 0xF6, 0xE8, 0x8E, 0xBD, 0xF6, 0xE8, 0x8F, 0xA7, + 0xF6, 0xE8, 0x91, 0x97, 0xF6, 0xE8, 0x8D, 0x93, + 0xF6, 0xE8, 0x8F, 0x8A, 0xF6, 0xE8, 0x8F, 0x8C, + 0xF6, 0xE8, 0x8F, 0x9C, 0xF6, 0xF0, 0xA6, 0xB0, + 0xB6, 0xF6, 0xF0, 0xA6, 0xB5, 0xAB, 0xF6, 0xF0, + 0xA6, 0xB3, 0x95, 0xF6, 0xE4, 0x94, 0xAB, 0xF6, + 0xE8, 0x93, 0xB1, 0xF6, 0xE8, 0x93, 0xB3, 0xF6, + 0xE8, 0x94, 0x96, 0xF6, 0xF0, 0xA7, 0x8F, 0x8A, + 0xF6, 0xE8, 0x95, 0xA4, 0xF6, 0xF0, 0xA6, 0xBC, + 0xAC, 0xF6, 0xE4, 0x95, 0x9D, 0xF6, 0xE4, 0x95, + 0xA1, 0xF6, 0xF0, 0xA6, 0xBE, 0xB1, 0xF6, 0xF0, + 0xA7, 0x83, 0x92, 0xF6, 0xE4, 0x95, 0xAB, 0xF6, + 0xE8, 0x99, 0x90, 0xF6, 0xE8, 0x99, 0x9C, 0xF6, + 0xE8, 0x99, 0xA7, 0xF6, 0xE8, 0x99, 0xA9, 0xF6, + 0xE8, 0x9A, 0xA9, 0xF6, 0xE8, 0x9A, 0x88, 0xF6, + 0xE8, 0x9C, 0x8E, 0xF6, 0xE8, 0x9B, 0xA2, 0xF6, + 0xE8, 0x9D, 0xB9, 0xF6, 0xE8, 0x9C, 0xA8, 0xF6, + 0xE8, 0x9D, 0xAB, 0xF6, 0xE8, 0x9E, 0x86, 0xF6, + 0xE4, 0x97, 0x97, 0xF6, 0xE8, 0x9F, 0xA1, 0xF6, + 0xE8, 0xA0, 0x81, 0xF6, 0xE4, 0x97, 0xB9, 0xF6, + 0xE8, 0xA1, 0xA0, 0xF6, 0xE8, 0xA1, 0xA3, 0xF6, + 0xF0, 0xA7, 0x99, 0xA7, 0xF6, 0xE8, 0xA3, 0x97, + 0xF6, 0xE8, 0xA3, 0x9E, 0xF6, 0xE4, 0x98, 0xB5, + 0xF6, 0xE8, 0xA3, 0xBA, 0xF6, 0xE3, 0x92, 0xBB, + 0xF6, 0xF0, 0xA7, 0xA2, 0xAE, 0xF6, 0xF0, 0xA7, + 0xA5, 0xA6, 0xF6, 0xE4, 0x9A, 0xBE, 0xF6, 0xE4, + 0x9B, 0x87, 0xF6, 0xE8, 0xAA, 0xA0, 0xF6, 0xE8, + 0xAB, 0xAD, 0xF6, 0xE8, 0xAE, 0x8A, 0xF6, 0xE8, + 0xB1, 0x95, 0xF6, 0xF0, 0xA7, 0xB2, 0xA8, 0xF6, + 0xE8, 0xB2, 0xAB, 0xF6, 0xE8, 0xB3, 0x81, 0xF6, + 0xE8, 0xB4, 0x9B, 0xF6, 0xE8, 0xB5, 0xB7, 0xF6, + 0xF0, 0xA7, 0xBC, 0xAF, 0xF6, 0xF0, 0xA0, 0xA0, + 0x84, 0xF6, 0xE8, 0xB7, 0x8B, 0xF6, 0xE8, 0xB6, + 0xBC, 0xF6, 0xE8, 0xB7, 0xB0, 0xF6, 0xF0, 0xA0, + 0xA3, 0x9E, 0xF6, 0xE8, 0xBB, 0x94, 0xF6, 0xE8, + 0xBC, 0xB8, 0xF6, 0xF0, 0xA8, 0x97, 0x92, 0xF6, + 0xF0, 0xA8, 0x97, 0xAD, 0xF6, 0xE9, 0x82, 0x94, + 0xF6, 0xE9, 0x83, 0xB1, 0xF6, 0xE9, 0x84, 0x91, + 0xF6, 0xF0, 0xA8, 0x9C, 0xAE, 0xF6, 0xE9, 0x84, + 0x9B, 0xF6, 0xE9, 0x88, 0xB8, 0xF6, 0xE9, 0x8B, + 0x97, 0xF6, 0xE9, 0x8B, 0x98, 0xF6, 0xE9, 0x89, + 0xBC, 0xF6, 0xE9, 0x8F, 0xB9, 0xF6, 0xE9, 0x90, + 0x95, 0xF6, 0xF0, 0xA8, 0xAF, 0xBA, 0xF6, 0xE9, + 0x96, 0x8B, 0xF6, 0xE4, 0xA6, 0x95, 0xF6, 0xE9, + 0x96, 0xB7, 0xF6, 0xF0, 0xA8, 0xB5, 0xB7, 0xF6, + 0xE4, 0xA7, 0xA6, 0xF6, 0xE9, 0x9B, 0x83, 0xF6, + 0xE5, 0xB6, 0xB2, 0xF6, 0xE9, 0x9C, 0xA3, 0xF6, + 0xF0, 0xA9, 0x85, 0x85, 0xF6, 0xF0, 0xA9, 
0x88, + 0x9A, 0xF6, 0xE4, 0xA9, 0xAE, 0xF6, 0xE4, 0xA9, + 0xB6, 0xF6, 0xE9, 0x9F, 0xA0, 0xF6, 0xF0, 0xA9, + 0x90, 0x8A, 0xF6, 0xE4, 0xAA, 0xB2, 0xF6, 0xF0, + 0xA9, 0x92, 0x96, 0xF6, 0xE9, 0xA0, 0x8B, 0xF6, + 0xE9, 0xA0, 0x8B, 0xF6, 0xE9, 0xA0, 0xA9, 0xF6, + 0xF0, 0xA9, 0x96, 0xB6, 0xF6, 0xE9, 0xA3, 0xA2, + 0xF6, 0xE4, 0xAC, 0xB3, 0xF6, 0xE9, 0xA4, 0xA9, + 0xF6, 0xE9, 0xA6, 0xA7, 0xF6, 0xE9, 0xA7, 0x82, + 0xF6, 0xE9, 0xA7, 0xBE, 0xF6, 0xE4, 0xAF, 0x8E, + 0xF6, 0xF0, 0xA9, 0xAC, 0xB0, 0xF6, 0xE9, 0xAC, + 0x92, 0xF6, 0xE9, 0xB1, 0x80, 0xF6, 0xE9, 0xB3, + 0xBD, 0xF6, 0xE4, 0xB3, 0x8E, 0xF6, 0xE4, 0xB3, + 0xAD, 0xF6, 0xE9, 0xB5, 0xA7, 0xF6, 0xF0, 0xAA, + 0x83, 0x8E, 0xF6, 0xE4, 0xB3, 0xB8, 0xF6, 0xF0, + 0xAA, 0x84, 0x85, 0xF6, 0xF0, 0xAA, 0x88, 0x8E, + 0xF6, 0xF0, 0xAA, 0x8A, 0x91, 0xF6, 0xE9, 0xBA, + 0xBB, 0xF6, 0xE4, 0xB5, 0x96, 0xF6, 0xE9, 0xBB, + 0xB9, 0xF6, 0xE9, 0xBB, 0xBE, 0xF6, 0xE9, 0xBC, + 0x85, 0xF6, 0xE9, 0xBC, 0x8F, 0xF6, 0xE9, 0xBC, + 0x96, 0xF6, 0xE9, 0xBC, 0xBB, 0xF6, 0xF0, 0xAA, + 0x98, 0x80, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 
0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, + }, + { + 0x20, 0x20, 0xCC, 0x88, 0x61, 0x20, 0xCC, 0x84, + 0x32, 0x33, 0x20, 0xCC, 0x81, 0xCE, 0xBC, 0x20, + 0xCC, 0xA7, 0x31, 0x6F, 0x31, 0xE2, 0x81, 0x84, + 0x34, 0x31, 0xE2, 0x81, 0x84, 0x32, 0x33, 0xE2, + 0x81, 0x84, 0x34, 0xF6, 0x41, 0xCC, 0x80, 0xF6, + 0x41, 0xCC, 0x81, 0xF6, 0x41, 0xCC, 0x82, 0xF6, + 0x41, 0xCC, 0x83, 0xF6, 0x41, 0xCC, 0x88, 0xF6, + 0x41, 0xCC, 0x8A, 0xF6, 0x43, 0xCC, 0xA7, 0xF6, + 0x45, 0xCC, 0x80, 0xF6, 0x45, 0xCC, 0x81, 0xF6, + 0x45, 0xCC, 0x82, 0xF6, 0x45, 0xCC, 0x88, 0xF6, + 0x49, 0xCC, 0x80, 0xF6, 0x49, 0xCC, 0x81, 0xF6, + 0x49, 0xCC, 0x82, 0xF6, 0x49, 0xCC, 0x88, 0xF6, + 0x4E, 0xCC, 0x83, 0xF6, 0x4F, 0xCC, 0x80, 0xF6, + 0x4F, 0xCC, 0x81, 0xF6, 0x4F, 0xCC, 0x82, 0xF6, + 0x4F, 0xCC, 0x83, 0xF6, 0x4F, 0xCC, 0x88, 0xF6, + 0x55, 0xCC, 0x80, 0xF6, 0x55, 0xCC, 0x81, 0xF6, + 0x55, 0xCC, 0x82, 0xF6, 0x55, 0xCC, 0x88, 0xF6, + 0x59, 0xCC, 0x81, 0xF6, 0x61, 0xCC, 0x80, 0xF6, + 0x61, 0xCC, 0x81, 0xF6, 0x61, 0xCC, 0x82, 0xF6, + 0x61, 0xCC, 0x83, 0xF6, 0x61, 0xCC, 0x88, 0xF6, + 0x61, 0xCC, 0x8A, 0xF6, 0x63, 0xCC, 0xA7, 0xF6, + 0x65, 0xCC, 0x80, 0xF6, 0x65, 0xCC, 0x81, 0xF6, + 0x65, 0xCC, 0x82, 0xF6, 0x65, 0xCC, 0x88, 0xF6, + 0x69, 0xCC, 0x80, 0xF6, 0x69, 0xCC, 0x81, 0xF6, + 0x69, 0xCC, 0x82, 0xF6, 0x69, 0xCC, 0x88, 0xF6, + 0x6E, 0xCC, 0x83, 0xF6, 0x6F, 0xCC, 0x80, 0xF6, + 0x6F, 0xCC, 0x81, 0xF6, 0x6F, 0xCC, 0x82, 0xF6, + 0x6F, 0xCC, 0x83, 0xF6, 0x6F, 0xCC, 0x88, 0xF6, + 0x75, 0xCC, 0x80, 0xF6, 0x75, 0xCC, 0x81, 0xF6, + 0x75, 0xCC, 0x82, 0xF6, 0x75, 0xCC, 0x88, 0xF6, + 0x79, 0xCC, 0x81, 0xF6, 0x79, 0xCC, 0x88, 0xF6, + 0x41, 0xCC, 0x84, 0xF6, 0x61, 0xCC, 0x84, 0xF6, + 0x41, 0xCC, 0x86, 0xF6, 0x61, 0xCC, 0x86, 0xF6, + 0x41, 0xCC, 0xA8, 0xF6, 0x61, 0xCC, 0xA8, 0xF6, + 0x43, 0xCC, 0x81, 0xF6, 0x63, 0xCC, 0x81, 0xF6, + 0x43, 0xCC, 0x82, 0xF6, 0x63, 0xCC, 0x82, 0xF6, + 0x43, 0xCC, 0x87, 0xF6, 0x63, 0xCC, 0x87, 0xF6, + 0x43, 0xCC, 0x8C, 0xF6, 0x63, 0xCC, 0x8C, 0xF6, + 0x44, 0xCC, 0x8C, 0xF6, 0x64, 0xCC, 0x8C, 0xF6, + 0x45, 0xCC, 0x84, 0xF6, 0x65, 0xCC, 0x84, 0xF6, + 0x45, 0xCC, 0x86, 0xF6, 0x65, 0xCC, 0x86, 0xF6, + 0x45, 0xCC, 0x87, 0xF6, 0x65, 0xCC, 0x87, 0xF6, + 0x45, 0xCC, 0xA8, 0xF6, 0x65, 0xCC, 0xA8, 0xF6, + 0x45, 0xCC, 0x8C, 0xF6, 0x65, 0xCC, 0x8C, 0xF6, + 0x47, 0xCC, 0x82, 0xF6, 0x67, 0xCC, 0x82, 0xF6, + 0x47, 0xCC, 0x86, 0xF6, 0x67, 0xCC, 0x86, 0xF6, + 0x47, 0xCC, 0x87, 0xF6, 0x67, 0xCC, 0x87, 0xF6, + 0x47, 0xCC, 0xA7, 0xF6, 0x67, 0xCC, 0xA7, 0xF6, + 0x48, 0xCC, 0x82, 0xF6, 0x68, 0xCC, 0x82, 0xF6, + 0x49, 0xCC, 0x83, 0xF6, 0x69, 0xCC, 0x83, 0xF6, + 0x49, 0xCC, 0x84, 0xF6, 0x69, 0xCC, 0x84, 0xF6, + 0x49, 0xCC, 0x86, 0xF6, 0x69, 0xCC, 0x86, 0xF6, + 0x49, 0xCC, 0xA8, 0xF6, 0x69, 0xCC, 0xA8, 0xF6, + 0x49, 0xCC, 0x87, 0x49, 0x4A, 0x69, 0x6A, 0xF6, + 0x4A, 0xCC, 0x82, 0xF6, 0x6A, 0xCC, 0x82, 0xF6, + 0x4B, 0xCC, 0xA7, 0xF6, 0x6B, 0xCC, 0xA7, 0xF6, + 0x4C, 0xCC, 0x81, 0xF6, 0x6C, 0xCC, 0x81, 0xF6, + 0x4C, 0xCC, 0xA7, 0xF6, 0x6C, 0xCC, 0xA7, 0xF6, + 0x4C, 0xCC, 0x8C, 0xF6, 0x6C, 0xCC, 0x8C, 0x4C, + 0xC2, 0xB7, 0x6C, 0xC2, 0xB7, 0xF6, 0x4E, 0xCC, + 0x81, 0xF6, 0x6E, 0xCC, 0x81, 0xF6, 0x4E, 0xCC, + 0xA7, 0xF6, 0x6E, 0xCC, 0xA7, 0xF6, 0x4E, 0xCC, + 0x8C, 0xF6, 0x6E, 0xCC, 0x8C, 0xCA, 0xBC, 0x6E, + 0xF6, 0x4F, 0xCC, 0x84, 0xF6, 0x6F, 0xCC, 0x84, + 0xF6, 0x4F, 0xCC, 0x86, 0xF6, 0x6F, 0xCC, 0x86, + 0xF6, 0x4F, 0xCC, 0x8B, 0xF6, 0x6F, 0xCC, 0x8B, + 0xF6, 0x52, 0xCC, 0x81, 0xF6, 0x72, 0xCC, 0x81, + 0xF6, 0x52, 0xCC, 0xA7, 0xF6, 0x72, 0xCC, 0xA7, + 
0xF6, 0x52, 0xCC, 0x8C, 0xF6, 0x72, 0xCC, 0x8C, + 0xF6, 0x53, 0xCC, 0x81, 0xF6, 0x73, 0xCC, 0x81, + 0xF6, 0x53, 0xCC, 0x82, 0xF6, 0x73, 0xCC, 0x82, + 0xF6, 0x53, 0xCC, 0xA7, 0xF6, 0x73, 0xCC, 0xA7, + 0xF6, 0x53, 0xCC, 0x8C, 0xF6, 0x73, 0xCC, 0x8C, + 0xF6, 0x54, 0xCC, 0xA7, 0xF6, 0x74, 0xCC, 0xA7, + 0xF6, 0x54, 0xCC, 0x8C, 0xF6, 0x74, 0xCC, 0x8C, + 0xF6, 0x55, 0xCC, 0x83, 0xF6, 0x75, 0xCC, 0x83, + 0xF6, 0x55, 0xCC, 0x84, 0xF6, 0x75, 0xCC, 0x84, + 0xF6, 0x55, 0xCC, 0x86, 0xF6, 0x75, 0xCC, 0x86, + 0xF6, 0x55, 0xCC, 0x8A, 0xF6, 0x75, 0xCC, 0x8A, + 0xF6, 0x55, 0xCC, 0x8B, 0xF6, 0x75, 0xCC, 0x8B, + 0xF6, 0x55, 0xCC, 0xA8, 0xF6, 0x75, 0xCC, 0xA8, + 0xF6, 0x57, 0xCC, 0x82, 0xF6, 0x77, 0xCC, 0x82, + 0xF6, 0x59, 0xCC, 0x82, 0xF6, 0x79, 0xCC, 0x82, + 0xF6, 0x59, 0xCC, 0x88, 0xF6, 0x5A, 0xCC, 0x81, + 0xF6, 0x7A, 0xCC, 0x81, 0xF6, 0x5A, 0xCC, 0x87, + 0xF6, 0x7A, 0xCC, 0x87, 0xF6, 0x5A, 0xCC, 0x8C, + 0xF6, 0x7A, 0xCC, 0x8C, 0x73, 0xF6, 0x4F, 0xCC, + 0x9B, 0xF6, 0x6F, 0xCC, 0x9B, 0xF6, 0x55, 0xCC, + 0x9B, 0xF6, 0x75, 0xCC, 0x9B, 0x44, 0x5A, 0xCC, + 0x8C, 0x44, 0x7A, 0xCC, 0x8C, 0x64, 0x7A, 0xCC, + 0x8C, 0x4C, 0x4A, 0x4C, 0x6A, 0x6C, 0x6A, 0x4E, + 0x4A, 0x4E, 0x6A, 0x6E, 0x6A, 0xF6, 0x41, 0xCC, + 0x8C, 0xF6, 0x61, 0xCC, 0x8C, 0xF6, 0x49, 0xCC, + 0x8C, 0xF6, 0x69, 0xCC, 0x8C, 0xF6, 0x4F, 0xCC, + 0x8C, 0xF6, 0x6F, 0xCC, 0x8C, 0xF6, 0x55, 0xCC, + 0x8C, 0xF6, 0x75, 0xCC, 0x8C, 0xF6, 0x55, 0xCC, + 0x88, 0xCC, 0x84, 0xF6, 0x75, 0xCC, 0x88, 0xCC, + 0x84, 0xF6, 0x55, 0xCC, 0x88, 0xCC, 0x81, 0xF6, + 0x75, 0xCC, 0x88, 0xCC, 0x81, 0xF6, 0x55, 0xCC, + 0x88, 0xCC, 0x8C, 0xF6, 0x75, 0xCC, 0x88, 0xCC, + 0x8C, 0xF6, 0x55, 0xCC, 0x88, 0xCC, 0x80, 0xF6, + 0x75, 0xCC, 0x88, 0xCC, 0x80, 0xF6, 0x41, 0xCC, + 0x88, 0xCC, 0x84, 0xF6, 0x61, 0xCC, 0x88, 0xCC, + 0x84, 0xF6, 0x41, 0xCC, 0x87, 0xCC, 0x84, 0xF6, + 0x61, 0xCC, 0x87, 0xCC, 0x84, 0xF6, 0xC3, 0x86, + 0xCC, 0x84, 0xF6, 0xC3, 0xA6, 0xCC, 0x84, 0xF6, + 0x47, 0xCC, 0x8C, 0xF6, 0x67, 0xCC, 0x8C, 0xF6, + 0x4B, 0xCC, 0x8C, 0xF6, 0x6B, 0xCC, 0x8C, 0xF6, + 0x4F, 0xCC, 0xA8, 0xF6, 0x6F, 0xCC, 0xA8, 0xF6, + 0x4F, 0xCC, 0xA8, 0xCC, 0x84, 0xF6, 0x6F, 0xCC, + 0xA8, 0xCC, 0x84, 0xF6, 0xC6, 0xB7, 0xCC, 0x8C, + 0xF6, 0xCA, 0x92, 0xCC, 0x8C, 0xF6, 0x6A, 0xCC, + 0x8C, 0x44, 0x5A, 0x44, 0x7A, 0x64, 0x7A, 0xF6, + 0x47, 0xCC, 0x81, 0xF6, 0x67, 0xCC, 0x81, 0xF6, + 0x4E, 0xCC, 0x80, 0xF6, 0x6E, 0xCC, 0x80, 0xF6, + 0x41, 0xCC, 0x8A, 0xCC, 0x81, 0xF6, 0x61, 0xCC, + 0x8A, 0xCC, 0x81, 0xF6, 0xC3, 0x86, 0xCC, 0x81, + 0xF6, 0xC3, 0xA6, 0xCC, 0x81, 0xF6, 0xC3, 0x98, + 0xCC, 0x81, 0xF6, 0xC3, 0xB8, 0xCC, 0x81, 0xF6, + 0x41, 0xCC, 0x8F, 0xF6, 0x61, 0xCC, 0x8F, 0xF6, + 0x41, 0xCC, 0x91, 0xF6, 0x61, 0xCC, 0x91, 0xF6, + 0x45, 0xCC, 0x8F, 0xF6, 0x65, 0xCC, 0x8F, 0xF6, + 0x45, 0xCC, 0x91, 0xF6, 0x65, 0xCC, 0x91, 0xF6, + 0x49, 0xCC, 0x8F, 0xF6, 0x69, 0xCC, 0x8F, 0xF6, + 0x49, 0xCC, 0x91, 0xF6, 0x69, 0xCC, 0x91, 0xF6, + 0x4F, 0xCC, 0x8F, 0xF6, 0x6F, 0xCC, 0x8F, 0xF6, + 0x4F, 0xCC, 0x91, 0xF6, 0x6F, 0xCC, 0x91, 0xF6, + 0x52, 0xCC, 0x8F, 0xF6, 0x72, 0xCC, 0x8F, 0xF6, + 0x52, 0xCC, 0x91, 0xF6, 0x72, 0xCC, 0x91, 0xF6, + 0x55, 0xCC, 0x8F, 0xF6, 0x75, 0xCC, 0x8F, 0xF6, + 0x55, 0xCC, 0x91, 0xF6, 0x75, 0xCC, 0x91, 0xF6, + 0x53, 0xCC, 0xA6, 0xF6, 0x73, 0xCC, 0xA6, 0xF6, + 0x54, 0xCC, 0xA6, 0xF6, 0x74, 0xCC, 0xA6, 0xF6, + 0x48, 0xCC, 0x8C, 0xF6, 0x68, 0xCC, 0x8C, 0xF6, + 0x41, 0xCC, 0x87, 0xF6, 0x61, 0xCC, 0x87, 0xF6, + 0x45, 0xCC, 0xA7, 0xF6, 0x65, 0xCC, 0xA7, 0xF6, + 0x4F, 0xCC, 0x88, 0xCC, 0x84, 0xF6, 0x6F, 0xCC, + 0x88, 0xCC, 0x84, 0xF6, 0x4F, 0xCC, 0x83, 0xCC, + 0x84, 0xF6, 0x6F, 0xCC, 0x83, 0xCC, 0x84, 0xF6, + 
0x4F, 0xCC, 0x87, 0xF6, 0x6F, 0xCC, 0x87, 0xF6, + 0x4F, 0xCC, 0x87, 0xCC, 0x84, 0xF6, 0x6F, 0xCC, + 0x87, 0xCC, 0x84, 0xF6, 0x59, 0xCC, 0x84, 0xF6, + 0x79, 0xCC, 0x84, 0x68, 0xC9, 0xA6, 0x6A, 0x72, + 0xC9, 0xB9, 0xC9, 0xBB, 0xCA, 0x81, 0x77, 0x79, + 0x20, 0xCC, 0x86, 0x20, 0xCC, 0x87, 0x20, 0xCC, + 0x8A, 0x20, 0xCC, 0xA8, 0x20, 0xCC, 0x83, 0x20, + 0xCC, 0x8B, 0xC9, 0xA3, 0x6C, 0x73, 0x78, 0xCA, + 0x95, 0xF6, 0xCC, 0x80, 0xF6, 0xCC, 0x81, 0xF6, + 0xCC, 0x93, 0xF6, 0xCC, 0x88, 0xCC, 0x81, 0xF6, + 0xCA, 0xB9, 0x20, 0xCD, 0x85, 0xF6, 0x3B, 0x20, + 0xCC, 0x81, 0xF5, 0x05, 0xC2, 0xA8, 0xCC, 0x81, + 0x20, 0xCC, 0x88, 0xCC, 0x81, 0xF6, 0xCE, 0x91, + 0xCC, 0x81, 0xF6, 0xC2, 0xB7, 0xF6, 0xCE, 0x95, + 0xCC, 0x81, 0xF6, 0xCE, 0x97, 0xCC, 0x81, 0xF6, + 0xCE, 0x99, 0xCC, 0x81, 0xF6, 0xCE, 0x9F, 0xCC, + 0x81, 0xF6, 0xCE, 0xA5, 0xCC, 0x81, 0xF6, 0xCE, + 0xA9, 0xCC, 0x81, 0xF6, 0xCE, 0xB9, 0xCC, 0x88, + 0xCC, 0x81, 0xF6, 0xCE, 0x99, 0xCC, 0x88, 0xF6, + 0xCE, 0xA5, 0xCC, 0x88, 0xF6, 0xCE, 0xB1, 0xCC, + 0x81, 0xF6, 0xCE, 0xB5, 0xCC, 0x81, 0xF6, 0xCE, + 0xB7, 0xCC, 0x81, 0xF6, 0xCE, 0xB9, 0xCC, 0x81, + 0xF6, 0xCF, 0x85, 0xCC, 0x88, 0xCC, 0x81, 0xF6, + 0xCE, 0xB9, 0xCC, 0x88, 0xF6, 0xCF, 0x85, 0xCC, + 0x88, 0xF6, 0xCE, 0xBF, 0xCC, 0x81, 0xF6, 0xCF, + 0x85, 0xCC, 0x81, 0xF6, 0xCF, 0x89, 0xCC, 0x81, + 0xCE, 0xB2, 0xCE, 0xB8, 0xCE, 0xA5, 0xF5, 0x05, + 0xCF, 0x92, 0xCC, 0x81, 0xCE, 0xA5, 0xCC, 0x81, + 0xF5, 0x05, 0xCF, 0x92, 0xCC, 0x88, 0xCE, 0xA5, + 0xCC, 0x88, 0xCF, 0x86, 0xCF, 0x80, 0xCE, 0xBA, + 0xCF, 0x81, 0xCF, 0x82, 0xCE, 0x98, 0xCE, 0xB5, + 0xCE, 0xA3, 0xF6, 0xD0, 0x95, 0xCC, 0x80, 0xF6, + 0xD0, 0x95, 0xCC, 0x88, 0xF6, 0xD0, 0x93, 0xCC, + 0x81, 0xF6, 0xD0, 0x86, 0xCC, 0x88, 0xF6, 0xD0, + 0x9A, 0xCC, 0x81, 0xF6, 0xD0, 0x98, 0xCC, 0x80, + 0xF6, 0xD0, 0xA3, 0xCC, 0x86, 0xF6, 0xD0, 0x98, + 0xCC, 0x86, 0xF6, 0xD0, 0xB8, 0xCC, 0x86, 0xF6, + 0xD0, 0xB5, 0xCC, 0x80, 0xF6, 0xD0, 0xB5, 0xCC, + 0x88, 0xF6, 0xD0, 0xB3, 0xCC, 0x81, 0xF6, 0xD1, + 0x96, 0xCC, 0x88, 0xF6, 0xD0, 0xBA, 0xCC, 0x81, + 0xF6, 0xD0, 0xB8, 0xCC, 0x80, 0xF6, 0xD1, 0x83, + 0xCC, 0x86, 0xF6, 0xD1, 0xB4, 0xCC, 0x8F, 0xF6, + 0xD1, 0xB5, 0xCC, 0x8F, 0xF6, 0xD0, 0x96, 0xCC, + 0x86, 0xF6, 0xD0, 0xB6, 0xCC, 0x86, 0xF6, 0xD0, + 0x90, 0xCC, 0x86, 0xF6, 0xD0, 0xB0, 0xCC, 0x86, + 0xF6, 0xD0, 0x90, 0xCC, 0x88, 0xF6, 0xD0, 0xB0, + 0xCC, 0x88, 0xF6, 0xD0, 0x95, 0xCC, 0x86, 0xF6, + 0xD0, 0xB5, 0xCC, 0x86, 0xF6, 0xD3, 0x98, 0xCC, + 0x88, 0xF6, 0xD3, 0x99, 0xCC, 0x88, 0xF6, 0xD0, + 0x96, 0xCC, 0x88, 0xF6, 0xD0, 0xB6, 0xCC, 0x88, + 0xF6, 0xD0, 0x97, 0xCC, 0x88, 0xF6, 0xD0, 0xB7, + 0xCC, 0x88, 0xF6, 0xD0, 0x98, 0xCC, 0x84, 0xF6, + 0xD0, 0xB8, 0xCC, 0x84, 0xF6, 0xD0, 0x98, 0xCC, + 0x88, 0xF6, 0xD0, 0xB8, 0xCC, 0x88, 0xF6, 0xD0, + 0x9E, 0xCC, 0x88, 0xF6, 0xD0, 0xBE, 0xCC, 0x88, + 0xF6, 0xD3, 0xA8, 0xCC, 0x88, 0xF6, 0xD3, 0xA9, + 0xCC, 0x88, 0xF6, 0xD0, 0xAD, 0xCC, 0x88, 0xF6, + 0xD1, 0x8D, 0xCC, 0x88, 0xF6, 0xD0, 0xA3, 0xCC, + 0x84, 0xF6, 0xD1, 0x83, 0xCC, 0x84, 0xF6, 0xD0, + 0xA3, 0xCC, 0x88, 0xF6, 0xD1, 0x83, 0xCC, 0x88, + 0xF6, 0xD0, 0xA3, 0xCC, 0x8B, 0xF6, 0xD1, 0x83, + 0xCC, 0x8B, 0xF6, 0xD0, 0xA7, 0xCC, 0x88, 0xF6, + 0xD1, 0x87, 0xCC, 0x88, 0xF6, 0xD0, 0xAB, 0xCC, + 0x88, 0xF6, 0xD1, 0x8B, 0xCC, 0x88, 0xD5, 0xA5, + 0xD6, 0x82, 0xF6, 0xD8, 0xA7, 0xD9, 0x93, 0xF6, + 0xD8, 0xA7, 0xD9, 0x94, 0xF6, 0xD9, 0x88, 0xD9, + 0x94, 0xF6, 0xD8, 0xA7, 0xD9, 0x95, 0xF6, 0xD9, + 0x8A, 0xD9, 0x94, 0xD8, 0xA7, 0xD9, 0xB4, 0xD9, + 0x88, 0xD9, 0xB4, 0xDB, 0x87, 0xD9, 0xB4, 0xD9, + 0x8A, 0xD9, 0xB4, 0xF6, 0xDB, 0x95, 0xD9, 0x94, + 0xF6, 0xDB, 0x81, 0xD9, 0x94, 0xF6, 0xDB, 0x92, + 
0xD9, 0x94, 0xF6, 0xE0, 0xA4, 0xA8, 0xE0, 0xA4, + 0xBC, 0xF6, 0xE0, 0xA4, 0xB0, 0xE0, 0xA4, 0xBC, + 0xF6, 0xE0, 0xA4, 0xB3, 0xE0, 0xA4, 0xBC, 0xF6, + 0xE0, 0xA4, 0x95, 0xE0, 0xA4, 0xBC, 0xF6, 0xE0, + 0xA4, 0x96, 0xE0, 0xA4, 0xBC, 0xF6, 0xE0, 0xA4, + 0x97, 0xE0, 0xA4, 0xBC, 0xF6, 0xE0, 0xA4, 0x9C, + 0xE0, 0xA4, 0xBC, 0xF6, 0xE0, 0xA4, 0xA1, 0xE0, + 0xA4, 0xBC, 0xF6, 0xE0, 0xA4, 0xA2, 0xE0, 0xA4, + 0xBC, 0xF6, 0xE0, 0xA4, 0xAB, 0xE0, 0xA4, 0xBC, + 0xF6, 0xE0, 0xA4, 0xAF, 0xE0, 0xA4, 0xBC, 0xF6, + 0xE0, 0xA7, 0x87, 0xE0, 0xA6, 0xBE, 0xF6, 0xE0, + 0xA7, 0x87, 0xE0, 0xA7, 0x97, 0xF6, 0xE0, 0xA6, + 0xA1, 0xE0, 0xA6, 0xBC, 0xF6, 0xE0, 0xA6, 0xA2, + 0xE0, 0xA6, 0xBC, 0xF6, 0xE0, 0xA6, 0xAF, 0xE0, + 0xA6, 0xBC, 0xF6, 0xE0, 0xA8, 0xB2, 0xE0, 0xA8, + 0xBC, 0xF6, 0xE0, 0xA8, 0xB8, 0xE0, 0xA8, 0xBC, + 0xF6, 0xE0, 0xA8, 0x96, 0xE0, 0xA8, 0xBC, 0xF6, + 0xE0, 0xA8, 0x97, 0xE0, 0xA8, 0xBC, 0xF6, 0xE0, + 0xA8, 0x9C, 0xE0, 0xA8, 0xBC, 0xF6, 0xE0, 0xA8, + 0xAB, 0xE0, 0xA8, 0xBC, 0xF6, 0xE0, 0xAD, 0x87, + 0xE0, 0xAD, 0x96, 0xF6, 0xE0, 0xAD, 0x87, 0xE0, + 0xAC, 0xBE, 0xF6, 0xE0, 0xAD, 0x87, 0xE0, 0xAD, + 0x97, 0xF6, 0xE0, 0xAC, 0xA1, 0xE0, 0xAC, 0xBC, + 0xF6, 0xE0, 0xAC, 0xA2, 0xE0, 0xAC, 0xBC, 0xF6, + 0xE0, 0xAE, 0x92, 0xE0, 0xAF, 0x97, 0xF6, 0xE0, + 0xAF, 0x86, 0xE0, 0xAE, 0xBE, 0xF6, 0xE0, 0xAF, + 0x87, 0xE0, 0xAE, 0xBE, 0xF6, 0xE0, 0xAF, 0x86, + 0xE0, 0xAF, 0x97, 0xF6, 0xE0, 0xB1, 0x86, 0xE0, + 0xB1, 0x96, 0xF6, 0xE0, 0xB2, 0xBF, 0xE0, 0xB3, + 0x95, 0xF6, 0xE0, 0xB3, 0x86, 0xE0, 0xB3, 0x95, + 0xF6, 0xE0, 0xB3, 0x86, 0xE0, 0xB3, 0x96, 0xF6, + 0xE0, 0xB3, 0x86, 0xE0, 0xB3, 0x82, 0xF6, 0xE0, + 0xB3, 0x86, 0xE0, 0xB3, 0x82, 0xE0, 0xB3, 0x95, + 0xF6, 0xE0, 0xB5, 0x86, 0xE0, 0xB4, 0xBE, 0xF6, + 0xE0, 0xB5, 0x87, 0xE0, 0xB4, 0xBE, 0xF6, 0xE0, + 0xB5, 0x86, 0xE0, 0xB5, 0x97, 0xF6, 0xE0, 0xB7, + 0x99, 0xE0, 0xB7, 0x8A, 0xF6, 0xE0, 0xB7, 0x99, + 0xE0, 0xB7, 0x8F, 0xF6, 0xE0, 0xB7, 0x99, 0xE0, + 0xB7, 0x8F, 0xE0, 0xB7, 0x8A, 0xF6, 0xE0, 0xB7, + 0x99, 0xE0, 0xB7, 0x9F, 0xE0, 0xB9, 0x8D, 0xE0, + 0xB8, 0xB2, 0xE0, 0xBB, 0x8D, 0xE0, 0xBA, 0xB2, + 0xE0, 0xBA, 0xAB, 0xE0, 0xBA, 0x99, 0xE0, 0xBA, + 0xAB, 0xE0, 0xBA, 0xA1, 0xE0, 0xBC, 0x8B, 0xF6, + 0xE0, 0xBD, 0x82, 0xE0, 0xBE, 0xB7, 0xF6, 0xE0, + 0xBD, 0x8C, 0xE0, 0xBE, 0xB7, 0xF6, 0xE0, 0xBD, + 0x91, 0xE0, 0xBE, 0xB7, 0xF6, 0xE0, 0xBD, 0x96, + 0xE0, 0xBE, 0xB7, 0xF6, 0xE0, 0xBD, 0x9B, 0xE0, + 0xBE, 0xB7, 0xF6, 0xE0, 0xBD, 0x80, 0xE0, 0xBE, + 0xB5, 0xF6, 0xE0, 0xBD, 0xB1, 0xE0, 0xBD, 0xB2, + 0xF6, 0xE0, 0xBD, 0xB1, 0xE0, 0xBD, 0xB4, 0xF6, + 0xE0, 0xBE, 0xB2, 0xE0, 0xBE, 0x80, 0xE0, 0xBE, + 0xB2, 0xE0, 0xBD, 0xB1, 0xE0, 0xBE, 0x80, 0xF6, + 0xE0, 0xBE, 0xB3, 0xE0, 0xBE, 0x80, 0xE0, 0xBE, + 0xB3, 0xE0, 0xBD, 0xB1, 0xE0, 0xBE, 0x80, 0xF6, + 0xE0, 0xBD, 0xB1, 0xE0, 0xBE, 0x80, 0xF6, 0xE0, + 0xBE, 0x92, 0xE0, 0xBE, 0xB7, 0xF6, 0xE0, 0xBE, + 0x9C, 0xE0, 0xBE, 0xB7, 0xF6, 0xE0, 0xBE, 0xA1, + 0xE0, 0xBE, 0xB7, 0xF6, 0xE0, 0xBE, 0xA6, 0xE0, + 0xBE, 0xB7, 0xF6, 0xE0, 0xBE, 0xAB, 0xE0, 0xBE, + 0xB7, 0xF6, 0xE0, 0xBE, 0x90, 0xE0, 0xBE, 0xB5, + 0xF6, 0xE1, 0x80, 0xA5, 0xE1, 0x80, 0xAE, 0xE1, + 0x83, 0x9C, 0xF6, 0xE1, 0xAC, 0x85, 0xE1, 0xAC, + 0xB5, 0xF6, 0xE1, 0xAC, 0x87, 0xE1, 0xAC, 0xB5, + 0xF6, 0xE1, 0xAC, 0x89, 0xE1, 0xAC, 0xB5, 0xF6, + 0xE1, 0xAC, 0x8B, 0xE1, 0xAC, 0xB5, 0xF6, 0xE1, + 0xAC, 0x8D, 0xE1, 0xAC, 0xB5, 0xF6, 0xE1, 0xAC, + 0x91, 0xE1, 0xAC, 0xB5, 0xF6, 0xE1, 0xAC, 0xBA, + 0xE1, 0xAC, 0xB5, 0xF6, 0xE1, 0xAC, 0xBC, 0xE1, + 0xAC, 0xB5, 0xF6, 0xE1, 0xAC, 0xBE, 0xE1, 0xAC, + 0xB5, 0xF6, 0xE1, 0xAC, 0xBF, 0xE1, 0xAC, 0xB5, + 0xF6, 0xE1, 0xAD, 0x82, 0xE1, 0xAC, 0xB5, 0x41, + 
0xC3, 0x86, 0x42, 0x44, 0x45, 0xC6, 0x8E, 0x47, + 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, + 0xC8, 0xA2, 0x50, 0x52, 0x54, 0x55, 0x57, 0x61, + 0xC9, 0x90, 0xC9, 0x91, 0xE1, 0xB4, 0x82, 0x62, + 0x64, 0x65, 0xC9, 0x99, 0xC9, 0x9B, 0xC9, 0x9C, + 0x67, 0x6B, 0x6D, 0xC5, 0x8B, 0x6F, 0xC9, 0x94, + 0xE1, 0xB4, 0x96, 0xE1, 0xB4, 0x97, 0x70, 0x74, + 0x75, 0xE1, 0xB4, 0x9D, 0xC9, 0xAF, 0x76, 0xE1, + 0xB4, 0xA5, 0xCE, 0xB2, 0xCE, 0xB3, 0xCE, 0xB4, + 0xCF, 0x86, 0xCF, 0x87, 0x69, 0x72, 0x75, 0x76, + 0xCE, 0xB2, 0xCE, 0xB3, 0xCF, 0x81, 0xCF, 0x86, + 0xCF, 0x87, 0xD0, 0xBD, 0xC9, 0x92, 0x63, 0xC9, + 0x95, 0xC3, 0xB0, 0xC9, 0x9C, 0x66, 0xC9, 0x9F, + 0xC9, 0xA1, 0xC9, 0xA5, 0xC9, 0xA8, 0xC9, 0xA9, + 0xC9, 0xAA, 0xE1, 0xB5, 0xBB, 0xCA, 0x9D, 0xC9, + 0xAD, 0xE1, 0xB6, 0x85, 0xCA, 0x9F, 0xC9, 0xB1, + 0xC9, 0xB0, 0xC9, 0xB2, 0xC9, 0xB3, 0xC9, 0xB4, + 0xC9, 0xB5, 0xC9, 0xB8, 0xCA, 0x82, 0xCA, 0x83, + 0xC6, 0xAB, 0xCA, 0x89, 0xCA, 0x8A, 0xE1, 0xB4, + 0x9C, 0xCA, 0x8B, 0xCA, 0x8C, 0x7A, 0xCA, 0x90, + 0xCA, 0x91, 0xCA, 0x92, 0xCE, 0xB8, 0xF6, 0x41, + 0xCC, 0xA5, 0xF6, 0x61, 0xCC, 0xA5, 0xF6, 0x42, + 0xCC, 0x87, 0xF6, 0x62, 0xCC, 0x87, 0xF6, 0x42, + 0xCC, 0xA3, 0xF6, 0x62, 0xCC, 0xA3, 0xF6, 0x42, + 0xCC, 0xB1, 0xF6, 0x62, 0xCC, 0xB1, 0xF6, 0x43, + 0xCC, 0xA7, 0xCC, 0x81, 0xF6, 0x63, 0xCC, 0xA7, + 0xCC, 0x81, 0xF6, 0x44, 0xCC, 0x87, 0xF6, 0x64, + 0xCC, 0x87, 0xF6, 0x44, 0xCC, 0xA3, 0xF6, 0x64, + 0xCC, 0xA3, 0xF6, 0x44, 0xCC, 0xB1, 0xF6, 0x64, + 0xCC, 0xB1, 0xF6, 0x44, 0xCC, 0xA7, 0xF6, 0x64, + 0xCC, 0xA7, 0xF6, 0x44, 0xCC, 0xAD, 0xF6, 0x64, + 0xCC, 0xAD, 0xF6, 0x45, 0xCC, 0x84, 0xCC, 0x80, + 0xF6, 0x65, 0xCC, 0x84, 0xCC, 0x80, 0xF6, 0x45, + 0xCC, 0x84, 0xCC, 0x81, 0xF6, 0x65, 0xCC, 0x84, + 0xCC, 0x81, 0xF6, 0x45, 0xCC, 0xAD, 0xF6, 0x65, + 0xCC, 0xAD, 0xF6, 0x45, 0xCC, 0xB0, 0xF6, 0x65, + 0xCC, 0xB0, 0xF6, 0x45, 0xCC, 0xA7, 0xCC, 0x86, + 0xF6, 0x65, 0xCC, 0xA7, 0xCC, 0x86, 0xF6, 0x46, + 0xCC, 0x87, 0xF6, 0x66, 0xCC, 0x87, 0xF6, 0x47, + 0xCC, 0x84, 0xF6, 0x67, 0xCC, 0x84, 0xF6, 0x48, + 0xCC, 0x87, 0xF6, 0x68, 0xCC, 0x87, 0xF6, 0x48, + 0xCC, 0xA3, 0xF6, 0x68, 0xCC, 0xA3, 0xF6, 0x48, + 0xCC, 0x88, 0xF6, 0x68, 0xCC, 0x88, 0xF6, 0x48, + 0xCC, 0xA7, 0xF6, 0x68, 0xCC, 0xA7, 0xF6, 0x48, + 0xCC, 0xAE, 0xF6, 0x68, 0xCC, 0xAE, 0xF6, 0x49, + 0xCC, 0xB0, 0xF6, 0x69, 0xCC, 0xB0, 0xF6, 0x49, + 0xCC, 0x88, 0xCC, 0x81, 0xF6, 0x69, 0xCC, 0x88, + 0xCC, 0x81, 0xF6, 0x4B, 0xCC, 0x81, 0xF6, 0x6B, + 0xCC, 0x81, 0xF6, 0x4B, 0xCC, 0xA3, 0xF6, 0x6B, + 0xCC, 0xA3, 0xF6, 0x4B, 0xCC, 0xB1, 0xF6, 0x6B, + 0xCC, 0xB1, 0xF6, 0x4C, 0xCC, 0xA3, 0xF6, 0x6C, + 0xCC, 0xA3, 0xF6, 0x4C, 0xCC, 0xA3, 0xCC, 0x84, + 0xF6, 0x6C, 0xCC, 0xA3, 0xCC, 0x84, 0xF6, 0x4C, + 0xCC, 0xB1, 0xF6, 0x6C, 0xCC, 0xB1, 0xF6, 0x4C, + 0xCC, 0xAD, 0xF6, 0x6C, 0xCC, 0xAD, 0xF6, 0x4D, + 0xCC, 0x81, 0xF6, 0x6D, 0xCC, 0x81, 0xF6, 0x4D, + 0xCC, 0x87, 0xF6, 0x6D, 0xCC, 0x87, 0xF6, 0x4D, + 0xCC, 0xA3, 0xF6, 0x6D, 0xCC, 0xA3, 0xF6, 0x4E, + 0xCC, 0x87, 0xF6, 0x6E, 0xCC, 0x87, 0xF6, 0x4E, + 0xCC, 0xA3, 0xF6, 0x6E, 0xCC, 0xA3, 0xF6, 0x4E, + 0xCC, 0xB1, 0xF6, 0x6E, 0xCC, 0xB1, 0xF6, 0x4E, + 0xCC, 0xAD, 0xF6, 0x6E, 0xCC, 0xAD, 0xF6, 0x4F, + 0xCC, 0x83, 0xCC, 0x81, 0xF6, 0x6F, 0xCC, 0x83, + 0xCC, 0x81, 0xF6, 0x4F, 0xCC, 0x83, 0xCC, 0x88, + 0xF6, 0x6F, 0xCC, 0x83, 0xCC, 0x88, 0xF6, 0x4F, + 0xCC, 0x84, 0xCC, 0x80, 0xF6, 0x6F, 0xCC, 0x84, + 0xCC, 0x80, 0xF6, 0x4F, 0xCC, 0x84, 0xCC, 0x81, + 0xF6, 0x6F, 0xCC, 0x84, 0xCC, 0x81, 0xF6, 0x50, + 0xCC, 0x81, 0xF6, 0x70, 0xCC, 0x81, 0xF6, 0x50, + 0xCC, 0x87, 0xF6, 0x70, 0xCC, 0x87, 0xF6, 0x52, + 0xCC, 0x87, 0xF6, 0x72, 0xCC, 0x87, 0xF6, 0x52, + 
0xCC, 0xA3, 0xF6, 0x72, 0xCC, 0xA3, 0xF6, 0x52, + 0xCC, 0xA3, 0xCC, 0x84, 0xF6, 0x72, 0xCC, 0xA3, + 0xCC, 0x84, 0xF6, 0x52, 0xCC, 0xB1, 0xF6, 0x72, + 0xCC, 0xB1, 0xF6, 0x53, 0xCC, 0x87, 0xF6, 0x73, + 0xCC, 0x87, 0xF6, 0x53, 0xCC, 0xA3, 0xF6, 0x73, + 0xCC, 0xA3, 0xF6, 0x53, 0xCC, 0x81, 0xCC, 0x87, + 0xF6, 0x73, 0xCC, 0x81, 0xCC, 0x87, 0xF6, 0x53, + 0xCC, 0x8C, 0xCC, 0x87, 0xF6, 0x73, 0xCC, 0x8C, + 0xCC, 0x87, 0xF6, 0x53, 0xCC, 0xA3, 0xCC, 0x87, + 0xF6, 0x73, 0xCC, 0xA3, 0xCC, 0x87, 0xF6, 0x54, + 0xCC, 0x87, 0xF6, 0x74, 0xCC, 0x87, 0xF6, 0x54, + 0xCC, 0xA3, 0xF6, 0x74, 0xCC, 0xA3, 0xF6, 0x54, + 0xCC, 0xB1, 0xF6, 0x74, 0xCC, 0xB1, 0xF6, 0x54, + 0xCC, 0xAD, 0xF6, 0x74, 0xCC, 0xAD, 0xF6, 0x55, + 0xCC, 0xA4, 0xF6, 0x75, 0xCC, 0xA4, 0xF6, 0x55, + 0xCC, 0xB0, 0xF6, 0x75, 0xCC, 0xB0, 0xF6, 0x55, + 0xCC, 0xAD, 0xF6, 0x75, 0xCC, 0xAD, 0xF6, 0x55, + 0xCC, 0x83, 0xCC, 0x81, 0xF6, 0x75, 0xCC, 0x83, + 0xCC, 0x81, 0xF6, 0x55, 0xCC, 0x84, 0xCC, 0x88, + 0xF6, 0x75, 0xCC, 0x84, 0xCC, 0x88, 0xF6, 0x56, + 0xCC, 0x83, 0xF6, 0x76, 0xCC, 0x83, 0xF6, 0x56, + 0xCC, 0xA3, 0xF6, 0x76, 0xCC, 0xA3, 0xF6, 0x57, + 0xCC, 0x80, 0xF6, 0x77, 0xCC, 0x80, 0xF6, 0x57, + 0xCC, 0x81, 0xF6, 0x77, 0xCC, 0x81, 0xF6, 0x57, + 0xCC, 0x88, 0xF6, 0x77, 0xCC, 0x88, 0xF6, 0x57, + 0xCC, 0x87, 0xF6, 0x77, 0xCC, 0x87, 0xF6, 0x57, + 0xCC, 0xA3, 0xF6, 0x77, 0xCC, 0xA3, 0xF6, 0x58, + 0xCC, 0x87, 0xF6, 0x78, 0xCC, 0x87, 0xF6, 0x58, + 0xCC, 0x88, 0xF6, 0x78, 0xCC, 0x88, 0xF6, 0x59, + 0xCC, 0x87, 0xF6, 0x79, 0xCC, 0x87, 0xF6, 0x5A, + 0xCC, 0x82, 0xF6, 0x7A, 0xCC, 0x82, 0xF6, 0x5A, + 0xCC, 0xA3, 0xF6, 0x7A, 0xCC, 0xA3, 0xF6, 0x5A, + 0xCC, 0xB1, 0xF6, 0x7A, 0xCC, 0xB1, 0xF6, 0x68, + 0xCC, 0xB1, 0xF6, 0x74, 0xCC, 0x88, 0xF6, 0x77, + 0xCC, 0x8A, 0xF6, 0x79, 0xCC, 0x8A, 0x61, 0xCA, + 0xBE, 0xF5, 0x05, 0xC5, 0xBF, 0xCC, 0x87, 0x73, + 0xCC, 0x87, 0xF6, 0x41, 0xCC, 0xA3, 0xF6, 0x61, + 0xCC, 0xA3, 0xF6, 0x41, 0xCC, 0x89, 0xF6, 0x61, + 0xCC, 0x89, 0xF6, 0x41, 0xCC, 0x82, 0xCC, 0x81, + 0xF6, 0x61, 0xCC, 0x82, 0xCC, 0x81, 0xF6, 0x41, + 0xCC, 0x82, 0xCC, 0x80, 0xF6, 0x61, 0xCC, 0x82, + 0xCC, 0x80, 0xF6, 0x41, 0xCC, 0x82, 0xCC, 0x89, + 0xF6, 0x61, 0xCC, 0x82, 0xCC, 0x89, 0xF6, 0x41, + 0xCC, 0x82, 0xCC, 0x83, 0xF6, 0x61, 0xCC, 0x82, + 0xCC, 0x83, 0xF6, 0x41, 0xCC, 0xA3, 0xCC, 0x82, + 0xF6, 0x61, 0xCC, 0xA3, 0xCC, 0x82, 0xF6, 0x41, + 0xCC, 0x86, 0xCC, 0x81, 0xF6, 0x61, 0xCC, 0x86, + 0xCC, 0x81, 0xF6, 0x41, 0xCC, 0x86, 0xCC, 0x80, + 0xF6, 0x61, 0xCC, 0x86, 0xCC, 0x80, 0xF6, 0x41, + 0xCC, 0x86, 0xCC, 0x89, 0xF6, 0x61, 0xCC, 0x86, + 0xCC, 0x89, 0xF6, 0x41, 0xCC, 0x86, 0xCC, 0x83, + 0xF6, 0x61, 0xCC, 0x86, 0xCC, 0x83, 0xF6, 0x41, + 0xCC, 0xA3, 0xCC, 0x86, 0xF6, 0x61, 0xCC, 0xA3, + 0xCC, 0x86, 0xF6, 0x45, 0xCC, 0xA3, 0xF6, 0x65, + 0xCC, 0xA3, 0xF6, 0x45, 0xCC, 0x89, 0xF6, 0x65, + 0xCC, 0x89, 0xF6, 0x45, 0xCC, 0x83, 0xF6, 0x65, + 0xCC, 0x83, 0xF6, 0x45, 0xCC, 0x82, 0xCC, 0x81, + 0xF6, 0x65, 0xCC, 0x82, 0xCC, 0x81, 0xF6, 0x45, + 0xCC, 0x82, 0xCC, 0x80, 0xF6, 0x65, 0xCC, 0x82, + 0xCC, 0x80, 0xF6, 0x45, 0xCC, 0x82, 0xCC, 0x89, + 0xF6, 0x65, 0xCC, 0x82, 0xCC, 0x89, 0xF6, 0x45, + 0xCC, 0x82, 0xCC, 0x83, 0xF6, 0x65, 0xCC, 0x82, + 0xCC, 0x83, 0xF6, 0x45, 0xCC, 0xA3, 0xCC, 0x82, + 0xF6, 0x65, 0xCC, 0xA3, 0xCC, 0x82, 0xF6, 0x49, + 0xCC, 0x89, 0xF6, 0x69, 0xCC, 0x89, 0xF6, 0x49, + 0xCC, 0xA3, 0xF6, 0x69, 0xCC, 0xA3, 0xF6, 0x4F, + 0xCC, 0xA3, 0xF6, 0x6F, 0xCC, 0xA3, 0xF6, 0x4F, + 0xCC, 0x89, 0xF6, 0x6F, 0xCC, 0x89, 0xF6, 0x4F, + 0xCC, 0x82, 0xCC, 0x81, 0xF6, 0x6F, 0xCC, 0x82, + 0xCC, 0x81, 0xF6, 0x4F, 0xCC, 0x82, 0xCC, 0x80, + 0xF6, 0x6F, 0xCC, 0x82, 0xCC, 0x80, 0xF6, 0x4F, + 
0xCC, 0x82, 0xCC, 0x89, 0xF6, 0x6F, 0xCC, 0x82, + 0xCC, 0x89, 0xF6, 0x4F, 0xCC, 0x82, 0xCC, 0x83, + 0xF6, 0x6F, 0xCC, 0x82, 0xCC, 0x83, 0xF6, 0x4F, + 0xCC, 0xA3, 0xCC, 0x82, 0xF6, 0x6F, 0xCC, 0xA3, + 0xCC, 0x82, 0xF6, 0x4F, 0xCC, 0x9B, 0xCC, 0x81, + 0xF6, 0x6F, 0xCC, 0x9B, 0xCC, 0x81, 0xF6, 0x4F, + 0xCC, 0x9B, 0xCC, 0x80, 0xF6, 0x6F, 0xCC, 0x9B, + 0xCC, 0x80, 0xF6, 0x4F, 0xCC, 0x9B, 0xCC, 0x89, + 0xF6, 0x6F, 0xCC, 0x9B, 0xCC, 0x89, 0xF6, 0x4F, + 0xCC, 0x9B, 0xCC, 0x83, 0xF6, 0x6F, 0xCC, 0x9B, + 0xCC, 0x83, 0xF6, 0x4F, 0xCC, 0x9B, 0xCC, 0xA3, + 0xF6, 0x6F, 0xCC, 0x9B, 0xCC, 0xA3, 0xF6, 0x55, + 0xCC, 0xA3, 0xF6, 0x75, 0xCC, 0xA3, 0xF6, 0x55, + 0xCC, 0x89, 0xF6, 0x75, 0xCC, 0x89, 0xF6, 0x55, + 0xCC, 0x9B, 0xCC, 0x81, 0xF6, 0x75, 0xCC, 0x9B, + 0xCC, 0x81, 0xF6, 0x55, 0xCC, 0x9B, 0xCC, 0x80, + 0xF6, 0x75, 0xCC, 0x9B, 0xCC, 0x80, 0xF6, 0x55, + 0xCC, 0x9B, 0xCC, 0x89, 0xF6, 0x75, 0xCC, 0x9B, + 0xCC, 0x89, 0xF6, 0x55, 0xCC, 0x9B, 0xCC, 0x83, + 0xF6, 0x75, 0xCC, 0x9B, 0xCC, 0x83, 0xF6, 0x55, + 0xCC, 0x9B, 0xCC, 0xA3, 0xF6, 0x75, 0xCC, 0x9B, + 0xCC, 0xA3, 0xF6, 0x59, 0xCC, 0x80, 0xF6, 0x79, + 0xCC, 0x80, 0xF6, 0x59, 0xCC, 0xA3, 0xF6, 0x79, + 0xCC, 0xA3, 0xF6, 0x59, 0xCC, 0x89, 0xF6, 0x79, + 0xCC, 0x89, 0xF6, 0x59, 0xCC, 0x83, 0xF6, 0x79, + 0xCC, 0x83, 0xF6, 0xCE, 0xB1, 0xCC, 0x93, 0xF6, + 0xCE, 0xB1, 0xCC, 0x94, 0xF6, 0xCE, 0xB1, 0xCC, + 0x93, 0xCC, 0x80, 0xF6, 0xCE, 0xB1, 0xCC, 0x94, + 0xCC, 0x80, 0xF6, 0xCE, 0xB1, 0xCC, 0x93, 0xCC, + 0x81, 0xF6, 0xCE, 0xB1, 0xCC, 0x94, 0xCC, 0x81, + 0xF6, 0xCE, 0xB1, 0xCC, 0x93, 0xCD, 0x82, 0xF6, + 0xCE, 0xB1, 0xCC, 0x94, 0xCD, 0x82, 0xF6, 0xCE, + 0x91, 0xCC, 0x93, 0xF6, 0xCE, 0x91, 0xCC, 0x94, + 0xF6, 0xCE, 0x91, 0xCC, 0x93, 0xCC, 0x80, 0xF6, + 0xCE, 0x91, 0xCC, 0x94, 0xCC, 0x80, 0xF6, 0xCE, + 0x91, 0xCC, 0x93, 0xCC, 0x81, 0xF6, 0xCE, 0x91, + 0xCC, 0x94, 0xCC, 0x81, 0xF6, 0xCE, 0x91, 0xCC, + 0x93, 0xCD, 0x82, 0xF6, 0xCE, 0x91, 0xCC, 0x94, + 0xCD, 0x82, 0xF6, 0xCE, 0xB5, 0xCC, 0x93, 0xF6, + 0xCE, 0xB5, 0xCC, 0x94, 0xF6, 0xCE, 0xB5, 0xCC, + 0x93, 0xCC, 0x80, 0xF6, 0xCE, 0xB5, 0xCC, 0x94, + 0xCC, 0x80, 0xF6, 0xCE, 0xB5, 0xCC, 0x93, 0xCC, + 0x81, 0xF6, 0xCE, 0xB5, 0xCC, 0x94, 0xCC, 0x81, + 0xF6, 0xCE, 0x95, 0xCC, 0x93, 0xF6, 0xCE, 0x95, + 0xCC, 0x94, 0xF6, 0xCE, 0x95, 0xCC, 0x93, 0xCC, + 0x80, 0xF6, 0xCE, 0x95, 0xCC, 0x94, 0xCC, 0x80, + 0xF6, 0xCE, 0x95, 0xCC, 0x93, 0xCC, 0x81, 0xF6, + 0xCE, 0x95, 0xCC, 0x94, 0xCC, 0x81, 0xF6, 0xCE, + 0xB7, 0xCC, 0x93, 0xF6, 0xCE, 0xB7, 0xCC, 0x94, + 0xF6, 0xCE, 0xB7, 0xCC, 0x93, 0xCC, 0x80, 0xF6, + 0xCE, 0xB7, 0xCC, 0x94, 0xCC, 0x80, 0xF6, 0xCE, + 0xB7, 0xCC, 0x93, 0xCC, 0x81, 0xF6, 0xCE, 0xB7, + 0xCC, 0x94, 0xCC, 0x81, 0xF6, 0xCE, 0xB7, 0xCC, + 0x93, 0xCD, 0x82, 0xF6, 0xCE, 0xB7, 0xCC, 0x94, + 0xCD, 0x82, 0xF6, 0xCE, 0x97, 0xCC, 0x93, 0xF6, + 0xCE, 0x97, 0xCC, 0x94, 0xF6, 0xCE, 0x97, 0xCC, + 0x93, 0xCC, 0x80, 0xF6, 0xCE, 0x97, 0xCC, 0x94, + 0xCC, 0x80, 0xF6, 0xCE, 0x97, 0xCC, 0x93, 0xCC, + 0x81, 0xF6, 0xCE, 0x97, 0xCC, 0x94, 0xCC, 0x81, + 0xF6, 0xCE, 0x97, 0xCC, 0x93, 0xCD, 0x82, 0xF6, + 0xCE, 0x97, 0xCC, 0x94, 0xCD, 0x82, 0xF6, 0xCE, + 0xB9, 0xCC, 0x93, 0xF6, 0xCE, 0xB9, 0xCC, 0x94, + 0xF6, 0xCE, 0xB9, 0xCC, 0x93, 0xCC, 0x80, 0xF6, + 0xCE, 0xB9, 0xCC, 0x94, 0xCC, 0x80, 0xF6, 0xCE, + 0xB9, 0xCC, 0x93, 0xCC, 0x81, 0xF6, 0xCE, 0xB9, + 0xCC, 0x94, 0xCC, 0x81, 0xF6, 0xCE, 0xB9, 0xCC, + 0x93, 0xCD, 0x82, 0xF6, 0xCE, 0xB9, 0xCC, 0x94, + 0xCD, 0x82, 0xF6, 0xCE, 0x99, 0xCC, 0x93, 0xF6, + 0xCE, 0x99, 0xCC, 0x94, 0xF6, 0xCE, 0x99, 0xCC, + 0x93, 0xCC, 0x80, 0xF6, 0xCE, 0x99, 0xCC, 0x94, + 0xCC, 0x80, 0xF6, 0xCE, 0x99, 0xCC, 0x93, 0xCC, + 
0x81, 0xF6, 0xCE, 0x99, 0xCC, 0x94, 0xCC, 0x81, + 0xF6, 0xCE, 0x99, 0xCC, 0x93, 0xCD, 0x82, 0xF6, + 0xCE, 0x99, 0xCC, 0x94, 0xCD, 0x82, 0xF6, 0xCE, + 0xBF, 0xCC, 0x93, 0xF6, 0xCE, 0xBF, 0xCC, 0x94, + 0xF6, 0xCE, 0xBF, 0xCC, 0x93, 0xCC, 0x80, 0xF6, + 0xCE, 0xBF, 0xCC, 0x94, 0xCC, 0x80, 0xF6, 0xCE, + 0xBF, 0xCC, 0x93, 0xCC, 0x81, 0xF6, 0xCE, 0xBF, + 0xCC, 0x94, 0xCC, 0x81, 0xF6, 0xCE, 0x9F, 0xCC, + 0x93, 0xF6, 0xCE, 0x9F, 0xCC, 0x94, 0xF6, 0xCE, + 0x9F, 0xCC, 0x93, 0xCC, 0x80, 0xF6, 0xCE, 0x9F, + 0xCC, 0x94, 0xCC, 0x80, 0xF6, 0xCE, 0x9F, 0xCC, + 0x93, 0xCC, 0x81, 0xF6, 0xCE, 0x9F, 0xCC, 0x94, + 0xCC, 0x81, 0xF6, 0xCF, 0x85, 0xCC, 0x93, 0xF6, + 0xCF, 0x85, 0xCC, 0x94, 0xF6, 0xCF, 0x85, 0xCC, + 0x93, 0xCC, 0x80, 0xF6, 0xCF, 0x85, 0xCC, 0x94, + 0xCC, 0x80, 0xF6, 0xCF, 0x85, 0xCC, 0x93, 0xCC, + 0x81, 0xF6, 0xCF, 0x85, 0xCC, 0x94, 0xCC, 0x81, + 0xF6, 0xCF, 0x85, 0xCC, 0x93, 0xCD, 0x82, 0xF6, + 0xCF, 0x85, 0xCC, 0x94, 0xCD, 0x82, 0xF6, 0xCE, + 0xA5, 0xCC, 0x94, 0xF6, 0xCE, 0xA5, 0xCC, 0x94, + 0xCC, 0x80, 0xF6, 0xCE, 0xA5, 0xCC, 0x94, 0xCC, + 0x81, 0xF6, 0xCE, 0xA5, 0xCC, 0x94, 0xCD, 0x82, + 0xF6, 0xCF, 0x89, 0xCC, 0x93, 0xF6, 0xCF, 0x89, + 0xCC, 0x94, 0xF6, 0xCF, 0x89, 0xCC, 0x93, 0xCC, + 0x80, 0xF6, 0xCF, 0x89, 0xCC, 0x94, 0xCC, 0x80, + 0xF6, 0xCF, 0x89, 0xCC, 0x93, 0xCC, 0x81, 0xF6, + 0xCF, 0x89, 0xCC, 0x94, 0xCC, 0x81, 0xF6, 0xCF, + 0x89, 0xCC, 0x93, 0xCD, 0x82, 0xF6, 0xCF, 0x89, + 0xCC, 0x94, 0xCD, 0x82, 0xF6, 0xCE, 0xA9, 0xCC, + 0x93, 0xF6, 0xCE, 0xA9, 0xCC, 0x94, 0xF6, 0xCE, + 0xA9, 0xCC, 0x93, 0xCC, 0x80, 0xF6, 0xCE, 0xA9, + 0xCC, 0x94, 0xCC, 0x80, 0xF6, 0xCE, 0xA9, 0xCC, + 0x93, 0xCC, 0x81, 0xF6, 0xCE, 0xA9, 0xCC, 0x94, + 0xCC, 0x81, 0xF6, 0xCE, 0xA9, 0xCC, 0x93, 0xCD, + 0x82, 0xF6, 0xCE, 0xA9, 0xCC, 0x94, 0xCD, 0x82, + 0xF6, 0xCE, 0xB1, 0xCC, 0x80, 0xF6, 0xCE, 0xB1, + 0xCC, 0x81, 0xF6, 0xCE, 0xB5, 0xCC, 0x80, 0xF6, + 0xCE, 0xB5, 0xCC, 0x81, 0xF6, 0xCE, 0xB7, 0xCC, + 0x80, 0xF6, 0xCE, 0xB7, 0xCC, 0x81, 0xF6, 0xCE, + 0xB9, 0xCC, 0x80, 0xF6, 0xCE, 0xB9, 0xCC, 0x81, + 0xF6, 0xCE, 0xBF, 0xCC, 0x80, 0xF6, 0xCE, 0xBF, + 0xCC, 0x81, 0xF6, 0xCF, 0x85, 0xCC, 0x80, 0xF6, + 0xCF, 0x85, 0xCC, 0x81, 0xF6, 0xCF, 0x89, 0xCC, + 0x80, 0xF6, 0xCF, 0x89, 0xCC, 0x81, 0xF6, 0xCE, + 0xB1, 0xCC, 0x93, 0xCD, 0x85, 0xF6, 0xCE, 0xB1, + 0xCC, 0x94, 0xCD, 0x85, 0xF6, 0xCE, 0xB1, 0xCC, + 0x93, 0xCC, 0x80, 0xCD, 0x85, 0xF6, 0xCE, 0xB1, + 0xCC, 0x94, 0xCC, 0x80, 0xCD, 0x85, 0xF6, 0xCE, + 0xB1, 0xCC, 0x93, 0xCC, 0x81, 0xCD, 0x85, 0xF6, + 0xCE, 0xB1, 0xCC, 0x94, 0xCC, 0x81, 0xCD, 0x85, + 0xF6, 0xCE, 0xB1, 0xCC, 0x93, 0xCD, 0x82, 0xCD, + 0x85, 0xF6, 0xCE, 0xB1, 0xCC, 0x94, 0xCD, 0x82, + 0xCD, 0x85, 0xF6, 0xCE, 0x91, 0xCC, 0x93, 0xCD, + 0x85, 0xF6, 0xCE, 0x91, 0xCC, 0x94, 0xCD, 0x85, + 0xF6, 0xCE, 0x91, 0xCC, 0x93, 0xCC, 0x80, 0xCD, + 0x85, 0xF6, 0xCE, 0x91, 0xCC, 0x94, 0xCC, 0x80, + 0xCD, 0x85, 0xF6, 0xCE, 0x91, 0xCC, 0x93, 0xCC, + 0x81, 0xCD, 0x85, 0xF6, 0xCE, 0x91, 0xCC, 0x94, + 0xCC, 0x81, 0xCD, 0x85, 0xF6, 0xCE, 0x91, 0xCC, + 0x93, 0xCD, 0x82, 0xCD, 0x85, 0xF6, 0xCE, 0x91, + 0xCC, 0x94, 0xCD, 0x82, 0xCD, 0x85, 0xF6, 0xCE, + 0xB7, 0xCC, 0x93, 0xCD, 0x85, 0xF6, 0xCE, 0xB7, + 0xCC, 0x94, 0xCD, 0x85, 0xF6, 0xCE, 0xB7, 0xCC, + 0x93, 0xCC, 0x80, 0xCD, 0x85, 0xF6, 0xCE, 0xB7, + 0xCC, 0x94, 0xCC, 0x80, 0xCD, 0x85, 0xF6, 0xCE, + 0xB7, 0xCC, 0x93, 0xCC, 0x81, 0xCD, 0x85, 0xF6, + 0xCE, 0xB7, 0xCC, 0x94, 0xCC, 0x81, 0xCD, 0x85, + 0xF6, 0xCE, 0xB7, 0xCC, 0x93, 0xCD, 0x82, 0xCD, + 0x85, 0xF6, 0xCE, 0xB7, 0xCC, 0x94, 0xCD, 0x82, + 0xCD, 0x85, 0xF6, 0xCE, 0x97, 0xCC, 0x93, 0xCD, + 0x85, 0xF6, 0xCE, 0x97, 0xCC, 0x94, 0xCD, 0x85, + 
0xF6, 0xCE, 0x97, 0xCC, 0x93, 0xCC, 0x80, 0xCD, + 0x85, 0xF6, 0xCE, 0x97, 0xCC, 0x94, 0xCC, 0x80, + 0xCD, 0x85, 0xF6, 0xCE, 0x97, 0xCC, 0x93, 0xCC, + 0x81, 0xCD, 0x85, 0xF6, 0xCE, 0x97, 0xCC, 0x94, + 0xCC, 0x81, 0xCD, 0x85, 0xF6, 0xCE, 0x97, 0xCC, + 0x93, 0xCD, 0x82, 0xCD, 0x85, 0xF6, 0xCE, 0x97, + 0xCC, 0x94, 0xCD, 0x82, 0xCD, 0x85, 0xF6, 0xCF, + 0x89, 0xCC, 0x93, 0xCD, 0x85, 0xF6, 0xCF, 0x89, + 0xCC, 0x94, 0xCD, 0x85, 0xF6, 0xCF, 0x89, 0xCC, + 0x93, 0xCC, 0x80, 0xCD, 0x85, 0xF6, 0xCF, 0x89, + 0xCC, 0x94, 0xCC, 0x80, 0xCD, 0x85, 0xF6, 0xCF, + 0x89, 0xCC, 0x93, 0xCC, 0x81, 0xCD, 0x85, 0xF6, + 0xCF, 0x89, 0xCC, 0x94, 0xCC, 0x81, 0xCD, 0x85, + 0xF6, 0xCF, 0x89, 0xCC, 0x93, 0xCD, 0x82, 0xCD, + 0x85, 0xF6, 0xCF, 0x89, 0xCC, 0x94, 0xCD, 0x82, + 0xCD, 0x85, 0xF6, 0xCE, 0xA9, 0xCC, 0x93, 0xCD, + 0x85, 0xF6, 0xCE, 0xA9, 0xCC, 0x94, 0xCD, 0x85, + 0xF6, 0xCE, 0xA9, 0xCC, 0x93, 0xCC, 0x80, 0xCD, + 0x85, 0xF6, 0xCE, 0xA9, 0xCC, 0x94, 0xCC, 0x80, + 0xCD, 0x85, 0xF6, 0xCE, 0xA9, 0xCC, 0x93, 0xCC, + 0x81, 0xCD, 0x85, 0xF6, 0xCE, 0xA9, 0xCC, 0x94, + 0xCC, 0x81, 0xCD, 0x85, 0xF6, 0xCE, 0xA9, 0xCC, + 0x93, 0xCD, 0x82, 0xCD, 0x85, 0xF6, 0xCE, 0xA9, + 0xCC, 0x94, 0xCD, 0x82, 0xCD, 0x85, 0xF6, 0xCE, + 0xB1, 0xCC, 0x86, 0xF6, 0xCE, 0xB1, 0xCC, 0x84, + 0xF6, 0xCE, 0xB1, 0xCC, 0x80, 0xCD, 0x85, 0xF6, + 0xCE, 0xB1, 0xCD, 0x85, 0xF6, 0xCE, 0xB1, 0xCC, + 0x81, 0xCD, 0x85, 0xF6, 0xCE, 0xB1, 0xCD, 0x82, + 0xF6, 0xCE, 0xB1, 0xCD, 0x82, 0xCD, 0x85, 0xF6, + 0xCE, 0x91, 0xCC, 0x86, 0xF6, 0xCE, 0x91, 0xCC, + 0x84, 0xF6, 0xCE, 0x91, 0xCC, 0x80, 0xF6, 0xCE, + 0x91, 0xCC, 0x81, 0xF6, 0xCE, 0x91, 0xCD, 0x85, + 0x20, 0xCC, 0x93, 0xF6, 0xCE, 0xB9, 0x20, 0xCC, + 0x93, 0x20, 0xCD, 0x82, 0xF5, 0x05, 0xC2, 0xA8, + 0xCD, 0x82, 0x20, 0xCC, 0x88, 0xCD, 0x82, 0xF6, + 0xCE, 0xB7, 0xCC, 0x80, 0xCD, 0x85, 0xF6, 0xCE, + 0xB7, 0xCD, 0x85, 0xF6, 0xCE, 0xB7, 0xCC, 0x81, + 0xCD, 0x85, 0xF6, 0xCE, 0xB7, 0xCD, 0x82, 0xF6, + 0xCE, 0xB7, 0xCD, 0x82, 0xCD, 0x85, 0xF6, 0xCE, + 0x95, 0xCC, 0x80, 0xF6, 0xCE, 0x95, 0xCC, 0x81, + 0xF6, 0xCE, 0x97, 0xCC, 0x80, 0xF6, 0xCE, 0x97, + 0xCC, 0x81, 0xF6, 0xCE, 0x97, 0xCD, 0x85, 0xF5, + 0x06, 0xE1, 0xBE, 0xBF, 0xCC, 0x80, 0x20, 0xCC, + 0x93, 0xCC, 0x80, 0xF5, 0x06, 0xE1, 0xBE, 0xBF, + 0xCC, 0x81, 0x20, 0xCC, 0x93, 0xCC, 0x81, 0xF5, + 0x06, 0xE1, 0xBE, 0xBF, 0xCD, 0x82, 0x20, 0xCC, + 0x93, 0xCD, 0x82, 0xF6, 0xCE, 0xB9, 0xCC, 0x86, + 0xF6, 0xCE, 0xB9, 0xCC, 0x84, 0xF6, 0xCE, 0xB9, + 0xCC, 0x88, 0xCC, 0x80, 0xF6, 0xCE, 0xB9, 0xCC, + 0x88, 0xCC, 0x81, 0xF6, 0xCE, 0xB9, 0xCD, 0x82, + 0xF6, 0xCE, 0xB9, 0xCC, 0x88, 0xCD, 0x82, 0xF6, + 0xCE, 0x99, 0xCC, 0x86, 0xF6, 0xCE, 0x99, 0xCC, + 0x84, 0xF6, 0xCE, 0x99, 0xCC, 0x80, 0xF6, 0xCE, + 0x99, 0xCC, 0x81, 0xF5, 0x06, 0xE1, 0xBF, 0xBE, + 0xCC, 0x80, 0x20, 0xCC, 0x94, 0xCC, 0x80, 0xF5, + 0x06, 0xE1, 0xBF, 0xBE, 0xCC, 0x81, 0x20, 0xCC, + 0x94, 0xCC, 0x81, 0xF5, 0x06, 0xE1, 0xBF, 0xBE, + 0xCD, 0x82, 0x20, 0xCC, 0x94, 0xCD, 0x82, 0xF6, + 0xCF, 0x85, 0xCC, 0x86, 0xF6, 0xCF, 0x85, 0xCC, + 0x84, 0xF6, 0xCF, 0x85, 0xCC, 0x88, 0xCC, 0x80, + 0xF6, 0xCF, 0x85, 0xCC, 0x88, 0xCC, 0x81, 0xF6, + 0xCF, 0x81, 0xCC, 0x93, 0xF6, 0xCF, 0x81, 0xCC, + 0x94, 0xF6, 0xCF, 0x85, 0xCD, 0x82, 0xF6, 0xCF, + 0x85, 0xCC, 0x88, 0xCD, 0x82, 0xF6, 0xCE, 0xA5, + 0xCC, 0x86, 0xF6, 0xCE, 0xA5, 0xCC, 0x84, 0xF6, + 0xCE, 0xA5, 0xCC, 0x80, 0xF6, 0xCE, 0xA5, 0xCC, + 0x81, 0xF6, 0xCE, 0xA1, 0xCC, 0x94, 0xF5, 0x05, + 0xC2, 0xA8, 0xCC, 0x80, 0x20, 0xCC, 0x88, 0xCC, + 0x80, 0xF5, 0x05, 0xC2, 0xA8, 0xCC, 0x81, 0x20, + 0xCC, 0x88, 0xCC, 0x81, 0xF6, 0x60, 0xF6, 0xCF, + 0x89, 0xCC, 0x80, 0xCD, 0x85, 0xF6, 0xCF, 0x89, + 
0xCD, 0x85, 0xF6, 0xCF, 0x89, 0xCC, 0x81, 0xCD, + 0x85, 0xF6, 0xCF, 0x89, 0xCD, 0x82, 0xF6, 0xCF, + 0x89, 0xCD, 0x82, 0xCD, 0x85, 0xF6, 0xCE, 0x9F, + 0xCC, 0x80, 0xF6, 0xCE, 0x9F, 0xCC, 0x81, 0xF6, + 0xCE, 0xA9, 0xCC, 0x80, 0xF6, 0xCE, 0xA9, 0xCC, + 0x81, 0xF6, 0xCE, 0xA9, 0xCD, 0x85, 0xF5, 0x03, + 0xC2, 0xB4, 0x20, 0xCC, 0x81, 0x20, 0xCC, 0x94, + 0xF5, 0x04, 0xE2, 0x80, 0x82, 0x20, 0xF5, 0x04, + 0xE2, 0x80, 0x83, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0xE2, 0x80, 0x90, + 0x20, 0xCC, 0xB3, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, + 0x2E, 0x20, 0xE2, 0x80, 0xB2, 0xE2, 0x80, 0xB2, + 0xE2, 0x80, 0xB2, 0xE2, 0x80, 0xB2, 0xE2, 0x80, + 0xB2, 0xE2, 0x80, 0xB5, 0xE2, 0x80, 0xB5, 0xE2, + 0x80, 0xB5, 0xE2, 0x80, 0xB5, 0xE2, 0x80, 0xB5, + 0x21, 0x21, 0x20, 0xCC, 0x85, 0x3F, 0x3F, 0x3F, + 0x21, 0x21, 0x3F, 0xE2, 0x80, 0xB2, 0xE2, 0x80, + 0xB2, 0xE2, 0x80, 0xB2, 0xE2, 0x80, 0xB2, 0x20, + 0x30, 0x69, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, + 0x2B, 0xE2, 0x88, 0x92, 0x3D, 0x28, 0x29, 0x6E, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x38, 0x39, 0x2B, 0xE2, 0x88, 0x92, 0x3D, 0x28, + 0x29, 0x61, 0x65, 0x6F, 0x78, 0xC9, 0x99, 0x52, + 0x73, 0x61, 0x2F, 0x63, 0x61, 0x2F, 0x73, 0x43, + 0xC2, 0xB0, 0x43, 0x63, 0x2F, 0x6F, 0x63, 0x2F, + 0x75, 0xC6, 0x90, 0xC2, 0xB0, 0x46, 0x67, 0x48, + 0x48, 0x48, 0x68, 0xC4, 0xA7, 0x49, 0x49, 0x4C, + 0x6C, 0x4E, 0x4E, 0x6F, 0x50, 0x51, 0x52, 0x52, + 0x52, 0x53, 0x4D, 0x54, 0x45, 0x4C, 0x54, 0x4D, + 0x5A, 0xF6, 0xCE, 0xA9, 0x5A, 0xF6, 0x4B, 0xF6, + 0x41, 0xCC, 0x8A, 0x42, 0x43, 0x65, 0x45, 0x46, + 0x4D, 0x6F, 0xD7, 0x90, 0xD7, 0x91, 0xD7, 0x92, + 0xD7, 0x93, 0x69, 0x46, 0x41, 0x58, 0xCF, 0x80, + 0xCE, 0xB3, 0xCE, 0x93, 0xCE, 0xA0, 0xE2, 0x88, + 0x91, 0x44, 0x64, 0x65, 0x69, 0x6A, 0x31, 0xE2, + 0x81, 0x84, 0x33, 0x32, 0xE2, 0x81, 0x84, 0x33, + 0x31, 0xE2, 0x81, 0x84, 0x35, 0x32, 0xE2, 0x81, + 0x84, 0x35, 0x33, 0xE2, 0x81, 0x84, 0x35, 0x34, + 0xE2, 0x81, 0x84, 0x35, 0x31, 0xE2, 0x81, 0x84, + 0x36, 0x35, 0xE2, 0x81, 0x84, 0x36, 0x31, 0xE2, + 0x81, 0x84, 0x38, 0x33, 0xE2, 0x81, 0x84, 0x38, + 0x35, 0xE2, 0x81, 0x84, 0x38, 0x37, 0xE2, 0x81, + 0x84, 0x38, 0x31, 0xE2, 0x81, 0x84, 0x49, 0x49, + 0x49, 0x49, 0x49, 0x49, 0x49, 0x56, 0x56, 0x56, + 0x49, 0x56, 0x49, 0x49, 0x56, 0x49, 0x49, 0x49, + 0x49, 0x58, 0x58, 0x58, 0x49, 0x58, 0x49, 0x49, + 0x4C, 0x43, 0x44, 0x4D, 0x69, 0x69, 0x69, 0x69, + 0x69, 0x69, 0x69, 0x76, 0x76, 0x76, 0x69, 0x76, + 0x69, 0x69, 0x76, 0x69, 0x69, 0x69, 0x69, 0x78, + 0x78, 0x78, 0x69, 0x78, 0x69, 0x69, 0x6C, 0x63, + 0x64, 0x6D, 0xF6, 0xE2, 0x86, 0x90, 0xCC, 0xB8, + 0xF6, 0xE2, 0x86, 0x92, 0xCC, 0xB8, 0xF6, 0xE2, + 0x86, 0x94, 0xCC, 0xB8, 0xF6, 0xE2, 0x87, 0x90, + 0xCC, 0xB8, 0xF6, 0xE2, 0x87, 0x94, 0xCC, 0xB8, + 0xF6, 0xE2, 0x87, 0x92, 0xCC, 0xB8, 0xF6, 0xE2, + 0x88, 0x83, 0xCC, 0xB8, 0xF6, 0xE2, 0x88, 0x88, + 0xCC, 0xB8, 0xF6, 0xE2, 0x88, 0x8B, 0xCC, 0xB8, + 0xF6, 0xE2, 0x88, 0xA3, 0xCC, 0xB8, 0xF6, 0xE2, + 0x88, 0xA5, 0xCC, 0xB8, 0xE2, 0x88, 0xAB, 0xE2, + 0x88, 0xAB, 0xE2, 0x88, 0xAB, 0xE2, 0x88, 0xAB, + 0xE2, 0x88, 0xAB, 0xE2, 0x88, 0xAE, 0xE2, 0x88, + 0xAE, 0xE2, 0x88, 0xAE, 0xE2, 0x88, 0xAE, 0xE2, + 0x88, 0xAE, 0xF6, 0xE2, 0x88, 0xBC, 0xCC, 0xB8, + 0xF6, 0xE2, 0x89, 0x83, 0xCC, 0xB8, 0xF6, 0xE2, + 0x89, 0x85, 0xCC, 0xB8, 0xF6, 0xE2, 0x89, 0x88, + 0xCC, 0xB8, 0xF6, 0x3D, 0xCC, 0xB8, 0xF6, 0xE2, + 0x89, 0xA1, 0xCC, 0xB8, 0xF6, 0xE2, 0x89, 0x8D, + 0xCC, 0xB8, 0xF6, 0x3C, 0xCC, 0xB8, 0xF6, 0x3E, + 0xCC, 0xB8, 0xF6, 0xE2, 0x89, 0xA4, 0xCC, 0xB8, + 0xF6, 0xE2, 0x89, 0xA5, 0xCC, 0xB8, 0xF6, 0xE2, + 0x89, 0xB2, 0xCC, 0xB8, 0xF6, 0xE2, 0x89, 0xB3, + 
0xCC, 0xB8, 0xF6, 0xE2, 0x89, 0xB6, 0xCC, 0xB8, + 0xF6, 0xE2, 0x89, 0xB7, 0xCC, 0xB8, 0xF6, 0xE2, + 0x89, 0xBA, 0xCC, 0xB8, 0xF6, 0xE2, 0x89, 0xBB, + 0xCC, 0xB8, 0xF6, 0xE2, 0x8A, 0x82, 0xCC, 0xB8, + 0xF6, 0xE2, 0x8A, 0x83, 0xCC, 0xB8, 0xF6, 0xE2, + 0x8A, 0x86, 0xCC, 0xB8, 0xF6, 0xE2, 0x8A, 0x87, + 0xCC, 0xB8, 0xF6, 0xE2, 0x8A, 0xA2, 0xCC, 0xB8, + 0xF6, 0xE2, 0x8A, 0xA8, 0xCC, 0xB8, 0xF6, 0xE2, + 0x8A, 0xA9, 0xCC, 0xB8, 0xF6, 0xE2, 0x8A, 0xAB, + 0xCC, 0xB8, 0xF6, 0xE2, 0x89, 0xBC, 0xCC, 0xB8, + 0xF6, 0xE2, 0x89, 0xBD, 0xCC, 0xB8, 0xF6, 0xE2, + 0x8A, 0x91, 0xCC, 0xB8, 0xF6, 0xE2, 0x8A, 0x92, + 0xCC, 0xB8, 0xF6, 0xE2, 0x8A, 0xB2, 0xCC, 0xB8, + 0xF6, 0xE2, 0x8A, 0xB3, 0xCC, 0xB8, 0xF6, 0xE2, + 0x8A, 0xB4, 0xCC, 0xB8, 0xF6, 0xE2, 0x8A, 0xB5, + 0xCC, 0xB8, 0xF6, 0xE3, 0x80, 0x88, 0xF6, 0xE3, + 0x80, 0x89, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, + 0x37, 0x38, 0x39, 0x31, 0x30, 0x31, 0x31, 0x31, + 0x32, 0x31, 0x33, 0x31, 0x34, 0x31, 0x35, 0x31, + 0x36, 0x31, 0x37, 0x31, 0x38, 0x31, 0x39, 0x32, + 0x30, 0x28, 0x31, 0x29, 0x28, 0x32, 0x29, 0x28, + 0x33, 0x29, 0x28, 0x34, 0x29, 0x28, 0x35, 0x29, + 0x28, 0x36, 0x29, 0x28, 0x37, 0x29, 0x28, 0x38, + 0x29, 0x28, 0x39, 0x29, 0x28, 0x31, 0x30, 0x29, + 0x28, 0x31, 0x31, 0x29, 0x28, 0x31, 0x32, 0x29, + 0x28, 0x31, 0x33, 0x29, 0x28, 0x31, 0x34, 0x29, + 0x28, 0x31, 0x35, 0x29, 0x28, 0x31, 0x36, 0x29, + 0x28, 0x31, 0x37, 0x29, 0x28, 0x31, 0x38, 0x29, + 0x28, 0x31, 0x39, 0x29, 0x28, 0x32, 0x30, 0x29, + 0x31, 0x2E, 0x32, 0x2E, 0x33, 0x2E, 0x34, 0x2E, + 0x35, 0x2E, 0x36, 0x2E, 0x37, 0x2E, 0x38, 0x2E, + 0x39, 0x2E, 0x31, 0x30, 0x2E, 0x31, 0x31, 0x2E, + 0x31, 0x32, 0x2E, 0x31, 0x33, 0x2E, 0x31, 0x34, + 0x2E, 0x31, 0x35, 0x2E, 0x31, 0x36, 0x2E, 0x31, + 0x37, 0x2E, 0x31, 0x38, 0x2E, 0x31, 0x39, 0x2E, + 0x32, 0x30, 0x2E, 0x28, 0x61, 0x29, 0x28, 0x62, + 0x29, 0x28, 0x63, 0x29, 0x28, 0x64, 0x29, 0x28, + 0x65, 0x29, 0x28, 0x66, 0x29, 0x28, 0x67, 0x29, + 0x28, 0x68, 0x29, 0x28, 0x69, 0x29, 0x28, 0x6A, + 0x29, 0x28, 0x6B, 0x29, 0x28, 0x6C, 0x29, 0x28, + 0x6D, 0x29, 0x28, 0x6E, 0x29, 0x28, 0x6F, 0x29, + 0x28, 0x70, 0x29, 0x28, 0x71, 0x29, 0x28, 0x72, + 0x29, 0x28, 0x73, 0x29, 0x28, 0x74, 0x29, 0x28, + 0x75, 0x29, 0x28, 0x76, 0x29, 0x28, 0x77, 0x29, + 0x28, 0x78, 0x29, 0x28, 0x79, 0x29, 0x28, 0x7A, + 0x29, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, + 0x58, 0x59, 0x5A, 0x61, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, + 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, + 0x76, 0x77, 0x78, 0x79, 0x7A, 0x30, 0xE2, 0x88, + 0xAB, 0xE2, 0x88, 0xAB, 0xE2, 0x88, 0xAB, 0xE2, + 0x88, 0xAB, 0x3A, 0x3A, 0x3D, 0x3D, 0x3D, 0x3D, + 0x3D, 0x3D, 0xF6, 0xE2, 0xAB, 0x9D, 0xCC, 0xB8, + 0xE2, 0xB5, 0xA1, 0xE6, 0xAF, 0x8D, 0xE9, 0xBE, + 0x9F, 0xE4, 0xB8, 0x80, 0xE4, 0xB8, 0xA8, 0xE4, + 0xB8, 0xB6, 0xE4, 0xB8, 0xBF, 0xE4, 0xB9, 0x99, + 0xE4, 0xBA, 0x85, 0xE4, 0xBA, 0x8C, 0xE4, 0xBA, + 0xA0, 0xE4, 0xBA, 0xBA, 0xE5, 0x84, 0xBF, 0xE5, + 0x85, 0xA5, 0xE5, 0x85, 0xAB, 0xE5, 0x86, 0x82, + 0xE5, 0x86, 0x96, 0xE5, 0x86, 0xAB, 0xE5, 0x87, + 0xA0, 0xE5, 0x87, 0xB5, 0xE5, 0x88, 0x80, 0xE5, + 0x8A, 0x9B, 0xE5, 0x8B, 0xB9, 0xE5, 0x8C, 0x95, + 0xE5, 0x8C, 0x9A, 0xE5, 0x8C, 0xB8, 0xE5, 0x8D, + 0x81, 0xE5, 0x8D, 0x9C, 0xE5, 0x8D, 0xA9, 0xE5, + 0x8E, 0x82, 0xE5, 0x8E, 0xB6, 0xE5, 0x8F, 0x88, + 0xE5, 0x8F, 0xA3, 0xE5, 0x9B, 0x97, 0xE5, 0x9C, + 0x9F, 0xE5, 0xA3, 0xAB, 0xE5, 0xA4, 0x82, 0xE5, + 0xA4, 0x8A, 0xE5, 0xA4, 0x95, 0xE5, 0xA4, 0xA7, + 0xE5, 0xA5, 0xB3, 0xE5, 0xAD, 0x90, 0xE5, 0xAE, + 
0x80, 0xE5, 0xAF, 0xB8, 0xE5, 0xB0, 0x8F, 0xE5, + 0xB0, 0xA2, 0xE5, 0xB0, 0xB8, 0xE5, 0xB1, 0xAE, + 0xE5, 0xB1, 0xB1, 0xE5, 0xB7, 0x9B, 0xE5, 0xB7, + 0xA5, 0xE5, 0xB7, 0xB1, 0xE5, 0xB7, 0xBE, 0xE5, + 0xB9, 0xB2, 0xE5, 0xB9, 0xBA, 0xE5, 0xB9, 0xBF, + 0xE5, 0xBB, 0xB4, 0xE5, 0xBB, 0xBE, 0xE5, 0xBC, + 0x8B, 0xE5, 0xBC, 0x93, 0xE5, 0xBD, 0x90, 0xE5, + 0xBD, 0xA1, 0xE5, 0xBD, 0xB3, 0xE5, 0xBF, 0x83, + 0xE6, 0x88, 0x88, 0xE6, 0x88, 0xB6, 0xE6, 0x89, + 0x8B, 0xE6, 0x94, 0xAF, 0xE6, 0x94, 0xB4, 0xE6, + 0x96, 0x87, 0xE6, 0x96, 0x97, 0xE6, 0x96, 0xA4, + 0xE6, 0x96, 0xB9, 0xE6, 0x97, 0xA0, 0xE6, 0x97, + 0xA5, 0xE6, 0x9B, 0xB0, 0xE6, 0x9C, 0x88, 0xE6, + 0x9C, 0xA8, 0xE6, 0xAC, 0xA0, 0xE6, 0xAD, 0xA2, + 0xE6, 0xAD, 0xB9, 0xE6, 0xAE, 0xB3, 0xE6, 0xAF, + 0x8B, 0xE6, 0xAF, 0x94, 0xE6, 0xAF, 0x9B, 0xE6, + 0xB0, 0x8F, 0xE6, 0xB0, 0x94, 0xE6, 0xB0, 0xB4, + 0xE7, 0x81, 0xAB, 0xE7, 0x88, 0xAA, 0xE7, 0x88, + 0xB6, 0xE7, 0x88, 0xBB, 0xE7, 0x88, 0xBF, 0xE7, + 0x89, 0x87, 0xE7, 0x89, 0x99, 0xE7, 0x89, 0x9B, + 0xE7, 0x8A, 0xAC, 0xE7, 0x8E, 0x84, 0xE7, 0x8E, + 0x89, 0xE7, 0x93, 0x9C, 0xE7, 0x93, 0xA6, 0xE7, + 0x94, 0x98, 0xE7, 0x94, 0x9F, 0xE7, 0x94, 0xA8, + 0xE7, 0x94, 0xB0, 0xE7, 0x96, 0x8B, 0xE7, 0x96, + 0x92, 0xE7, 0x99, 0xB6, 0xE7, 0x99, 0xBD, 0xE7, + 0x9A, 0xAE, 0xE7, 0x9A, 0xBF, 0xE7, 0x9B, 0xAE, + 0xE7, 0x9F, 0x9B, 0xE7, 0x9F, 0xA2, 0xE7, 0x9F, + 0xB3, 0xE7, 0xA4, 0xBA, 0xE7, 0xA6, 0xB8, 0xE7, + 0xA6, 0xBE, 0xE7, 0xA9, 0xB4, 0xE7, 0xAB, 0x8B, + 0xE7, 0xAB, 0xB9, 0xE7, 0xB1, 0xB3, 0xE7, 0xB3, + 0xB8, 0xE7, 0xBC, 0xB6, 0xE7, 0xBD, 0x91, 0xE7, + 0xBE, 0x8A, 0xE7, 0xBE, 0xBD, 0xE8, 0x80, 0x81, + 0xE8, 0x80, 0x8C, 0xE8, 0x80, 0x92, 0xE8, 0x80, + 0xB3, 0xE8, 0x81, 0xBF, 0xE8, 0x82, 0x89, 0xE8, + 0x87, 0xA3, 0xE8, 0x87, 0xAA, 0xE8, 0x87, 0xB3, + 0xE8, 0x87, 0xBC, 0xE8, 0x88, 0x8C, 0xE8, 0x88, + 0x9B, 0xE8, 0x88, 0x9F, 0xE8, 0x89, 0xAE, 0xE8, + 0x89, 0xB2, 0xE8, 0x89, 0xB8, 0xE8, 0x99, 0x8D, + 0xE8, 0x99, 0xAB, 0xE8, 0xA1, 0x80, 0xE8, 0xA1, + 0x8C, 0xE8, 0xA1, 0xA3, 0xE8, 0xA5, 0xBE, 0xE8, + 0xA6, 0x8B, 0xE8, 0xA7, 0x92, 0xE8, 0xA8, 0x80, + 0xE8, 0xB0, 0xB7, 0xE8, 0xB1, 0x86, 0xE8, 0xB1, + 0x95, 0xE8, 0xB1, 0xB8, 0xE8, 0xB2, 0x9D, 0xE8, + 0xB5, 0xA4, 0xE8, 0xB5, 0xB0, 0xE8, 0xB6, 0xB3, + 0xE8, 0xBA, 0xAB, 0xE8, 0xBB, 0x8A, 0xE8, 0xBE, + 0x9B, 0xE8, 0xBE, 0xB0, 0xE8, 0xBE, 0xB5, 0xE9, + 0x82, 0x91, 0xE9, 0x85, 0x89, 0xE9, 0x87, 0x86, + 0xE9, 0x87, 0x8C, 0xE9, 0x87, 0x91, 0xE9, 0x95, + 0xB7, 0xE9, 0x96, 0x80, 0xE9, 0x98, 0x9C, 0xE9, + 0x9A, 0xB6, 0xE9, 0x9A, 0xB9, 0xE9, 0x9B, 0xA8, + 0xE9, 0x9D, 0x91, 0xE9, 0x9D, 0x9E, 0xE9, 0x9D, + 0xA2, 0xE9, 0x9D, 0xA9, 0xE9, 0x9F, 0x8B, 0xE9, + 0x9F, 0xAD, 0xE9, 0x9F, 0xB3, 0xE9, 0xA0, 0x81, + 0xE9, 0xA2, 0xA8, 0xE9, 0xA3, 0x9B, 0xE9, 0xA3, + 0x9F, 0xE9, 0xA6, 0x96, 0xE9, 0xA6, 0x99, 0xE9, + 0xA6, 0xAC, 0xE9, 0xAA, 0xA8, 0xE9, 0xAB, 0x98, + 0xE9, 0xAB, 0x9F, 0xE9, 0xAC, 0xA5, 0xE9, 0xAC, + 0xAF, 0xE9, 0xAC, 0xB2, 0xE9, 0xAC, 0xBC, 0xE9, + 0xAD, 0x9A, 0xE9, 0xB3, 0xA5, 0xE9, 0xB9, 0xB5, + 0xE9, 0xB9, 0xBF, 0xE9, 0xBA, 0xA5, 0xE9, 0xBA, + 0xBB, 0xE9, 0xBB, 0x83, 0xE9, 0xBB, 0x8D, 0xE9, + 0xBB, 0x91, 0xE9, 0xBB, 0xB9, 0xE9, 0xBB, 0xBD, + 0xE9, 0xBC, 0x8E, 0xE9, 0xBC, 0x93, 0xE9, 0xBC, + 0xA0, 0xE9, 0xBC, 0xBB, 0xE9, 0xBD, 0x8A, 0xE9, + 0xBD, 0x92, 0xE9, 0xBE, 0x8D, 0xE9, 0xBE, 0x9C, + 0xE9, 0xBE, 0xA0, 0x20, 0xE3, 0x80, 0x92, 0xE5, + 0x8D, 0x81, 0xE5, 0x8D, 0x84, 0xE5, 0x8D, 0x85, + 0xF6, 0xE3, 0x81, 0x8B, 0xE3, 0x82, 0x99, 0xF6, + 0xE3, 0x81, 0x8D, 0xE3, 0x82, 0x99, 0xF6, 0xE3, + 0x81, 0x8F, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x81, + 0x91, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x81, 0x93, + 
0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x81, 0x95, 0xE3, + 0x82, 0x99, 0xF6, 0xE3, 0x81, 0x97, 0xE3, 0x82, + 0x99, 0xF6, 0xE3, 0x81, 0x99, 0xE3, 0x82, 0x99, + 0xF6, 0xE3, 0x81, 0x9B, 0xE3, 0x82, 0x99, 0xF6, + 0xE3, 0x81, 0x9D, 0xE3, 0x82, 0x99, 0xF6, 0xE3, + 0x81, 0x9F, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x81, + 0xA1, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x81, 0xA4, + 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x81, 0xA6, 0xE3, + 0x82, 0x99, 0xF6, 0xE3, 0x81, 0xA8, 0xE3, 0x82, + 0x99, 0xF6, 0xE3, 0x81, 0xAF, 0xE3, 0x82, 0x99, + 0xF6, 0xE3, 0x81, 0xAF, 0xE3, 0x82, 0x9A, 0xF6, + 0xE3, 0x81, 0xB2, 0xE3, 0x82, 0x99, 0xF6, 0xE3, + 0x81, 0xB2, 0xE3, 0x82, 0x9A, 0xF6, 0xE3, 0x81, + 0xB5, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x81, 0xB5, + 0xE3, 0x82, 0x9A, 0xF6, 0xE3, 0x81, 0xB8, 0xE3, + 0x82, 0x99, 0xF6, 0xE3, 0x81, 0xB8, 0xE3, 0x82, + 0x9A, 0xF6, 0xE3, 0x81, 0xBB, 0xE3, 0x82, 0x99, + 0xF6, 0xE3, 0x81, 0xBB, 0xE3, 0x82, 0x9A, 0xF6, + 0xE3, 0x81, 0x86, 0xE3, 0x82, 0x99, 0x20, 0xE3, + 0x82, 0x99, 0x20, 0xE3, 0x82, 0x9A, 0xF6, 0xE3, + 0x82, 0x9D, 0xE3, 0x82, 0x99, 0xE3, 0x82, 0x88, + 0xE3, 0x82, 0x8A, 0xF6, 0xE3, 0x82, 0xAB, 0xE3, + 0x82, 0x99, 0xF6, 0xE3, 0x82, 0xAD, 0xE3, 0x82, + 0x99, 0xF6, 0xE3, 0x82, 0xAF, 0xE3, 0x82, 0x99, + 0xF6, 0xE3, 0x82, 0xB1, 0xE3, 0x82, 0x99, 0xF6, + 0xE3, 0x82, 0xB3, 0xE3, 0x82, 0x99, 0xF6, 0xE3, + 0x82, 0xB5, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x82, + 0xB7, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x82, 0xB9, + 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x82, 0xBB, 0xE3, + 0x82, 0x99, 0xF6, 0xE3, 0x82, 0xBD, 0xE3, 0x82, + 0x99, 0xF6, 0xE3, 0x82, 0xBF, 0xE3, 0x82, 0x99, + 0xF6, 0xE3, 0x83, 0x81, 0xE3, 0x82, 0x99, 0xF6, + 0xE3, 0x83, 0x84, 0xE3, 0x82, 0x99, 0xF6, 0xE3, + 0x83, 0x86, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x83, + 0x88, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x83, 0x8F, + 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x83, 0x8F, 0xE3, + 0x82, 0x9A, 0xF6, 0xE3, 0x83, 0x92, 0xE3, 0x82, + 0x99, 0xF6, 0xE3, 0x83, 0x92, 0xE3, 0x82, 0x9A, + 0xF6, 0xE3, 0x83, 0x95, 0xE3, 0x82, 0x99, 0xF6, + 0xE3, 0x83, 0x95, 0xE3, 0x82, 0x9A, 0xF6, 0xE3, + 0x83, 0x98, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x83, + 0x98, 0xE3, 0x82, 0x9A, 0xF6, 0xE3, 0x83, 0x9B, + 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x83, 0x9B, 0xE3, + 0x82, 0x9A, 0xF6, 0xE3, 0x82, 0xA6, 0xE3, 0x82, + 0x99, 0xF6, 0xE3, 0x83, 0xAF, 0xE3, 0x82, 0x99, + 0xF6, 0xE3, 0x83, 0xB0, 0xE3, 0x82, 0x99, 0xF6, + 0xE3, 0x83, 0xB1, 0xE3, 0x82, 0x99, 0xF6, 0xE3, + 0x83, 0xB2, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x83, + 0xBD, 0xE3, 0x82, 0x99, 0xE3, 0x82, 0xB3, 0xE3, + 0x83, 0x88, 0xE1, 0x84, 0x80, 0xE1, 0x84, 0x81, + 0xE1, 0x86, 0xAA, 0xE1, 0x84, 0x82, 0xE1, 0x86, + 0xAC, 0xE1, 0x86, 0xAD, 0xE1, 0x84, 0x83, 0xE1, + 0x84, 0x84, 0xE1, 0x84, 0x85, 0xE1, 0x86, 0xB0, + 0xE1, 0x86, 0xB1, 0xE1, 0x86, 0xB2, 0xE1, 0x86, + 0xB3, 0xE1, 0x86, 0xB4, 0xE1, 0x86, 0xB5, 0xE1, + 0x84, 0x9A, 0xE1, 0x84, 0x86, 0xE1, 0x84, 0x87, + 0xE1, 0x84, 0x88, 0xE1, 0x84, 0xA1, 0xE1, 0x84, + 0x89, 0xE1, 0x84, 0x8A, 0xE1, 0x84, 0x8B, 0xE1, + 0x84, 0x8C, 0xE1, 0x84, 0x8D, 0xE1, 0x84, 0x8E, + 0xE1, 0x84, 0x8F, 0xE1, 0x84, 0x90, 0xE1, 0x84, + 0x91, 0xE1, 0x84, 0x92, 0xE1, 0x85, 0xA1, 0xE1, + 0x85, 0xA2, 0xE1, 0x85, 0xA3, 0xE1, 0x85, 0xA4, + 0xE1, 0x85, 0xA5, 0xE1, 0x85, 0xA6, 0xE1, 0x85, + 0xA7, 0xE1, 0x85, 0xA8, 0xE1, 0x85, 0xA9, 0xE1, + 0x85, 0xAA, 0xE1, 0x85, 0xAB, 0xE1, 0x85, 0xAC, + 0xE1, 0x85, 0xAD, 0xE1, 0x85, 0xAE, 0xE1, 0x85, + 0xAF, 0xE1, 0x85, 0xB0, 0xE1, 0x85, 0xB1, 0xE1, + 0x85, 0xB2, 0xE1, 0x85, 0xB3, 0xE1, 0x85, 0xB4, + 0xE1, 0x85, 0xB5, 0xE1, 0x85, 0xA0, 0xE1, 0x84, + 0x94, 0xE1, 0x84, 0x95, 0xE1, 0x87, 0x87, 0xE1, + 0x87, 0x88, 0xE1, 0x87, 0x8C, 0xE1, 0x87, 0x8E, + 
0xE1, 0x87, 0x93, 0xE1, 0x87, 0x97, 0xE1, 0x87, + 0x99, 0xE1, 0x84, 0x9C, 0xE1, 0x87, 0x9D, 0xE1, + 0x87, 0x9F, 0xE1, 0x84, 0x9D, 0xE1, 0x84, 0x9E, + 0xE1, 0x84, 0xA0, 0xE1, 0x84, 0xA2, 0xE1, 0x84, + 0xA3, 0xE1, 0x84, 0xA7, 0xE1, 0x84, 0xA9, 0xE1, + 0x84, 0xAB, 0xE1, 0x84, 0xAC, 0xE1, 0x84, 0xAD, + 0xE1, 0x84, 0xAE, 0xE1, 0x84, 0xAF, 0xE1, 0x84, + 0xB2, 0xE1, 0x84, 0xB6, 0xE1, 0x85, 0x80, 0xE1, + 0x85, 0x87, 0xE1, 0x85, 0x8C, 0xE1, 0x87, 0xB1, + 0xE1, 0x87, 0xB2, 0xE1, 0x85, 0x97, 0xE1, 0x85, + 0x98, 0xE1, 0x85, 0x99, 0xE1, 0x86, 0x84, 0xE1, + 0x86, 0x85, 0xE1, 0x86, 0x88, 0xE1, 0x86, 0x91, + 0xE1, 0x86, 0x92, 0xE1, 0x86, 0x94, 0xE1, 0x86, + 0x9E, 0xE1, 0x86, 0xA1, 0xE4, 0xB8, 0x80, 0xE4, + 0xBA, 0x8C, 0xE4, 0xB8, 0x89, 0xE5, 0x9B, 0x9B, + 0xE4, 0xB8, 0x8A, 0xE4, 0xB8, 0xAD, 0xE4, 0xB8, + 0x8B, 0xE7, 0x94, 0xB2, 0xE4, 0xB9, 0x99, 0xE4, + 0xB8, 0x99, 0xE4, 0xB8, 0x81, 0xE5, 0xA4, 0xA9, + 0xE5, 0x9C, 0xB0, 0xE4, 0xBA, 0xBA, 0x28, 0xE1, + 0x84, 0x80, 0x29, 0x28, 0xE1, 0x84, 0x82, 0x29, + 0x28, 0xE1, 0x84, 0x83, 0x29, 0x28, 0xE1, 0x84, + 0x85, 0x29, 0x28, 0xE1, 0x84, 0x86, 0x29, 0x28, + 0xE1, 0x84, 0x87, 0x29, 0x28, 0xE1, 0x84, 0x89, + 0x29, 0x28, 0xE1, 0x84, 0x8B, 0x29, 0x28, 0xE1, + 0x84, 0x8C, 0x29, 0x28, 0xE1, 0x84, 0x8E, 0x29, + 0x28, 0xE1, 0x84, 0x8F, 0x29, 0x28, 0xE1, 0x84, + 0x90, 0x29, 0x28, 0xE1, 0x84, 0x91, 0x29, 0x28, + 0xE1, 0x84, 0x92, 0x29, 0x28, 0xE1, 0x84, 0x80, + 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, 0x82, + 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, 0x83, + 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, 0x85, + 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, 0x86, + 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, 0x87, + 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, 0x89, + 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, 0x8B, + 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, 0x8C, + 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, 0x8E, + 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, 0x8F, + 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, 0x90, + 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, 0x91, + 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, 0x92, + 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, 0x8C, + 0xE1, 0x85, 0xAE, 0x29, 0x28, 0xE1, 0x84, 0x8B, + 0xE1, 0x85, 0xA9, 0xE1, 0x84, 0x8C, 0xE1, 0x85, + 0xA5, 0xE1, 0x86, 0xAB, 0x29, 0x28, 0xE1, 0x84, + 0x8B, 0xE1, 0x85, 0xA9, 0xE1, 0x84, 0x92, 0xE1, + 0x85, 0xAE, 0x29, 0x28, 0xE4, 0xB8, 0x80, 0x29, + 0x28, 0xE4, 0xBA, 0x8C, 0x29, 0x28, 0xE4, 0xB8, + 0x89, 0x29, 0x28, 0xE5, 0x9B, 0x9B, 0x29, 0x28, + 0xE4, 0xBA, 0x94, 0x29, 0x28, 0xE5, 0x85, 0xAD, + 0x29, 0x28, 0xE4, 0xB8, 0x83, 0x29, 0x28, 0xE5, + 0x85, 0xAB, 0x29, 0x28, 0xE4, 0xB9, 0x9D, 0x29, + 0x28, 0xE5, 0x8D, 0x81, 0x29, 0x28, 0xE6, 0x9C, + 0x88, 0x29, 0x28, 0xE7, 0x81, 0xAB, 0x29, 0x28, + 0xE6, 0xB0, 0xB4, 0x29, 0x28, 0xE6, 0x9C, 0xA8, + 0x29, 0x28, 0xE9, 0x87, 0x91, 0x29, 0x28, 0xE5, + 0x9C, 0x9F, 0x29, 0x28, 0xE6, 0x97, 0xA5, 0x29, + 0x28, 0xE6, 0xA0, 0xAA, 0x29, 0x28, 0xE6, 0x9C, + 0x89, 0x29, 0x28, 0xE7, 0xA4, 0xBE, 0x29, 0x28, + 0xE5, 0x90, 0x8D, 0x29, 0x28, 0xE7, 0x89, 0xB9, + 0x29, 0x28, 0xE8, 0xB2, 0xA1, 0x29, 0x28, 0xE7, + 0xA5, 0x9D, 0x29, 0x28, 0xE5, 0x8A, 0xB4, 0x29, + 0x28, 0xE4, 0xBB, 0xA3, 0x29, 0x28, 0xE5, 0x91, + 0xBC, 0x29, 0x28, 0xE5, 0xAD, 0xA6, 0x29, 0x28, + 0xE7, 0x9B, 0xA3, 0x29, 0x28, 0xE4, 0xBC, 0x81, + 0x29, 0x28, 0xE8, 0xB3, 0x87, 0x29, 0x28, 0xE5, + 0x8D, 0x94, 0x29, 0x28, 0xE7, 0xA5, 0xAD, 0x29, + 0x28, 0xE4, 0xBC, 0x91, 0x29, 0x28, 0xE8, 0x87, + 0xAA, 0x29, 0x28, 0xE8, 0x87, 0xB3, 0x29, 0x50, + 0x54, 0x45, 0x32, 0x31, 0x32, 0x32, 0x32, 0x33, + 0x32, 0x34, 0x32, 0x35, 0x32, 0x36, 0x32, 0x37, + 
0x32, 0x38, 0x32, 0x39, 0x33, 0x30, 0x33, 0x31, + 0x33, 0x32, 0x33, 0x33, 0x33, 0x34, 0x33, 0x35, + 0xE1, 0x84, 0x80, 0xE1, 0x84, 0x82, 0xE1, 0x84, + 0x83, 0xE1, 0x84, 0x85, 0xE1, 0x84, 0x86, 0xE1, + 0x84, 0x87, 0xE1, 0x84, 0x89, 0xE1, 0x84, 0x8B, + 0xE1, 0x84, 0x8C, 0xE1, 0x84, 0x8E, 0xE1, 0x84, + 0x8F, 0xE1, 0x84, 0x90, 0xE1, 0x84, 0x91, 0xE1, + 0x84, 0x92, 0xE1, 0x84, 0x80, 0xE1, 0x85, 0xA1, + 0xE1, 0x84, 0x82, 0xE1, 0x85, 0xA1, 0xE1, 0x84, + 0x83, 0xE1, 0x85, 0xA1, 0xE1, 0x84, 0x85, 0xE1, + 0x85, 0xA1, 0xE1, 0x84, 0x86, 0xE1, 0x85, 0xA1, + 0xE1, 0x84, 0x87, 0xE1, 0x85, 0xA1, 0xE1, 0x84, + 0x89, 0xE1, 0x85, 0xA1, 0xE1, 0x84, 0x8B, 0xE1, + 0x85, 0xA1, 0xE1, 0x84, 0x8C, 0xE1, 0x85, 0xA1, + 0xE1, 0x84, 0x8E, 0xE1, 0x85, 0xA1, 0xE1, 0x84, + 0x8F, 0xE1, 0x85, 0xA1, 0xE1, 0x84, 0x90, 0xE1, + 0x85, 0xA1, 0xE1, 0x84, 0x91, 0xE1, 0x85, 0xA1, + 0xE1, 0x84, 0x92, 0xE1, 0x85, 0xA1, 0xE1, 0x84, + 0x8E, 0xE1, 0x85, 0xA1, 0xE1, 0x86, 0xB7, 0xE1, + 0x84, 0x80, 0xE1, 0x85, 0xA9, 0xE1, 0x84, 0x8C, + 0xE1, 0x85, 0xAE, 0xE1, 0x84, 0x8B, 0xE1, 0x85, + 0xB4, 0xE1, 0x84, 0x8B, 0xE1, 0x85, 0xAE, 0xE4, + 0xB8, 0x80, 0xE4, 0xBA, 0x8C, 0xE4, 0xB8, 0x89, + 0xE5, 0x9B, 0x9B, 0xE4, 0xBA, 0x94, 0xE5, 0x85, + 0xAD, 0xE4, 0xB8, 0x83, 0xE5, 0x85, 0xAB, 0xE4, + 0xB9, 0x9D, 0xE5, 0x8D, 0x81, 0xE6, 0x9C, 0x88, + 0xE7, 0x81, 0xAB, 0xE6, 0xB0, 0xB4, 0xE6, 0x9C, + 0xA8, 0xE9, 0x87, 0x91, 0xE5, 0x9C, 0x9F, 0xE6, + 0x97, 0xA5, 0xE6, 0xA0, 0xAA, 0xE6, 0x9C, 0x89, + 0xE7, 0xA4, 0xBE, 0xE5, 0x90, 0x8D, 0xE7, 0x89, + 0xB9, 0xE8, 0xB2, 0xA1, 0xE7, 0xA5, 0x9D, 0xE5, + 0x8A, 0xB4, 0xE7, 0xA7, 0x98, 0xE7, 0x94, 0xB7, + 0xE5, 0xA5, 0xB3, 0xE9, 0x81, 0xA9, 0xE5, 0x84, + 0xAA, 0xE5, 0x8D, 0xB0, 0xE6, 0xB3, 0xA8, 0xE9, + 0xA0, 0x85, 0xE4, 0xBC, 0x91, 0xE5, 0x86, 0x99, + 0xE6, 0xAD, 0xA3, 0xE4, 0xB8, 0x8A, 0xE4, 0xB8, + 0xAD, 0xE4, 0xB8, 0x8B, 0xE5, 0xB7, 0xA6, 0xE5, + 0x8F, 0xB3, 0xE5, 0x8C, 0xBB, 0xE5, 0xAE, 0x97, + 0xE5, 0xAD, 0xA6, 0xE7, 0x9B, 0xA3, 0xE4, 0xBC, + 0x81, 0xE8, 0xB3, 0x87, 0xE5, 0x8D, 0x94, 0xE5, + 0xA4, 0x9C, 0x33, 0x36, 0x33, 0x37, 0x33, 0x38, + 0x33, 0x39, 0x34, 0x30, 0x34, 0x31, 0x34, 0x32, + 0x34, 0x33, 0x34, 0x34, 0x34, 0x35, 0x34, 0x36, + 0x34, 0x37, 0x34, 0x38, 0x34, 0x39, 0x35, 0x30, + 0x31, 0xE6, 0x9C, 0x88, 0x32, 0xE6, 0x9C, 0x88, + 0x33, 0xE6, 0x9C, 0x88, 0x34, 0xE6, 0x9C, 0x88, + 0x35, 0xE6, 0x9C, 0x88, 0x36, 0xE6, 0x9C, 0x88, + 0x37, 0xE6, 0x9C, 0x88, 0x38, 0xE6, 0x9C, 0x88, + 0x39, 0xE6, 0x9C, 0x88, 0x31, 0x30, 0xE6, 0x9C, + 0x88, 0x31, 0x31, 0xE6, 0x9C, 0x88, 0x31, 0x32, + 0xE6, 0x9C, 0x88, 0x48, 0x67, 0x65, 0x72, 0x67, + 0x65, 0x56, 0x4C, 0x54, 0x44, 0xE3, 0x82, 0xA2, + 0xE3, 0x82, 0xA4, 0xE3, 0x82, 0xA6, 0xE3, 0x82, + 0xA8, 0xE3, 0x82, 0xAA, 0xE3, 0x82, 0xAB, 0xE3, + 0x82, 0xAD, 0xE3, 0x82, 0xAF, 0xE3, 0x82, 0xB1, + 0xE3, 0x82, 0xB3, 0xE3, 0x82, 0xB5, 0xE3, 0x82, + 0xB7, 0xE3, 0x82, 0xB9, 0xE3, 0x82, 0xBB, 0xE3, + 0x82, 0xBD, 0xE3, 0x82, 0xBF, 0xE3, 0x83, 0x81, + 0xE3, 0x83, 0x84, 0xE3, 0x83, 0x86, 0xE3, 0x83, + 0x88, 0xE3, 0x83, 0x8A, 0xE3, 0x83, 0x8B, 0xE3, + 0x83, 0x8C, 0xE3, 0x83, 0x8D, 0xE3, 0x83, 0x8E, + 0xE3, 0x83, 0x8F, 0xE3, 0x83, 0x92, 0xE3, 0x83, + 0x95, 0xE3, 0x83, 0x98, 0xE3, 0x83, 0x9B, 0xE3, + 0x83, 0x9E, 0xE3, 0x83, 0x9F, 0xE3, 0x83, 0xA0, + 0xE3, 0x83, 0xA1, 0xE3, 0x83, 0xA2, 0xE3, 0x83, + 0xA4, 0xE3, 0x83, 0xA6, 0xE3, 0x83, 0xA8, 0xE3, + 0x83, 0xA9, 0xE3, 0x83, 0xAA, 0xE3, 0x83, 0xAB, + 0xE3, 0x83, 0xAC, 0xE3, 0x83, 0xAD, 0xE3, 0x83, + 0xAF, 0xE3, 0x83, 0xB0, 0xE3, 0x83, 0xB1, 0xE3, + 0x83, 0xB2, 0xE3, 0x82, 0xA2, 0xE3, 0x83, 0x8F, + 0xE3, 0x82, 0x9A, 0xE3, 0x83, 0xBC, 0xE3, 0x83, + 
0x88, 0xE3, 0x82, 0xA2, 0xE3, 0x83, 0xAB, 0xE3, + 0x83, 0x95, 0xE3, 0x82, 0xA1, 0xE3, 0x82, 0xA2, + 0xE3, 0x83, 0xB3, 0xE3, 0x83, 0x98, 0xE3, 0x82, + 0x9A, 0xE3, 0x82, 0xA2, 0xE3, 0x82, 0xA2, 0xE3, + 0x83, 0xBC, 0xE3, 0x83, 0xAB, 0xE3, 0x82, 0xA4, + 0xE3, 0x83, 0x8B, 0xE3, 0x83, 0xB3, 0xE3, 0x82, + 0xAF, 0xE3, 0x82, 0x99, 0xE3, 0x82, 0xA4, 0xE3, + 0x83, 0xB3, 0xE3, 0x83, 0x81, 0xE3, 0x82, 0xA6, + 0xE3, 0x82, 0xA9, 0xE3, 0x83, 0xB3, 0xE3, 0x82, + 0xA8, 0xE3, 0x82, 0xB9, 0xE3, 0x82, 0xAF, 0xE3, + 0x83, 0xBC, 0xE3, 0x83, 0x88, 0xE3, 0x82, 0x99, + 0xE3, 0x82, 0xA8, 0xE3, 0x83, 0xBC, 0xE3, 0x82, + 0xAB, 0xE3, 0x83, 0xBC, 0xE3, 0x82, 0xAA, 0xE3, + 0x83, 0xB3, 0xE3, 0x82, 0xB9, 0xE3, 0x82, 0xAA, + 0xE3, 0x83, 0xBC, 0xE3, 0x83, 0xA0, 0xE3, 0x82, + 0xAB, 0xE3, 0x82, 0xA4, 0xE3, 0x83, 0xAA, 0xE3, + 0x82, 0xAB, 0xE3, 0x83, 0xA9, 0xE3, 0x83, 0x83, + 0xE3, 0x83, 0x88, 0xE3, 0x82, 0xAB, 0xE3, 0x83, + 0xAD, 0xE3, 0x83, 0xAA, 0xE3, 0x83, 0xBC, 0xE3, + 0x82, 0xAB, 0xE3, 0x82, 0x99, 0xE3, 0x83, 0xAD, + 0xE3, 0x83, 0xB3, 0xE3, 0x82, 0xAB, 0xE3, 0x82, + 0x99, 0xE3, 0x83, 0xB3, 0xE3, 0x83, 0x9E, 0xE3, + 0x82, 0xAD, 0xE3, 0x82, 0x99, 0xE3, 0x82, 0xAB, + 0xE3, 0x82, 0x99, 0xE3, 0x82, 0xAD, 0xE3, 0x82, + 0x99, 0xE3, 0x83, 0x8B, 0xE3, 0x83, 0xBC, 0xE3, + 0x82, 0xAD, 0xE3, 0x83, 0xA5, 0xE3, 0x83, 0xAA, + 0xE3, 0x83, 0xBC, 0xE3, 0x82, 0xAD, 0xE3, 0x82, + 0x99, 0xE3, 0x83, 0xAB, 0xE3, 0x82, 0xBF, 0xE3, + 0x82, 0x99, 0xE3, 0x83, 0xBC, 0xE3, 0x82, 0xAD, + 0xE3, 0x83, 0xAD, 0xE3, 0x82, 0xAD, 0xE3, 0x83, + 0xAD, 0xE3, 0x82, 0xAF, 0xE3, 0x82, 0x99, 0xE3, + 0x83, 0xA9, 0xE3, 0x83, 0xA0, 0xE3, 0x82, 0xAD, + 0xE3, 0x83, 0xAD, 0xE3, 0x83, 0xA1, 0xE3, 0x83, + 0xBC, 0xE3, 0x83, 0x88, 0xE3, 0x83, 0xAB, 0xE3, + 0x82, 0xAD, 0xE3, 0x83, 0xAD, 0xE3, 0x83, 0xAF, + 0xE3, 0x83, 0x83, 0xE3, 0x83, 0x88, 0xE3, 0x82, + 0xAF, 0xE3, 0x82, 0x99, 0xE3, 0x83, 0xA9, 0xE3, + 0x83, 0xA0, 0xE3, 0x82, 0xAF, 0xE3, 0x82, 0x99, + 0xE3, 0x83, 0xA9, 0xE3, 0x83, 0xA0, 0xE3, 0x83, + 0x88, 0xE3, 0x83, 0xB3, 0xE3, 0x82, 0xAF, 0xE3, + 0x83, 0xAB, 0xE3, 0x82, 0xBB, 0xE3, 0x82, 0x99, + 0xE3, 0x82, 0xA4, 0xE3, 0x83, 0xAD, 0xE3, 0x82, + 0xAF, 0xE3, 0x83, 0xAD, 0xE3, 0x83, 0xBC, 0xE3, + 0x83, 0x8D, 0xE3, 0x82, 0xB1, 0xE3, 0x83, 0xBC, + 0xE3, 0x82, 0xB9, 0xE3, 0x82, 0xB3, 0xE3, 0x83, + 0xAB, 0xE3, 0x83, 0x8A, 0xE3, 0x82, 0xB3, 0xE3, + 0x83, 0xBC, 0xE3, 0x83, 0x9B, 0xE3, 0x82, 0x9A, + 0xE3, 0x82, 0xB5, 0xE3, 0x82, 0xA4, 0xE3, 0x82, + 0xAF, 0xE3, 0x83, 0xAB, 0xE3, 0x82, 0xB5, 0xE3, + 0x83, 0xB3, 0xE3, 0x83, 0x81, 0xE3, 0x83, 0xBC, + 0xE3, 0x83, 0xA0, 0xE3, 0x82, 0xB7, 0xE3, 0x83, + 0xAA, 0xE3, 0x83, 0xB3, 0xE3, 0x82, 0xAF, 0xE3, + 0x82, 0x99, 0xE3, 0x82, 0xBB, 0xE3, 0x83, 0xB3, + 0xE3, 0x83, 0x81, 0xE3, 0x82, 0xBB, 0xE3, 0x83, + 0xB3, 0xE3, 0x83, 0x88, 0xE3, 0x82, 0xBF, 0xE3, + 0x82, 0x99, 0xE3, 0x83, 0xBC, 0xE3, 0x82, 0xB9, + 0xE3, 0x83, 0x86, 0xE3, 0x82, 0x99, 0xE3, 0x82, + 0xB7, 0xE3, 0x83, 0x88, 0xE3, 0x82, 0x99, 0xE3, + 0x83, 0xAB, 0xE3, 0x83, 0x88, 0xE3, 0x83, 0xB3, + 0xE3, 0x83, 0x8A, 0xE3, 0x83, 0x8E, 0xE3, 0x83, + 0x8E, 0xE3, 0x83, 0x83, 0xE3, 0x83, 0x88, 0xE3, + 0x83, 0x8F, 0xE3, 0x82, 0xA4, 0xE3, 0x83, 0x84, + 0xE3, 0x83, 0x8F, 0xE3, 0x82, 0x9A, 0xE3, 0x83, + 0xBC, 0xE3, 0x82, 0xBB, 0xE3, 0x83, 0xB3, 0xE3, + 0x83, 0x88, 0xE3, 0x83, 0x8F, 0xE3, 0x82, 0x9A, + 0xE3, 0x83, 0xBC, 0xE3, 0x83, 0x84, 0xE3, 0x83, + 0x8F, 0xE3, 0x82, 0x99, 0xE3, 0x83, 0xBC, 0xE3, + 0x83, 0xAC, 0xE3, 0x83, 0xAB, 0xE3, 0x83, 0x92, + 0xE3, 0x82, 0x9A, 0xE3, 0x82, 0xA2, 0xE3, 0x82, + 0xB9, 0xE3, 0x83, 0x88, 0xE3, 0x83, 0xAB, 0xE3, + 0x83, 0x92, 0xE3, 0x82, 0x9A, 0xE3, 0x82, 0xAF, + 
0xE3, 0x83, 0xAB, 0xE3, 0x83, 0x92, 0xE3, 0x82, + 0x9A, 0xE3, 0x82, 0xB3, 0xE3, 0x83, 0x92, 0xE3, + 0x82, 0x99, 0xE3, 0x83, 0xAB, 0xE3, 0x83, 0x95, + 0xE3, 0x82, 0xA1, 0xE3, 0x83, 0xA9, 0xE3, 0x83, + 0x83, 0xE3, 0x83, 0x88, 0xE3, 0x82, 0x99, 0xE3, + 0x83, 0x95, 0xE3, 0x82, 0xA3, 0xE3, 0x83, 0xBC, + 0xE3, 0x83, 0x88, 0xE3, 0x83, 0x95, 0xE3, 0x82, + 0x99, 0xE3, 0x83, 0x83, 0xE3, 0x82, 0xB7, 0xE3, + 0x82, 0xA7, 0xE3, 0x83, 0xAB, 0xE3, 0x83, 0x95, + 0xE3, 0x83, 0xA9, 0xE3, 0x83, 0xB3, 0xE3, 0x83, + 0x98, 0xE3, 0x82, 0xAF, 0xE3, 0x82, 0xBF, 0xE3, + 0x83, 0xBC, 0xE3, 0x83, 0xAB, 0xE3, 0x83, 0x98, + 0xE3, 0x82, 0x9A, 0xE3, 0x82, 0xBD, 0xE3, 0x83, + 0x98, 0xE3, 0x82, 0x9A, 0xE3, 0x83, 0x8B, 0xE3, + 0x83, 0x92, 0xE3, 0x83, 0x98, 0xE3, 0x83, 0xAB, + 0xE3, 0x83, 0x84, 0xE3, 0x83, 0x98, 0xE3, 0x82, + 0x9A, 0xE3, 0x83, 0xB3, 0xE3, 0x82, 0xB9, 0xE3, + 0x83, 0x98, 0xE3, 0x82, 0x9A, 0xE3, 0x83, 0xBC, + 0xE3, 0x82, 0xB7, 0xE3, 0x82, 0x99, 0xE3, 0x83, + 0x98, 0xE3, 0x82, 0x99, 0xE3, 0x83, 0xBC, 0xE3, + 0x82, 0xBF, 0xE3, 0x83, 0x9B, 0xE3, 0x82, 0x9A, + 0xE3, 0x82, 0xA4, 0xE3, 0x83, 0xB3, 0xE3, 0x83, + 0x88, 0xE3, 0x83, 0x9B, 0xE3, 0x82, 0x99, 0xE3, + 0x83, 0xAB, 0xE3, 0x83, 0x88, 0xE3, 0x83, 0x9B, + 0xE3, 0x83, 0xB3, 0xE3, 0x83, 0x9B, 0xE3, 0x82, + 0x9A, 0xE3, 0x83, 0xB3, 0xE3, 0x83, 0x88, 0xE3, + 0x82, 0x99, 0xE3, 0x83, 0x9B, 0xE3, 0x83, 0xBC, + 0xE3, 0x83, 0xAB, 0xE3, 0x83, 0x9B, 0xE3, 0x83, + 0xBC, 0xE3, 0x83, 0xB3, 0xE3, 0x83, 0x9E, 0xE3, + 0x82, 0xA4, 0xE3, 0x82, 0xAF, 0xE3, 0x83, 0xAD, + 0xE3, 0x83, 0x9E, 0xE3, 0x82, 0xA4, 0xE3, 0x83, + 0xAB, 0xE3, 0x83, 0x9E, 0xE3, 0x83, 0x83, 0xE3, + 0x83, 0x8F, 0xE3, 0x83, 0x9E, 0xE3, 0x83, 0xAB, + 0xE3, 0x82, 0xAF, 0xE3, 0x83, 0x9E, 0xE3, 0x83, + 0xB3, 0xE3, 0x82, 0xB7, 0xE3, 0x83, 0xA7, 0xE3, + 0x83, 0xB3, 0xE3, 0x83, 0x9F, 0xE3, 0x82, 0xAF, + 0xE3, 0x83, 0xAD, 0xE3, 0x83, 0xB3, 0xE3, 0x83, + 0x9F, 0xE3, 0x83, 0xAA, 0xE3, 0x83, 0x9F, 0xE3, + 0x83, 0xAA, 0xE3, 0x83, 0x8F, 0xE3, 0x82, 0x99, + 0xE3, 0x83, 0xBC, 0xE3, 0x83, 0xAB, 0xE3, 0x83, + 0xA1, 0xE3, 0x82, 0xAB, 0xE3, 0x82, 0x99, 0xE3, + 0x83, 0xA1, 0xE3, 0x82, 0xAB, 0xE3, 0x82, 0x99, + 0xE3, 0x83, 0x88, 0xE3, 0x83, 0xB3, 0xE3, 0x83, + 0xA1, 0xE3, 0x83, 0xBC, 0xE3, 0x83, 0x88, 0xE3, + 0x83, 0xAB, 0xE3, 0x83, 0xA4, 0xE3, 0x83, 0xBC, + 0xE3, 0x83, 0x88, 0xE3, 0x82, 0x99, 0xE3, 0x83, + 0xA4, 0xE3, 0x83, 0xBC, 0xE3, 0x83, 0xAB, 0xE3, + 0x83, 0xA6, 0xE3, 0x82, 0xA2, 0xE3, 0x83, 0xB3, + 0xE3, 0x83, 0xAA, 0xE3, 0x83, 0x83, 0xE3, 0x83, + 0x88, 0xE3, 0x83, 0xAB, 0xE3, 0x83, 0xAA, 0xE3, + 0x83, 0xA9, 0xE3, 0x83, 0xAB, 0xE3, 0x83, 0x92, + 0xE3, 0x82, 0x9A, 0xE3, 0x83, 0xBC, 0xE3, 0x83, + 0xAB, 0xE3, 0x83, 0xBC, 0xE3, 0x83, 0x95, 0xE3, + 0x82, 0x99, 0xE3, 0x83, 0xAB, 0xE3, 0x83, 0xAC, + 0xE3, 0x83, 0xA0, 0xE3, 0x83, 0xAC, 0xE3, 0x83, + 0xB3, 0xE3, 0x83, 0x88, 0xE3, 0x82, 0xB1, 0xE3, + 0x82, 0x99, 0xE3, 0x83, 0xB3, 0xE3, 0x83, 0xAF, + 0xE3, 0x83, 0x83, 0xE3, 0x83, 0x88, 0x30, 0xE7, + 0x82, 0xB9, 0x31, 0xE7, 0x82, 0xB9, 0x32, 0xE7, + 0x82, 0xB9, 0x33, 0xE7, 0x82, 0xB9, 0x34, 0xE7, + 0x82, 0xB9, 0x35, 0xE7, 0x82, 0xB9, 0x36, 0xE7, + 0x82, 0xB9, 0x37, 0xE7, 0x82, 0xB9, 0x38, 0xE7, + 0x82, 0xB9, 0x39, 0xE7, 0x82, 0xB9, 0x31, 0x30, + 0xE7, 0x82, 0xB9, 0x31, 0x31, 0xE7, 0x82, 0xB9, + 0x31, 0x32, 0xE7, 0x82, 0xB9, 0x31, 0x33, 0xE7, + 0x82, 0xB9, 0x31, 0x34, 0xE7, 0x82, 0xB9, 0x31, + 0x35, 0xE7, 0x82, 0xB9, 0x31, 0x36, 0xE7, 0x82, + 0xB9, 0x31, 0x37, 0xE7, 0x82, 0xB9, 0x31, 0x38, + 0xE7, 0x82, 0xB9, 0x31, 0x39, 0xE7, 0x82, 0xB9, + 0x32, 0x30, 0xE7, 0x82, 0xB9, 0x32, 0x31, 0xE7, + 0x82, 0xB9, 0x32, 0x32, 0xE7, 0x82, 0xB9, 0x32, + 
0x33, 0xE7, 0x82, 0xB9, 0x32, 0x34, 0xE7, 0x82, + 0xB9, 0x68, 0x50, 0x61, 0x64, 0x61, 0x41, 0x55, + 0x62, 0x61, 0x72, 0x6F, 0x56, 0x70, 0x63, 0x64, + 0x6D, 0x64, 0x6D, 0x32, 0x64, 0x6D, 0x33, 0x49, + 0x55, 0xE5, 0xB9, 0xB3, 0xE6, 0x88, 0x90, 0xE6, + 0x98, 0xAD, 0xE5, 0x92, 0x8C, 0xE5, 0xA4, 0xA7, + 0xE6, 0xAD, 0xA3, 0xE6, 0x98, 0x8E, 0xE6, 0xB2, + 0xBB, 0xE6, 0xA0, 0xAA, 0xE5, 0xBC, 0x8F, 0xE4, + 0xBC, 0x9A, 0xE7, 0xA4, 0xBE, 0x70, 0x41, 0x6E, + 0x41, 0xCE, 0xBC, 0x41, 0x6D, 0x41, 0x6B, 0x41, + 0x4B, 0x42, 0x4D, 0x42, 0x47, 0x42, 0x63, 0x61, + 0x6C, 0x6B, 0x63, 0x61, 0x6C, 0x70, 0x46, 0x6E, + 0x46, 0xCE, 0xBC, 0x46, 0xCE, 0xBC, 0x67, 0x6D, + 0x67, 0x6B, 0x67, 0x48, 0x7A, 0x6B, 0x48, 0x7A, + 0x4D, 0x48, 0x7A, 0x47, 0x48, 0x7A, 0x54, 0x48, + 0x7A, 0xCE, 0xBC, 0x6C, 0x6D, 0x6C, 0x64, 0x6C, + 0x6B, 0x6C, 0x66, 0x6D, 0x6E, 0x6D, 0xCE, 0xBC, + 0x6D, 0x6D, 0x6D, 0x63, 0x6D, 0x6B, 0x6D, 0x6D, + 0x6D, 0x32, 0x63, 0x6D, 0x32, 0x6D, 0x32, 0x6B, + 0x6D, 0x32, 0x6D, 0x6D, 0x33, 0x63, 0x6D, 0x33, + 0x6D, 0x33, 0x6B, 0x6D, 0x33, 0x6D, 0xE2, 0x88, + 0x95, 0x73, 0x6D, 0xE2, 0x88, 0x95, 0x73, 0x32, + 0x50, 0x61, 0x6B, 0x50, 0x61, 0x4D, 0x50, 0x61, + 0x47, 0x50, 0x61, 0x72, 0x61, 0x64, 0x72, 0x61, + 0x64, 0xE2, 0x88, 0x95, 0x73, 0x72, 0x61, 0x64, + 0xE2, 0x88, 0x95, 0x73, 0x32, 0x70, 0x73, 0x6E, + 0x73, 0xCE, 0xBC, 0x73, 0x6D, 0x73, 0x70, 0x56, + 0x6E, 0x56, 0xCE, 0xBC, 0x56, 0x6D, 0x56, 0x6B, + 0x56, 0x4D, 0x56, 0x70, 0x57, 0x6E, 0x57, 0xCE, + 0xBC, 0x57, 0x6D, 0x57, 0x6B, 0x57, 0x4D, 0x57, + 0x6B, 0xCE, 0xA9, 0x4D, 0xCE, 0xA9, 0x61, 0x2E, + 0x6D, 0x2E, 0x42, 0x71, 0x63, 0x63, 0x63, 0x64, + 0x43, 0xE2, 0x88, 0x95, 0x6B, 0x67, 0x43, 0x6F, + 0x2E, 0x64, 0x42, 0x47, 0x79, 0x68, 0x61, 0x48, + 0x50, 0x69, 0x6E, 0x4B, 0x4B, 0x4B, 0x4D, 0x6B, + 0x74, 0x6C, 0x6D, 0x6C, 0x6E, 0x6C, 0x6F, 0x67, + 0x6C, 0x78, 0x6D, 0x62, 0x6D, 0x69, 0x6C, 0x6D, + 0x6F, 0x6C, 0x50, 0x48, 0x70, 0x2E, 0x6D, 0x2E, + 0x50, 0x50, 0x4D, 0x50, 0x52, 0x73, 0x72, 0x53, + 0x76, 0x57, 0x62, 0x56, 0xE2, 0x88, 0x95, 0x6D, + 0x41, 0xE2, 0x88, 0x95, 0x6D, 0x31, 0xE6, 0x97, + 0xA5, 0x32, 0xE6, 0x97, 0xA5, 0x33, 0xE6, 0x97, + 0xA5, 0x34, 0xE6, 0x97, 0xA5, 0x35, 0xE6, 0x97, + 0xA5, 0x36, 0xE6, 0x97, 0xA5, 0x37, 0xE6, 0x97, + 0xA5, 0x38, 0xE6, 0x97, 0xA5, 0x39, 0xE6, 0x97, + 0xA5, 0x31, 0x30, 0xE6, 0x97, 0xA5, 0x31, 0x31, + 0xE6, 0x97, 0xA5, 0x31, 0x32, 0xE6, 0x97, 0xA5, + 0x31, 0x33, 0xE6, 0x97, 0xA5, 0x31, 0x34, 0xE6, + 0x97, 0xA5, 0x31, 0x35, 0xE6, 0x97, 0xA5, 0x31, + 0x36, 0xE6, 0x97, 0xA5, 0x31, 0x37, 0xE6, 0x97, + 0xA5, 0x31, 0x38, 0xE6, 0x97, 0xA5, 0x31, 0x39, + 0xE6, 0x97, 0xA5, 0x32, 0x30, 0xE6, 0x97, 0xA5, + 0x32, 0x31, 0xE6, 0x97, 0xA5, 0x32, 0x32, 0xE6, + 0x97, 0xA5, 0x32, 0x33, 0xE6, 0x97, 0xA5, 0x32, + 0x34, 0xE6, 0x97, 0xA5, 0x32, 0x35, 0xE6, 0x97, + 0xA5, 0x32, 0x36, 0xE6, 0x97, 0xA5, 0x32, 0x37, + 0xE6, 0x97, 0xA5, 0x32, 0x38, 0xE6, 0x97, 0xA5, + 0x32, 0x39, 0xE6, 0x97, 0xA5, 0x33, 0x30, 0xE6, + 0x97, 0xA5, 0x33, 0x31, 0xE6, 0x97, 0xA5, 0x67, + 0x61, 0x6C, 0xF6, 0xE8, 0xB1, 0x88, 0xF6, 0xE6, + 0x9B, 0xB4, 0xF6, 0xE8, 0xBB, 0x8A, 0xF6, 0xE8, + 0xB3, 0x88, 0xF6, 0xE6, 0xBB, 0x91, 0xF6, 0xE4, + 0xB8, 0xB2, 0xF6, 0xE5, 0x8F, 0xA5, 0xF6, 0xE9, + 0xBE, 0x9C, 0xF6, 0xE9, 0xBE, 0x9C, 0xF6, 0xE5, + 0xA5, 0x91, 0xF6, 0xE9, 0x87, 0x91, 0xF6, 0xE5, + 0x96, 0x87, 0xF6, 0xE5, 0xA5, 0x88, 0xF6, 0xE6, + 0x87, 0xB6, 0xF6, 0xE7, 0x99, 0xA9, 0xF6, 0xE7, + 0xBE, 0x85, 0xF6, 0xE8, 0x98, 0xBF, 0xF6, 0xE8, + 0x9E, 0xBA, 0xF6, 0xE8, 0xA3, 0xB8, 0xF6, 0xE9, + 0x82, 0x8F, 0xF6, 0xE6, 0xA8, 0x82, 0xF6, 0xE6, + 0xB4, 0x9B, 0xF6, 0xE7, 0x83, 0x99, 0xF6, 0xE7, + 
0x8F, 0x9E, 0xF6, 0xE8, 0x90, 0xBD, 0xF6, 0xE9, + 0x85, 0xAA, 0xF6, 0xE9, 0xA7, 0xB1, 0xF6, 0xE4, + 0xBA, 0x82, 0xF6, 0xE5, 0x8D, 0xB5, 0xF6, 0xE6, + 0xAC, 0x84, 0xF6, 0xE7, 0x88, 0x9B, 0xF6, 0xE8, + 0x98, 0xAD, 0xF6, 0xE9, 0xB8, 0x9E, 0xF6, 0xE5, + 0xB5, 0x90, 0xF6, 0xE6, 0xBF, 0xAB, 0xF6, 0xE8, + 0x97, 0x8D, 0xF6, 0xE8, 0xA5, 0xA4, 0xF6, 0xE6, + 0x8B, 0x89, 0xF6, 0xE8, 0x87, 0x98, 0xF6, 0xE8, + 0xA0, 0x9F, 0xF6, 0xE5, 0xBB, 0x8A, 0xF6, 0xE6, + 0x9C, 0x97, 0xF6, 0xE6, 0xB5, 0xAA, 0xF6, 0xE7, + 0x8B, 0xBC, 0xF6, 0xE9, 0x83, 0x8E, 0xF6, 0xE4, + 0xBE, 0x86, 0xF6, 0xE5, 0x86, 0xB7, 0xF6, 0xE5, + 0x8B, 0x9E, 0xF6, 0xE6, 0x93, 0x84, 0xF6, 0xE6, + 0xAB, 0x93, 0xF6, 0xE7, 0x88, 0x90, 0xF6, 0xE7, + 0x9B, 0xA7, 0xF6, 0xE8, 0x80, 0x81, 0xF6, 0xE8, + 0x98, 0x86, 0xF6, 0xE8, 0x99, 0x9C, 0xF6, 0xE8, + 0xB7, 0xAF, 0xF6, 0xE9, 0x9C, 0xB2, 0xF6, 0xE9, + 0xAD, 0xAF, 0xF6, 0xE9, 0xB7, 0xBA, 0xF6, 0xE7, + 0xA2, 0x8C, 0xF6, 0xE7, 0xA5, 0xBF, 0xF6, 0xE7, + 0xB6, 0xA0, 0xF6, 0xE8, 0x8F, 0x89, 0xF6, 0xE9, + 0x8C, 0x84, 0xF6, 0xE9, 0xB9, 0xBF, 0xF6, 0xE8, + 0xAB, 0x96, 0xF6, 0xE5, 0xA3, 0x9F, 0xF6, 0xE5, + 0xBC, 0x84, 0xF6, 0xE7, 0xB1, 0xA0, 0xF6, 0xE8, + 0x81, 0xBE, 0xF6, 0xE7, 0x89, 0xA2, 0xF6, 0xE7, + 0xA3, 0x8A, 0xF6, 0xE8, 0xB3, 0x82, 0xF6, 0xE9, + 0x9B, 0xB7, 0xF6, 0xE5, 0xA3, 0x98, 0xF6, 0xE5, + 0xB1, 0xA2, 0xF6, 0xE6, 0xA8, 0x93, 0xF6, 0xE6, + 0xB7, 0x9A, 0xF6, 0xE6, 0xBC, 0x8F, 0xF6, 0xE7, + 0xB4, 0xAF, 0xF6, 0xE7, 0xB8, 0xB7, 0xF6, 0xE9, + 0x99, 0x8B, 0xF6, 0xE5, 0x8B, 0x92, 0xF6, 0xE8, + 0x82, 0x8B, 0xF6, 0xE5, 0x87, 0x9C, 0xF6, 0xE5, + 0x87, 0x8C, 0xF6, 0xE7, 0xA8, 0x9C, 0xF6, 0xE7, + 0xB6, 0xBE, 0xF6, 0xE8, 0x8F, 0xB1, 0xF6, 0xE9, + 0x99, 0xB5, 0xF6, 0xE8, 0xAE, 0x80, 0xF6, 0xE6, + 0x8B, 0x8F, 0xF6, 0xE6, 0xA8, 0x82, 0xF6, 0xE8, + 0xAB, 0xBE, 0xF6, 0xE4, 0xB8, 0xB9, 0xF6, 0xE5, + 0xAF, 0xA7, 0xF6, 0xE6, 0x80, 0x92, 0xF6, 0xE7, + 0x8E, 0x87, 0xF6, 0xE7, 0x95, 0xB0, 0xF6, 0xE5, + 0x8C, 0x97, 0xF6, 0xE7, 0xA3, 0xBB, 0xF6, 0xE4, + 0xBE, 0xBF, 0xF6, 0xE5, 0xBE, 0xA9, 0xF6, 0xE4, + 0xB8, 0x8D, 0xF6, 0xE6, 0xB3, 0x8C, 0xF6, 0xE6, + 0x95, 0xB8, 0xF6, 0xE7, 0xB4, 0xA2, 0xF6, 0xE5, + 0x8F, 0x83, 0xF6, 0xE5, 0xA1, 0x9E, 0xF6, 0xE7, + 0x9C, 0x81, 0xF6, 0xE8, 0x91, 0x89, 0xF6, 0xE8, + 0xAA, 0xAA, 0xF6, 0xE6, 0xAE, 0xBA, 0xF6, 0xE8, + 0xBE, 0xB0, 0xF6, 0xE6, 0xB2, 0x88, 0xF6, 0xE6, + 0x8B, 0xBE, 0xF6, 0xE8, 0x8B, 0xA5, 0xF6, 0xE6, + 0x8E, 0xA0, 0xF6, 0xE7, 0x95, 0xA5, 0xF6, 0xE4, + 0xBA, 0xAE, 0xF6, 0xE5, 0x85, 0xA9, 0xF6, 0xE5, + 0x87, 0x89, 0xF6, 0xE6, 0xA2, 0x81, 0xF6, 0xE7, + 0xB3, 0xA7, 0xF6, 0xE8, 0x89, 0xAF, 0xF6, 0xE8, + 0xAB, 0x92, 0xF6, 0xE9, 0x87, 0x8F, 0xF6, 0xE5, + 0x8B, 0xB5, 0xF6, 0xE5, 0x91, 0x82, 0xF6, 0xE5, + 0xA5, 0xB3, 0xF6, 0xE5, 0xBB, 0xAC, 0xF6, 0xE6, + 0x97, 0x85, 0xF6, 0xE6, 0xBF, 0xBE, 0xF6, 0xE7, + 0xA4, 0xAA, 0xF6, 0xE9, 0x96, 0xAD, 0xF6, 0xE9, + 0xA9, 0xAA, 0xF6, 0xE9, 0xBA, 0x97, 0xF6, 0xE9, + 0xBB, 0x8E, 0xF6, 0xE5, 0x8A, 0x9B, 0xF6, 0xE6, + 0x9B, 0x86, 0xF6, 0xE6, 0xAD, 0xB7, 0xF6, 0xE8, + 0xBD, 0xA2, 0xF6, 0xE5, 0xB9, 0xB4, 0xF6, 0xE6, + 0x86, 0x90, 0xF6, 0xE6, 0x88, 0x80, 0xF6, 0xE6, + 0x92, 0x9A, 0xF6, 0xE6, 0xBC, 0xA3, 0xF6, 0xE7, + 0x85, 0x89, 0xF6, 0xE7, 0x92, 0x89, 0xF6, 0xE7, + 0xA7, 0x8A, 0xF6, 0xE7, 0xB7, 0xB4, 0xF6, 0xE8, + 0x81, 0xAF, 0xF6, 0xE8, 0xBC, 0xA6, 0xF6, 0xE8, + 0x93, 0xAE, 0xF6, 0xE9, 0x80, 0xA3, 0xF6, 0xE9, + 0x8D, 0x8A, 0xF6, 0xE5, 0x88, 0x97, 0xF6, 0xE5, + 0x8A, 0xA3, 0xF6, 0xE5, 0x92, 0xBD, 0xF6, 0xE7, + 0x83, 0x88, 0xF6, 0xE8, 0xA3, 0x82, 0xF6, 0xE8, + 0xAA, 0xAA, 0xF6, 0xE5, 0xBB, 0x89, 0xF6, 0xE5, + 0xBF, 0xB5, 0xF6, 0xE6, 0x8D, 0xBB, 0xF6, 0xE6, + 
0xAE, 0xAE, 0xF6, 0xE7, 0xB0, 0xBE, 0xF6, 0xE7, + 0x8D, 0xB5, 0xF6, 0xE4, 0xBB, 0xA4, 0xF6, 0xE5, + 0x9B, 0xB9, 0xF6, 0xE5, 0xAF, 0xA7, 0xF6, 0xE5, + 0xB6, 0xBA, 0xF6, 0xE6, 0x80, 0x9C, 0xF6, 0xE7, + 0x8E, 0xB2, 0xF6, 0xE7, 0x91, 0xA9, 0xF6, 0xE7, + 0xBE, 0x9A, 0xF6, 0xE8, 0x81, 0x86, 0xF6, 0xE9, + 0x88, 0xB4, 0xF6, 0xE9, 0x9B, 0xB6, 0xF6, 0xE9, + 0x9D, 0x88, 0xF6, 0xE9, 0xA0, 0x98, 0xF6, 0xE4, + 0xBE, 0x8B, 0xF6, 0xE7, 0xA6, 0xAE, 0xF6, 0xE9, + 0x86, 0xB4, 0xF6, 0xE9, 0x9A, 0xB8, 0xF6, 0xE6, + 0x83, 0xA1, 0xF6, 0xE4, 0xBA, 0x86, 0xF6, 0xE5, + 0x83, 0x9A, 0xF6, 0xE5, 0xAF, 0xAE, 0xF6, 0xE5, + 0xB0, 0xBF, 0xF6, 0xE6, 0x96, 0x99, 0xF6, 0xE6, + 0xA8, 0x82, 0xF6, 0xE7, 0x87, 0x8E, 0xF6, 0xE7, + 0x99, 0x82, 0xF6, 0xE8, 0x93, 0xBC, 0xF6, 0xE9, + 0x81, 0xBC, 0xF6, 0xE9, 0xBE, 0x8D, 0xF6, 0xE6, + 0x9A, 0x88, 0xF6, 0xE9, 0x98, 0xAE, 0xF6, 0xE5, + 0x8A, 0x89, 0xF6, 0xE6, 0x9D, 0xBB, 0xF6, 0xE6, + 0x9F, 0xB3, 0xF6, 0xE6, 0xB5, 0x81, 0xF6, 0xE6, + 0xBA, 0x9C, 0xF6, 0xE7, 0x90, 0x89, 0xF6, 0xE7, + 0x95, 0x99, 0xF6, 0xE7, 0xA1, 0xAB, 0xF6, 0xE7, + 0xB4, 0x90, 0xF6, 0xE9, 0xA1, 0x9E, 0xF6, 0xE5, + 0x85, 0xAD, 0xF6, 0xE6, 0x88, 0xAE, 0xF6, 0xE9, + 0x99, 0xB8, 0xF6, 0xE5, 0x80, 0xAB, 0xF6, 0xE5, + 0xB4, 0x99, 0xF6, 0xE6, 0xB7, 0xAA, 0xF6, 0xE8, + 0xBC, 0xAA, 0xF6, 0xE5, 0xBE, 0x8B, 0xF6, 0xE6, + 0x85, 0x84, 0xF6, 0xE6, 0xA0, 0x97, 0xF6, 0xE7, + 0x8E, 0x87, 0xF6, 0xE9, 0x9A, 0x86, 0xF6, 0xE5, + 0x88, 0xA9, 0xF6, 0xE5, 0x90, 0x8F, 0xF6, 0xE5, + 0xB1, 0xA5, 0xF6, 0xE6, 0x98, 0x93, 0xF6, 0xE6, + 0x9D, 0x8E, 0xF6, 0xE6, 0xA2, 0xA8, 0xF6, 0xE6, + 0xB3, 0xA5, 0xF6, 0xE7, 0x90, 0x86, 0xF6, 0xE7, + 0x97, 0xA2, 0xF6, 0xE7, 0xBD, 0xB9, 0xF6, 0xE8, + 0xA3, 0x8F, 0xF6, 0xE8, 0xA3, 0xA1, 0xF6, 0xE9, + 0x87, 0x8C, 0xF6, 0xE9, 0x9B, 0xA2, 0xF6, 0xE5, + 0x8C, 0xBF, 0xF6, 0xE6, 0xBA, 0xBA, 0xF6, 0xE5, + 0x90, 0x9D, 0xF6, 0xE7, 0x87, 0x90, 0xF6, 0xE7, + 0x92, 0x98, 0xF6, 0xE8, 0x97, 0xBA, 0xF6, 0xE9, + 0x9A, 0xA3, 0xF6, 0xE9, 0xB1, 0x97, 0xF6, 0xE9, + 0xBA, 0x9F, 0xF6, 0xE6, 0x9E, 0x97, 0xF6, 0xE6, + 0xB7, 0x8B, 0xF6, 0xE8, 0x87, 0xA8, 0xF6, 0xE7, + 0xAB, 0x8B, 0xF6, 0xE7, 0xAC, 0xA0, 0xF6, 0xE7, + 0xB2, 0x92, 0xF6, 0xE7, 0x8B, 0x80, 0xF6, 0xE7, + 0x82, 0x99, 0xF6, 0xE8, 0xAD, 0x98, 0xF6, 0xE4, + 0xBB, 0x80, 0xF6, 0xE8, 0x8C, 0xB6, 0xF6, 0xE5, + 0x88, 0xBA, 0xF6, 0xE5, 0x88, 0x87, 0xF6, 0xE5, + 0xBA, 0xA6, 0xF6, 0xE6, 0x8B, 0x93, 0xF6, 0xE7, + 0xB3, 0x96, 0xF6, 0xE5, 0xAE, 0x85, 0xF6, 0xE6, + 0xB4, 0x9E, 0xF6, 0xE6, 0x9A, 0xB4, 0xF6, 0xE8, + 0xBC, 0xBB, 0xF6, 0xE8, 0xA1, 0x8C, 0xF6, 0xE9, + 0x99, 0x8D, 0xF6, 0xE8, 0xA6, 0x8B, 0xF6, 0xE5, + 0xBB, 0x93, 0xF6, 0xE5, 0x85, 0x80, 0xF6, 0xE5, + 0x97, 0x80, 0xF6, 0xE5, 0xA1, 0x9A, 0xF6, 0xE6, + 0x99, 0xB4, 0xF6, 0xE5, 0x87, 0x9E, 0xF6, 0xE7, + 0x8C, 0xAA, 0xF6, 0xE7, 0x9B, 0x8A, 0xF6, 0xE7, + 0xA4, 0xBC, 0xF6, 0xE7, 0xA5, 0x9E, 0xF6, 0xE7, + 0xA5, 0xA5, 0xF6, 0xE7, 0xA6, 0x8F, 0xF6, 0xE9, + 0x9D, 0x96, 0xF6, 0xE7, 0xB2, 0xBE, 0xF6, 0xE7, + 0xBE, 0xBD, 0xF6, 0xE8, 0x98, 0x92, 0xF6, 0xE8, + 0xAB, 0xB8, 0xF6, 0xE9, 0x80, 0xB8, 0xF6, 0xE9, + 0x83, 0xBD, 0xF6, 0xE9, 0xA3, 0xAF, 0xF6, 0xE9, + 0xA3, 0xBC, 0xF6, 0xE9, 0xA4, 0xA8, 0xF6, 0xE9, + 0xB6, 0xB4, 0xF6, 0xE4, 0xBE, 0xAE, 0xF6, 0xE5, + 0x83, 0xA7, 0xF6, 0xE5, 0x85, 0x8D, 0xF6, 0xE5, + 0x8B, 0x89, 0xF6, 0xE5, 0x8B, 0xA4, 0xF6, 0xE5, + 0x8D, 0x91, 0xF6, 0xE5, 0x96, 0x9D, 0xF6, 0xE5, + 0x98, 0x86, 0xF6, 0xE5, 0x99, 0xA8, 0xF6, 0xE5, + 0xA1, 0x80, 0xF6, 0xE5, 0xA2, 0xA8, 0xF6, 0xE5, + 0xB1, 0xA4, 0xF6, 0xE5, 0xB1, 0xAE, 0xF6, 0xE6, + 0x82, 0x94, 0xF6, 0xE6, 0x85, 0xA8, 0xF6, 0xE6, + 0x86, 0x8E, 0xF6, 0xE6, 0x87, 0xB2, 0xF6, 0xE6, + 
0x95, 0x8F, 0xF6, 0xE6, 0x97, 0xA2, 0xF6, 0xE6, + 0x9A, 0x91, 0xF6, 0xE6, 0xA2, 0x85, 0xF6, 0xE6, + 0xB5, 0xB7, 0xF6, 0xE6, 0xB8, 0x9A, 0xF6, 0xE6, + 0xBC, 0xA2, 0xF6, 0xE7, 0x85, 0xAE, 0xF6, 0xE7, + 0x88, 0xAB, 0xF6, 0xE7, 0x90, 0xA2, 0xF6, 0xE7, + 0xA2, 0x91, 0xF6, 0xE7, 0xA4, 0xBE, 0xF6, 0xE7, + 0xA5, 0x89, 0xF6, 0xE7, 0xA5, 0x88, 0xF6, 0xE7, + 0xA5, 0x90, 0xF6, 0xE7, 0xA5, 0x96, 0xF6, 0xE7, + 0xA5, 0x9D, 0xF6, 0xE7, 0xA6, 0x8D, 0xF6, 0xE7, + 0xA6, 0x8E, 0xF6, 0xE7, 0xA9, 0x80, 0xF6, 0xE7, + 0xAA, 0x81, 0xF6, 0xE7, 0xAF, 0x80, 0xF6, 0xE7, + 0xB7, 0xB4, 0xF6, 0xE7, 0xB8, 0x89, 0xF6, 0xE7, + 0xB9, 0x81, 0xF6, 0xE7, 0xBD, 0xB2, 0xF6, 0xE8, + 0x80, 0x85, 0xF6, 0xE8, 0x87, 0xAD, 0xF6, 0xE8, + 0x89, 0xB9, 0xF6, 0xE8, 0x89, 0xB9, 0xF6, 0xE8, + 0x91, 0x97, 0xF6, 0xE8, 0xA4, 0x90, 0xF6, 0xE8, + 0xA6, 0x96, 0xF6, 0xE8, 0xAC, 0x81, 0xF6, 0xE8, + 0xAC, 0xB9, 0xF6, 0xE8, 0xB3, 0x93, 0xF6, 0xE8, + 0xB4, 0x88, 0xF6, 0xE8, 0xBE, 0xB6, 0xF6, 0xE9, + 0x80, 0xB8, 0xF6, 0xE9, 0x9B, 0xA3, 0xF6, 0xE9, + 0x9F, 0xBF, 0xF6, 0xE9, 0xA0, 0xBB, 0xF6, 0xE4, + 0xB8, 0xA6, 0xF6, 0xE5, 0x86, 0xB5, 0xF6, 0xE5, + 0x85, 0xA8, 0xF6, 0xE4, 0xBE, 0x80, 0xF6, 0xE5, + 0x85, 0x85, 0xF6, 0xE5, 0x86, 0x80, 0xF6, 0xE5, + 0x8B, 0x87, 0xF6, 0xE5, 0x8B, 0xBA, 0xF6, 0xE5, + 0x96, 0x9D, 0xF6, 0xE5, 0x95, 0x95, 0xF6, 0xE5, + 0x96, 0x99, 0xF6, 0xE5, 0x97, 0xA2, 0xF6, 0xE5, + 0xA1, 0x9A, 0xF6, 0xE5, 0xA2, 0xB3, 0xF6, 0xE5, + 0xA5, 0x84, 0xF6, 0xE5, 0xA5, 0x94, 0xF6, 0xE5, + 0xA9, 0xA2, 0xF6, 0xE5, 0xAC, 0xA8, 0xF6, 0xE5, + 0xBB, 0x92, 0xF6, 0xE5, 0xBB, 0x99, 0xF6, 0xE5, + 0xBD, 0xA9, 0xF6, 0xE5, 0xBE, 0xAD, 0xF6, 0xE6, + 0x83, 0x98, 0xF6, 0xE6, 0x85, 0x8E, 0xF6, 0xE6, + 0x84, 0x88, 0xF6, 0xE6, 0x86, 0x8E, 0xF6, 0xE6, + 0x85, 0xA0, 0xF6, 0xE6, 0x87, 0xB2, 0xF6, 0xE6, + 0x88, 0xB4, 0xF6, 0xE6, 0x8F, 0x84, 0xF6, 0xE6, + 0x90, 0x9C, 0xF6, 0xE6, 0x91, 0x92, 0xF6, 0xE6, + 0x95, 0x96, 0xF6, 0xE6, 0x99, 0xB4, 0xF6, 0xE6, + 0x9C, 0x97, 0xF6, 0xE6, 0x9C, 0x9B, 0xF6, 0xE6, + 0x9D, 0x96, 0xF6, 0xE6, 0xAD, 0xB9, 0xF6, 0xE6, + 0xAE, 0xBA, 0xF6, 0xE6, 0xB5, 0x81, 0xF6, 0xE6, + 0xBB, 0x9B, 0xF6, 0xE6, 0xBB, 0x8B, 0xF6, 0xE6, + 0xBC, 0xA2, 0xF6, 0xE7, 0x80, 0x9E, 0xF6, 0xE7, + 0x85, 0xAE, 0xF6, 0xE7, 0x9E, 0xA7, 0xF6, 0xE7, + 0x88, 0xB5, 0xF6, 0xE7, 0x8A, 0xAF, 0xF6, 0xE7, + 0x8C, 0xAA, 0xF6, 0xE7, 0x91, 0xB1, 0xF6, 0xE7, + 0x94, 0x86, 0xF6, 0xE7, 0x94, 0xBB, 0xF6, 0xE7, + 0x98, 0x9D, 0xF6, 0xE7, 0x98, 0x9F, 0xF6, 0xE7, + 0x9B, 0x8A, 0xF6, 0xE7, 0x9B, 0x9B, 0xF6, 0xE7, + 0x9B, 0xB4, 0xF6, 0xE7, 0x9D, 0x8A, 0xF6, 0xE7, + 0x9D, 0x80, 0xF6, 0xE7, 0xA3, 0x8C, 0xF6, 0xE7, + 0xAA, 0xB1, 0xF6, 0xE7, 0xAF, 0x80, 0xF6, 0xE7, + 0xB1, 0xBB, 0xF6, 0xE7, 0xB5, 0x9B, 0xF6, 0xE7, + 0xB7, 0xB4, 0xF6, 0xE7, 0xBC, 0xBE, 0xF6, 0xE8, + 0x80, 0x85, 0xF6, 0xE8, 0x8D, 0x92, 0xF6, 0xE8, + 0x8F, 0xAF, 0xF6, 0xE8, 0x9D, 0xB9, 0xF6, 0xE8, + 0xA5, 0x81, 0xF6, 0xE8, 0xA6, 0x86, 0xF6, 0xE8, + 0xA6, 0x96, 0xF6, 0xE8, 0xAA, 0xBF, 0xF6, 0xE8, + 0xAB, 0xB8, 0xF6, 0xE8, 0xAB, 0x8B, 0xF6, 0xE8, + 0xAC, 0x81, 0xF6, 0xE8, 0xAB, 0xBE, 0xF6, 0xE8, + 0xAB, 0xAD, 0xF6, 0xE8, 0xAC, 0xB9, 0xF6, 0xE8, + 0xAE, 0x8A, 0xF6, 0xE8, 0xB4, 0x88, 0xF6, 0xE8, + 0xBC, 0xB8, 0xF6, 0xE9, 0x81, 0xB2, 0xF6, 0xE9, + 0x86, 0x99, 0xF6, 0xE9, 0x89, 0xB6, 0xF6, 0xE9, + 0x99, 0xBC, 0xF6, 0xE9, 0x9B, 0xA3, 0xF6, 0xE9, + 0x9D, 0x96, 0xF6, 0xE9, 0x9F, 0x9B, 0xF6, 0xE9, + 0x9F, 0xBF, 0xF6, 0xE9, 0xA0, 0x8B, 0xF6, 0xE9, + 0xA0, 0xBB, 0xF6, 0xE9, 0xAC, 0x92, 0xF6, 0xE9, + 0xBE, 0x9C, 0xF6, 0xF0, 0xA2, 0xA1, 0x8A, 0xF6, + 0xF0, 0xA2, 0xA1, 0x84, 0xF6, 0xF0, 0xA3, 0x8F, + 0x95, 0xF6, 0xE3, 0xAE, 0x9D, 0xF6, 0xE4, 0x80, + 
0x98, 0xF6, 0xE4, 0x80, 0xB9, 0xF6, 0xF0, 0xA5, + 0x89, 0x89, 0xF6, 0xF0, 0xA5, 0xB3, 0x90, 0xF6, + 0xF0, 0xA7, 0xBB, 0x93, 0xF6, 0xE9, 0xBD, 0x83, + 0xF6, 0xE9, 0xBE, 0x8E, 0x66, 0x66, 0x66, 0x69, + 0x66, 0x6C, 0x66, 0x66, 0x69, 0x66, 0x66, 0x6C, + 0x73, 0x74, 0x73, 0x74, 0xD5, 0xB4, 0xD5, 0xB6, + 0xD5, 0xB4, 0xD5, 0xA5, 0xD5, 0xB4, 0xD5, 0xAB, + 0xD5, 0xBE, 0xD5, 0xB6, 0xD5, 0xB4, 0xD5, 0xAD, + 0xF6, 0xD7, 0x99, 0xD6, 0xB4, 0xF6, 0xD7, 0xB2, + 0xD6, 0xB7, 0xD7, 0xA2, 0xD7, 0x90, 0xD7, 0x93, + 0xD7, 0x94, 0xD7, 0x9B, 0xD7, 0x9C, 0xD7, 0x9D, + 0xD7, 0xA8, 0xD7, 0xAA, 0x2B, 0xF6, 0xD7, 0xA9, + 0xD7, 0x81, 0xF6, 0xD7, 0xA9, 0xD7, 0x82, 0xF6, + 0xD7, 0xA9, 0xD6, 0xBC, 0xD7, 0x81, 0xF6, 0xD7, + 0xA9, 0xD6, 0xBC, 0xD7, 0x82, 0xF6, 0xD7, 0x90, + 0xD6, 0xB7, 0xF6, 0xD7, 0x90, 0xD6, 0xB8, 0xF6, + 0xD7, 0x90, 0xD6, 0xBC, 0xF6, 0xD7, 0x91, 0xD6, + 0xBC, 0xF6, 0xD7, 0x92, 0xD6, 0xBC, 0xF6, 0xD7, + 0x93, 0xD6, 0xBC, 0xF6, 0xD7, 0x94, 0xD6, 0xBC, + 0xF6, 0xD7, 0x95, 0xD6, 0xBC, 0xF6, 0xD7, 0x96, + 0xD6, 0xBC, 0xF6, 0xD7, 0x98, 0xD6, 0xBC, 0xF6, + 0xD7, 0x99, 0xD6, 0xBC, 0xF6, 0xD7, 0x9A, 0xD6, + 0xBC, 0xF6, 0xD7, 0x9B, 0xD6, 0xBC, 0xF6, 0xD7, + 0x9C, 0xD6, 0xBC, 0xF6, 0xD7, 0x9E, 0xD6, 0xBC, + 0xF6, 0xD7, 0xA0, 0xD6, 0xBC, 0xF6, 0xD7, 0xA1, + 0xD6, 0xBC, 0xF6, 0xD7, 0xA3, 0xD6, 0xBC, 0xF6, + 0xD7, 0xA4, 0xD6, 0xBC, 0xF6, 0xD7, 0xA6, 0xD6, + 0xBC, 0xF6, 0xD7, 0xA7, 0xD6, 0xBC, 0xF6, 0xD7, + 0xA8, 0xD6, 0xBC, 0xF6, 0xD7, 0xA9, 0xD6, 0xBC, + 0xF6, 0xD7, 0xAA, 0xD6, 0xBC, 0xF6, 0xD7, 0x95, + 0xD6, 0xB9, 0xF6, 0xD7, 0x91, 0xD6, 0xBF, 0xF6, + 0xD7, 0x9B, 0xD6, 0xBF, 0xF6, 0xD7, 0xA4, 0xD6, + 0xBF, 0xD7, 0x90, 0xD7, 0x9C, 0xD9, 0xB1, 0xD9, + 0xB1, 0xD9, 0xBB, 0xD9, 0xBB, 0xD9, 0xBB, 0xD9, + 0xBB, 0xD9, 0xBE, 0xD9, 0xBE, 0xD9, 0xBE, 0xD9, + 0xBE, 0xDA, 0x80, 0xDA, 0x80, 0xDA, 0x80, 0xDA, + 0x80, 0xD9, 0xBA, 0xD9, 0xBA, 0xD9, 0xBA, 0xD9, + 0xBA, 0xD9, 0xBF, 0xD9, 0xBF, 0xD9, 0xBF, 0xD9, + 0xBF, 0xD9, 0xB9, 0xD9, 0xB9, 0xD9, 0xB9, 0xD9, + 0xB9, 0xDA, 0xA4, 0xDA, 0xA4, 0xDA, 0xA4, 0xDA, + 0xA4, 0xDA, 0xA6, 0xDA, 0xA6, 0xDA, 0xA6, 0xDA, + 0xA6, 0xDA, 0x84, 0xDA, 0x84, 0xDA, 0x84, 0xDA, + 0x84, 0xDA, 0x83, 0xDA, 0x83, 0xDA, 0x83, 0xDA, + 0x83, 0xDA, 0x86, 0xDA, 0x86, 0xDA, 0x86, 0xDA, + 0x86, 0xDA, 0x87, 0xDA, 0x87, 0xDA, 0x87, 0xDA, + 0x87, 0xDA, 0x8D, 0xDA, 0x8D, 0xDA, 0x8C, 0xDA, + 0x8C, 0xDA, 0x8E, 0xDA, 0x8E, 0xDA, 0x88, 0xDA, + 0x88, 0xDA, 0x98, 0xDA, 0x98, 0xDA, 0x91, 0xDA, + 0x91, 0xDA, 0xA9, 0xDA, 0xA9, 0xDA, 0xA9, 0xDA, + 0xA9, 0xDA, 0xAF, 0xDA, 0xAF, 0xDA, 0xAF, 0xDA, + 0xAF, 0xDA, 0xB3, 0xDA, 0xB3, 0xDA, 0xB3, 0xDA, + 0xB3, 0xDA, 0xB1, 0xDA, 0xB1, 0xDA, 0xB1, 0xDA, + 0xB1, 0xDA, 0xBA, 0xDA, 0xBA, 0xDA, 0xBB, 0xDA, + 0xBB, 0xDA, 0xBB, 0xDA, 0xBB, 0xDB, 0x95, 0xD9, + 0x94, 0xDB, 0x95, 0xD9, 0x94, 0xDB, 0x81, 0xDB, + 0x81, 0xDB, 0x81, 0xDB, 0x81, 0xDA, 0xBE, 0xDA, + 0xBE, 0xDA, 0xBE, 0xDA, 0xBE, 0xDB, 0x92, 0xDB, + 0x92, 0xDB, 0x92, 0xD9, 0x94, 0xDB, 0x92, 0xD9, + 0x94, 0xDA, 0xAD, 0xDA, 0xAD, 0xDA, 0xAD, 0xDA, + 0xAD, 0xDB, 0x87, 0xDB, 0x87, 0xDB, 0x86, 0xDB, + 0x86, 0xDB, 0x88, 0xDB, 0x88, 0xDB, 0x87, 0xD9, + 0xB4, 0xDB, 0x8B, 0xDB, 0x8B, 0xDB, 0x85, 0xDB, + 0x85, 0xDB, 0x89, 0xDB, 0x89, 0xDB, 0x90, 0xDB, + 0x90, 0xDB, 0x90, 0xDB, 0x90, 0xD9, 0x89, 0xD9, + 0x89, 0xD9, 0x8A, 0xD9, 0x94, 0xD8, 0xA7, 0xD9, + 0x8A, 0xD9, 0x94, 0xD8, 0xA7, 0xD9, 0x8A, 0xD9, + 0x94, 0xDB, 0x95, 0xD9, 0x8A, 0xD9, 0x94, 0xDB, + 0x95, 0xD9, 0x8A, 0xD9, 0x94, 0xD9, 0x88, 0xD9, + 0x8A, 0xD9, 0x94, 0xD9, 0x88, 0xD9, 0x8A, 0xD9, + 0x94, 0xDB, 0x87, 0xD9, 0x8A, 0xD9, 0x94, 0xDB, + 0x87, 0xD9, 0x8A, 0xD9, 0x94, 0xDB, 0x86, 0xD9, + 
0x8A, 0xD9, 0x94, 0xDB, 0x86, 0xD9, 0x8A, 0xD9, + 0x94, 0xDB, 0x88, 0xD9, 0x8A, 0xD9, 0x94, 0xDB, + 0x88, 0xD9, 0x8A, 0xD9, 0x94, 0xDB, 0x90, 0xD9, + 0x8A, 0xD9, 0x94, 0xDB, 0x90, 0xD9, 0x8A, 0xD9, + 0x94, 0xDB, 0x90, 0xD9, 0x8A, 0xD9, 0x94, 0xD9, + 0x89, 0xD9, 0x8A, 0xD9, 0x94, 0xD9, 0x89, 0xD9, + 0x8A, 0xD9, 0x94, 0xD9, 0x89, 0xDB, 0x8C, 0xDB, + 0x8C, 0xDB, 0x8C, 0xDB, 0x8C, 0xD9, 0x8A, 0xD9, + 0x94, 0xD8, 0xAC, 0xD9, 0x8A, 0xD9, 0x94, 0xD8, + 0xAD, 0xD9, 0x8A, 0xD9, 0x94, 0xD9, 0x85, 0xD9, + 0x8A, 0xD9, 0x94, 0xD9, 0x89, 0xD9, 0x8A, 0xD9, + 0x94, 0xD9, 0x8A, 0xD8, 0xA8, 0xD8, 0xAC, 0xD8, + 0xA8, 0xD8, 0xAD, 0xD8, 0xA8, 0xD8, 0xAE, 0xD8, + 0xA8, 0xD9, 0x85, 0xD8, 0xA8, 0xD9, 0x89, 0xD8, + 0xA8, 0xD9, 0x8A, 0xD8, 0xAA, 0xD8, 0xAC, 0xD8, + 0xAA, 0xD8, 0xAD, 0xD8, 0xAA, 0xD8, 0xAE, 0xD8, + 0xAA, 0xD9, 0x85, 0xD8, 0xAA, 0xD9, 0x89, 0xD8, + 0xAA, 0xD9, 0x8A, 0xD8, 0xAB, 0xD8, 0xAC, 0xD8, + 0xAB, 0xD9, 0x85, 0xD8, 0xAB, 0xD9, 0x89, 0xD8, + 0xAB, 0xD9, 0x8A, 0xD8, 0xAC, 0xD8, 0xAD, 0xD8, + 0xAC, 0xD9, 0x85, 0xD8, 0xAD, 0xD8, 0xAC, 0xD8, + 0xAD, 0xD9, 0x85, 0xD8, 0xAE, 0xD8, 0xAC, 0xD8, + 0xAE, 0xD8, 0xAD, 0xD8, 0xAE, 0xD9, 0x85, 0xD8, + 0xB3, 0xD8, 0xAC, 0xD8, 0xB3, 0xD8, 0xAD, 0xD8, + 0xB3, 0xD8, 0xAE, 0xD8, 0xB3, 0xD9, 0x85, 0xD8, + 0xB5, 0xD8, 0xAD, 0xD8, 0xB5, 0xD9, 0x85, 0xD8, + 0xB6, 0xD8, 0xAC, 0xD8, 0xB6, 0xD8, 0xAD, 0xD8, + 0xB6, 0xD8, 0xAE, 0xD8, 0xB6, 0xD9, 0x85, 0xD8, + 0xB7, 0xD8, 0xAD, 0xD8, 0xB7, 0xD9, 0x85, 0xD8, + 0xB8, 0xD9, 0x85, 0xD8, 0xB9, 0xD8, 0xAC, 0xD8, + 0xB9, 0xD9, 0x85, 0xD8, 0xBA, 0xD8, 0xAC, 0xD8, + 0xBA, 0xD9, 0x85, 0xD9, 0x81, 0xD8, 0xAC, 0xD9, + 0x81, 0xD8, 0xAD, 0xD9, 0x81, 0xD8, 0xAE, 0xD9, + 0x81, 0xD9, 0x85, 0xD9, 0x81, 0xD9, 0x89, 0xD9, + 0x81, 0xD9, 0x8A, 0xD9, 0x82, 0xD8, 0xAD, 0xD9, + 0x82, 0xD9, 0x85, 0xD9, 0x82, 0xD9, 0x89, 0xD9, + 0x82, 0xD9, 0x8A, 0xD9, 0x83, 0xD8, 0xA7, 0xD9, + 0x83, 0xD8, 0xAC, 0xD9, 0x83, 0xD8, 0xAD, 0xD9, + 0x83, 0xD8, 0xAE, 0xD9, 0x83, 0xD9, 0x84, 0xD9, + 0x83, 0xD9, 0x85, 0xD9, 0x83, 0xD9, 0x89, 0xD9, + 0x83, 0xD9, 0x8A, 0xD9, 0x84, 0xD8, 0xAC, 0xD9, + 0x84, 0xD8, 0xAD, 0xD9, 0x84, 0xD8, 0xAE, 0xD9, + 0x84, 0xD9, 0x85, 0xD9, 0x84, 0xD9, 0x89, 0xD9, + 0x84, 0xD9, 0x8A, 0xD9, 0x85, 0xD8, 0xAC, 0xD9, + 0x85, 0xD8, 0xAD, 0xD9, 0x85, 0xD8, 0xAE, 0xD9, + 0x85, 0xD9, 0x85, 0xD9, 0x85, 0xD9, 0x89, 0xD9, + 0x85, 0xD9, 0x8A, 0xD9, 0x86, 0xD8, 0xAC, 0xD9, + 0x86, 0xD8, 0xAD, 0xD9, 0x86, 0xD8, 0xAE, 0xD9, + 0x86, 0xD9, 0x85, 0xD9, 0x86, 0xD9, 0x89, 0xD9, + 0x86, 0xD9, 0x8A, 0xD9, 0x87, 0xD8, 0xAC, 0xD9, + 0x87, 0xD9, 0x85, 0xD9, 0x87, 0xD9, 0x89, 0xD9, + 0x87, 0xD9, 0x8A, 0xD9, 0x8A, 0xD8, 0xAC, 0xD9, + 0x8A, 0xD8, 0xAD, 0xD9, 0x8A, 0xD8, 0xAE, 0xD9, + 0x8A, 0xD9, 0x85, 0xD9, 0x8A, 0xD9, 0x89, 0xD9, + 0x8A, 0xD9, 0x8A, 0xD8, 0xB0, 0xD9, 0xB0, 0xD8, + 0xB1, 0xD9, 0xB0, 0xD9, 0x89, 0xD9, 0xB0, 0x20, + 0xD9, 0x8C, 0xD9, 0x91, 0x20, 0xD9, 0x8D, 0xD9, + 0x91, 0x20, 0xD9, 0x8E, 0xD9, 0x91, 0x20, 0xD9, + 0x8F, 0xD9, 0x91, 0x20, 0xD9, 0x90, 0xD9, 0x91, + 0x20, 0xD9, 0x91, 0xD9, 0xB0, 0xD9, 0x8A, 0xD9, + 0x94, 0xD8, 0xB1, 0xD9, 0x8A, 0xD9, 0x94, 0xD8, + 0xB2, 0xD9, 0x8A, 0xD9, 0x94, 0xD9, 0x85, 0xD9, + 0x8A, 0xD9, 0x94, 0xD9, 0x86, 0xD9, 0x8A, 0xD9, + 0x94, 0xD9, 0x89, 0xD9, 0x8A, 0xD9, 0x94, 0xD9, + 0x8A, 0xD8, 0xA8, 0xD8, 0xB1, 0xD8, 0xA8, 0xD8, + 0xB2, 0xD8, 0xA8, 0xD9, 0x85, 0xD8, 0xA8, 0xD9, + 0x86, 0xD8, 0xA8, 0xD9, 0x89, 0xD8, 0xA8, 0xD9, + 0x8A, 0xD8, 0xAA, 0xD8, 0xB1, 0xD8, 0xAA, 0xD8, + 0xB2, 0xD8, 0xAA, 0xD9, 0x85, 0xD8, 0xAA, 0xD9, + 0x86, 0xD8, 0xAA, 0xD9, 0x89, 0xD8, 0xAA, 0xD9, + 0x8A, 0xD8, 0xAB, 0xD8, 0xB1, 0xD8, 0xAB, 0xD8, + 
0xB2, 0xD8, 0xAB, 0xD9, 0x85, 0xD8, 0xAB, 0xD9, + 0x86, 0xD8, 0xAB, 0xD9, 0x89, 0xD8, 0xAB, 0xD9, + 0x8A, 0xD9, 0x81, 0xD9, 0x89, 0xD9, 0x81, 0xD9, + 0x8A, 0xD9, 0x82, 0xD9, 0x89, 0xD9, 0x82, 0xD9, + 0x8A, 0xD9, 0x83, 0xD8, 0xA7, 0xD9, 0x83, 0xD9, + 0x84, 0xD9, 0x83, 0xD9, 0x85, 0xD9, 0x83, 0xD9, + 0x89, 0xD9, 0x83, 0xD9, 0x8A, 0xD9, 0x84, 0xD9, + 0x85, 0xD9, 0x84, 0xD9, 0x89, 0xD9, 0x84, 0xD9, + 0x8A, 0xD9, 0x85, 0xD8, 0xA7, 0xD9, 0x85, 0xD9, + 0x85, 0xD9, 0x86, 0xD8, 0xB1, 0xD9, 0x86, 0xD8, + 0xB2, 0xD9, 0x86, 0xD9, 0x85, 0xD9, 0x86, 0xD9, + 0x86, 0xD9, 0x86, 0xD9, 0x89, 0xD9, 0x86, 0xD9, + 0x8A, 0xD9, 0x89, 0xD9, 0xB0, 0xD9, 0x8A, 0xD8, + 0xB1, 0xD9, 0x8A, 0xD8, 0xB2, 0xD9, 0x8A, 0xD9, + 0x85, 0xD9, 0x8A, 0xD9, 0x86, 0xD9, 0x8A, 0xD9, + 0x89, 0xD9, 0x8A, 0xD9, 0x8A, 0xD9, 0x8A, 0xD9, + 0x94, 0xD8, 0xAC, 0xD9, 0x8A, 0xD9, 0x94, 0xD8, + 0xAD, 0xD9, 0x8A, 0xD9, 0x94, 0xD8, 0xAE, 0xD9, + 0x8A, 0xD9, 0x94, 0xD9, 0x85, 0xD9, 0x8A, 0xD9, + 0x94, 0xD9, 0x87, 0xD8, 0xA8, 0xD8, 0xAC, 0xD8, + 0xA8, 0xD8, 0xAD, 0xD8, 0xA8, 0xD8, 0xAE, 0xD8, + 0xA8, 0xD9, 0x85, 0xD8, 0xA8, 0xD9, 0x87, 0xD8, + 0xAA, 0xD8, 0xAC, 0xD8, 0xAA, 0xD8, 0xAD, 0xD8, + 0xAA, 0xD8, 0xAE, 0xD8, 0xAA, 0xD9, 0x85, 0xD8, + 0xAA, 0xD9, 0x87, 0xD8, 0xAB, 0xD9, 0x85, 0xD8, + 0xAC, 0xD8, 0xAD, 0xD8, 0xAC, 0xD9, 0x85, 0xD8, + 0xAD, 0xD8, 0xAC, 0xD8, 0xAD, 0xD9, 0x85, 0xD8, + 0xAE, 0xD8, 0xAC, 0xD8, 0xAE, 0xD9, 0x85, 0xD8, + 0xB3, 0xD8, 0xAC, 0xD8, 0xB3, 0xD8, 0xAD, 0xD8, + 0xB3, 0xD8, 0xAE, 0xD8, 0xB3, 0xD9, 0x85, 0xD8, + 0xB5, 0xD8, 0xAD, 0xD8, 0xB5, 0xD8, 0xAE, 0xD8, + 0xB5, 0xD9, 0x85, 0xD8, 0xB6, 0xD8, 0xAC, 0xD8, + 0xB6, 0xD8, 0xAD, 0xD8, 0xB6, 0xD8, 0xAE, 0xD8, + 0xB6, 0xD9, 0x85, 0xD8, 0xB7, 0xD8, 0xAD, 0xD8, + 0xB8, 0xD9, 0x85, 0xD8, 0xB9, 0xD8, 0xAC, 0xD8, + 0xB9, 0xD9, 0x85, 0xD8, 0xBA, 0xD8, 0xAC, 0xD8, + 0xBA, 0xD9, 0x85, 0xD9, 0x81, 0xD8, 0xAC, 0xD9, + 0x81, 0xD8, 0xAD, 0xD9, 0x81, 0xD8, 0xAE, 0xD9, + 0x81, 0xD9, 0x85, 0xD9, 0x82, 0xD8, 0xAD, 0xD9, + 0x82, 0xD9, 0x85, 0xD9, 0x83, 0xD8, 0xAC, 0xD9, + 0x83, 0xD8, 0xAD, 0xD9, 0x83, 0xD8, 0xAE, 0xD9, + 0x83, 0xD9, 0x84, 0xD9, 0x83, 0xD9, 0x85, 0xD9, + 0x84, 0xD8, 0xAC, 0xD9, 0x84, 0xD8, 0xAD, 0xD9, + 0x84, 0xD8, 0xAE, 0xD9, 0x84, 0xD9, 0x85, 0xD9, + 0x84, 0xD9, 0x87, 0xD9, 0x85, 0xD8, 0xAC, 0xD9, + 0x85, 0xD8, 0xAD, 0xD9, 0x85, 0xD8, 0xAE, 0xD9, + 0x85, 0xD9, 0x85, 0xD9, 0x86, 0xD8, 0xAC, 0xD9, + 0x86, 0xD8, 0xAD, 0xD9, 0x86, 0xD8, 0xAE, 0xD9, + 0x86, 0xD9, 0x85, 0xD9, 0x86, 0xD9, 0x87, 0xD9, + 0x87, 0xD8, 0xAC, 0xD9, 0x87, 0xD9, 0x85, 0xD9, + 0x87, 0xD9, 0xB0, 0xD9, 0x8A, 0xD8, 0xAC, 0xD9, + 0x8A, 0xD8, 0xAD, 0xD9, 0x8A, 0xD8, 0xAE, 0xD9, + 0x8A, 0xD9, 0x85, 0xD9, 0x8A, 0xD9, 0x87, 0xD9, + 0x8A, 0xD9, 0x94, 0xD9, 0x85, 0xD9, 0x8A, 0xD9, + 0x94, 0xD9, 0x87, 0xD8, 0xA8, 0xD9, 0x85, 0xD8, + 0xA8, 0xD9, 0x87, 0xD8, 0xAA, 0xD9, 0x85, 0xD8, + 0xAA, 0xD9, 0x87, 0xD8, 0xAB, 0xD9, 0x85, 0xD8, + 0xAB, 0xD9, 0x87, 0xD8, 0xB3, 0xD9, 0x85, 0xD8, + 0xB3, 0xD9, 0x87, 0xD8, 0xB4, 0xD9, 0x85, 0xD8, + 0xB4, 0xD9, 0x87, 0xD9, 0x83, 0xD9, 0x84, 0xD9, + 0x83, 0xD9, 0x85, 0xD9, 0x84, 0xD9, 0x85, 0xD9, + 0x86, 0xD9, 0x85, 0xD9, 0x86, 0xD9, 0x87, 0xD9, + 0x8A, 0xD9, 0x85, 0xD9, 0x8A, 0xD9, 0x87, 0xD9, + 0x80, 0xD9, 0x8E, 0xD9, 0x91, 0xD9, 0x80, 0xD9, + 0x8F, 0xD9, 0x91, 0xD9, 0x80, 0xD9, 0x90, 0xD9, + 0x91, 0xD8, 0xB7, 0xD9, 0x89, 0xD8, 0xB7, 0xD9, + 0x8A, 0xD8, 0xB9, 0xD9, 0x89, 0xD8, 0xB9, 0xD9, + 0x8A, 0xD8, 0xBA, 0xD9, 0x89, 0xD8, 0xBA, 0xD9, + 0x8A, 0xD8, 0xB3, 0xD9, 0x89, 0xD8, 0xB3, 0xD9, + 0x8A, 0xD8, 0xB4, 0xD9, 0x89, 0xD8, 0xB4, 0xD9, + 0x8A, 0xD8, 0xAD, 0xD9, 0x89, 0xD8, 0xAD, 0xD9, + 
0x8A, 0xD8, 0xAC, 0xD9, 0x89, 0xD8, 0xAC, 0xD9, + 0x8A, 0xD8, 0xAE, 0xD9, 0x89, 0xD8, 0xAE, 0xD9, + 0x8A, 0xD8, 0xB5, 0xD9, 0x89, 0xD8, 0xB5, 0xD9, + 0x8A, 0xD8, 0xB6, 0xD9, 0x89, 0xD8, 0xB6, 0xD9, + 0x8A, 0xD8, 0xB4, 0xD8, 0xAC, 0xD8, 0xB4, 0xD8, + 0xAD, 0xD8, 0xB4, 0xD8, 0xAE, 0xD8, 0xB4, 0xD9, + 0x85, 0xD8, 0xB4, 0xD8, 0xB1, 0xD8, 0xB3, 0xD8, + 0xB1, 0xD8, 0xB5, 0xD8, 0xB1, 0xD8, 0xB6, 0xD8, + 0xB1, 0xD8, 0xB7, 0xD9, 0x89, 0xD8, 0xB7, 0xD9, + 0x8A, 0xD8, 0xB9, 0xD9, 0x89, 0xD8, 0xB9, 0xD9, + 0x8A, 0xD8, 0xBA, 0xD9, 0x89, 0xD8, 0xBA, 0xD9, + 0x8A, 0xD8, 0xB3, 0xD9, 0x89, 0xD8, 0xB3, 0xD9, + 0x8A, 0xD8, 0xB4, 0xD9, 0x89, 0xD8, 0xB4, 0xD9, + 0x8A, 0xD8, 0xAD, 0xD9, 0x89, 0xD8, 0xAD, 0xD9, + 0x8A, 0xD8, 0xAC, 0xD9, 0x89, 0xD8, 0xAC, 0xD9, + 0x8A, 0xD8, 0xAE, 0xD9, 0x89, 0xD8, 0xAE, 0xD9, + 0x8A, 0xD8, 0xB5, 0xD9, 0x89, 0xD8, 0xB5, 0xD9, + 0x8A, 0xD8, 0xB6, 0xD9, 0x89, 0xD8, 0xB6, 0xD9, + 0x8A, 0xD8, 0xB4, 0xD8, 0xAC, 0xD8, 0xB4, 0xD8, + 0xAD, 0xD8, 0xB4, 0xD8, 0xAE, 0xD8, 0xB4, 0xD9, + 0x85, 0xD8, 0xB4, 0xD8, 0xB1, 0xD8, 0xB3, 0xD8, + 0xB1, 0xD8, 0xB5, 0xD8, 0xB1, 0xD8, 0xB6, 0xD8, + 0xB1, 0xD8, 0xB4, 0xD8, 0xAC, 0xD8, 0xB4, 0xD8, + 0xAD, 0xD8, 0xB4, 0xD8, 0xAE, 0xD8, 0xB4, 0xD9, + 0x85, 0xD8, 0xB3, 0xD9, 0x87, 0xD8, 0xB4, 0xD9, + 0x87, 0xD8, 0xB7, 0xD9, 0x85, 0xD8, 0xB3, 0xD8, + 0xAC, 0xD8, 0xB3, 0xD8, 0xAD, 0xD8, 0xB3, 0xD8, + 0xAE, 0xD8, 0xB4, 0xD8, 0xAC, 0xD8, 0xB4, 0xD8, + 0xAD, 0xD8, 0xB4, 0xD8, 0xAE, 0xD8, 0xB7, 0xD9, + 0x85, 0xD8, 0xB8, 0xD9, 0x85, 0xD8, 0xA7, 0xD9, + 0x8B, 0xD8, 0xA7, 0xD9, 0x8B, 0xD8, 0xAA, 0xD8, + 0xAC, 0xD9, 0x85, 0xD8, 0xAA, 0xD8, 0xAD, 0xD8, + 0xAC, 0xD8, 0xAA, 0xD8, 0xAD, 0xD8, 0xAC, 0xD8, + 0xAA, 0xD8, 0xAD, 0xD9, 0x85, 0xD8, 0xAA, 0xD8, + 0xAE, 0xD9, 0x85, 0xD8, 0xAA, 0xD9, 0x85, 0xD8, + 0xAC, 0xD8, 0xAA, 0xD9, 0x85, 0xD8, 0xAD, 0xD8, + 0xAA, 0xD9, 0x85, 0xD8, 0xAE, 0xD8, 0xAC, 0xD9, + 0x85, 0xD8, 0xAD, 0xD8, 0xAC, 0xD9, 0x85, 0xD8, + 0xAD, 0xD8, 0xAD, 0xD9, 0x85, 0xD9, 0x8A, 0xD8, + 0xAD, 0xD9, 0x85, 0xD9, 0x89, 0xD8, 0xB3, 0xD8, + 0xAD, 0xD8, 0xAC, 0xD8, 0xB3, 0xD8, 0xAC, 0xD8, + 0xAD, 0xD8, 0xB3, 0xD8, 0xAC, 0xD9, 0x89, 0xD8, + 0xB3, 0xD9, 0x85, 0xD8, 0xAD, 0xD8, 0xB3, 0xD9, + 0x85, 0xD8, 0xAD, 0xD8, 0xB3, 0xD9, 0x85, 0xD8, + 0xAC, 0xD8, 0xB3, 0xD9, 0x85, 0xD9, 0x85, 0xD8, + 0xB3, 0xD9, 0x85, 0xD9, 0x85, 0xD8, 0xB5, 0xD8, + 0xAD, 0xD8, 0xAD, 0xD8, 0xB5, 0xD8, 0xAD, 0xD8, + 0xAD, 0xD8, 0xB5, 0xD9, 0x85, 0xD9, 0x85, 0xD8, + 0xB4, 0xD8, 0xAD, 0xD9, 0x85, 0xD8, 0xB4, 0xD8, + 0xAD, 0xD9, 0x85, 0xD8, 0xB4, 0xD8, 0xAC, 0xD9, + 0x8A, 0xD8, 0xB4, 0xD9, 0x85, 0xD8, 0xAE, 0xD8, + 0xB4, 0xD9, 0x85, 0xD8, 0xAE, 0xD8, 0xB4, 0xD9, + 0x85, 0xD9, 0x85, 0xD8, 0xB4, 0xD9, 0x85, 0xD9, + 0x85, 0xD8, 0xB6, 0xD8, 0xAD, 0xD9, 0x89, 0xD8, + 0xB6, 0xD8, 0xAE, 0xD9, 0x85, 0xD8, 0xB6, 0xD8, + 0xAE, 0xD9, 0x85, 0xD8, 0xB7, 0xD9, 0x85, 0xD8, + 0xAD, 0xD8, 0xB7, 0xD9, 0x85, 0xD8, 0xAD, 0xD8, + 0xB7, 0xD9, 0x85, 0xD9, 0x85, 0xD8, 0xB7, 0xD9, + 0x85, 0xD9, 0x8A, 0xD8, 0xB9, 0xD8, 0xAC, 0xD9, + 0x85, 0xD8, 0xB9, 0xD9, 0x85, 0xD9, 0x85, 0xD8, + 0xB9, 0xD9, 0x85, 0xD9, 0x85, 0xD8, 0xB9, 0xD9, + 0x85, 0xD9, 0x89, 0xD8, 0xBA, 0xD9, 0x85, 0xD9, + 0x85, 0xD8, 0xBA, 0xD9, 0x85, 0xD9, 0x8A, 0xD8, + 0xBA, 0xD9, 0x85, 0xD9, 0x89, 0xD9, 0x81, 0xD8, + 0xAE, 0xD9, 0x85, 0xD9, 0x81, 0xD8, 0xAE, 0xD9, + 0x85, 0xD9, 0x82, 0xD9, 0x85, 0xD8, 0xAD, 0xD9, + 0x82, 0xD9, 0x85, 0xD9, 0x85, 0xD9, 0x84, 0xD8, + 0xAD, 0xD9, 0x85, 0xD9, 0x84, 0xD8, 0xAD, 0xD9, + 0x8A, 0xD9, 0x84, 0xD8, 0xAD, 0xD9, 0x89, 0xD9, + 0x84, 0xD8, 0xAC, 0xD8, 0xAC, 0xD9, 0x84, 0xD8, + 0xAC, 0xD8, 0xAC, 0xD9, 0x84, 0xD8, 0xAE, 0xD9, + 
0x85, 0xD9, 0x84, 0xD8, 0xAE, 0xD9, 0x85, 0xD9, + 0x84, 0xD9, 0x85, 0xD8, 0xAD, 0xD9, 0x84, 0xD9, + 0x85, 0xD8, 0xAD, 0xD9, 0x85, 0xD8, 0xAD, 0xD8, + 0xAC, 0xD9, 0x85, 0xD8, 0xAD, 0xD9, 0x85, 0xD9, + 0x85, 0xD8, 0xAD, 0xD9, 0x8A, 0xD9, 0x85, 0xD8, + 0xAC, 0xD8, 0xAD, 0xD9, 0x85, 0xD8, 0xAC, 0xD9, + 0x85, 0xD9, 0x85, 0xD8, 0xAE, 0xD8, 0xAC, 0xD9, + 0x85, 0xD8, 0xAE, 0xD9, 0x85, 0xD9, 0x85, 0xD8, + 0xAC, 0xD8, 0xAE, 0xD9, 0x87, 0xD9, 0x85, 0xD8, + 0xAC, 0xD9, 0x87, 0xD9, 0x85, 0xD9, 0x85, 0xD9, + 0x86, 0xD8, 0xAD, 0xD9, 0x85, 0xD9, 0x86, 0xD8, + 0xAD, 0xD9, 0x89, 0xD9, 0x86, 0xD8, 0xAC, 0xD9, + 0x85, 0xD9, 0x86, 0xD8, 0xAC, 0xD9, 0x85, 0xD9, + 0x86, 0xD8, 0xAC, 0xD9, 0x89, 0xD9, 0x86, 0xD9, + 0x85, 0xD9, 0x8A, 0xD9, 0x86, 0xD9, 0x85, 0xD9, + 0x89, 0xD9, 0x8A, 0xD9, 0x85, 0xD9, 0x85, 0xD9, + 0x8A, 0xD9, 0x85, 0xD9, 0x85, 0xD8, 0xA8, 0xD8, + 0xAE, 0xD9, 0x8A, 0xD8, 0xAA, 0xD8, 0xAC, 0xD9, + 0x8A, 0xD8, 0xAA, 0xD8, 0xAC, 0xD9, 0x89, 0xD8, + 0xAA, 0xD8, 0xAE, 0xD9, 0x8A, 0xD8, 0xAA, 0xD8, + 0xAE, 0xD9, 0x89, 0xD8, 0xAA, 0xD9, 0x85, 0xD9, + 0x8A, 0xD8, 0xAA, 0xD9, 0x85, 0xD9, 0x89, 0xD8, + 0xAC, 0xD9, 0x85, 0xD9, 0x8A, 0xD8, 0xAC, 0xD8, + 0xAD, 0xD9, 0x89, 0xD8, 0xAC, 0xD9, 0x85, 0xD9, + 0x89, 0xD8, 0xB3, 0xD8, 0xAE, 0xD9, 0x89, 0xD8, + 0xB5, 0xD8, 0xAD, 0xD9, 0x8A, 0xD8, 0xB4, 0xD8, + 0xAD, 0xD9, 0x8A, 0xD8, 0xB6, 0xD8, 0xAD, 0xD9, + 0x8A, 0xD9, 0x84, 0xD8, 0xAC, 0xD9, 0x8A, 0xD9, + 0x84, 0xD9, 0x85, 0xD9, 0x8A, 0xD9, 0x8A, 0xD8, + 0xAD, 0xD9, 0x8A, 0xD9, 0x8A, 0xD8, 0xAC, 0xD9, + 0x8A, 0xD9, 0x8A, 0xD9, 0x85, 0xD9, 0x8A, 0xD9, + 0x85, 0xD9, 0x85, 0xD9, 0x8A, 0xD9, 0x82, 0xD9, + 0x85, 0xD9, 0x8A, 0xD9, 0x86, 0xD8, 0xAD, 0xD9, + 0x8A, 0xD9, 0x82, 0xD9, 0x85, 0xD8, 0xAD, 0xD9, + 0x84, 0xD8, 0xAD, 0xD9, 0x85, 0xD8, 0xB9, 0xD9, + 0x85, 0xD9, 0x8A, 0xD9, 0x83, 0xD9, 0x85, 0xD9, + 0x8A, 0xD9, 0x86, 0xD8, 0xAC, 0xD8, 0xAD, 0xD9, + 0x85, 0xD8, 0xAE, 0xD9, 0x8A, 0xD9, 0x84, 0xD8, + 0xAC, 0xD9, 0x85, 0xD9, 0x83, 0xD9, 0x85, 0xD9, + 0x85, 0xD9, 0x84, 0xD8, 0xAC, 0xD9, 0x85, 0xD9, + 0x86, 0xD8, 0xAC, 0xD8, 0xAD, 0xD8, 0xAC, 0xD8, + 0xAD, 0xD9, 0x8A, 0xD8, 0xAD, 0xD8, 0xAC, 0xD9, + 0x8A, 0xD9, 0x85, 0xD8, 0xAC, 0xD9, 0x8A, 0xD9, + 0x81, 0xD9, 0x85, 0xD9, 0x8A, 0xD8, 0xA8, 0xD8, + 0xAD, 0xD9, 0x8A, 0xD9, 0x83, 0xD9, 0x85, 0xD9, + 0x85, 0xD8, 0xB9, 0xD8, 0xAC, 0xD9, 0x85, 0xD8, + 0xB5, 0xD9, 0x85, 0xD9, 0x85, 0xD8, 0xB3, 0xD8, + 0xAE, 0xD9, 0x8A, 0xD9, 0x86, 0xD8, 0xAC, 0xD9, + 0x8A, 0xD8, 0xB5, 0xD9, 0x84, 0xDB, 0x92, 0xD9, + 0x82, 0xD9, 0x84, 0xDB, 0x92, 0xD8, 0xA7, 0xD9, + 0x84, 0xD9, 0x84, 0xD9, 0x87, 0xD8, 0xA7, 0xD9, + 0x83, 0xD8, 0xA8, 0xD8, 0xB1, 0xD9, 0x85, 0xD8, + 0xAD, 0xD9, 0x85, 0xD8, 0xAF, 0xD8, 0xB5, 0xD9, + 0x84, 0xD8, 0xB9, 0xD9, 0x85, 0xD8, 0xB1, 0xD8, + 0xB3, 0xD9, 0x88, 0xD9, 0x84, 0xD8, 0xB9, 0xD9, + 0x84, 0xD9, 0x8A, 0xD9, 0x87, 0xD9, 0x88, 0xD8, + 0xB3, 0xD9, 0x84, 0xD9, 0x85, 0xD8, 0xB5, 0xD9, + 0x84, 0xD9, 0x89, 0xD8, 0xB5, 0xD9, 0x84, 0xD9, + 0x89, 0x20, 0xD8, 0xA7, 0xD9, 0x84, 0xD9, 0x84, + 0xD9, 0x87, 0x20, 0xD8, 0xB9, 0xD9, 0x84, 0xD9, + 0x8A, 0xD9, 0x87, 0x20, 0xD9, 0x88, 0xD8, 0xB3, + 0xD9, 0x84, 0xD9, 0x85, 0xD8, 0xAC, 0xD9, 0x84, + 0x20, 0xD8, 0xAC, 0xD9, 0x84, 0xD8, 0xA7, 0xD9, + 0x84, 0xD9, 0x87, 0xD8, 0xB1, 0xDB, 0x8C, 0xD8, + 0xA7, 0xD9, 0x84, 0x2C, 0xE3, 0x80, 0x81, 0xE3, + 0x80, 0x82, 0x3A, 0x3B, 0x21, 0x3F, 0xE3, 0x80, + 0x96, 0xE3, 0x80, 0x97, 0x2E, 0x2E, 0x2E, 0x2E, + 0x2E, 0xE2, 0x80, 0x94, 0xE2, 0x80, 0x93, 0x5F, + 0x5F, 0x28, 0x29, 0x7B, 0x7D, 0xE3, 0x80, 0x94, + 0xE3, 0x80, 0x95, 0xE3, 0x80, 0x90, 0xE3, 0x80, + 0x91, 0xE3, 0x80, 0x8A, 0xE3, 0x80, 0x8B, 0xE3, + 
0x80, 0x88, 0xE3, 0x80, 0x89, 0xE3, 0x80, 0x8C, + 0xE3, 0x80, 0x8D, 0xE3, 0x80, 0x8E, 0xE3, 0x80, + 0x8F, 0x5B, 0x5D, 0x20, 0xCC, 0x85, 0x20, 0xCC, + 0x85, 0x20, 0xCC, 0x85, 0x20, 0xCC, 0x85, 0x5F, + 0x5F, 0x5F, 0x2C, 0xE3, 0x80, 0x81, 0x2E, 0x3B, + 0x3A, 0x3F, 0x21, 0xE2, 0x80, 0x94, 0x28, 0x29, + 0x7B, 0x7D, 0xE3, 0x80, 0x94, 0xE3, 0x80, 0x95, + 0x23, 0x26, 0x2A, 0x2B, 0x2D, 0x3C, 0x3E, 0x3D, + 0x5C, 0x24, 0x25, 0x40, 0x20, 0xD9, 0x8B, 0xD9, + 0x80, 0xD9, 0x8B, 0x20, 0xD9, 0x8C, 0x20, 0xD9, + 0x8D, 0x20, 0xD9, 0x8E, 0xD9, 0x80, 0xD9, 0x8E, + 0x20, 0xD9, 0x8F, 0xD9, 0x80, 0xD9, 0x8F, 0x20, + 0xD9, 0x90, 0xD9, 0x80, 0xD9, 0x90, 0x20, 0xD9, + 0x91, 0xD9, 0x80, 0xD9, 0x91, 0x20, 0xD9, 0x92, + 0xD9, 0x80, 0xD9, 0x92, 0xD8, 0xA1, 0xD8, 0xA7, + 0xD9, 0x93, 0xD8, 0xA7, 0xD9, 0x93, 0xD8, 0xA7, + 0xD9, 0x94, 0xD8, 0xA7, 0xD9, 0x94, 0xD9, 0x88, + 0xD9, 0x94, 0xD9, 0x88, 0xD9, 0x94, 0xD8, 0xA7, + 0xD9, 0x95, 0xD8, 0xA7, 0xD9, 0x95, 0xD9, 0x8A, + 0xD9, 0x94, 0xD9, 0x8A, 0xD9, 0x94, 0xD9, 0x8A, + 0xD9, 0x94, 0xD9, 0x8A, 0xD9, 0x94, 0xD8, 0xA7, + 0xD8, 0xA7, 0xD8, 0xA8, 0xD8, 0xA8, 0xD8, 0xA8, + 0xD8, 0xA8, 0xD8, 0xA9, 0xD8, 0xA9, 0xD8, 0xAA, + 0xD8, 0xAA, 0xD8, 0xAA, 0xD8, 0xAA, 0xD8, 0xAB, + 0xD8, 0xAB, 0xD8, 0xAB, 0xD8, 0xAB, 0xD8, 0xAC, + 0xD8, 0xAC, 0xD8, 0xAC, 0xD8, 0xAC, 0xD8, 0xAD, + 0xD8, 0xAD, 0xD8, 0xAD, 0xD8, 0xAD, 0xD8, 0xAE, + 0xD8, 0xAE, 0xD8, 0xAE, 0xD8, 0xAE, 0xD8, 0xAF, + 0xD8, 0xAF, 0xD8, 0xB0, 0xD8, 0xB0, 0xD8, 0xB1, + 0xD8, 0xB1, 0xD8, 0xB2, 0xD8, 0xB2, 0xD8, 0xB3, + 0xD8, 0xB3, 0xD8, 0xB3, 0xD8, 0xB3, 0xD8, 0xB4, + 0xD8, 0xB4, 0xD8, 0xB4, 0xD8, 0xB4, 0xD8, 0xB5, + 0xD8, 0xB5, 0xD8, 0xB5, 0xD8, 0xB5, 0xD8, 0xB6, + 0xD8, 0xB6, 0xD8, 0xB6, 0xD8, 0xB6, 0xD8, 0xB7, + 0xD8, 0xB7, 0xD8, 0xB7, 0xD8, 0xB7, 0xD8, 0xB8, + 0xD8, 0xB8, 0xD8, 0xB8, 0xD8, 0xB8, 0xD8, 0xB9, + 0xD8, 0xB9, 0xD8, 0xB9, 0xD8, 0xB9, 0xD8, 0xBA, + 0xD8, 0xBA, 0xD8, 0xBA, 0xD8, 0xBA, 0xD9, 0x81, + 0xD9, 0x81, 0xD9, 0x81, 0xD9, 0x81, 0xD9, 0x82, + 0xD9, 0x82, 0xD9, 0x82, 0xD9, 0x82, 0xD9, 0x83, + 0xD9, 0x83, 0xD9, 0x83, 0xD9, 0x83, 0xD9, 0x84, + 0xD9, 0x84, 0xD9, 0x84, 0xD9, 0x84, 0xD9, 0x85, + 0xD9, 0x85, 0xD9, 0x85, 0xD9, 0x85, 0xD9, 0x86, + 0xD9, 0x86, 0xD9, 0x86, 0xD9, 0x86, 0xD9, 0x87, + 0xD9, 0x87, 0xD9, 0x87, 0xD9, 0x87, 0xD9, 0x88, + 0xD9, 0x88, 0xD9, 0x89, 0xD9, 0x89, 0xD9, 0x8A, + 0xD9, 0x8A, 0xD9, 0x8A, 0xD9, 0x8A, 0xD9, 0x84, + 0xD8, 0xA7, 0xD9, 0x93, 0xD9, 0x84, 0xD8, 0xA7, + 0xD9, 0x93, 0xD9, 0x84, 0xD8, 0xA7, 0xD9, 0x94, + 0xD9, 0x84, 0xD8, 0xA7, 0xD9, 0x94, 0xD9, 0x84, + 0xD8, 0xA7, 0xD9, 0x95, 0xD9, 0x84, 0xD8, 0xA7, + 0xD9, 0x95, 0xD9, 0x84, 0xD8, 0xA7, 0xD9, 0x84, + 0xD8, 0xA7, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, + 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, + 0x2F, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, + 0x37, 0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, + 0x3F, 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, + 0x47, 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, + 0x4F, 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, + 0x57, 0x58, 0x59, 0x5A, 0x5B, 0x5C, 0x5D, 0x5E, + 0x5F, 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, + 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, + 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, + 0x77, 0x78, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, + 0xE2, 0xA6, 0x85, 0xE2, 0xA6, 0x86, 0xE3, 0x80, + 0x82, 0xE3, 0x80, 0x8C, 0xE3, 0x80, 0x8D, 0xE3, + 0x80, 0x81, 0xE3, 0x83, 0xBB, 0xE3, 0x83, 0xB2, + 0xE3, 0x82, 0xA1, 0xE3, 0x82, 0xA3, 0xE3, 0x82, + 0xA5, 0xE3, 0x82, 0xA7, 0xE3, 0x82, 0xA9, 0xE3, + 0x83, 0xA3, 0xE3, 0x83, 0xA5, 0xE3, 0x83, 0xA7, + 0xE3, 0x83, 0x83, 0xE3, 0x83, 0xBC, 0xE3, 0x82, + 
0xA2, 0xE3, 0x82, 0xA4, 0xE3, 0x82, 0xA6, 0xE3, + 0x82, 0xA8, 0xE3, 0x82, 0xAA, 0xE3, 0x82, 0xAB, + 0xE3, 0x82, 0xAD, 0xE3, 0x82, 0xAF, 0xE3, 0x82, + 0xB1, 0xE3, 0x82, 0xB3, 0xE3, 0x82, 0xB5, 0xE3, + 0x82, 0xB7, 0xE3, 0x82, 0xB9, 0xE3, 0x82, 0xBB, + 0xE3, 0x82, 0xBD, 0xE3, 0x82, 0xBF, 0xE3, 0x83, + 0x81, 0xE3, 0x83, 0x84, 0xE3, 0x83, 0x86, 0xE3, + 0x83, 0x88, 0xE3, 0x83, 0x8A, 0xE3, 0x83, 0x8B, + 0xE3, 0x83, 0x8C, 0xE3, 0x83, 0x8D, 0xE3, 0x83, + 0x8E, 0xE3, 0x83, 0x8F, 0xE3, 0x83, 0x92, 0xE3, + 0x83, 0x95, 0xE3, 0x83, 0x98, 0xE3, 0x83, 0x9B, + 0xE3, 0x83, 0x9E, 0xE3, 0x83, 0x9F, 0xE3, 0x83, + 0xA0, 0xE3, 0x83, 0xA1, 0xE3, 0x83, 0xA2, 0xE3, + 0x83, 0xA4, 0xE3, 0x83, 0xA6, 0xE3, 0x83, 0xA8, + 0xE3, 0x83, 0xA9, 0xE3, 0x83, 0xAA, 0xE3, 0x83, + 0xAB, 0xE3, 0x83, 0xAC, 0xE3, 0x83, 0xAD, 0xE3, + 0x83, 0xAF, 0xE3, 0x83, 0xB3, 0xE3, 0x82, 0x99, + 0xE3, 0x82, 0x9A, 0xE1, 0x85, 0xA0, 0xE1, 0x84, + 0x80, 0xE1, 0x84, 0x81, 0xE1, 0x86, 0xAA, 0xE1, + 0x84, 0x82, 0xE1, 0x86, 0xAC, 0xE1, 0x86, 0xAD, + 0xE1, 0x84, 0x83, 0xE1, 0x84, 0x84, 0xE1, 0x84, + 0x85, 0xE1, 0x86, 0xB0, 0xE1, 0x86, 0xB1, 0xE1, + 0x86, 0xB2, 0xE1, 0x86, 0xB3, 0xE1, 0x86, 0xB4, + 0xE1, 0x86, 0xB5, 0xE1, 0x84, 0x9A, 0xE1, 0x84, + 0x86, 0xE1, 0x84, 0x87, 0xE1, 0x84, 0x88, 0xE1, + 0x84, 0xA1, 0xE1, 0x84, 0x89, 0xE1, 0x84, 0x8A, + 0xE1, 0x84, 0x8B, 0xE1, 0x84, 0x8C, 0xE1, 0x84, + 0x8D, 0xE1, 0x84, 0x8E, 0xE1, 0x84, 0x8F, 0xE1, + 0x84, 0x90, 0xE1, 0x84, 0x91, 0xE1, 0x84, 0x92, + 0xE1, 0x85, 0xA1, 0xE1, 0x85, 0xA2, 0xE1, 0x85, + 0xA3, 0xE1, 0x85, 0xA4, 0xE1, 0x85, 0xA5, 0xE1, + 0x85, 0xA6, 0xE1, 0x85, 0xA7, 0xE1, 0x85, 0xA8, + 0xE1, 0x85, 0xA9, 0xE1, 0x85, 0xAA, 0xE1, 0x85, + 0xAB, 0xE1, 0x85, 0xAC, 0xE1, 0x85, 0xAD, 0xE1, + 0x85, 0xAE, 0xE1, 0x85, 0xAF, 0xE1, 0x85, 0xB0, + 0xE1, 0x85, 0xB1, 0xE1, 0x85, 0xB2, 0xE1, 0x85, + 0xB3, 0xE1, 0x85, 0xB4, 0xE1, 0x85, 0xB5, 0xC2, + 0xA2, 0xC2, 0xA3, 0xC2, 0xAC, 0x20, 0xCC, 0x84, + 0xC2, 0xA6, 0xC2, 0xA5, 0xE2, 0x82, 0xA9, 0xE2, + 0x94, 0x82, 0xE2, 0x86, 0x90, 0xE2, 0x86, 0x91, + 0xE2, 0x86, 0x92, 0xE2, 0x86, 0x93, 0xE2, 0x96, + 0xA0, 0xE2, 0x97, 0x8B, 0xF6, 0xF0, 0x9D, 0x85, + 0x97, 0xF0, 0x9D, 0x85, 0xA5, 0xF6, 0xF0, 0x9D, + 0x85, 0x98, 0xF0, 0x9D, 0x85, 0xA5, 0xF6, 0xF0, + 0x9D, 0x85, 0x98, 0xF0, 0x9D, 0x85, 0xA5, 0xF0, + 0x9D, 0x85, 0xAE, 0xF6, 0xF0, 0x9D, 0x85, 0x98, + 0xF0, 0x9D, 0x85, 0xA5, 0xF0, 0x9D, 0x85, 0xAF, + 0xF6, 0xF0, 0x9D, 0x85, 0x98, 0xF0, 0x9D, 0x85, + 0xA5, 0xF0, 0x9D, 0x85, 0xB0, 0xF6, 0xF0, 0x9D, + 0x85, 0x98, 0xF0, 0x9D, 0x85, 0xA5, 0xF0, 0x9D, + 0x85, 0xB1, 0xF6, 0xF0, 0x9D, 0x85, 0x98, 0xF0, + 0x9D, 0x85, 0xA5, 0xF0, 0x9D, 0x85, 0xB2, 0xF6, + 0xF0, 0x9D, 0x86, 0xB9, 0xF0, 0x9D, 0x85, 0xA5, + 0xF6, 0xF0, 0x9D, 0x86, 0xBA, 0xF0, 0x9D, 0x85, + 0xA5, 0xF6, 0xF0, 0x9D, 0x86, 0xB9, 0xF0, 0x9D, + 0x85, 0xA5, 0xF0, 0x9D, 0x85, 0xAE, 0xF6, 0xF0, + 0x9D, 0x86, 0xBA, 0xF0, 0x9D, 0x85, 0xA5, 0xF0, + 0x9D, 0x85, 0xAE, 0xF6, 0xF0, 0x9D, 0x86, 0xB9, + 0xF0, 0x9D, 0x85, 0xA5, 0xF0, 0x9D, 0x85, 0xAF, + 0xF6, 0xF0, 0x9D, 0x86, 0xBA, 0xF0, 0x9D, 0x85, + 0xA5, 0xF0, 0x9D, 0x85, 0xAF, 0x41, 0x42, 0x43, + 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B, + 0x4C, 0x4D, 0x4E, 0x4F, 0x50, 0x51, 0x52, 0x53, + 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0x61, + 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, + 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, + 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, + 0x7A, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, + 0x58, 0x59, 0x5A, 0x61, 0x62, 0x63, 0x64, 0x65, + 
0x66, 0x67, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, + 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, + 0x77, 0x78, 0x79, 0x7A, 0x41, 0x42, 0x43, 0x44, + 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B, 0x4C, + 0x4D, 0x4E, 0x4F, 0x50, 0x51, 0x52, 0x53, 0x54, + 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0x61, 0x62, + 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, + 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, 0x72, + 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, + 0x41, 0x43, 0x44, 0x47, 0x4A, 0x4B, 0x4E, 0x4F, + 0x50, 0x51, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, + 0x59, 0x5A, 0x61, 0x62, 0x63, 0x64, 0x66, 0x68, + 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x70, 0x71, + 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, + 0x7A, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, + 0x58, 0x59, 0x5A, 0x61, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, + 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, + 0x76, 0x77, 0x78, 0x79, 0x7A, 0x41, 0x42, 0x44, + 0x45, 0x46, 0x47, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, + 0x4F, 0x50, 0x51, 0x53, 0x54, 0x55, 0x56, 0x57, + 0x58, 0x59, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, + 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, + 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, + 0x77, 0x78, 0x79, 0x7A, 0x41, 0x42, 0x44, 0x45, + 0x46, 0x47, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4F, + 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x61, + 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, + 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, + 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, + 0x7A, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, + 0x58, 0x59, 0x5A, 0x61, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, + 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, + 0x76, 0x77, 0x78, 0x79, 0x7A, 0x41, 0x42, 0x43, + 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B, + 0x4C, 0x4D, 0x4E, 0x4F, 0x50, 0x51, 0x52, 0x53, + 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0x61, + 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, + 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, + 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, + 0x7A, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, + 0x58, 0x59, 0x5A, 0x61, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, + 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, + 0x76, 0x77, 0x78, 0x79, 0x7A, 0x41, 0x42, 0x43, + 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B, + 0x4C, 0x4D, 0x4E, 0x4F, 0x50, 0x51, 0x52, 0x53, + 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0x61, + 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, + 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, + 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, + 0x7A, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, + 0x58, 0x59, 0x5A, 0x61, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, + 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, + 0x76, 0x77, 0x78, 0x79, 0x7A, 0x41, 0x42, 0x43, + 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B, + 0x4C, 0x4D, 0x4E, 0x4F, 0x50, 0x51, 0x52, 0x53, + 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0x61, + 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, + 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, + 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, + 
0x7A, 0xC4, 0xB1, 0xC8, 0xB7, 0xCE, 0x91, 0xCE, + 0x92, 0xCE, 0x93, 0xCE, 0x94, 0xCE, 0x95, 0xCE, + 0x96, 0xCE, 0x97, 0xCE, 0x98, 0xCE, 0x99, 0xCE, + 0x9A, 0xCE, 0x9B, 0xCE, 0x9C, 0xCE, 0x9D, 0xCE, + 0x9E, 0xCE, 0x9F, 0xCE, 0xA0, 0xCE, 0xA1, 0xCE, + 0x98, 0xCE, 0xA3, 0xCE, 0xA4, 0xCE, 0xA5, 0xCE, + 0xA6, 0xCE, 0xA7, 0xCE, 0xA8, 0xCE, 0xA9, 0xE2, + 0x88, 0x87, 0xCE, 0xB1, 0xCE, 0xB2, 0xCE, 0xB3, + 0xCE, 0xB4, 0xCE, 0xB5, 0xCE, 0xB6, 0xCE, 0xB7, + 0xCE, 0xB8, 0xCE, 0xB9, 0xCE, 0xBA, 0xCE, 0xBB, + 0xCE, 0xBC, 0xCE, 0xBD, 0xCE, 0xBE, 0xCE, 0xBF, + 0xCF, 0x80, 0xCF, 0x81, 0xCF, 0x82, 0xCF, 0x83, + 0xCF, 0x84, 0xCF, 0x85, 0xCF, 0x86, 0xCF, 0x87, + 0xCF, 0x88, 0xCF, 0x89, 0xE2, 0x88, 0x82, 0xCE, + 0xB5, 0xCE, 0xB8, 0xCE, 0xBA, 0xCF, 0x86, 0xCF, + 0x81, 0xCF, 0x80, 0xCE, 0x91, 0xCE, 0x92, 0xCE, + 0x93, 0xCE, 0x94, 0xCE, 0x95, 0xCE, 0x96, 0xCE, + 0x97, 0xCE, 0x98, 0xCE, 0x99, 0xCE, 0x9A, 0xCE, + 0x9B, 0xCE, 0x9C, 0xCE, 0x9D, 0xCE, 0x9E, 0xCE, + 0x9F, 0xCE, 0xA0, 0xCE, 0xA1, 0xCE, 0x98, 0xCE, + 0xA3, 0xCE, 0xA4, 0xCE, 0xA5, 0xCE, 0xA6, 0xCE, + 0xA7, 0xCE, 0xA8, 0xCE, 0xA9, 0xE2, 0x88, 0x87, + 0xCE, 0xB1, 0xCE, 0xB2, 0xCE, 0xB3, 0xCE, 0xB4, + 0xCE, 0xB5, 0xCE, 0xB6, 0xCE, 0xB7, 0xCE, 0xB8, + 0xCE, 0xB9, 0xCE, 0xBA, 0xCE, 0xBB, 0xCE, 0xBC, + 0xCE, 0xBD, 0xCE, 0xBE, 0xCE, 0xBF, 0xCF, 0x80, + 0xCF, 0x81, 0xCF, 0x82, 0xCF, 0x83, 0xCF, 0x84, + 0xCF, 0x85, 0xCF, 0x86, 0xCF, 0x87, 0xCF, 0x88, + 0xCF, 0x89, 0xE2, 0x88, 0x82, 0xCE, 0xB5, 0xCE, + 0xB8, 0xCE, 0xBA, 0xCF, 0x86, 0xCF, 0x81, 0xCF, + 0x80, 0xCE, 0x91, 0xCE, 0x92, 0xCE, 0x93, 0xCE, + 0x94, 0xCE, 0x95, 0xCE, 0x96, 0xCE, 0x97, 0xCE, + 0x98, 0xCE, 0x99, 0xCE, 0x9A, 0xCE, 0x9B, 0xCE, + 0x9C, 0xCE, 0x9D, 0xCE, 0x9E, 0xCE, 0x9F, 0xCE, + 0xA0, 0xCE, 0xA1, 0xCE, 0x98, 0xCE, 0xA3, 0xCE, + 0xA4, 0xCE, 0xA5, 0xCE, 0xA6, 0xCE, 0xA7, 0xCE, + 0xA8, 0xCE, 0xA9, 0xE2, 0x88, 0x87, 0xCE, 0xB1, + 0xCE, 0xB2, 0xCE, 0xB3, 0xCE, 0xB4, 0xCE, 0xB5, + 0xCE, 0xB6, 0xCE, 0xB7, 0xCE, 0xB8, 0xCE, 0xB9, + 0xCE, 0xBA, 0xCE, 0xBB, 0xCE, 0xBC, 0xCE, 0xBD, + 0xCE, 0xBE, 0xCE, 0xBF, 0xCF, 0x80, 0xCF, 0x81, + 0xCF, 0x82, 0xCF, 0x83, 0xCF, 0x84, 0xCF, 0x85, + 0xCF, 0x86, 0xCF, 0x87, 0xCF, 0x88, 0xCF, 0x89, + 0xE2, 0x88, 0x82, 0xCE, 0xB5, 0xCE, 0xB8, 0xCE, + 0xBA, 0xCF, 0x86, 0xCF, 0x81, 0xCF, 0x80, 0xCE, + 0x91, 0xCE, 0x92, 0xCE, 0x93, 0xCE, 0x94, 0xCE, + 0x95, 0xCE, 0x96, 0xCE, 0x97, 0xCE, 0x98, 0xCE, + 0x99, 0xCE, 0x9A, 0xCE, 0x9B, 0xCE, 0x9C, 0xCE, + 0x9D, 0xCE, 0x9E, 0xCE, 0x9F, 0xCE, 0xA0, 0xCE, + 0xA1, 0xCE, 0x98, 0xCE, 0xA3, 0xCE, 0xA4, 0xCE, + 0xA5, 0xCE, 0xA6, 0xCE, 0xA7, 0xCE, 0xA8, 0xCE, + 0xA9, 0xE2, 0x88, 0x87, 0xCE, 0xB1, 0xCE, 0xB2, + 0xCE, 0xB3, 0xCE, 0xB4, 0xCE, 0xB5, 0xCE, 0xB6, + 0xCE, 0xB7, 0xCE, 0xB8, 0xCE, 0xB9, 0xCE, 0xBA, + 0xCE, 0xBB, 0xCE, 0xBC, 0xCE, 0xBD, 0xCE, 0xBE, + 0xCE, 0xBF, 0xCF, 0x80, 0xCF, 0x81, 0xCF, 0x82, + 0xCF, 0x83, 0xCF, 0x84, 0xCF, 0x85, 0xCF, 0x86, + 0xCF, 0x87, 0xCF, 0x88, 0xCF, 0x89, 0xE2, 0x88, + 0x82, 0xCE, 0xB5, 0xCE, 0xB8, 0xCE, 0xBA, 0xCF, + 0x86, 0xCF, 0x81, 0xCF, 0x80, 0xCE, 0x91, 0xCE, + 0x92, 0xCE, 0x93, 0xCE, 0x94, 0xCE, 0x95, 0xCE, + 0x96, 0xCE, 0x97, 0xCE, 0x98, 0xCE, 0x99, 0xCE, + 0x9A, 0xCE, 0x9B, 0xCE, 0x9C, 0xCE, 0x9D, 0xCE, + 0x9E, 0xCE, 0x9F, 0xCE, 0xA0, 0xCE, 0xA1, 0xCE, + 0x98, 0xCE, 0xA3, 0xCE, 0xA4, 0xCE, 0xA5, 0xCE, + 0xA6, 0xCE, 0xA7, 0xCE, 0xA8, 0xCE, 0xA9, 0xE2, + 0x88, 0x87, 0xCE, 0xB1, 0xCE, 0xB2, 0xCE, 0xB3, + 0xCE, 0xB4, 0xCE, 0xB5, 0xCE, 0xB6, 0xCE, 0xB7, + 0xCE, 0xB8, 0xCE, 0xB9, 0xCE, 0xBA, 0xCE, 0xBB, + 0xCE, 0xBC, 0xCE, 0xBD, 0xCE, 0xBE, 0xCE, 0xBF, + 0xCF, 0x80, 0xCF, 0x81, 0xCF, 0x82, 0xCF, 0x83, + 
0xCF, 0x84, 0xCF, 0x85, 0xCF, 0x86, 0xCF, 0x87, + 0xCF, 0x88, 0xCF, 0x89, 0xE2, 0x88, 0x82, 0xCE, + 0xB5, 0xCE, 0xB8, 0xCE, 0xBA, 0xCF, 0x86, 0xCF, + 0x81, 0xCF, 0x80, 0xCF, 0x9C, 0xCF, 0x9D, 0x30, + 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, + 0x39, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, + 0x37, 0x38, 0x39, 0x30, 0x31, 0x32, 0x33, 0x34, + 0x35, 0x36, 0x37, 0x38, 0x39, 0x30, 0x31, 0x32, + 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x30, + 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, + 0x39, 0xF6, 0xE4, 0xB8, 0xBD, 0xF6, 0xE4, 0xB8, + 0xB8, 0xF6, 0xE4, 0xB9, 0x81, 0xF6, 0xF0, 0xA0, + 0x84, 0xA2, 0xF6, 0xE4, 0xBD, 0xA0, 0xF6, 0xE4, + 0xBE, 0xAE, 0xF6, 0xE4, 0xBE, 0xBB, 0xF6, 0xE5, + 0x80, 0x82, 0xF6, 0xE5, 0x81, 0xBA, 0xF6, 0xE5, + 0x82, 0x99, 0xF6, 0xE5, 0x83, 0xA7, 0xF6, 0xE5, + 0x83, 0x8F, 0xF6, 0xE3, 0x92, 0x9E, 0xF6, 0xF0, + 0xA0, 0x98, 0xBA, 0xF6, 0xE5, 0x85, 0x8D, 0xF6, + 0xE5, 0x85, 0x94, 0xF6, 0xE5, 0x85, 0xA4, 0xF6, + 0xE5, 0x85, 0xB7, 0xF6, 0xF0, 0xA0, 0x94, 0x9C, + 0xF6, 0xE3, 0x92, 0xB9, 0xF6, 0xE5, 0x85, 0xA7, + 0xF6, 0xE5, 0x86, 0x8D, 0xF6, 0xF0, 0xA0, 0x95, + 0x8B, 0xF6, 0xE5, 0x86, 0x97, 0xF6, 0xE5, 0x86, + 0xA4, 0xF6, 0xE4, 0xBB, 0x8C, 0xF6, 0xE5, 0x86, + 0xAC, 0xF6, 0xE5, 0x86, 0xB5, 0xF6, 0xF0, 0xA9, + 0x87, 0x9F, 0xF6, 0xE5, 0x87, 0xB5, 0xF6, 0xE5, + 0x88, 0x83, 0xF6, 0xE3, 0x93, 0x9F, 0xF6, 0xE5, + 0x88, 0xBB, 0xF6, 0xE5, 0x89, 0x86, 0xF6, 0xE5, + 0x89, 0xB2, 0xF6, 0xE5, 0x89, 0xB7, 0xF6, 0xE3, + 0x94, 0x95, 0xF6, 0xE5, 0x8B, 0x87, 0xF6, 0xE5, + 0x8B, 0x89, 0xF6, 0xE5, 0x8B, 0xA4, 0xF6, 0xE5, + 0x8B, 0xBA, 0xF6, 0xE5, 0x8C, 0x85, 0xF6, 0xE5, + 0x8C, 0x86, 0xF6, 0xE5, 0x8C, 0x97, 0xF6, 0xE5, + 0x8D, 0x89, 0xF6, 0xE5, 0x8D, 0x91, 0xF6, 0xE5, + 0x8D, 0x9A, 0xF6, 0xE5, 0x8D, 0xB3, 0xF6, 0xE5, + 0x8D, 0xBD, 0xF6, 0xE5, 0x8D, 0xBF, 0xF6, 0xE5, + 0x8D, 0xBF, 0xF6, 0xE5, 0x8D, 0xBF, 0xF6, 0xF0, + 0xA0, 0xA8, 0xAC, 0xF6, 0xE7, 0x81, 0xB0, 0xF6, + 0xE5, 0x8F, 0x8A, 0xF6, 0xE5, 0x8F, 0x9F, 0xF6, + 0xF0, 0xA0, 0xAD, 0xA3, 0xF6, 0xE5, 0x8F, 0xAB, + 0xF6, 0xE5, 0x8F, 0xB1, 0xF6, 0xE5, 0x90, 0x86, + 0xF6, 0xE5, 0x92, 0x9E, 0xF6, 0xE5, 0x90, 0xB8, + 0xF6, 0xE5, 0x91, 0x88, 0xF6, 0xE5, 0x91, 0xA8, + 0xF6, 0xE5, 0x92, 0xA2, 0xF6, 0xE5, 0x93, 0xB6, + 0xF6, 0xE5, 0x94, 0x90, 0xF6, 0xE5, 0x95, 0x93, + 0xF6, 0xE5, 0x95, 0xA3, 0xF6, 0xE5, 0x96, 0x84, + 0xF6, 0xE5, 0x96, 0x84, 0xF6, 0xE5, 0x96, 0x99, + 0xF6, 0xE5, 0x96, 0xAB, 0xF6, 0xE5, 0x96, 0xB3, + 0xF6, 0xE5, 0x97, 0x82, 0xF6, 0xE5, 0x9C, 0x96, + 0xF6, 0xE5, 0x98, 0x86, 0xF6, 0xE5, 0x9C, 0x97, + 0xF6, 0xE5, 0x99, 0x91, 0xF6, 0xE5, 0x99, 0xB4, + 0xF6, 0xE5, 0x88, 0x87, 0xF6, 0xE5, 0xA3, 0xAE, + 0xF6, 0xE5, 0x9F, 0x8E, 0xF6, 0xE5, 0x9F, 0xB4, + 0xF6, 0xE5, 0xA0, 0x8D, 0xF6, 0xE5, 0x9E, 0x8B, + 0xF6, 0xE5, 0xA0, 0xB2, 0xF6, 0xE5, 0xA0, 0xB1, + 0xF6, 0xE5, 0xA2, 0xAC, 0xF6, 0xF0, 0xA1, 0x93, + 0xA4, 0xF6, 0xE5, 0xA3, 0xB2, 0xF6, 0xE5, 0xA3, + 0xB7, 0xF6, 0xE5, 0xA4, 0x86, 0xF6, 0xE5, 0xA4, + 0x9A, 0xF6, 0xE5, 0xA4, 0xA2, 0xF6, 0xE5, 0xA5, + 0xA2, 0xF6, 0xF0, 0xA1, 0x9A, 0xA8, 0xF6, 0xF0, + 0xA1, 0x9B, 0xAA, 0xF6, 0xE5, 0xA7, 0xAC, 0xF6, + 0xE5, 0xA8, 0x9B, 0xF6, 0xE5, 0xA8, 0xA7, 0xF6, + 0xE5, 0xA7, 0x98, 0xF6, 0xE5, 0xA9, 0xA6, 0xF6, + 0xE3, 0x9B, 0xAE, 0xF6, 0xE3, 0x9B, 0xBC, 0xF6, + 0xE5, 0xAC, 0x88, 0xF6, 0xE5, 0xAC, 0xBE, 0xF6, + 0xE5, 0xAC, 0xBE, 0xF6, 0xF0, 0xA1, 0xA7, 0x88, + 0xF6, 0xE5, 0xAF, 0x83, 0xF6, 0xE5, 0xAF, 0x98, + 0xF6, 0xE5, 0xAF, 0xA7, 0xF6, 0xE5, 0xAF, 0xB3, + 0xF6, 0xF0, 0xA1, 0xAC, 0x98, 0xF6, 0xE5, 0xAF, + 0xBF, 0xF6, 0xE5, 0xB0, 0x86, 0xF6, 0xE5, 0xBD, + 0x93, 0xF6, 0xE5, 0xB0, 0xA2, 0xF6, 0xE3, 0x9E, + 
0x81, 0xF6, 0xE5, 0xB1, 0xA0, 0xF6, 0xE5, 0xB1, + 0xAE, 0xF6, 0xE5, 0xB3, 0x80, 0xF6, 0xE5, 0xB2, + 0x8D, 0xF6, 0xF0, 0xA1, 0xB7, 0xA4, 0xF6, 0xE5, + 0xB5, 0x83, 0xF6, 0xF0, 0xA1, 0xB7, 0xA6, 0xF6, + 0xE5, 0xB5, 0xAE, 0xF6, 0xE5, 0xB5, 0xAB, 0xF6, + 0xE5, 0xB5, 0xBC, 0xF6, 0xE5, 0xB7, 0xA1, 0xF6, + 0xE5, 0xB7, 0xA2, 0xF6, 0xE3, 0xA0, 0xAF, 0xF6, + 0xE5, 0xB7, 0xBD, 0xF6, 0xE5, 0xB8, 0xA8, 0xF6, + 0xE5, 0xB8, 0xBD, 0xF6, 0xE5, 0xB9, 0xA9, 0xF6, + 0xE3, 0xA1, 0xA2, 0xF6, 0xF0, 0xA2, 0x86, 0x83, + 0xF6, 0xE3, 0xA1, 0xBC, 0xF6, 0xE5, 0xBA, 0xB0, + 0xF6, 0xE5, 0xBA, 0xB3, 0xF6, 0xE5, 0xBA, 0xB6, + 0xF6, 0xE5, 0xBB, 0x8A, 0xF6, 0xF0, 0xAA, 0x8E, + 0x92, 0xF6, 0xE5, 0xBB, 0xBE, 0xF6, 0xF0, 0xA2, + 0x8C, 0xB1, 0xF6, 0xF0, 0xA2, 0x8C, 0xB1, 0xF6, + 0xE8, 0x88, 0x81, 0xF6, 0xE5, 0xBC, 0xA2, 0xF6, + 0xE5, 0xBC, 0xA2, 0xF6, 0xE3, 0xA3, 0x87, 0xF6, + 0xF0, 0xA3, 0x8A, 0xB8, 0xF6, 0xF0, 0xA6, 0x87, + 0x9A, 0xF6, 0xE5, 0xBD, 0xA2, 0xF6, 0xE5, 0xBD, + 0xAB, 0xF6, 0xE3, 0xA3, 0xA3, 0xF6, 0xE5, 0xBE, + 0x9A, 0xF6, 0xE5, 0xBF, 0x8D, 0xF6, 0xE5, 0xBF, + 0x97, 0xF6, 0xE5, 0xBF, 0xB9, 0xF6, 0xE6, 0x82, + 0x81, 0xF6, 0xE3, 0xA4, 0xBA, 0xF6, 0xE3, 0xA4, + 0x9C, 0xF6, 0xE6, 0x82, 0x94, 0xF6, 0xF0, 0xA2, + 0x9B, 0x94, 0xF6, 0xE6, 0x83, 0x87, 0xF6, 0xE6, + 0x85, 0x88, 0xF6, 0xE6, 0x85, 0x8C, 0xF6, 0xE6, + 0x85, 0x8E, 0xF6, 0xE6, 0x85, 0x8C, 0xF6, 0xE6, + 0x85, 0xBA, 0xF6, 0xE6, 0x86, 0x8E, 0xF6, 0xE6, + 0x86, 0xB2, 0xF6, 0xE6, 0x86, 0xA4, 0xF6, 0xE6, + 0x86, 0xAF, 0xF6, 0xE6, 0x87, 0x9E, 0xF6, 0xE6, + 0x87, 0xB2, 0xF6, 0xE6, 0x87, 0xB6, 0xF6, 0xE6, + 0x88, 0x90, 0xF6, 0xE6, 0x88, 0x9B, 0xF6, 0xE6, + 0x89, 0x9D, 0xF6, 0xE6, 0x8A, 0xB1, 0xF6, 0xE6, + 0x8B, 0x94, 0xF6, 0xE6, 0x8D, 0x90, 0xF6, 0xF0, + 0xA2, 0xAC, 0x8C, 0xF6, 0xE6, 0x8C, 0xBD, 0xF6, + 0xE6, 0x8B, 0xBC, 0xF6, 0xE6, 0x8D, 0xA8, 0xF6, + 0xE6, 0x8E, 0x83, 0xF6, 0xE6, 0x8F, 0xA4, 0xF6, + 0xF0, 0xA2, 0xAF, 0xB1, 0xF6, 0xE6, 0x90, 0xA2, + 0xF6, 0xE6, 0x8F, 0x85, 0xF6, 0xE6, 0x8E, 0xA9, + 0xF6, 0xE3, 0xA8, 0xAE, 0xF6, 0xE6, 0x91, 0xA9, + 0xF6, 0xE6, 0x91, 0xBE, 0xF6, 0xE6, 0x92, 0x9D, + 0xF6, 0xE6, 0x91, 0xB7, 0xF6, 0xE3, 0xA9, 0xAC, + 0xF6, 0xE6, 0x95, 0x8F, 0xF6, 0xE6, 0x95, 0xAC, + 0xF6, 0xF0, 0xA3, 0x80, 0x8A, 0xF6, 0xE6, 0x97, + 0xA3, 0xF6, 0xE6, 0x9B, 0xB8, 0xF6, 0xE6, 0x99, + 0x89, 0xF6, 0xE3, 0xAC, 0x99, 0xF6, 0xE6, 0x9A, + 0x91, 0xF6, 0xE3, 0xAC, 0x88, 0xF6, 0xE3, 0xAB, + 0xA4, 0xF6, 0xE5, 0x86, 0x92, 0xF6, 0xE5, 0x86, + 0x95, 0xF6, 0xE6, 0x9C, 0x80, 0xF6, 0xE6, 0x9A, + 0x9C, 0xF6, 0xE8, 0x82, 0xAD, 0xF6, 0xE4, 0x8F, + 0x99, 0xF6, 0xE6, 0x9C, 0x97, 0xF6, 0xE6, 0x9C, + 0x9B, 0xF6, 0xE6, 0x9C, 0xA1, 0xF6, 0xE6, 0x9D, + 0x9E, 0xF6, 0xE6, 0x9D, 0x93, 0xF6, 0xF0, 0xA3, + 0x8F, 0x83, 0xF6, 0xE3, 0xAD, 0x89, 0xF6, 0xE6, + 0x9F, 0xBA, 0xF6, 0xE6, 0x9E, 0x85, 0xF6, 0xE6, + 0xA1, 0x92, 0xF6, 0xE6, 0xA2, 0x85, 0xF6, 0xF0, + 0xA3, 0x91, 0xAD, 0xF6, 0xE6, 0xA2, 0x8E, 0xF6, + 0xE6, 0xA0, 0x9F, 0xF6, 0xE6, 0xA4, 0x94, 0xF6, + 0xE3, 0xAE, 0x9D, 0xF6, 0xE6, 0xA5, 0x82, 0xF6, + 0xE6, 0xA6, 0xA3, 0xF6, 0xE6, 0xA7, 0xAA, 0xF6, + 0xE6, 0xAA, 0xA8, 0xF6, 0xF0, 0xA3, 0x9A, 0xA3, + 0xF6, 0xE6, 0xAB, 0x9B, 0xF6, 0xE3, 0xB0, 0x98, + 0xF6, 0xE6, 0xAC, 0xA1, 0xF6, 0xF0, 0xA3, 0xA2, + 0xA7, 0xF6, 0xE6, 0xAD, 0x94, 0xF6, 0xE3, 0xB1, + 0x8E, 0xF6, 0xE6, 0xAD, 0xB2, 0xF6, 0xE6, 0xAE, + 0x9F, 0xF6, 0xE6, 0xAE, 0xBA, 0xF6, 0xE6, 0xAE, + 0xBB, 0xF6, 0xF0, 0xA3, 0xAA, 0x8D, 0xF6, 0xF0, + 0xA1, 0xB4, 0x8B, 0xF6, 0xF0, 0xA3, 0xAB, 0xBA, + 0xF6, 0xE6, 0xB1, 0x8E, 0xF6, 0xF0, 0xA3, 0xB2, + 0xBC, 0xF6, 0xE6, 0xB2, 0xBF, 0xF6, 0xE6, 0xB3, + 0x8D, 0xF6, 0xE6, 0xB1, 0xA7, 0xF6, 0xE6, 0xB4, + 
0x96, 0xF6, 0xE6, 0xB4, 0xBE, 0xF6, 0xE6, 0xB5, + 0xB7, 0xF6, 0xE6, 0xB5, 0x81, 0xF6, 0xE6, 0xB5, + 0xA9, 0xF6, 0xE6, 0xB5, 0xB8, 0xF6, 0xE6, 0xB6, + 0x85, 0xF6, 0xF0, 0xA3, 0xB4, 0x9E, 0xF6, 0xE6, + 0xB4, 0xB4, 0xF6, 0xE6, 0xB8, 0xAF, 0xF6, 0xE6, + 0xB9, 0xAE, 0xF6, 0xE3, 0xB4, 0xB3, 0xF6, 0xE6, + 0xBB, 0x8B, 0xF6, 0xE6, 0xBB, 0x87, 0xF6, 0xF0, + 0xA3, 0xBB, 0x91, 0xF6, 0xE6, 0xB7, 0xB9, 0xF6, + 0xE6, 0xBD, 0xAE, 0xF6, 0xF0, 0xA3, 0xBD, 0x9E, + 0xF6, 0xF0, 0xA3, 0xBE, 0x8E, 0xF6, 0xE6, 0xBF, + 0x86, 0xF6, 0xE7, 0x80, 0xB9, 0xF6, 0xE7, 0x80, + 0x9E, 0xF6, 0xE7, 0x80, 0x9B, 0xF6, 0xE3, 0xB6, + 0x96, 0xF6, 0xE7, 0x81, 0x8A, 0xF6, 0xE7, 0x81, + 0xBD, 0xF6, 0xE7, 0x81, 0xB7, 0xF6, 0xE7, 0x82, + 0xAD, 0xF6, 0xF0, 0xA0, 0x94, 0xA5, 0xF6, 0xE7, + 0x85, 0x85, 0xF6, 0xF0, 0xA4, 0x89, 0xA3, 0xF6, + 0xE7, 0x86, 0x9C, 0xF6, 0xF0, 0xA4, 0x8E, 0xAB, + 0xF6, 0xE7, 0x88, 0xA8, 0xF6, 0xE7, 0x88, 0xB5, + 0xF6, 0xE7, 0x89, 0x90, 0xF6, 0xF0, 0xA4, 0x98, + 0x88, 0xF6, 0xE7, 0x8A, 0x80, 0xF6, 0xE7, 0x8A, + 0x95, 0xF6, 0xF0, 0xA4, 0x9C, 0xB5, 0xF6, 0xF0, + 0xA4, 0xA0, 0x94, 0xF6, 0xE7, 0x8D, 0xBA, 0xF6, + 0xE7, 0x8E, 0x8B, 0xF6, 0xE3, 0xBA, 0xAC, 0xF6, + 0xE7, 0x8E, 0xA5, 0xF6, 0xE3, 0xBA, 0xB8, 0xF6, + 0xE3, 0xBA, 0xB8, 0xF6, 0xE7, 0x91, 0x87, 0xF6, + 0xE7, 0x91, 0x9C, 0xF6, 0xE7, 0x91, 0xB1, 0xF6, + 0xE7, 0x92, 0x85, 0xF6, 0xE7, 0x93, 0x8A, 0xF6, + 0xE3, 0xBC, 0x9B, 0xF6, 0xE7, 0x94, 0xA4, 0xF6, + 0xF0, 0xA4, 0xB0, 0xB6, 0xF6, 0xE7, 0x94, 0xBE, + 0xF6, 0xF0, 0xA4, 0xB2, 0x92, 0xF6, 0xE7, 0x95, + 0xB0, 0xF6, 0xF0, 0xA2, 0x86, 0x9F, 0xF6, 0xE7, + 0x98, 0x90, 0xF6, 0xF0, 0xA4, 0xBE, 0xA1, 0xF6, + 0xF0, 0xA4, 0xBE, 0xB8, 0xF6, 0xF0, 0xA5, 0x81, + 0x84, 0xF6, 0xE3, 0xBF, 0xBC, 0xF6, 0xE4, 0x80, + 0x88, 0xF6, 0xE7, 0x9B, 0xB4, 0xF6, 0xF0, 0xA5, + 0x83, 0xB3, 0xF6, 0xF0, 0xA5, 0x83, 0xB2, 0xF6, + 0xF0, 0xA5, 0x84, 0x99, 0xF6, 0xF0, 0xA5, 0x84, + 0xB3, 0xF6, 0xE7, 0x9C, 0x9E, 0xF6, 0xE7, 0x9C, + 0x9F, 0xF6, 0xE7, 0x9C, 0x9F, 0xF6, 0xE7, 0x9D, + 0x8A, 0xF6, 0xE4, 0x80, 0xB9, 0xF6, 0xE7, 0x9E, + 0x8B, 0xF6, 0xE4, 0x81, 0x86, 0xF6, 0xE4, 0x82, + 0x96, 0xF6, 0xF0, 0xA5, 0x90, 0x9D, 0xF6, 0xE7, + 0xA1, 0x8E, 0xF6, 0xE7, 0xA2, 0x8C, 0xF6, 0xE7, + 0xA3, 0x8C, 0xF6, 0xE4, 0x83, 0xA3, 0xF6, 0xF0, + 0xA5, 0x98, 0xA6, 0xF6, 0xE7, 0xA5, 0x96, 0xF6, + 0xF0, 0xA5, 0x9A, 0x9A, 0xF6, 0xF0, 0xA5, 0x9B, + 0x85, 0xF6, 0xE7, 0xA6, 0x8F, 0xF6, 0xE7, 0xA7, + 0xAB, 0xF6, 0xE4, 0x84, 0xAF, 0xF6, 0xE7, 0xA9, + 0x80, 0xF6, 0xE7, 0xA9, 0x8A, 0xF6, 0xE7, 0xA9, + 0x8F, 0xF6, 0xF0, 0xA5, 0xA5, 0xBC, 0xF6, 0xF0, + 0xA5, 0xAA, 0xA7, 0xF6, 0xF0, 0xA5, 0xAA, 0xA7, + 0xF6, 0xE7, 0xAB, 0xAE, 0xF6, 0xE4, 0x88, 0x82, + 0xF6, 0xF0, 0xA5, 0xAE, 0xAB, 0xF6, 0xE7, 0xAF, + 0x86, 0xF6, 0xE7, 0xAF, 0x89, 0xF6, 0xE4, 0x88, + 0xA7, 0xF6, 0xF0, 0xA5, 0xB2, 0x80, 0xF6, 0xE7, + 0xB3, 0x92, 0xF6, 0xE4, 0x8A, 0xA0, 0xF6, 0xE7, + 0xB3, 0xA8, 0xF6, 0xE7, 0xB3, 0xA3, 0xF6, 0xE7, + 0xB4, 0x80, 0xF6, 0xF0, 0xA5, 0xBE, 0x86, 0xF6, + 0xE7, 0xB5, 0xA3, 0xF6, 0xE4, 0x8C, 0x81, 0xF6, + 0xE7, 0xB7, 0x87, 0xF6, 0xE7, 0xB8, 0x82, 0xF6, + 0xE7, 0xB9, 0x85, 0xF6, 0xE4, 0x8C, 0xB4, 0xF6, + 0xF0, 0xA6, 0x88, 0xA8, 0xF6, 0xF0, 0xA6, 0x89, + 0x87, 0xF6, 0xE4, 0x8D, 0x99, 0xF6, 0xF0, 0xA6, + 0x8B, 0x99, 0xF6, 0xE7, 0xBD, 0xBA, 0xF6, 0xF0, + 0xA6, 0x8C, 0xBE, 0xF6, 0xE7, 0xBE, 0x95, 0xF6, + 0xE7, 0xBF, 0xBA, 0xF6, 0xE8, 0x80, 0x85, 0xF6, + 0xF0, 0xA6, 0x93, 0x9A, 0xF6, 0xF0, 0xA6, 0x94, + 0xA3, 0xF6, 0xE8, 0x81, 0xA0, 0xF6, 0xF0, 0xA6, + 0x96, 0xA8, 0xF6, 0xE8, 0x81, 0xB0, 0xF6, 0xF0, + 0xA3, 0x8D, 0x9F, 0xF6, 0xE4, 0x8F, 0x95, 0xF6, + 0xE8, 0x82, 0xB2, 0xF6, 0xE8, 0x84, 0x83, 0xF6, + 
0xE4, 0x90, 0x8B, 0xF6, 0xE8, 0x84, 0xBE, 0xF6, + 0xE5, 0xAA, 0xB5, 0xF6, 0xF0, 0xA6, 0x9E, 0xA7, + 0xF6, 0xF0, 0xA6, 0x9E, 0xB5, 0xF6, 0xF0, 0xA3, + 0x8E, 0x93, 0xF6, 0xF0, 0xA3, 0x8E, 0x9C, 0xF6, + 0xE8, 0x88, 0x81, 0xF6, 0xE8, 0x88, 0x84, 0xF6, + 0xE8, 0xBE, 0x9E, 0xF6, 0xE4, 0x91, 0xAB, 0xF6, + 0xE8, 0x8A, 0x91, 0xF6, 0xE8, 0x8A, 0x8B, 0xF6, + 0xE8, 0x8A, 0x9D, 0xF6, 0xE5, 0x8A, 0xB3, 0xF6, + 0xE8, 0x8A, 0xB1, 0xF6, 0xE8, 0x8A, 0xB3, 0xF6, + 0xE8, 0x8A, 0xBD, 0xF6, 0xE8, 0x8B, 0xA6, 0xF6, + 0xF0, 0xA6, 0xAC, 0xBC, 0xF6, 0xE8, 0x8B, 0xA5, + 0xF6, 0xE8, 0x8C, 0x9D, 0xF6, 0xE8, 0x8D, 0xA3, + 0xF6, 0xE8, 0x8E, 0xAD, 0xF6, 0xE8, 0x8C, 0xA3, + 0xF6, 0xE8, 0x8E, 0xBD, 0xF6, 0xE8, 0x8F, 0xA7, + 0xF6, 0xE8, 0x91, 0x97, 0xF6, 0xE8, 0x8D, 0x93, + 0xF6, 0xE8, 0x8F, 0x8A, 0xF6, 0xE8, 0x8F, 0x8C, + 0xF6, 0xE8, 0x8F, 0x9C, 0xF6, 0xF0, 0xA6, 0xB0, + 0xB6, 0xF6, 0xF0, 0xA6, 0xB5, 0xAB, 0xF6, 0xF0, + 0xA6, 0xB3, 0x95, 0xF6, 0xE4, 0x94, 0xAB, 0xF6, + 0xE8, 0x93, 0xB1, 0xF6, 0xE8, 0x93, 0xB3, 0xF6, + 0xE8, 0x94, 0x96, 0xF6, 0xF0, 0xA7, 0x8F, 0x8A, + 0xF6, 0xE8, 0x95, 0xA4, 0xF6, 0xF0, 0xA6, 0xBC, + 0xAC, 0xF6, 0xE4, 0x95, 0x9D, 0xF6, 0xE4, 0x95, + 0xA1, 0xF6, 0xF0, 0xA6, 0xBE, 0xB1, 0xF6, 0xF0, + 0xA7, 0x83, 0x92, 0xF6, 0xE4, 0x95, 0xAB, 0xF6, + 0xE8, 0x99, 0x90, 0xF6, 0xE8, 0x99, 0x9C, 0xF6, + 0xE8, 0x99, 0xA7, 0xF6, 0xE8, 0x99, 0xA9, 0xF6, + 0xE8, 0x9A, 0xA9, 0xF6, 0xE8, 0x9A, 0x88, 0xF6, + 0xE8, 0x9C, 0x8E, 0xF6, 0xE8, 0x9B, 0xA2, 0xF6, + 0xE8, 0x9D, 0xB9, 0xF6, 0xE8, 0x9C, 0xA8, 0xF6, + 0xE8, 0x9D, 0xAB, 0xF6, 0xE8, 0x9E, 0x86, 0xF6, + 0xE4, 0x97, 0x97, 0xF6, 0xE8, 0x9F, 0xA1, 0xF6, + 0xE8, 0xA0, 0x81, 0xF6, 0xE4, 0x97, 0xB9, 0xF6, + 0xE8, 0xA1, 0xA0, 0xF6, 0xE8, 0xA1, 0xA3, 0xF6, + 0xF0, 0xA7, 0x99, 0xA7, 0xF6, 0xE8, 0xA3, 0x97, + 0xF6, 0xE8, 0xA3, 0x9E, 0xF6, 0xE4, 0x98, 0xB5, + 0xF6, 0xE8, 0xA3, 0xBA, 0xF6, 0xE3, 0x92, 0xBB, + 0xF6, 0xF0, 0xA7, 0xA2, 0xAE, 0xF6, 0xF0, 0xA7, + 0xA5, 0xA6, 0xF6, 0xE4, 0x9A, 0xBE, 0xF6, 0xE4, + 0x9B, 0x87, 0xF6, 0xE8, 0xAA, 0xA0, 0xF6, 0xE8, + 0xAB, 0xAD, 0xF6, 0xE8, 0xAE, 0x8A, 0xF6, 0xE8, + 0xB1, 0x95, 0xF6, 0xF0, 0xA7, 0xB2, 0xA8, 0xF6, + 0xE8, 0xB2, 0xAB, 0xF6, 0xE8, 0xB3, 0x81, 0xF6, + 0xE8, 0xB4, 0x9B, 0xF6, 0xE8, 0xB5, 0xB7, 0xF6, + 0xF0, 0xA7, 0xBC, 0xAF, 0xF6, 0xF0, 0xA0, 0xA0, + 0x84, 0xF6, 0xE8, 0xB7, 0x8B, 0xF6, 0xE8, 0xB6, + 0xBC, 0xF6, 0xE8, 0xB7, 0xB0, 0xF6, 0xF0, 0xA0, + 0xA3, 0x9E, 0xF6, 0xE8, 0xBB, 0x94, 0xF6, 0xE8, + 0xBC, 0xB8, 0xF6, 0xF0, 0xA8, 0x97, 0x92, 0xF6, + 0xF0, 0xA8, 0x97, 0xAD, 0xF6, 0xE9, 0x82, 0x94, + 0xF6, 0xE9, 0x83, 0xB1, 0xF6, 0xE9, 0x84, 0x91, + 0xF6, 0xF0, 0xA8, 0x9C, 0xAE, 0xF6, 0xE9, 0x84, + 0x9B, 0xF6, 0xE9, 0x88, 0xB8, 0xF6, 0xE9, 0x8B, + 0x97, 0xF6, 0xE9, 0x8B, 0x98, 0xF6, 0xE9, 0x89, + 0xBC, 0xF6, 0xE9, 0x8F, 0xB9, 0xF6, 0xE9, 0x90, + 0x95, 0xF6, 0xF0, 0xA8, 0xAF, 0xBA, 0xF6, 0xE9, + 0x96, 0x8B, 0xF6, 0xE4, 0xA6, 0x95, 0xF6, 0xE9, + 0x96, 0xB7, 0xF6, 0xF0, 0xA8, 0xB5, 0xB7, 0xF6, + 0xE4, 0xA7, 0xA6, 0xF6, 0xE9, 0x9B, 0x83, 0xF6, + 0xE5, 0xB6, 0xB2, 0xF6, 0xE9, 0x9C, 0xA3, 0xF6, + 0xF0, 0xA9, 0x85, 0x85, 0xF6, 0xF0, 0xA9, 0x88, + 0x9A, 0xF6, 0xE4, 0xA9, 0xAE, 0xF6, 0xE4, 0xA9, + 0xB6, 0xF6, 0xE9, 0x9F, 0xA0, 0xF6, 0xF0, 0xA9, + 0x90, 0x8A, 0xF6, 0xE4, 0xAA, 0xB2, 0xF6, 0xF0, + 0xA9, 0x92, 0x96, 0xF6, 0xE9, 0xA0, 0x8B, 0xF6, + 0xE9, 0xA0, 0x8B, 0xF6, 0xE9, 0xA0, 0xA9, 0xF6, + 0xF0, 0xA9, 0x96, 0xB6, 0xF6, 0xE9, 0xA3, 0xA2, + 0xF6, 0xE4, 0xAC, 0xB3, 0xF6, 0xE9, 0xA4, 0xA9, + 0xF6, 0xE9, 0xA6, 0xA7, 0xF6, 0xE9, 0xA7, 0x82, + 0xF6, 0xE9, 0xA7, 0xBE, 0xF6, 0xE4, 0xAF, 0x8E, + 0xF6, 0xF0, 0xA9, 0xAC, 0xB0, 0xF6, 0xE9, 0xAC, + 
0x92, 0xF6, 0xE9, 0xB1, 0x80, 0xF6, 0xE9, 0xB3, + 0xBD, 0xF6, 0xE4, 0xB3, 0x8E, 0xF6, 0xE4, 0xB3, + 0xAD, 0xF6, 0xE9, 0xB5, 0xA7, 0xF6, 0xF0, 0xAA, + 0x83, 0x8E, 0xF6, 0xE4, 0xB3, 0xB8, 0xF6, 0xF0, + 0xAA, 0x84, 0x85, 0xF6, 0xF0, 0xAA, 0x88, 0x8E, + 0xF6, 0xF0, 0xAA, 0x8A, 0x91, 0xF6, 0xE9, 0xBA, + 0xBB, 0xF6, 0xE4, 0xB5, 0x96, 0xF6, 0xE9, 0xBB, + 0xB9, 0xF6, 0xE9, 0xBB, 0xBE, 0xF6, 0xE9, 0xBC, + 0x85, 0xF6, 0xE9, 0xBC, 0x8F, 0xF6, 0xE9, 0xBC, + 0x96, 0xF6, 0xE9, 0xBC, 0xBB, 0xF6, 0xF0, 0xAA, + 0x98, 0x80, + }, +}; + +static const uchar_t u8_case_common_b2_tbl[2][2][256] = { + { + { + 0, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, 1, 2, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, 3, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + }, + { + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + 4, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + }, + + }, + { + { + 0, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, 
N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, 1, 2, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, 3, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + }, + { + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + 4, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + }, + + }, + +}; + +static const u8_displacement_t u8_tolower_b3_tbl[2][5][256] = { + { + { /* Third byte table 0. */ + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 
}, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { 0, 0 }, + { 1, 60 }, { 2, 123 }, { 3, 185 }, { 4, 257 }, + { 5, 321 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { 6, 373 }, { 7, 439 }, + { 8, 465 }, { 9, 561 }, { 10, 593 }, { 11, 649 }, + { 12, 703 }, { 13, 749 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + }, + { /* Third byte table 1. */ + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { 14, 795 }, { 15, 891 }, { 16, 987 }, { 17, 1068 }, + { 18, 1155 }, { 19, 1245 }, { 20, 1299 }, { 21, 1386 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 
0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + }, + { /* Third byte table 2. */ + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { 22, 1443 }, { 23, 1448 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { 24, 1496 }, { 25, 1526 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + }, + { /* Third byte table 3. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { 26, 1574 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + }, + { /* Third byte table 4. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { 27, 1652 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + }, + }, + { + { /* Third byte table 0. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { 0, 0 }, + { 1, 60 }, { 2, 123 }, { 3, 185 }, { 4, 257 }, + { 5, 321 }, { 6, 383 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { 7, 401 }, { 8, 467 }, + { 9, 505 }, { 10, 601 }, { 11, 633 }, { 12, 689 }, + { 13, 753 }, { 14, 803 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + }, + { /* Third byte table 1. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { 15, 849 }, { 16, 945 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { 17, 963 }, { 18, 1059 }, { 19, 1155 }, { 20, 1236 }, + { 21, 1323 }, { 22, 1413 }, { 23, 1467 }, { 24, 1554 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + }, + { /* Third byte table 2. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { 25, 1611 }, { 26, 1619 }, { 27, 1667 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { 28, 1670 }, { 29, 1700 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { 30, 1748 }, { 31, 1889 }, { 32, 1911 }, { 33, 2007 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + }, + { /* Third byte table 3. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { 34, 2061 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + }, + { /* Third byte table 4. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { 35, 2139 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + }, + }, +}; + +static const uchar_t u8_tolower_b4_tbl[2][36][257] = { + { + { /* Fourth byte table 0. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30, + 32, 34, 36, 38, 40, 42, 44, 46, + 46, 48, 50, 52, 54, 56, 58, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, + }, + { /* Fourth byte table 1. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 2, 4, 4, 6, 6, 8, + 8, 10, 10, 12, 12, 14, 14, 16, + 16, 18, 18, 20, 20, 22, 22, 24, + 24, 26, 26, 28, 28, 30, 30, 32, + 32, 34, 34, 36, 36, 38, 38, 40, + 40, 42, 42, 44, 44, 46, 46, 48, + 48, 49, 49, 51, 51, 53, 53, 55, + 55, 55, 57, 57, 59, 59, 61, 61, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, + }, + { /* Fourth byte table 2. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 2, 2, 4, 4, 6, 6, + 8, 8, 8, 10, 10, 12, 12, 14, + 14, 16, 16, 18, 18, 20, 20, 22, + 22, 24, 24, 26, 26, 28, 28, 30, + 30, 32, 32, 34, 34, 36, 36, 38, + 38, 40, 40, 42, 42, 44, 44, 46, + 46, 48, 48, 50, 50, 52, 52, 54, + 54, 56, 58, 58, 60, 60, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, + }, + { /* Fourth byte table 3. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 2, 4, 4, 6, 6, 8, + 10, 10, 12, 14, 16, 16, 16, 18, + 20, 22, 24, 24, 26, 28, 28, 30, + 32, 34, 34, 34, 34, 36, 38, 38, + 40, 42, 42, 44, 44, 46, 46, 48, + 50, 50, 52, 52, 52, 54, 54, 56, + 58, 58, 60, 62, 64, 64, 66, 66, + 68, 70, 70, 70, 70, 72, 72, 72, + 72, 72, 72, 72, 72, 72, 72, 72, + 72, 72, 72, 72, 72, 72, 72, 72, + 72, 72, 72, 72, 72, 72, 72, 72, + 72, 72, 72, 72, 72, 72, 72, 72, + 72, 72, 72, 72, 72, 72, 72, 72, + 72, 72, 72, 72, 72, 72, 72, 72, + 72, 72, 72, 72, 72, 72, 72, 72, + 72, 72, 72, 72, 72, 72, 72, 72, + 72, + }, + { /* Fourth byte table 4. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 2, 4, 4, + 6, 8, 8, 10, 12, 12, 14, 14, + 16, 16, 18, 18, 20, 20, 22, 22, + 24, 24, 26, 26, 28, 28, 28, 30, + 30, 32, 32, 34, 34, 36, 36, 38, + 38, 40, 40, 42, 42, 44, 44, 46, + 46, 46, 48, 50, 50, 52, 52, 54, + 56, 58, 58, 60, 60, 62, 62, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, + }, + { /* Fourth byte table 5. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 2, 4, 4, 6, 6, 8, + 8, 10, 10, 12, 12, 14, 14, 16, + 16, 18, 18, 20, 20, 22, 22, 24, + 24, 26, 26, 28, 28, 30, 30, 32, + 32, 34, 34, 36, 36, 38, 38, 40, + 40, 42, 42, 44, 44, 46, 46, 48, + 48, 50, 50, 52, 52, 52, 52, 52, + 52, 52, 52, 52, 52, 52, 52, 52, + 52, 52, 52, 52, 52, 52, 52, 52, + 52, 52, 52, 52, 52, 52, 52, 52, + 52, 52, 52, 52, 52, 52, 52, 52, + 52, 52, 52, 52, 52, 52, 52, 52, + 52, 52, 52, 52, 52, 52, 52, 52, + 52, 52, 52, 52, 52, 52, 52, 52, + 52, 52, 52, 52, 52, 52, 52, 52, + 52, 52, 52, 52, 52, 52, 52, 52, + 52, + }, + { /* Fourth byte table 6. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 2, + 2, 4, 6, 8, 8, 10, 10, 12, + 14, 14, 16, 18, 20, 22, 24, 26, + 28, 30, 32, 34, 36, 38, 40, 42, + 44, 46, 48, 48, 50, 52, 54, 56, + 58, 60, 62, 64, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, + }, + { /* Fourth byte table 7. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 2, 4, 4, 6, 6, 8, + 8, 10, 10, 12, 12, 14, 14, 16, + 16, 18, 18, 20, 20, 22, 22, 24, + 24, 24, 24, 24, 24, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, + }, + { /* Fourth byte table 8. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30, + 32, 34, 36, 38, 40, 42, 44, 46, + 48, 50, 52, 54, 56, 58, 60, 62, + 64, 66, 68, 70, 72, 74, 76, 78, + 80, 82, 84, 86, 88, 90, 92, 94, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, + }, + { /* Fourth byte table 9. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 2, 4, 4, 6, 6, 8, + 8, 10, 10, 12, 12, 14, 14, 16, + 16, 18, 18, 20, 20, 22, 22, 24, + 24, 26, 26, 28, 28, 30, 30, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, + }, + { /* Fourth byte table 10. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 4, 4, 6, 6, 8, + 8, 10, 10, 12, 12, 14, 14, 16, + 16, 18, 18, 20, 20, 22, 22, 24, + 24, 26, 26, 28, 28, 30, 30, 32, + 32, 34, 34, 36, 36, 38, 38, 40, + 40, 42, 42, 44, 44, 46, 46, 48, + 48, 50, 50, 52, 52, 54, 54, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, + }, + { /* Fourth byte table 11. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 2, 2, 4, 4, 6, 6, + 8, 8, 10, 10, 12, 12, 14, 14, + 14, 16, 16, 18, 18, 20, 20, 22, + 22, 24, 24, 26, 26, 28, 28, 30, + 30, 32, 32, 34, 34, 36, 36, 38, + 38, 40, 40, 42, 42, 44, 44, 46, + 46, 48, 48, 50, 50, 52, 52, 52, + 52, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, + }, + { /* Fourth byte table 12. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 2, 4, 4, 6, 6, 8, + 8, 10, 10, 12, 12, 14, 14, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 18, 20, 22, 24, 26, 28, + 30, 32, 34, 36, 38, 40, 42, 44, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, + }, + { /* Fourth byte table 13. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30, + 32, 34, 36, 38, 40, 42, 44, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, + }, + { /* Fourth byte table 14. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 3, 6, 6, 9, 9, 12, + 12, 15, 15, 18, 18, 21, 21, 24, + 24, 27, 27, 30, 30, 33, 33, 36, + 36, 39, 39, 42, 42, 45, 45, 48, + 48, 51, 51, 54, 54, 57, 57, 60, + 60, 63, 63, 66, 66, 69, 69, 72, + 72, 75, 75, 78, 78, 81, 81, 84, + 84, 87, 87, 90, 90, 93, 93, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, + }, + { /* Fourth byte table 15. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 3, 6, 6, 9, 9, 12, + 12, 15, 15, 18, 18, 21, 21, 24, + 24, 27, 27, 30, 30, 33, 33, 36, + 36, 39, 39, 42, 42, 45, 45, 48, + 48, 51, 51, 54, 54, 57, 57, 60, + 60, 63, 63, 66, 66, 69, 69, 72, + 72, 75, 75, 78, 78, 81, 81, 84, + 84, 87, 87, 90, 90, 93, 93, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, + }, + { /* Fourth byte table 16. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 3, 6, 6, 9, 9, 12, + 12, 15, 15, 18, 18, 21, 21, 24, + 24, 27, 27, 30, 30, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, + 33, 36, 36, 39, 39, 42, 42, 45, + 45, 48, 48, 51, 51, 54, 54, 57, + 57, 60, 60, 63, 63, 66, 66, 69, + 69, 72, 72, 75, 75, 78, 78, 81, + 81, 81, 81, 81, 81, 81, 81, 81, + 81, 81, 81, 81, 81, 81, 81, 81, + 81, 81, 81, 81, 81, 81, 81, 81, + 81, 81, 81, 81, 81, 81, 81, 81, + 81, 81, 81, 81, 81, 81, 81, 81, + 81, 81, 81, 81, 81, 81, 81, 81, + 81, 81, 81, 81, 81, 81, 81, 81, + 81, 81, 81, 81, 81, 81, 81, 81, + 81, + }, + { /* Fourth byte table 17. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 3, 6, 6, 9, 9, 12, + 12, 15, 15, 18, 18, 21, 21, 24, + 24, 27, 27, 30, 30, 33, 33, 36, + 36, 39, 39, 42, 42, 45, 45, 48, + 48, 51, 51, 54, 54, 57, 57, 60, + 60, 63, 63, 66, 66, 69, 69, 72, + 72, 75, 75, 78, 78, 81, 81, 84, + 84, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, + }, + { /* Fourth byte table 18. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 21, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 27, 30, 33, 36, 39, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, + 42, 45, 48, 51, 54, 57, 60, 63, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 69, 72, 75, 78, 81, 84, 87, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, + }, + { /* Fourth byte table 19. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 21, 21, 24, 24, 27, 27, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 33, 36, 39, 42, 45, 48, 51, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, + }, + { /* Fourth byte table 20. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 21, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 27, 30, 33, 36, 39, 42, 45, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 51, 54, 57, 60, 63, 66, 69, + 72, 72, 72, 72, 72, 72, 72, 72, + 72, 75, 78, 81, 84, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, + }, + { /* Fourth byte table 21. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, + 15, 18, 21, 24, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 27, + 27, 30, 33, 36, 39, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, + 42, 45, 48, 51, 54, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, + }, + { /* Fourth byte table 22. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 2, + 2, 2, 2, 3, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, + 5, + }, + { /* Fourth byte table 23. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 21, + 24, 27, 30, 33, 36, 39, 42, 45, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, + }, + { /* Fourth byte table 24. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 3, + 6, 9, 12, 15, 18, 21, 24, 27, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, + }, + { /* Fourth byte table 25. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 21, + 24, 27, 30, 33, 36, 39, 42, 45, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, + }, + { /* Fourth byte table 26. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 3, 6, 9, 12, 15, 18, + 21, 24, 27, 30, 33, 36, 39, 42, + 45, 48, 51, 54, 57, 60, 63, 66, + 69, 72, 75, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, + }, + { /* Fourth byte table 27. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 52, 56, 60, + 64, 68, 72, 76, 80, 84, 88, 92, + 96, 100, 104, 108, 112, 116, 120, 124, + 128, 132, 136, 140, 144, 148, 152, 152, + 152, 152, 152, 152, 152, 152, 152, 152, + 152, 152, 152, 152, 152, 152, 152, 152, + 152, 152, 152, 152, 152, 152, 152, 152, + 152, 152, 152, 152, 152, 152, 152, 152, + 152, 152, 152, 152, 152, 152, 152, 152, + 152, 152, 152, 152, 152, 152, 152, 152, + 152, 152, 152, 152, 152, 152, 152, 152, + 152, 152, 152, 152, 152, 152, 152, 152, + 152, 152, 152, 152, 152, 152, 152, 152, + 152, 152, 152, 152, 152, 152, 152, 152, + 152, 152, 152, 152, 152, 152, 152, 152, + 152, + }, + { /* Fourth byte table 28. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, + }, + { /* Fourth byte table 29. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, + }, + { /* Fourth byte table 30. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, + }, + { /* Fourth byte table 31. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, + }, + { /* Fourth byte table 32. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, + }, + { /* Fourth byte table 33. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, + }, + { /* Fourth byte table 34. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, + }, + { /* Fourth byte table 35. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, + }, + }, + { + { /* Fourth byte table 0. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30, + 32, 34, 36, 38, 40, 42, 44, 46, + 46, 48, 50, 52, 54, 56, 58, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, + }, + { /* Fourth byte table 1. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 2, 4, 4, 6, 6, 8, + 8, 10, 10, 12, 12, 14, 14, 16, + 16, 18, 18, 20, 20, 22, 22, 24, + 24, 26, 26, 28, 28, 30, 30, 32, + 32, 34, 34, 36, 36, 38, 38, 40, + 40, 42, 42, 44, 44, 46, 46, 48, + 48, 49, 49, 51, 51, 53, 53, 55, + 55, 55, 57, 57, 59, 59, 61, 61, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, + }, + { /* Fourth byte table 2. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 2, 2, 4, 4, 6, 6, + 8, 8, 8, 10, 10, 12, 12, 14, + 14, 16, 16, 18, 18, 20, 20, 22, + 22, 24, 24, 26, 26, 28, 28, 30, + 30, 32, 32, 34, 34, 36, 36, 38, + 38, 40, 40, 42, 42, 44, 44, 46, + 46, 48, 48, 50, 50, 52, 52, 54, + 54, 56, 58, 58, 60, 60, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, + }, + { /* Fourth byte table 3. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 2, 4, 4, 6, 6, 8, + 10, 10, 12, 14, 16, 16, 16, 18, + 20, 22, 24, 24, 26, 28, 28, 30, + 32, 34, 34, 34, 34, 36, 38, 38, + 40, 42, 42, 44, 44, 46, 46, 48, + 50, 50, 52, 52, 52, 54, 54, 56, + 58, 58, 60, 62, 64, 64, 66, 66, + 68, 70, 70, 70, 70, 72, 72, 72, + 72, 72, 72, 72, 72, 72, 72, 72, + 72, 72, 72, 72, 72, 72, 72, 72, + 72, 72, 72, 72, 72, 72, 72, 72, + 72, 72, 72, 72, 72, 72, 72, 72, + 72, 72, 72, 72, 72, 72, 72, 72, + 72, 72, 72, 72, 72, 72, 72, 72, + 72, 72, 72, 72, 72, 72, 72, 72, + 72, 72, 72, 72, 72, 72, 72, 72, + 72, + }, + { /* Fourth byte table 4. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 2, 4, 4, + 6, 8, 8, 10, 12, 12, 14, 14, + 16, 16, 18, 18, 20, 20, 22, 22, + 24, 24, 26, 26, 28, 28, 28, 30, + 30, 32, 32, 34, 34, 36, 36, 38, + 38, 40, 40, 42, 42, 44, 44, 46, + 46, 46, 48, 50, 50, 52, 52, 54, + 56, 58, 58, 60, 60, 62, 62, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, + }, + { /* Fourth byte table 5. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 2, 4, 4, 6, 6, 8, + 8, 10, 10, 12, 12, 14, 14, 16, + 16, 18, 18, 20, 20, 22, 22, 24, + 24, 26, 26, 28, 28, 30, 30, 32, + 32, 34, 34, 36, 36, 38, 38, 40, + 40, 42, 42, 44, 44, 46, 46, 48, + 48, 50, 50, 52, 52, 52, 52, 52, + 52, 52, 52, 55, 57, 57, 59, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, + }, + { /* Fourth byte table 6. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 2, 2, 4, 6, 8, 10, + 10, 12, 12, 14, 14, 16, 16, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, + }, + { /* Fourth byte table 7. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 2, + 2, 4, 6, 8, 8, 10, 10, 12, + 14, 14, 16, 18, 20, 22, 24, 26, + 28, 30, 32, 34, 36, 38, 40, 42, + 44, 46, 48, 48, 50, 52, 54, 56, + 58, 60, 62, 64, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, + }, + { /* Fourth byte table 8. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 2, 4, 4, 6, 6, 8, + 8, 10, 10, 12, 12, 14, 14, 16, + 16, 18, 18, 20, 20, 22, 22, 24, + 24, 24, 24, 24, 24, 26, 26, 26, + 28, 28, 30, 32, 32, 32, 34, 36, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, + }, + { /* Fourth byte table 9. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30, + 32, 34, 36, 38, 40, 42, 44, 46, + 48, 50, 52, 54, 56, 58, 60, 62, + 64, 66, 68, 70, 72, 74, 76, 78, + 80, 82, 84, 86, 88, 90, 92, 94, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, + }, + { /* Fourth byte table 10. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 2, 4, 4, 6, 6, 8, + 8, 10, 10, 12, 12, 14, 14, 16, + 16, 18, 18, 20, 20, 22, 22, 24, + 24, 26, 26, 28, 28, 30, 30, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, + }, + { /* Fourth byte table 11. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 4, 4, 6, 6, 8, + 8, 10, 10, 12, 12, 14, 14, 16, + 16, 18, 18, 20, 20, 22, 22, 24, + 24, 26, 26, 28, 28, 30, 30, 32, + 32, 34, 34, 36, 36, 38, 38, 40, + 40, 42, 42, 44, 44, 46, 46, 48, + 48, 50, 50, 52, 52, 54, 54, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, + }, + { /* Fourth byte table 12. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 4, 4, 6, 6, 8, 8, + 10, 10, 12, 12, 14, 14, 16, 16, + 16, 18, 18, 20, 20, 22, 22, 24, + 24, 26, 26, 28, 28, 30, 30, 32, + 32, 34, 34, 36, 36, 38, 38, 40, + 40, 42, 42, 44, 44, 46, 46, 48, + 48, 50, 50, 52, 52, 54, 54, 56, + 56, 58, 58, 60, 60, 62, 62, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, + }, + { /* Fourth byte table 13. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 2, 4, 4, 6, 6, 8, + 8, 10, 10, 12, 12, 14, 14, 16, + 16, 18, 18, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 22, 24, 26, 28, 30, 32, + 34, 36, 38, 40, 42, 44, 46, 48, + 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, + 50, + }, + { /* Fourth byte table 14. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30, + 32, 34, 36, 38, 40, 42, 44, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, + }, + { /* Fourth byte table 15. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 21, + 24, 27, 30, 33, 36, 39, 42, 45, + 48, 51, 54, 57, 60, 63, 66, 69, + 72, 75, 78, 81, 84, 87, 90, 93, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, + }, + { /* Fourth byte table 16. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, + }, + { /* Fourth byte table 17. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 3, 6, 6, 9, 9, 12, + 12, 15, 15, 18, 18, 21, 21, 24, + 24, 27, 27, 30, 30, 33, 33, 36, + 36, 39, 39, 42, 42, 45, 45, 48, + 48, 51, 51, 54, 54, 57, 57, 60, + 60, 63, 63, 66, 66, 69, 69, 72, + 72, 75, 75, 78, 78, 81, 81, 84, + 84, 87, 87, 90, 90, 93, 93, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, + }, + { /* Fourth byte table 18. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 3, 6, 6, 9, 9, 12, + 12, 15, 15, 18, 18, 21, 21, 24, + 24, 27, 27, 30, 30, 33, 33, 36, + 36, 39, 39, 42, 42, 45, 45, 48, + 48, 51, 51, 54, 54, 57, 57, 60, + 60, 63, 63, 66, 66, 69, 69, 72, + 72, 75, 75, 78, 78, 81, 81, 84, + 84, 87, 87, 90, 90, 93, 93, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, + }, + { /* Fourth byte table 19. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 3, 6, 6, 9, 9, 12, + 12, 15, 15, 18, 18, 21, 21, 24, + 24, 27, 27, 30, 30, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, + 33, 36, 36, 39, 39, 42, 42, 45, + 45, 48, 48, 51, 51, 54, 54, 57, + 57, 60, 60, 63, 63, 66, 66, 69, + 69, 72, 72, 75, 75, 78, 78, 81, + 81, 81, 81, 81, 81, 81, 81, 81, + 81, 81, 81, 81, 81, 81, 81, 81, + 81, 81, 81, 81, 81, 81, 81, 81, + 81, 81, 81, 81, 81, 81, 81, 81, + 81, 81, 81, 81, 81, 81, 81, 81, + 81, 81, 81, 81, 81, 81, 81, 81, + 81, 81, 81, 81, 81, 81, 81, 81, + 81, 81, 81, 81, 81, 81, 81, 81, + 81, + }, + { /* Fourth byte table 20. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 3, 6, 6, 9, 9, 12, + 12, 15, 15, 18, 18, 21, 21, 24, + 24, 27, 27, 30, 30, 33, 33, 36, + 36, 39, 39, 42, 42, 45, 45, 48, + 48, 51, 51, 54, 54, 57, 57, 60, + 60, 63, 63, 66, 66, 69, 69, 72, + 72, 75, 75, 78, 78, 81, 81, 84, + 84, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, + }, + { /* Fourth byte table 21. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 21, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 27, 30, 33, 36, 39, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, + 42, 45, 48, 51, 54, 57, 60, 63, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 69, 72, 75, 78, 81, 84, 87, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, + }, + { /* Fourth byte table 22. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 21, 21, 24, 24, 27, 27, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 33, 36, 39, 42, 45, 48, 51, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, + }, + { /* Fourth byte table 23. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 21, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 27, 30, 33, 36, 39, 42, 45, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 51, 54, 57, 60, 63, 66, 69, + 72, 72, 72, 72, 72, 72, 72, 72, + 72, 75, 78, 81, 84, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, + }, + { /* Fourth byte table 24. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, + 15, 18, 21, 24, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 27, + 27, 30, 33, 36, 39, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, + 42, 45, 48, 51, 54, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, + }, + { /* Fourth byte table 25. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 2, + 2, 2, 2, 3, 5, 5, 5, 5, + 5, 5, 5, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, + }, + { /* Fourth byte table 26. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 21, + 24, 27, 30, 33, 36, 39, 42, 45, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, + }, + { /* Fourth byte table 27. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, + }, + { /* Fourth byte table 28. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 3, + 6, 9, 12, 15, 18, 21, 24, 27, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, + }, + { /* Fourth byte table 29. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 21, + 24, 27, 30, 33, 36, 39, 42, 45, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, + }, + { /* Fourth byte table 30. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 21, + 24, 27, 30, 33, 36, 39, 42, 45, + 48, 51, 54, 57, 60, 63, 66, 69, + 72, 75, 78, 81, 84, 87, 90, 93, + 96, 99, 102, 105, 108, 111, 114, 117, + 120, 123, 126, 129, 132, 135, 138, 141, + 141, 141, 141, 141, 141, 141, 141, 141, + 141, 141, 141, 141, 141, 141, 141, 141, + 141, 141, 141, 141, 141, 141, 141, 141, + 141, 141, 141, 141, 141, 141, 141, 141, + 141, 141, 141, 141, 141, 141, 141, 141, + 141, 141, 141, 141, 141, 141, 141, 141, + 141, 141, 141, 141, 141, 141, 141, 141, + 141, 141, 141, 141, 141, 141, 141, 141, + 141, 141, 141, 141, 141, 141, 141, 141, + 141, 141, 141, 141, 141, 141, 141, 141, + 141, + }, + { /* Fourth byte table 31. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 3, 5, 8, 10, 10, 10, + 13, 13, 16, 16, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, + 22, + }, + { /* Fourth byte table 32. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 3, 6, 6, 9, 9, 12, + 12, 15, 15, 18, 18, 21, 21, 24, + 24, 27, 27, 30, 30, 33, 33, 36, + 36, 39, 39, 42, 42, 45, 45, 48, + 48, 51, 51, 54, 54, 57, 57, 60, + 60, 63, 63, 66, 66, 69, 69, 72, + 72, 75, 75, 78, 78, 81, 81, 84, + 84, 87, 87, 90, 90, 93, 93, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, + }, + { /* Fourth byte table 33. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 3, 6, 6, 9, 9, 12, + 12, 15, 15, 18, 18, 21, 21, 24, + 24, 27, 27, 30, 30, 33, 33, 36, + 36, 39, 39, 42, 42, 45, 45, 48, + 48, 51, 51, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, + }, + { /* Fourth byte table 34. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 3, 6, 9, 12, 15, 18, + 21, 24, 27, 30, 33, 36, 39, 42, + 45, 48, 51, 54, 57, 60, 63, 66, + 69, 72, 75, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, + }, + { /* Fourth byte table 35. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 52, 56, 60, + 64, 68, 72, 76, 80, 84, 88, 92, + 96, 100, 104, 108, 112, 116, 120, 124, + 128, 132, 136, 140, 144, 148, 152, 156, + 160, 160, 160, 160, 160, 160, 160, 160, + 160, 160, 160, 160, 160, 160, 160, 160, + 160, 160, 160, 160, 160, 160, 160, 160, + 160, 160, 160, 160, 160, 160, 160, 160, + 160, 160, 160, 160, 160, 160, 160, 160, + 160, 160, 160, 160, 160, 160, 160, 160, + 160, 160, 160, 160, 160, 160, 160, 160, + 160, 160, 160, 160, 160, 160, 160, 160, + 160, 160, 160, 160, 160, 160, 160, 160, + 160, 160, 160, 160, 160, 160, 160, 160, + 160, 160, 160, 160, 160, 160, 160, 160, + 160, + }, + }, +}; + +static const uchar_t u8_tolower_final_tbl[2][2299] = { + { + 0xC3, 0xA0, 0xC3, 0xA1, 0xC3, 0xA2, 0xC3, 0xA3, + 0xC3, 0xA4, 0xC3, 0xA5, 0xC3, 0xA6, 0xC3, 0xA7, + 0xC3, 0xA8, 0xC3, 0xA9, 0xC3, 0xAA, 0xC3, 0xAB, + 0xC3, 0xAC, 0xC3, 0xAD, 0xC3, 0xAE, 0xC3, 0xAF, + 0xC3, 0xB0, 0xC3, 0xB1, 0xC3, 0xB2, 0xC3, 0xB3, + 0xC3, 0xB4, 0xC3, 0xB5, 0xC3, 0xB6, 0xC3, 0xB8, + 0xC3, 0xB9, 0xC3, 0xBA, 0xC3, 0xBB, 0xC3, 0xBC, + 0xC3, 0xBD, 0xC3, 0xBE, 0xC4, 0x81, 0xC4, 0x83, + 0xC4, 0x85, 0xC4, 0x87, 0xC4, 0x89, 0xC4, 0x8B, + 0xC4, 0x8D, 0xC4, 0x8F, 0xC4, 0x91, 0xC4, 0x93, + 0xC4, 0x95, 0xC4, 0x97, 0xC4, 0x99, 0xC4, 0x9B, + 0xC4, 0x9D, 0xC4, 0x9F, 0xC4, 0xA1, 0xC4, 0xA3, + 0xC4, 0xA5, 0xC4, 0xA7, 0xC4, 0xA9, 0xC4, 0xAB, + 0xC4, 0xAD, 0xC4, 0xAF, 0x69, 0xC4, 0xB3, 0xC4, + 0xB5, 0xC4, 0xB7, 0xC4, 0xBA, 0xC4, 0xBC, 0xC4, + 0xBE, 0xC5, 0x80, 0xC5, 0x82, 0xC5, 0x84, 0xC5, + 0x86, 0xC5, 0x88, 0xC5, 0x8B, 0xC5, 0x8D, 0xC5, + 0x8F, 0xC5, 0x91, 0xC5, 0x93, 0xC5, 0x95, 0xC5, + 0x97, 0xC5, 0x99, 0xC5, 0x9B, 0xC5, 0x9D, 0xC5, + 0x9F, 0xC5, 0xA1, 0xC5, 0xA3, 0xC5, 0xA5, 0xC5, + 0xA7, 0xC5, 0xA9, 0xC5, 0xAB, 0xC5, 0xAD, 0xC5, + 0xAF, 0xC5, 0xB1, 0xC5, 0xB3, 0xC5, 0xB5, 0xC5, + 0xB7, 0xC3, 0xBF, 0xC5, 0xBA, 0xC5, 0xBC, 0xC5, + 0xBE, 0xC9, 0x93, 0xC6, 0x83, 0xC6, 0x85, 0xC9, + 0x94, 0xC6, 0x88, 0xC9, 0x96, 0xC9, 0x97, 0xC6, + 0x8C, 0xC7, 0x9D, 0xC9, 0x99, 0xC9, 0x9B, 0xC6, + 0x92, 0xC9, 0xA0, 0xC9, 0xA3, 0xC9, 0xA9, 0xC9, + 0xA8, 0xC6, 0x99, 0xC9, 0xAF, 0xC9, 0xB2, 0xC9, + 0xB5, 0xC6, 0xA1, 0xC6, 0xA3, 0xC6, 
0xA5, 0xCA, + 0x80, 0xC6, 0xA8, 0xCA, 0x83, 0xC6, 0xAD, 0xCA, + 0x88, 0xC6, 0xB0, 0xCA, 0x8A, 0xCA, 0x8B, 0xC6, + 0xB4, 0xC6, 0xB6, 0xCA, 0x92, 0xC6, 0xB9, 0xC6, + 0xBD, 0xC7, 0x86, 0xC7, 0x86, 0xC7, 0x89, 0xC7, + 0x89, 0xC7, 0x8C, 0xC7, 0x8C, 0xC7, 0x8E, 0xC7, + 0x90, 0xC7, 0x92, 0xC7, 0x94, 0xC7, 0x96, 0xC7, + 0x98, 0xC7, 0x9A, 0xC7, 0x9C, 0xC7, 0x9F, 0xC7, + 0xA1, 0xC7, 0xA3, 0xC7, 0xA5, 0xC7, 0xA7, 0xC7, + 0xA9, 0xC7, 0xAB, 0xC7, 0xAD, 0xC7, 0xAF, 0xC7, + 0xB3, 0xC7, 0xB3, 0xC7, 0xB5, 0xC6, 0x95, 0xC6, + 0xBF, 0xC7, 0xB9, 0xC7, 0xBB, 0xC7, 0xBD, 0xC7, + 0xBF, 0xC8, 0x81, 0xC8, 0x83, 0xC8, 0x85, 0xC8, + 0x87, 0xC8, 0x89, 0xC8, 0x8B, 0xC8, 0x8D, 0xC8, + 0x8F, 0xC8, 0x91, 0xC8, 0x93, 0xC8, 0x95, 0xC8, + 0x97, 0xC8, 0x99, 0xC8, 0x9B, 0xC8, 0x9D, 0xC8, + 0x9F, 0xC6, 0x9E, 0xC8, 0xA3, 0xC8, 0xA5, 0xC8, + 0xA7, 0xC8, 0xA9, 0xC8, 0xAB, 0xC8, 0xAD, 0xC8, + 0xAF, 0xC8, 0xB1, 0xC8, 0xB3, 0xCE, 0xAC, 0xCE, + 0xAD, 0xCE, 0xAE, 0xCE, 0xAF, 0xCF, 0x8C, 0xCF, + 0x8D, 0xCF, 0x8E, 0xCE, 0xB1, 0xCE, 0xB2, 0xCE, + 0xB3, 0xCE, 0xB4, 0xCE, 0xB5, 0xCE, 0xB6, 0xCE, + 0xB7, 0xCE, 0xB8, 0xCE, 0xB9, 0xCE, 0xBA, 0xCE, + 0xBB, 0xCE, 0xBC, 0xCE, 0xBD, 0xCE, 0xBE, 0xCE, + 0xBF, 0xCF, 0x80, 0xCF, 0x81, 0xCF, 0x83, 0xCF, + 0x84, 0xCF, 0x85, 0xCF, 0x86, 0xCF, 0x87, 0xCF, + 0x88, 0xCF, 0x89, 0xCF, 0x8A, 0xCF, 0x8B, 0xCF, + 0x99, 0xCF, 0x9B, 0xCF, 0x9D, 0xCF, 0x9F, 0xCF, + 0xA1, 0xCF, 0xA3, 0xCF, 0xA5, 0xCF, 0xA7, 0xCF, + 0xA9, 0xCF, 0xAB, 0xCF, 0xAD, 0xCF, 0xAF, 0xCE, + 0xB8, 0xD1, 0x90, 0xD1, 0x91, 0xD1, 0x92, 0xD1, + 0x93, 0xD1, 0x94, 0xD1, 0x95, 0xD1, 0x96, 0xD1, + 0x97, 0xD1, 0x98, 0xD1, 0x99, 0xD1, 0x9A, 0xD1, + 0x9B, 0xD1, 0x9C, 0xD1, 0x9D, 0xD1, 0x9E, 0xD1, + 0x9F, 0xD0, 0xB0, 0xD0, 0xB1, 0xD0, 0xB2, 0xD0, + 0xB3, 0xD0, 0xB4, 0xD0, 0xB5, 0xD0, 0xB6, 0xD0, + 0xB7, 0xD0, 0xB8, 0xD0, 0xB9, 0xD0, 0xBA, 0xD0, + 0xBB, 0xD0, 0xBC, 0xD0, 0xBD, 0xD0, 0xBE, 0xD0, + 0xBF, 0xD1, 0x80, 0xD1, 0x81, 0xD1, 0x82, 0xD1, + 0x83, 0xD1, 0x84, 0xD1, 0x85, 0xD1, 0x86, 0xD1, + 0x87, 0xD1, 0x88, 0xD1, 0x89, 0xD1, 0x8A, 0xD1, + 0x8B, 0xD1, 0x8C, 0xD1, 0x8D, 0xD1, 0x8E, 0xD1, + 0x8F, 0xD1, 0xA1, 0xD1, 0xA3, 0xD1, 0xA5, 0xD1, + 0xA7, 0xD1, 0xA9, 0xD1, 0xAB, 0xD1, 0xAD, 0xD1, + 0xAF, 0xD1, 0xB1, 0xD1, 0xB3, 0xD1, 0xB5, 0xD1, + 0xB7, 0xD1, 0xB9, 0xD1, 0xBB, 0xD1, 0xBD, 0xD1, + 0xBF, 0xD2, 0x81, 0xD2, 0x8B, 0xD2, 0x8D, 0xD2, + 0x8F, 0xD2, 0x91, 0xD2, 0x93, 0xD2, 0x95, 0xD2, + 0x97, 0xD2, 0x99, 0xD2, 0x9B, 0xD2, 0x9D, 0xD2, + 0x9F, 0xD2, 0xA1, 0xD2, 0xA3, 0xD2, 0xA5, 0xD2, + 0xA7, 0xD2, 0xA9, 0xD2, 0xAB, 0xD2, 0xAD, 0xD2, + 0xAF, 0xD2, 0xB1, 0xD2, 0xB3, 0xD2, 0xB5, 0xD2, + 0xB7, 0xD2, 0xB9, 0xD2, 0xBB, 0xD2, 0xBD, 0xD2, + 0xBF, 0xD3, 0x82, 0xD3, 0x84, 0xD3, 0x86, 0xD3, + 0x88, 0xD3, 0x8A, 0xD3, 0x8C, 0xD3, 0x8E, 0xD3, + 0x91, 0xD3, 0x93, 0xD3, 0x95, 0xD3, 0x97, 0xD3, + 0x99, 0xD3, 0x9B, 0xD3, 0x9D, 0xD3, 0x9F, 0xD3, + 0xA1, 0xD3, 0xA3, 0xD3, 0xA5, 0xD3, 0xA7, 0xD3, + 0xA9, 0xD3, 0xAB, 0xD3, 0xAD, 0xD3, 0xAF, 0xD3, + 0xB1, 0xD3, 0xB3, 0xD3, 0xB5, 0xD3, 0xB9, 0xD4, + 0x81, 0xD4, 0x83, 0xD4, 0x85, 0xD4, 0x87, 0xD4, + 0x89, 0xD4, 0x8B, 0xD4, 0x8D, 0xD4, 0x8F, 0xD5, + 0xA1, 0xD5, 0xA2, 0xD5, 0xA3, 0xD5, 0xA4, 0xD5, + 0xA5, 0xD5, 0xA6, 0xD5, 0xA7, 0xD5, 0xA8, 0xD5, + 0xA9, 0xD5, 0xAA, 0xD5, 0xAB, 0xD5, 0xAC, 0xD5, + 0xAD, 0xD5, 0xAE, 0xD5, 0xAF, 0xD5, 0xB0, 0xD5, + 0xB1, 0xD5, 0xB2, 0xD5, 0xB3, 0xD5, 0xB4, 0xD5, + 0xB5, 0xD5, 0xB6, 0xD5, 0xB7, 0xD5, 0xB8, 0xD5, + 0xB9, 0xD5, 0xBA, 0xD5, 0xBB, 0xD5, 0xBC, 0xD5, + 0xBD, 0xD5, 0xBE, 0xD5, 0xBF, 0xD6, 0x80, 0xD6, + 0x81, 0xD6, 0x82, 0xD6, 0x83, 0xD6, 0x84, 0xD6, + 0x85, 0xD6, 0x86, 0xE1, 0xB8, 0x81, 
0xE1, 0xB8, + 0x83, 0xE1, 0xB8, 0x85, 0xE1, 0xB8, 0x87, 0xE1, + 0xB8, 0x89, 0xE1, 0xB8, 0x8B, 0xE1, 0xB8, 0x8D, + 0xE1, 0xB8, 0x8F, 0xE1, 0xB8, 0x91, 0xE1, 0xB8, + 0x93, 0xE1, 0xB8, 0x95, 0xE1, 0xB8, 0x97, 0xE1, + 0xB8, 0x99, 0xE1, 0xB8, 0x9B, 0xE1, 0xB8, 0x9D, + 0xE1, 0xB8, 0x9F, 0xE1, 0xB8, 0xA1, 0xE1, 0xB8, + 0xA3, 0xE1, 0xB8, 0xA5, 0xE1, 0xB8, 0xA7, 0xE1, + 0xB8, 0xA9, 0xE1, 0xB8, 0xAB, 0xE1, 0xB8, 0xAD, + 0xE1, 0xB8, 0xAF, 0xE1, 0xB8, 0xB1, 0xE1, 0xB8, + 0xB3, 0xE1, 0xB8, 0xB5, 0xE1, 0xB8, 0xB7, 0xE1, + 0xB8, 0xB9, 0xE1, 0xB8, 0xBB, 0xE1, 0xB8, 0xBD, + 0xE1, 0xB8, 0xBF, 0xE1, 0xB9, 0x81, 0xE1, 0xB9, + 0x83, 0xE1, 0xB9, 0x85, 0xE1, 0xB9, 0x87, 0xE1, + 0xB9, 0x89, 0xE1, 0xB9, 0x8B, 0xE1, 0xB9, 0x8D, + 0xE1, 0xB9, 0x8F, 0xE1, 0xB9, 0x91, 0xE1, 0xB9, + 0x93, 0xE1, 0xB9, 0x95, 0xE1, 0xB9, 0x97, 0xE1, + 0xB9, 0x99, 0xE1, 0xB9, 0x9B, 0xE1, 0xB9, 0x9D, + 0xE1, 0xB9, 0x9F, 0xE1, 0xB9, 0xA1, 0xE1, 0xB9, + 0xA3, 0xE1, 0xB9, 0xA5, 0xE1, 0xB9, 0xA7, 0xE1, + 0xB9, 0xA9, 0xE1, 0xB9, 0xAB, 0xE1, 0xB9, 0xAD, + 0xE1, 0xB9, 0xAF, 0xE1, 0xB9, 0xB1, 0xE1, 0xB9, + 0xB3, 0xE1, 0xB9, 0xB5, 0xE1, 0xB9, 0xB7, 0xE1, + 0xB9, 0xB9, 0xE1, 0xB9, 0xBB, 0xE1, 0xB9, 0xBD, + 0xE1, 0xB9, 0xBF, 0xE1, 0xBA, 0x81, 0xE1, 0xBA, + 0x83, 0xE1, 0xBA, 0x85, 0xE1, 0xBA, 0x87, 0xE1, + 0xBA, 0x89, 0xE1, 0xBA, 0x8B, 0xE1, 0xBA, 0x8D, + 0xE1, 0xBA, 0x8F, 0xE1, 0xBA, 0x91, 0xE1, 0xBA, + 0x93, 0xE1, 0xBA, 0x95, 0xE1, 0xBA, 0xA1, 0xE1, + 0xBA, 0xA3, 0xE1, 0xBA, 0xA5, 0xE1, 0xBA, 0xA7, + 0xE1, 0xBA, 0xA9, 0xE1, 0xBA, 0xAB, 0xE1, 0xBA, + 0xAD, 0xE1, 0xBA, 0xAF, 0xE1, 0xBA, 0xB1, 0xE1, + 0xBA, 0xB3, 0xE1, 0xBA, 0xB5, 0xE1, 0xBA, 0xB7, + 0xE1, 0xBA, 0xB9, 0xE1, 0xBA, 0xBB, 0xE1, 0xBA, + 0xBD, 0xE1, 0xBA, 0xBF, 0xE1, 0xBB, 0x81, 0xE1, + 0xBB, 0x83, 0xE1, 0xBB, 0x85, 0xE1, 0xBB, 0x87, + 0xE1, 0xBB, 0x89, 0xE1, 0xBB, 0x8B, 0xE1, 0xBB, + 0x8D, 0xE1, 0xBB, 0x8F, 0xE1, 0xBB, 0x91, 0xE1, + 0xBB, 0x93, 0xE1, 0xBB, 0x95, 0xE1, 0xBB, 0x97, + 0xE1, 0xBB, 0x99, 0xE1, 0xBB, 0x9B, 0xE1, 0xBB, + 0x9D, 0xE1, 0xBB, 0x9F, 0xE1, 0xBB, 0xA1, 0xE1, + 0xBB, 0xA3, 0xE1, 0xBB, 0xA5, 0xE1, 0xBB, 0xA7, + 0xE1, 0xBB, 0xA9, 0xE1, 0xBB, 0xAB, 0xE1, 0xBB, + 0xAD, 0xE1, 0xBB, 0xAF, 0xE1, 0xBB, 0xB1, 0xE1, + 0xBB, 0xB3, 0xE1, 0xBB, 0xB5, 0xE1, 0xBB, 0xB7, + 0xE1, 0xBB, 0xB9, 0xE1, 0xBC, 0x80, 0xE1, 0xBC, + 0x81, 0xE1, 0xBC, 0x82, 0xE1, 0xBC, 0x83, 0xE1, + 0xBC, 0x84, 0xE1, 0xBC, 0x85, 0xE1, 0xBC, 0x86, + 0xE1, 0xBC, 0x87, 0xE1, 0xBC, 0x90, 0xE1, 0xBC, + 0x91, 0xE1, 0xBC, 0x92, 0xE1, 0xBC, 0x93, 0xE1, + 0xBC, 0x94, 0xE1, 0xBC, 0x95, 0xE1, 0xBC, 0xA0, + 0xE1, 0xBC, 0xA1, 0xE1, 0xBC, 0xA2, 0xE1, 0xBC, + 0xA3, 0xE1, 0xBC, 0xA4, 0xE1, 0xBC, 0xA5, 0xE1, + 0xBC, 0xA6, 0xE1, 0xBC, 0xA7, 0xE1, 0xBC, 0xB0, + 0xE1, 0xBC, 0xB1, 0xE1, 0xBC, 0xB2, 0xE1, 0xBC, + 0xB3, 0xE1, 0xBC, 0xB4, 0xE1, 0xBC, 0xB5, 0xE1, + 0xBC, 0xB6, 0xE1, 0xBC, 0xB7, 0xE1, 0xBD, 0x80, + 0xE1, 0xBD, 0x81, 0xE1, 0xBD, 0x82, 0xE1, 0xBD, + 0x83, 0xE1, 0xBD, 0x84, 0xE1, 0xBD, 0x85, 0xE1, + 0xBD, 0x91, 0xE1, 0xBD, 0x93, 0xE1, 0xBD, 0x95, + 0xE1, 0xBD, 0x97, 0xE1, 0xBD, 0xA0, 0xE1, 0xBD, + 0xA1, 0xE1, 0xBD, 0xA2, 0xE1, 0xBD, 0xA3, 0xE1, + 0xBD, 0xA4, 0xE1, 0xBD, 0xA5, 0xE1, 0xBD, 0xA6, + 0xE1, 0xBD, 0xA7, 0xE1, 0xBE, 0x80, 0xE1, 0xBE, + 0x81, 0xE1, 0xBE, 0x82, 0xE1, 0xBE, 0x83, 0xE1, + 0xBE, 0x84, 0xE1, 0xBE, 0x85, 0xE1, 0xBE, 0x86, + 0xE1, 0xBE, 0x87, 0xE1, 0xBE, 0x90, 0xE1, 0xBE, + 0x91, 0xE1, 0xBE, 0x92, 0xE1, 0xBE, 0x93, 0xE1, + 0xBE, 0x94, 0xE1, 0xBE, 0x95, 0xE1, 0xBE, 0x96, + 0xE1, 0xBE, 0x97, 0xE1, 0xBE, 0xA0, 0xE1, 0xBE, + 0xA1, 0xE1, 0xBE, 0xA2, 0xE1, 0xBE, 0xA3, 0xE1, + 0xBE, 0xA4, 0xE1, 0xBE, 0xA5, 0xE1, 
0xBE, 0xA6, + 0xE1, 0xBE, 0xA7, 0xE1, 0xBE, 0xB0, 0xE1, 0xBE, + 0xB1, 0xE1, 0xBD, 0xB0, 0xE1, 0xBD, 0xB1, 0xE1, + 0xBE, 0xB3, 0xE1, 0xBD, 0xB2, 0xE1, 0xBD, 0xB3, + 0xE1, 0xBD, 0xB4, 0xE1, 0xBD, 0xB5, 0xE1, 0xBF, + 0x83, 0xE1, 0xBF, 0x90, 0xE1, 0xBF, 0x91, 0xE1, + 0xBD, 0xB6, 0xE1, 0xBD, 0xB7, 0xE1, 0xBF, 0xA0, + 0xE1, 0xBF, 0xA1, 0xE1, 0xBD, 0xBA, 0xE1, 0xBD, + 0xBB, 0xE1, 0xBF, 0xA5, 0xE1, 0xBD, 0xB8, 0xE1, + 0xBD, 0xB9, 0xE1, 0xBD, 0xBC, 0xE1, 0xBD, 0xBD, + 0xE1, 0xBF, 0xB3, 0xCF, 0x89, 0x6B, 0xC3, 0xA5, + 0xE2, 0x85, 0xB0, 0xE2, 0x85, 0xB1, 0xE2, 0x85, + 0xB2, 0xE2, 0x85, 0xB3, 0xE2, 0x85, 0xB4, 0xE2, + 0x85, 0xB5, 0xE2, 0x85, 0xB6, 0xE2, 0x85, 0xB7, + 0xE2, 0x85, 0xB8, 0xE2, 0x85, 0xB9, 0xE2, 0x85, + 0xBA, 0xE2, 0x85, 0xBB, 0xE2, 0x85, 0xBC, 0xE2, + 0x85, 0xBD, 0xE2, 0x85, 0xBE, 0xE2, 0x85, 0xBF, + 0xE2, 0x93, 0x90, 0xE2, 0x93, 0x91, 0xE2, 0x93, + 0x92, 0xE2, 0x93, 0x93, 0xE2, 0x93, 0x94, 0xE2, + 0x93, 0x95, 0xE2, 0x93, 0x96, 0xE2, 0x93, 0x97, + 0xE2, 0x93, 0x98, 0xE2, 0x93, 0x99, 0xE2, 0x93, + 0x9A, 0xE2, 0x93, 0x9B, 0xE2, 0x93, 0x9C, 0xE2, + 0x93, 0x9D, 0xE2, 0x93, 0x9E, 0xE2, 0x93, 0x9F, + 0xE2, 0x93, 0xA0, 0xE2, 0x93, 0xA1, 0xE2, 0x93, + 0xA2, 0xE2, 0x93, 0xA3, 0xE2, 0x93, 0xA4, 0xE2, + 0x93, 0xA5, 0xE2, 0x93, 0xA6, 0xE2, 0x93, 0xA7, + 0xE2, 0x93, 0xA8, 0xE2, 0x93, 0xA9, 0xEF, 0xBD, + 0x81, 0xEF, 0xBD, 0x82, 0xEF, 0xBD, 0x83, 0xEF, + 0xBD, 0x84, 0xEF, 0xBD, 0x85, 0xEF, 0xBD, 0x86, + 0xEF, 0xBD, 0x87, 0xEF, 0xBD, 0x88, 0xEF, 0xBD, + 0x89, 0xEF, 0xBD, 0x8A, 0xEF, 0xBD, 0x8B, 0xEF, + 0xBD, 0x8C, 0xEF, 0xBD, 0x8D, 0xEF, 0xBD, 0x8E, + 0xEF, 0xBD, 0x8F, 0xEF, 0xBD, 0x90, 0xEF, 0xBD, + 0x91, 0xEF, 0xBD, 0x92, 0xEF, 0xBD, 0x93, 0xEF, + 0xBD, 0x94, 0xEF, 0xBD, 0x95, 0xEF, 0xBD, 0x96, + 0xEF, 0xBD, 0x97, 0xEF, 0xBD, 0x98, 0xEF, 0xBD, + 0x99, 0xEF, 0xBD, 0x9A, 0xF0, 0x90, 0x90, 0xA8, + 0xF0, 0x90, 0x90, 0xA9, 0xF0, 0x90, 0x90, 0xAA, + 0xF0, 0x90, 0x90, 0xAB, 0xF0, 0x90, 0x90, 0xAC, + 0xF0, 0x90, 0x90, 0xAD, 0xF0, 0x90, 0x90, 0xAE, + 0xF0, 0x90, 0x90, 0xAF, 0xF0, 0x90, 0x90, 0xB0, + 0xF0, 0x90, 0x90, 0xB1, 0xF0, 0x90, 0x90, 0xB2, + 0xF0, 0x90, 0x90, 0xB3, 0xF0, 0x90, 0x90, 0xB4, + 0xF0, 0x90, 0x90, 0xB5, 0xF0, 0x90, 0x90, 0xB6, + 0xF0, 0x90, 0x90, 0xB7, 0xF0, 0x90, 0x90, 0xB8, + 0xF0, 0x90, 0x90, 0xB9, 0xF0, 0x90, 0x90, 0xBA, + 0xF0, 0x90, 0x90, 0xBB, 0xF0, 0x90, 0x90, 0xBC, + 0xF0, 0x90, 0x90, 0xBD, 0xF0, 0x90, 0x90, 0xBE, + 0xF0, 0x90, 0x90, 0xBF, 0xF0, 0x90, 0x91, 0x80, + 0xF0, 0x90, 0x91, 0x81, 0xF0, 0x90, 0x91, 0x82, + 0xF0, 0x90, 0x91, 0x83, 0xF0, 0x90, 0x91, 0x84, + 0xF0, 0x90, 0x91, 0x85, 0xF0, 0x90, 0x91, 0x86, + 0xF0, 0x90, 0x91, 0x87, 0xF0, 0x90, 0x91, 0x88, + 0xF0, 0x90, 0x91, 0x89, 0xF0, 0x90, 0x91, 0x8A, + 0xF0, 0x90, 0x91, 0x8B, 0xF0, 0x90, 0x91, 0x8C, + 0xF0, 0x90, 0x91, 0x8D, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 
0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, + }, + { + 0xC3, 0xA0, 0xC3, 0xA1, 0xC3, 0xA2, 0xC3, 0xA3, + 0xC3, 0xA4, 0xC3, 0xA5, 0xC3, 0xA6, 0xC3, 0xA7, + 0xC3, 0xA8, 0xC3, 0xA9, 0xC3, 0xAA, 0xC3, 0xAB, + 0xC3, 0xAC, 0xC3, 0xAD, 0xC3, 0xAE, 0xC3, 0xAF, + 0xC3, 0xB0, 0xC3, 0xB1, 0xC3, 0xB2, 0xC3, 0xB3, + 0xC3, 0xB4, 0xC3, 0xB5, 0xC3, 0xB6, 0xC3, 0xB8, + 0xC3, 0xB9, 0xC3, 0xBA, 0xC3, 0xBB, 0xC3, 0xBC, + 0xC3, 0xBD, 0xC3, 0xBE, 0xC4, 0x81, 0xC4, 0x83, + 0xC4, 0x85, 0xC4, 0x87, 0xC4, 0x89, 0xC4, 0x8B, + 0xC4, 0x8D, 0xC4, 0x8F, 0xC4, 0x91, 0xC4, 0x93, + 0xC4, 0x95, 0xC4, 0x97, 0xC4, 0x99, 0xC4, 0x9B, + 0xC4, 0x9D, 0xC4, 0x9F, 0xC4, 0xA1, 0xC4, 0xA3, + 0xC4, 0xA5, 0xC4, 0xA7, 0xC4, 0xA9, 0xC4, 0xAB, + 0xC4, 0xAD, 0xC4, 0xAF, 0x69, 0xC4, 0xB3, 0xC4, + 0xB5, 0xC4, 0xB7, 0xC4, 0xBA, 0xC4, 0xBC, 0xC4, + 0xBE, 0xC5, 0x80, 0xC5, 0x82, 0xC5, 0x84, 0xC5, + 0x86, 0xC5, 0x88, 0xC5, 0x8B, 0xC5, 0x8D, 0xC5, + 0x8F, 0xC5, 0x91, 0xC5, 0x93, 0xC5, 0x95, 0xC5, + 0x97, 0xC5, 0x99, 0xC5, 0x9B, 0xC5, 0x9D, 0xC5, + 0x9F, 0xC5, 0xA1, 0xC5, 0xA3, 0xC5, 0xA5, 0xC5, + 0xA7, 0xC5, 0xA9, 0xC5, 0xAB, 0xC5, 0xAD, 0xC5, + 0xAF, 0xC5, 0xB1, 0xC5, 0xB3, 0xC5, 0xB5, 0xC5, + 0xB7, 0xC3, 0xBF, 0xC5, 0xBA, 0xC5, 0xBC, 0xC5, + 0xBE, 0xC9, 0x93, 0xC6, 0x83, 0xC6, 0x85, 0xC9, + 0x94, 0xC6, 0x88, 0xC9, 0x96, 0xC9, 0x97, 0xC6, + 0x8C, 0xC7, 0x9D, 0xC9, 0x99, 0xC9, 0x9B, 0xC6, + 0x92, 0xC9, 0xA0, 0xC9, 0xA3, 0xC9, 0xA9, 0xC9, + 0xA8, 0xC6, 0x99, 0xC9, 0xAF, 0xC9, 0xB2, 0xC9, + 0xB5, 0xC6, 0xA1, 0xC6, 0xA3, 0xC6, 0xA5, 0xCA, + 0x80, 0xC6, 0xA8, 0xCA, 0x83, 0xC6, 0xAD, 0xCA, + 0x88, 0xC6, 0xB0, 0xCA, 0x8A, 0xCA, 0x8B, 0xC6, + 0xB4, 0xC6, 0xB6, 0xCA, 0x92, 0xC6, 0xB9, 0xC6, + 0xBD, 0xC7, 0x86, 0xC7, 0x86, 0xC7, 0x89, 0xC7, + 0x89, 0xC7, 0x8C, 0xC7, 0x8C, 0xC7, 0x8E, 0xC7, + 0x90, 0xC7, 0x92, 0xC7, 0x94, 0xC7, 0x96, 0xC7, + 0x98, 0xC7, 0x9A, 0xC7, 0x9C, 0xC7, 0x9F, 0xC7, + 0xA1, 0xC7, 0xA3, 0xC7, 0xA5, 0xC7, 0xA7, 0xC7, + 0xA9, 0xC7, 0xAB, 0xC7, 0xAD, 0xC7, 0xAF, 0xC7, + 0xB3, 0xC7, 0xB3, 0xC7, 0xB5, 0xC6, 0x95, 0xC6, + 0xBF, 0xC7, 0xB9, 0xC7, 0xBB, 0xC7, 0xBD, 0xC7, + 0xBF, 0xC8, 0x81, 0xC8, 0x83, 0xC8, 0x85, 0xC8, + 0x87, 0xC8, 0x89, 0xC8, 0x8B, 0xC8, 0x8D, 0xC8, + 0x8F, 0xC8, 0x91, 0xC8, 0x93, 0xC8, 0x95, 0xC8, + 0x97, 0xC8, 0x99, 0xC8, 0x9B, 0xC8, 0x9D, 0xC8, + 0x9F, 0xC6, 0x9E, 0xC8, 0xA3, 0xC8, 0xA5, 0xC8, + 0xA7, 0xC8, 0xA9, 0xC8, 0xAB, 0xC8, 0xAD, 0xC8, + 0xAF, 0xC8, 0xB1, 0xC8, 0xB3, 0xE2, 0xB1, 0xA5, + 0xC8, 0xBC, 0xC6, 0x9A, 0xE2, 0xB1, 0xA6, 0xC9, + 0x82, 0xC6, 0x80, 0xCA, 0x89, 0xCA, 0x8C, 0xC9, + 0x87, 0xC9, 0x89, 0xC9, 0x8B, 0xC9, 0x8D, 0xC9, + 0x8F, 0xCE, 0xAC, 0xCE, 0xAD, 0xCE, 0xAE, 0xCE, + 0xAF, 0xCF, 0x8C, 0xCF, 0x8D, 0xCF, 0x8E, 0xCE, + 0xB1, 0xCE, 0xB2, 0xCE, 0xB3, 0xCE, 0xB4, 0xCE, + 0xB5, 0xCE, 0xB6, 0xCE, 0xB7, 0xCE, 0xB8, 0xCE, + 0xB9, 0xCE, 0xBA, 0xCE, 0xBB, 0xCE, 0xBC, 0xCE, + 
0xBD, 0xCE, 0xBE, 0xCE, 0xBF, 0xCF, 0x80, 0xCF, + 0x81, 0xCF, 0x83, 0xCF, 0x84, 0xCF, 0x85, 0xCF, + 0x86, 0xCF, 0x87, 0xCF, 0x88, 0xCF, 0x89, 0xCF, + 0x8A, 0xCF, 0x8B, 0xCF, 0x99, 0xCF, 0x9B, 0xCF, + 0x9D, 0xCF, 0x9F, 0xCF, 0xA1, 0xCF, 0xA3, 0xCF, + 0xA5, 0xCF, 0xA7, 0xCF, 0xA9, 0xCF, 0xAB, 0xCF, + 0xAD, 0xCF, 0xAF, 0xCE, 0xB8, 0xCF, 0xB8, 0xCF, + 0xB2, 0xCF, 0xBB, 0xCD, 0xBB, 0xCD, 0xBC, 0xCD, + 0xBD, 0xD1, 0x90, 0xD1, 0x91, 0xD1, 0x92, 0xD1, + 0x93, 0xD1, 0x94, 0xD1, 0x95, 0xD1, 0x96, 0xD1, + 0x97, 0xD1, 0x98, 0xD1, 0x99, 0xD1, 0x9A, 0xD1, + 0x9B, 0xD1, 0x9C, 0xD1, 0x9D, 0xD1, 0x9E, 0xD1, + 0x9F, 0xD0, 0xB0, 0xD0, 0xB1, 0xD0, 0xB2, 0xD0, + 0xB3, 0xD0, 0xB4, 0xD0, 0xB5, 0xD0, 0xB6, 0xD0, + 0xB7, 0xD0, 0xB8, 0xD0, 0xB9, 0xD0, 0xBA, 0xD0, + 0xBB, 0xD0, 0xBC, 0xD0, 0xBD, 0xD0, 0xBE, 0xD0, + 0xBF, 0xD1, 0x80, 0xD1, 0x81, 0xD1, 0x82, 0xD1, + 0x83, 0xD1, 0x84, 0xD1, 0x85, 0xD1, 0x86, 0xD1, + 0x87, 0xD1, 0x88, 0xD1, 0x89, 0xD1, 0x8A, 0xD1, + 0x8B, 0xD1, 0x8C, 0xD1, 0x8D, 0xD1, 0x8E, 0xD1, + 0x8F, 0xD1, 0xA1, 0xD1, 0xA3, 0xD1, 0xA5, 0xD1, + 0xA7, 0xD1, 0xA9, 0xD1, 0xAB, 0xD1, 0xAD, 0xD1, + 0xAF, 0xD1, 0xB1, 0xD1, 0xB3, 0xD1, 0xB5, 0xD1, + 0xB7, 0xD1, 0xB9, 0xD1, 0xBB, 0xD1, 0xBD, 0xD1, + 0xBF, 0xD2, 0x81, 0xD2, 0x8B, 0xD2, 0x8D, 0xD2, + 0x8F, 0xD2, 0x91, 0xD2, 0x93, 0xD2, 0x95, 0xD2, + 0x97, 0xD2, 0x99, 0xD2, 0x9B, 0xD2, 0x9D, 0xD2, + 0x9F, 0xD2, 0xA1, 0xD2, 0xA3, 0xD2, 0xA5, 0xD2, + 0xA7, 0xD2, 0xA9, 0xD2, 0xAB, 0xD2, 0xAD, 0xD2, + 0xAF, 0xD2, 0xB1, 0xD2, 0xB3, 0xD2, 0xB5, 0xD2, + 0xB7, 0xD2, 0xB9, 0xD2, 0xBB, 0xD2, 0xBD, 0xD2, + 0xBF, 0xD3, 0x8F, 0xD3, 0x82, 0xD3, 0x84, 0xD3, + 0x86, 0xD3, 0x88, 0xD3, 0x8A, 0xD3, 0x8C, 0xD3, + 0x8E, 0xD3, 0x91, 0xD3, 0x93, 0xD3, 0x95, 0xD3, + 0x97, 0xD3, 0x99, 0xD3, 0x9B, 0xD3, 0x9D, 0xD3, + 0x9F, 0xD3, 0xA1, 0xD3, 0xA3, 0xD3, 0xA5, 0xD3, + 0xA7, 0xD3, 0xA9, 0xD3, 0xAB, 0xD3, 0xAD, 0xD3, + 0xAF, 0xD3, 0xB1, 0xD3, 0xB3, 0xD3, 0xB5, 0xD3, + 0xB7, 0xD3, 0xB9, 0xD3, 0xBB, 0xD3, 0xBD, 0xD3, + 0xBF, 0xD4, 0x81, 0xD4, 0x83, 0xD4, 0x85, 0xD4, + 0x87, 0xD4, 0x89, 0xD4, 0x8B, 0xD4, 0x8D, 0xD4, + 0x8F, 0xD4, 0x91, 0xD4, 0x93, 0xD5, 0xA1, 0xD5, + 0xA2, 0xD5, 0xA3, 0xD5, 0xA4, 0xD5, 0xA5, 0xD5, + 0xA6, 0xD5, 0xA7, 0xD5, 0xA8, 0xD5, 0xA9, 0xD5, + 0xAA, 0xD5, 0xAB, 0xD5, 0xAC, 0xD5, 0xAD, 0xD5, + 0xAE, 0xD5, 0xAF, 0xD5, 0xB0, 0xD5, 0xB1, 0xD5, + 0xB2, 0xD5, 0xB3, 0xD5, 0xB4, 0xD5, 0xB5, 0xD5, + 0xB6, 0xD5, 0xB7, 0xD5, 0xB8, 0xD5, 0xB9, 0xD5, + 0xBA, 0xD5, 0xBB, 0xD5, 0xBC, 0xD5, 0xBD, 0xD5, + 0xBE, 0xD5, 0xBF, 0xD6, 0x80, 0xD6, 0x81, 0xD6, + 0x82, 0xD6, 0x83, 0xD6, 0x84, 0xD6, 0x85, 0xD6, + 0x86, 0xE2, 0xB4, 0x80, 0xE2, 0xB4, 0x81, 0xE2, + 0xB4, 0x82, 0xE2, 0xB4, 0x83, 0xE2, 0xB4, 0x84, + 0xE2, 0xB4, 0x85, 0xE2, 0xB4, 0x86, 0xE2, 0xB4, + 0x87, 0xE2, 0xB4, 0x88, 0xE2, 0xB4, 0x89, 0xE2, + 0xB4, 0x8A, 0xE2, 0xB4, 0x8B, 0xE2, 0xB4, 0x8C, + 0xE2, 0xB4, 0x8D, 0xE2, 0xB4, 0x8E, 0xE2, 0xB4, + 0x8F, 0xE2, 0xB4, 0x90, 0xE2, 0xB4, 0x91, 0xE2, + 0xB4, 0x92, 0xE2, 0xB4, 0x93, 0xE2, 0xB4, 0x94, + 0xE2, 0xB4, 0x95, 0xE2, 0xB4, 0x96, 0xE2, 0xB4, + 0x97, 0xE2, 0xB4, 0x98, 0xE2, 0xB4, 0x99, 0xE2, + 0xB4, 0x9A, 0xE2, 0xB4, 0x9B, 0xE2, 0xB4, 0x9C, + 0xE2, 0xB4, 0x9D, 0xE2, 0xB4, 0x9E, 0xE2, 0xB4, + 0x9F, 0xE2, 0xB4, 0xA0, 0xE2, 0xB4, 0xA1, 0xE2, + 0xB4, 0xA2, 0xE2, 0xB4, 0xA3, 0xE2, 0xB4, 0xA4, + 0xE2, 0xB4, 0xA5, 0xE1, 0xB8, 0x81, 0xE1, 0xB8, + 0x83, 0xE1, 0xB8, 0x85, 0xE1, 0xB8, 0x87, 0xE1, + 0xB8, 0x89, 0xE1, 0xB8, 0x8B, 0xE1, 0xB8, 0x8D, + 0xE1, 0xB8, 0x8F, 0xE1, 0xB8, 0x91, 0xE1, 0xB8, + 0x93, 0xE1, 0xB8, 0x95, 0xE1, 0xB8, 0x97, 0xE1, + 0xB8, 0x99, 0xE1, 0xB8, 0x9B, 0xE1, 0xB8, 0x9D, + 
0xE1, 0xB8, 0x9F, 0xE1, 0xB8, 0xA1, 0xE1, 0xB8, + 0xA3, 0xE1, 0xB8, 0xA5, 0xE1, 0xB8, 0xA7, 0xE1, + 0xB8, 0xA9, 0xE1, 0xB8, 0xAB, 0xE1, 0xB8, 0xAD, + 0xE1, 0xB8, 0xAF, 0xE1, 0xB8, 0xB1, 0xE1, 0xB8, + 0xB3, 0xE1, 0xB8, 0xB5, 0xE1, 0xB8, 0xB7, 0xE1, + 0xB8, 0xB9, 0xE1, 0xB8, 0xBB, 0xE1, 0xB8, 0xBD, + 0xE1, 0xB8, 0xBF, 0xE1, 0xB9, 0x81, 0xE1, 0xB9, + 0x83, 0xE1, 0xB9, 0x85, 0xE1, 0xB9, 0x87, 0xE1, + 0xB9, 0x89, 0xE1, 0xB9, 0x8B, 0xE1, 0xB9, 0x8D, + 0xE1, 0xB9, 0x8F, 0xE1, 0xB9, 0x91, 0xE1, 0xB9, + 0x93, 0xE1, 0xB9, 0x95, 0xE1, 0xB9, 0x97, 0xE1, + 0xB9, 0x99, 0xE1, 0xB9, 0x9B, 0xE1, 0xB9, 0x9D, + 0xE1, 0xB9, 0x9F, 0xE1, 0xB9, 0xA1, 0xE1, 0xB9, + 0xA3, 0xE1, 0xB9, 0xA5, 0xE1, 0xB9, 0xA7, 0xE1, + 0xB9, 0xA9, 0xE1, 0xB9, 0xAB, 0xE1, 0xB9, 0xAD, + 0xE1, 0xB9, 0xAF, 0xE1, 0xB9, 0xB1, 0xE1, 0xB9, + 0xB3, 0xE1, 0xB9, 0xB5, 0xE1, 0xB9, 0xB7, 0xE1, + 0xB9, 0xB9, 0xE1, 0xB9, 0xBB, 0xE1, 0xB9, 0xBD, + 0xE1, 0xB9, 0xBF, 0xE1, 0xBA, 0x81, 0xE1, 0xBA, + 0x83, 0xE1, 0xBA, 0x85, 0xE1, 0xBA, 0x87, 0xE1, + 0xBA, 0x89, 0xE1, 0xBA, 0x8B, 0xE1, 0xBA, 0x8D, + 0xE1, 0xBA, 0x8F, 0xE1, 0xBA, 0x91, 0xE1, 0xBA, + 0x93, 0xE1, 0xBA, 0x95, 0xE1, 0xBA, 0xA1, 0xE1, + 0xBA, 0xA3, 0xE1, 0xBA, 0xA5, 0xE1, 0xBA, 0xA7, + 0xE1, 0xBA, 0xA9, 0xE1, 0xBA, 0xAB, 0xE1, 0xBA, + 0xAD, 0xE1, 0xBA, 0xAF, 0xE1, 0xBA, 0xB1, 0xE1, + 0xBA, 0xB3, 0xE1, 0xBA, 0xB5, 0xE1, 0xBA, 0xB7, + 0xE1, 0xBA, 0xB9, 0xE1, 0xBA, 0xBB, 0xE1, 0xBA, + 0xBD, 0xE1, 0xBA, 0xBF, 0xE1, 0xBB, 0x81, 0xE1, + 0xBB, 0x83, 0xE1, 0xBB, 0x85, 0xE1, 0xBB, 0x87, + 0xE1, 0xBB, 0x89, 0xE1, 0xBB, 0x8B, 0xE1, 0xBB, + 0x8D, 0xE1, 0xBB, 0x8F, 0xE1, 0xBB, 0x91, 0xE1, + 0xBB, 0x93, 0xE1, 0xBB, 0x95, 0xE1, 0xBB, 0x97, + 0xE1, 0xBB, 0x99, 0xE1, 0xBB, 0x9B, 0xE1, 0xBB, + 0x9D, 0xE1, 0xBB, 0x9F, 0xE1, 0xBB, 0xA1, 0xE1, + 0xBB, 0xA3, 0xE1, 0xBB, 0xA5, 0xE1, 0xBB, 0xA7, + 0xE1, 0xBB, 0xA9, 0xE1, 0xBB, 0xAB, 0xE1, 0xBB, + 0xAD, 0xE1, 0xBB, 0xAF, 0xE1, 0xBB, 0xB1, 0xE1, + 0xBB, 0xB3, 0xE1, 0xBB, 0xB5, 0xE1, 0xBB, 0xB7, + 0xE1, 0xBB, 0xB9, 0xE1, 0xBC, 0x80, 0xE1, 0xBC, + 0x81, 0xE1, 0xBC, 0x82, 0xE1, 0xBC, 0x83, 0xE1, + 0xBC, 0x84, 0xE1, 0xBC, 0x85, 0xE1, 0xBC, 0x86, + 0xE1, 0xBC, 0x87, 0xE1, 0xBC, 0x90, 0xE1, 0xBC, + 0x91, 0xE1, 0xBC, 0x92, 0xE1, 0xBC, 0x93, 0xE1, + 0xBC, 0x94, 0xE1, 0xBC, 0x95, 0xE1, 0xBC, 0xA0, + 0xE1, 0xBC, 0xA1, 0xE1, 0xBC, 0xA2, 0xE1, 0xBC, + 0xA3, 0xE1, 0xBC, 0xA4, 0xE1, 0xBC, 0xA5, 0xE1, + 0xBC, 0xA6, 0xE1, 0xBC, 0xA7, 0xE1, 0xBC, 0xB0, + 0xE1, 0xBC, 0xB1, 0xE1, 0xBC, 0xB2, 0xE1, 0xBC, + 0xB3, 0xE1, 0xBC, 0xB4, 0xE1, 0xBC, 0xB5, 0xE1, + 0xBC, 0xB6, 0xE1, 0xBC, 0xB7, 0xE1, 0xBD, 0x80, + 0xE1, 0xBD, 0x81, 0xE1, 0xBD, 0x82, 0xE1, 0xBD, + 0x83, 0xE1, 0xBD, 0x84, 0xE1, 0xBD, 0x85, 0xE1, + 0xBD, 0x91, 0xE1, 0xBD, 0x93, 0xE1, 0xBD, 0x95, + 0xE1, 0xBD, 0x97, 0xE1, 0xBD, 0xA0, 0xE1, 0xBD, + 0xA1, 0xE1, 0xBD, 0xA2, 0xE1, 0xBD, 0xA3, 0xE1, + 0xBD, 0xA4, 0xE1, 0xBD, 0xA5, 0xE1, 0xBD, 0xA6, + 0xE1, 0xBD, 0xA7, 0xE1, 0xBE, 0x80, 0xE1, 0xBE, + 0x81, 0xE1, 0xBE, 0x82, 0xE1, 0xBE, 0x83, 0xE1, + 0xBE, 0x84, 0xE1, 0xBE, 0x85, 0xE1, 0xBE, 0x86, + 0xE1, 0xBE, 0x87, 0xE1, 0xBE, 0x90, 0xE1, 0xBE, + 0x91, 0xE1, 0xBE, 0x92, 0xE1, 0xBE, 0x93, 0xE1, + 0xBE, 0x94, 0xE1, 0xBE, 0x95, 0xE1, 0xBE, 0x96, + 0xE1, 0xBE, 0x97, 0xE1, 0xBE, 0xA0, 0xE1, 0xBE, + 0xA1, 0xE1, 0xBE, 0xA2, 0xE1, 0xBE, 0xA3, 0xE1, + 0xBE, 0xA4, 0xE1, 0xBE, 0xA5, 0xE1, 0xBE, 0xA6, + 0xE1, 0xBE, 0xA7, 0xE1, 0xBE, 0xB0, 0xE1, 0xBE, + 0xB1, 0xE1, 0xBD, 0xB0, 0xE1, 0xBD, 0xB1, 0xE1, + 0xBE, 0xB3, 0xE1, 0xBD, 0xB2, 0xE1, 0xBD, 0xB3, + 0xE1, 0xBD, 0xB4, 0xE1, 0xBD, 0xB5, 0xE1, 0xBF, + 0x83, 0xE1, 0xBF, 0x90, 0xE1, 0xBF, 0x91, 0xE1, + 
0xBD, 0xB6, 0xE1, 0xBD, 0xB7, 0xE1, 0xBF, 0xA0, + 0xE1, 0xBF, 0xA1, 0xE1, 0xBD, 0xBA, 0xE1, 0xBD, + 0xBB, 0xE1, 0xBF, 0xA5, 0xE1, 0xBD, 0xB8, 0xE1, + 0xBD, 0xB9, 0xE1, 0xBD, 0xBC, 0xE1, 0xBD, 0xBD, + 0xE1, 0xBF, 0xB3, 0xCF, 0x89, 0x6B, 0xC3, 0xA5, + 0xE2, 0x85, 0x8E, 0xE2, 0x85, 0xB0, 0xE2, 0x85, + 0xB1, 0xE2, 0x85, 0xB2, 0xE2, 0x85, 0xB3, 0xE2, + 0x85, 0xB4, 0xE2, 0x85, 0xB5, 0xE2, 0x85, 0xB6, + 0xE2, 0x85, 0xB7, 0xE2, 0x85, 0xB8, 0xE2, 0x85, + 0xB9, 0xE2, 0x85, 0xBA, 0xE2, 0x85, 0xBB, 0xE2, + 0x85, 0xBC, 0xE2, 0x85, 0xBD, 0xE2, 0x85, 0xBE, + 0xE2, 0x85, 0xBF, 0xE2, 0x86, 0x84, 0xE2, 0x93, + 0x90, 0xE2, 0x93, 0x91, 0xE2, 0x93, 0x92, 0xE2, + 0x93, 0x93, 0xE2, 0x93, 0x94, 0xE2, 0x93, 0x95, + 0xE2, 0x93, 0x96, 0xE2, 0x93, 0x97, 0xE2, 0x93, + 0x98, 0xE2, 0x93, 0x99, 0xE2, 0x93, 0x9A, 0xE2, + 0x93, 0x9B, 0xE2, 0x93, 0x9C, 0xE2, 0x93, 0x9D, + 0xE2, 0x93, 0x9E, 0xE2, 0x93, 0x9F, 0xE2, 0x93, + 0xA0, 0xE2, 0x93, 0xA1, 0xE2, 0x93, 0xA2, 0xE2, + 0x93, 0xA3, 0xE2, 0x93, 0xA4, 0xE2, 0x93, 0xA5, + 0xE2, 0x93, 0xA6, 0xE2, 0x93, 0xA7, 0xE2, 0x93, + 0xA8, 0xE2, 0x93, 0xA9, 0xE2, 0xB0, 0xB0, 0xE2, + 0xB0, 0xB1, 0xE2, 0xB0, 0xB2, 0xE2, 0xB0, 0xB3, + 0xE2, 0xB0, 0xB4, 0xE2, 0xB0, 0xB5, 0xE2, 0xB0, + 0xB6, 0xE2, 0xB0, 0xB7, 0xE2, 0xB0, 0xB8, 0xE2, + 0xB0, 0xB9, 0xE2, 0xB0, 0xBA, 0xE2, 0xB0, 0xBB, + 0xE2, 0xB0, 0xBC, 0xE2, 0xB0, 0xBD, 0xE2, 0xB0, + 0xBE, 0xE2, 0xB0, 0xBF, 0xE2, 0xB1, 0x80, 0xE2, + 0xB1, 0x81, 0xE2, 0xB1, 0x82, 0xE2, 0xB1, 0x83, + 0xE2, 0xB1, 0x84, 0xE2, 0xB1, 0x85, 0xE2, 0xB1, + 0x86, 0xE2, 0xB1, 0x87, 0xE2, 0xB1, 0x88, 0xE2, + 0xB1, 0x89, 0xE2, 0xB1, 0x8A, 0xE2, 0xB1, 0x8B, + 0xE2, 0xB1, 0x8C, 0xE2, 0xB1, 0x8D, 0xE2, 0xB1, + 0x8E, 0xE2, 0xB1, 0x8F, 0xE2, 0xB1, 0x90, 0xE2, + 0xB1, 0x91, 0xE2, 0xB1, 0x92, 0xE2, 0xB1, 0x93, + 0xE2, 0xB1, 0x94, 0xE2, 0xB1, 0x95, 0xE2, 0xB1, + 0x96, 0xE2, 0xB1, 0x97, 0xE2, 0xB1, 0x98, 0xE2, + 0xB1, 0x99, 0xE2, 0xB1, 0x9A, 0xE2, 0xB1, 0x9B, + 0xE2, 0xB1, 0x9C, 0xE2, 0xB1, 0x9D, 0xE2, 0xB1, + 0x9E, 0xE2, 0xB1, 0xA1, 0xC9, 0xAB, 0xE1, 0xB5, + 0xBD, 0xC9, 0xBD, 0xE2, 0xB1, 0xA8, 0xE2, 0xB1, + 0xAA, 0xE2, 0xB1, 0xAC, 0xE2, 0xB1, 0xB6, 0xE2, + 0xB2, 0x81, 0xE2, 0xB2, 0x83, 0xE2, 0xB2, 0x85, + 0xE2, 0xB2, 0x87, 0xE2, 0xB2, 0x89, 0xE2, 0xB2, + 0x8B, 0xE2, 0xB2, 0x8D, 0xE2, 0xB2, 0x8F, 0xE2, + 0xB2, 0x91, 0xE2, 0xB2, 0x93, 0xE2, 0xB2, 0x95, + 0xE2, 0xB2, 0x97, 0xE2, 0xB2, 0x99, 0xE2, 0xB2, + 0x9B, 0xE2, 0xB2, 0x9D, 0xE2, 0xB2, 0x9F, 0xE2, + 0xB2, 0xA1, 0xE2, 0xB2, 0xA3, 0xE2, 0xB2, 0xA5, + 0xE2, 0xB2, 0xA7, 0xE2, 0xB2, 0xA9, 0xE2, 0xB2, + 0xAB, 0xE2, 0xB2, 0xAD, 0xE2, 0xB2, 0xAF, 0xE2, + 0xB2, 0xB1, 0xE2, 0xB2, 0xB3, 0xE2, 0xB2, 0xB5, + 0xE2, 0xB2, 0xB7, 0xE2, 0xB2, 0xB9, 0xE2, 0xB2, + 0xBB, 0xE2, 0xB2, 0xBD, 0xE2, 0xB2, 0xBF, 0xE2, + 0xB3, 0x81, 0xE2, 0xB3, 0x83, 0xE2, 0xB3, 0x85, + 0xE2, 0xB3, 0x87, 0xE2, 0xB3, 0x89, 0xE2, 0xB3, + 0x8B, 0xE2, 0xB3, 0x8D, 0xE2, 0xB3, 0x8F, 0xE2, + 0xB3, 0x91, 0xE2, 0xB3, 0x93, 0xE2, 0xB3, 0x95, + 0xE2, 0xB3, 0x97, 0xE2, 0xB3, 0x99, 0xE2, 0xB3, + 0x9B, 0xE2, 0xB3, 0x9D, 0xE2, 0xB3, 0x9F, 0xE2, + 0xB3, 0xA1, 0xE2, 0xB3, 0xA3, 0xEF, 0xBD, 0x81, + 0xEF, 0xBD, 0x82, 0xEF, 0xBD, 0x83, 0xEF, 0xBD, + 0x84, 0xEF, 0xBD, 0x85, 0xEF, 0xBD, 0x86, 0xEF, + 0xBD, 0x87, 0xEF, 0xBD, 0x88, 0xEF, 0xBD, 0x89, + 0xEF, 0xBD, 0x8A, 0xEF, 0xBD, 0x8B, 0xEF, 0xBD, + 0x8C, 0xEF, 0xBD, 0x8D, 0xEF, 0xBD, 0x8E, 0xEF, + 0xBD, 0x8F, 0xEF, 0xBD, 0x90, 0xEF, 0xBD, 0x91, + 0xEF, 0xBD, 0x92, 0xEF, 0xBD, 0x93, 0xEF, 0xBD, + 0x94, 0xEF, 0xBD, 0x95, 0xEF, 0xBD, 0x96, 0xEF, + 0xBD, 0x97, 0xEF, 0xBD, 0x98, 0xEF, 0xBD, 0x99, + 0xEF, 0xBD, 0x9A, 0xF0, 0x90, 0x90, 0xA8, 0xF0, + 
0x90, 0x90, 0xA9, 0xF0, 0x90, 0x90, 0xAA, 0xF0, + 0x90, 0x90, 0xAB, 0xF0, 0x90, 0x90, 0xAC, 0xF0, + 0x90, 0x90, 0xAD, 0xF0, 0x90, 0x90, 0xAE, 0xF0, + 0x90, 0x90, 0xAF, 0xF0, 0x90, 0x90, 0xB0, 0xF0, + 0x90, 0x90, 0xB1, 0xF0, 0x90, 0x90, 0xB2, 0xF0, + 0x90, 0x90, 0xB3, 0xF0, 0x90, 0x90, 0xB4, 0xF0, + 0x90, 0x90, 0xB5, 0xF0, 0x90, 0x90, 0xB6, 0xF0, + 0x90, 0x90, 0xB7, 0xF0, 0x90, 0x90, 0xB8, 0xF0, + 0x90, 0x90, 0xB9, 0xF0, 0x90, 0x90, 0xBA, 0xF0, + 0x90, 0x90, 0xBB, 0xF0, 0x90, 0x90, 0xBC, 0xF0, + 0x90, 0x90, 0xBD, 0xF0, 0x90, 0x90, 0xBE, 0xF0, + 0x90, 0x90, 0xBF, 0xF0, 0x90, 0x91, 0x80, 0xF0, + 0x90, 0x91, 0x81, 0xF0, 0x90, 0x91, 0x82, 0xF0, + 0x90, 0x91, 0x83, 0xF0, 0x90, 0x91, 0x84, 0xF0, + 0x90, 0x91, 0x85, 0xF0, 0x90, 0x91, 0x86, 0xF0, + 0x90, 0x91, 0x87, 0xF0, 0x90, 0x91, 0x88, 0xF0, + 0x90, 0x91, 0x89, 0xF0, 0x90, 0x91, 0x8A, 0xF0, + 0x90, 0x91, 0x8B, 0xF0, 0x90, 0x91, 0x8C, 0xF0, + 0x90, 0x91, 0x8D, 0xF0, 0x90, 0x91, 0x8E, 0xF0, + 0x90, 0x91, 0x8F, + }, +}; + +static const u8_displacement_t u8_toupper_b3_tbl[2][5][256] = { + { + { /* Third byte table 0. */ + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { 0, 0 }, { 1, 2 }, + { 2, 64 }, { 3, 125 }, { 4, 188 }, { 5, 226 }, + { 6, 288 }, { 7, 338 }, { 8, 364 }, { N_, 0 }, + { N_, 0 }, { 9, 376 }, { 10, 378 }, { 11, 416 }, + { 12, 486 }, { 13, 518 }, { 14, 614 }, { 15, 670 }, + { 16, 724 }, { 17, 
740 }, { 18, 802 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + }, + { /* Third byte table 1. */ + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { 19, 816 }, { 20, 912 }, { 21, 1008 }, { 22, 1092 }, + { 23, 1179 }, { 24, 1269 }, { 25, 1365 }, { 26, 1448 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + }, + { /* Third byte table 2. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { 27, 1469 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { 28, 1517 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + }, + { /* Third byte table 3. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { 29, 1595 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + }, + { /* Third byte table 4. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { 30, 1673 }, { 31, 1769 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + }, + }, + { + { /* Third byte table 0. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { 0, 0 }, { 1, 2 }, + { 2, 64 }, { 3, 125 }, { 4, 188 }, { 5, 230 }, + { 6, 292 }, { 7, 344 }, { 8, 388 }, { N_, 0 }, + { N_, 0 }, { 9, 404 }, { 10, 412 }, { 11, 450 }, + { 12, 524 }, { 13, 556 }, { 14, 652 }, { 15, 708 }, + { 16, 772 }, { 17, 792 }, { 18, 854 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + }, + { /* Third byte table 1. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { 19, 868 }, { N_, 0 }, { N_, 0 }, + { 20, 871 }, { 21, 967 }, { 22, 1063 }, { 23, 1147 }, + { 24, 1234 }, { 25, 1324 }, { 26, 1420 }, { 27, 1503 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + }, + { /* Third byte table 2. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { 28, 1524 }, { 29, 1575 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { 30, 1578 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { 31, 1656 }, { 32, 1704 }, { 33, 1816 }, { 34, 1912 }, + { 35, 1966 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + }, + { /* Third byte table 3. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { 36, 2080 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + }, + { /* Third byte table 4. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { 37, 2158 }, { 38, 2254 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + }, + }, +}; + +static const uchar_t u8_toupper_b4_tbl[2][39][257] = { + { + { /* Fourth byte table 0. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, + }, + { /* Fourth byte table 1. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30, + 32, 34, 36, 38, 40, 42, 44, 46, + 46, 48, 50, 52, 54, 56, 58, 60, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, + }, + { /* Fourth byte table 2. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 2, 2, 4, 4, 6, 6, + 8, 8, 10, 10, 12, 12, 14, 14, + 16, 16, 18, 18, 20, 20, 22, 22, + 24, 24, 26, 26, 28, 28, 30, 30, + 32, 32, 34, 34, 36, 36, 38, 38, + 40, 40, 42, 42, 44, 44, 46, 46, + 48, 48, 49, 49, 51, 51, 53, 53, + 55, 55, 55, 57, 57, 59, 59, 61, + 61, 61, 61, 61, 61, 61, 61, 61, + 61, 61, 61, 61, 61, 61, 61, 61, + 61, 61, 61, 61, 61, 61, 61, 61, + 61, 61, 61, 61, 61, 61, 61, 61, + 61, 61, 61, 61, 61, 61, 61, 61, + 61, 61, 61, 61, 61, 61, 61, 61, + 61, 61, 61, 61, 61, 61, 61, 61, + 61, 61, 61, 61, 61, 61, 61, 61, + 61, + }, + { /* Fourth byte table 3. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 2, 4, 4, 6, 6, 8, + 8, 10, 10, 10, 12, 12, 14, 14, + 16, 16, 18, 18, 20, 20, 22, 22, + 24, 24, 26, 26, 28, 28, 30, 30, + 32, 32, 34, 34, 36, 36, 38, 38, + 40, 40, 42, 42, 44, 44, 46, 46, + 48, 48, 50, 50, 52, 52, 54, 54, + 56, 56, 56, 58, 58, 60, 60, 62, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, + }, + { /* Fourth byte table 4. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 2, 2, 4, 4, + 4, 6, 6, 6, 6, 8, 8, 8, + 8, 8, 8, 10, 10, 10, 12, 12, + 12, 12, 14, 14, 14, 14, 14, 16, + 16, 16, 18, 18, 20, 20, 22, 22, + 22, 24, 24, 24, 24, 24, 26, 26, + 26, 28, 28, 28, 28, 30, 30, 32, + 32, 32, 34, 34, 34, 34, 36, 36, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, + }, + { /* Fourth byte table 5. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 2, 4, + 4, 6, 8, 8, 10, 12, 12, 14, + 14, 16, 16, 18, 18, 20, 20, 22, + 22, 24, 24, 26, 26, 28, 30, 30, + 32, 32, 34, 34, 36, 36, 38, 38, + 40, 40, 42, 42, 44, 44, 46, 46, + 48, 48, 48, 50, 52, 52, 54, 54, + 54, 54, 56, 56, 58, 58, 60, 60, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, + }, + { /* Fourth byte table 6. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 2, 2, 4, 4, 6, 6, + 8, 8, 10, 10, 12, 12, 14, 14, + 16, 16, 18, 18, 20, 20, 22, 22, + 24, 24, 26, 26, 28, 28, 30, 30, + 32, 32, 32, 32, 34, 34, 36, 36, + 38, 38, 40, 40, 42, 42, 44, 44, + 46, 46, 48, 48, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, + 50, + }, + { /* Fourth byte table 7. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 2, 4, 4, 6, + 8, 8, 10, 10, 12, 12, 12, 12, + 12, 14, 14, 14, 16, 16, 16, 16, + 16, 18, 20, 20, 20, 20, 20, 20, + 22, 22, 22, 24, 24, 24, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, + }, + { /* Fourth byte table 8. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 2, 2, 4, 4, 4, 4, + 4, 6, 6, 8, 10, 10, 10, 10, + 10, 10, 10, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, + }, + { /* Fourth byte table 9. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, + }, + { /* Fourth byte table 10. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 2, 4, 6, + 8, 8, 10, 12, 14, 16, 18, 20, + 22, 24, 26, 28, 30, 32, 34, 36, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, + }, + { /* Fourth byte table 11. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30, + 30, 32, 34, 34, 34, 34, 36, 38, + 38, 38, 40, 40, 42, 42, 44, 44, + 46, 46, 48, 48, 50, 50, 52, 52, + 54, 54, 56, 56, 58, 58, 60, 60, + 62, 64, 66, 68, 68, 68, 70, 70, + 70, 70, 70, 70, 70, 70, 70, 70, + 70, 70, 70, 70, 70, 70, 70, 70, + 70, 70, 70, 70, 70, 70, 70, 70, + 70, 70, 70, 70, 70, 70, 70, 70, + 70, 70, 70, 70, 70, 70, 70, 70, + 70, 70, 70, 70, 70, 70, 70, 70, + 70, 70, 70, 70, 70, 70, 70, 70, + 70, 70, 70, 70, 70, 70, 70, 70, + 70, 70, 70, 70, 70, 70, 70, 70, + 70, + }, + { /* Fourth byte table 12. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, + }, + { /* Fourth byte table 13. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30, + 32, 34, 36, 38, 40, 42, 44, 46, + 48, 50, 52, 54, 56, 58, 60, 62, + 64, 64, 66, 66, 68, 68, 70, 70, + 72, 72, 74, 74, 76, 76, 78, 78, + 80, 80, 82, 82, 84, 84, 86, 86, + 88, 88, 90, 90, 92, 92, 94, 94, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, + }, + { /* Fourth byte table 14. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 4, 4, 6, 6, + 8, 8, 10, 10, 12, 12, 14, 14, + 16, 16, 18, 18, 20, 20, 22, 22, + 24, 24, 26, 26, 28, 28, 30, 30, + 32, 32, 34, 34, 36, 36, 38, 38, + 40, 40, 42, 42, 44, 44, 46, 46, + 48, 48, 50, 50, 52, 52, 54, 54, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, + }, + { /* Fourth byte table 15. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 2, 2, 4, 4, 6, + 6, 8, 8, 10, 10, 12, 12, 14, + 14, 14, 16, 16, 18, 18, 20, 20, + 22, 22, 24, 24, 26, 26, 28, 28, + 30, 30, 32, 32, 34, 34, 36, 36, + 38, 38, 40, 40, 42, 42, 44, 44, + 46, 46, 48, 48, 50, 50, 52, 52, + 52, 52, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, + }, + { /* Fourth byte table 16. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 2, 2, 4, 4, 6, 6, + 8, 8, 10, 10, 12, 12, 14, 14, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, + }, + { /* Fourth byte table 17. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 2, 4, 6, 8, 10, 12, + 14, 16, 18, 20, 22, 24, 26, 28, + 30, 32, 34, 36, 38, 40, 42, 44, + 46, 48, 50, 52, 54, 56, 58, 60, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, + }, + { /* Fourth byte table 18. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 4, 6, 8, 10, 12, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, + }, + { /* Fourth byte table 19. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 3, 3, 6, 6, 9, 9, + 12, 12, 15, 15, 18, 18, 21, 21, + 24, 24, 27, 27, 30, 30, 33, 33, + 36, 36, 39, 39, 42, 42, 45, 45, + 48, 48, 51, 51, 54, 54, 57, 57, + 60, 60, 63, 63, 66, 66, 69, 69, + 72, 72, 75, 75, 78, 78, 81, 81, + 84, 84, 87, 87, 90, 90, 93, 93, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, + }, + { /* Fourth byte table 20. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 3, 3, 6, 6, 9, 9, + 12, 12, 15, 15, 18, 18, 21, 21, + 24, 24, 27, 27, 30, 30, 33, 33, + 36, 36, 39, 39, 42, 42, 45, 45, + 48, 48, 51, 51, 54, 54, 57, 57, + 60, 60, 63, 63, 66, 66, 69, 69, + 72, 72, 75, 75, 78, 78, 81, 81, + 84, 84, 87, 87, 90, 90, 93, 93, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, + }, + { /* Fourth byte table 21. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 3, 3, 6, 6, 9, 9, + 12, 12, 15, 15, 18, 18, 21, 21, + 24, 24, 27, 27, 30, 30, 33, 33, + 33, 33, 33, 33, 36, 36, 36, 36, + 36, 36, 39, 39, 42, 42, 45, 45, + 48, 48, 51, 51, 54, 54, 57, 57, + 60, 60, 63, 63, 66, 66, 69, 69, + 72, 72, 75, 75, 78, 78, 81, 81, + 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, + 84, + }, + { /* Fourth byte table 22. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 3, 3, 6, 6, 9, 9, + 12, 12, 15, 15, 18, 18, 21, 21, + 24, 24, 27, 27, 30, 30, 33, 33, + 36, 36, 39, 39, 42, 42, 45, 45, + 48, 48, 51, 51, 54, 54, 57, 57, + 60, 60, 63, 63, 66, 66, 69, 69, + 72, 72, 75, 75, 78, 78, 81, 81, + 84, 84, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, + }, + { /* Fourth byte table 23. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 21, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 27, 30, 33, 36, 39, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, + 42, 45, 48, 51, 54, 57, 60, 63, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 69, 72, 75, 78, 81, 84, 87, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, + }, + { /* Fourth byte table 24. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 21, 21, 24, 24, 27, 27, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 33, 36, 39, 42, 45, 48, 51, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 57, 60, 63, 66, 69, 72, 75, + 78, 81, 84, 87, 90, 93, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, + }, + { /* Fourth byte table 25. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 21, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 27, 30, 33, 36, 39, 42, 45, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 51, 54, 57, 60, 63, 66, 69, + 72, 72, 72, 72, 72, 72, 72, 72, + 72, 75, 78, 78, 81, 81, 81, 81, + 81, 81, 81, 81, 81, 81, 81, 83, + 83, 83, 83, 83, 83, 83, 83, 83, + 83, 83, 83, 83, 83, 83, 83, 83, + 83, 83, 83, 83, 83, 83, 83, 83, + 83, 83, 83, 83, 83, 83, 83, 83, + 83, 83, 83, 83, 83, 83, 83, 83, + 83, 83, 83, 83, 83, 83, 83, 83, + 83, 83, 83, 83, 83, 83, 83, 83, + 83, 83, 83, 83, 83, 83, 83, 83, + 83, + }, + { /* Fourth byte table 26. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 6, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 12, 15, 15, 15, 15, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, + }, + { /* Fourth byte table 27. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 21, + 24, 27, 30, 33, 36, 39, 42, 45, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, + }, + { /* Fourth byte table 28. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 21, + 24, 27, 30, 33, 36, 39, 42, 45, + 48, 51, 54, 57, 60, 63, 66, 69, + 72, 75, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, + }, + { /* Fourth byte table 29. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 3, 6, 9, 12, 15, 18, + 21, 24, 27, 30, 33, 36, 39, 42, + 45, 48, 51, 54, 57, 60, 63, 66, + 69, 72, 75, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, + }, + { /* Fourth byte table 30. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 52, 56, 60, + 64, 68, 72, 76, 80, 84, 88, 92, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, + }, + { /* Fourth byte table 31. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 52, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, + }, + { /* Fourth byte table 32. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, + }, + { /* Fourth byte table 33. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, + }, + { /* Fourth byte table 34. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, + }, + { /* Fourth byte table 35. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, + }, + { /* Fourth byte table 36. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, + }, + { /* Fourth byte table 37. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, + }, + { /* Fourth byte table 38. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, + }, + }, + { + { /* Fourth byte table 0. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, + }, + { /* Fourth byte table 1. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30, + 32, 34, 36, 38, 40, 42, 44, 46, + 46, 48, 50, 52, 54, 56, 58, 60, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, + }, + { /* Fourth byte table 2. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 2, 2, 4, 4, 6, 6, + 8, 8, 10, 10, 12, 12, 14, 14, + 16, 16, 18, 18, 20, 20, 22, 22, + 24, 24, 26, 26, 28, 28, 30, 30, + 32, 32, 34, 34, 36, 36, 38, 38, + 40, 40, 42, 42, 44, 44, 46, 46, + 48, 48, 49, 49, 51, 51, 53, 53, + 55, 55, 55, 57, 57, 59, 59, 61, + 61, 61, 61, 61, 61, 61, 61, 61, + 61, 61, 61, 61, 61, 61, 61, 61, + 61, 61, 61, 61, 61, 61, 61, 61, + 61, 61, 61, 61, 61, 61, 61, 61, + 61, 61, 61, 61, 61, 61, 61, 61, + 61, 61, 61, 61, 61, 61, 61, 61, + 61, 61, 61, 61, 61, 61, 61, 61, + 61, 61, 61, 61, 61, 61, 61, 61, + 61, + }, + { /* Fourth byte table 3. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 2, 4, 4, 6, 6, 8, + 8, 10, 10, 10, 12, 12, 14, 14, + 16, 16, 18, 18, 20, 20, 22, 22, + 24, 24, 26, 26, 28, 28, 30, 30, + 32, 32, 34, 34, 36, 36, 38, 38, + 40, 40, 42, 42, 44, 44, 46, 46, + 48, 48, 50, 50, 52, 52, 54, 54, + 56, 56, 56, 58, 58, 60, 60, 62, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, + }, + { /* Fourth byte table 4. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 2, 2, 4, 4, 6, 6, + 6, 8, 8, 8, 8, 10, 10, 10, + 10, 10, 10, 12, 12, 12, 14, 14, + 14, 14, 16, 18, 18, 18, 18, 20, + 20, 20, 22, 22, 24, 24, 26, 26, + 26, 28, 28, 28, 28, 28, 30, 30, + 30, 32, 32, 32, 32, 34, 34, 36, + 36, 36, 38, 38, 38, 38, 40, 40, + 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, + 42, + }, + { /* Fourth byte table 5. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 2, 4, + 4, 6, 8, 8, 10, 12, 12, 14, + 14, 16, 16, 18, 18, 20, 20, 22, + 22, 24, 24, 26, 26, 28, 30, 30, + 32, 32, 34, 34, 36, 36, 38, 38, + 40, 40, 42, 42, 44, 44, 46, 46, + 48, 48, 48, 50, 52, 52, 54, 54, + 54, 54, 56, 56, 58, 58, 60, 60, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, + }, + { /* Fourth byte table 6. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 2, 2, 4, 4, 6, 6, + 8, 8, 10, 10, 12, 12, 14, 14, + 16, 16, 18, 18, 20, 20, 22, 22, + 24, 24, 26, 26, 28, 28, 30, 30, + 32, 32, 32, 32, 34, 34, 36, 36, + 38, 38, 40, 40, 42, 42, 44, 44, + 46, 46, 48, 48, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 52, 52, 52, + 52, 52, 52, 52, 52, 52, 52, 52, + 52, 52, 52, 52, 52, 52, 52, 52, + 52, 52, 52, 52, 52, 52, 52, 52, + 52, 52, 52, 52, 52, 52, 52, 52, + 52, 52, 52, 52, 52, 52, 52, 52, + 52, 52, 52, 52, 52, 52, 52, 52, + 52, 52, 52, 52, 52, 52, 52, 52, + 52, 52, 52, 52, 52, 52, 52, 52, + 52, + }, + { /* Fourth byte table 7. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 2, 2, 2, 2, 2, + 4, 4, 6, 6, 8, 8, 10, 10, + 12, 12, 12, 12, 14, 16, 16, 18, + 20, 20, 22, 22, 24, 24, 24, 24, + 24, 26, 26, 26, 28, 28, 28, 28, + 28, 30, 32, 32, 35, 35, 35, 35, + 37, 37, 37, 39, 39, 39, 41, 41, + 41, 41, 41, 41, 41, 41, 44, 44, + 44, 44, 44, 44, 44, 44, 44, 44, + 44, 44, 44, 44, 44, 44, 44, 44, + 44, 44, 44, 44, 44, 44, 44, 44, + 44, 44, 44, 44, 44, 44, 44, 44, + 44, 44, 44, 44, 44, 44, 44, 44, + 44, 44, 44, 44, 44, 44, 44, 44, + 44, 44, 44, 44, 44, 44, 44, 44, + 44, 44, 44, 44, 44, 44, 44, 44, + 44, + }, + { /* Fourth byte table 8. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 2, 2, 4, 4, 4, 4, + 4, 6, 8, 10, 12, 14, 14, 14, + 14, 14, 14, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, + }, + { /* Fourth byte table 9. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 4, 6, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, + }, + { /* Fourth byte table 10. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 2, 4, 6, + 8, 8, 10, 12, 14, 16, 18, 20, + 22, 24, 26, 28, 30, 32, 34, 36, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, + }, + { /* Fourth byte table 11. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30, + 30, 32, 34, 34, 34, 34, 36, 38, + 38, 38, 40, 40, 42, 42, 44, 44, + 46, 46, 48, 48, 50, 50, 52, 52, + 54, 54, 56, 56, 58, 58, 60, 60, + 62, 64, 66, 68, 68, 68, 70, 70, + 70, 72, 72, 72, 74, 74, 74, 74, + 74, 74, 74, 74, 74, 74, 74, 74, + 74, 74, 74, 74, 74, 74, 74, 74, + 74, 74, 74, 74, 74, 74, 74, 74, + 74, 74, 74, 74, 74, 74, 74, 74, + 74, 74, 74, 74, 74, 74, 74, 74, + 74, 74, 74, 74, 74, 74, 74, 74, + 74, 74, 74, 74, 74, 74, 74, 74, + 74, 74, 74, 74, 74, 74, 74, 74, + 74, + }, + { /* Fourth byte table 12. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, + }, + { /* Fourth byte table 13. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30, + 32, 34, 36, 38, 40, 42, 44, 46, + 48, 50, 52, 54, 56, 58, 60, 62, + 64, 64, 66, 66, 68, 68, 70, 70, + 72, 72, 74, 74, 76, 76, 78, 78, + 80, 80, 82, 82, 84, 84, 86, 86, + 88, 88, 90, 90, 92, 92, 94, 94, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, + }, + { /* Fourth byte table 14. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 4, 4, 6, 6, + 8, 8, 10, 10, 12, 12, 14, 14, + 16, 16, 18, 18, 20, 20, 22, 22, + 24, 24, 26, 26, 28, 28, 30, 30, + 32, 32, 34, 34, 36, 36, 38, 38, + 40, 40, 42, 42, 44, 44, 46, 46, + 48, 48, 50, 50, 52, 52, 54, 54, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, + }, + { /* Fourth byte table 15. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 2, 2, 4, 4, 6, + 6, 8, 8, 10, 10, 12, 12, 14, + 16, 16, 18, 18, 20, 20, 22, 22, + 24, 24, 26, 26, 28, 28, 30, 30, + 32, 32, 34, 34, 36, 36, 38, 38, + 40, 40, 42, 42, 44, 44, 46, 46, + 48, 48, 50, 50, 52, 52, 54, 54, + 56, 56, 58, 58, 60, 60, 62, 62, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, + }, + { /* Fourth byte table 16. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 2, 2, 4, 4, 6, 6, + 8, 8, 10, 10, 12, 12, 14, 14, + 16, 16, 18, 18, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, + 20, + }, + { /* Fourth byte table 17. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 2, 4, 6, 8, 10, 12, + 14, 16, 18, 20, 22, 24, 26, 28, + 30, 32, 34, 36, 38, 40, 42, 44, + 46, 48, 50, 52, 54, 56, 58, 60, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, + }, + { /* Fourth byte table 18. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 4, 6, 8, 10, 12, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, + }, + { /* Fourth byte table 19. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, + }, + { /* Fourth byte table 20. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 3, 3, 6, 6, 9, 9, + 12, 12, 15, 15, 18, 18, 21, 21, + 24, 24, 27, 27, 30, 30, 33, 33, + 36, 36, 39, 39, 42, 42, 45, 45, + 48, 48, 51, 51, 54, 54, 57, 57, + 60, 60, 63, 63, 66, 66, 69, 69, + 72, 72, 75, 75, 78, 78, 81, 81, + 84, 84, 87, 87, 90, 90, 93, 93, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, + }, + { /* Fourth byte table 21. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 3, 3, 6, 6, 9, 9, + 12, 12, 15, 15, 18, 18, 21, 21, + 24, 24, 27, 27, 30, 30, 33, 33, + 36, 36, 39, 39, 42, 42, 45, 45, + 48, 48, 51, 51, 54, 54, 57, 57, + 60, 60, 63, 63, 66, 66, 69, 69, + 72, 72, 75, 75, 78, 78, 81, 81, + 84, 84, 87, 87, 90, 90, 93, 93, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, + }, + { /* Fourth byte table 22. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 3, 3, 6, 6, 9, 9, + 12, 12, 15, 15, 18, 18, 21, 21, + 24, 24, 27, 27, 30, 30, 33, 33, + 33, 33, 33, 33, 36, 36, 36, 36, + 36, 36, 39, 39, 42, 42, 45, 45, + 48, 48, 51, 51, 54, 54, 57, 57, + 60, 60, 63, 63, 66, 66, 69, 69, + 72, 72, 75, 75, 78, 78, 81, 81, + 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, + 84, + }, + { /* Fourth byte table 23. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 3, 3, 6, 6, 9, 9, + 12, 12, 15, 15, 18, 18, 21, 21, + 24, 24, 27, 27, 30, 30, 33, 33, + 36, 36, 39, 39, 42, 42, 45, 45, + 48, 48, 51, 51, 54, 54, 57, 57, + 60, 60, 63, 63, 66, 66, 69, 69, + 72, 72, 75, 75, 78, 78, 81, 81, + 84, 84, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, + }, + { /* Fourth byte table 24. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 21, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 27, 30, 33, 36, 39, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, + 42, 45, 48, 51, 54, 57, 60, 63, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 69, 72, 75, 78, 81, 84, 87, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, + }, + { /* Fourth byte table 25. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 21, 21, 24, 24, 27, 27, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 33, 36, 39, 42, 45, 48, 51, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 57, 60, 63, 66, 69, 72, 75, + 78, 81, 84, 87, 90, 93, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, + }, + { /* Fourth byte table 26. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 21, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 27, 30, 33, 36, 39, 42, 45, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 51, 54, 57, 60, 63, 66, 69, + 72, 72, 72, 72, 72, 72, 72, 72, + 72, 75, 78, 78, 81, 81, 81, 81, + 81, 81, 81, 81, 81, 81, 81, 83, + 83, 83, 83, 83, 83, 83, 83, 83, + 83, 83, 83, 83, 83, 83, 83, 83, + 83, 83, 83, 83, 83, 83, 83, 83, + 83, 83, 83, 83, 83, 83, 83, 83, + 83, 83, 83, 83, 83, 83, 83, 83, + 83, 83, 83, 83, 83, 83, 83, 83, + 83, 83, 83, 83, 83, 83, 83, 83, + 83, 83, 83, 83, 83, 83, 83, 83, + 83, + }, + { /* Fourth byte table 27. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 6, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 12, 15, 15, 15, 15, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, + }, + { /* Fourth byte table 28. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 6, 9, 12, 15, 18, 21, 24, + 27, 30, 33, 36, 39, 42, 45, 48, + 51, 51, 51, 51, 51, 51, 51, 51, + 51, 51, 51, 51, 51, 51, 51, 51, + 51, 51, 51, 51, 51, 51, 51, 51, + 51, 51, 51, 51, 51, 51, 51, 51, + 51, 51, 51, 51, 51, 51, 51, 51, + 51, 51, 51, 51, 51, 51, 51, 51, + 51, 51, 51, 51, 51, 51, 51, 51, + 51, 51, 51, 51, 51, 51, 51, 51, + 51, + }, + { /* Fourth byte table 29. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, + }, + { /* Fourth byte table 30. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 21, + 24, 27, 30, 33, 36, 39, 42, 45, + 48, 51, 54, 57, 60, 63, 66, 69, + 72, 75, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, + }, + { /* Fourth byte table 31. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 21, + 24, 27, 30, 33, 36, 39, 42, 45, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, + }, + { /* Fourth byte table 32. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 21, + 24, 27, 30, 33, 36, 39, 42, 45, + 48, 51, 54, 57, 60, 63, 66, 69, + 72, 75, 78, 81, 84, 87, 90, 93, + 93, 93, 96, 96, 96, 96, 98, 100, + 100, 103, 103, 106, 106, 109, 109, 109, + 109, 109, 109, 109, 109, 109, 109, 112, + 112, 112, 112, 112, 112, 112, 112, 112, + 112, 112, 112, 112, 112, 112, 112, 112, + 112, 112, 112, 112, 112, 112, 112, 112, + 112, 112, 112, 112, 112, 112, 112, 112, + 112, 112, 112, 112, 112, 112, 112, 112, + 112, 112, 112, 112, 112, 112, 112, 112, + 112, 112, 112, 112, 112, 112, 112, 112, + 112, 112, 112, 112, 112, 112, 112, 112, + 112, 112, 112, 112, 112, 112, 112, 112, + 112, + }, + { /* Fourth byte table 33. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 3, 3, 6, 6, 9, 9, + 12, 12, 15, 15, 18, 18, 21, 21, + 24, 24, 27, 27, 30, 30, 33, 33, + 36, 36, 39, 39, 42, 42, 45, 45, + 48, 48, 51, 51, 54, 54, 57, 57, + 60, 60, 63, 63, 66, 66, 69, 69, + 72, 72, 75, 75, 78, 78, 81, 81, + 84, 84, 87, 87, 90, 90, 93, 93, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, + }, + { /* Fourth byte table 34. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 3, 3, 6, 6, 9, 9, + 12, 12, 15, 15, 18, 18, 21, 21, + 24, 24, 27, 27, 30, 30, 33, 33, + 36, 36, 39, 39, 42, 42, 45, 45, + 48, 48, 51, 51, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, + }, + { /* Fourth byte table 35. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 21, + 24, 27, 30, 33, 36, 39, 42, 45, + 48, 51, 54, 57, 60, 63, 66, 69, + 72, 75, 78, 81, 84, 87, 90, 93, + 96, 99, 102, 105, 108, 111, 114, 114, + 114, 114, 114, 114, 114, 114, 114, 114, + 114, 114, 114, 114, 114, 114, 114, 114, + 114, 114, 114, 114, 114, 114, 114, 114, + 114, 114, 114, 114, 114, 114, 114, 114, + 114, 114, 114, 114, 114, 114, 114, 114, + 114, 114, 114, 114, 114, 114, 114, 114, + 114, 114, 114, 114, 114, 114, 114, 114, + 114, 114, 114, 114, 114, 114, 114, 114, + 114, 114, 114, 114, 114, 114, 114, 114, + 114, 114, 114, 114, 114, 114, 114, 114, + 114, 114, 114, 114, 114, 114, 114, 114, + 114, + }, + { /* Fourth byte table 36. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 3, 6, 9, 12, 15, 18, + 21, 24, 27, 30, 33, 36, 39, 42, + 45, 48, 51, 54, 57, 60, 63, 66, + 69, 72, 75, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, + }, + { /* Fourth byte table 37. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 52, 56, 60, + 64, 68, 72, 76, 80, 84, 88, 92, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, + }, + { /* Fourth byte table 38. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 52, 56, 60, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, + }, + }, +}; + +static const uchar_t u8_toupper_final_tbl[2][2318] = { + { + 0xCE, 0x9C, 0xC3, 0x80, 0xC3, 0x81, 0xC3, 0x82, + 0xC3, 0x83, 0xC3, 0x84, 0xC3, 0x85, 0xC3, 0x86, + 0xC3, 0x87, 0xC3, 0x88, 0xC3, 0x89, 0xC3, 0x8A, + 0xC3, 0x8B, 0xC3, 0x8C, 0xC3, 0x8D, 0xC3, 0x8E, + 0xC3, 0x8F, 0xC3, 0x90, 0xC3, 0x91, 0xC3, 0x92, + 0xC3, 0x93, 0xC3, 0x94, 0xC3, 0x95, 0xC3, 0x96, + 0xC3, 0x98, 0xC3, 0x99, 0xC3, 0x9A, 0xC3, 0x9B, + 0xC3, 0x9C, 0xC3, 0x9D, 0xC3, 0x9E, 0xC5, 0xB8, + 0xC4, 0x80, 0xC4, 0x82, 0xC4, 0x84, 0xC4, 0x86, + 0xC4, 0x88, 0xC4, 0x8A, 0xC4, 0x8C, 0xC4, 0x8E, + 0xC4, 0x90, 0xC4, 0x92, 0xC4, 0x94, 0xC4, 0x96, + 0xC4, 0x98, 0xC4, 0x9A, 0xC4, 0x9C, 0xC4, 0x9E, + 0xC4, 0xA0, 0xC4, 0xA2, 0xC4, 0xA4, 0xC4, 0xA6, + 0xC4, 0xA8, 0xC4, 0xAA, 0xC4, 0xAC, 0xC4, 0xAE, + 0x49, 0xC4, 0xB2, 0xC4, 0xB4, 0xC4, 0xB6, 0xC4, + 0xB9, 0xC4, 0xBB, 0xC4, 0xBD, 0xC4, 0xBF, 0xC5, + 0x81, 0xC5, 0x83, 0xC5, 0x85, 0xC5, 0x87, 0xC5, + 0x8A, 0xC5, 0x8C, 0xC5, 0x8E, 0xC5, 0x90, 0xC5, + 0x92, 0xC5, 0x94, 0xC5, 0x96, 0xC5, 0x98, 0xC5, + 0x9A, 0xC5, 0x9C, 0xC5, 0x9E, 0xC5, 0xA0, 0xC5, + 0xA2, 0xC5, 0xA4, 0xC5, 0xA6, 0xC5, 0xA8, 0xC5, + 0xAA, 0xC5, 0xAC, 0xC5, 0xAE, 0xC5, 0xB0, 0xC5, + 0xB2, 0xC5, 0xB4, 0xC5, 0xB6, 0xC5, 0xB9, 0xC5, + 0xBB, 0xC5, 0xBD, 0x53, 0xC6, 0x82, 0xC6, 0x84, + 0xC6, 0x87, 0xC6, 0x8B, 0xC6, 0x91, 0xC7, 0xB6, + 0xC6, 0x98, 0xC8, 0xA0, 0xC6, 0xA0, 0xC6, 0xA2, + 0xC6, 0xA4, 0xC6, 0xA7, 0xC6, 0xAC, 0xC6, 0xAF, + 0xC6, 0xB3, 0xC6, 0xB5, 0xC6, 0xB8, 0xC6, 0xBC, + 0xC7, 0xB7, 0xC7, 0x84, 0xC7, 0x84, 0xC7, 0x87, + 0xC7, 0x87, 0xC7, 0x8A, 0xC7, 0x8A, 0xC7, 0x8D, + 0xC7, 0x8F, 0xC7, 0x91, 0xC7, 0x93, 0xC7, 0x95, + 
0xC7, 0x97, 0xC7, 0x99, 0xC7, 0x9B, 0xC6, 0x8E, + 0xC7, 0x9E, 0xC7, 0xA0, 0xC7, 0xA2, 0xC7, 0xA4, + 0xC7, 0xA6, 0xC7, 0xA8, 0xC7, 0xAA, 0xC7, 0xAC, + 0xC7, 0xAE, 0xC7, 0xB1, 0xC7, 0xB1, 0xC7, 0xB4, + 0xC7, 0xB8, 0xC7, 0xBA, 0xC7, 0xBC, 0xC7, 0xBE, + 0xC8, 0x80, 0xC8, 0x82, 0xC8, 0x84, 0xC8, 0x86, + 0xC8, 0x88, 0xC8, 0x8A, 0xC8, 0x8C, 0xC8, 0x8E, + 0xC8, 0x90, 0xC8, 0x92, 0xC8, 0x94, 0xC8, 0x96, + 0xC8, 0x98, 0xC8, 0x9A, 0xC8, 0x9C, 0xC8, 0x9E, + 0xC8, 0xA2, 0xC8, 0xA4, 0xC8, 0xA6, 0xC8, 0xA8, + 0xC8, 0xAA, 0xC8, 0xAC, 0xC8, 0xAE, 0xC8, 0xB0, + 0xC8, 0xB2, 0xC6, 0x81, 0xC6, 0x86, 0xC6, 0x89, + 0xC6, 0x8A, 0xC6, 0x8F, 0xC6, 0x90, 0xC6, 0x93, + 0xC6, 0x94, 0xC6, 0x97, 0xC6, 0x96, 0xC6, 0x9C, + 0xC6, 0x9D, 0xC6, 0x9F, 0xC6, 0xA6, 0xC6, 0xA9, + 0xC6, 0xAE, 0xC6, 0xB1, 0xC6, 0xB2, 0xC6, 0xB7, + 0xCE, 0x99, 0xCE, 0x86, 0xCE, 0x88, 0xCE, 0x89, + 0xCE, 0x8A, 0xCE, 0x91, 0xCE, 0x92, 0xCE, 0x93, + 0xCE, 0x94, 0xCE, 0x95, 0xCE, 0x96, 0xCE, 0x97, + 0xCE, 0x98, 0xCE, 0x99, 0xCE, 0x9A, 0xCE, 0x9B, + 0xCE, 0x9C, 0xCE, 0x9D, 0xCE, 0x9E, 0xCE, 0x9F, + 0xCE, 0xA0, 0xCE, 0xA1, 0xCE, 0xA3, 0xCE, 0xA3, + 0xCE, 0xA4, 0xCE, 0xA5, 0xCE, 0xA6, 0xCE, 0xA7, + 0xCE, 0xA8, 0xCE, 0xA9, 0xCE, 0xAA, 0xCE, 0xAB, + 0xCE, 0x8C, 0xCE, 0x8E, 0xCE, 0x8F, 0xCE, 0x92, + 0xCE, 0x98, 0xCE, 0xA6, 0xCE, 0xA0, 0xCF, 0x98, + 0xCF, 0x9A, 0xCF, 0x9C, 0xCF, 0x9E, 0xCF, 0xA0, + 0xCF, 0xA2, 0xCF, 0xA4, 0xCF, 0xA6, 0xCF, 0xA8, + 0xCF, 0xAA, 0xCF, 0xAC, 0xCF, 0xAE, 0xCE, 0x9A, + 0xCE, 0xA1, 0xCE, 0xA3, 0xCE, 0x95, 0xD0, 0x90, + 0xD0, 0x91, 0xD0, 0x92, 0xD0, 0x93, 0xD0, 0x94, + 0xD0, 0x95, 0xD0, 0x96, 0xD0, 0x97, 0xD0, 0x98, + 0xD0, 0x99, 0xD0, 0x9A, 0xD0, 0x9B, 0xD0, 0x9C, + 0xD0, 0x9D, 0xD0, 0x9E, 0xD0, 0x9F, 0xD0, 0xA0, + 0xD0, 0xA1, 0xD0, 0xA2, 0xD0, 0xA3, 0xD0, 0xA4, + 0xD0, 0xA5, 0xD0, 0xA6, 0xD0, 0xA7, 0xD0, 0xA8, + 0xD0, 0xA9, 0xD0, 0xAA, 0xD0, 0xAB, 0xD0, 0xAC, + 0xD0, 0xAD, 0xD0, 0xAE, 0xD0, 0xAF, 0xD0, 0x80, + 0xD0, 0x81, 0xD0, 0x82, 0xD0, 0x83, 0xD0, 0x84, + 0xD0, 0x85, 0xD0, 0x86, 0xD0, 0x87, 0xD0, 0x88, + 0xD0, 0x89, 0xD0, 0x8A, 0xD0, 0x8B, 0xD0, 0x8C, + 0xD0, 0x8D, 0xD0, 0x8E, 0xD0, 0x8F, 0xD1, 0xA0, + 0xD1, 0xA2, 0xD1, 0xA4, 0xD1, 0xA6, 0xD1, 0xA8, + 0xD1, 0xAA, 0xD1, 0xAC, 0xD1, 0xAE, 0xD1, 0xB0, + 0xD1, 0xB2, 0xD1, 0xB4, 0xD1, 0xB6, 0xD1, 0xB8, + 0xD1, 0xBA, 0xD1, 0xBC, 0xD1, 0xBE, 0xD2, 0x80, + 0xD2, 0x8A, 0xD2, 0x8C, 0xD2, 0x8E, 0xD2, 0x90, + 0xD2, 0x92, 0xD2, 0x94, 0xD2, 0x96, 0xD2, 0x98, + 0xD2, 0x9A, 0xD2, 0x9C, 0xD2, 0x9E, 0xD2, 0xA0, + 0xD2, 0xA2, 0xD2, 0xA4, 0xD2, 0xA6, 0xD2, 0xA8, + 0xD2, 0xAA, 0xD2, 0xAC, 0xD2, 0xAE, 0xD2, 0xB0, + 0xD2, 0xB2, 0xD2, 0xB4, 0xD2, 0xB6, 0xD2, 0xB8, + 0xD2, 0xBA, 0xD2, 0xBC, 0xD2, 0xBE, 0xD3, 0x81, + 0xD3, 0x83, 0xD3, 0x85, 0xD3, 0x87, 0xD3, 0x89, + 0xD3, 0x8B, 0xD3, 0x8D, 0xD3, 0x90, 0xD3, 0x92, + 0xD3, 0x94, 0xD3, 0x96, 0xD3, 0x98, 0xD3, 0x9A, + 0xD3, 0x9C, 0xD3, 0x9E, 0xD3, 0xA0, 0xD3, 0xA2, + 0xD3, 0xA4, 0xD3, 0xA6, 0xD3, 0xA8, 0xD3, 0xAA, + 0xD3, 0xAC, 0xD3, 0xAE, 0xD3, 0xB0, 0xD3, 0xB2, + 0xD3, 0xB4, 0xD3, 0xB8, 0xD4, 0x80, 0xD4, 0x82, + 0xD4, 0x84, 0xD4, 0x86, 0xD4, 0x88, 0xD4, 0x8A, + 0xD4, 0x8C, 0xD4, 0x8E, 0xD4, 0xB1, 0xD4, 0xB2, + 0xD4, 0xB3, 0xD4, 0xB4, 0xD4, 0xB5, 0xD4, 0xB6, + 0xD4, 0xB7, 0xD4, 0xB8, 0xD4, 0xB9, 0xD4, 0xBA, + 0xD4, 0xBB, 0xD4, 0xBC, 0xD4, 0xBD, 0xD4, 0xBE, + 0xD4, 0xBF, 0xD5, 0x80, 0xD5, 0x81, 0xD5, 0x82, + 0xD5, 0x83, 0xD5, 0x84, 0xD5, 0x85, 0xD5, 0x86, + 0xD5, 0x87, 0xD5, 0x88, 0xD5, 0x89, 0xD5, 0x8A, + 0xD5, 0x8B, 0xD5, 0x8C, 0xD5, 0x8D, 0xD5, 0x8E, + 0xD5, 0x8F, 0xD5, 0x90, 0xD5, 0x91, 0xD5, 0x92, + 0xD5, 0x93, 0xD5, 0x94, 0xD5, 0x95, 0xD5, 0x96, + 
0xE1, 0xB8, 0x80, 0xE1, 0xB8, 0x82, 0xE1, 0xB8, + 0x84, 0xE1, 0xB8, 0x86, 0xE1, 0xB8, 0x88, 0xE1, + 0xB8, 0x8A, 0xE1, 0xB8, 0x8C, 0xE1, 0xB8, 0x8E, + 0xE1, 0xB8, 0x90, 0xE1, 0xB8, 0x92, 0xE1, 0xB8, + 0x94, 0xE1, 0xB8, 0x96, 0xE1, 0xB8, 0x98, 0xE1, + 0xB8, 0x9A, 0xE1, 0xB8, 0x9C, 0xE1, 0xB8, 0x9E, + 0xE1, 0xB8, 0xA0, 0xE1, 0xB8, 0xA2, 0xE1, 0xB8, + 0xA4, 0xE1, 0xB8, 0xA6, 0xE1, 0xB8, 0xA8, 0xE1, + 0xB8, 0xAA, 0xE1, 0xB8, 0xAC, 0xE1, 0xB8, 0xAE, + 0xE1, 0xB8, 0xB0, 0xE1, 0xB8, 0xB2, 0xE1, 0xB8, + 0xB4, 0xE1, 0xB8, 0xB6, 0xE1, 0xB8, 0xB8, 0xE1, + 0xB8, 0xBA, 0xE1, 0xB8, 0xBC, 0xE1, 0xB8, 0xBE, + 0xE1, 0xB9, 0x80, 0xE1, 0xB9, 0x82, 0xE1, 0xB9, + 0x84, 0xE1, 0xB9, 0x86, 0xE1, 0xB9, 0x88, 0xE1, + 0xB9, 0x8A, 0xE1, 0xB9, 0x8C, 0xE1, 0xB9, 0x8E, + 0xE1, 0xB9, 0x90, 0xE1, 0xB9, 0x92, 0xE1, 0xB9, + 0x94, 0xE1, 0xB9, 0x96, 0xE1, 0xB9, 0x98, 0xE1, + 0xB9, 0x9A, 0xE1, 0xB9, 0x9C, 0xE1, 0xB9, 0x9E, + 0xE1, 0xB9, 0xA0, 0xE1, 0xB9, 0xA2, 0xE1, 0xB9, + 0xA4, 0xE1, 0xB9, 0xA6, 0xE1, 0xB9, 0xA8, 0xE1, + 0xB9, 0xAA, 0xE1, 0xB9, 0xAC, 0xE1, 0xB9, 0xAE, + 0xE1, 0xB9, 0xB0, 0xE1, 0xB9, 0xB2, 0xE1, 0xB9, + 0xB4, 0xE1, 0xB9, 0xB6, 0xE1, 0xB9, 0xB8, 0xE1, + 0xB9, 0xBA, 0xE1, 0xB9, 0xBC, 0xE1, 0xB9, 0xBE, + 0xE1, 0xBA, 0x80, 0xE1, 0xBA, 0x82, 0xE1, 0xBA, + 0x84, 0xE1, 0xBA, 0x86, 0xE1, 0xBA, 0x88, 0xE1, + 0xBA, 0x8A, 0xE1, 0xBA, 0x8C, 0xE1, 0xBA, 0x8E, + 0xE1, 0xBA, 0x90, 0xE1, 0xBA, 0x92, 0xE1, 0xBA, + 0x94, 0xE1, 0xB9, 0xA0, 0xE1, 0xBA, 0xA0, 0xE1, + 0xBA, 0xA2, 0xE1, 0xBA, 0xA4, 0xE1, 0xBA, 0xA6, + 0xE1, 0xBA, 0xA8, 0xE1, 0xBA, 0xAA, 0xE1, 0xBA, + 0xAC, 0xE1, 0xBA, 0xAE, 0xE1, 0xBA, 0xB0, 0xE1, + 0xBA, 0xB2, 0xE1, 0xBA, 0xB4, 0xE1, 0xBA, 0xB6, + 0xE1, 0xBA, 0xB8, 0xE1, 0xBA, 0xBA, 0xE1, 0xBA, + 0xBC, 0xE1, 0xBA, 0xBE, 0xE1, 0xBB, 0x80, 0xE1, + 0xBB, 0x82, 0xE1, 0xBB, 0x84, 0xE1, 0xBB, 0x86, + 0xE1, 0xBB, 0x88, 0xE1, 0xBB, 0x8A, 0xE1, 0xBB, + 0x8C, 0xE1, 0xBB, 0x8E, 0xE1, 0xBB, 0x90, 0xE1, + 0xBB, 0x92, 0xE1, 0xBB, 0x94, 0xE1, 0xBB, 0x96, + 0xE1, 0xBB, 0x98, 0xE1, 0xBB, 0x9A, 0xE1, 0xBB, + 0x9C, 0xE1, 0xBB, 0x9E, 0xE1, 0xBB, 0xA0, 0xE1, + 0xBB, 0xA2, 0xE1, 0xBB, 0xA4, 0xE1, 0xBB, 0xA6, + 0xE1, 0xBB, 0xA8, 0xE1, 0xBB, 0xAA, 0xE1, 0xBB, + 0xAC, 0xE1, 0xBB, 0xAE, 0xE1, 0xBB, 0xB0, 0xE1, + 0xBB, 0xB2, 0xE1, 0xBB, 0xB4, 0xE1, 0xBB, 0xB6, + 0xE1, 0xBB, 0xB8, 0xE1, 0xBC, 0x88, 0xE1, 0xBC, + 0x89, 0xE1, 0xBC, 0x8A, 0xE1, 0xBC, 0x8B, 0xE1, + 0xBC, 0x8C, 0xE1, 0xBC, 0x8D, 0xE1, 0xBC, 0x8E, + 0xE1, 0xBC, 0x8F, 0xE1, 0xBC, 0x98, 0xE1, 0xBC, + 0x99, 0xE1, 0xBC, 0x9A, 0xE1, 0xBC, 0x9B, 0xE1, + 0xBC, 0x9C, 0xE1, 0xBC, 0x9D, 0xE1, 0xBC, 0xA8, + 0xE1, 0xBC, 0xA9, 0xE1, 0xBC, 0xAA, 0xE1, 0xBC, + 0xAB, 0xE1, 0xBC, 0xAC, 0xE1, 0xBC, 0xAD, 0xE1, + 0xBC, 0xAE, 0xE1, 0xBC, 0xAF, 0xE1, 0xBC, 0xB8, + 0xE1, 0xBC, 0xB9, 0xE1, 0xBC, 0xBA, 0xE1, 0xBC, + 0xBB, 0xE1, 0xBC, 0xBC, 0xE1, 0xBC, 0xBD, 0xE1, + 0xBC, 0xBE, 0xE1, 0xBC, 0xBF, 0xE1, 0xBD, 0x88, + 0xE1, 0xBD, 0x89, 0xE1, 0xBD, 0x8A, 0xE1, 0xBD, + 0x8B, 0xE1, 0xBD, 0x8C, 0xE1, 0xBD, 0x8D, 0xE1, + 0xBD, 0x99, 0xE1, 0xBD, 0x9B, 0xE1, 0xBD, 0x9D, + 0xE1, 0xBD, 0x9F, 0xE1, 0xBD, 0xA8, 0xE1, 0xBD, + 0xA9, 0xE1, 0xBD, 0xAA, 0xE1, 0xBD, 0xAB, 0xE1, + 0xBD, 0xAC, 0xE1, 0xBD, 0xAD, 0xE1, 0xBD, 0xAE, + 0xE1, 0xBD, 0xAF, 0xE1, 0xBE, 0xBA, 0xE1, 0xBE, + 0xBB, 0xE1, 0xBF, 0x88, 0xE1, 0xBF, 0x89, 0xE1, + 0xBF, 0x8A, 0xE1, 0xBF, 0x8B, 0xE1, 0xBF, 0x9A, + 0xE1, 0xBF, 0x9B, 0xE1, 0xBF, 0xB8, 0xE1, 0xBF, + 0xB9, 0xE1, 0xBF, 0xAA, 0xE1, 0xBF, 0xAB, 0xE1, + 0xBF, 0xBA, 0xE1, 0xBF, 0xBB, 0xE1, 0xBE, 0x88, + 0xE1, 0xBE, 0x89, 0xE1, 0xBE, 0x8A, 0xE1, 0xBE, + 0x8B, 0xE1, 0xBE, 0x8C, 0xE1, 0xBE, 0x8D, 0xE1, + 
0xBE, 0x8E, 0xE1, 0xBE, 0x8F, 0xE1, 0xBE, 0x98, + 0xE1, 0xBE, 0x99, 0xE1, 0xBE, 0x9A, 0xE1, 0xBE, + 0x9B, 0xE1, 0xBE, 0x9C, 0xE1, 0xBE, 0x9D, 0xE1, + 0xBE, 0x9E, 0xE1, 0xBE, 0x9F, 0xE1, 0xBE, 0xA8, + 0xE1, 0xBE, 0xA9, 0xE1, 0xBE, 0xAA, 0xE1, 0xBE, + 0xAB, 0xE1, 0xBE, 0xAC, 0xE1, 0xBE, 0xAD, 0xE1, + 0xBE, 0xAE, 0xE1, 0xBE, 0xAF, 0xE1, 0xBE, 0xB8, + 0xE1, 0xBE, 0xB9, 0xE1, 0xBE, 0xBC, 0xCE, 0x99, + 0xE1, 0xBF, 0x8C, 0xE1, 0xBF, 0x98, 0xE1, 0xBF, + 0x99, 0xE1, 0xBF, 0xA8, 0xE1, 0xBF, 0xA9, 0xE1, + 0xBF, 0xAC, 0xE1, 0xBF, 0xBC, 0xE2, 0x85, 0xA0, + 0xE2, 0x85, 0xA1, 0xE2, 0x85, 0xA2, 0xE2, 0x85, + 0xA3, 0xE2, 0x85, 0xA4, 0xE2, 0x85, 0xA5, 0xE2, + 0x85, 0xA6, 0xE2, 0x85, 0xA7, 0xE2, 0x85, 0xA8, + 0xE2, 0x85, 0xA9, 0xE2, 0x85, 0xAA, 0xE2, 0x85, + 0xAB, 0xE2, 0x85, 0xAC, 0xE2, 0x85, 0xAD, 0xE2, + 0x85, 0xAE, 0xE2, 0x85, 0xAF, 0xE2, 0x92, 0xB6, + 0xE2, 0x92, 0xB7, 0xE2, 0x92, 0xB8, 0xE2, 0x92, + 0xB9, 0xE2, 0x92, 0xBA, 0xE2, 0x92, 0xBB, 0xE2, + 0x92, 0xBC, 0xE2, 0x92, 0xBD, 0xE2, 0x92, 0xBE, + 0xE2, 0x92, 0xBF, 0xE2, 0x93, 0x80, 0xE2, 0x93, + 0x81, 0xE2, 0x93, 0x82, 0xE2, 0x93, 0x83, 0xE2, + 0x93, 0x84, 0xE2, 0x93, 0x85, 0xE2, 0x93, 0x86, + 0xE2, 0x93, 0x87, 0xE2, 0x93, 0x88, 0xE2, 0x93, + 0x89, 0xE2, 0x93, 0x8A, 0xE2, 0x93, 0x8B, 0xE2, + 0x93, 0x8C, 0xE2, 0x93, 0x8D, 0xE2, 0x93, 0x8E, + 0xE2, 0x93, 0x8F, 0xEF, 0xBC, 0xA1, 0xEF, 0xBC, + 0xA2, 0xEF, 0xBC, 0xA3, 0xEF, 0xBC, 0xA4, 0xEF, + 0xBC, 0xA5, 0xEF, 0xBC, 0xA6, 0xEF, 0xBC, 0xA7, + 0xEF, 0xBC, 0xA8, 0xEF, 0xBC, 0xA9, 0xEF, 0xBC, + 0xAA, 0xEF, 0xBC, 0xAB, 0xEF, 0xBC, 0xAC, 0xEF, + 0xBC, 0xAD, 0xEF, 0xBC, 0xAE, 0xEF, 0xBC, 0xAF, + 0xEF, 0xBC, 0xB0, 0xEF, 0xBC, 0xB1, 0xEF, 0xBC, + 0xB2, 0xEF, 0xBC, 0xB3, 0xEF, 0xBC, 0xB4, 0xEF, + 0xBC, 0xB5, 0xEF, 0xBC, 0xB6, 0xEF, 0xBC, 0xB7, + 0xEF, 0xBC, 0xB8, 0xEF, 0xBC, 0xB9, 0xEF, 0xBC, + 0xBA, 0xF0, 0x90, 0x90, 0x80, 0xF0, 0x90, 0x90, + 0x81, 0xF0, 0x90, 0x90, 0x82, 0xF0, 0x90, 0x90, + 0x83, 0xF0, 0x90, 0x90, 0x84, 0xF0, 0x90, 0x90, + 0x85, 0xF0, 0x90, 0x90, 0x86, 0xF0, 0x90, 0x90, + 0x87, 0xF0, 0x90, 0x90, 0x88, 0xF0, 0x90, 0x90, + 0x89, 0xF0, 0x90, 0x90, 0x8A, 0xF0, 0x90, 0x90, + 0x8B, 0xF0, 0x90, 0x90, 0x8C, 0xF0, 0x90, 0x90, + 0x8D, 0xF0, 0x90, 0x90, 0x8E, 0xF0, 0x90, 0x90, + 0x8F, 0xF0, 0x90, 0x90, 0x90, 0xF0, 0x90, 0x90, + 0x91, 0xF0, 0x90, 0x90, 0x92, 0xF0, 0x90, 0x90, + 0x93, 0xF0, 0x90, 0x90, 0x94, 0xF0, 0x90, 0x90, + 0x95, 0xF0, 0x90, 0x90, 0x96, 0xF0, 0x90, 0x90, + 0x97, 0xF0, 0x90, 0x90, 0x98, 0xF0, 0x90, 0x90, + 0x99, 0xF0, 0x90, 0x90, 0x9A, 0xF0, 0x90, 0x90, + 0x9B, 0xF0, 0x90, 0x90, 0x9C, 0xF0, 0x90, 0x90, + 0x9D, 0xF0, 0x90, 0x90, 0x9E, 0xF0, 0x90, 0x90, + 0x9F, 0xF0, 0x90, 0x90, 0xA0, 0xF0, 0x90, 0x90, + 0xA1, 0xF0, 0x90, 0x90, 0xA2, 0xF0, 0x90, 0x90, + 0xA3, 0xF0, 0x90, 0x90, 0xA4, 0xF0, 0x90, 0x90, + 0xA5, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 
0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + }, + { + 0xCE, 0x9C, 0xC3, 0x80, 0xC3, 0x81, 0xC3, 0x82, + 0xC3, 0x83, 0xC3, 0x84, 0xC3, 0x85, 0xC3, 0x86, + 0xC3, 0x87, 0xC3, 0x88, 0xC3, 0x89, 0xC3, 0x8A, + 0xC3, 0x8B, 0xC3, 0x8C, 0xC3, 0x8D, 0xC3, 0x8E, + 0xC3, 0x8F, 0xC3, 0x90, 0xC3, 0x91, 0xC3, 0x92, + 0xC3, 0x93, 0xC3, 0x94, 0xC3, 0x95, 0xC3, 0x96, + 0xC3, 0x98, 0xC3, 0x99, 0xC3, 0x9A, 0xC3, 0x9B, + 0xC3, 0x9C, 0xC3, 0x9D, 0xC3, 0x9E, 0xC5, 0xB8, + 0xC4, 0x80, 0xC4, 0x82, 0xC4, 0x84, 0xC4, 0x86, + 0xC4, 0x88, 0xC4, 0x8A, 0xC4, 0x8C, 0xC4, 0x8E, + 0xC4, 0x90, 0xC4, 0x92, 0xC4, 0x94, 0xC4, 0x96, + 0xC4, 0x98, 0xC4, 0x9A, 0xC4, 0x9C, 0xC4, 0x9E, + 0xC4, 0xA0, 0xC4, 0xA2, 0xC4, 0xA4, 0xC4, 0xA6, + 0xC4, 0xA8, 0xC4, 0xAA, 0xC4, 0xAC, 0xC4, 0xAE, + 0x49, 0xC4, 0xB2, 0xC4, 0xB4, 0xC4, 0xB6, 0xC4, + 0xB9, 0xC4, 0xBB, 0xC4, 0xBD, 0xC4, 0xBF, 0xC5, + 0x81, 0xC5, 0x83, 0xC5, 0x85, 0xC5, 0x87, 0xC5, + 0x8A, 0xC5, 0x8C, 0xC5, 0x8E, 0xC5, 0x90, 0xC5, + 0x92, 0xC5, 0x94, 0xC5, 0x96, 0xC5, 0x98, 0xC5, + 0x9A, 0xC5, 0x9C, 0xC5, 0x9E, 0xC5, 0xA0, 0xC5, + 0xA2, 0xC5, 0xA4, 0xC5, 0xA6, 0xC5, 0xA8, 0xC5, + 0xAA, 0xC5, 0xAC, 0xC5, 0xAE, 0xC5, 0xB0, 0xC5, + 0xB2, 0xC5, 0xB4, 0xC5, 0xB6, 0xC5, 0xB9, 0xC5, + 0xBB, 0xC5, 0xBD, 0x53, 0xC9, 0x83, 0xC6, 0x82, + 0xC6, 0x84, 0xC6, 0x87, 0xC6, 0x8B, 0xC6, 0x91, + 0xC7, 0xB6, 0xC6, 0x98, 0xC8, 0xBD, 0xC8, 0xA0, + 0xC6, 0xA0, 0xC6, 0xA2, 0xC6, 0xA4, 0xC6, 0xA7, + 0xC6, 0xAC, 0xC6, 0xAF, 0xC6, 0xB3, 0xC6, 0xB5, + 0xC6, 0xB8, 0xC6, 0xBC, 0xC7, 0xB7, 0xC7, 0x84, + 0xC7, 0x84, 0xC7, 0x87, 0xC7, 0x87, 0xC7, 0x8A, + 0xC7, 0x8A, 0xC7, 0x8D, 0xC7, 0x8F, 0xC7, 0x91, + 0xC7, 0x93, 0xC7, 0x95, 0xC7, 0x97, 0xC7, 0x99, + 0xC7, 0x9B, 0xC6, 0x8E, 0xC7, 0x9E, 0xC7, 0xA0, + 0xC7, 0xA2, 0xC7, 0xA4, 0xC7, 0xA6, 0xC7, 0xA8, + 0xC7, 0xAA, 0xC7, 0xAC, 0xC7, 0xAE, 0xC7, 0xB1, + 0xC7, 0xB1, 0xC7, 0xB4, 0xC7, 0xB8, 0xC7, 0xBA, + 0xC7, 0xBC, 0xC7, 0xBE, 0xC8, 0x80, 0xC8, 0x82, + 0xC8, 0x84, 0xC8, 0x86, 0xC8, 0x88, 0xC8, 0x8A, + 0xC8, 0x8C, 0xC8, 0x8E, 0xC8, 0x90, 0xC8, 0x92, + 0xC8, 0x94, 0xC8, 0x96, 0xC8, 0x98, 0xC8, 0x9A, + 0xC8, 0x9C, 0xC8, 0x9E, 0xC8, 0xA2, 0xC8, 0xA4, + 0xC8, 0xA6, 0xC8, 0xA8, 0xC8, 0xAA, 0xC8, 0xAC, + 0xC8, 0xAE, 0xC8, 0xB0, 0xC8, 0xB2, 0xC8, 0xBB, + 0xC9, 0x81, 0xC9, 0x86, 0xC9, 0x88, 0xC9, 0x8A, + 0xC9, 0x8C, 0xC9, 0x8E, 0xC6, 0x81, 0xC6, 0x86, + 0xC6, 0x89, 0xC6, 0x8A, 0xC6, 0x8F, 0xC6, 0x90, + 0xC6, 0x93, 0xC6, 0x94, 0xC6, 0x97, 0xC6, 0x96, + 0xE2, 0xB1, 0xA2, 0xC6, 0x9C, 0xC6, 0x9D, 0xC6, + 0x9F, 0xE2, 0xB1, 0xA4, 0xC6, 0xA6, 0xC6, 0xA9, + 0xC6, 0xAE, 0xC9, 0x84, 0xC6, 0xB1, 0xC6, 0xB2, + 0xC9, 0x85, 0xC6, 0xB7, 0xCE, 0x99, 0xCF, 0xBD, + 0xCF, 0xBE, 0xCF, 0xBF, 0xCE, 0x86, 0xCE, 0x88, + 0xCE, 0x89, 0xCE, 0x8A, 0xCE, 0x91, 0xCE, 0x92, + 0xCE, 0x93, 0xCE, 0x94, 0xCE, 0x95, 0xCE, 0x96, + 0xCE, 0x97, 0xCE, 0x98, 0xCE, 0x99, 
0xCE, 0x9A, + 0xCE, 0x9B, 0xCE, 0x9C, 0xCE, 0x9D, 0xCE, 0x9E, + 0xCE, 0x9F, 0xCE, 0xA0, 0xCE, 0xA1, 0xCE, 0xA3, + 0xCE, 0xA3, 0xCE, 0xA4, 0xCE, 0xA5, 0xCE, 0xA6, + 0xCE, 0xA7, 0xCE, 0xA8, 0xCE, 0xA9, 0xCE, 0xAA, + 0xCE, 0xAB, 0xCE, 0x8C, 0xCE, 0x8E, 0xCE, 0x8F, + 0xCE, 0x92, 0xCE, 0x98, 0xCE, 0xA6, 0xCE, 0xA0, + 0xCF, 0x98, 0xCF, 0x9A, 0xCF, 0x9C, 0xCF, 0x9E, + 0xCF, 0xA0, 0xCF, 0xA2, 0xCF, 0xA4, 0xCF, 0xA6, + 0xCF, 0xA8, 0xCF, 0xAA, 0xCF, 0xAC, 0xCF, 0xAE, + 0xCE, 0x9A, 0xCE, 0xA1, 0xCF, 0xB9, 0xCE, 0x95, + 0xCF, 0xB7, 0xCF, 0xBA, 0xD0, 0x90, 0xD0, 0x91, + 0xD0, 0x92, 0xD0, 0x93, 0xD0, 0x94, 0xD0, 0x95, + 0xD0, 0x96, 0xD0, 0x97, 0xD0, 0x98, 0xD0, 0x99, + 0xD0, 0x9A, 0xD0, 0x9B, 0xD0, 0x9C, 0xD0, 0x9D, + 0xD0, 0x9E, 0xD0, 0x9F, 0xD0, 0xA0, 0xD0, 0xA1, + 0xD0, 0xA2, 0xD0, 0xA3, 0xD0, 0xA4, 0xD0, 0xA5, + 0xD0, 0xA6, 0xD0, 0xA7, 0xD0, 0xA8, 0xD0, 0xA9, + 0xD0, 0xAA, 0xD0, 0xAB, 0xD0, 0xAC, 0xD0, 0xAD, + 0xD0, 0xAE, 0xD0, 0xAF, 0xD0, 0x80, 0xD0, 0x81, + 0xD0, 0x82, 0xD0, 0x83, 0xD0, 0x84, 0xD0, 0x85, + 0xD0, 0x86, 0xD0, 0x87, 0xD0, 0x88, 0xD0, 0x89, + 0xD0, 0x8A, 0xD0, 0x8B, 0xD0, 0x8C, 0xD0, 0x8D, + 0xD0, 0x8E, 0xD0, 0x8F, 0xD1, 0xA0, 0xD1, 0xA2, + 0xD1, 0xA4, 0xD1, 0xA6, 0xD1, 0xA8, 0xD1, 0xAA, + 0xD1, 0xAC, 0xD1, 0xAE, 0xD1, 0xB0, 0xD1, 0xB2, + 0xD1, 0xB4, 0xD1, 0xB6, 0xD1, 0xB8, 0xD1, 0xBA, + 0xD1, 0xBC, 0xD1, 0xBE, 0xD2, 0x80, 0xD2, 0x8A, + 0xD2, 0x8C, 0xD2, 0x8E, 0xD2, 0x90, 0xD2, 0x92, + 0xD2, 0x94, 0xD2, 0x96, 0xD2, 0x98, 0xD2, 0x9A, + 0xD2, 0x9C, 0xD2, 0x9E, 0xD2, 0xA0, 0xD2, 0xA2, + 0xD2, 0xA4, 0xD2, 0xA6, 0xD2, 0xA8, 0xD2, 0xAA, + 0xD2, 0xAC, 0xD2, 0xAE, 0xD2, 0xB0, 0xD2, 0xB2, + 0xD2, 0xB4, 0xD2, 0xB6, 0xD2, 0xB8, 0xD2, 0xBA, + 0xD2, 0xBC, 0xD2, 0xBE, 0xD3, 0x81, 0xD3, 0x83, + 0xD3, 0x85, 0xD3, 0x87, 0xD3, 0x89, 0xD3, 0x8B, + 0xD3, 0x8D, 0xD3, 0x80, 0xD3, 0x90, 0xD3, 0x92, + 0xD3, 0x94, 0xD3, 0x96, 0xD3, 0x98, 0xD3, 0x9A, + 0xD3, 0x9C, 0xD3, 0x9E, 0xD3, 0xA0, 0xD3, 0xA2, + 0xD3, 0xA4, 0xD3, 0xA6, 0xD3, 0xA8, 0xD3, 0xAA, + 0xD3, 0xAC, 0xD3, 0xAE, 0xD3, 0xB0, 0xD3, 0xB2, + 0xD3, 0xB4, 0xD3, 0xB6, 0xD3, 0xB8, 0xD3, 0xBA, + 0xD3, 0xBC, 0xD3, 0xBE, 0xD4, 0x80, 0xD4, 0x82, + 0xD4, 0x84, 0xD4, 0x86, 0xD4, 0x88, 0xD4, 0x8A, + 0xD4, 0x8C, 0xD4, 0x8E, 0xD4, 0x90, 0xD4, 0x92, + 0xD4, 0xB1, 0xD4, 0xB2, 0xD4, 0xB3, 0xD4, 0xB4, + 0xD4, 0xB5, 0xD4, 0xB6, 0xD4, 0xB7, 0xD4, 0xB8, + 0xD4, 0xB9, 0xD4, 0xBA, 0xD4, 0xBB, 0xD4, 0xBC, + 0xD4, 0xBD, 0xD4, 0xBE, 0xD4, 0xBF, 0xD5, 0x80, + 0xD5, 0x81, 0xD5, 0x82, 0xD5, 0x83, 0xD5, 0x84, + 0xD5, 0x85, 0xD5, 0x86, 0xD5, 0x87, 0xD5, 0x88, + 0xD5, 0x89, 0xD5, 0x8A, 0xD5, 0x8B, 0xD5, 0x8C, + 0xD5, 0x8D, 0xD5, 0x8E, 0xD5, 0x8F, 0xD5, 0x90, + 0xD5, 0x91, 0xD5, 0x92, 0xD5, 0x93, 0xD5, 0x94, + 0xD5, 0x95, 0xD5, 0x96, 0xE2, 0xB1, 0xA3, 0xE1, + 0xB8, 0x80, 0xE1, 0xB8, 0x82, 0xE1, 0xB8, 0x84, + 0xE1, 0xB8, 0x86, 0xE1, 0xB8, 0x88, 0xE1, 0xB8, + 0x8A, 0xE1, 0xB8, 0x8C, 0xE1, 0xB8, 0x8E, 0xE1, + 0xB8, 0x90, 0xE1, 0xB8, 0x92, 0xE1, 0xB8, 0x94, + 0xE1, 0xB8, 0x96, 0xE1, 0xB8, 0x98, 0xE1, 0xB8, + 0x9A, 0xE1, 0xB8, 0x9C, 0xE1, 0xB8, 0x9E, 0xE1, + 0xB8, 0xA0, 0xE1, 0xB8, 0xA2, 0xE1, 0xB8, 0xA4, + 0xE1, 0xB8, 0xA6, 0xE1, 0xB8, 0xA8, 0xE1, 0xB8, + 0xAA, 0xE1, 0xB8, 0xAC, 0xE1, 0xB8, 0xAE, 0xE1, + 0xB8, 0xB0, 0xE1, 0xB8, 0xB2, 0xE1, 0xB8, 0xB4, + 0xE1, 0xB8, 0xB6, 0xE1, 0xB8, 0xB8, 0xE1, 0xB8, + 0xBA, 0xE1, 0xB8, 0xBC, 0xE1, 0xB8, 0xBE, 0xE1, + 0xB9, 0x80, 0xE1, 0xB9, 0x82, 0xE1, 0xB9, 0x84, + 0xE1, 0xB9, 0x86, 0xE1, 0xB9, 0x88, 0xE1, 0xB9, + 0x8A, 0xE1, 0xB9, 0x8C, 0xE1, 0xB9, 0x8E, 0xE1, + 0xB9, 0x90, 0xE1, 0xB9, 0x92, 0xE1, 0xB9, 0x94, + 0xE1, 0xB9, 0x96, 0xE1, 0xB9, 0x98, 
0xE1, 0xB9, + 0x9A, 0xE1, 0xB9, 0x9C, 0xE1, 0xB9, 0x9E, 0xE1, + 0xB9, 0xA0, 0xE1, 0xB9, 0xA2, 0xE1, 0xB9, 0xA4, + 0xE1, 0xB9, 0xA6, 0xE1, 0xB9, 0xA8, 0xE1, 0xB9, + 0xAA, 0xE1, 0xB9, 0xAC, 0xE1, 0xB9, 0xAE, 0xE1, + 0xB9, 0xB0, 0xE1, 0xB9, 0xB2, 0xE1, 0xB9, 0xB4, + 0xE1, 0xB9, 0xB6, 0xE1, 0xB9, 0xB8, 0xE1, 0xB9, + 0xBA, 0xE1, 0xB9, 0xBC, 0xE1, 0xB9, 0xBE, 0xE1, + 0xBA, 0x80, 0xE1, 0xBA, 0x82, 0xE1, 0xBA, 0x84, + 0xE1, 0xBA, 0x86, 0xE1, 0xBA, 0x88, 0xE1, 0xBA, + 0x8A, 0xE1, 0xBA, 0x8C, 0xE1, 0xBA, 0x8E, 0xE1, + 0xBA, 0x90, 0xE1, 0xBA, 0x92, 0xE1, 0xBA, 0x94, + 0xE1, 0xB9, 0xA0, 0xE1, 0xBA, 0xA0, 0xE1, 0xBA, + 0xA2, 0xE1, 0xBA, 0xA4, 0xE1, 0xBA, 0xA6, 0xE1, + 0xBA, 0xA8, 0xE1, 0xBA, 0xAA, 0xE1, 0xBA, 0xAC, + 0xE1, 0xBA, 0xAE, 0xE1, 0xBA, 0xB0, 0xE1, 0xBA, + 0xB2, 0xE1, 0xBA, 0xB4, 0xE1, 0xBA, 0xB6, 0xE1, + 0xBA, 0xB8, 0xE1, 0xBA, 0xBA, 0xE1, 0xBA, 0xBC, + 0xE1, 0xBA, 0xBE, 0xE1, 0xBB, 0x80, 0xE1, 0xBB, + 0x82, 0xE1, 0xBB, 0x84, 0xE1, 0xBB, 0x86, 0xE1, + 0xBB, 0x88, 0xE1, 0xBB, 0x8A, 0xE1, 0xBB, 0x8C, + 0xE1, 0xBB, 0x8E, 0xE1, 0xBB, 0x90, 0xE1, 0xBB, + 0x92, 0xE1, 0xBB, 0x94, 0xE1, 0xBB, 0x96, 0xE1, + 0xBB, 0x98, 0xE1, 0xBB, 0x9A, 0xE1, 0xBB, 0x9C, + 0xE1, 0xBB, 0x9E, 0xE1, 0xBB, 0xA0, 0xE1, 0xBB, + 0xA2, 0xE1, 0xBB, 0xA4, 0xE1, 0xBB, 0xA6, 0xE1, + 0xBB, 0xA8, 0xE1, 0xBB, 0xAA, 0xE1, 0xBB, 0xAC, + 0xE1, 0xBB, 0xAE, 0xE1, 0xBB, 0xB0, 0xE1, 0xBB, + 0xB2, 0xE1, 0xBB, 0xB4, 0xE1, 0xBB, 0xB6, 0xE1, + 0xBB, 0xB8, 0xE1, 0xBC, 0x88, 0xE1, 0xBC, 0x89, + 0xE1, 0xBC, 0x8A, 0xE1, 0xBC, 0x8B, 0xE1, 0xBC, + 0x8C, 0xE1, 0xBC, 0x8D, 0xE1, 0xBC, 0x8E, 0xE1, + 0xBC, 0x8F, 0xE1, 0xBC, 0x98, 0xE1, 0xBC, 0x99, + 0xE1, 0xBC, 0x9A, 0xE1, 0xBC, 0x9B, 0xE1, 0xBC, + 0x9C, 0xE1, 0xBC, 0x9D, 0xE1, 0xBC, 0xA8, 0xE1, + 0xBC, 0xA9, 0xE1, 0xBC, 0xAA, 0xE1, 0xBC, 0xAB, + 0xE1, 0xBC, 0xAC, 0xE1, 0xBC, 0xAD, 0xE1, 0xBC, + 0xAE, 0xE1, 0xBC, 0xAF, 0xE1, 0xBC, 0xB8, 0xE1, + 0xBC, 0xB9, 0xE1, 0xBC, 0xBA, 0xE1, 0xBC, 0xBB, + 0xE1, 0xBC, 0xBC, 0xE1, 0xBC, 0xBD, 0xE1, 0xBC, + 0xBE, 0xE1, 0xBC, 0xBF, 0xE1, 0xBD, 0x88, 0xE1, + 0xBD, 0x89, 0xE1, 0xBD, 0x8A, 0xE1, 0xBD, 0x8B, + 0xE1, 0xBD, 0x8C, 0xE1, 0xBD, 0x8D, 0xE1, 0xBD, + 0x99, 0xE1, 0xBD, 0x9B, 0xE1, 0xBD, 0x9D, 0xE1, + 0xBD, 0x9F, 0xE1, 0xBD, 0xA8, 0xE1, 0xBD, 0xA9, + 0xE1, 0xBD, 0xAA, 0xE1, 0xBD, 0xAB, 0xE1, 0xBD, + 0xAC, 0xE1, 0xBD, 0xAD, 0xE1, 0xBD, 0xAE, 0xE1, + 0xBD, 0xAF, 0xE1, 0xBE, 0xBA, 0xE1, 0xBE, 0xBB, + 0xE1, 0xBF, 0x88, 0xE1, 0xBF, 0x89, 0xE1, 0xBF, + 0x8A, 0xE1, 0xBF, 0x8B, 0xE1, 0xBF, 0x9A, 0xE1, + 0xBF, 0x9B, 0xE1, 0xBF, 0xB8, 0xE1, 0xBF, 0xB9, + 0xE1, 0xBF, 0xAA, 0xE1, 0xBF, 0xAB, 0xE1, 0xBF, + 0xBA, 0xE1, 0xBF, 0xBB, 0xE1, 0xBE, 0x88, 0xE1, + 0xBE, 0x89, 0xE1, 0xBE, 0x8A, 0xE1, 0xBE, 0x8B, + 0xE1, 0xBE, 0x8C, 0xE1, 0xBE, 0x8D, 0xE1, 0xBE, + 0x8E, 0xE1, 0xBE, 0x8F, 0xE1, 0xBE, 0x98, 0xE1, + 0xBE, 0x99, 0xE1, 0xBE, 0x9A, 0xE1, 0xBE, 0x9B, + 0xE1, 0xBE, 0x9C, 0xE1, 0xBE, 0x9D, 0xE1, 0xBE, + 0x9E, 0xE1, 0xBE, 0x9F, 0xE1, 0xBE, 0xA8, 0xE1, + 0xBE, 0xA9, 0xE1, 0xBE, 0xAA, 0xE1, 0xBE, 0xAB, + 0xE1, 0xBE, 0xAC, 0xE1, 0xBE, 0xAD, 0xE1, 0xBE, + 0xAE, 0xE1, 0xBE, 0xAF, 0xE1, 0xBE, 0xB8, 0xE1, + 0xBE, 0xB9, 0xE1, 0xBE, 0xBC, 0xCE, 0x99, 0xE1, + 0xBF, 0x8C, 0xE1, 0xBF, 0x98, 0xE1, 0xBF, 0x99, + 0xE1, 0xBF, 0xA8, 0xE1, 0xBF, 0xA9, 0xE1, 0xBF, + 0xAC, 0xE1, 0xBF, 0xBC, 0xE2, 0x84, 0xB2, 0xE2, + 0x85, 0xA0, 0xE2, 0x85, 0xA1, 0xE2, 0x85, 0xA2, + 0xE2, 0x85, 0xA3, 0xE2, 0x85, 0xA4, 0xE2, 0x85, + 0xA5, 0xE2, 0x85, 0xA6, 0xE2, 0x85, 0xA7, 0xE2, + 0x85, 0xA8, 0xE2, 0x85, 0xA9, 0xE2, 0x85, 0xAA, + 0xE2, 0x85, 0xAB, 0xE2, 0x85, 0xAC, 0xE2, 0x85, + 0xAD, 0xE2, 0x85, 0xAE, 0xE2, 0x85, 
0xAF, 0xE2, + 0x86, 0x83, 0xE2, 0x92, 0xB6, 0xE2, 0x92, 0xB7, + 0xE2, 0x92, 0xB8, 0xE2, 0x92, 0xB9, 0xE2, 0x92, + 0xBA, 0xE2, 0x92, 0xBB, 0xE2, 0x92, 0xBC, 0xE2, + 0x92, 0xBD, 0xE2, 0x92, 0xBE, 0xE2, 0x92, 0xBF, + 0xE2, 0x93, 0x80, 0xE2, 0x93, 0x81, 0xE2, 0x93, + 0x82, 0xE2, 0x93, 0x83, 0xE2, 0x93, 0x84, 0xE2, + 0x93, 0x85, 0xE2, 0x93, 0x86, 0xE2, 0x93, 0x87, + 0xE2, 0x93, 0x88, 0xE2, 0x93, 0x89, 0xE2, 0x93, + 0x8A, 0xE2, 0x93, 0x8B, 0xE2, 0x93, 0x8C, 0xE2, + 0x93, 0x8D, 0xE2, 0x93, 0x8E, 0xE2, 0x93, 0x8F, + 0xE2, 0xB0, 0x80, 0xE2, 0xB0, 0x81, 0xE2, 0xB0, + 0x82, 0xE2, 0xB0, 0x83, 0xE2, 0xB0, 0x84, 0xE2, + 0xB0, 0x85, 0xE2, 0xB0, 0x86, 0xE2, 0xB0, 0x87, + 0xE2, 0xB0, 0x88, 0xE2, 0xB0, 0x89, 0xE2, 0xB0, + 0x8A, 0xE2, 0xB0, 0x8B, 0xE2, 0xB0, 0x8C, 0xE2, + 0xB0, 0x8D, 0xE2, 0xB0, 0x8E, 0xE2, 0xB0, 0x8F, + 0xE2, 0xB0, 0x90, 0xE2, 0xB0, 0x91, 0xE2, 0xB0, + 0x92, 0xE2, 0xB0, 0x93, 0xE2, 0xB0, 0x94, 0xE2, + 0xB0, 0x95, 0xE2, 0xB0, 0x96, 0xE2, 0xB0, 0x97, + 0xE2, 0xB0, 0x98, 0xE2, 0xB0, 0x99, 0xE2, 0xB0, + 0x9A, 0xE2, 0xB0, 0x9B, 0xE2, 0xB0, 0x9C, 0xE2, + 0xB0, 0x9D, 0xE2, 0xB0, 0x9E, 0xE2, 0xB0, 0x9F, + 0xE2, 0xB0, 0xA0, 0xE2, 0xB0, 0xA1, 0xE2, 0xB0, + 0xA2, 0xE2, 0xB0, 0xA3, 0xE2, 0xB0, 0xA4, 0xE2, + 0xB0, 0xA5, 0xE2, 0xB0, 0xA6, 0xE2, 0xB0, 0xA7, + 0xE2, 0xB0, 0xA8, 0xE2, 0xB0, 0xA9, 0xE2, 0xB0, + 0xAA, 0xE2, 0xB0, 0xAB, 0xE2, 0xB0, 0xAC, 0xE2, + 0xB0, 0xAD, 0xE2, 0xB0, 0xAE, 0xE2, 0xB1, 0xA0, + 0xC8, 0xBA, 0xC8, 0xBE, 0xE2, 0xB1, 0xA7, 0xE2, + 0xB1, 0xA9, 0xE2, 0xB1, 0xAB, 0xE2, 0xB1, 0xB5, + 0xE2, 0xB2, 0x80, 0xE2, 0xB2, 0x82, 0xE2, 0xB2, + 0x84, 0xE2, 0xB2, 0x86, 0xE2, 0xB2, 0x88, 0xE2, + 0xB2, 0x8A, 0xE2, 0xB2, 0x8C, 0xE2, 0xB2, 0x8E, + 0xE2, 0xB2, 0x90, 0xE2, 0xB2, 0x92, 0xE2, 0xB2, + 0x94, 0xE2, 0xB2, 0x96, 0xE2, 0xB2, 0x98, 0xE2, + 0xB2, 0x9A, 0xE2, 0xB2, 0x9C, 0xE2, 0xB2, 0x9E, + 0xE2, 0xB2, 0xA0, 0xE2, 0xB2, 0xA2, 0xE2, 0xB2, + 0xA4, 0xE2, 0xB2, 0xA6, 0xE2, 0xB2, 0xA8, 0xE2, + 0xB2, 0xAA, 0xE2, 0xB2, 0xAC, 0xE2, 0xB2, 0xAE, + 0xE2, 0xB2, 0xB0, 0xE2, 0xB2, 0xB2, 0xE2, 0xB2, + 0xB4, 0xE2, 0xB2, 0xB6, 0xE2, 0xB2, 0xB8, 0xE2, + 0xB2, 0xBA, 0xE2, 0xB2, 0xBC, 0xE2, 0xB2, 0xBE, + 0xE2, 0xB3, 0x80, 0xE2, 0xB3, 0x82, 0xE2, 0xB3, + 0x84, 0xE2, 0xB3, 0x86, 0xE2, 0xB3, 0x88, 0xE2, + 0xB3, 0x8A, 0xE2, 0xB3, 0x8C, 0xE2, 0xB3, 0x8E, + 0xE2, 0xB3, 0x90, 0xE2, 0xB3, 0x92, 0xE2, 0xB3, + 0x94, 0xE2, 0xB3, 0x96, 0xE2, 0xB3, 0x98, 0xE2, + 0xB3, 0x9A, 0xE2, 0xB3, 0x9C, 0xE2, 0xB3, 0x9E, + 0xE2, 0xB3, 0xA0, 0xE2, 0xB3, 0xA2, 0xE1, 0x82, + 0xA0, 0xE1, 0x82, 0xA1, 0xE1, 0x82, 0xA2, 0xE1, + 0x82, 0xA3, 0xE1, 0x82, 0xA4, 0xE1, 0x82, 0xA5, + 0xE1, 0x82, 0xA6, 0xE1, 0x82, 0xA7, 0xE1, 0x82, + 0xA8, 0xE1, 0x82, 0xA9, 0xE1, 0x82, 0xAA, 0xE1, + 0x82, 0xAB, 0xE1, 0x82, 0xAC, 0xE1, 0x82, 0xAD, + 0xE1, 0x82, 0xAE, 0xE1, 0x82, 0xAF, 0xE1, 0x82, + 0xB0, 0xE1, 0x82, 0xB1, 0xE1, 0x82, 0xB2, 0xE1, + 0x82, 0xB3, 0xE1, 0x82, 0xB4, 0xE1, 0x82, 0xB5, + 0xE1, 0x82, 0xB6, 0xE1, 0x82, 0xB7, 0xE1, 0x82, + 0xB8, 0xE1, 0x82, 0xB9, 0xE1, 0x82, 0xBA, 0xE1, + 0x82, 0xBB, 0xE1, 0x82, 0xBC, 0xE1, 0x82, 0xBD, + 0xE1, 0x82, 0xBE, 0xE1, 0x82, 0xBF, 0xE1, 0x83, + 0x80, 0xE1, 0x83, 0x81, 0xE1, 0x83, 0x82, 0xE1, + 0x83, 0x83, 0xE1, 0x83, 0x84, 0xE1, 0x83, 0x85, + 0xEF, 0xBC, 0xA1, 0xEF, 0xBC, 0xA2, 0xEF, 0xBC, + 0xA3, 0xEF, 0xBC, 0xA4, 0xEF, 0xBC, 0xA5, 0xEF, + 0xBC, 0xA6, 0xEF, 0xBC, 0xA7, 0xEF, 0xBC, 0xA8, + 0xEF, 0xBC, 0xA9, 0xEF, 0xBC, 0xAA, 0xEF, 0xBC, + 0xAB, 0xEF, 0xBC, 0xAC, 0xEF, 0xBC, 0xAD, 0xEF, + 0xBC, 0xAE, 0xEF, 0xBC, 0xAF, 0xEF, 0xBC, 0xB0, + 0xEF, 0xBC, 0xB1, 0xEF, 0xBC, 0xB2, 0xEF, 0xBC, + 0xB3, 0xEF, 0xBC, 0xB4, 0xEF, 0xBC, 
0xB5, 0xEF, + 0xBC, 0xB6, 0xEF, 0xBC, 0xB7, 0xEF, 0xBC, 0xB8, + 0xEF, 0xBC, 0xB9, 0xEF, 0xBC, 0xBA, 0xF0, 0x90, + 0x90, 0x80, 0xF0, 0x90, 0x90, 0x81, 0xF0, 0x90, + 0x90, 0x82, 0xF0, 0x90, 0x90, 0x83, 0xF0, 0x90, + 0x90, 0x84, 0xF0, 0x90, 0x90, 0x85, 0xF0, 0x90, + 0x90, 0x86, 0xF0, 0x90, 0x90, 0x87, 0xF0, 0x90, + 0x90, 0x88, 0xF0, 0x90, 0x90, 0x89, 0xF0, 0x90, + 0x90, 0x8A, 0xF0, 0x90, 0x90, 0x8B, 0xF0, 0x90, + 0x90, 0x8C, 0xF0, 0x90, 0x90, 0x8D, 0xF0, 0x90, + 0x90, 0x8E, 0xF0, 0x90, 0x90, 0x8F, 0xF0, 0x90, + 0x90, 0x90, 0xF0, 0x90, 0x90, 0x91, 0xF0, 0x90, + 0x90, 0x92, 0xF0, 0x90, 0x90, 0x93, 0xF0, 0x90, + 0x90, 0x94, 0xF0, 0x90, 0x90, 0x95, 0xF0, 0x90, + 0x90, 0x96, 0xF0, 0x90, 0x90, 0x97, 0xF0, 0x90, + 0x90, 0x98, 0xF0, 0x90, 0x90, 0x99, 0xF0, 0x90, + 0x90, 0x9A, 0xF0, 0x90, 0x90, 0x9B, 0xF0, 0x90, + 0x90, 0x9C, 0xF0, 0x90, 0x90, 0x9D, 0xF0, 0x90, + 0x90, 0x9E, 0xF0, 0x90, 0x90, 0x9F, 0xF0, 0x90, + 0x90, 0xA0, 0xF0, 0x90, 0x90, 0xA1, 0xF0, 0x90, + 0x90, 0xA2, 0xF0, 0x90, 0x90, 0xA3, 0xF0, 0x90, + 0x90, 0xA4, 0xF0, 0x90, 0x90, 0xA5, 0xF0, 0x90, + 0x90, 0xA6, 0xF0, 0x90, 0x90, 0xA7, + }, +}; + +#undef N_ +#undef FIL_ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_U8_TEXTPREP_DATA_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/vnode.h b/sys/cddl/contrib/opensolaris/uts/common/sys/vnode.h new file mode 100644 index 000000000000..465d8998d4e2 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/sys/vnode.h @@ -0,0 +1,427 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2017 RackTop Systems. + */ + +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. + */ + +#ifndef _SYS_VNODE_H +#define _SYS_VNODE_H + +#include_next <sys/vnode.h> + +#define IS_DEVVP(vp) \ + ((vp)->v_type == VCHR || (vp)->v_type == VBLK || (vp)->v_type == VFIFO) + +#define V_XATTRDIR 0x0000 /* attribute unnamed directory */ + +#define AV_SCANSTAMP_SZ 32 /* length of anti-virus scanstamp */ + +/* + * Structure of all optional attributes. 
+ */
+typedef struct xoptattr {
+	timestruc_t	xoa_createtime;	/* Create time of file */
+	uint8_t		xoa_archive;
+	uint8_t		xoa_system;
+	uint8_t		xoa_readonly;
+	uint8_t		xoa_hidden;
+	uint8_t		xoa_nounlink;
+	uint8_t		xoa_immutable;
+	uint8_t		xoa_appendonly;
+	uint8_t		xoa_nodump;
+	uint8_t		xoa_opaque;
+	uint8_t		xoa_av_quarantined;
+	uint8_t		xoa_av_modified;
+	uint8_t		xoa_av_scanstamp[AV_SCANSTAMP_SZ];
+	uint8_t		xoa_reparse;
+	uint64_t	xoa_generation;
+	uint8_t		xoa_offline;
+	uint8_t		xoa_sparse;
+} xoptattr_t;
+
+/*
+ * The xvattr structure is really a variable length structure that
+ * is made up of:
+ * - The classic vattr_t (xva_vattr)
+ * - A 32 bit quantity (xva_mapsize) that specifies the size of the
+ *   attribute bitmaps in 32 bit words.
+ * - A pointer to the returned attribute bitmap (needed because the
+ *   previous element, the requested attribute bitmap, is variable length).
+ * - The requested attribute bitmap, which is an array of 32 bit words.
+ *   Callers use the XVA_SET_REQ() macro to set the bits corresponding to
+ *   the attributes that are being requested.
+ * - The returned attribute bitmap, which is an array of 32 bit words.
+ *   File systems that support optional attributes use the XVA_SET_RTN()
+ *   macro to set the bits corresponding to the attributes that are being
+ *   returned.
+ * - The xoptattr_t structure, which contains the attribute values.
+ *
+ * xva_mapsize determines how many words are in the attribute bitmaps.
+ * Immediately following the attribute bitmaps is the xoptattr_t.
+ * xva_getxoptattr() is used to get the pointer to the xoptattr_t
+ * section.
+ */
+
+#define	XVA_MAPSIZE	3		/* Size of attr bitmaps */
+#define	XVA_MAGIC	0x78766174	/* Magic # for verification */
+
+/*
+ * The xvattr structure is an extensible structure which permits optional
+ * attributes to be requested/returned.  File systems may or may not support
+ * optional attributes.  They do so at their own discretion but if they do
+ * support optional attributes, they must register the VFSFT_XVATTR feature
+ * so that the optional attributes can be set/retrieved.
+ *
+ * The fields of the xvattr structure are:
+ *
+ * xva_vattr - The first element of an xvattr is a legacy vattr structure
+ * which includes the common attributes.  If AT_XVATTR is set in the va_mask
+ * then the entire structure is treated as an xvattr.  If AT_XVATTR is not
+ * set, then only the xva_vattr structure can be used.
+ *
+ * xva_magic - 0x78766174 (hex for "xvat").  Magic number for verification.
+ *
+ * xva_mapsize - Size of requested and returned attribute bitmaps.
+ *
+ * xva_rtnattrmapp - Pointer to xva_rtnattrmap[].  We need this since the
+ * size of the array before it, xva_reqattrmap[], could change, which means
+ * the location of xva_rtnattrmap[] could change.  This will allow unbundled
+ * file systems to find the location of xva_rtnattrmap[] when the sizes change.
+ *
+ * xva_reqattrmap[] - Array of requested attributes.  Attributes are
+ * represented by a specific bit in a specific element of the attribute
+ * map array.  Callers set the bits corresponding to the attributes
+ * that the caller wants to get/set.
+ *
+ * xva_rtnattrmap[] - Array of attributes that the file system was able to
+ * process.  Not all file systems support all optional attributes.  This map
+ * informs the caller which attributes the underlying file system was able
+ * to set/get.  (Same structure as the requested attributes array in terms
+ * of each attribute corresponding to specific bits and array elements.)
+ *
+ * xva_xoptattrs - Structure containing values of optional attributes.
+ * These values are only valid if the corresponding bits in xva_reqattrmap
+ * are set and the underlying file system supports those attributes.
+ */
+typedef struct xvattr {
+	vattr_t		xva_vattr;	/* Embedded vattr structure */
+	uint32_t	xva_magic;	/* Magic Number */
+	uint32_t	xva_mapsize;	/* Size of attr bitmap (32-bit words) */
+	uint32_t	*xva_rtnattrmapp;	/* Ptr to xva_rtnattrmap[] */
+	uint32_t	xva_reqattrmap[XVA_MAPSIZE];	/* Requested attrs */
+	uint32_t	xva_rtnattrmap[XVA_MAPSIZE];	/* Returned attrs */
+	xoptattr_t	xva_xoptattrs;	/* Optional attributes */
+} xvattr_t;
+
+/*
+ * Attributes of interest to the caller of setattr or getattr.
+ */
+#define	AT_TYPE		0x00001
+#define	AT_MODE		0x00002
+#define	AT_UID		0x00004
+#define	AT_GID		0x00008
+#define	AT_FSID		0x00010
+#define	AT_NODEID	0x00020
+#define	AT_NLINK	0x00040
+#define	AT_SIZE		0x00080
+#define	AT_ATIME	0x00100
+#define	AT_MTIME	0x00200
+#define	AT_CTIME	0x00400
+#define	AT_RDEV		0x00800
+#define	AT_BLKSIZE	0x01000
+#define	AT_NBLOCKS	0x02000
+/*			0x04000 */	/* unused */
+#define	AT_SEQ		0x08000
+/*
+ * If AT_XVATTR is set then there are additional bits to process in
+ * the xvattr_t's attribute bitmap.  If this is not set then the bitmap
+ * MUST be ignored.  Note that this bit must be set/cleared explicitly.
+ * That is, setting AT_ALL will NOT set AT_XVATTR.
+ */
+#define	AT_XVATTR	0x10000
+
+#define	AT_ALL		(AT_TYPE|AT_MODE|AT_UID|AT_GID|AT_FSID|AT_NODEID|\
+			AT_NLINK|AT_SIZE|AT_ATIME|AT_MTIME|AT_CTIME|\
+			AT_RDEV|AT_BLKSIZE|AT_NBLOCKS|AT_SEQ)
+
+#define	AT_STAT		(AT_MODE|AT_UID|AT_GID|AT_FSID|AT_NODEID|AT_NLINK|\
+			AT_SIZE|AT_ATIME|AT_MTIME|AT_CTIME|AT_RDEV|AT_TYPE)
+
+#define	AT_TIMES	(AT_ATIME|AT_MTIME|AT_CTIME)
+
+#define	AT_NOSET	(AT_NLINK|AT_RDEV|AT_FSID|AT_NODEID|AT_TYPE|\
+			AT_BLKSIZE|AT_NBLOCKS|AT_SEQ)
+
+/*
+ * Attribute bits used in the extensible attribute's (xva's) attribute
+ * bitmaps.  Note that the bitmaps are made up of a variable length number
+ * of 32-bit words.  The convention is to use XAT{n}_{attrname} where "n"
+ * is the element in the bitmap (starting at 0).  This convention is for
+ * the convenience of the maintainer to keep track of which element each
+ * attribute belongs to.
+ *
+ * NOTE THAT CONSUMERS MUST *NOT* USE THE XATn_* DEFINES DIRECTLY.  CONSUMERS
+ * MUST USE THE XAT_* DEFINES.
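+ *
+ * As a worked illustration (an editorial sketch, not part of the original
+ * header; derived from the XAT0_*, XVA_SHFT and XVA_INDEX()/XVA_ATTRBIT()
+ * definitions below): each flat-namespace XAT_* value packs the bitmap
+ * array index into its upper 32 bits and the attribute bit into its
+ * lower 32 bits, so for XAT_CREATETIME
+ *
+ *	XAT_CREATETIME              == (0LL << 32) | 0x00000001
+ *	XVA_INDEX(XAT_CREATETIME)   == 0		(bitmap word 0)
+ *	XVA_ATTRBIT(XAT_CREATETIME) == 0x00000001	(bit 0 of that word)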
+ */
+#define	XAT0_INDEX	0LL		/* Index into bitmap for XAT0 attrs */
+#define	XAT0_CREATETIME	0x00000001	/* Create time of file */
+#define	XAT0_ARCHIVE	0x00000002	/* Archive */
+#define	XAT0_SYSTEM	0x00000004	/* System */
+#define	XAT0_READONLY	0x00000008	/* Readonly */
+#define	XAT0_HIDDEN	0x00000010	/* Hidden */
+#define	XAT0_NOUNLINK	0x00000020	/* Nounlink */
+#define	XAT0_IMMUTABLE	0x00000040	/* immutable */
+#define	XAT0_APPENDONLY	0x00000080	/* appendonly */
+#define	XAT0_NODUMP	0x00000100	/* nodump */
+#define	XAT0_OPAQUE	0x00000200	/* opaque */
+#define	XAT0_AV_QUARANTINED	0x00000400	/* anti-virus quarantine */
+#define	XAT0_AV_MODIFIED	0x00000800	/* anti-virus modified */
+#define	XAT0_AV_SCANSTAMP	0x00001000	/* anti-virus scanstamp */
+#define	XAT0_REPARSE	0x00002000	/* FS reparse point */
+#define	XAT0_GEN	0x00004000	/* object generation number */
+#define	XAT0_OFFLINE	0x00008000	/* offline */
+#define	XAT0_SPARSE	0x00010000	/* sparse */
+
+#define	XAT0_ALL_ATTRS	(XAT0_CREATETIME|XAT0_ARCHIVE|XAT0_SYSTEM| \
+	XAT0_READONLY|XAT0_HIDDEN|XAT0_NOUNLINK|XAT0_IMMUTABLE|XAT0_APPENDONLY| \
+	XAT0_NODUMP|XAT0_OPAQUE|XAT0_AV_QUARANTINED|XAT0_AV_MODIFIED| \
+	XAT0_AV_SCANSTAMP|XAT0_REPARSE|XAT0_GEN|XAT0_OFFLINE|XAT0_SPARSE)
+
+/* Support for XAT_* optional attributes */
+#define	XVA_MASK	0xffffffff	/* Used to mask off 32 bits */
+#define	XVA_SHFT	32		/* Used to shift index */
+
+/*
+ * Used to pry out the index and attribute bits from the XAT_* attributes
+ * defined below.  Note that we're masking things down to 32 bits then
+ * casting to uint32_t.
+ */
+#define	XVA_INDEX(attr)		((uint32_t)(((attr) >> XVA_SHFT) & XVA_MASK))
+#define	XVA_ATTRBIT(attr)	((uint32_t)((attr) & XVA_MASK))
+
+/*
+ * The following defines present a "flat namespace" so that consumers don't
+ * need to keep track of which element belongs to which bitmap entry.
+ *
+ * NOTE THAT THESE MUST NEVER BE OR-ed TOGETHER
+ */
+#define	XAT_CREATETIME		((XAT0_INDEX << XVA_SHFT) | XAT0_CREATETIME)
+#define	XAT_ARCHIVE		((XAT0_INDEX << XVA_SHFT) | XAT0_ARCHIVE)
+#define	XAT_SYSTEM		((XAT0_INDEX << XVA_SHFT) | XAT0_SYSTEM)
+#define	XAT_READONLY		((XAT0_INDEX << XVA_SHFT) | XAT0_READONLY)
+#define	XAT_HIDDEN		((XAT0_INDEX << XVA_SHFT) | XAT0_HIDDEN)
+#define	XAT_NOUNLINK		((XAT0_INDEX << XVA_SHFT) | XAT0_NOUNLINK)
+#define	XAT_IMMUTABLE		((XAT0_INDEX << XVA_SHFT) | XAT0_IMMUTABLE)
+#define	XAT_APPENDONLY		((XAT0_INDEX << XVA_SHFT) | XAT0_APPENDONLY)
+#define	XAT_NODUMP		((XAT0_INDEX << XVA_SHFT) | XAT0_NODUMP)
+#define	XAT_OPAQUE		((XAT0_INDEX << XVA_SHFT) | XAT0_OPAQUE)
+#define	XAT_AV_QUARANTINED	((XAT0_INDEX << XVA_SHFT) | XAT0_AV_QUARANTINED)
+#define	XAT_AV_MODIFIED		((XAT0_INDEX << XVA_SHFT) | XAT0_AV_MODIFIED)
+#define	XAT_AV_SCANSTAMP	((XAT0_INDEX << XVA_SHFT) | XAT0_AV_SCANSTAMP)
+#define	XAT_REPARSE		((XAT0_INDEX << XVA_SHFT) | XAT0_REPARSE)
+#define	XAT_GEN			((XAT0_INDEX << XVA_SHFT) | XAT0_GEN)
+#define	XAT_OFFLINE		((XAT0_INDEX << XVA_SHFT) | XAT0_OFFLINE)
+#define	XAT_SPARSE		((XAT0_INDEX << XVA_SHFT) | XAT0_SPARSE)
+
+/*
+ * The returned attribute map array (xva_rtnattrmap[]) is located past the
+ * requested attribute map array (xva_reqattrmap[]).  Its location changes
+ * when the array sizes change.  We use a separate pointer in a known location
+ * (xva_rtnattrmapp) to hold the location of xva_rtnattrmap[].  This is
+ * set in xva_init().
+ */
+#define	XVA_RTNATTRMAP(xvap)	((xvap)->xva_rtnattrmapp)
+
+/*
+ * XVA_SET_REQ() sets an attribute bit in the proper element in the bitmap
+ * of requested attributes (xva_reqattrmap[]).
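+ *
+ * For example (an editorial sketch rather than part of the original
+ * interface contract; "vp", "cr" and "ct" stand for a vnode, credential
+ * and caller context the caller already holds, and the VOP_GETATTR()
+ * call shape assumed here is the OpenSolaris five-argument form), a
+ * consumer that wants the create time of a file might do:
+ *
+ *	xvattr_t xva;
+ *	xoptattr_t *xoap;
+ *
+ *	xva_init(&xva);			// sets magic, mapsize, AT_XVATTR
+ *	XVA_SET_REQ(&xva, XAT_CREATETIME);
+ *	if (VOP_GETATTR(vp, &xva.xva_vattr, 0, cr, ct) == 0 &&
+ *	    XVA_ISSET_RTN(&xva, XAT_CREATETIME) &&
+ *	    (xoap = xva_getxoptattr(&xva)) != NULL)
+ *		use_time(xoap->xoa_createtime);	// use_time() is hypothetical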
+ */
+#define	XVA_SET_REQ(xvap, attr)	{ \
+	ASSERT((xvap)->xva_vattr.va_mask & AT_XVATTR); \
+	ASSERT((xvap)->xva_magic == XVA_MAGIC); \
+	(xvap)->xva_reqattrmap[XVA_INDEX(attr)] |= XVA_ATTRBIT(attr); \
+}
+/*
+ * XVA_CLR_REQ() clears an attribute bit in the proper element in the bitmap
+ * of requested attributes (xva_reqattrmap[]).
+ */
+#define	XVA_CLR_REQ(xvap, attr)	{ \
+	ASSERT((xvap)->xva_vattr.va_mask & AT_XVATTR); \
+	ASSERT((xvap)->xva_magic == XVA_MAGIC); \
+	(xvap)->xva_reqattrmap[XVA_INDEX(attr)] &= ~XVA_ATTRBIT(attr); \
+}
+
+/*
+ * XVA_SET_RTN() sets an attribute bit in the proper element in the bitmap
+ * of returned attributes (xva_rtnattrmap[]).
+ */
+#define	XVA_SET_RTN(xvap, attr)	{ \
+	ASSERT((xvap)->xva_vattr.va_mask & AT_XVATTR); \
+	ASSERT((xvap)->xva_magic == XVA_MAGIC); \
+	(XVA_RTNATTRMAP(xvap))[XVA_INDEX(attr)] |= XVA_ATTRBIT(attr); \
+}
+
+/*
+ * XVA_ISSET_REQ() checks the requested attribute bitmap (xva_reqattrmap[])
+ * to see if the corresponding attribute bit is set.  If so, returns non-zero.
+ */
+#define	XVA_ISSET_REQ(xvap, attr)	\
+	((((xvap)->xva_vattr.va_mask & AT_XVATTR) && \
+	 ((xvap)->xva_magic == XVA_MAGIC) && \
+	 ((xvap)->xva_mapsize > XVA_INDEX(attr))) ? \
+	((xvap)->xva_reqattrmap[XVA_INDEX(attr)] & XVA_ATTRBIT(attr)) : 0)
+
+/*
+ * XVA_ISSET_RTN() checks the returned attribute bitmap (xva_rtnattrmap[])
+ * to see if the corresponding attribute bit is set.  If so, returns non-zero.
+ */
+#define	XVA_ISSET_RTN(xvap, attr)	\
+	((((xvap)->xva_vattr.va_mask & AT_XVATTR) && \
+	 ((xvap)->xva_magic == XVA_MAGIC) && \
+	 ((xvap)->xva_mapsize > XVA_INDEX(attr))) ? \
+	((XVA_RTNATTRMAP(xvap))[XVA_INDEX(attr)] & XVA_ATTRBIT(attr)) : 0)
+
+#define	MODEMASK	07777		/* mode bits plus permission bits */
+#define	PERMMASK	00777		/* permission bits */
+
+/*
+ * VOP_ACCESS flags
+ */
+#define	V_ACE_MASK	0x1	/* mask represents NFSv4 ACE permissions */
+
+/*
+ * Flags for vnode operations.
+ */
+enum rm		{ RMFILE, RMDIRECTORY };	/* rm or rmdir (remove) */
+enum create	{ CRCREAT, CRMKNOD, CRMKDIR };	/* reason for create */
+
+/*
+ * Structure used on VOP_GETSECATTR and VOP_SETSECATTR operations
+ */
+
+typedef struct vsecattr {
+	uint_t		vsa_mask;	/* See below */
+	int		vsa_aclcnt;	/* ACL entry count */
+	void		*vsa_aclentp;	/* pointer to ACL entries */
+	int		vsa_dfaclcnt;	/* default ACL entry count */
+	void		*vsa_dfaclentp;	/* pointer to default ACL entries */
+	size_t		vsa_aclentsz;	/* ACE size in bytes of vsa_aclentp */
+	uint_t		vsa_aclflags;	/* ACE ACL flags */
+} vsecattr_t;
+
+/* vsa_mask values */
+#define	VSA_ACL			0x0001
+#define	VSA_ACLCNT		0x0002
+#define	VSA_DFACL		0x0004
+#define	VSA_DFACLCNT		0x0008
+#define	VSA_ACE			0x0010
+#define	VSA_ACECNT		0x0020
+#define	VSA_ACE_ALLTYPES	0x0040
+#define	VSA_ACE_ACLFLAGS	0x0080	/* get/set ACE ACL flags */
+
+/*
+ * Structure used by various vnode operations to determine
+ * the context (pid, host, identity) of a caller.
+ *
+ * The cc_caller_id is used to identify one or more callers who invoke
+ * operations, possibly on behalf of others.  For example, the NFS
+ * server could have its own cc_caller_id which can be detected by
+ * vnode/vfs operations or (FEM) monitors on those operations.  New
+ * caller IDs are generated by fs_new_caller_id().
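+ *
+ * As an editorial sketch (the "nfs_caller_id" variable and the helper
+ * below are hypothetical), a vnode operation or FEM monitor that wants
+ * to special-case requests issued by the NFS server could compare IDs:
+ *
+ *	if (ct != NULL && ct->cc_caller_id == nfs_caller_id)
+ *		handle_nfs_request();	// ID obtained from fs_new_caller_id()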
+ */
+typedef struct caller_context {
+	pid_t		cc_pid;		/* Process ID of the caller */
+	int		cc_sysid;	/* System ID, used for remote calls */
+	u_longlong_t	cc_caller_id;	/* Identifier for (set of) caller(s) */
+	ulong_t		cc_flags;
+} caller_context_t;
+
+struct taskq;
+
+/*
+ * Flags for VOP_LOOKUP
+ *
+ * In addition to the flags below, FIGNORECASE and FSEARCH (defined in
+ * file.h) are also possible.
+ */
+#define	LOOKUP_DIR		0x01	/* want parent dir vp */
+#define	LOOKUP_XATTR		0x02	/* look up extended attr dir */
+#define	CREATE_XATTR_DIR	0x04	/* Create extended attr dir */
+#define	LOOKUP_HAVE_SYSATTR_DIR	0x08	/* Already created virtual GFS dir */
+
+/*
+ * Flags for VOP_READDIR
+ */
+#define	V_RDDIR_ENTFLAGS	0x01	/* request dirent flags */
+#define	V_RDDIR_ACCFILTER	0x02	/* filter out inaccessible dirents */
+
+/*
+ * Public vnode manipulation functions.
+ */
+#ifdef	_KERNEL
+
+void	vn_rele_async(struct vnode *vp, struct taskq *taskq);
+
+/*
+ * Extensible vnode attribute (xva) routines:
+ * xva_init() initializes an xvattr_t (zero struct, init mapsize, set AT_XVATTR)
+ * xva_getxoptattr() returns a pointer to the xoptattr_t section of xvattr_t
+ */
+void		xva_init(xvattr_t *);
+xoptattr_t	*xva_getxoptattr(xvattr_t *);	/* Get ptr to xoptattr_t */
+
+#define	VN_RELE_ASYNC(vp, taskq)	{ \
+	vn_rele_async(vp, taskq); \
+}
+
+#endif	/* _KERNEL */
+
+/*
+ * Flags to VOP_SETATTR/VOP_GETATTR.
+ */
+#define	ATTR_UTIME	0x01	/* non-default utime(2) request */
+#define	ATTR_EXEC	0x02	/* invocation from exec(2) */
+#define	ATTR_COMM	0x04	/* yield common vp attributes */
+#define	ATTR_HINT	0x08	/* information returned will be `hint' */
+#define	ATTR_REAL	0x10	/* yield attributes of the real vp */
+#define	ATTR_NOACLCHECK	0x20	/* Don't check ACL when checking permissions */
+#define	ATTR_TRIGGER	0x40	/* Mount first if vnode is a trigger mount */
+
+#endif	/* _SYS_VNODE_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/zmod.h b/sys/cddl/contrib/opensolaris/uts/common/sys/zmod.h
new file mode 100644
index 000000000000..ba0267203ce3
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/zmod.h
@@ -0,0 +1,68 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef	_ZMOD_H
+#define	_ZMOD_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#ifdef	__cplusplus
extern "C" {
+#endif
+
+/*
+ * zmod - RFC-1950-compatible decompression routines
+ *
+ * This file provides the public interfaces to zmod, an in-kernel RFC 1950
+ * decompression library.  More information about the implementation of these
+ * interfaces can be found in the usr/src/uts/common/zmod/ directory.
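+ *
+ * A minimal usage sketch (editorial illustration; "dstbuf", "src" and
+ * "srclen" are hypothetical caller-supplied names, and z_uncompress() is
+ * assumed to follow the convention of zlib's uncompress(), where *dstlen
+ * holds the output buffer size on entry and the decompressed size on
+ * successful return):
+ *
+ *	size_t dstlen = sizeof (dstbuf);
+ *
+ *	if (z_uncompress(dstbuf, &dstlen, src, srclen) != Z_OK)
+ *		return (EIO);	// bad or truncated RFC 1950 stream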
+ */ + +#define Z_OK 0 +#define Z_STREAM_END 1 +#define Z_NEED_DICT 2 +#define Z_ERRNO (-1) +#define Z_STREAM_ERROR (-2) +#define Z_DATA_ERROR (-3) +#define Z_MEM_ERROR (-4) +#define Z_BUF_ERROR (-5) +#define Z_VERSION_ERROR (-6) + +#define Z_NO_COMPRESSION 0 +#define Z_BEST_SPEED 1 +#define Z_BEST_COMPRESSION 9 +#define Z_DEFAULT_COMPRESSION (-1) + +extern int z_uncompress(void *, size_t *, const void *, size_t); +extern int z_compress(void *, size_t *, const void *, size_t); +extern int z_compress_level(void *, size_t *, const void *, size_t, int); +extern const char *z_strerror(int); + +#ifdef __cplusplus +} +#endif + +#endif /* _ZMOD_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/zmod/adler32.c b/sys/cddl/contrib/opensolaris/uts/common/zmod/adler32.c new file mode 100644 index 000000000000..59d84632ed5d --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/zmod/adler32.c @@ -0,0 +1,149 @@ +/* adler32.c -- compute the Adler-32 checksum of a data stream + * Copyright (C) 1995-2004 Mark Adler + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#define ZLIB_INTERNAL +#include "zlib.h" + +#define BASE 65521UL /* largest prime smaller than 65536 */ +#define NMAX 5552 +/* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */ + +#define DO1(buf,i) {adler += (buf)[i]; sum2 += adler;} +#define DO2(buf,i) DO1(buf,i); DO1(buf,i+1); +#define DO4(buf,i) DO2(buf,i); DO2(buf,i+2); +#define DO8(buf,i) DO4(buf,i); DO4(buf,i+4); +#define DO16(buf) DO8(buf,0); DO8(buf,8); + +/* use NO_DIVIDE if your processor does not do division in hardware */ +#ifdef NO_DIVIDE +# define MOD(a) \ + do { \ + if (a >= (BASE << 16)) a -= (BASE << 16); \ + if (a >= (BASE << 15)) a -= (BASE << 15); \ + if (a >= (BASE << 14)) a -= (BASE << 14); \ + if (a >= (BASE << 13)) a -= (BASE << 13); \ + if (a >= (BASE << 12)) a -= (BASE << 12); \ + if (a >= (BASE << 11)) a -= (BASE << 11); \ + if (a >= (BASE << 10)) a -= (BASE << 10); \ + if (a >= (BASE << 9)) a -= (BASE << 9); \ + if (a >= (BASE << 8)) a -= (BASE << 8); \ + if (a >= (BASE << 7)) a -= (BASE << 7); \ + if (a >= (BASE << 6)) a -= (BASE << 6); \ + if (a >= (BASE << 5)) a -= (BASE << 5); \ + if (a >= (BASE << 4)) a -= (BASE << 4); \ + if (a >= (BASE << 3)) a -= (BASE << 3); \ + if (a >= (BASE << 2)) a -= (BASE << 2); \ + if (a >= (BASE << 1)) a -= (BASE << 1); \ + if (a >= BASE) a -= BASE; \ + } while (0) +# define MOD4(a) \ + do { \ + if (a >= (BASE << 4)) a -= (BASE << 4); \ + if (a >= (BASE << 3)) a -= (BASE << 3); \ + if (a >= (BASE << 2)) a -= (BASE << 2); \ + if (a >= (BASE << 1)) a -= (BASE << 1); \ + if (a >= BASE) a -= BASE; \ + } while (0) +#else +# define MOD(a) a %= BASE +# define MOD4(a) a %= BASE +#endif + +/* ========================================================================= */ +uLong ZEXPORT adler32(adler, buf, len) + uLong adler; + const Bytef *buf; + uInt len; +{ + unsigned long sum2; + unsigned n; + + /* split Adler-32 into component sums */ + sum2 = (adler >> 16) & 0xffff; + adler &= 0xffff; + + /* in case user likes doing a byte at a time, keep it fast */ + if (len == 1) { + adler += buf[0]; + if (adler >= BASE) + adler -= BASE; + sum2 += adler; + if (sum2 >= BASE) + sum2 -= BASE; + return adler | (sum2 << 16); + } + + /* initial Adler-32 value (deferred check for len == 1 speed) */ + if (buf == Z_NULL) + return 1L; + + /* in case short lengths are provided, keep it somewhat fast */ + if (len < 16) { + while (len--) { + adler += *buf++; + sum2 
+= adler; + } + if (adler >= BASE) + adler -= BASE; + MOD4(sum2); /* only added so many BASE's */ + return adler | (sum2 << 16); + } + + /* do length NMAX blocks -- requires just one modulo operation */ + while (len >= NMAX) { + len -= NMAX; + n = NMAX / 16; /* NMAX is divisible by 16 */ + do { + DO16(buf); /* 16 sums unrolled */ + buf += 16; + } while (--n); + MOD(adler); + MOD(sum2); + } + + /* do remaining bytes (less than NMAX, still just one modulo) */ + if (len) { /* avoid modulos if none remaining */ + while (len >= 16) { + len -= 16; + DO16(buf); + buf += 16; + } + while (len--) { + adler += *buf++; + sum2 += adler; + } + MOD(adler); + MOD(sum2); + } + + /* return recombined sums */ + return adler | (sum2 << 16); +} + +/* ========================================================================= */ +uLong ZEXPORT adler32_combine(adler1, adler2, len2) + uLong adler1; + uLong adler2; + z_off_t len2; +{ + unsigned long sum1; + unsigned long sum2; + unsigned rem; + + /* the derivation of this formula is left as an exercise for the reader */ + rem = (unsigned)(len2 % BASE); + sum1 = adler1 & 0xffff; + sum2 = rem * sum1; + MOD(sum2); + sum1 += (adler2 & 0xffff) + BASE - 1; + sum2 += ((adler1 >> 16) & 0xffff) + ((adler2 >> 16) & 0xffff) + BASE - rem; + if (sum1 > BASE) sum1 -= BASE; + if (sum1 > BASE) sum1 -= BASE; + if (sum2 > (BASE << 1)) sum2 -= (BASE << 1); + if (sum2 > BASE) sum2 -= BASE; + return sum1 | (sum2 << 16); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/zmod/crc32.h b/sys/cddl/contrib/opensolaris/uts/common/zmod/crc32.h new file mode 100644 index 000000000000..495c83e03179 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/zmod/crc32.h @@ -0,0 +1,443 @@ +/* crc32.h -- tables for rapid CRC calculation + * Generated automatically by crc32.c + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +local const unsigned long FAR crc_table[TBLS][256] = +{ + { + 0x00000000UL, 0x77073096UL, 0xee0e612cUL, 0x990951baUL, 0x076dc419UL, + 0x706af48fUL, 0xe963a535UL, 0x9e6495a3UL, 0x0edb8832UL, 0x79dcb8a4UL, + 0xe0d5e91eUL, 0x97d2d988UL, 0x09b64c2bUL, 0x7eb17cbdUL, 0xe7b82d07UL, + 0x90bf1d91UL, 0x1db71064UL, 0x6ab020f2UL, 0xf3b97148UL, 0x84be41deUL, + 0x1adad47dUL, 0x6ddde4ebUL, 0xf4d4b551UL, 0x83d385c7UL, 0x136c9856UL, + 0x646ba8c0UL, 0xfd62f97aUL, 0x8a65c9ecUL, 0x14015c4fUL, 0x63066cd9UL, + 0xfa0f3d63UL, 0x8d080df5UL, 0x3b6e20c8UL, 0x4c69105eUL, 0xd56041e4UL, + 0xa2677172UL, 0x3c03e4d1UL, 0x4b04d447UL, 0xd20d85fdUL, 0xa50ab56bUL, + 0x35b5a8faUL, 0x42b2986cUL, 0xdbbbc9d6UL, 0xacbcf940UL, 0x32d86ce3UL, + 0x45df5c75UL, 0xdcd60dcfUL, 0xabd13d59UL, 0x26d930acUL, 0x51de003aUL, + 0xc8d75180UL, 0xbfd06116UL, 0x21b4f4b5UL, 0x56b3c423UL, 0xcfba9599UL, + 0xb8bda50fUL, 0x2802b89eUL, 0x5f058808UL, 0xc60cd9b2UL, 0xb10be924UL, + 0x2f6f7c87UL, 0x58684c11UL, 0xc1611dabUL, 0xb6662d3dUL, 0x76dc4190UL, + 0x01db7106UL, 0x98d220bcUL, 0xefd5102aUL, 0x71b18589UL, 0x06b6b51fUL, + 0x9fbfe4a5UL, 0xe8b8d433UL, 0x7807c9a2UL, 0x0f00f934UL, 0x9609a88eUL, + 0xe10e9818UL, 0x7f6a0dbbUL, 0x086d3d2dUL, 0x91646c97UL, 0xe6635c01UL, + 0x6b6b51f4UL, 0x1c6c6162UL, 0x856530d8UL, 0xf262004eUL, 0x6c0695edUL, + 0x1b01a57bUL, 0x8208f4c1UL, 0xf50fc457UL, 0x65b0d9c6UL, 0x12b7e950UL, + 0x8bbeb8eaUL, 0xfcb9887cUL, 0x62dd1ddfUL, 0x15da2d49UL, 0x8cd37cf3UL, + 0xfbd44c65UL, 0x4db26158UL, 0x3ab551ceUL, 0xa3bc0074UL, 0xd4bb30e2UL, + 0x4adfa541UL, 0x3dd895d7UL, 0xa4d1c46dUL, 0xd3d6f4fbUL, 0x4369e96aUL, + 0x346ed9fcUL, 0xad678846UL, 0xda60b8d0UL, 0x44042d73UL, 0x33031de5UL, + 0xaa0a4c5fUL, 0xdd0d7cc9UL, 0x5005713cUL, 0x270241aaUL, 
0xbe0b1010UL, + 0xc90c2086UL, 0x5768b525UL, 0x206f85b3UL, 0xb966d409UL, 0xce61e49fUL, + 0x5edef90eUL, 0x29d9c998UL, 0xb0d09822UL, 0xc7d7a8b4UL, 0x59b33d17UL, + 0x2eb40d81UL, 0xb7bd5c3bUL, 0xc0ba6cadUL, 0xedb88320UL, 0x9abfb3b6UL, + 0x03b6e20cUL, 0x74b1d29aUL, 0xead54739UL, 0x9dd277afUL, 0x04db2615UL, + 0x73dc1683UL, 0xe3630b12UL, 0x94643b84UL, 0x0d6d6a3eUL, 0x7a6a5aa8UL, + 0xe40ecf0bUL, 0x9309ff9dUL, 0x0a00ae27UL, 0x7d079eb1UL, 0xf00f9344UL, + 0x8708a3d2UL, 0x1e01f268UL, 0x6906c2feUL, 0xf762575dUL, 0x806567cbUL, + 0x196c3671UL, 0x6e6b06e7UL, 0xfed41b76UL, 0x89d32be0UL, 0x10da7a5aUL, + 0x67dd4accUL, 0xf9b9df6fUL, 0x8ebeeff9UL, 0x17b7be43UL, 0x60b08ed5UL, + 0xd6d6a3e8UL, 0xa1d1937eUL, 0x38d8c2c4UL, 0x4fdff252UL, 0xd1bb67f1UL, + 0xa6bc5767UL, 0x3fb506ddUL, 0x48b2364bUL, 0xd80d2bdaUL, 0xaf0a1b4cUL, + 0x36034af6UL, 0x41047a60UL, 0xdf60efc3UL, 0xa867df55UL, 0x316e8eefUL, + 0x4669be79UL, 0xcb61b38cUL, 0xbc66831aUL, 0x256fd2a0UL, 0x5268e236UL, + 0xcc0c7795UL, 0xbb0b4703UL, 0x220216b9UL, 0x5505262fUL, 0xc5ba3bbeUL, + 0xb2bd0b28UL, 0x2bb45a92UL, 0x5cb36a04UL, 0xc2d7ffa7UL, 0xb5d0cf31UL, + 0x2cd99e8bUL, 0x5bdeae1dUL, 0x9b64c2b0UL, 0xec63f226UL, 0x756aa39cUL, + 0x026d930aUL, 0x9c0906a9UL, 0xeb0e363fUL, 0x72076785UL, 0x05005713UL, + 0x95bf4a82UL, 0xe2b87a14UL, 0x7bb12baeUL, 0x0cb61b38UL, 0x92d28e9bUL, + 0xe5d5be0dUL, 0x7cdcefb7UL, 0x0bdbdf21UL, 0x86d3d2d4UL, 0xf1d4e242UL, + 0x68ddb3f8UL, 0x1fda836eUL, 0x81be16cdUL, 0xf6b9265bUL, 0x6fb077e1UL, + 0x18b74777UL, 0x88085ae6UL, 0xff0f6a70UL, 0x66063bcaUL, 0x11010b5cUL, + 0x8f659effUL, 0xf862ae69UL, 0x616bffd3UL, 0x166ccf45UL, 0xa00ae278UL, + 0xd70dd2eeUL, 0x4e048354UL, 0x3903b3c2UL, 0xa7672661UL, 0xd06016f7UL, + 0x4969474dUL, 0x3e6e77dbUL, 0xaed16a4aUL, 0xd9d65adcUL, 0x40df0b66UL, + 0x37d83bf0UL, 0xa9bcae53UL, 0xdebb9ec5UL, 0x47b2cf7fUL, 0x30b5ffe9UL, + 0xbdbdf21cUL, 0xcabac28aUL, 0x53b39330UL, 0x24b4a3a6UL, 0xbad03605UL, + 0xcdd70693UL, 0x54de5729UL, 0x23d967bfUL, 0xb3667a2eUL, 0xc4614ab8UL, + 0x5d681b02UL, 0x2a6f2b94UL, 0xb40bbe37UL, 0xc30c8ea1UL, 0x5a05df1bUL, + 0x2d02ef8dUL +#ifdef BYFOUR + }, + { + 0x00000000UL, 0x191b3141UL, 0x32366282UL, 0x2b2d53c3UL, 0x646cc504UL, + 0x7d77f445UL, 0x565aa786UL, 0x4f4196c7UL, 0xc8d98a08UL, 0xd1c2bb49UL, + 0xfaefe88aUL, 0xe3f4d9cbUL, 0xacb54f0cUL, 0xb5ae7e4dUL, 0x9e832d8eUL, + 0x87981ccfUL, 0x4ac21251UL, 0x53d92310UL, 0x78f470d3UL, 0x61ef4192UL, + 0x2eaed755UL, 0x37b5e614UL, 0x1c98b5d7UL, 0x05838496UL, 0x821b9859UL, + 0x9b00a918UL, 0xb02dfadbUL, 0xa936cb9aUL, 0xe6775d5dUL, 0xff6c6c1cUL, + 0xd4413fdfUL, 0xcd5a0e9eUL, 0x958424a2UL, 0x8c9f15e3UL, 0xa7b24620UL, + 0xbea97761UL, 0xf1e8e1a6UL, 0xe8f3d0e7UL, 0xc3de8324UL, 0xdac5b265UL, + 0x5d5daeaaUL, 0x44469febUL, 0x6f6bcc28UL, 0x7670fd69UL, 0x39316baeUL, + 0x202a5aefUL, 0x0b07092cUL, 0x121c386dUL, 0xdf4636f3UL, 0xc65d07b2UL, + 0xed705471UL, 0xf46b6530UL, 0xbb2af3f7UL, 0xa231c2b6UL, 0x891c9175UL, + 0x9007a034UL, 0x179fbcfbUL, 0x0e848dbaUL, 0x25a9de79UL, 0x3cb2ef38UL, + 0x73f379ffUL, 0x6ae848beUL, 0x41c51b7dUL, 0x58de2a3cUL, 0xf0794f05UL, + 0xe9627e44UL, 0xc24f2d87UL, 0xdb541cc6UL, 0x94158a01UL, 0x8d0ebb40UL, + 0xa623e883UL, 0xbf38d9c2UL, 0x38a0c50dUL, 0x21bbf44cUL, 0x0a96a78fUL, + 0x138d96ceUL, 0x5ccc0009UL, 0x45d73148UL, 0x6efa628bUL, 0x77e153caUL, + 0xbabb5d54UL, 0xa3a06c15UL, 0x888d3fd6UL, 0x91960e97UL, 0xded79850UL, + 0xc7cca911UL, 0xece1fad2UL, 0xf5facb93UL, 0x7262d75cUL, 0x6b79e61dUL, + 0x4054b5deUL, 0x594f849fUL, 0x160e1258UL, 0x0f152319UL, 0x243870daUL, + 0x3d23419bUL, 0x65fd6ba7UL, 0x7ce65ae6UL, 0x57cb0925UL, 0x4ed03864UL, + 0x0191aea3UL, 0x188a9fe2UL, 0x33a7cc21UL, 
0x2abcfd60UL, 0xad24e1afUL, + 0xb43fd0eeUL, 0x9f12832dUL, 0x8609b26cUL, 0xc94824abUL, 0xd05315eaUL, + 0xfb7e4629UL, 0xe2657768UL, 0x2f3f79f6UL, 0x362448b7UL, 0x1d091b74UL, + 0x04122a35UL, 0x4b53bcf2UL, 0x52488db3UL, 0x7965de70UL, 0x607eef31UL, + 0xe7e6f3feUL, 0xfefdc2bfUL, 0xd5d0917cUL, 0xcccba03dUL, 0x838a36faUL, + 0x9a9107bbUL, 0xb1bc5478UL, 0xa8a76539UL, 0x3b83984bUL, 0x2298a90aUL, + 0x09b5fac9UL, 0x10aecb88UL, 0x5fef5d4fUL, 0x46f46c0eUL, 0x6dd93fcdUL, + 0x74c20e8cUL, 0xf35a1243UL, 0xea412302UL, 0xc16c70c1UL, 0xd8774180UL, + 0x9736d747UL, 0x8e2de606UL, 0xa500b5c5UL, 0xbc1b8484UL, 0x71418a1aUL, + 0x685abb5bUL, 0x4377e898UL, 0x5a6cd9d9UL, 0x152d4f1eUL, 0x0c367e5fUL, + 0x271b2d9cUL, 0x3e001cddUL, 0xb9980012UL, 0xa0833153UL, 0x8bae6290UL, + 0x92b553d1UL, 0xddf4c516UL, 0xc4eff457UL, 0xefc2a794UL, 0xf6d996d5UL, + 0xae07bce9UL, 0xb71c8da8UL, 0x9c31de6bUL, 0x852aef2aUL, 0xca6b79edUL, + 0xd37048acUL, 0xf85d1b6fUL, 0xe1462a2eUL, 0x66de36e1UL, 0x7fc507a0UL, + 0x54e85463UL, 0x4df36522UL, 0x02b2f3e5UL, 0x1ba9c2a4UL, 0x30849167UL, + 0x299fa026UL, 0xe4c5aeb8UL, 0xfdde9ff9UL, 0xd6f3cc3aUL, 0xcfe8fd7bUL, + 0x80a96bbcUL, 0x99b25afdUL, 0xb29f093eUL, 0xab84387fUL, 0x2c1c24b0UL, + 0x350715f1UL, 0x1e2a4632UL, 0x07317773UL, 0x4870e1b4UL, 0x516bd0f5UL, + 0x7a468336UL, 0x635db277UL, 0xcbfad74eUL, 0xd2e1e60fUL, 0xf9ccb5ccUL, + 0xe0d7848dUL, 0xaf96124aUL, 0xb68d230bUL, 0x9da070c8UL, 0x84bb4189UL, + 0x03235d46UL, 0x1a386c07UL, 0x31153fc4UL, 0x280e0e85UL, 0x674f9842UL, + 0x7e54a903UL, 0x5579fac0UL, 0x4c62cb81UL, 0x8138c51fUL, 0x9823f45eUL, + 0xb30ea79dUL, 0xaa1596dcUL, 0xe554001bUL, 0xfc4f315aUL, 0xd7626299UL, + 0xce7953d8UL, 0x49e14f17UL, 0x50fa7e56UL, 0x7bd72d95UL, 0x62cc1cd4UL, + 0x2d8d8a13UL, 0x3496bb52UL, 0x1fbbe891UL, 0x06a0d9d0UL, 0x5e7ef3ecUL, + 0x4765c2adUL, 0x6c48916eUL, 0x7553a02fUL, 0x3a1236e8UL, 0x230907a9UL, + 0x0824546aUL, 0x113f652bUL, 0x96a779e4UL, 0x8fbc48a5UL, 0xa4911b66UL, + 0xbd8a2a27UL, 0xf2cbbce0UL, 0xebd08da1UL, 0xc0fdde62UL, 0xd9e6ef23UL, + 0x14bce1bdUL, 0x0da7d0fcUL, 0x268a833fUL, 0x3f91b27eUL, 0x70d024b9UL, + 0x69cb15f8UL, 0x42e6463bUL, 0x5bfd777aUL, 0xdc656bb5UL, 0xc57e5af4UL, + 0xee530937UL, 0xf7483876UL, 0xb809aeb1UL, 0xa1129ff0UL, 0x8a3fcc33UL, + 0x9324fd72UL + }, + { + 0x00000000UL, 0x01c26a37UL, 0x0384d46eUL, 0x0246be59UL, 0x0709a8dcUL, + 0x06cbc2ebUL, 0x048d7cb2UL, 0x054f1685UL, 0x0e1351b8UL, 0x0fd13b8fUL, + 0x0d9785d6UL, 0x0c55efe1UL, 0x091af964UL, 0x08d89353UL, 0x0a9e2d0aUL, + 0x0b5c473dUL, 0x1c26a370UL, 0x1de4c947UL, 0x1fa2771eUL, 0x1e601d29UL, + 0x1b2f0bacUL, 0x1aed619bUL, 0x18abdfc2UL, 0x1969b5f5UL, 0x1235f2c8UL, + 0x13f798ffUL, 0x11b126a6UL, 0x10734c91UL, 0x153c5a14UL, 0x14fe3023UL, + 0x16b88e7aUL, 0x177ae44dUL, 0x384d46e0UL, 0x398f2cd7UL, 0x3bc9928eUL, + 0x3a0bf8b9UL, 0x3f44ee3cUL, 0x3e86840bUL, 0x3cc03a52UL, 0x3d025065UL, + 0x365e1758UL, 0x379c7d6fUL, 0x35dac336UL, 0x3418a901UL, 0x3157bf84UL, + 0x3095d5b3UL, 0x32d36beaUL, 0x331101ddUL, 0x246be590UL, 0x25a98fa7UL, + 0x27ef31feUL, 0x262d5bc9UL, 0x23624d4cUL, 0x22a0277bUL, 0x20e69922UL, + 0x2124f315UL, 0x2a78b428UL, 0x2bbade1fUL, 0x29fc6046UL, 0x283e0a71UL, + 0x2d711cf4UL, 0x2cb376c3UL, 0x2ef5c89aUL, 0x2f37a2adUL, 0x709a8dc0UL, + 0x7158e7f7UL, 0x731e59aeUL, 0x72dc3399UL, 0x7793251cUL, 0x76514f2bUL, + 0x7417f172UL, 0x75d59b45UL, 0x7e89dc78UL, 0x7f4bb64fUL, 0x7d0d0816UL, + 0x7ccf6221UL, 0x798074a4UL, 0x78421e93UL, 0x7a04a0caUL, 0x7bc6cafdUL, + 0x6cbc2eb0UL, 0x6d7e4487UL, 0x6f38fadeUL, 0x6efa90e9UL, 0x6bb5866cUL, + 0x6a77ec5bUL, 0x68315202UL, 0x69f33835UL, 0x62af7f08UL, 0x636d153fUL, + 0x612bab66UL, 0x60e9c151UL, 0x65a6d7d4UL, 
0x6464bde3UL, 0x662203baUL, + 0x67e0698dUL, 0x48d7cb20UL, 0x4915a117UL, 0x4b531f4eUL, 0x4a917579UL, + 0x4fde63fcUL, 0x4e1c09cbUL, 0x4c5ab792UL, 0x4d98dda5UL, 0x46c49a98UL, + 0x4706f0afUL, 0x45404ef6UL, 0x448224c1UL, 0x41cd3244UL, 0x400f5873UL, + 0x4249e62aUL, 0x438b8c1dUL, 0x54f16850UL, 0x55330267UL, 0x5775bc3eUL, + 0x56b7d609UL, 0x53f8c08cUL, 0x523aaabbUL, 0x507c14e2UL, 0x51be7ed5UL, + 0x5ae239e8UL, 0x5b2053dfUL, 0x5966ed86UL, 0x58a487b1UL, 0x5deb9134UL, + 0x5c29fb03UL, 0x5e6f455aUL, 0x5fad2f6dUL, 0xe1351b80UL, 0xe0f771b7UL, + 0xe2b1cfeeUL, 0xe373a5d9UL, 0xe63cb35cUL, 0xe7fed96bUL, 0xe5b86732UL, + 0xe47a0d05UL, 0xef264a38UL, 0xeee4200fUL, 0xeca29e56UL, 0xed60f461UL, + 0xe82fe2e4UL, 0xe9ed88d3UL, 0xebab368aUL, 0xea695cbdUL, 0xfd13b8f0UL, + 0xfcd1d2c7UL, 0xfe976c9eUL, 0xff5506a9UL, 0xfa1a102cUL, 0xfbd87a1bUL, + 0xf99ec442UL, 0xf85cae75UL, 0xf300e948UL, 0xf2c2837fUL, 0xf0843d26UL, + 0xf1465711UL, 0xf4094194UL, 0xf5cb2ba3UL, 0xf78d95faUL, 0xf64fffcdUL, + 0xd9785d60UL, 0xd8ba3757UL, 0xdafc890eUL, 0xdb3ee339UL, 0xde71f5bcUL, + 0xdfb39f8bUL, 0xddf521d2UL, 0xdc374be5UL, 0xd76b0cd8UL, 0xd6a966efUL, + 0xd4efd8b6UL, 0xd52db281UL, 0xd062a404UL, 0xd1a0ce33UL, 0xd3e6706aUL, + 0xd2241a5dUL, 0xc55efe10UL, 0xc49c9427UL, 0xc6da2a7eUL, 0xc7184049UL, + 0xc25756ccUL, 0xc3953cfbUL, 0xc1d382a2UL, 0xc011e895UL, 0xcb4dafa8UL, + 0xca8fc59fUL, 0xc8c97bc6UL, 0xc90b11f1UL, 0xcc440774UL, 0xcd866d43UL, + 0xcfc0d31aUL, 0xce02b92dUL, 0x91af9640UL, 0x906dfc77UL, 0x922b422eUL, + 0x93e92819UL, 0x96a63e9cUL, 0x976454abUL, 0x9522eaf2UL, 0x94e080c5UL, + 0x9fbcc7f8UL, 0x9e7eadcfUL, 0x9c381396UL, 0x9dfa79a1UL, 0x98b56f24UL, + 0x99770513UL, 0x9b31bb4aUL, 0x9af3d17dUL, 0x8d893530UL, 0x8c4b5f07UL, + 0x8e0de15eUL, 0x8fcf8b69UL, 0x8a809decUL, 0x8b42f7dbUL, 0x89044982UL, + 0x88c623b5UL, 0x839a6488UL, 0x82580ebfUL, 0x801eb0e6UL, 0x81dcdad1UL, + 0x8493cc54UL, 0x8551a663UL, 0x8717183aUL, 0x86d5720dUL, 0xa9e2d0a0UL, + 0xa820ba97UL, 0xaa6604ceUL, 0xaba46ef9UL, 0xaeeb787cUL, 0xaf29124bUL, + 0xad6fac12UL, 0xacadc625UL, 0xa7f18118UL, 0xa633eb2fUL, 0xa4755576UL, + 0xa5b73f41UL, 0xa0f829c4UL, 0xa13a43f3UL, 0xa37cfdaaUL, 0xa2be979dUL, + 0xb5c473d0UL, 0xb40619e7UL, 0xb640a7beUL, 0xb782cd89UL, 0xb2cddb0cUL, + 0xb30fb13bUL, 0xb1490f62UL, 0xb08b6555UL, 0xbbd72268UL, 0xba15485fUL, + 0xb853f606UL, 0xb9919c31UL, 0xbcde8ab4UL, 0xbd1ce083UL, 0xbf5a5edaUL, + 0xbe9834edUL + }, + { + 0x00000000UL, 0xb8bc6765UL, 0xaa09c88bUL, 0x12b5afeeUL, 0x8f629757UL, + 0x37def032UL, 0x256b5fdcUL, 0x9dd738b9UL, 0xc5b428efUL, 0x7d084f8aUL, + 0x6fbde064UL, 0xd7018701UL, 0x4ad6bfb8UL, 0xf26ad8ddUL, 0xe0df7733UL, + 0x58631056UL, 0x5019579fUL, 0xe8a530faUL, 0xfa109f14UL, 0x42acf871UL, + 0xdf7bc0c8UL, 0x67c7a7adUL, 0x75720843UL, 0xcdce6f26UL, 0x95ad7f70UL, + 0x2d111815UL, 0x3fa4b7fbUL, 0x8718d09eUL, 0x1acfe827UL, 0xa2738f42UL, + 0xb0c620acUL, 0x087a47c9UL, 0xa032af3eUL, 0x188ec85bUL, 0x0a3b67b5UL, + 0xb28700d0UL, 0x2f503869UL, 0x97ec5f0cUL, 0x8559f0e2UL, 0x3de59787UL, + 0x658687d1UL, 0xdd3ae0b4UL, 0xcf8f4f5aUL, 0x7733283fUL, 0xeae41086UL, + 0x525877e3UL, 0x40edd80dUL, 0xf851bf68UL, 0xf02bf8a1UL, 0x48979fc4UL, + 0x5a22302aUL, 0xe29e574fUL, 0x7f496ff6UL, 0xc7f50893UL, 0xd540a77dUL, + 0x6dfcc018UL, 0x359fd04eUL, 0x8d23b72bUL, 0x9f9618c5UL, 0x272a7fa0UL, + 0xbafd4719UL, 0x0241207cUL, 0x10f48f92UL, 0xa848e8f7UL, 0x9b14583dUL, + 0x23a83f58UL, 0x311d90b6UL, 0x89a1f7d3UL, 0x1476cf6aUL, 0xaccaa80fUL, + 0xbe7f07e1UL, 0x06c36084UL, 0x5ea070d2UL, 0xe61c17b7UL, 0xf4a9b859UL, + 0x4c15df3cUL, 0xd1c2e785UL, 0x697e80e0UL, 0x7bcb2f0eUL, 0xc377486bUL, + 0xcb0d0fa2UL, 0x73b168c7UL, 0x6104c729UL, 
0xd9b8a04cUL, 0x446f98f5UL, + 0xfcd3ff90UL, 0xee66507eUL, 0x56da371bUL, 0x0eb9274dUL, 0xb6054028UL, + 0xa4b0efc6UL, 0x1c0c88a3UL, 0x81dbb01aUL, 0x3967d77fUL, 0x2bd27891UL, + 0x936e1ff4UL, 0x3b26f703UL, 0x839a9066UL, 0x912f3f88UL, 0x299358edUL, + 0xb4446054UL, 0x0cf80731UL, 0x1e4da8dfUL, 0xa6f1cfbaUL, 0xfe92dfecUL, + 0x462eb889UL, 0x549b1767UL, 0xec277002UL, 0x71f048bbUL, 0xc94c2fdeUL, + 0xdbf98030UL, 0x6345e755UL, 0x6b3fa09cUL, 0xd383c7f9UL, 0xc1366817UL, + 0x798a0f72UL, 0xe45d37cbUL, 0x5ce150aeUL, 0x4e54ff40UL, 0xf6e89825UL, + 0xae8b8873UL, 0x1637ef16UL, 0x048240f8UL, 0xbc3e279dUL, 0x21e91f24UL, + 0x99557841UL, 0x8be0d7afUL, 0x335cb0caUL, 0xed59b63bUL, 0x55e5d15eUL, + 0x47507eb0UL, 0xffec19d5UL, 0x623b216cUL, 0xda874609UL, 0xc832e9e7UL, + 0x708e8e82UL, 0x28ed9ed4UL, 0x9051f9b1UL, 0x82e4565fUL, 0x3a58313aUL, + 0xa78f0983UL, 0x1f336ee6UL, 0x0d86c108UL, 0xb53aa66dUL, 0xbd40e1a4UL, + 0x05fc86c1UL, 0x1749292fUL, 0xaff54e4aUL, 0x322276f3UL, 0x8a9e1196UL, + 0x982bbe78UL, 0x2097d91dUL, 0x78f4c94bUL, 0xc048ae2eUL, 0xd2fd01c0UL, + 0x6a4166a5UL, 0xf7965e1cUL, 0x4f2a3979UL, 0x5d9f9697UL, 0xe523f1f2UL, + 0x4d6b1905UL, 0xf5d77e60UL, 0xe762d18eUL, 0x5fdeb6ebUL, 0xc2098e52UL, + 0x7ab5e937UL, 0x680046d9UL, 0xd0bc21bcUL, 0x88df31eaUL, 0x3063568fUL, + 0x22d6f961UL, 0x9a6a9e04UL, 0x07bda6bdUL, 0xbf01c1d8UL, 0xadb46e36UL, + 0x15080953UL, 0x1d724e9aUL, 0xa5ce29ffUL, 0xb77b8611UL, 0x0fc7e174UL, + 0x9210d9cdUL, 0x2aacbea8UL, 0x38191146UL, 0x80a57623UL, 0xd8c66675UL, + 0x607a0110UL, 0x72cfaefeUL, 0xca73c99bUL, 0x57a4f122UL, 0xef189647UL, + 0xfdad39a9UL, 0x45115eccUL, 0x764dee06UL, 0xcef18963UL, 0xdc44268dUL, + 0x64f841e8UL, 0xf92f7951UL, 0x41931e34UL, 0x5326b1daUL, 0xeb9ad6bfUL, + 0xb3f9c6e9UL, 0x0b45a18cUL, 0x19f00e62UL, 0xa14c6907UL, 0x3c9b51beUL, + 0x842736dbUL, 0x96929935UL, 0x2e2efe50UL, 0x2654b999UL, 0x9ee8defcUL, + 0x8c5d7112UL, 0x34e11677UL, 0xa9362eceUL, 0x118a49abUL, 0x033fe645UL, + 0xbb838120UL, 0xe3e09176UL, 0x5b5cf613UL, 0x49e959fdUL, 0xf1553e98UL, + 0x6c820621UL, 0xd43e6144UL, 0xc68bceaaUL, 0x7e37a9cfUL, 0xd67f4138UL, + 0x6ec3265dUL, 0x7c7689b3UL, 0xc4caeed6UL, 0x591dd66fUL, 0xe1a1b10aUL, + 0xf3141ee4UL, 0x4ba87981UL, 0x13cb69d7UL, 0xab770eb2UL, 0xb9c2a15cUL, + 0x017ec639UL, 0x9ca9fe80UL, 0x241599e5UL, 0x36a0360bUL, 0x8e1c516eUL, + 0x866616a7UL, 0x3eda71c2UL, 0x2c6fde2cUL, 0x94d3b949UL, 0x090481f0UL, + 0xb1b8e695UL, 0xa30d497bUL, 0x1bb12e1eUL, 0x43d23e48UL, 0xfb6e592dUL, + 0xe9dbf6c3UL, 0x516791a6UL, 0xccb0a91fUL, 0x740cce7aUL, 0x66b96194UL, + 0xde0506f1UL + }, + { + 0x00000000UL, 0x96300777UL, 0x2c610eeeUL, 0xba510999UL, 0x19c46d07UL, + 0x8ff46a70UL, 0x35a563e9UL, 0xa395649eUL, 0x3288db0eUL, 0xa4b8dc79UL, + 0x1ee9d5e0UL, 0x88d9d297UL, 0x2b4cb609UL, 0xbd7cb17eUL, 0x072db8e7UL, + 0x911dbf90UL, 0x6410b71dUL, 0xf220b06aUL, 0x4871b9f3UL, 0xde41be84UL, + 0x7dd4da1aUL, 0xebe4dd6dUL, 0x51b5d4f4UL, 0xc785d383UL, 0x56986c13UL, + 0xc0a86b64UL, 0x7af962fdUL, 0xecc9658aUL, 0x4f5c0114UL, 0xd96c0663UL, + 0x633d0ffaUL, 0xf50d088dUL, 0xc8206e3bUL, 0x5e10694cUL, 0xe44160d5UL, + 0x727167a2UL, 0xd1e4033cUL, 0x47d4044bUL, 0xfd850dd2UL, 0x6bb50aa5UL, + 0xfaa8b535UL, 0x6c98b242UL, 0xd6c9bbdbUL, 0x40f9bcacUL, 0xe36cd832UL, + 0x755cdf45UL, 0xcf0dd6dcUL, 0x593dd1abUL, 0xac30d926UL, 0x3a00de51UL, + 0x8051d7c8UL, 0x1661d0bfUL, 0xb5f4b421UL, 0x23c4b356UL, 0x9995bacfUL, + 0x0fa5bdb8UL, 0x9eb80228UL, 0x0888055fUL, 0xb2d90cc6UL, 0x24e90bb1UL, + 0x877c6f2fUL, 0x114c6858UL, 0xab1d61c1UL, 0x3d2d66b6UL, 0x9041dc76UL, + 0x0671db01UL, 0xbc20d298UL, 0x2a10d5efUL, 0x8985b171UL, 0x1fb5b606UL, + 0xa5e4bf9fUL, 0x33d4b8e8UL, 0xa2c90778UL, 
0x34f9000fUL, 0x8ea80996UL, + 0x18980ee1UL, 0xbb0d6a7fUL, 0x2d3d6d08UL, 0x976c6491UL, 0x015c63e6UL, + 0xf4516b6bUL, 0x62616c1cUL, 0xd8306585UL, 0x4e0062f2UL, 0xed95066cUL, + 0x7ba5011bUL, 0xc1f40882UL, 0x57c40ff5UL, 0xc6d9b065UL, 0x50e9b712UL, + 0xeab8be8bUL, 0x7c88b9fcUL, 0xdf1ddd62UL, 0x492dda15UL, 0xf37cd38cUL, + 0x654cd4fbUL, 0x5861b24dUL, 0xce51b53aUL, 0x7400bca3UL, 0xe230bbd4UL, + 0x41a5df4aUL, 0xd795d83dUL, 0x6dc4d1a4UL, 0xfbf4d6d3UL, 0x6ae96943UL, + 0xfcd96e34UL, 0x468867adUL, 0xd0b860daUL, 0x732d0444UL, 0xe51d0333UL, + 0x5f4c0aaaUL, 0xc97c0dddUL, 0x3c710550UL, 0xaa410227UL, 0x10100bbeUL, + 0x86200cc9UL, 0x25b56857UL, 0xb3856f20UL, 0x09d466b9UL, 0x9fe461ceUL, + 0x0ef9de5eUL, 0x98c9d929UL, 0x2298d0b0UL, 0xb4a8d7c7UL, 0x173db359UL, + 0x810db42eUL, 0x3b5cbdb7UL, 0xad6cbac0UL, 0x2083b8edUL, 0xb6b3bf9aUL, + 0x0ce2b603UL, 0x9ad2b174UL, 0x3947d5eaUL, 0xaf77d29dUL, 0x1526db04UL, + 0x8316dc73UL, 0x120b63e3UL, 0x843b6494UL, 0x3e6a6d0dUL, 0xa85a6a7aUL, + 0x0bcf0ee4UL, 0x9dff0993UL, 0x27ae000aUL, 0xb19e077dUL, 0x44930ff0UL, + 0xd2a30887UL, 0x68f2011eUL, 0xfec20669UL, 0x5d5762f7UL, 0xcb676580UL, + 0x71366c19UL, 0xe7066b6eUL, 0x761bd4feUL, 0xe02bd389UL, 0x5a7ada10UL, + 0xcc4add67UL, 0x6fdfb9f9UL, 0xf9efbe8eUL, 0x43beb717UL, 0xd58eb060UL, + 0xe8a3d6d6UL, 0x7e93d1a1UL, 0xc4c2d838UL, 0x52f2df4fUL, 0xf167bbd1UL, + 0x6757bca6UL, 0xdd06b53fUL, 0x4b36b248UL, 0xda2b0dd8UL, 0x4c1b0aafUL, + 0xf64a0336UL, 0x607a0441UL, 0xc3ef60dfUL, 0x55df67a8UL, 0xef8e6e31UL, + 0x79be6946UL, 0x8cb361cbUL, 0x1a8366bcUL, 0xa0d26f25UL, 0x36e26852UL, + 0x95770cccUL, 0x03470bbbUL, 0xb9160222UL, 0x2f260555UL, 0xbe3bbac5UL, + 0x280bbdb2UL, 0x925ab42bUL, 0x046ab35cUL, 0xa7ffd7c2UL, 0x31cfd0b5UL, + 0x8b9ed92cUL, 0x1daede5bUL, 0xb0c2649bUL, 0x26f263ecUL, 0x9ca36a75UL, + 0x0a936d02UL, 0xa906099cUL, 0x3f360eebUL, 0x85670772UL, 0x13570005UL, + 0x824abf95UL, 0x147ab8e2UL, 0xae2bb17bUL, 0x381bb60cUL, 0x9b8ed292UL, + 0x0dbed5e5UL, 0xb7efdc7cUL, 0x21dfdb0bUL, 0xd4d2d386UL, 0x42e2d4f1UL, + 0xf8b3dd68UL, 0x6e83da1fUL, 0xcd16be81UL, 0x5b26b9f6UL, 0xe177b06fUL, + 0x7747b718UL, 0xe65a0888UL, 0x706a0fffUL, 0xca3b0666UL, 0x5c0b0111UL, + 0xff9e658fUL, 0x69ae62f8UL, 0xd3ff6b61UL, 0x45cf6c16UL, 0x78e20aa0UL, + 0xeed20dd7UL, 0x5483044eUL, 0xc2b30339UL, 0x612667a7UL, 0xf71660d0UL, + 0x4d476949UL, 0xdb776e3eUL, 0x4a6ad1aeUL, 0xdc5ad6d9UL, 0x660bdf40UL, + 0xf03bd837UL, 0x53aebca9UL, 0xc59ebbdeUL, 0x7fcfb247UL, 0xe9ffb530UL, + 0x1cf2bdbdUL, 0x8ac2bacaUL, 0x3093b353UL, 0xa6a3b424UL, 0x0536d0baUL, + 0x9306d7cdUL, 0x2957de54UL, 0xbf67d923UL, 0x2e7a66b3UL, 0xb84a61c4UL, + 0x021b685dUL, 0x942b6f2aUL, 0x37be0bb4UL, 0xa18e0cc3UL, 0x1bdf055aUL, + 0x8def022dUL + }, + { + 0x00000000UL, 0x41311b19UL, 0x82623632UL, 0xc3532d2bUL, 0x04c56c64UL, + 0x45f4777dUL, 0x86a75a56UL, 0xc796414fUL, 0x088ad9c8UL, 0x49bbc2d1UL, + 0x8ae8effaUL, 0xcbd9f4e3UL, 0x0c4fb5acUL, 0x4d7eaeb5UL, 0x8e2d839eUL, + 0xcf1c9887UL, 0x5112c24aUL, 0x1023d953UL, 0xd370f478UL, 0x9241ef61UL, + 0x55d7ae2eUL, 0x14e6b537UL, 0xd7b5981cUL, 0x96848305UL, 0x59981b82UL, + 0x18a9009bUL, 0xdbfa2db0UL, 0x9acb36a9UL, 0x5d5d77e6UL, 0x1c6c6cffUL, + 0xdf3f41d4UL, 0x9e0e5acdUL, 0xa2248495UL, 0xe3159f8cUL, 0x2046b2a7UL, + 0x6177a9beUL, 0xa6e1e8f1UL, 0xe7d0f3e8UL, 0x2483dec3UL, 0x65b2c5daUL, + 0xaaae5d5dUL, 0xeb9f4644UL, 0x28cc6b6fUL, 0x69fd7076UL, 0xae6b3139UL, + 0xef5a2a20UL, 0x2c09070bUL, 0x6d381c12UL, 0xf33646dfUL, 0xb2075dc6UL, + 0x715470edUL, 0x30656bf4UL, 0xf7f32abbUL, 0xb6c231a2UL, 0x75911c89UL, + 0x34a00790UL, 0xfbbc9f17UL, 0xba8d840eUL, 0x79dea925UL, 0x38efb23cUL, + 0xff79f373UL, 0xbe48e86aUL, 0x7d1bc541UL, 
0x3c2ade58UL, 0x054f79f0UL, + 0x447e62e9UL, 0x872d4fc2UL, 0xc61c54dbUL, 0x018a1594UL, 0x40bb0e8dUL, + 0x83e823a6UL, 0xc2d938bfUL, 0x0dc5a038UL, 0x4cf4bb21UL, 0x8fa7960aUL, + 0xce968d13UL, 0x0900cc5cUL, 0x4831d745UL, 0x8b62fa6eUL, 0xca53e177UL, + 0x545dbbbaUL, 0x156ca0a3UL, 0xd63f8d88UL, 0x970e9691UL, 0x5098d7deUL, + 0x11a9ccc7UL, 0xd2fae1ecUL, 0x93cbfaf5UL, 0x5cd76272UL, 0x1de6796bUL, + 0xdeb55440UL, 0x9f844f59UL, 0x58120e16UL, 0x1923150fUL, 0xda703824UL, + 0x9b41233dUL, 0xa76bfd65UL, 0xe65ae67cUL, 0x2509cb57UL, 0x6438d04eUL, + 0xa3ae9101UL, 0xe29f8a18UL, 0x21cca733UL, 0x60fdbc2aUL, 0xafe124adUL, + 0xeed03fb4UL, 0x2d83129fUL, 0x6cb20986UL, 0xab2448c9UL, 0xea1553d0UL, + 0x29467efbUL, 0x687765e2UL, 0xf6793f2fUL, 0xb7482436UL, 0x741b091dUL, + 0x352a1204UL, 0xf2bc534bUL, 0xb38d4852UL, 0x70de6579UL, 0x31ef7e60UL, + 0xfef3e6e7UL, 0xbfc2fdfeUL, 0x7c91d0d5UL, 0x3da0cbccUL, 0xfa368a83UL, + 0xbb07919aUL, 0x7854bcb1UL, 0x3965a7a8UL, 0x4b98833bUL, 0x0aa99822UL, + 0xc9fab509UL, 0x88cbae10UL, 0x4f5def5fUL, 0x0e6cf446UL, 0xcd3fd96dUL, + 0x8c0ec274UL, 0x43125af3UL, 0x022341eaUL, 0xc1706cc1UL, 0x804177d8UL, + 0x47d73697UL, 0x06e62d8eUL, 0xc5b500a5UL, 0x84841bbcUL, 0x1a8a4171UL, + 0x5bbb5a68UL, 0x98e87743UL, 0xd9d96c5aUL, 0x1e4f2d15UL, 0x5f7e360cUL, + 0x9c2d1b27UL, 0xdd1c003eUL, 0x120098b9UL, 0x533183a0UL, 0x9062ae8bUL, + 0xd153b592UL, 0x16c5f4ddUL, 0x57f4efc4UL, 0x94a7c2efUL, 0xd596d9f6UL, + 0xe9bc07aeUL, 0xa88d1cb7UL, 0x6bde319cUL, 0x2aef2a85UL, 0xed796bcaUL, + 0xac4870d3UL, 0x6f1b5df8UL, 0x2e2a46e1UL, 0xe136de66UL, 0xa007c57fUL, + 0x6354e854UL, 0x2265f34dUL, 0xe5f3b202UL, 0xa4c2a91bUL, 0x67918430UL, + 0x26a09f29UL, 0xb8aec5e4UL, 0xf99fdefdUL, 0x3accf3d6UL, 0x7bfde8cfUL, + 0xbc6ba980UL, 0xfd5ab299UL, 0x3e099fb2UL, 0x7f3884abUL, 0xb0241c2cUL, + 0xf1150735UL, 0x32462a1eUL, 0x73773107UL, 0xb4e17048UL, 0xf5d06b51UL, + 0x3683467aUL, 0x77b25d63UL, 0x4ed7facbUL, 0x0fe6e1d2UL, 0xccb5ccf9UL, + 0x8d84d7e0UL, 0x4a1296afUL, 0x0b238db6UL, 0xc870a09dUL, 0x8941bb84UL, + 0x465d2303UL, 0x076c381aUL, 0xc43f1531UL, 0x850e0e28UL, 0x42984f67UL, + 0x03a9547eUL, 0xc0fa7955UL, 0x81cb624cUL, 0x1fc53881UL, 0x5ef42398UL, + 0x9da70eb3UL, 0xdc9615aaUL, 0x1b0054e5UL, 0x5a314ffcUL, 0x996262d7UL, + 0xd85379ceUL, 0x174fe149UL, 0x567efa50UL, 0x952dd77bUL, 0xd41ccc62UL, + 0x138a8d2dUL, 0x52bb9634UL, 0x91e8bb1fUL, 0xd0d9a006UL, 0xecf37e5eUL, + 0xadc26547UL, 0x6e91486cUL, 0x2fa05375UL, 0xe836123aUL, 0xa9070923UL, + 0x6a542408UL, 0x2b653f11UL, 0xe479a796UL, 0xa548bc8fUL, 0x661b91a4UL, + 0x272a8abdUL, 0xe0bccbf2UL, 0xa18dd0ebUL, 0x62defdc0UL, 0x23efe6d9UL, + 0xbde1bc14UL, 0xfcd0a70dUL, 0x3f838a26UL, 0x7eb2913fUL, 0xb924d070UL, + 0xf815cb69UL, 0x3b46e642UL, 0x7a77fd5bUL, 0xb56b65dcUL, 0xf45a7ec5UL, + 0x370953eeUL, 0x763848f7UL, 0xb1ae09b8UL, 0xf09f12a1UL, 0x33cc3f8aUL, + 0x72fd2493UL + }, + { + 0x00000000UL, 0x376ac201UL, 0x6ed48403UL, 0x59be4602UL, 0xdca80907UL, + 0xebc2cb06UL, 0xb27c8d04UL, 0x85164f05UL, 0xb851130eUL, 0x8f3bd10fUL, + 0xd685970dUL, 0xe1ef550cUL, 0x64f91a09UL, 0x5393d808UL, 0x0a2d9e0aUL, + 0x3d475c0bUL, 0x70a3261cUL, 0x47c9e41dUL, 0x1e77a21fUL, 0x291d601eUL, + 0xac0b2f1bUL, 0x9b61ed1aUL, 0xc2dfab18UL, 0xf5b56919UL, 0xc8f23512UL, + 0xff98f713UL, 0xa626b111UL, 0x914c7310UL, 0x145a3c15UL, 0x2330fe14UL, + 0x7a8eb816UL, 0x4de47a17UL, 0xe0464d38UL, 0xd72c8f39UL, 0x8e92c93bUL, + 0xb9f80b3aUL, 0x3cee443fUL, 0x0b84863eUL, 0x523ac03cUL, 0x6550023dUL, + 0x58175e36UL, 0x6f7d9c37UL, 0x36c3da35UL, 0x01a91834UL, 0x84bf5731UL, + 0xb3d59530UL, 0xea6bd332UL, 0xdd011133UL, 0x90e56b24UL, 0xa78fa925UL, + 0xfe31ef27UL, 0xc95b2d26UL, 0x4c4d6223UL, 
0x7b27a022UL, 0x2299e620UL, + 0x15f32421UL, 0x28b4782aUL, 0x1fdeba2bUL, 0x4660fc29UL, 0x710a3e28UL, + 0xf41c712dUL, 0xc376b32cUL, 0x9ac8f52eUL, 0xada2372fUL, 0xc08d9a70UL, + 0xf7e75871UL, 0xae591e73UL, 0x9933dc72UL, 0x1c259377UL, 0x2b4f5176UL, + 0x72f11774UL, 0x459bd575UL, 0x78dc897eUL, 0x4fb64b7fUL, 0x16080d7dUL, + 0x2162cf7cUL, 0xa4748079UL, 0x931e4278UL, 0xcaa0047aUL, 0xfdcac67bUL, + 0xb02ebc6cUL, 0x87447e6dUL, 0xdefa386fUL, 0xe990fa6eUL, 0x6c86b56bUL, + 0x5bec776aUL, 0x02523168UL, 0x3538f369UL, 0x087faf62UL, 0x3f156d63UL, + 0x66ab2b61UL, 0x51c1e960UL, 0xd4d7a665UL, 0xe3bd6464UL, 0xba032266UL, + 0x8d69e067UL, 0x20cbd748UL, 0x17a11549UL, 0x4e1f534bUL, 0x7975914aUL, + 0xfc63de4fUL, 0xcb091c4eUL, 0x92b75a4cUL, 0xa5dd984dUL, 0x989ac446UL, + 0xaff00647UL, 0xf64e4045UL, 0xc1248244UL, 0x4432cd41UL, 0x73580f40UL, + 0x2ae64942UL, 0x1d8c8b43UL, 0x5068f154UL, 0x67023355UL, 0x3ebc7557UL, + 0x09d6b756UL, 0x8cc0f853UL, 0xbbaa3a52UL, 0xe2147c50UL, 0xd57ebe51UL, + 0xe839e25aUL, 0xdf53205bUL, 0x86ed6659UL, 0xb187a458UL, 0x3491eb5dUL, + 0x03fb295cUL, 0x5a456f5eUL, 0x6d2fad5fUL, 0x801b35e1UL, 0xb771f7e0UL, + 0xeecfb1e2UL, 0xd9a573e3UL, 0x5cb33ce6UL, 0x6bd9fee7UL, 0x3267b8e5UL, + 0x050d7ae4UL, 0x384a26efUL, 0x0f20e4eeUL, 0x569ea2ecUL, 0x61f460edUL, + 0xe4e22fe8UL, 0xd388ede9UL, 0x8a36abebUL, 0xbd5c69eaUL, 0xf0b813fdUL, + 0xc7d2d1fcUL, 0x9e6c97feUL, 0xa90655ffUL, 0x2c101afaUL, 0x1b7ad8fbUL, + 0x42c49ef9UL, 0x75ae5cf8UL, 0x48e900f3UL, 0x7f83c2f2UL, 0x263d84f0UL, + 0x115746f1UL, 0x944109f4UL, 0xa32bcbf5UL, 0xfa958df7UL, 0xcdff4ff6UL, + 0x605d78d9UL, 0x5737bad8UL, 0x0e89fcdaUL, 0x39e33edbUL, 0xbcf571deUL, + 0x8b9fb3dfUL, 0xd221f5ddUL, 0xe54b37dcUL, 0xd80c6bd7UL, 0xef66a9d6UL, + 0xb6d8efd4UL, 0x81b22dd5UL, 0x04a462d0UL, 0x33cea0d1UL, 0x6a70e6d3UL, + 0x5d1a24d2UL, 0x10fe5ec5UL, 0x27949cc4UL, 0x7e2adac6UL, 0x494018c7UL, + 0xcc5657c2UL, 0xfb3c95c3UL, 0xa282d3c1UL, 0x95e811c0UL, 0xa8af4dcbUL, + 0x9fc58fcaUL, 0xc67bc9c8UL, 0xf1110bc9UL, 0x740744ccUL, 0x436d86cdUL, + 0x1ad3c0cfUL, 0x2db902ceUL, 0x4096af91UL, 0x77fc6d90UL, 0x2e422b92UL, + 0x1928e993UL, 0x9c3ea696UL, 0xab546497UL, 0xf2ea2295UL, 0xc580e094UL, + 0xf8c7bc9fUL, 0xcfad7e9eUL, 0x9613389cUL, 0xa179fa9dUL, 0x246fb598UL, + 0x13057799UL, 0x4abb319bUL, 0x7dd1f39aUL, 0x3035898dUL, 0x075f4b8cUL, + 0x5ee10d8eUL, 0x698bcf8fUL, 0xec9d808aUL, 0xdbf7428bUL, 0x82490489UL, + 0xb523c688UL, 0x88649a83UL, 0xbf0e5882UL, 0xe6b01e80UL, 0xd1dadc81UL, + 0x54cc9384UL, 0x63a65185UL, 0x3a181787UL, 0x0d72d586UL, 0xa0d0e2a9UL, + 0x97ba20a8UL, 0xce0466aaUL, 0xf96ea4abUL, 0x7c78ebaeUL, 0x4b1229afUL, + 0x12ac6fadUL, 0x25c6adacUL, 0x1881f1a7UL, 0x2feb33a6UL, 0x765575a4UL, + 0x413fb7a5UL, 0xc429f8a0UL, 0xf3433aa1UL, 0xaafd7ca3UL, 0x9d97bea2UL, + 0xd073c4b5UL, 0xe71906b4UL, 0xbea740b6UL, 0x89cd82b7UL, 0x0cdbcdb2UL, + 0x3bb10fb3UL, 0x620f49b1UL, 0x55658bb0UL, 0x6822d7bbUL, 0x5f4815baUL, + 0x06f653b8UL, 0x319c91b9UL, 0xb48adebcUL, 0x83e01cbdUL, 0xda5e5abfUL, + 0xed3498beUL + }, + { + 0x00000000UL, 0x6567bcb8UL, 0x8bc809aaUL, 0xeeafb512UL, 0x5797628fUL, + 0x32f0de37UL, 0xdc5f6b25UL, 0xb938d79dUL, 0xef28b4c5UL, 0x8a4f087dUL, + 0x64e0bd6fUL, 0x018701d7UL, 0xb8bfd64aUL, 0xddd86af2UL, 0x3377dfe0UL, + 0x56106358UL, 0x9f571950UL, 0xfa30a5e8UL, 0x149f10faUL, 0x71f8ac42UL, + 0xc8c07bdfUL, 0xada7c767UL, 0x43087275UL, 0x266fcecdUL, 0x707fad95UL, + 0x1518112dUL, 0xfbb7a43fUL, 0x9ed01887UL, 0x27e8cf1aUL, 0x428f73a2UL, + 0xac20c6b0UL, 0xc9477a08UL, 0x3eaf32a0UL, 0x5bc88e18UL, 0xb5673b0aUL, + 0xd00087b2UL, 0x6938502fUL, 0x0c5fec97UL, 0xe2f05985UL, 0x8797e53dUL, + 0xd1878665UL, 0xb4e03addUL, 0x5a4f8fcfUL, 
0x3f283377UL, 0x8610e4eaUL, + 0xe3775852UL, 0x0dd8ed40UL, 0x68bf51f8UL, 0xa1f82bf0UL, 0xc49f9748UL, + 0x2a30225aUL, 0x4f579ee2UL, 0xf66f497fUL, 0x9308f5c7UL, 0x7da740d5UL, + 0x18c0fc6dUL, 0x4ed09f35UL, 0x2bb7238dUL, 0xc518969fUL, 0xa07f2a27UL, + 0x1947fdbaUL, 0x7c204102UL, 0x928ff410UL, 0xf7e848a8UL, 0x3d58149bUL, + 0x583fa823UL, 0xb6901d31UL, 0xd3f7a189UL, 0x6acf7614UL, 0x0fa8caacUL, + 0xe1077fbeUL, 0x8460c306UL, 0xd270a05eUL, 0xb7171ce6UL, 0x59b8a9f4UL, + 0x3cdf154cUL, 0x85e7c2d1UL, 0xe0807e69UL, 0x0e2fcb7bUL, 0x6b4877c3UL, + 0xa20f0dcbUL, 0xc768b173UL, 0x29c70461UL, 0x4ca0b8d9UL, 0xf5986f44UL, + 0x90ffd3fcUL, 0x7e5066eeUL, 0x1b37da56UL, 0x4d27b90eUL, 0x284005b6UL, + 0xc6efb0a4UL, 0xa3880c1cUL, 0x1ab0db81UL, 0x7fd76739UL, 0x9178d22bUL, + 0xf41f6e93UL, 0x03f7263bUL, 0x66909a83UL, 0x883f2f91UL, 0xed589329UL, + 0x546044b4UL, 0x3107f80cUL, 0xdfa84d1eUL, 0xbacff1a6UL, 0xecdf92feUL, + 0x89b82e46UL, 0x67179b54UL, 0x027027ecUL, 0xbb48f071UL, 0xde2f4cc9UL, + 0x3080f9dbUL, 0x55e74563UL, 0x9ca03f6bUL, 0xf9c783d3UL, 0x176836c1UL, + 0x720f8a79UL, 0xcb375de4UL, 0xae50e15cUL, 0x40ff544eUL, 0x2598e8f6UL, + 0x73888baeUL, 0x16ef3716UL, 0xf8408204UL, 0x9d273ebcUL, 0x241fe921UL, + 0x41785599UL, 0xafd7e08bUL, 0xcab05c33UL, 0x3bb659edUL, 0x5ed1e555UL, + 0xb07e5047UL, 0xd519ecffUL, 0x6c213b62UL, 0x094687daUL, 0xe7e932c8UL, + 0x828e8e70UL, 0xd49eed28UL, 0xb1f95190UL, 0x5f56e482UL, 0x3a31583aUL, + 0x83098fa7UL, 0xe66e331fUL, 0x08c1860dUL, 0x6da63ab5UL, 0xa4e140bdUL, + 0xc186fc05UL, 0x2f294917UL, 0x4a4ef5afUL, 0xf3762232UL, 0x96119e8aUL, + 0x78be2b98UL, 0x1dd99720UL, 0x4bc9f478UL, 0x2eae48c0UL, 0xc001fdd2UL, + 0xa566416aUL, 0x1c5e96f7UL, 0x79392a4fUL, 0x97969f5dUL, 0xf2f123e5UL, + 0x05196b4dUL, 0x607ed7f5UL, 0x8ed162e7UL, 0xebb6de5fUL, 0x528e09c2UL, + 0x37e9b57aUL, 0xd9460068UL, 0xbc21bcd0UL, 0xea31df88UL, 0x8f566330UL, + 0x61f9d622UL, 0x049e6a9aUL, 0xbda6bd07UL, 0xd8c101bfUL, 0x366eb4adUL, + 0x53090815UL, 0x9a4e721dUL, 0xff29cea5UL, 0x11867bb7UL, 0x74e1c70fUL, + 0xcdd91092UL, 0xa8beac2aUL, 0x46111938UL, 0x2376a580UL, 0x7566c6d8UL, + 0x10017a60UL, 0xfeaecf72UL, 0x9bc973caUL, 0x22f1a457UL, 0x479618efUL, + 0xa939adfdUL, 0xcc5e1145UL, 0x06ee4d76UL, 0x6389f1ceUL, 0x8d2644dcUL, + 0xe841f864UL, 0x51792ff9UL, 0x341e9341UL, 0xdab12653UL, 0xbfd69aebUL, + 0xe9c6f9b3UL, 0x8ca1450bUL, 0x620ef019UL, 0x07694ca1UL, 0xbe519b3cUL, + 0xdb362784UL, 0x35999296UL, 0x50fe2e2eUL, 0x99b95426UL, 0xfcdee89eUL, + 0x12715d8cUL, 0x7716e134UL, 0xce2e36a9UL, 0xab498a11UL, 0x45e63f03UL, + 0x208183bbUL, 0x7691e0e3UL, 0x13f65c5bUL, 0xfd59e949UL, 0x983e55f1UL, + 0x2106826cUL, 0x44613ed4UL, 0xaace8bc6UL, 0xcfa9377eUL, 0x38417fd6UL, + 0x5d26c36eUL, 0xb389767cUL, 0xd6eecac4UL, 0x6fd61d59UL, 0x0ab1a1e1UL, + 0xe41e14f3UL, 0x8179a84bUL, 0xd769cb13UL, 0xb20e77abUL, 0x5ca1c2b9UL, + 0x39c67e01UL, 0x80fea99cUL, 0xe5991524UL, 0x0b36a036UL, 0x6e511c8eUL, + 0xa7166686UL, 0xc271da3eUL, 0x2cde6f2cUL, 0x49b9d394UL, 0xf0810409UL, + 0x95e6b8b1UL, 0x7b490da3UL, 0x1e2eb11bUL, 0x483ed243UL, 0x2d596efbUL, + 0xc3f6dbe9UL, 0xa6916751UL, 0x1fa9b0ccUL, 0x7ace0c74UL, 0x9461b966UL, + 0xf10605deUL +#endif + } +}; diff --git a/sys/cddl/contrib/opensolaris/uts/common/zmod/deflate.c b/sys/cddl/contrib/opensolaris/uts/common/zmod/deflate.c new file mode 100644 index 000000000000..7847e40ba327 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/zmod/deflate.c @@ -0,0 +1,1742 @@ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */
+
+/* deflate.c -- compress data using the deflation algorithm
+ * Copyright (C) 1995-2005 Jean-loup Gailly.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * ALGORITHM
+ *
+ * The "deflation" process depends on being able to identify portions
+ * of the input text which are identical to earlier input (within a
+ * sliding window trailing behind the input currently being processed).
+ *
+ * The most straightforward technique turns out to be the fastest for
+ * most input files: try all possible matches and select the longest.
+ * The key feature of this algorithm is that insertions into the string
+ * dictionary are very simple and thus fast, and deletions are avoided
+ * completely. Insertions are performed at each input character, whereas
+ * string matches are performed only when the previous match ends. So it
+ * is preferable to spend more time in matches to allow very fast string
+ * insertions and avoid deletions. The matching algorithm for small
+ * strings is inspired by that of Rabin & Karp. A brute force approach
+ * is used to find longer strings when a small match has been found.
+ * A similar algorithm is used in comic (by Jan-Mark Wams) and freeze
+ * (by Leonid Broukhis).
+ * A previous version of this file used a more sophisticated algorithm
+ * (by Fiala and Greene) which is guaranteed to run in linear amortized
+ * time, but has a larger average cost, uses more memory and is patented.
+ * However, the F&G algorithm may be faster for some highly redundant
+ * files if the parameter max_chain_length (described below) is too large.
+ *
+ * ACKNOWLEDGEMENTS
+ *
+ * The idea of lazy evaluation of matches is due to Jan-Mark Wams, and
+ * I found it in 'freeze' written by Leonid Broukhis.
+ * Thanks to many people for bug reports and testing.
+ *
+ * REFERENCES
+ *
+ * Deutsch, L.P., "DEFLATE Compressed Data Format Specification".
+ * Available at http://www.ietf.org/rfc/rfc1951.txt
+ *
+ * A description of the Rabin and Karp algorithm is given in the book
+ * "Algorithms" by R. Sedgewick, Addison-Wesley, p. 252.
+ *
+ * Fiala, E.R., and Greene, D.H.
+ * Data Compression with Finite Windows, Comm. ACM, 32, 4 (1989), 490-505
+ *
+ */
+
+#include "deflate.h"
+
+static const char deflate_copyright[] =
+    " deflate 1.2.3 Copyright 1995-2005 Jean-loup Gailly ";
+/*
+  If you use the zlib library in a product, an acknowledgment is welcome
+  in the documentation of your product. If for some reason you cannot
+  include such an acknowledgment, I would appreciate that you keep this
+  copyright string in the executable of your product.
+ */
+
+/* ===========================================================================
+ * Function prototypes.
+ */
+typedef enum {
+    need_more,      /* block not completed, need more input or more output */
+    block_done,     /* block flush performed */
+    finish_started, /* finish started, need only more output at next deflate */
+    finish_done     /* finish done, accept no more input or output */
+} block_state;
+
+typedef block_state (*compress_func) OF((deflate_state *s, int flush));
+/* Compression function. Returns the block state after the call.
+ */
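The sliding-window matching described in the ALGORITHM comment can be made concrete with a short sketch. This is not zlib's implementation (that is the heavily tuned longest_match() later in this file); it is a minimal brute-force illustration, with invented names, of what "find the longest earlier occurrence within the window" means:

#include <stddef.h>

/* Brute-force sketch: scan every window position before pos for the
 * longest run of bytes matching the text that starts at pos. zlib
 * walks hash chains instead of trying every position. */
static size_t
toy_longest_match(const unsigned char *buf, size_t pos, size_t len,
    size_t window, size_t *match_pos)
{
	size_t best = 0;
	size_t i = pos > window ? pos - window : 0;

	for (; i < pos; i++) {
		size_t j = 0;

		/* Overlapping matches are fine: i + j may reach pos. */
		while (pos + j < len && buf[i + j] == buf[pos + j])
			j++;
		if (j > best) {
			best = j;
			*match_pos = i;
		}
	}
	return (best);	/* caller emits (distance, length) if long enough */
}

A match is only worth emitting once best reaches MIN_MATCH (three bytes in deflate); anything shorter is cheaper to send as literals.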
+
+local void fill_window OF((deflate_state *s));
+local block_state deflate_stored OF((deflate_state *s, int flush));
+local block_state deflate_fast OF((deflate_state *s, int flush));
+#ifndef FASTEST
+local block_state deflate_slow OF((deflate_state *s, int flush));
+#endif
+local void lm_init OF((deflate_state *s));
+local void putShortMSB OF((deflate_state *s, uInt b));
+local void flush_pending OF((z_streamp strm));
+local int read_buf OF((z_streamp strm, Bytef *buf, unsigned size));
+#ifndef FASTEST
+#ifdef ASMV
+      void match_init OF((void)); /* asm code initialization */
+      uInt longest_match OF((deflate_state *s, IPos cur_match));
+#else
+local uInt longest_match OF((deflate_state *s, IPos cur_match));
+#endif
+#endif
+local uInt longest_match_fast OF((deflate_state *s, IPos cur_match));
+
+#ifdef DEBUG
+local void check_match OF((deflate_state *s, IPos start, IPos match,
+                           int length));
+#endif
+
+/* ===========================================================================
+ * Local data
+ */
+
+#define NIL 0
+/* Tail of hash chains */
+
+#ifndef TOO_FAR
+# define TOO_FAR 4096
+#endif
+/* Matches of length 3 are discarded if their distance exceeds TOO_FAR */
+
+#define MIN_LOOKAHEAD (MAX_MATCH+MIN_MATCH+1)
+/* Minimum amount of lookahead, except at the end of the input file.
+ * See deflate.c for comments about the MIN_MATCH+1.
+ */
+
+/* Values for max_lazy_match, good_match and max_chain_length, depending on
+ * the desired pack level (0..9). The values given below have been tuned to
+ * exclude worst case performance for pathological files. Better values may be
+ * found for specific files.
+ */
+typedef struct config_s {
+   ush good_length; /* reduce lazy search above this match length */
+   ush max_lazy;    /* do not perform lazy search above this match length */
+   ush nice_length; /* quit search above this match length */
+   ush max_chain;
+   compress_func func;
+} config;
+
+#ifdef FASTEST
+local const config configuration_table[2] = {
+/*      good lazy nice chain */
+/* 0 */ {0,    0,  0,    0, deflate_stored},  /* store only */
+/* 1 */ {4,    4,  8,    4, deflate_fast}};   /* max speed, no lazy matches */
+#else
+local const config configuration_table[10] = {
+/*      good lazy nice chain */
+/* 0 */ {0,    0,  0,    0, deflate_stored},  /* store only */
+/* 1 */ {4,    4,  8,    4, deflate_fast},    /* max speed, no lazy matches */
+/* 2 */ {4,    5, 16,    8, deflate_fast},
+/* 3 */ {4,    6, 32,   32, deflate_fast},
+
+/* 4 */ {4,    4, 16,   16, deflate_slow},    /* lazy matches */
+/* 5 */ {8,   16, 32,   32, deflate_slow},
+/* 6 */ {8,   16, 128, 128, deflate_slow},
+/* 7 */ {8,   32, 128, 256, deflate_slow},
+/* 8 */ {32, 128, 258, 1024, deflate_slow},
+/* 9 */ {32, 258, 258, 4096, deflate_slow}};  /* max compression */
+#endif
+
+/* Note: the deflate() code requires max_lazy >= MIN_MATCH and max_chain >= 4
+ * For deflate_fast() (levels <= 3) good is ignored and lazy has a different
+ * meaning.
+ */
+
+#define EQUAL 0
+/* result of memcmp for equal strings */
+
+#ifndef NO_DUMMY_DECL
+struct static_tree_desc_s {int dummy;}; /* for buggy compilers */
+#endif
+
+/* ===========================================================================
+ * Update a hash value with the given input byte
+ * IN assertion: all calls to UPDATE_HASH are made with consecutive
+ * input characters, so that a running hash key can be computed from the
+ * previous key instead of complete recalculation each time.
+ */
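UPDATE_HASH's shift-and-xor construction, and the head[]/prev[] chains that INSERT_STRING (just below) threads through it, can be sketched with illustrative constants (hash_bits = 15, window bits = 15, MIN_MATCH = 3; in zlib the real shift, mask, and array sizes come from the stream state):

#define TOY_HASH_BITS  15
#define TOY_HASH_SIZE  (1u << TOY_HASH_BITS)
#define TOY_HASH_MASK  (TOY_HASH_SIZE - 1)
#define TOY_HASH_SHIFT ((TOY_HASH_BITS + 3 - 1) / 3)   /* 5 bits per byte */
#define TOY_WSIZE      (1u << 15)
#define TOY_WMASK      (TOY_WSIZE - 1)

static unsigned toy_head[TOY_HASH_SIZE]; /* newest position per hash, 0 = nil */
static unsigned toy_prev[TOY_WSIZE];     /* older position with the same hash */

/* After MIN_MATCH updates the first byte has been shifted past the
 * mask, so the key always reflects exactly the last MIN_MATCH bytes. */
static unsigned
toy_update_hash(unsigned h, unsigned char c)
{
	return (((h << TOY_HASH_SHIFT) ^ c) & TOY_HASH_MASK);
}

/* Thread position pos onto the chain for hash h and hand back the old
 * head, which is where a match search would start. */
static unsigned
toy_insert(unsigned h, unsigned pos)
{
	unsigned match_head = toy_head[h];

	toy_prev[pos & TOY_WMASK] = match_head;
	toy_head[h] = pos;
	return (match_head);
}

Insertion is O(1) and nothing is ever deleted, which is exactly the property the ALGORITHM comment above argues for.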
+#define UPDATE_HASH(s,h,c) (h = (((h)<<s->hash_shift) ^ (c)) & s->hash_mask)
+
+
+/* ===========================================================================
+ * Insert string str in the dictionary and set match_head to the previous head
+ * of the hash chain (the most recent string with the same hash key). Return
+ * the previous length of the hash chain.
+ * If this file is compiled with -DFASTEST, the compression level is forced
+ * to 1, and no hash chains are maintained.
+ * IN assertion: all calls to INSERT_STRING are made with consecutive
+ * input characters and the first MIN_MATCH bytes of str are valid
+ * (except for the last MIN_MATCH-1 bytes of the input file).
+ */
+#ifdef FASTEST
+#define INSERT_STRING(s, str, match_head) \
+   (UPDATE_HASH(s, s->ins_h, s->window[(str) + (MIN_MATCH-1)]), \
+    match_head = s->head[s->ins_h], \
+    s->head[s->ins_h] = (Pos)(str))
+#else
+#define INSERT_STRING(s, str, match_head) \
+   (UPDATE_HASH(s, s->ins_h, s->window[(str) + (MIN_MATCH-1)]), \
+    match_head = s->prev[(str) & s->w_mask] = s->head[s->ins_h], \
+    s->head[s->ins_h] = (Pos)(str))
+#endif
+
+/* ===========================================================================
+ * Initialize the hash table (avoiding 64K overflow for 16-bit systems).
+ * prev[] will be initialized on the fly.
+ */
+#define CLEAR_HASH(s) \
+    s->head[s->hash_size-1] = NIL; \
+    (void) zmemzero((Bytef *)s->head, \
+    (unsigned)(s->hash_size-1)*sizeof(*s->head));
+
+/* ========================================================================= */
+int ZEXPORT deflateInit_(strm, level, version, stream_size)
+    z_streamp strm;
+    int level;
+    const char *version;
+    int stream_size;
+{
+    return deflateInit2_(strm, level, Z_DEFLATED, MAX_WBITS, DEF_MEM_LEVEL,
+                         Z_DEFAULT_STRATEGY, version, stream_size);
+    /* To do: ignore strm->next_in if we use it as window */
+}
+
+/* ========================================================================= */
+int ZEXPORT deflateInit2_(strm, level, method, windowBits, memLevel, strategy,
+                  version, stream_size)
+    z_streamp strm;
+    int level;
+    int method;
+    int windowBits;
+    int memLevel;
+    int strategy;
+    const char *version;
+    int stream_size;
+{
+    deflate_state *s;
+    int wrap = 1;
+    static const char my_version[] = ZLIB_VERSION;
+
+    ushf *overlay;
+    /* We overlay pending_buf and d_buf+l_buf. This works since the average
+     * output size for (length,distance) codes is <= 24 bits.
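Concretely, the overlay allocated just below is lit_bufsize * (sizeof(ush)+2) bytes: d_buf begins at byte offset lit_bufsize and l_buf at 3*lit_bufsize, so each buffered length/distance pair owns two bytes of d_buf plus one byte of l_buf, i.e. 24 bits, while the compressed bits that pair eventually produces average no more than that, so the pending output never overtakes the buffered codes.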
+ */ + + if (version == Z_NULL || version[0] != my_version[0] || + stream_size != sizeof(z_stream)) { + return Z_VERSION_ERROR; + } + if (strm == Z_NULL) return Z_STREAM_ERROR; + + strm->msg = Z_NULL; + if (strm->zalloc == (alloc_func)0) { + strm->zalloc = zcalloc; + strm->opaque = (voidpf)0; + } + if (strm->zfree == (free_func)0) strm->zfree = zcfree; + +#ifdef FASTEST + if (level != 0) level = 1; +#else + if (level == Z_DEFAULT_COMPRESSION) level = 6; +#endif + + if (windowBits < 0) { /* suppress zlib wrapper */ + wrap = 0; + windowBits = -windowBits; + } +#ifdef GZIP + else if (windowBits > 15) { + wrap = 2; /* write gzip wrapper instead */ + windowBits -= 16; + } +#endif + if (memLevel < 1 || memLevel > MAX_MEM_LEVEL || method != Z_DEFLATED || + windowBits < 8 || windowBits > 15 || level < 0 || level > 9 || + strategy < 0 || strategy > Z_FIXED) { + return Z_STREAM_ERROR; + } + if (windowBits == 8) windowBits = 9; /* until 256-byte window bug fixed */ + s = (deflate_state *) ZALLOC(strm, 1, sizeof(deflate_state)); + if (s == Z_NULL) return Z_MEM_ERROR; + strm->state = (struct internal_state FAR *)s; + s->strm = strm; + + s->wrap = wrap; + s->gzhead = Z_NULL; + s->w_bits = windowBits; + s->w_size = 1 << s->w_bits; + s->w_mask = s->w_size - 1; + + s->hash_bits = memLevel + 7; + s->hash_size = 1 << s->hash_bits; + s->hash_mask = s->hash_size - 1; + s->hash_shift = ((s->hash_bits+MIN_MATCH-1)/MIN_MATCH); + + s->window = (Bytef *) ZALLOC(strm, s->w_size, 2*sizeof(Byte)); + s->prev = (Posf *) ZALLOC(strm, s->w_size, sizeof(Pos)); + s->head = (Posf *) ZALLOC(strm, s->hash_size, sizeof(Pos)); + + s->lit_bufsize = 1 << (memLevel + 6); /* 16K elements by default */ + + overlay = (ushf *) ZALLOC(strm, s->lit_bufsize, sizeof(ush)+2); + s->pending_buf = (uchf *) overlay; + s->pending_buf_size = (ulg)s->lit_bufsize * (sizeof(ush)+2L); + + if (s->window == Z_NULL || s->prev == Z_NULL || s->head == Z_NULL || + s->pending_buf == Z_NULL) { + s->status = FINISH_STATE; + strm->msg = (char*)ERR_MSG(Z_MEM_ERROR); + (void) deflateEnd (strm); + return Z_MEM_ERROR; + } + s->d_buf = overlay + s->lit_bufsize/sizeof(ush); + s->l_buf = s->pending_buf + (1+sizeof(ush))*s->lit_bufsize; + + s->level = level; + s->strategy = strategy; + s->method = (Byte)method; + + return deflateReset(strm); +} + +/* ========================================================================= */ +int ZEXPORT deflateSetDictionary (strm, dictionary, dictLength) + z_streamp strm; + const Bytef *dictionary; + uInt dictLength; +{ + deflate_state *s; + uInt length = dictLength; + uInt n; + IPos hash_head = 0; + + if (strm == Z_NULL || strm->state == Z_NULL || dictionary == Z_NULL || + strm->state->wrap == 2 || + (strm->state->wrap == 1 && strm->state->status != INIT_STATE)) + return Z_STREAM_ERROR; + + s = strm->state; + if (s->wrap) + strm->adler = adler32(strm->adler, dictionary, dictLength); + + if (length < MIN_MATCH) return Z_OK; + if (length > MAX_DIST(s)) { + length = MAX_DIST(s); + dictionary += dictLength - length; /* use the tail of the dictionary */ + } + (void) zmemcpy(s->window, dictionary, length); + s->strstart = length; + s->block_start = (long)length; + + /* Insert all strings in the hash table (except for the last two bytes). + * s->lookahead stays null, so s->ins_h will be recomputed at the next + * call of fill_window. 
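As a usage sketch of the two entry points above (a hypothetical helper; error handling trimmed): initialize a stream with explicit parameters via the deflateInit2() macro front end for deflateInit2_(), then prime it with a preset dictionary while it is still in INIT_STATE:

#include <string.h>
#include "zlib.h"

static int
toy_dict_init(z_stream *strm, const unsigned char *dict, unsigned dictlen)
{
	int err;

	memset(strm, 0, sizeof(*strm));	/* zalloc == 0 selects zcalloc */
	err = deflateInit2(strm, Z_DEFAULT_COMPRESSION, Z_DEFLATED,
	    15 /* windowBits */, 8 /* memLevel */, Z_DEFAULT_STRATEGY);
	if (err != Z_OK)
		return (err);
	/* Must happen before any deflate() call; only the dictionary's
	 * tail (at most MAX_DIST bytes) is hashed into the window. */
	return (deflateSetDictionary(strm, dict, dictlen));
}

With the zlib wrapper selected (windowBits 15), the dictionary's Adler-32 is later emitted in the header via the PRESET_DICT path shown in deflate() below.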
+ */ + s->ins_h = s->window[0]; + UPDATE_HASH(s, s->ins_h, s->window[1]); + for (n = 0; n <= length - MIN_MATCH; n++) { + INSERT_STRING(s, n, hash_head); + } + if (hash_head) hash_head = 0; /* to make compiler happy */ + return Z_OK; +} + +/* ========================================================================= */ +int ZEXPORT deflateReset (strm) + z_streamp strm; +{ + deflate_state *s; + + if (strm == Z_NULL || strm->state == Z_NULL || + strm->zalloc == (alloc_func)0 || strm->zfree == (free_func)0) { + return Z_STREAM_ERROR; + } + + strm->total_in = strm->total_out = 0; + strm->msg = Z_NULL; /* use zfree if we ever allocate msg dynamically */ + strm->data_type = Z_UNKNOWN; + + s = (deflate_state *)strm->state; + s->pending = 0; + s->pending_out = s->pending_buf; + + if (s->wrap < 0) { + s->wrap = -s->wrap; /* was made negative by deflate(..., Z_FINISH); */ + } + s->status = s->wrap ? INIT_STATE : BUSY_STATE; + strm->adler = +#ifdef GZIP + s->wrap == 2 ? crc32(0L, Z_NULL, 0) : +#endif + adler32(0L, Z_NULL, 0); + s->last_flush = Z_NO_FLUSH; + + _tr_init(s); + lm_init(s); + + return Z_OK; +} + +/* ========================================================================= */ +int ZEXPORT deflateSetHeader (strm, head) + z_streamp strm; + gz_headerp head; +{ + if (strm == Z_NULL || strm->state == Z_NULL) return Z_STREAM_ERROR; + if (strm->state->wrap != 2) return Z_STREAM_ERROR; + strm->state->gzhead = head; + return Z_OK; +} + +/* ========================================================================= */ +int ZEXPORT deflatePrime (strm, bits, value) + z_streamp strm; + int bits; + int value; +{ + if (strm == Z_NULL || strm->state == Z_NULL) return Z_STREAM_ERROR; + strm->state->bi_valid = bits; + strm->state->bi_buf = (ush)(value & ((1 << bits) - 1)); + return Z_OK; +} + +/* ========================================================================= */ +int ZEXPORT deflateParams(strm, level, strategy) + z_streamp strm; + int level; + int strategy; +{ + deflate_state *s; + compress_func func; + int err = Z_OK; + + if (strm == Z_NULL || strm->state == Z_NULL) return Z_STREAM_ERROR; + s = strm->state; + +#ifdef FASTEST + if (level != 0) level = 1; +#else + if (level == Z_DEFAULT_COMPRESSION) level = 6; +#endif + if (level < 0 || level > 9 || strategy < 0 || strategy > Z_FIXED) { + return Z_STREAM_ERROR; + } + func = configuration_table[s->level].func; + + if (func != configuration_table[level].func && strm->total_in != 0) { + /* Flush the last buffer: */ + err = deflate(strm, Z_PARTIAL_FLUSH); + } + if (s->level != level) { + s->level = level; + s->max_lazy_match = configuration_table[level].max_lazy; + s->good_match = configuration_table[level].good_length; + s->nice_match = configuration_table[level].nice_length; + s->max_chain_length = configuration_table[level].max_chain; + } + s->strategy = strategy; + return err; +} + +/* ========================================================================= */ +int ZEXPORT deflateTune(strm, good_length, max_lazy, nice_length, max_chain) + z_streamp strm; + int good_length; + int max_lazy; + int nice_length; + int max_chain; +{ + deflate_state *s; + + if (strm == Z_NULL || strm->state == Z_NULL) return Z_STREAM_ERROR; + s = strm->state; + s->good_match = good_length; + s->max_lazy_match = max_lazy; + s->nice_match = nice_length; + s->max_chain_length = max_chain; + return Z_OK; +} + +/* ========================================================================= + * For the default windowBits of 15 and memLevel of 8, this function returns + * a close to 
exact, as well as small, upper bound on the compressed size. + * They are coded as constants here for a reason--if the #define's are + * changed, then this function needs to be changed as well. The return + * value for 15 and 8 only works for those exact settings. + * + * For any setting other than those defaults for windowBits and memLevel, + * the value returned is a conservative worst case for the maximum expansion + * resulting from using fixed blocks instead of stored blocks, which deflate + * can emit on compressed data for some combinations of the parameters. + * + * This function could be more sophisticated to provide closer upper bounds + * for every combination of windowBits and memLevel, as well as wrap. + * But even the conservative upper bound of about 14% expansion does not + * seem onerous for output buffer allocation. + */ +uLong ZEXPORT deflateBound(strm, sourceLen) + z_streamp strm; + uLong sourceLen; +{ + deflate_state *s; + uLong destLen; + + /* conservative upper bound */ + destLen = sourceLen + + ((sourceLen + 7) >> 3) + ((sourceLen + 63) >> 6) + 11; + + /* if can't get parameters, return conservative bound */ + if (strm == Z_NULL || strm->state == Z_NULL) + return destLen; + + /* if not default parameters, return conservative bound */ + s = strm->state; + if (s->w_bits != 15 || s->hash_bits != 8 + 7) + return destLen; + + /* default settings: return tight bound for that case */ + return compressBound(sourceLen); +} + +/* ========================================================================= + * Put a short in the pending buffer. The 16-bit value is put in MSB order. + * IN assertion: the stream state is correct and there is enough room in + * pending_buf. + */ +local void putShortMSB (s, b) + deflate_state *s; + uInt b; +{ + put_byte(s, (Byte)(b >> 8)); + put_byte(s, (Byte)(b & 0xff)); +} + +/* ========================================================================= + * Flush as much pending output as possible. All deflate() output goes + * through this function so some applications may wish to modify it + * to avoid allocating a large strm->next_out buffer and copying into it. + * (See also read_buf()). 
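deflateBound() above is what callers use to size a destination buffer so a single-shot compression cannot run out of room. A hedged sketch (hypothetical helper; assumes a freshly initialized stream and input that fits in a uInt):

#include <stdlib.h>
#include "zlib.h"

static int
toy_compress_all(z_stream *strm, const unsigned char *in, uLong inlen,
    unsigned char **outp, uLong *outlenp)
{
	uLong bound = deflateBound(strm, inlen);
	unsigned char *out = malloc(bound);
	int err;

	if (out == NULL)
		return (Z_MEM_ERROR);
	strm->next_in = (Bytef *)in;
	strm->avail_in = (uInt)inlen;
	strm->next_out = out;
	strm->avail_out = (uInt)bound;

	err = deflate(strm, Z_FINISH);	/* one call; bound guarantees room */
	if (err != Z_STREAM_END) {
		free(out);
		return (err == Z_OK ? Z_BUF_ERROR : err);
	}
	*outp = out;
	*outlenp = strm->total_out;
	return (Z_OK);
}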
+ */ +local void flush_pending(strm) + z_streamp strm; +{ + unsigned len = strm->state->pending; + + if (len > strm->avail_out) len = strm->avail_out; + if (len == 0) return; + + zmemcpy(strm->next_out, strm->state->pending_out, len); + strm->next_out += len; + strm->state->pending_out += len; + strm->total_out += len; + strm->avail_out -= len; + strm->state->pending -= len; + if (strm->state->pending == 0) { + strm->state->pending_out = strm->state->pending_buf; + } +} + +/* ========================================================================= */ +int ZEXPORT deflate (strm, flush) + z_streamp strm; + int flush; +{ + int old_flush; /* value of flush param for previous deflate call */ + deflate_state *s; + + if (strm == Z_NULL || strm->state == Z_NULL || + flush > Z_FINISH || flush < 0) { + return Z_STREAM_ERROR; + } + s = strm->state; + + if (strm->next_out == Z_NULL || + (strm->next_in == Z_NULL && strm->avail_in != 0) || + (s->status == FINISH_STATE && flush != Z_FINISH)) { + ERR_RETURN(strm, Z_STREAM_ERROR); + } + if (strm->avail_out == 0) ERR_RETURN(strm, Z_BUF_ERROR); + + s->strm = strm; /* just in case */ + old_flush = s->last_flush; + s->last_flush = flush; + + /* Write the header */ + if (s->status == INIT_STATE) { +#ifdef GZIP + if (s->wrap == 2) { + strm->adler = crc32(0L, Z_NULL, 0); + put_byte(s, 31); + put_byte(s, 139); + put_byte(s, 8); + if (s->gzhead == NULL) { + put_byte(s, 0); + put_byte(s, 0); + put_byte(s, 0); + put_byte(s, 0); + put_byte(s, 0); + put_byte(s, s->level == 9 ? 2 : + (s->strategy >= Z_HUFFMAN_ONLY || s->level < 2 ? + 4 : 0)); + put_byte(s, OS_CODE); + s->status = BUSY_STATE; + } + else { + put_byte(s, (s->gzhead->text ? 1 : 0) + + (s->gzhead->hcrc ? 2 : 0) + + (s->gzhead->extra == Z_NULL ? 0 : 4) + + (s->gzhead->name == Z_NULL ? 0 : 8) + + (s->gzhead->comment == Z_NULL ? 0 : 16) + ); + put_byte(s, (Byte)(s->gzhead->time & 0xff)); + put_byte(s, (Byte)((s->gzhead->time >> 8) & 0xff)); + put_byte(s, (Byte)((s->gzhead->time >> 16) & 0xff)); + put_byte(s, (Byte)((s->gzhead->time >> 24) & 0xff)); + put_byte(s, s->level == 9 ? 2 : + (s->strategy >= Z_HUFFMAN_ONLY || s->level < 2 ? 
+ 4 : 0)); + put_byte(s, s->gzhead->os & 0xff); + if (s->gzhead->extra != NULL) { + put_byte(s, s->gzhead->extra_len & 0xff); + put_byte(s, (s->gzhead->extra_len >> 8) & 0xff); + } + if (s->gzhead->hcrc) + strm->adler = crc32(strm->adler, s->pending_buf, + s->pending); + s->gzindex = 0; + s->status = EXTRA_STATE; + } + } + else +#endif + { + uInt header = (Z_DEFLATED + ((s->w_bits-8)<<4)) << 8; + uInt level_flags; + + if (s->strategy >= Z_HUFFMAN_ONLY || s->level < 2) + level_flags = 0; + else if (s->level < 6) + level_flags = 1; + else if (s->level == 6) + level_flags = 2; + else + level_flags = 3; + header |= (level_flags << 6); + if (s->strstart != 0) header |= PRESET_DICT; + header += 31 - (header % 31); + + s->status = BUSY_STATE; + putShortMSB(s, header); + + /* Save the adler32 of the preset dictionary: */ + if (s->strstart != 0) { + putShortMSB(s, (uInt)(strm->adler >> 16)); + putShortMSB(s, (uInt)(strm->adler & 0xffff)); + } + strm->adler = adler32(0L, Z_NULL, 0); + } + } +#ifdef GZIP + if (s->status == EXTRA_STATE) { + if (s->gzhead->extra != NULL) { + uInt beg = s->pending; /* start of bytes to update crc */ + + while (s->gzindex < (s->gzhead->extra_len & 0xffff)) { + if (s->pending == s->pending_buf_size) { + if (s->gzhead->hcrc && s->pending > beg) + strm->adler = crc32(strm->adler, s->pending_buf + beg, + s->pending - beg); + flush_pending(strm); + beg = s->pending; + if (s->pending == s->pending_buf_size) + break; + } + put_byte(s, s->gzhead->extra[s->gzindex]); + s->gzindex++; + } + if (s->gzhead->hcrc && s->pending > beg) + strm->adler = crc32(strm->adler, s->pending_buf + beg, + s->pending - beg); + if (s->gzindex == s->gzhead->extra_len) { + s->gzindex = 0; + s->status = NAME_STATE; + } + } + else + s->status = NAME_STATE; + } + if (s->status == NAME_STATE) { + if (s->gzhead->name != NULL) { + uInt beg = s->pending; /* start of bytes to update crc */ + int val; + + do { + if (s->pending == s->pending_buf_size) { + if (s->gzhead->hcrc && s->pending > beg) + strm->adler = crc32(strm->adler, s->pending_buf + beg, + s->pending - beg); + flush_pending(strm); + beg = s->pending; + if (s->pending == s->pending_buf_size) { + val = 1; + break; + } + } + val = s->gzhead->name[s->gzindex++]; + put_byte(s, val); + } while (val != 0); + if (s->gzhead->hcrc && s->pending > beg) + strm->adler = crc32(strm->adler, s->pending_buf + beg, + s->pending - beg); + if (val == 0) { + s->gzindex = 0; + s->status = COMMENT_STATE; + } + } + else + s->status = COMMENT_STATE; + } + if (s->status == COMMENT_STATE) { + if (s->gzhead->comment != NULL) { + uInt beg = s->pending; /* start of bytes to update crc */ + int val; + + do { + if (s->pending == s->pending_buf_size) { + if (s->gzhead->hcrc && s->pending > beg) + strm->adler = crc32(strm->adler, s->pending_buf + beg, + s->pending - beg); + flush_pending(strm); + beg = s->pending; + if (s->pending == s->pending_buf_size) { + val = 1; + break; + } + } + val = s->gzhead->comment[s->gzindex++]; + put_byte(s, val); + } while (val != 0); + if (s->gzhead->hcrc && s->pending > beg) + strm->adler = crc32(strm->adler, s->pending_buf + beg, + s->pending - beg); + if (val == 0) + s->status = HCRC_STATE; + } + else + s->status = HCRC_STATE; + } + if (s->status == HCRC_STATE) { + if (s->gzhead->hcrc) { + if (s->pending + 2 > s->pending_buf_size) + flush_pending(strm); + if (s->pending + 2 <= s->pending_buf_size) { + put_byte(s, (Byte)(strm->adler & 0xff)); + put_byte(s, (Byte)((strm->adler >> 8) & 0xff)); + strm->adler = crc32(0L, Z_NULL, 0); + s->status = 
BUSY_STATE; + } + } + else + s->status = BUSY_STATE; + } +#endif + + /* Flush as much pending output as possible */ + if (s->pending != 0) { + flush_pending(strm); + if (strm->avail_out == 0) { + /* Since avail_out is 0, deflate will be called again with + * more output space, but possibly with both pending and + * avail_in equal to zero. There won't be anything to do, + * but this is not an error situation so make sure we + * return OK instead of BUF_ERROR at next call of deflate: + */ + s->last_flush = -1; + return Z_OK; + } + + /* Make sure there is something to do and avoid duplicate consecutive + * flushes. For repeated and useless calls with Z_FINISH, we keep + * returning Z_STREAM_END instead of Z_BUF_ERROR. + */ + } else if (strm->avail_in == 0 && flush <= old_flush && + flush != Z_FINISH) { + ERR_RETURN(strm, Z_BUF_ERROR); + } + + /* User must not provide more input after the first FINISH: */ + if (s->status == FINISH_STATE && strm->avail_in != 0) { + ERR_RETURN(strm, Z_BUF_ERROR); + } + + /* Start a new block or continue the current one. + */ + if (strm->avail_in != 0 || s->lookahead != 0 || + (flush != Z_NO_FLUSH && s->status != FINISH_STATE)) { + block_state bstate; + + bstate = (*(configuration_table[s->level].func))(s, flush); + + if (bstate == finish_started || bstate == finish_done) { + s->status = FINISH_STATE; + } + if (bstate == need_more || bstate == finish_started) { + if (strm->avail_out == 0) { + s->last_flush = -1; /* avoid BUF_ERROR next call, see above */ + } + return Z_OK; + /* If flush != Z_NO_FLUSH && avail_out == 0, the next call + * of deflate should use the same flush parameter to make sure + * that the flush is complete. So we don't have to output an + * empty block here, this will be done at next call. This also + * ensures that for a very small output buffer, we emit at most + * one empty block. + */ + } + if (bstate == block_done) { + if (flush == Z_PARTIAL_FLUSH) { + _tr_align(s); + } else { /* FULL_FLUSH or SYNC_FLUSH */ + _tr_stored_block(s, (char*)0, 0L, 0); + /* For a full flush, this empty block will be recognized + * as a special marker by inflate_sync(). + */ + if (flush == Z_FULL_FLUSH) { + CLEAR_HASH(s); /* forget history */ + } + } + flush_pending(strm); + if (strm->avail_out == 0) { + s->last_flush = -1; /* avoid BUF_ERROR at next call, see above */ + return Z_OK; + } + } + } + Assert(strm->avail_out > 0, "bug2"); + + if (flush != Z_FINISH) return Z_OK; + if (s->wrap <= 0) return Z_STREAM_END; + + /* Write the trailer */ +#ifdef GZIP + if (s->wrap == 2) { + put_byte(s, (Byte)(strm->adler & 0xff)); + put_byte(s, (Byte)((strm->adler >> 8) & 0xff)); + put_byte(s, (Byte)((strm->adler >> 16) & 0xff)); + put_byte(s, (Byte)((strm->adler >> 24) & 0xff)); + put_byte(s, (Byte)(strm->total_in & 0xff)); + put_byte(s, (Byte)((strm->total_in >> 8) & 0xff)); + put_byte(s, (Byte)((strm->total_in >> 16) & 0xff)); + put_byte(s, (Byte)((strm->total_in >> 24) & 0xff)); + } + else +#endif + { + putShortMSB(s, (uInt)(strm->adler >> 16)); + putShortMSB(s, (uInt)(strm->adler & 0xffff)); + } + flush_pending(strm); + /* If avail_out is zero, the application will call deflate again + * to flush the rest. + */ + if (s->wrap > 0) s->wrap = -s->wrap; /* write the trailer only once! */ + return s->pending != 0 ? 
Z_OK : Z_STREAM_END; +} + +/* ========================================================================= */ +int ZEXPORT deflateEnd (strm) + z_streamp strm; +{ + int status; + + if (strm == Z_NULL || strm->state == Z_NULL) return Z_STREAM_ERROR; + + status = strm->state->status; + if (status != INIT_STATE && + status != EXTRA_STATE && + status != NAME_STATE && + status != COMMENT_STATE && + status != HCRC_STATE && + status != BUSY_STATE && + status != FINISH_STATE) { + return Z_STREAM_ERROR; + } + + /* Deallocate in reverse order of allocations: */ + TRY_FREE(strm, strm->state->pending_buf); + TRY_FREE(strm, strm->state->head); + TRY_FREE(strm, strm->state->prev); + TRY_FREE(strm, strm->state->window); + + ZFREE(strm, strm->state); + strm->state = Z_NULL; + + return status == BUSY_STATE ? Z_DATA_ERROR : Z_OK; +} + +/* ========================================================================= + * Copy the source state to the destination state. + * To simplify the source, this is not supported for 16-bit MSDOS (which + * doesn't have enough memory anyway to duplicate compression states). + */ +int ZEXPORT deflateCopy (dest, source) + z_streamp dest; + z_streamp source; +{ +#ifdef MAXSEG_64K + return Z_STREAM_ERROR; +#else + deflate_state *ds; + deflate_state *ss; + ushf *overlay; + + + if (source == Z_NULL || dest == Z_NULL || source->state == Z_NULL) { + return Z_STREAM_ERROR; + } + + ss = source->state; + + zmemcpy(dest, source, sizeof(z_stream)); + + ds = (deflate_state *) ZALLOC(dest, 1, sizeof(deflate_state)); + if (ds == Z_NULL) return Z_MEM_ERROR; + dest->state = (struct internal_state FAR *) ds; + zmemcpy(ds, ss, sizeof(deflate_state)); + ds->strm = dest; + + ds->window = (Bytef *) ZALLOC(dest, ds->w_size, 2*sizeof(Byte)); + ds->prev = (Posf *) ZALLOC(dest, ds->w_size, sizeof(Pos)); + ds->head = (Posf *) ZALLOC(dest, ds->hash_size, sizeof(Pos)); + overlay = (ushf *) ZALLOC(dest, ds->lit_bufsize, sizeof(ush)+2); + ds->pending_buf = (uchf *) overlay; + + if (ds->window == Z_NULL || ds->prev == Z_NULL || ds->head == Z_NULL || + ds->pending_buf == Z_NULL) { + deflateEnd (dest); + return Z_MEM_ERROR; + } + /* following zmemcpy do not work for 16-bit MSDOS */ + zmemcpy(ds->window, ss->window, ds->w_size * 2 * sizeof(Byte)); + zmemcpy(ds->prev, ss->prev, ds->w_size * sizeof(Pos)); + zmemcpy(ds->head, ss->head, ds->hash_size * sizeof(Pos)); + zmemcpy(ds->pending_buf, ss->pending_buf, (uInt)ds->pending_buf_size); + + ds->pending_out = ds->pending_buf + (ss->pending_out - ss->pending_buf); + ds->d_buf = overlay + ds->lit_bufsize/sizeof(ush); + ds->l_buf = ds->pending_buf + (1+sizeof(ush))*ds->lit_bufsize; + + ds->l_desc.dyn_tree = ds->dyn_ltree; + ds->d_desc.dyn_tree = ds->dyn_dtree; + ds->bl_desc.dyn_tree = ds->bl_tree; + + return Z_OK; +#endif /* MAXSEG_64K */ +} + +/* =========================================================================== + * Read a new buffer from the current input stream, update the adler32 + * and total number of bytes read. All deflate() input goes through + * this function so some applications may wish to modify it to avoid + * allocating a large strm->next_in buffer and copying from it. + * (See also flush_pending()). 
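read_buf() below and flush_pending() above are the two ends of deflate()'s internal buffering; the calling pattern they serve is the classic chunked loop, sketched here along the lines of zlib's zpipe example (names are illustrative):

#include <stdio.h>
#include "zlib.h"

#define CHUNK 16384

static int
toy_deflate_stream(z_stream *strm, FILE *src, FILE *dst)
{
	unsigned char in[CHUNK], out[CHUNK];
	unsigned have;
	int err, flush;

	do {
		strm->avail_in = (uInt)fread(in, 1, CHUNK, src);
		if (ferror(src))
			return (Z_ERRNO);
		flush = feof(src) ? Z_FINISH : Z_NO_FLUSH;
		strm->next_in = in;
		do {	/* drain all pending output for this chunk */
			strm->avail_out = CHUNK;
			strm->next_out = out;
			err = deflate(strm, flush);
			have = CHUNK - strm->avail_out;
			if (fwrite(out, 1, have, dst) != have)
				return (Z_ERRNO);
		} while (strm->avail_out == 0);
	} while (flush != Z_FINISH);
	return (err == Z_STREAM_END ? Z_OK : err);
}

The inner loop runs until avail_out stays nonzero, which is exactly the condition under which deflate() promises all pending output has been flushed.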
+ */ +local int read_buf(strm, buf, size) + z_streamp strm; + Bytef *buf; + unsigned size; +{ + unsigned len = strm->avail_in; + + if (len > size) len = size; + if (len == 0) return 0; + + strm->avail_in -= len; + + if (strm->state->wrap == 1) { + strm->adler = adler32(strm->adler, strm->next_in, len); + } +#ifdef GZIP + else if (strm->state->wrap == 2) { + strm->adler = crc32(strm->adler, strm->next_in, len); + } +#endif + zmemcpy(buf, strm->next_in, len); + strm->next_in += len; + strm->total_in += len; + + return (int)len; +} + +/* =========================================================================== + * Initialize the "longest match" routines for a new zlib stream + */ +local void lm_init (s) + deflate_state *s; +{ + s->window_size = (ulg)2L*s->w_size; + + CLEAR_HASH(s); + + /* Set the default configuration parameters: + */ + s->max_lazy_match = configuration_table[s->level].max_lazy; + s->good_match = configuration_table[s->level].good_length; + s->nice_match = configuration_table[s->level].nice_length; + s->max_chain_length = configuration_table[s->level].max_chain; + + s->strstart = 0; + s->block_start = 0L; + s->lookahead = 0; + s->match_length = s->prev_length = MIN_MATCH-1; + s->match_available = 0; + s->ins_h = 0; +#ifndef FASTEST +#ifdef ASMV + match_init(); /* initialize the asm code */ +#endif +#endif +} + +#ifndef FASTEST +/* =========================================================================== + * Set match_start to the longest match starting at the given string and + * return its length. Matches shorter or equal to prev_length are discarded, + * in which case the result is equal to prev_length and match_start is + * garbage. + * IN assertions: cur_match is the head of the hash chain for the current + * string (strstart) and its distance is <= MAX_DIST, and prev_length >= 1 + * OUT assertion: the match length is not greater than s->lookahead. + */ +#ifndef ASMV +/* For 80x86 and 680x0, an optimized version will be provided in match.asm or + * match.S. The code will be functionally equivalent. + */ +local uInt longest_match(s, cur_match) + deflate_state *s; + IPos cur_match; /* current match */ +{ + unsigned chain_length = s->max_chain_length;/* max hash chain length */ + register Bytef *scan = s->window + s->strstart; /* current string */ + register Bytef *match; /* matched string */ + register int len; /* length of current match */ + int best_len = s->prev_length; /* best match length so far */ + int nice_match = s->nice_match; /* stop if match long enough */ + IPos limit = s->strstart > (IPos)MAX_DIST(s) ? + s->strstart - (IPos)MAX_DIST(s) : NIL; + /* Stop when cur_match becomes <= limit. To simplify the code, + * we prevent matches with the string of window index 0. + */ + Posf *prev = s->prev; + uInt wmask = s->w_mask; + +#ifdef UNALIGNED_OK + /* Compare two bytes at a time. Note: this is not always beneficial. + * Try with and without -DUNALIGNED_OK to check. + */ + register Bytef *strend = s->window + s->strstart + MAX_MATCH - 1; + register ush scan_start = *(ushf*)scan; + register ush scan_end = *(ushf*)(scan+best_len-1); +#else + register Bytef *strend = s->window + s->strstart + MAX_MATCH; + register Byte scan_end1 = scan[best_len-1]; + register Byte scan_end = scan[best_len]; +#endif + + /* The code is optimized for HASH_BITS >= 8 and MAX_MATCH-2 multiple of 16. + * It is easy to get rid of this optimization if necessary. 
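Stripped of the unrolled comparisons and early-exit heuristics that follow, the computation longest_match() performs reduces to this (an illustrative reduction, not the shipped code):

/* Walk the prev[] chain from the newest candidate, never farther back
 * than limit, spending at most max_chain probes; the match length is
 * capped at lookahead so deflate stays deterministic. */
static unsigned
toy_chain_match(const unsigned char *win, unsigned strstart,
    unsigned lookahead, unsigned cur_match, const unsigned *prev,
    unsigned wmask, unsigned limit, unsigned max_chain,
    unsigned *match_start)
{
	unsigned best_len = 0;

	for (; cur_match > limit && max_chain-- != 0;
	    cur_match = prev[cur_match & wmask]) {
		unsigned len = 0;

		while (len < lookahead &&
		    win[cur_match + len] == win[strstart + len])
			len++;
		if (len > best_len) {
			best_len = len;
			*match_start = cur_match;
		}
	}
	return (best_len);
}

The real function additionally caps the scan at MAX_MATCH bytes, skips candidates that cannot beat best_len, and quarters the chain budget when a good match is already in hand.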
+ */ + Assert(s->hash_bits >= 8 && MAX_MATCH == 258, "Code too clever"); + + /* Do not waste too much time if we already have a good match: */ + if (s->prev_length >= s->good_match) { + chain_length >>= 2; + } + /* Do not look for matches beyond the end of the input. This is necessary + * to make deflate deterministic. + */ + if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead; + + Assert((ulg)s->strstart <= s->window_size-MIN_LOOKAHEAD, "need lookahead"); + + do { + Assert(cur_match < s->strstart, "no future"); + match = s->window + cur_match; + + /* Skip to next match if the match length cannot increase + * or if the match length is less than 2. Note that the checks below + * for insufficient lookahead only occur occasionally for performance + * reasons. Therefore uninitialized memory will be accessed, and + * conditional jumps will be made that depend on those values. + * However the length of the match is limited to the lookahead, so + * the output of deflate is not affected by the uninitialized values. + */ +#if (defined(UNALIGNED_OK) && MAX_MATCH == 258) + /* This code assumes sizeof(unsigned short) == 2. Do not use + * UNALIGNED_OK if your compiler uses a different size. + */ + if (*(ushf*)(match+best_len-1) != scan_end || + *(ushf*)match != scan_start) continue; + + /* It is not necessary to compare scan[2] and match[2] since they are + * always equal when the other bytes match, given that the hash keys + * are equal and that HASH_BITS >= 8. Compare 2 bytes at a time at + * strstart+3, +5, ... up to strstart+257. We check for insufficient + * lookahead only every 4th comparison; the 128th check will be made + * at strstart+257. If MAX_MATCH-2 is not a multiple of 8, it is + * necessary to put more guard bytes at the end of the window, or + * to check more often for insufficient lookahead. + */ + Assert(scan[2] == match[2], "scan[2]?"); + scan++, match++; + do { + } while (*(ushf*)(scan+=2) == *(ushf*)(match+=2) && + *(ushf*)(scan+=2) == *(ushf*)(match+=2) && + *(ushf*)(scan+=2) == *(ushf*)(match+=2) && + *(ushf*)(scan+=2) == *(ushf*)(match+=2) && + scan < strend); + /* The funny "do {}" generates better code on most compilers */ + + /* Here, scan <= window+strstart+257 */ + Assert(scan <= s->window+(unsigned)(s->window_size-1), "wild scan"); + if (*scan == *match) scan++; + + len = (MAX_MATCH - 1) - (int)(strend-scan); + scan = strend - (MAX_MATCH-1); + +#else /* UNALIGNED_OK */ + + if (match[best_len] != scan_end || + match[best_len-1] != scan_end1 || + *match != *scan || + *++match != scan[1]) continue; + + /* The check at best_len-1 can be removed because it will be made + * again later. (This heuristic is not always a win.) + * It is not necessary to compare scan[2] and match[2] since they + * are always equal when the other bytes match, given that + * the hash keys are equal and that HASH_BITS >= 8. + */ + scan += 2, match++; + Assert(*scan == *match, "match[2]?"); + + /* We check for insufficient lookahead only every 8th comparison; + * the 256th check will be made at strstart+258. 
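Concretely (MAX_MATCH == 258 is asserted at the top of this function): the loop below enters with scan at strstart+2 and advances it eight bytes per iteration, and since 256 is a multiple of eight, a fully matching scan exits exactly at strend = window + strstart + MAX_MATCH; len then comes out as MAX_MATCH and scan never runs past the window's guarded end.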
+ */ + do { + } while (*++scan == *++match && *++scan == *++match && + *++scan == *++match && *++scan == *++match && + *++scan == *++match && *++scan == *++match && + *++scan == *++match && *++scan == *++match && + scan < strend); + + Assert(scan <= s->window+(unsigned)(s->window_size-1), "wild scan"); + + len = MAX_MATCH - (int)(strend - scan); + scan = strend - MAX_MATCH; + +#endif /* UNALIGNED_OK */ + + if (len > best_len) { + s->match_start = cur_match; + best_len = len; + if (len >= nice_match) break; +#ifdef UNALIGNED_OK + scan_end = *(ushf*)(scan+best_len-1); +#else + scan_end1 = scan[best_len-1]; + scan_end = scan[best_len]; +#endif + } + } while ((cur_match = prev[cur_match & wmask]) > limit + && --chain_length != 0); + + if ((uInt)best_len <= s->lookahead) return (uInt)best_len; + return s->lookahead; +} +#endif /* ASMV */ +#endif /* FASTEST */ + +/* --------------------------------------------------------------------------- + * Optimized version for level == 1 or strategy == Z_RLE only + */ +local uInt longest_match_fast(s, cur_match) + deflate_state *s; + IPos cur_match; /* current match */ +{ + register Bytef *scan = s->window + s->strstart; /* current string */ + register Bytef *match; /* matched string */ + register int len; /* length of current match */ + register Bytef *strend = s->window + s->strstart + MAX_MATCH; + + /* The code is optimized for HASH_BITS >= 8 and MAX_MATCH-2 multiple of 16. + * It is easy to get rid of this optimization if necessary. + */ + Assert(s->hash_bits >= 8 && MAX_MATCH == 258, "Code too clever"); + + Assert((ulg)s->strstart <= s->window_size-MIN_LOOKAHEAD, "need lookahead"); + + Assert(cur_match < s->strstart, "no future"); + + match = s->window + cur_match; + + /* Return failure if the match length is less than 2: + */ + if (match[0] != scan[0] || match[1] != scan[1]) return MIN_MATCH-1; + + /* The check at best_len-1 can be removed because it will be made + * again later. (This heuristic is not always a win.) + * It is not necessary to compare scan[2] and match[2] since they + * are always equal when the other bytes match, given that + * the hash keys are equal and that HASH_BITS >= 8. + */ + scan += 2, match += 2; + Assert(*scan == *match, "match[2]?"); + + /* We check for insufficient lookahead only every 8th comparison; + * the 256th check will be made at strstart+258. + */ + do { + } while (*++scan == *++match && *++scan == *++match && + *++scan == *++match && *++scan == *++match && + *++scan == *++match && *++scan == *++match && + *++scan == *++match && *++scan == *++match && + scan < strend); + + Assert(scan <= s->window+(unsigned)(s->window_size-1), "wild scan"); + + len = MAX_MATCH - (int)(strend - scan); + + if (len < MIN_MATCH) return MIN_MATCH - 1; + + s->match_start = cur_match; + return (uInt)len <= s->lookahead ? (uInt)len : s->lookahead; +} + +#ifdef DEBUG +/* =========================================================================== + * Check that the match at match_start is indeed a match. 
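+ * Only compiled when DEBUG is defined; otherwise the macro definition
+ * below turns check_match() into a no-op.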
+ */
+local void check_match(s, start, match, length)
+    deflate_state *s;
+    IPos start, match;
+    int length;
+{
+    /* check that the match is indeed a match */
+    if (zmemcmp(s->window + match,
+                s->window + start, length) != EQUAL) {
+        fprintf(stderr, " start %u, match %u, length %d\n",
+                start, match, length);
+        do {
+            fprintf(stderr, "%c%c", s->window[match++], s->window[start++]);
+        } while (--length != 0);
+        z_error("invalid match");
+    }
+    if (z_verbose > 1) {
+        fprintf(stderr,"\\[%d,%d]", start-match, length);
+        do { putc(s->window[start++], stderr); } while (--length != 0);
+    }
+}
+#else
+#  define check_match(s, start, match, length)
+#endif /* DEBUG */
+
+/* ===========================================================================
+ * Fill the window when the lookahead becomes insufficient.
+ * Updates strstart and lookahead.
+ *
+ * IN assertion: lookahead < MIN_LOOKAHEAD
+ * OUT assertions: strstart <= window_size-MIN_LOOKAHEAD
+ *    At least one byte has been read, or avail_in == 0; reads are
+ *    performed for at least two bytes (required for the zip translate_eol
+ *    option -- not supported here).
+ */
+local void fill_window(s)
+    deflate_state *s;
+{
+    register unsigned n, m;
+    register Posf *p;
+    unsigned more;    /* Amount of free space at the end of the window. */
+    uInt wsize = s->w_size;
+
+    do {
+        more = (unsigned)(s->window_size -(ulg)s->lookahead -(ulg)s->strstart);
+
+        /* Deal with !@#$% 64K limit: */
+        if (sizeof(int) <= 2) {
+            if (more == 0 && s->strstart == 0 && s->lookahead == 0) {
+                more = wsize;
+
+            } else if (more == (unsigned)(-1)) {
+                /* Very unlikely, but possible on 16 bit machine if
+                 * strstart == 0 && lookahead == 1 (input done a byte at a time)
+                 */
+                more--;
+            }
+        }
+
+        /* If the window is almost full and there is insufficient lookahead,
+         * move the upper half to the lower one to make room in the upper half.
+         */
+        if (s->strstart >= wsize+MAX_DIST(s)) {
+
+            zmemcpy(s->window, s->window+wsize, (unsigned)wsize);
+            s->match_start -= wsize;
+            s->strstart    -= wsize; /* we now have strstart >= MAX_DIST */
+            s->block_start -= (long) wsize;
+
+            /* Slide the hash table (could be avoided with 32 bit values
+               at the expense of memory usage). We slide even when level == 0
+               to keep the hash table consistent if we switch back to level > 0
+               later. (Using level 0 permanently is not an optimal usage of
+               zlib, so we don't care about this pathological case.)
+             */
+            /* %%% avoid this when Z_RLE */
+            n = s->hash_size;
+            p = &s->head[n];
+            do {
+                m = *--p;
+                *p = (Pos)(m >= wsize ? m-wsize : NIL);
+            } while (--n);
+
+            n = wsize;
+#ifndef FASTEST
+            p = &s->prev[n];
+            do {
+                m = *--p;
+                *p = (Pos)(m >= wsize ? m-wsize : NIL);
+                /* If n is not on any hash chain, prev[n] is garbage but
+                 * its value will never be used.
+                 */
+            } while (--n);
+#endif
+            more += wsize;
+        }
+        if (s->strm->avail_in == 0) return;
+
+        /* If there was no sliding:
+         *    strstart <= WSIZE+MAX_DIST-1 && lookahead <= MIN_LOOKAHEAD - 1 &&
+         *    more == window_size - lookahead - strstart
+         * => more >= window_size - (MIN_LOOKAHEAD-1 + WSIZE + MAX_DIST-1)
+         * => more >= window_size - 2*WSIZE + 2
+         * In the BIG_MEM or MMAP case (not yet supported),
+         *   window_size == input_size + MIN_LOOKAHEAD  &&
+         *   strstart + s->lookahead <= input_size => more >= MIN_LOOKAHEAD.
+         * Otherwise, window_size == 2*WSIZE so more >= 2.
+         * If there was sliding, more >= WSIZE. So in all cases, more >= 2.
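+         * With the default w_bits == 15: WSIZE == 32768, window_size == 65536
+         * and MIN_LOOKAHEAD == 262, so the no-sliding bound works out to
+         * 65536 - 2*32768 + 2 == 2.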
+ */ + Assert(more >= 2, "more < 2"); + + n = read_buf(s->strm, s->window + s->strstart + s->lookahead, more); + s->lookahead += n; + + /* Initialize the hash value now that we have some input: */ + if (s->lookahead >= MIN_MATCH) { + s->ins_h = s->window[s->strstart]; + UPDATE_HASH(s, s->ins_h, s->window[s->strstart+1]); +#if MIN_MATCH != 3 + Call UPDATE_HASH() MIN_MATCH-3 more times +#endif + } + /* If the whole input has less than MIN_MATCH bytes, ins_h is garbage, + * but this is not important since only literal bytes will be emitted. + */ + + } while (s->lookahead < MIN_LOOKAHEAD && s->strm->avail_in != 0); +} + +/* =========================================================================== + * Flush the current block, with given end-of-file flag. + * IN assertion: strstart is set to the end of the current match. + */ +#define FLUSH_BLOCK_ONLY(s, eof) { \ + _tr_flush_block(s, (s->block_start >= 0L ? \ + (charf *)&s->window[(unsigned)s->block_start] : \ + (charf *)Z_NULL), \ + (ulg)((long)s->strstart - s->block_start), \ + (eof)); \ + s->block_start = s->strstart; \ + flush_pending(s->strm); \ + Tracev((stderr,"[FLUSH]")); \ +} + +/* Same but force premature exit if necessary. */ +#define FLUSH_BLOCK(s, eof) { \ + FLUSH_BLOCK_ONLY(s, eof); \ + if (s->strm->avail_out == 0) return (eof) ? finish_started : need_more; \ +} + +/* =========================================================================== + * Copy without compression as much as possible from the input stream, return + * the current block state. + * This function does not insert new strings in the dictionary since + * uncompressible data is probably not useful. This function is used + * only for the level=0 compression option. + * NOTE: this function should be optimized to avoid extra copying from + * window to pending_buf. + */ +local block_state deflate_stored(s, flush) + deflate_state *s; + int flush; +{ + /* Stored blocks are limited to 0xffff bytes, pending_buf is limited + * to pending_buf_size, and each stored block has a 5 byte header: + */ + ulg max_block_size = 0xffff; + ulg max_start; + + if (max_block_size > s->pending_buf_size - 5) { + max_block_size = s->pending_buf_size - 5; + } + + /* Copy as much as possible from input to output: */ + for (;;) { + /* Fill the window as much as possible: */ + if (s->lookahead <= 1) { + + Assert(s->strstart < s->w_size+MAX_DIST(s) || + s->block_start >= (long)s->w_size, "slide too late"); + + fill_window(s); + if (s->lookahead == 0 && flush == Z_NO_FLUSH) return need_more; + + if (s->lookahead == 0) break; /* flush the current block */ + } + Assert(s->block_start >= 0L, "block gone"); + + s->strstart += s->lookahead; + s->lookahead = 0; + + /* Emit a stored block if pending_buf will be full: */ + max_start = s->block_start + max_block_size; + if (s->strstart == 0 || (ulg)s->strstart >= max_start) { + /* strstart == 0 is possible when wraparound on 16-bit machine */ + s->lookahead = (uInt)(s->strstart - max_start); + s->strstart = (uInt)max_start; + FLUSH_BLOCK(s, 0); + } + /* Flush if we may have to slide, otherwise block_start may become + * negative and the data will be gone: + */ + if (s->strstart - (uInt)s->block_start >= MAX_DIST(s)) { + FLUSH_BLOCK(s, 0); + } + } + FLUSH_BLOCK(s, flush == Z_FINISH); + return flush == Z_FINISH ? finish_done : block_done; +} + +/* =========================================================================== + * Compress as much as possible from the input stream, return the current + * block state. 
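+ * (The block_state values are need_more, block_done, finish_started and
+ * finish_done; the FLUSH_BLOCK macro above forces an early return with
+ * need_more or finish_started when the output buffer fills up.)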
+ * This function does not perform lazy evaluation of matches and inserts + * new strings in the dictionary only for unmatched strings or for short + * matches. It is used only for the fast compression options. + */ +local block_state deflate_fast(s, flush) + deflate_state *s; + int flush; +{ + IPos hash_head = NIL; /* head of the hash chain */ + int bflush; /* set if current block must be flushed */ + + for (;;) { + /* Make sure that we always have enough lookahead, except + * at the end of the input file. We need MAX_MATCH bytes + * for the next match, plus MIN_MATCH bytes to insert the + * string following the next match. + */ + if (s->lookahead < MIN_LOOKAHEAD) { + fill_window(s); + if (s->lookahead < MIN_LOOKAHEAD && flush == Z_NO_FLUSH) { + return need_more; + } + if (s->lookahead == 0) break; /* flush the current block */ + } + + /* Insert the string window[strstart .. strstart+2] in the + * dictionary, and set hash_head to the head of the hash chain: + */ + if (s->lookahead >= MIN_MATCH) { + INSERT_STRING(s, s->strstart, hash_head); + } + + /* Find the longest match, discarding those <= prev_length. + * At this point we have always match_length < MIN_MATCH + */ + if (hash_head != NIL && s->strstart - hash_head <= MAX_DIST(s)) { + /* To simplify the code, we prevent matches with the string + * of window index 0 (in particular we have to avoid a match + * of the string with itself at the start of the input file). + */ +#ifdef FASTEST + if ((s->strategy != Z_HUFFMAN_ONLY && s->strategy != Z_RLE) || + (s->strategy == Z_RLE && s->strstart - hash_head == 1)) { + s->match_length = longest_match_fast (s, hash_head); + } +#else + if (s->strategy != Z_HUFFMAN_ONLY && s->strategy != Z_RLE) { + s->match_length = longest_match (s, hash_head); + } else if (s->strategy == Z_RLE && s->strstart - hash_head == 1) { + s->match_length = longest_match_fast (s, hash_head); + } +#endif + /* longest_match() or longest_match_fast() sets match_start */ + } + if (s->match_length >= MIN_MATCH) { + check_match(s, s->strstart, s->match_start, s->match_length); + + _tr_tally_dist(s, s->strstart - s->match_start, + s->match_length - MIN_MATCH, bflush); + + s->lookahead -= s->match_length; + + /* Insert new strings in the hash table only if the match length + * is not too large. This saves time but degrades compression. + */ +#ifndef FASTEST + if (s->match_length <= s->max_insert_length && + s->lookahead >= MIN_MATCH) { + s->match_length--; /* string at strstart already in table */ + do { + s->strstart++; + INSERT_STRING(s, s->strstart, hash_head); + /* strstart never exceeds WSIZE-MAX_MATCH, so there are + * always MIN_MATCH bytes ahead. + */ + } while (--s->match_length != 0); + s->strstart++; + } else +#endif + { + s->strstart += s->match_length; + s->match_length = 0; + s->ins_h = s->window[s->strstart]; + UPDATE_HASH(s, s->ins_h, s->window[s->strstart+1]); +#if MIN_MATCH != 3 + Call UPDATE_HASH() MIN_MATCH-3 more times +#endif + /* If lookahead < MIN_MATCH, ins_h is garbage, but it does not + * matter since it will be recomputed at next deflate call. + */ + } + } else { + /* No match, output a literal byte */ + Tracevv((stderr,"%c", s->window[s->strstart])); + _tr_tally_lit (s, s->window[s->strstart], bflush); + s->lookahead--; + s->strstart++; + } + if (bflush) FLUSH_BLOCK(s, 0); + } + FLUSH_BLOCK(s, flush == Z_FINISH); + return flush == Z_FINISH ? 
finish_done : block_done; +} + +#ifndef FASTEST +/* =========================================================================== + * Same as above, but achieves better compression. We use a lazy + * evaluation for matches: a match is finally adopted only if there is + * no better match at the next window position. + */ +local block_state deflate_slow(s, flush) + deflate_state *s; + int flush; +{ + IPos hash_head = NIL; /* head of hash chain */ + int bflush; /* set if current block must be flushed */ + + /* Process the input block. */ + for (;;) { + /* Make sure that we always have enough lookahead, except + * at the end of the input file. We need MAX_MATCH bytes + * for the next match, plus MIN_MATCH bytes to insert the + * string following the next match. + */ + if (s->lookahead < MIN_LOOKAHEAD) { + fill_window(s); + if (s->lookahead < MIN_LOOKAHEAD && flush == Z_NO_FLUSH) { + return need_more; + } + if (s->lookahead == 0) break; /* flush the current block */ + } + + /* Insert the string window[strstart .. strstart+2] in the + * dictionary, and set hash_head to the head of the hash chain: + */ + if (s->lookahead >= MIN_MATCH) { + INSERT_STRING(s, s->strstart, hash_head); + } + + /* Find the longest match, discarding those <= prev_length. + */ + s->prev_length = s->match_length, s->prev_match = s->match_start; + s->match_length = MIN_MATCH-1; + + if (hash_head != NIL && s->prev_length < s->max_lazy_match && + s->strstart - hash_head <= MAX_DIST(s)) { + /* To simplify the code, we prevent matches with the string + * of window index 0 (in particular we have to avoid a match + * of the string with itself at the start of the input file). + */ + if (s->strategy != Z_HUFFMAN_ONLY && s->strategy != Z_RLE) { + s->match_length = longest_match (s, hash_head); + } else if (s->strategy == Z_RLE && s->strstart - hash_head == 1) { + s->match_length = longest_match_fast (s, hash_head); + } + /* longest_match() or longest_match_fast() sets match_start */ + + if (s->match_length <= 5 && (s->strategy == Z_FILTERED +#if TOO_FAR <= 32767 + || (s->match_length == MIN_MATCH && + s->strstart - s->match_start > TOO_FAR) +#endif + )) { + + /* If prev_match is also MIN_MATCH, match_start is garbage + * but we will ignore the current match anyway. + */ + s->match_length = MIN_MATCH-1; + } + } + /* If there was a match at the previous step and the current + * match is not better, output the previous match: + */ + if (s->prev_length >= MIN_MATCH && s->match_length <= s->prev_length) { + uInt max_insert = s->strstart + s->lookahead - MIN_MATCH; + /* Do not insert strings in hash table beyond this. */ + + check_match(s, s->strstart-1, s->prev_match, s->prev_length); + + _tr_tally_dist(s, s->strstart -1 - s->prev_match, + s->prev_length - MIN_MATCH, bflush); + + /* Insert in hash table all strings up to the end of the match. + * strstart-1 and strstart are already inserted. If there is not + * enough lookahead, the last two strings are not inserted in + * the hash table. + */ + s->lookahead -= s->prev_length-1; + s->prev_length -= 2; + do { + if (++s->strstart <= max_insert) { + INSERT_STRING(s, s->strstart, hash_head); + } + } while (--s->prev_length != 0); + s->match_available = 0; + s->match_length = MIN_MATCH-1; + s->strstart++; + + if (bflush) FLUSH_BLOCK(s, 0); + + } else if (s->match_available) { + /* If there was no match at the previous position, output a + * single literal. If there was a match but the current match + * is longer, truncate the previous match to a single literal. 
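+             * For example, if a 3-byte match was found at strstart-1 and a
+             * 5-byte match is now found at strstart, the byte at strstart-1
+             * is emitted as a literal and the longer match is kept for the
+             * next iteration.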
+ */ + Tracevv((stderr,"%c", s->window[s->strstart-1])); + _tr_tally_lit(s, s->window[s->strstart-1], bflush); + if (bflush) { + FLUSH_BLOCK_ONLY(s, 0); + } + s->strstart++; + s->lookahead--; + if (s->strm->avail_out == 0) return need_more; + } else { + /* There is no previous match to compare with, wait for + * the next step to decide. + */ + s->match_available = 1; + s->strstart++; + s->lookahead--; + } + } + Assert (flush != Z_NO_FLUSH, "no flush?"); + if (s->match_available) { + Tracevv((stderr,"%c", s->window[s->strstart-1])); + _tr_tally_lit(s, s->window[s->strstart-1], bflush); + s->match_available = 0; + } + FLUSH_BLOCK(s, flush == Z_FINISH); + return flush == Z_FINISH ? finish_done : block_done; +} +#endif /* FASTEST */ + +#if 0 +/* =========================================================================== + * For Z_RLE, simply look for runs of bytes, generate matches only of distance + * one. Do not maintain a hash table. (It will be regenerated if this run of + * deflate switches away from Z_RLE.) + */ +local block_state deflate_rle(s, flush) + deflate_state *s; + int flush; +{ + int bflush; /* set if current block must be flushed */ + uInt run; /* length of run */ + uInt max; /* maximum length of run */ + uInt prev; /* byte at distance one to match */ + Bytef *scan; /* scan for end of run */ + + for (;;) { + /* Make sure that we always have enough lookahead, except + * at the end of the input file. We need MAX_MATCH bytes + * for the longest encodable run. + */ + if (s->lookahead < MAX_MATCH) { + fill_window(s); + if (s->lookahead < MAX_MATCH && flush == Z_NO_FLUSH) { + return need_more; + } + if (s->lookahead == 0) break; /* flush the current block */ + } + + /* See how many times the previous byte repeats */ + run = 0; + if (s->strstart > 0) { /* if there is a previous byte, that is */ + max = s->lookahead < MAX_MATCH ? s->lookahead : MAX_MATCH; + scan = s->window + s->strstart - 1; + prev = *scan++; + do { + if (*scan++ != prev) + break; + } while (++run < max); + } + + /* Emit match if have run of MIN_MATCH or longer, else emit literal */ + if (run >= MIN_MATCH) { + check_match(s, s->strstart, s->strstart - 1, run); + _tr_tally_dist(s, 1, run - MIN_MATCH, bflush); + s->lookahead -= run; + s->strstart += run; + } else { + /* No match, output a literal byte */ + Tracevv((stderr,"%c", s->window[s->strstart])); + _tr_tally_lit (s, s->window[s->strstart], bflush); + s->lookahead--; + s->strstart++; + } + if (bflush) FLUSH_BLOCK(s, 0); + } + FLUSH_BLOCK(s, flush == Z_FINISH); + return flush == Z_FINISH ? finish_done : block_done; +} +#endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/zmod/deflate.h b/sys/cddl/contrib/opensolaris/uts/common/zmod/deflate.h new file mode 100644 index 000000000000..d01a3c10e449 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/zmod/deflate.h @@ -0,0 +1,331 @@ +/* deflate.h -- internal compression state + * Copyright (C) 1995-2004 Jean-loup Gailly + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +/* WARNING: this file should *not* be used by applications. It is + part of the implementation of the compression library and is + subject to change. Applications should only use zlib.h. + */ + +#ifndef _DEFLATE_H +#define _DEFLATE_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include "zutil.h" + +/* define NO_GZIP when compiling if you want to disable gzip header and + trailer creation by deflate(). NO_GZIP would be used to avoid linking in + the crc code when it is not needed. 
For shared libraries, gzip encoding + should be left enabled. */ +#ifndef NO_GZIP +# define GZIP +#endif + +/* =========================================================================== + * Internal compression state. + */ + +#define LENGTH_CODES 29 +/* number of length codes, not counting the special END_BLOCK code */ + +#define LITERALS 256 +/* number of literal bytes 0..255 */ + +#define L_CODES (LITERALS+1+LENGTH_CODES) +/* number of Literal or Length codes, including the END_BLOCK code */ + +#define D_CODES 30 +/* number of distance codes */ + +#define BL_CODES 19 +/* number of codes used to transfer the bit lengths */ + +#define HEAP_SIZE (2*L_CODES+1) +/* maximum heap size */ + +#define MAX_BITS 15 +/* All codes must not exceed MAX_BITS bits */ + +#define INIT_STATE 42 +#define EXTRA_STATE 69 +#define NAME_STATE 73 +#define COMMENT_STATE 91 +#define HCRC_STATE 103 +#define BUSY_STATE 113 +#define FINISH_STATE 666 +/* Stream status */ + + +/* Data structure describing a single value and its code string. */ +typedef struct ct_data_s { + union { + ush freq; /* frequency count */ + ush code; /* bit string */ + } fc; + union { + ush dad; /* father node in Huffman tree */ + ush len; /* length of bit string */ + } dl; +} FAR ct_data; + +#define Freq fc.freq +#define Code fc.code +#define Dad dl.dad +#define Len dl.len + +typedef struct static_tree_desc_s static_tree_desc; + +typedef struct tree_desc_s { + ct_data *dyn_tree; /* the dynamic tree */ + int max_code; /* largest code with non zero frequency */ + static_tree_desc *stat_desc; /* the corresponding static tree */ +} FAR tree_desc; + +typedef ush Pos; +typedef Pos FAR Posf; +typedef unsigned IPos; + +/* A Pos is an index in the character window. We use short instead of int to + * save space in the various tables. IPos is used only for parameter passing. + */ + +typedef struct internal_state { + z_streamp strm; /* pointer back to this zlib stream */ + int status; /* as the name implies */ + Bytef *pending_buf; /* output still pending */ + ulg pending_buf_size; /* size of pending_buf */ + Bytef *pending_out; /* next pending byte to output to the stream */ + uInt pending; /* nb of bytes in the pending buffer */ + int wrap; /* bit 0 true for zlib, bit 1 true for gzip */ + gz_headerp gzhead; /* gzip header information to write */ + uInt gzindex; /* where in extra, name, or comment */ + Byte method; /* STORED (for zip only) or DEFLATED */ + int last_flush; /* value of flush param for previous deflate call */ + + /* used by deflate.c: */ + + uInt w_size; /* LZ77 window size (32K by default) */ + uInt w_bits; /* log2(w_size) (8..16) */ + uInt w_mask; /* w_size - 1 */ + + Bytef *window; + /* Sliding window. Input bytes are read into the second half of the window, + * and move to the first half later to keep a dictionary of at least wSize + * bytes. With this organization, matches are limited to a distance of + * wSize-MAX_MATCH bytes, but this ensures that IO is always + * performed with a length multiple of the block size. Also, it limits + * the window size to 64K, which is quite useful on MSDOS. + * To do: use the user input buffer as sliding window. + */ + + ulg window_size; + /* Actual size of window: 2*wSize, except when the user input buffer + * is directly used as sliding window. + */ + + Posf *prev; + /* Link to older string with same hash index. To limit the size of this + * array to 64K, this link is maintained only for the last 32K strings. + * An index in this array is thus a window index modulo 32K. 
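+     * The match finder walks such a chain roughly as follows (see
+     * longest_match() in deflate.c, where wmask == w_size - 1):
+     *
+     *     do {
+     *         match = s->window + cur_match;
+     *         ... compare match[] against the string at strstart ...
+     *     } while ((cur_match = prev[cur_match & wmask]) > limit
+     *              && --chain_length != 0);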
+ */
+
+    Posf *head; /* Heads of the hash chains or NIL. */
+
+    uInt  ins_h;          /* hash index of string to be inserted */
+    uInt  hash_size;      /* number of elements in hash table */
+    uInt  hash_bits;      /* log2(hash_size) */
+    uInt  hash_mask;      /* hash_size-1 */
+
+    uInt  hash_shift;
+    /* Number of bits by which ins_h must be shifted at each input
+     * step. It must be such that after MIN_MATCH steps, the oldest
+     * byte no longer takes part in the hash key, that is:
+     *   hash_shift * MIN_MATCH >= hash_bits
+     */
+
+    long block_start;
+    /* Window position at the beginning of the current output block. Gets
+     * negative when the window is moved backwards.
+     */
+
+    uInt match_length;           /* length of best match */
+    IPos prev_match;             /* previous match */
+    int match_available;         /* set if previous match exists */
+    uInt strstart;               /* start of string to insert */
+    uInt match_start;            /* start of matching string */
+    uInt lookahead;              /* number of valid bytes ahead in window */
+
+    uInt prev_length;
+    /* Length of the best match at previous step. Matches not greater than this
+     * are discarded. This is used in the lazy match evaluation.
+     */
+
+    uInt max_chain_length;
+    /* To speed up deflation, hash chains are never searched beyond this
+     * length.  A higher limit improves compression ratio but degrades the
+     * speed.
+     */
+
+    uInt max_lazy_match;
+    /* Attempt to find a better match only when the current match is strictly
+     * smaller than this value. This mechanism is used only for compression
+     * levels >= 4.
+     */
+#   define max_insert_length  max_lazy_match
+    /* Insert new strings in the hash table only if the match length is not
+     * greater than this length. This saves time but degrades compression.
+     * max_insert_length is used only for compression levels <= 3.
+     */
+
+    int level;    /* compression level (1..9) */
+    int strategy; /* favor or force Huffman coding */
+
+    uInt good_match;
+    /* Use a faster search when the previous match is longer than this */
+
+    int nice_match; /* Stop searching when current match exceeds this */
+
+                /* used by trees.c: */
+    /* Didn't use ct_data typedef below to suppress compiler warning */
+    struct ct_data_s dyn_ltree[HEAP_SIZE];   /* literal and length tree */
+    struct ct_data_s dyn_dtree[2*D_CODES+1]; /* distance tree */
+    struct ct_data_s bl_tree[2*BL_CODES+1];  /* Huffman tree for bit lengths */
+
+    struct tree_desc_s l_desc;               /* desc. for literal tree */
+    struct tree_desc_s d_desc;               /* desc. for distance tree */
+    struct tree_desc_s bl_desc;              /* desc. for bit length tree */
+
+    ush bl_count[MAX_BITS+1];
+    /* number of codes at each bit length for an optimal tree */
+
+    int heap[2*L_CODES+1];      /* heap used to build the Huffman trees */
+    int heap_len;               /* number of elements in the heap */
+    int heap_max;               /* element of largest frequency */
+    /* The sons of heap[n] are heap[2*n] and heap[2*n+1]. heap[0] is not used.
+     * The same heap array is used to build all trees.
+     */
+
+    uch depth[2*L_CODES+1];
+    /* Depth of each subtree used as tie breaker for trees of equal frequency
+     */
+
+    uchf *l_buf;          /* buffer for literals or lengths */
+
+    uInt  lit_bufsize;
+    /* Size of match buffer for literals/lengths.  There are 4 reasons for
+     * limiting lit_bufsize to 64K:
+     *   - frequencies can be kept in 16 bit counters
+     *   - if compression is not successful for the first block, all input
+     *     data is still in the window so we can still emit a stored block even
+     *     when input comes from standard input.  (This can also be done for
+     *     all blocks if lit_bufsize is not greater than 32K.)
+ * - if compression is not successful for a file smaller than 64K, we can + * even emit a stored file instead of a stored block (saving 5 bytes). + * This is applicable only for zip (not gzip or zlib). + * - creating new Huffman trees less frequently may not provide fast + * adaptation to changes in the input data statistics. (Take for + * example a binary file with poorly compressible code followed by + * a highly compressible string table.) Smaller buffer sizes give + * fast adaptation but have of course the overhead of transmitting + * trees more frequently. + * - I can't count above 4 + */ + + uInt last_lit; /* running index in l_buf */ + + ushf *d_buf; + /* Buffer for distances. To simplify the code, d_buf and l_buf have + * the same number of elements. To use different lengths, an extra flag + * array would be necessary. + */ + + ulg opt_len; /* bit length of current block with optimal trees */ + ulg static_len; /* bit length of current block with static trees */ + uInt matches; /* number of string matches in current block */ + int last_eob_len; /* bit length of EOB code for last block */ + +#ifdef DEBUG + ulg compressed_len; /* total bit length of compressed file mod 2^32 */ + ulg bits_sent; /* bit length of compressed data sent mod 2^32 */ +#endif + + ush bi_buf; + /* Output buffer. bits are inserted starting at the bottom (least + * significant bits). + */ + int bi_valid; + /* Number of valid bits in bi_buf. All bits above the last valid bit + * are always zero. + */ + +} FAR deflate_state; + +/* Output a byte on the stream. + * IN assertion: there is enough room in pending_buf. + */ +#define put_byte(s, c) {s->pending_buf[s->pending++] = (c);} + + +#define MIN_LOOKAHEAD (MAX_MATCH+MIN_MATCH+1) +/* Minimum amount of lookahead, except at the end of the input file. + * See deflate.c for comments about the MIN_MATCH+1. + */ + +#define MAX_DIST(s) ((s)->w_size-MIN_LOOKAHEAD) +/* In order to simplify the code, particularly on 16 bit machines, match + * distances are limited to MAX_DIST instead of WSIZE. + */ + + /* in trees.c */ +void _tr_init OF((deflate_state *s)); +int _tr_tally OF((deflate_state *s, unsigned dist, unsigned lc)); +void _tr_flush_block OF((deflate_state *s, charf *buf, ulg stored_len, + int eof)); +void _tr_align OF((deflate_state *s)); +void _tr_stored_block OF((deflate_state *s, charf *buf, ulg stored_len, + int eof)); + +#define d_code(dist) \ + ((dist) < 256 ? _dist_code[dist] : _dist_code[256+((dist)>>7)]) +/* Mapping from a distance to a distance code. dist is the distance - 1 and + * must not have side effects. _dist_code[256] and _dist_code[257] are never + * used. 
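+ * For example, a match distance of 5 yields d_code(4) == _dist_code[4],
+ * while a distance of 300 yields d_code(299) == _dist_code[256 + (299 >> 7)]
+ * == _dist_code[258].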
+ */ + +#ifndef DEBUG +/* Inline versions of _tr_tally for speed: */ + +#if defined(GEN_TREES_H) || !defined(STDC) + extern uch _length_code[]; + extern uch _dist_code[]; +#else + extern const uch _length_code[]; + extern const uch _dist_code[]; +#endif + +# define _tr_tally_lit(s, c, flush) \ + { uch cc = (c); \ + s->d_buf[s->last_lit] = 0; \ + s->l_buf[s->last_lit++] = cc; \ + s->dyn_ltree[cc].Freq++; \ + flush = (s->last_lit == s->lit_bufsize-1); \ + } +# define _tr_tally_dist(s, distance, length, flush) \ + { uch len = (length); \ + ush dist = (distance); \ + s->d_buf[s->last_lit] = dist; \ + s->l_buf[s->last_lit++] = len; \ + dist--; \ + s->dyn_ltree[_length_code[len]+LITERALS+1].Freq++; \ + s->dyn_dtree[d_code(dist)].Freq++; \ + flush = (s->last_lit == s->lit_bufsize-1); \ + } +#else +# define _tr_tally_lit(s, c, flush) flush = _tr_tally(s, 0, c) +# define _tr_tally_dist(s, distance, length, flush) \ + flush = _tr_tally(s, distance, length) +#endif + +#endif /* _DEFLATE_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/zmod/inffast.c b/sys/cddl/contrib/opensolaris/uts/common/zmod/inffast.c new file mode 100644 index 000000000000..a6dcf3f5c8b8 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/zmod/inffast.c @@ -0,0 +1,320 @@ +/* inffast.c -- fast decoding + * Copyright (C) 1995-2004 Mark Adler + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include "zutil.h" +#include "inftrees.h" +#include "inflate.h" +#include "inffast.h" + +#ifndef ASMINF + +/* Allow machine dependent optimization for post-increment or pre-increment. + Based on testing to date, + Pre-increment preferred for: + - PowerPC G3 (Adler) + - MIPS R5000 (Randers-Pehrson) + Post-increment preferred for: + - none + No measurable difference: + - Pentium III (Anderson) + - M68060 (Nikl) + */ +#ifdef POSTINC +# define OFF 0 +# define PUP(a) *(a)++ +#else +# define OFF 1 +# define PUP(a) *++(a) +#endif + +/* + Decode literal, length, and distance codes and write out the resulting + literal and match bytes until either not enough input or output is + available, an end-of-block is encountered, or a data error is encountered. + When large enough input and output buffers are supplied to inflate(), for + example, a 16K input buffer and a 64K output buffer, more than 95% of the + inflate execution time is spent in this routine. + + Entry assumptions: + + state->mode == LEN + strm->avail_in >= 6 + strm->avail_out >= 258 + start >= strm->avail_out + state->bits < 8 + + On return, state->mode is one of: + + LEN -- ran out of enough output space or enough available input + TYPE -- reached end of block code, inflate() to interpret next block + BAD -- error in block data + + Notes: + + - The maximum input bits used by a length/distance pair is 15 bits for the + length code, 5 bits for the length extra, 15 bits for the distance code, + and 13 bits for the distance extra. This totals 48 bits, or six bytes. + Therefore if strm->avail_in >= 6, then there is enough input to avoid + checking for available input while decoding. + + - The maximum bytes that a single length/distance pair can output is 258 + bytes, which is the maximum length that can be coded. inflate_fast() + requires strm->avail_out >= 258 for each loop to avoid checking for + output space. 
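+
+   - inflate() satisfies both conditions before dispatching here; its LEN
+     state runs, roughly:
+
+        if (have >= 6 && left >= 258) {
+            RESTORE();
+            inflate_fast(strm, out);
+            LOAD();
+        }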
+ */ +void inflate_fast(strm, start) +z_streamp strm; +unsigned start; /* inflate()'s starting value for strm->avail_out */ +{ + struct inflate_state FAR *state; + unsigned char FAR *in; /* local strm->next_in */ + unsigned char FAR *last; /* while in < last, enough input available */ + unsigned char FAR *out; /* local strm->next_out */ + unsigned char FAR *beg; /* inflate()'s initial strm->next_out */ + unsigned char FAR *end; /* while out < end, enough space available */ +#ifdef INFLATE_STRICT + unsigned dmax; /* maximum distance from zlib header */ +#endif + unsigned wsize; /* window size or zero if not using window */ + unsigned whave; /* valid bytes in the window */ + unsigned write; /* window write index */ + unsigned char FAR *window; /* allocated sliding window, if wsize != 0 */ + unsigned long hold; /* local strm->hold */ + unsigned bits; /* local strm->bits */ + code const FAR *lcode; /* local strm->lencode */ + code const FAR *dcode; /* local strm->distcode */ + unsigned lmask; /* mask for first level of length codes */ + unsigned dmask; /* mask for first level of distance codes */ + code this; /* retrieved table entry */ + unsigned op; /* code bits, operation, extra bits, or */ + /* window position, window bytes to copy */ + unsigned len; /* match length, unused bytes */ + unsigned dist; /* match distance */ + unsigned char FAR *from; /* where to copy match from */ + + /* copy state to local variables */ + state = (struct inflate_state FAR *)strm->state; + in = strm->next_in - OFF; + last = in + (strm->avail_in - 5); + out = strm->next_out - OFF; + beg = out - (start - strm->avail_out); + end = out + (strm->avail_out - 257); +#ifdef INFLATE_STRICT + dmax = state->dmax; +#endif + wsize = state->wsize; + whave = state->whave; + write = state->write; + window = state->window; + hold = state->hold; + bits = state->bits; + lcode = state->lencode; + dcode = state->distcode; + lmask = (1U << state->lenbits) - 1; + dmask = (1U << state->distbits) - 1; + + /* decode literals and length/distances until end-of-block or not enough + input data or output space */ + do { + if (bits < 15) { + hold += (unsigned long)(PUP(in)) << bits; + bits += 8; + hold += (unsigned long)(PUP(in)) << bits; + bits += 8; + } + this = lcode[hold & lmask]; + dolen: + op = (unsigned)(this.bits); + hold >>= op; + bits -= op; + op = (unsigned)(this.op); + if (op == 0) { /* literal */ + Tracevv((stderr, this.val >= 0x20 && this.val < 0x7f ? 
+ "inflate: literal '%c'\n" : + "inflate: literal 0x%02x\n", this.val)); + PUP(out) = (unsigned char)(this.val); + } + else if (op & 16) { /* length base */ + len = (unsigned)(this.val); + op &= 15; /* number of extra bits */ + if (op) { + if (bits < op) { + hold += (unsigned long)(PUP(in)) << bits; + bits += 8; + } + len += (unsigned)hold & ((1U << op) - 1); + hold >>= op; + bits -= op; + } + Tracevv((stderr, "inflate: length %u\n", len)); + if (bits < 15) { + hold += (unsigned long)(PUP(in)) << bits; + bits += 8; + hold += (unsigned long)(PUP(in)) << bits; + bits += 8; + } + this = dcode[hold & dmask]; + dodist: + op = (unsigned)(this.bits); + hold >>= op; + bits -= op; + op = (unsigned)(this.op); + if (op & 16) { /* distance base */ + dist = (unsigned)(this.val); + op &= 15; /* number of extra bits */ + if (bits < op) { + hold += (unsigned long)(PUP(in)) << bits; + bits += 8; + if (bits < op) { + hold += (unsigned long)(PUP(in)) << bits; + bits += 8; + } + } + dist += (unsigned)hold & ((1U << op) - 1); +#ifdef INFLATE_STRICT + if (dist > dmax) { + strm->msg = (char *)"invalid distance too far back"; + state->mode = BAD; + break; + } +#endif + hold >>= op; + bits -= op; + Tracevv((stderr, "inflate: distance %u\n", dist)); + op = (unsigned)(out - beg); /* max distance in output */ + if (dist > op) { /* see if copy from window */ + op = dist - op; /* distance back in window */ + if (op > whave) { + strm->msg = (char *)"invalid distance too far back"; + state->mode = BAD; + break; + } + from = window - OFF; + if (write == 0) { /* very common case */ + from += wsize - op; + if (op < len) { /* some from window */ + len -= op; + do { + PUP(out) = PUP(from); + } while (--op); + from = out - dist; /* rest from output */ + } + } + else if (write < op) { /* wrap around window */ + from += wsize + write - op; + op -= write; + if (op < len) { /* some from end of window */ + len -= op; + do { + PUP(out) = PUP(from); + } while (--op); + from = window - OFF; + if (write < len) { /* some from start of window */ + op = write; + len -= op; + do { + PUP(out) = PUP(from); + } while (--op); + from = out - dist; /* rest from output */ + } + } + } + else { /* contiguous in window */ + from += write - op; + if (op < len) { /* some from window */ + len -= op; + do { + PUP(out) = PUP(from); + } while (--op); + from = out - dist; /* rest from output */ + } + } + while (len > 2) { + PUP(out) = PUP(from); + PUP(out) = PUP(from); + PUP(out) = PUP(from); + len -= 3; + } + if (len) { + PUP(out) = PUP(from); + if (len > 1) + PUP(out) = PUP(from); + } + } + else { + from = out - dist; /* copy direct from output */ + do { /* minimum length is three */ + PUP(out) = PUP(from); + PUP(out) = PUP(from); + PUP(out) = PUP(from); + len -= 3; + } while (len > 2); + if (len) { + PUP(out) = PUP(from); + if (len > 1) + PUP(out) = PUP(from); + } + } + } + else if ((op & 64) == 0) { /* 2nd level distance code */ + this = dcode[this.val + (hold & ((1U << op) - 1))]; + goto dodist; + } + else { + strm->msg = (char *)"invalid distance code"; + state->mode = BAD; + break; + } + } + else if ((op & 64) == 0) { /* 2nd level length code */ + this = lcode[this.val + (hold & ((1U << op) - 1))]; + goto dolen; + } + else if (op & 32) { /* end-of-block */ + Tracevv((stderr, "inflate: end of block\n")); + state->mode = TYPE; + break; + } + else { + strm->msg = (char *)"invalid literal/length code"; + state->mode = BAD; + break; + } + } while (in < last && out < end); + + /* return unused bytes (on entry, bits < 8, so in won't go too far back) */ + 
len = bits >> 3;
+    in -= len;
+    bits -= len << 3;
+    hold &= (1U << bits) - 1;
+
+    /* update state and return */
+    strm->next_in = in + OFF;
+    strm->next_out = out + OFF;
+    strm->avail_in = (unsigned)(in < last ? 5 + (last - in) : 5 - (in - last));
+    strm->avail_out = (unsigned)(out < end ?
+                                 257 + (end - out) : 257 - (out - end));
+    state->hold = hold;
+    state->bits = bits;
+    return;
+}
+
+/*
+   inflate_fast() speedups that turned out slower (on a PowerPC G3 750CXe):
+   - Using bit fields for code structure
+   - Different op definition to avoid & for extra bits (do & for table bits)
+   - Three separate decoding do-loops for direct, window, and write == 0
+   - Special case for distance > 1 copies to do overlapped load and store copy
+   - Explicit branch predictions (based on measured branch probabilities)
+   - Deferring match copy and interspersing it with decoding subsequent codes
+   - Swapping literal/length else
+   - Swapping window/direct else
+   - Larger unrolled copy loops (three is about right)
+   - Moving len -= 3 statement into middle of loop
+ */
+
+#endif /* !ASMINF */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/zmod/inffast.h b/sys/cddl/contrib/opensolaris/uts/common/zmod/inffast.h
new file mode 100644
index 000000000000..2d214efa299a
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/zmod/inffast.h
@@ -0,0 +1,13 @@
+/* inffast.h -- header to use inffast.c
+ * Copyright (C) 1995-2003 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+/* WARNING: this file should *not* be used by applications. It is
+   part of the implementation of the compression library and is
+   subject to change. Applications should only use zlib.h.
+ */
+
+void inflate_fast OF((z_streamp strm, unsigned start));
diff --git a/sys/cddl/contrib/opensolaris/uts/common/zmod/inffixed.h b/sys/cddl/contrib/opensolaris/uts/common/zmod/inffixed.h
new file mode 100644
index 000000000000..ed55df899109
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/zmod/inffixed.h
@@ -0,0 +1,96 @@
+    /* inffixed.h -- table for decoding fixed codes
+     * Generated automatically by makefixed().
+     */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+    /* WARNING: this file should *not* be used by applications. It
+       is part of the implementation of the compression library and
+       is subject to change. Applications should only use zlib.h.
+ */ + + static const code lenfix[512] = { + {96,7,0},{0,8,80},{0,8,16},{20,8,115},{18,7,31},{0,8,112},{0,8,48}, + {0,9,192},{16,7,10},{0,8,96},{0,8,32},{0,9,160},{0,8,0},{0,8,128}, + {0,8,64},{0,9,224},{16,7,6},{0,8,88},{0,8,24},{0,9,144},{19,7,59}, + {0,8,120},{0,8,56},{0,9,208},{17,7,17},{0,8,104},{0,8,40},{0,9,176}, + {0,8,8},{0,8,136},{0,8,72},{0,9,240},{16,7,4},{0,8,84},{0,8,20}, + {21,8,227},{19,7,43},{0,8,116},{0,8,52},{0,9,200},{17,7,13},{0,8,100}, + {0,8,36},{0,9,168},{0,8,4},{0,8,132},{0,8,68},{0,9,232},{16,7,8}, + {0,8,92},{0,8,28},{0,9,152},{20,7,83},{0,8,124},{0,8,60},{0,9,216}, + {18,7,23},{0,8,108},{0,8,44},{0,9,184},{0,8,12},{0,8,140},{0,8,76}, + {0,9,248},{16,7,3},{0,8,82},{0,8,18},{21,8,163},{19,7,35},{0,8,114}, + {0,8,50},{0,9,196},{17,7,11},{0,8,98},{0,8,34},{0,9,164},{0,8,2}, + {0,8,130},{0,8,66},{0,9,228},{16,7,7},{0,8,90},{0,8,26},{0,9,148}, + {20,7,67},{0,8,122},{0,8,58},{0,9,212},{18,7,19},{0,8,106},{0,8,42}, + {0,9,180},{0,8,10},{0,8,138},{0,8,74},{0,9,244},{16,7,5},{0,8,86}, + {0,8,22},{64,8,0},{19,7,51},{0,8,118},{0,8,54},{0,9,204},{17,7,15}, + {0,8,102},{0,8,38},{0,9,172},{0,8,6},{0,8,134},{0,8,70},{0,9,236}, + {16,7,9},{0,8,94},{0,8,30},{0,9,156},{20,7,99},{0,8,126},{0,8,62}, + {0,9,220},{18,7,27},{0,8,110},{0,8,46},{0,9,188},{0,8,14},{0,8,142}, + {0,8,78},{0,9,252},{96,7,0},{0,8,81},{0,8,17},{21,8,131},{18,7,31}, + {0,8,113},{0,8,49},{0,9,194},{16,7,10},{0,8,97},{0,8,33},{0,9,162}, + {0,8,1},{0,8,129},{0,8,65},{0,9,226},{16,7,6},{0,8,89},{0,8,25}, + {0,9,146},{19,7,59},{0,8,121},{0,8,57},{0,9,210},{17,7,17},{0,8,105}, + {0,8,41},{0,9,178},{0,8,9},{0,8,137},{0,8,73},{0,9,242},{16,7,4}, + {0,8,85},{0,8,21},{16,8,258},{19,7,43},{0,8,117},{0,8,53},{0,9,202}, + {17,7,13},{0,8,101},{0,8,37},{0,9,170},{0,8,5},{0,8,133},{0,8,69}, + {0,9,234},{16,7,8},{0,8,93},{0,8,29},{0,9,154},{20,7,83},{0,8,125}, + {0,8,61},{0,9,218},{18,7,23},{0,8,109},{0,8,45},{0,9,186},{0,8,13}, + {0,8,141},{0,8,77},{0,9,250},{16,7,3},{0,8,83},{0,8,19},{21,8,195}, + {19,7,35},{0,8,115},{0,8,51},{0,9,198},{17,7,11},{0,8,99},{0,8,35}, + {0,9,166},{0,8,3},{0,8,131},{0,8,67},{0,9,230},{16,7,7},{0,8,91}, + {0,8,27},{0,9,150},{20,7,67},{0,8,123},{0,8,59},{0,9,214},{18,7,19}, + {0,8,107},{0,8,43},{0,9,182},{0,8,11},{0,8,139},{0,8,75},{0,9,246}, + {16,7,5},{0,8,87},{0,8,23},{64,8,0},{19,7,51},{0,8,119},{0,8,55}, + {0,9,206},{17,7,15},{0,8,103},{0,8,39},{0,9,174},{0,8,7},{0,8,135}, + {0,8,71},{0,9,238},{16,7,9},{0,8,95},{0,8,31},{0,9,158},{20,7,99}, + {0,8,127},{0,8,63},{0,9,222},{18,7,27},{0,8,111},{0,8,47},{0,9,190}, + {0,8,15},{0,8,143},{0,8,79},{0,9,254},{96,7,0},{0,8,80},{0,8,16}, + {20,8,115},{18,7,31},{0,8,112},{0,8,48},{0,9,193},{16,7,10},{0,8,96}, + {0,8,32},{0,9,161},{0,8,0},{0,8,128},{0,8,64},{0,9,225},{16,7,6}, + {0,8,88},{0,8,24},{0,9,145},{19,7,59},{0,8,120},{0,8,56},{0,9,209}, + {17,7,17},{0,8,104},{0,8,40},{0,9,177},{0,8,8},{0,8,136},{0,8,72}, + {0,9,241},{16,7,4},{0,8,84},{0,8,20},{21,8,227},{19,7,43},{0,8,116}, + {0,8,52},{0,9,201},{17,7,13},{0,8,100},{0,8,36},{0,9,169},{0,8,4}, + {0,8,132},{0,8,68},{0,9,233},{16,7,8},{0,8,92},{0,8,28},{0,9,153}, + {20,7,83},{0,8,124},{0,8,60},{0,9,217},{18,7,23},{0,8,108},{0,8,44}, + {0,9,185},{0,8,12},{0,8,140},{0,8,76},{0,9,249},{16,7,3},{0,8,82}, + {0,8,18},{21,8,163},{19,7,35},{0,8,114},{0,8,50},{0,9,197},{17,7,11}, + {0,8,98},{0,8,34},{0,9,165},{0,8,2},{0,8,130},{0,8,66},{0,9,229}, + {16,7,7},{0,8,90},{0,8,26},{0,9,149},{20,7,67},{0,8,122},{0,8,58}, + {0,9,213},{18,7,19},{0,8,106},{0,8,42},{0,9,181},{0,8,10},{0,8,138}, + 
{0,8,74},{0,9,245},{16,7,5},{0,8,86},{0,8,22},{64,8,0},{19,7,51}, + {0,8,118},{0,8,54},{0,9,205},{17,7,15},{0,8,102},{0,8,38},{0,9,173}, + {0,8,6},{0,8,134},{0,8,70},{0,9,237},{16,7,9},{0,8,94},{0,8,30}, + {0,9,157},{20,7,99},{0,8,126},{0,8,62},{0,9,221},{18,7,27},{0,8,110}, + {0,8,46},{0,9,189},{0,8,14},{0,8,142},{0,8,78},{0,9,253},{96,7,0}, + {0,8,81},{0,8,17},{21,8,131},{18,7,31},{0,8,113},{0,8,49},{0,9,195}, + {16,7,10},{0,8,97},{0,8,33},{0,9,163},{0,8,1},{0,8,129},{0,8,65}, + {0,9,227},{16,7,6},{0,8,89},{0,8,25},{0,9,147},{19,7,59},{0,8,121}, + {0,8,57},{0,9,211},{17,7,17},{0,8,105},{0,8,41},{0,9,179},{0,8,9}, + {0,8,137},{0,8,73},{0,9,243},{16,7,4},{0,8,85},{0,8,21},{16,8,258}, + {19,7,43},{0,8,117},{0,8,53},{0,9,203},{17,7,13},{0,8,101},{0,8,37}, + {0,9,171},{0,8,5},{0,8,133},{0,8,69},{0,9,235},{16,7,8},{0,8,93}, + {0,8,29},{0,9,155},{20,7,83},{0,8,125},{0,8,61},{0,9,219},{18,7,23}, + {0,8,109},{0,8,45},{0,9,187},{0,8,13},{0,8,141},{0,8,77},{0,9,251}, + {16,7,3},{0,8,83},{0,8,19},{21,8,195},{19,7,35},{0,8,115},{0,8,51}, + {0,9,199},{17,7,11},{0,8,99},{0,8,35},{0,9,167},{0,8,3},{0,8,131}, + {0,8,67},{0,9,231},{16,7,7},{0,8,91},{0,8,27},{0,9,151},{20,7,67}, + {0,8,123},{0,8,59},{0,9,215},{18,7,19},{0,8,107},{0,8,43},{0,9,183}, + {0,8,11},{0,8,139},{0,8,75},{0,9,247},{16,7,5},{0,8,87},{0,8,23}, + {64,8,0},{19,7,51},{0,8,119},{0,8,55},{0,9,207},{17,7,15},{0,8,103}, + {0,8,39},{0,9,175},{0,8,7},{0,8,135},{0,8,71},{0,9,239},{16,7,9}, + {0,8,95},{0,8,31},{0,9,159},{20,7,99},{0,8,127},{0,8,63},{0,9,223}, + {18,7,27},{0,8,111},{0,8,47},{0,9,191},{0,8,15},{0,8,143},{0,8,79}, + {0,9,255} + }; + + static const code distfix[32] = { + {16,5,1},{23,5,257},{19,5,17},{27,5,4097},{17,5,5},{25,5,1025}, + {21,5,65},{29,5,16385},{16,5,3},{24,5,513},{20,5,33},{28,5,8193}, + {18,5,9},{26,5,2049},{22,5,129},{64,5,0},{16,5,2},{23,5,385}, + {19,5,25},{27,5,6145},{17,5,7},{25,5,1537},{21,5,97},{29,5,24577}, + {16,5,4},{24,5,769},{20,5,49},{28,5,12289},{18,5,13},{26,5,3073}, + {22,5,193},{64,5,0} + }; diff --git a/sys/cddl/contrib/opensolaris/uts/common/zmod/inflate.c b/sys/cddl/contrib/opensolaris/uts/common/zmod/inflate.c new file mode 100644 index 000000000000..023e7a121ce7 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/zmod/inflate.c @@ -0,0 +1,1395 @@ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +/* inflate.c -- zlib decompression + * Copyright (C) 1995-2005 Mark Adler + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * Change history: + * + * 1.2.beta0 24 Nov 2002 + * - First version -- complete rewrite of inflate to simplify code, avoid + * creation of window when not needed, minimize use of window when it is + * needed, make inffast.c even faster, implement gzip decoding, and to + * improve code readability and style over the previous zlib inflate code + * + * 1.2.beta1 25 Nov 2002 + * - Use pointers for available input and output checking in inffast.c + * - Remove input and output counters in inffast.c + * - Change inffast.c entry and loop from avail_in >= 7 to >= 6 + * - Remove unnecessary second byte pull from length extra in inffast.c + * - Unroll direct copy to three copies per loop in inffast.c + * + * 1.2.beta2 4 Dec 2002 + * - Change external routine names to reduce potential conflicts + * - Correct filename to inffixed.h for fixed tables in inflate.c + * - Make hbuf[] unsigned char to match parameter type in inflate.c + * - Change strm->next_out[-state->offset] to *(strm->next_out - state->offset) + * to avoid negation problem on Alphas (64 bit) in inflate.c + * + * 1.2.beta3 22 Dec 2002 + * - Add comments on state->bits assertion in inffast.c + * - Add comments on op field in inftrees.h + * - Fix bug in reuse of allocated window after inflateReset() + * - Remove bit fields--back to byte structure for speed + * - Remove distance extra == 0 check in inflate_fast()--only helps for lengths + * - Change post-increments to pre-increments in inflate_fast(), PPC biased? + * - Add compile time option, POSTINC, to use post-increments instead (Intel?) + * - Make MATCH copy in inflate() much faster for when inflate_fast() not used + * - Use local copies of stream next and avail values, as well as local bit + * buffer and bit count in inflate()--for speed when inflate_fast() not used + * + * 1.2.beta4 1 Jan 2003 + * - Split ptr - 257 statements in inflate_table() to avoid compiler warnings + * - Move a comment on output buffer sizes from inffast.c to inflate.c + * - Add comments in inffast.c to introduce the inflate_fast() routine + * - Rearrange window copies in inflate_fast() for speed and simplification + * - Unroll last copy for window match in inflate_fast() + * - Use local copies of window variables in inflate_fast() for speed + * - Pull out common write == 0 case for speed in inflate_fast() + * - Make op and len in inflate_fast() unsigned for consistency + * - Add FAR to lcode and dcode declarations in inflate_fast() + * - Simplified bad distance check in inflate_fast() + * - Added inflateBackInit(), inflateBack(), and inflateBackEnd() in new + * source file infback.c to provide a call-back interface to inflate for + * programs like gzip and unzip -- uses window as output buffer to avoid + * window copying + * + * 1.2.beta5 1 Jan 2003 + * - Improved inflateBack() interface to allow the caller to provide initial + * input in strm. 
+ * - Fixed stored blocks bug in inflateBack() + * + * 1.2.beta6 4 Jan 2003 + * - Added comments in inffast.c on effectiveness of POSTINC + * - Typecasting all around to reduce compiler warnings + * - Changed loops from while (1) or do {} while (1) to for (;;), again to + * make compilers happy + * - Changed type of window in inflateBackInit() to unsigned char * + * + * 1.2.beta7 27 Jan 2003 + * - Changed many types to unsigned or unsigned short to avoid warnings + * - Added inflateCopy() function + * + * 1.2.0 9 Mar 2003 + * - Changed inflateBack() interface to provide separate opaque descriptors + * for the in() and out() functions + * - Changed inflateBack() argument and in_func typedef to swap the length + * and buffer address return values for the input function + * - Check next_in and next_out for Z_NULL on entry to inflate() + * + * The history for versions after 1.2.0 are in ChangeLog in zlib distribution. + */ + +#include "zutil.h" +#include "inftrees.h" +#include "inflate.h" +#include "inffast.h" + +#ifdef MAKEFIXED +# ifndef BUILDFIXED +# define BUILDFIXED +# endif +#endif + +/* function prototypes */ +local void fixedtables OF((struct inflate_state FAR *state)); +local int updatewindow OF((z_streamp strm, unsigned out)); +#ifdef BUILDFIXED + void makefixed OF((void)); +#endif +local unsigned syncsearch OF((unsigned FAR *have, unsigned char FAR *buf, + unsigned len)); + +int ZEXPORT inflateReset(strm) +z_streamp strm; +{ + struct inflate_state FAR *state; + + if (strm == Z_NULL || strm->state == Z_NULL) return Z_STREAM_ERROR; + state = (struct inflate_state FAR *)strm->state; + strm->total_in = strm->total_out = state->total = 0; + strm->msg = Z_NULL; + strm->adler = 1; /* to support ill-conceived Java test suite */ + state->mode = HEAD; + state->last = 0; + state->havedict = 0; + state->dmax = 32768U; + state->head = Z_NULL; + state->wsize = 0; + state->whave = 0; + state->write = 0; + state->hold = 0; + state->bits = 0; + state->lencode = state->distcode = state->next = state->codes; + Tracev((stderr, "inflate: reset\n")); + return Z_OK; +} + +int ZEXPORT inflatePrime(strm, bits, value) +z_streamp strm; +int bits; +int value; +{ + struct inflate_state FAR *state; + + if (strm == Z_NULL || strm->state == Z_NULL) return Z_STREAM_ERROR; + state = (struct inflate_state FAR *)strm->state; + if (bits > 16 || state->bits + bits > 32) return Z_STREAM_ERROR; + value &= (1L << bits) - 1; + state->hold += value << state->bits; + state->bits += bits; + return Z_OK; +} + +int ZEXPORT inflateInit2_(strm, windowBits, version, stream_size) +z_streamp strm; +int windowBits; +const char *version; +int stream_size; +{ + struct inflate_state FAR *state; + + if (version == Z_NULL || version[0] != ZLIB_VERSION[0] || + stream_size != (int)(sizeof(z_stream))) + return Z_VERSION_ERROR; + if (strm == Z_NULL) return Z_STREAM_ERROR; + strm->msg = Z_NULL; /* in case we return an error */ + if (strm->zalloc == (alloc_func)0) { + strm->zalloc = zcalloc; + strm->opaque = (voidpf)0; + } + if (strm->zfree == (free_func)0) strm->zfree = zcfree; + state = (struct inflate_state FAR *) + ZALLOC(strm, 1, sizeof(struct inflate_state)); + if (state == Z_NULL) return Z_MEM_ERROR; + Tracev((stderr, "inflate: allocated\n")); + strm->state = (struct internal_state FAR *)state; + if (windowBits < 0) { + state->wrap = 0; + windowBits = -windowBits; + } + else { + state->wrap = (windowBits >> 4) + 1; +#ifdef GUNZIP + if (windowBits < 48) windowBits &= 15; +#endif + } + if (windowBits < 8 || windowBits > 15) { + ZFREE(strm, 
state); + strm->state = Z_NULL; + return Z_STREAM_ERROR; + } + state->wbits = (unsigned)windowBits; + state->window = Z_NULL; + return inflateReset(strm); +} + +int ZEXPORT inflateInit_(strm, version, stream_size) +z_streamp strm; +const char *version; +int stream_size; +{ + return inflateInit2_(strm, DEF_WBITS, version, stream_size); +} + +/* + Return state with length and distance decoding tables and index sizes set to + fixed code decoding. Normally this returns fixed tables from inffixed.h. + If BUILDFIXED is defined, then instead this routine builds the tables the + first time it's called, and returns those tables the first time and + thereafter. This reduces the size of the code by about 2K bytes, in + exchange for a little execution time. However, BUILDFIXED should not be + used for threaded applications, since the rewriting of the tables and virgin + may not be thread-safe. + */ +local void fixedtables(state) +struct inflate_state FAR *state; +{ +#ifdef BUILDFIXED + static int virgin = 1; + static code *lenfix, *distfix; + static code fixed[544]; + + /* build fixed huffman tables if first call (may not be thread safe) */ + if (virgin) { + unsigned sym, bits; + static code *next; + + /* literal/length table */ + sym = 0; + while (sym < 144) state->lens[sym++] = 8; + while (sym < 256) state->lens[sym++] = 9; + while (sym < 280) state->lens[sym++] = 7; + while (sym < 288) state->lens[sym++] = 8; + next = fixed; + lenfix = next; + bits = 9; + inflate_table(LENS, state->lens, 288, &(next), &(bits), state->work); + + /* distance table */ + sym = 0; + while (sym < 32) state->lens[sym++] = 5; + distfix = next; + bits = 5; + inflate_table(DISTS, state->lens, 32, &(next), &(bits), state->work); + + /* do this just once */ + virgin = 0; + } +#else /* !BUILDFIXED */ +# include "inffixed.h" +#endif /* BUILDFIXED */ + state->lencode = lenfix; + state->lenbits = 9; + state->distcode = distfix; + state->distbits = 5; +} + +#ifdef MAKEFIXED +#include <stdio.h> + +/* + Write out the inffixed.h that is #include'd above. Defining MAKEFIXED also + defines BUILDFIXED, so the tables are built on the fly. makefixed() writes + those tables to stdout, which would be piped to inffixed.h. A small program + can simply call makefixed to do this: + + void makefixed(void); + + int main(void) + { + makefixed(); + return 0; + } + + Then that can be linked with zlib built with MAKEFIXED defined and run: + + a.out > inffixed.h + */ +void makefixed() +{ + unsigned low, size; + struct inflate_state state; + + fixedtables(&state); + puts(" /* inffixed.h -- table for decoding fixed codes"); + puts(" * Generated automatically by makefixed()."); + puts(" */"); + puts(""); + puts(" /* WARNING: this file should *not* be used by applications."); + puts(" It is part of the implementation of this library and is"); + puts(" subject to change. 
Applications should only use zlib.h."); + puts(" */"); + puts(""); + size = 1U << 9; + printf(" static const code lenfix[%u] = {", size); + low = 0; + for (;;) { + if ((low % 7) == 0) printf("\n "); + printf("{%u,%u,%d}", state.lencode[low].op, state.lencode[low].bits, + state.lencode[low].val); + if (++low == size) break; + putchar(','); + } + puts("\n };"); + size = 1U << 5; + printf("\n static const code distfix[%u] = {", size); + low = 0; + for (;;) { + if ((low % 6) == 0) printf("\n "); + printf("{%u,%u,%d}", state.distcode[low].op, state.distcode[low].bits, + state.distcode[low].val); + if (++low == size) break; + putchar(','); + } + puts("\n };"); +} +#endif /* MAKEFIXED */ + +/* + Update the window with the last wsize (normally 32K) bytes written before + returning. If window does not exist yet, create it. This is only called + when a window is already in use, or when output has been written during this + inflate call, but the end of the deflate stream has not been reached yet. + It is also called to create a window for dictionary data when a dictionary + is loaded. + + Providing output buffers larger than 32K to inflate() should provide a speed + advantage, since only the last 32K of output is copied to the sliding window + upon return from inflate(), and since all distances after the first 32K of + output will fall in the output data, making match copies simpler and faster. + The advantage may be dependent on the size of the processor's data caches. + */ +local int updatewindow(strm, out) +z_streamp strm; +unsigned out; +{ + struct inflate_state FAR *state; + unsigned copy, dist; + + state = (struct inflate_state FAR *)strm->state; + + /* if it hasn't been done already, allocate space for the window */ + if (state->window == Z_NULL) { + state->window = (unsigned char FAR *) + ZALLOC(strm, 1U << state->wbits, + sizeof(unsigned char)); + if (state->window == Z_NULL) return 1; + } + + /* if window not in use yet, initialize */ + if (state->wsize == 0) { + state->wsize = 1U << state->wbits; + state->write = 0; + state->whave = 0; + } + + /* copy state->wsize or less output bytes into the circular window */ + copy = out - strm->avail_out; + if (copy >= state->wsize) { + zmemcpy(state->window, strm->next_out - state->wsize, state->wsize); + state->write = 0; + state->whave = state->wsize; + } + else { + dist = state->wsize - state->write; + if (dist > copy) dist = copy; + zmemcpy(state->window + state->write, strm->next_out - copy, dist); + copy -= dist; + if (copy) { + zmemcpy(state->window, strm->next_out - copy, copy); + state->write = copy; + state->whave = state->wsize; + } + else { + state->write += dist; + if (state->write == state->wsize) state->write = 0; + if (state->whave < state->wsize) state->whave += dist; + } + } + return 0; +} + +/* Macros for inflate(): */ + +/* check function to use adler32() for zlib or crc32() for gzip */ +#ifdef GUNZIP +# define UPDATE(check, buf, len) \ + (state->flags ? 
crc32(check, buf, len) : adler32(check, buf, len)) +#else +# define UPDATE(check, buf, len) adler32(check, buf, len) +#endif + +/* check macros for header crc */ +#ifdef GUNZIP +# define CRC2(check, word) \ + do { \ + hbuf[0] = (unsigned char)(word); \ + hbuf[1] = (unsigned char)((word) >> 8); \ + check = crc32(check, hbuf, 2); \ + } while (0) + +# define CRC4(check, word) \ + do { \ + hbuf[0] = (unsigned char)(word); \ + hbuf[1] = (unsigned char)((word) >> 8); \ + hbuf[2] = (unsigned char)((word) >> 16); \ + hbuf[3] = (unsigned char)((word) >> 24); \ + check = crc32(check, hbuf, 4); \ + } while (0) +#endif + +/* Load registers with state in inflate() for speed */ +#define LOAD() \ + do { \ + put = strm->next_out; \ + left = strm->avail_out; \ + next = strm->next_in; \ + have = strm->avail_in; \ + hold = state->hold; \ + bits = state->bits; \ + } while (0) + +/* Restore state from registers in inflate() */ +#define RESTORE() \ + do { \ + strm->next_out = put; \ + strm->avail_out = left; \ + strm->next_in = next; \ + strm->avail_in = have; \ + state->hold = hold; \ + state->bits = bits; \ + } while (0) + +/* Clear the input bit accumulator */ +#define INITBITS() \ + do { \ + hold = 0; \ + bits = 0; \ + } while (0) + +/* Get a byte of input into the bit accumulator, or return from inflate() + if there is no input available. */ +#define PULLBYTE() \ + do { \ + if (have == 0) goto inf_leave; \ + have--; \ + hold += (unsigned long)(*next++) << bits; \ + bits += 8; \ + } while (0) + +/* Assure that there are at least n bits in the bit accumulator. If there is + not enough available input to do that, then return from inflate(). */ +#define NEEDBITS(n) \ + do { \ + while (bits < (unsigned)(n)) \ + PULLBYTE(); \ + } while (0) + +/* Return the low n bits of the bit accumulator (n < 16) */ +#define BITS(n) \ + ((unsigned)hold & ((1U << (n)) - 1)) + +/* Remove n bits from the bit accumulator */ +#define DROPBITS(n) \ + do { \ + hold >>= (n); \ + bits -= (unsigned)(n); \ + } while (0) + +/* Remove zero to seven bits as needed to go to a byte boundary */ +#define BYTEBITS() \ + do { \ + hold >>= bits & 7; \ + bits -= bits & 7; \ + } while (0) + +/* Reverse the bytes in a 32-bit value */ +#define REVERSE(q) \ + ((((q) >> 24) & 0xff) + (((q) >> 8) & 0xff00) + \ + (((q) & 0xff00) << 8) + (((q) & 0xff) << 24)) + +/* + inflate() uses a state machine to process as much input data and generate as + much output data as possible before returning. The state machine is + structured roughly as follows: + + for (;;) switch (state) { + ... + case STATEn: + if (not enough input data or output space to make progress) + return; + ... make progress ... + state = STATEm; + break; + ... + } + + so when inflate() is called again, the same case is attempted again, and + if the appropriate resources are provided, the machine proceeds to the + next state. The NEEDBITS() macro is usually the way the state evaluates + whether it can proceed or should return. NEEDBITS() does the return if + the requested bits are not available. The typical use of the BITS macros + is: + + NEEDBITS(n); + ... do something with BITS(n) ... + DROPBITS(n); + + where NEEDBITS(n) either returns from inflate() if there isn't enough + input left to load n bits into the accumulator, or it continues. BITS(n) + gives the low n bits in the accumulator. When done, DROPBITS(n) drops + the low n bits off the accumulator. INITBITS() clears the accumulator + and sets the number of available bits to zero. 
BYTEBITS() discards just
+   enough bits to put the accumulator on a byte boundary.  After BYTEBITS()
+   and a NEEDBITS(8), then BITS(8) would return the next byte in the stream.
+
+   NEEDBITS(n) uses PULLBYTE() to get an available byte of input, or to return
+   if there is no input available.  The decoding of variable length codes uses
+   PULLBYTE() directly in order to pull just enough bytes to decode the next
+   code, and no more.
+
+   Some states loop until they get enough input, making sure that enough
+   state information is maintained to continue the loop where it left off
+   if NEEDBITS() returns in the loop.  For example, want, need, and keep
+   would all have to actually be part of the saved state in case NEEDBITS()
+   returns:
+
+    case STATEw:
+        while (want < need) {
+            NEEDBITS(n);
+            keep[want++] = BITS(n);
+            DROPBITS(n);
+        }
+        state = STATEx;
+    case STATEx:
+
+   As shown above, if the next state is also the next case, then the break
+   is omitted.
+
+   A state may also return if there is not enough output space available to
+   complete that state.  Those states are copying stored data, writing a
+   literal byte, and copying a matching string.
+
+   When returning, a "goto inf_leave" is used to update the total counters,
+   update the check value, and determine whether any progress has been made
+   during that inflate() call in order to return the proper return code.
+   Progress is defined as a change in either strm->avail_in or strm->avail_out.
+   When there is a window, goto inf_leave will update the window with the last
+   output written.  If a goto inf_leave occurs in the middle of decompression
+   and there is no window currently, goto inf_leave will create one and copy
+   output to the window for the next call of inflate().
+
+   In this implementation, the flush parameter of inflate() only affects the
+   return code (per zlib.h).  inflate() always writes as much as possible to
+   strm->next_out, given the space available and the provided input--the
+   effect documented in zlib.h for Z_SYNC_FLUSH.  Furthermore, inflate()
+   always defers the allocation of and copying into a sliding window until
+   necessary, which provides the effect documented in zlib.h for Z_FINISH
+   when the entire input stream is available.  So the only thing the flush
+   parameter actually does is: when flush is set to Z_FINISH, inflate()
+   cannot return Z_OK.  Instead it will return Z_BUF_ERROR if it has not
+   reached the end of the stream.
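+
+   An illustrative caller loop consistent with these semantics (CHUNK and
+   outbuf are placeholders supplied by the caller, not defined in this
+   file):
+
+       do {
+           strm->avail_out = CHUNK;
+           strm->next_out = outbuf;
+           ret = inflate(strm, Z_NO_FLUSH);
+           /* deliver CHUNK - strm->avail_out bytes from outbuf */
+       } while (strm->avail_out == 0 && ret == Z_OK);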
+ */ + +int ZEXPORT inflate(strm, flush) +z_streamp strm; +int flush; +{ + struct inflate_state FAR *state; + unsigned char FAR *next; /* next input */ + unsigned char FAR *put; /* next output */ + unsigned have, left; /* available input and output */ + unsigned long hold; /* bit buffer */ + unsigned bits; /* bits in bit buffer */ + unsigned in, out; /* save starting available input and output */ + unsigned copy; /* number of stored or match bytes to copy */ + unsigned char FAR *from; /* where to copy match bytes from */ + code this; /* current decoding table entry */ + code last; /* parent table entry */ + unsigned len; /* length to copy for repeats, bits to drop */ + int ret; /* return code */ +#ifdef GUNZIP + unsigned char hbuf[4]; /* buffer for gzip header crc calculation */ +#endif + static const unsigned short order[19] = /* permutation of code lengths */ + {16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15}; + + if (strm == Z_NULL || strm->state == Z_NULL || strm->next_out == Z_NULL || + (strm->next_in == Z_NULL && strm->avail_in != 0)) + return Z_STREAM_ERROR; + + state = (struct inflate_state FAR *)strm->state; + if (state->mode == TYPE) state->mode = TYPEDO; /* skip check */ + LOAD(); + in = have; + out = left; + ret = Z_OK; + for (;;) + switch (state->mode) { + case HEAD: + if (state->wrap == 0) { + state->mode = TYPEDO; + break; + } + NEEDBITS(16); +#ifdef GUNZIP + if ((state->wrap & 2) && hold == 0x8b1f) { /* gzip header */ + state->check = crc32(0L, Z_NULL, 0); + CRC2(state->check, hold); + INITBITS(); + state->mode = FLAGS; + break; + } + state->flags = 0; /* expect zlib header */ + if (state->head != Z_NULL) + state->head->done = -1; + if (!(state->wrap & 1) || /* check if zlib header allowed */ +#else + if ( +#endif + ((BITS(8) << 8) + (hold >> 8)) % 31) { + strm->msg = (char *)"incorrect header check"; + state->mode = BAD; + break; + } + if (BITS(4) != Z_DEFLATED) { + strm->msg = (char *)"unknown compression method"; + state->mode = BAD; + break; + } + DROPBITS(4); + len = BITS(4) + 8; + if (len > state->wbits) { + strm->msg = (char *)"invalid window size"; + state->mode = BAD; + break; + } + state->dmax = 1U << len; + Tracev((stderr, "inflate: zlib header ok\n")); + strm->adler = state->check = adler32(0L, Z_NULL, 0); + state->mode = hold & 0x200 ? 
DICTID : TYPE; + INITBITS(); + break; +#ifdef GUNZIP + case FLAGS: + NEEDBITS(16); + state->flags = (int)(hold); + if ((state->flags & 0xff) != Z_DEFLATED) { + strm->msg = (char *)"unknown compression method"; + state->mode = BAD; + break; + } + if (state->flags & 0xe000) { + strm->msg = (char *)"unknown header flags set"; + state->mode = BAD; + break; + } + if (state->head != Z_NULL) + state->head->text = (int)((hold >> 8) & 1); + if (state->flags & 0x0200) CRC2(state->check, hold); + INITBITS(); + state->mode = TIME; + /*FALLTHRU*/ + case TIME: + NEEDBITS(32); + if (state->head != Z_NULL) + state->head->time = hold; + if (state->flags & 0x0200) CRC4(state->check, hold); + INITBITS(); + state->mode = OS; + /*FALLTHRU*/ + case OS: + NEEDBITS(16); + if (state->head != Z_NULL) { + state->head->xflags = (int)(hold & 0xff); + state->head->os = (int)(hold >> 8); + } + if (state->flags & 0x0200) CRC2(state->check, hold); + INITBITS(); + state->mode = EXLEN; + /*FALLTHRU*/ + case EXLEN: + if (state->flags & 0x0400) { + NEEDBITS(16); + state->length = (unsigned)(hold); + if (state->head != Z_NULL) + state->head->extra_len = (unsigned)hold; + if (state->flags & 0x0200) CRC2(state->check, hold); + INITBITS(); + } + else if (state->head != Z_NULL) + state->head->extra = Z_NULL; + state->mode = EXTRA; + /*FALLTHRU*/ + case EXTRA: + if (state->flags & 0x0400) { + copy = state->length; + if (copy > have) copy = have; + if (copy) { + if (state->head != Z_NULL && + state->head->extra != Z_NULL) { + len = state->head->extra_len - state->length; + zmemcpy(state->head->extra + len, next, + len + copy > state->head->extra_max ? + state->head->extra_max - len : copy); + } + if (state->flags & 0x0200) + state->check = crc32(state->check, next, copy); + have -= copy; + next += copy; + state->length -= copy; + } + if (state->length) goto inf_leave; + } + state->length = 0; + state->mode = NAME; + /*FALLTHRU*/ + case NAME: + if (state->flags & 0x0800) { + if (have == 0) goto inf_leave; + copy = 0; + do { + len = (unsigned)(next[copy++]); + if (state->head != Z_NULL && + state->head->name != Z_NULL && + state->length < state->head->name_max) + state->head->name[state->length++] = len; + } while (len && copy < have); + if (state->flags & 0x0200) + state->check = crc32(state->check, next, copy); + have -= copy; + next += copy; + if (len) goto inf_leave; + } + else if (state->head != Z_NULL) + state->head->name = Z_NULL; + state->length = 0; + state->mode = COMMENT; + /*FALLTHRU*/ + case COMMENT: + if (state->flags & 0x1000) { + if (have == 0) goto inf_leave; + copy = 0; + do { + len = (unsigned)(next[copy++]); + if (state->head != Z_NULL && + state->head->comment != Z_NULL && + state->length < state->head->comm_max) + state->head->comment[state->length++] = len; + } while (len && copy < have); + if (state->flags & 0x0200) + state->check = crc32(state->check, next, copy); + have -= copy; + next += copy; + if (len) goto inf_leave; + } + else if (state->head != Z_NULL) + state->head->comment = Z_NULL; + state->mode = HCRC; + /*FALLTHRU*/ + case HCRC: + if (state->flags & 0x0200) { + NEEDBITS(16); + if (hold != (state->check & 0xffff)) { + strm->msg = (char *)"header crc mismatch"; + state->mode = BAD; + break; + } + INITBITS(); + } + if (state->head != Z_NULL) { + state->head->hcrc = (int)((state->flags >> 9) & 1); + state->head->done = 1; + } + strm->adler = state->check = crc32(0L, Z_NULL, 0); + state->mode = TYPE; + break; +#endif + case DICTID: + NEEDBITS(32); + strm->adler = state->check = REVERSE(hold); + 
INITBITS(); + state->mode = DICT; + /*FALLTHRU*/ + case DICT: + if (state->havedict == 0) { + RESTORE(); + return Z_NEED_DICT; + } + strm->adler = state->check = adler32(0L, Z_NULL, 0); + state->mode = TYPE; + /*FALLTHRU*/ + case TYPE: + if (flush == Z_BLOCK) goto inf_leave; + /*FALLTHRU*/ + case TYPEDO: + if (state->last) { + BYTEBITS(); + state->mode = CHECK; + break; + } + NEEDBITS(3); + state->last = BITS(1); + DROPBITS(1); + switch (BITS(2)) { + case 0: /* stored block */ + Tracev((stderr, "inflate: stored block%s\n", + state->last ? " (last)" : "")); + state->mode = STORED; + break; + case 1: /* fixed block */ + fixedtables(state); + Tracev((stderr, "inflate: fixed codes block%s\n", + state->last ? " (last)" : "")); + state->mode = LEN; /* decode codes */ + break; + case 2: /* dynamic block */ + Tracev((stderr, "inflate: dynamic codes block%s\n", + state->last ? " (last)" : "")); + state->mode = TABLE; + break; + case 3: + strm->msg = (char *)"invalid block type"; + state->mode = BAD; + } + DROPBITS(2); + break; + case STORED: + BYTEBITS(); /* go to byte boundary */ + NEEDBITS(32); + if ((hold & 0xffff) != ((hold >> 16) ^ 0xffff)) { + strm->msg = (char *)"invalid stored block lengths"; + state->mode = BAD; + break; + } + state->length = (unsigned)hold & 0xffff; + Tracev((stderr, "inflate: stored length %u\n", + state->length)); + INITBITS(); + state->mode = COPY; + /*FALLTHRU*/ + case COPY: + copy = state->length; + if (copy) { + if (copy > have) copy = have; + if (copy > left) copy = left; + if (copy == 0) goto inf_leave; + zmemcpy(put, next, copy); + have -= copy; + next += copy; + left -= copy; + put += copy; + state->length -= copy; + break; + } + Tracev((stderr, "inflate: stored end\n")); + state->mode = TYPE; + break; + case TABLE: + NEEDBITS(14); + state->nlen = BITS(5) + 257; + DROPBITS(5); + state->ndist = BITS(5) + 1; + DROPBITS(5); + state->ncode = BITS(4) + 4; + DROPBITS(4); +#ifndef PKZIP_BUG_WORKAROUND + if (state->nlen > 286 || state->ndist > 30) { + strm->msg = (char *)"too many length or distance symbols"; + state->mode = BAD; + break; + } +#endif + Tracev((stderr, "inflate: table sizes ok\n")); + state->have = 0; + state->mode = LENLENS; + /*FALLTHRU*/ + case LENLENS: + while (state->have < state->ncode) { + NEEDBITS(3); + state->lens[order[state->have++]] = (unsigned short)BITS(3); + DROPBITS(3); + } + while (state->have < 19) + state->lens[order[state->have++]] = 0; + state->next = state->codes; + state->lencode = (code const FAR *)(state->next); + state->lenbits = 7; + ret = inflate_table(CODES, state->lens, 19, &(state->next), + &(state->lenbits), state->work); + if (ret) { + strm->msg = (char *)"invalid code lengths set"; + state->mode = BAD; + break; + } + Tracev((stderr, "inflate: code lengths ok\n")); + state->have = 0; + state->mode = CODELENS; + /*FALLTHRU*/ + case CODELENS: + while (state->have < state->nlen + state->ndist) { + for (;;) { + this = state->lencode[BITS(state->lenbits)]; + if ((unsigned)(this.bits) <= bits) break; + PULLBYTE(); + } + if (this.val < 16) { + NEEDBITS(this.bits); + DROPBITS(this.bits); + state->lens[state->have++] = this.val; + } + else { + if (this.val == 16) { + NEEDBITS(this.bits + 2); + DROPBITS(this.bits); + if (state->have == 0) { + strm->msg = (char *)"invalid bit length repeat"; + state->mode = BAD; + break; + } + len = state->lens[state->have - 1]; + copy = 3 + BITS(2); + DROPBITS(2); + } + else if (this.val == 17) { + NEEDBITS(this.bits + 3); + DROPBITS(this.bits); + len = 0; + copy = 3 + BITS(3); + DROPBITS(3); + } + 
else { + NEEDBITS(this.bits + 7); + DROPBITS(this.bits); + len = 0; + copy = 11 + BITS(7); + DROPBITS(7); + } + if (state->have + copy > state->nlen + state->ndist) { + strm->msg = (char *)"invalid bit length repeat"; + state->mode = BAD; + break; + } + while (copy--) + state->lens[state->have++] = (unsigned short)len; + } + } + + /* handle error breaks in while */ + if (state->mode == BAD) break; + + /* build code tables */ + state->next = state->codes; + state->lencode = (code const FAR *)(state->next); + state->lenbits = 9; + ret = inflate_table(LENS, state->lens, state->nlen, &(state->next), + &(state->lenbits), state->work); + if (ret) { + strm->msg = (char *)"invalid literal/lengths set"; + state->mode = BAD; + break; + } + state->distcode = (code const FAR *)(state->next); + state->distbits = 6; + ret = inflate_table(DISTS, state->lens + state->nlen, state->ndist, + &(state->next), &(state->distbits), state->work); + if (ret) { + strm->msg = (char *)"invalid distances set"; + state->mode = BAD; + break; + } + Tracev((stderr, "inflate: codes ok\n")); + state->mode = LEN; + /*FALLTHRU*/ + case LEN: + if (have >= 6 && left >= 258) { + RESTORE(); + inflate_fast(strm, out); + LOAD(); + break; + } + for (;;) { + this = state->lencode[BITS(state->lenbits)]; + if ((unsigned)(this.bits) <= bits) break; + PULLBYTE(); + } + if (this.op && (this.op & 0xf0) == 0) { + last = this; + for (;;) { + this = state->lencode[last.val + + (BITS(last.bits + last.op) >> last.bits)]; + if ((unsigned)(last.bits + this.bits) <= bits) break; + PULLBYTE(); + } + DROPBITS(last.bits); + } + DROPBITS(this.bits); + state->length = (unsigned)this.val; + if ((int)(this.op) == 0) { + Tracevv((stderr, this.val >= 0x20 && this.val < 0x7f ? + "inflate: literal '%c'\n" : + "inflate: literal 0x%02x\n", this.val)); + state->mode = LIT; + break; + } + if (this.op & 32) { + Tracevv((stderr, "inflate: end of block\n")); + state->mode = TYPE; + break; + } + if (this.op & 64) { + strm->msg = (char *)"invalid literal/length code"; + state->mode = BAD; + break; + } + state->extra = (unsigned)(this.op) & 15; + state->mode = LENEXT; + /*FALLTHRU*/ + case LENEXT: + if (state->extra) { + NEEDBITS(state->extra); + state->length += BITS(state->extra); + DROPBITS(state->extra); + } + Tracevv((stderr, "inflate: length %u\n", state->length)); + state->mode = DIST; + /*FALLTHRU*/ + case DIST: + for (;;) { + this = state->distcode[BITS(state->distbits)]; + if ((unsigned)(this.bits) <= bits) break; + PULLBYTE(); + } + if ((this.op & 0xf0) == 0) { + last = this; + for (;;) { + this = state->distcode[last.val + + (BITS(last.bits + last.op) >> last.bits)]; + if ((unsigned)(last.bits + this.bits) <= bits) break; + PULLBYTE(); + } + DROPBITS(last.bits); + } + DROPBITS(this.bits); + if (this.op & 64) { + strm->msg = (char *)"invalid distance code"; + state->mode = BAD; + break; + } + state->offset = (unsigned)this.val; + state->extra = (unsigned)(this.op) & 15; + state->mode = DISTEXT; + /*FALLTHRU*/ + case DISTEXT: + if (state->extra) { + NEEDBITS(state->extra); + state->offset += BITS(state->extra); + DROPBITS(state->extra); + } +#ifdef INFLATE_STRICT + if (state->offset > state->dmax) { + strm->msg = (char *)"invalid distance too far back"; + state->mode = BAD; + break; + } +#endif + if (state->offset > state->whave + out - left) { + strm->msg = (char *)"invalid distance too far back"; + state->mode = BAD; + break; + } + Tracevv((stderr, "inflate: distance %u\n", state->offset)); + state->mode = MATCH; + /*FALLTHRU*/ + case MATCH: + if (left == 
0) goto inf_leave; + copy = out - left; + if (state->offset > copy) { /* copy from window */ + copy = state->offset - copy; + if (copy > state->write) { + copy -= state->write; + from = state->window + (state->wsize - copy); + } + else + from = state->window + (state->write - copy); + if (copy > state->length) copy = state->length; + } + else { /* copy from output */ + from = put - state->offset; + copy = state->length; + } + if (copy > left) copy = left; + left -= copy; + state->length -= copy; + do { + *put++ = *from++; + } while (--copy); + if (state->length == 0) state->mode = LEN; + break; + case LIT: + if (left == 0) goto inf_leave; + *put++ = (unsigned char)(state->length); + left--; + state->mode = LEN; + break; + case CHECK: + if (state->wrap) { + NEEDBITS(32); + out -= left; + strm->total_out += out; + state->total += out; + if (out) + strm->adler = state->check = + UPDATE(state->check, put - out, out); + out = left; + if (( +#ifdef GUNZIP + state->flags ? hold : +#endif + REVERSE(hold)) != state->check) { + strm->msg = (char *)"incorrect data check"; + state->mode = BAD; + break; + } + INITBITS(); + Tracev((stderr, "inflate: check matches trailer\n")); + } +#ifdef GUNZIP + state->mode = LENGTH; + /*FALLTHRU*/ + case LENGTH: + if (state->wrap && state->flags) { + NEEDBITS(32); + if (hold != (state->total & 0xffffffffUL)) { + strm->msg = (char *)"incorrect length check"; + state->mode = BAD; + break; + } + INITBITS(); + Tracev((stderr, "inflate: length matches trailer\n")); + } +#endif + state->mode = DONE; + /*FALLTHRU*/ + case DONE: + ret = Z_STREAM_END; + goto inf_leave; + case BAD: + ret = Z_DATA_ERROR; + goto inf_leave; + case MEM: + return Z_MEM_ERROR; + case SYNC: + default: + return Z_STREAM_ERROR; + } + + /* + Return from inflate(), updating the total counts and the check value. + If there was no progress during the inflate() call, return a buffer + error. Call updatewindow() to create and/or update the window state. + Note: a memory error from inflate() is non-recoverable. + */ + inf_leave: + RESTORE(); + if (state->wsize || (state->mode < CHECK && out != strm->avail_out)) + if (updatewindow(strm, out)) { + state->mode = MEM; + return Z_MEM_ERROR; + } + in -= strm->avail_in; + out -= strm->avail_out; + strm->total_in += in; + strm->total_out += out; + state->total += out; + if (state->wrap && out) + strm->adler = state->check = + UPDATE(state->check, strm->next_out - out, out); + strm->data_type = state->bits + (state->last ? 64 : 0) + + (state->mode == TYPE ? 
128 : 0); + if (((in == 0 && out == 0) || flush == Z_FINISH) && ret == Z_OK) + ret = Z_BUF_ERROR; + return ret; +} + +int ZEXPORT inflateEnd(strm) +z_streamp strm; +{ + struct inflate_state FAR *state; + if (strm == Z_NULL || strm->state == Z_NULL || strm->zfree == (free_func)0) + return Z_STREAM_ERROR; + state = (struct inflate_state FAR *)strm->state; + if (state->window != Z_NULL) ZFREE(strm, state->window); + ZFREE(strm, strm->state); + strm->state = Z_NULL; + Tracev((stderr, "inflate: end\n")); + return Z_OK; +} + +int ZEXPORT inflateSetDictionary(strm, dictionary, dictLength) +z_streamp strm; +const Bytef *dictionary; +uInt dictLength; +{ + struct inflate_state FAR *state; + unsigned long id; + + /* check state */ + if (strm == Z_NULL || strm->state == Z_NULL) return Z_STREAM_ERROR; + state = (struct inflate_state FAR *)strm->state; + if (state->wrap != 0 && state->mode != DICT) + return Z_STREAM_ERROR; + + /* check for correct dictionary id */ + if (state->mode == DICT) { + id = adler32(0L, Z_NULL, 0); + id = adler32(id, dictionary, dictLength); + if (id != state->check) + return Z_DATA_ERROR; + } + + /* copy dictionary to window */ + if (updatewindow(strm, strm->avail_out)) { + state->mode = MEM; + return Z_MEM_ERROR; + } + if (dictLength > state->wsize) { + zmemcpy(state->window, dictionary + dictLength - state->wsize, + state->wsize); + state->whave = state->wsize; + } + else { + zmemcpy(state->window + state->wsize - dictLength, dictionary, + dictLength); + state->whave = dictLength; + } + state->havedict = 1; + Tracev((stderr, "inflate: dictionary set\n")); + return Z_OK; +} + +int ZEXPORT inflateGetHeader(strm, head) +z_streamp strm; +gz_headerp head; +{ + struct inflate_state FAR *state; + + /* check state */ + if (strm == Z_NULL || strm->state == Z_NULL) return Z_STREAM_ERROR; + state = (struct inflate_state FAR *)strm->state; + if ((state->wrap & 2) == 0) return Z_STREAM_ERROR; + + /* save header structure */ + state->head = head; + head->done = 0; + return Z_OK; +} + +/* + Search buf[0..len-1] for the pattern: 0, 0, 0xff, 0xff. Return when found + or when out of input. When called, *have is the number of pattern bytes + found in order so far, in 0..3. On return *have is updated to the new + state. If on return *have equals four, then the pattern was found and the + return value is how many bytes were read including the last byte of the + pattern. If *have is less than four, then the pattern has not been found + yet and the return value is len. In the latter case, syncsearch() can be + called again with more data and the *have state. *have is initialized to + zero for the first call. + */ +local unsigned syncsearch(have, buf, len) +unsigned FAR *have; +unsigned char FAR *buf; +unsigned len; +{ + unsigned got; + unsigned next; + + got = *have; + next = 0; + while (next < len && got < 4) { + if ((int)(buf[next]) == (got < 2 ? 
0 : 0xff)) + got++; + else if (buf[next]) + got = 0; + else + got = 4 - got; + next++; + } + *have = got; + return next; +} + +int ZEXPORT inflateSync(strm) +z_streamp strm; +{ + unsigned len; /* number of bytes to look at or looked at */ + unsigned long in, out; /* temporary to save total_in and total_out */ + unsigned char buf[4]; /* to restore bit buffer to byte string */ + struct inflate_state FAR *state; + + /* check parameters */ + if (strm == Z_NULL || strm->state == Z_NULL) return Z_STREAM_ERROR; + state = (struct inflate_state FAR *)strm->state; + if (strm->avail_in == 0 && state->bits < 8) return Z_BUF_ERROR; + + /* if first time, start search in bit buffer */ + if (state->mode != SYNC) { + state->mode = SYNC; + state->hold <<= state->bits & 7; + state->bits -= state->bits & 7; + len = 0; + while (state->bits >= 8) { + buf[len++] = (unsigned char)(state->hold); + state->hold >>= 8; + state->bits -= 8; + } + state->have = 0; + (void) syncsearch(&(state->have), buf, len); + } + + /* search available input */ + len = syncsearch(&(state->have), strm->next_in, strm->avail_in); + strm->avail_in -= len; + strm->next_in += len; + strm->total_in += len; + + /* return no joy or set up to restart inflate() on a new block */ + if (state->have != 4) return Z_DATA_ERROR; + in = strm->total_in; out = strm->total_out; + (void) inflateReset(strm); + strm->total_in = in; strm->total_out = out; + state->mode = TYPE; + return Z_OK; +} + +/* + Returns true if inflate is currently at the end of a block generated by + Z_SYNC_FLUSH or Z_FULL_FLUSH. This function is used by one PPP + implementation to provide an additional safety check. PPP uses + Z_SYNC_FLUSH but removes the length bytes of the resulting empty stored + block. When decompressing, PPP checks that at the end of input packet, + inflate is waiting for these length bytes. 
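+
+   (For reference: the empty stored block that Z_SYNC_FLUSH produces ends
+   with the four length bytes 00 00 ff ff, the same pattern syncsearch()
+   scans for above, so "waiting for these length bytes" is precisely the
+   state->mode == STORED, state->bits == 0 condition tested below.)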
+ */ +int ZEXPORT inflateSyncPoint(strm) +z_streamp strm; +{ + struct inflate_state FAR *state; + + if (strm == Z_NULL || strm->state == Z_NULL) return Z_STREAM_ERROR; + state = (struct inflate_state FAR *)strm->state; + return state->mode == STORED && state->bits == 0; +} + +int ZEXPORT inflateCopy(dest, source) +z_streamp dest; +z_streamp source; +{ + struct inflate_state FAR *state; + struct inflate_state FAR *copy; + unsigned char FAR *window; + unsigned wsize; + + /* check input */ + if (dest == Z_NULL || source == Z_NULL || source->state == Z_NULL || + source->zalloc == (alloc_func)0 || source->zfree == (free_func)0) + return Z_STREAM_ERROR; + state = (struct inflate_state FAR *)source->state; + + /* allocate space */ + copy = (struct inflate_state FAR *) + ZALLOC(source, 1, sizeof(struct inflate_state)); + if (copy == Z_NULL) return Z_MEM_ERROR; + window = Z_NULL; + if (state->window != Z_NULL) { + window = (unsigned char FAR *) + ZALLOC(source, 1U << state->wbits, sizeof(unsigned char)); + if (window == Z_NULL) { + ZFREE(source, copy); + return Z_MEM_ERROR; + } + } + + /* copy state */ + zmemcpy(dest, source, sizeof(z_stream)); + zmemcpy(copy, state, sizeof(struct inflate_state)); + if (state->lencode >= state->codes && + state->lencode <= state->codes + ENOUGH - 1) { + copy->lencode = copy->codes + (state->lencode - state->codes); + copy->distcode = copy->codes + (state->distcode - state->codes); + } + copy->next = copy->codes + (state->next - state->codes); + if (window != Z_NULL) { + wsize = 1U << state->wbits; + zmemcpy(window, state->window, wsize); + } + copy->window = window; + dest->state = (struct internal_state FAR *)copy; + return Z_OK; +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/zmod/inflate.h b/sys/cddl/contrib/opensolaris/uts/common/zmod/inflate.h new file mode 100644 index 000000000000..4d28b221770b --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/zmod/inflate.h @@ -0,0 +1,117 @@ +/* inflate.h -- internal inflate state definition + * Copyright (C) 1995-2004 Mark Adler + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* WARNING: this file should *not* be used by applications. It is + part of the implementation of the compression library and is + subject to change. Applications should only use zlib.h. + */ + +/* define NO_GZIP when compiling if you want to disable gzip header and + trailer decoding by inflate(). NO_GZIP would be used to avoid linking in + the crc code when it is not needed. For shared libraries, gzip decoding + should be left enabled. 
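+   Independently of NO_GZIP, the wrapper is chosen per stream by the
+   windowBits argument of inflateInit2(): 8..15 expects a zlib header, a
+   negative value selects raw deflate, adding 16 expects a gzip header, and
+   adding 32 auto-detects zlib or gzip (see the wrap computation in
+   inflateInit2_() in inflate.c).  For example:
+
+       ret = inflateInit2(strm, 15 + 32);   /* 32K window, zlib or gzip */
+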
*/ +#ifndef NO_GZIP +# define GUNZIP +#endif + +/* Possible inflate modes between inflate() calls */ +typedef enum { + HEAD, /* i: waiting for magic header */ + FLAGS, /* i: waiting for method and flags (gzip) */ + TIME, /* i: waiting for modification time (gzip) */ + OS, /* i: waiting for extra flags and operating system (gzip) */ + EXLEN, /* i: waiting for extra length (gzip) */ + EXTRA, /* i: waiting for extra bytes (gzip) */ + NAME, /* i: waiting for end of file name (gzip) */ + COMMENT, /* i: waiting for end of comment (gzip) */ + HCRC, /* i: waiting for header crc (gzip) */ + DICTID, /* i: waiting for dictionary check value */ + DICT, /* waiting for inflateSetDictionary() call */ + TYPE, /* i: waiting for type bits, including last-flag bit */ + TYPEDO, /* i: same, but skip check to exit inflate on new block */ + STORED, /* i: waiting for stored size (length and complement) */ + COPY, /* i/o: waiting for input or output to copy stored block */ + TABLE, /* i: waiting for dynamic block table lengths */ + LENLENS, /* i: waiting for code length code lengths */ + CODELENS, /* i: waiting for length/lit and distance code lengths */ + LEN, /* i: waiting for length/lit code */ + LENEXT, /* i: waiting for length extra bits */ + DIST, /* i: waiting for distance code */ + DISTEXT, /* i: waiting for distance extra bits */ + MATCH, /* o: waiting for output space to copy string */ + LIT, /* o: waiting for output space to write literal */ + CHECK, /* i: waiting for 32-bit check value */ + LENGTH, /* i: waiting for 32-bit length (gzip) */ + DONE, /* finished check, done -- remain here until reset */ + BAD, /* got a data error -- remain here until reset */ + MEM, /* got an inflate() memory error -- remain here until reset */ + SYNC /* looking for synchronization bytes to restart inflate() */ +} inflate_mode; + +/* + State transitions between above modes - + + (most modes can go to the BAD or MEM mode -- not shown for clarity) + + Process header: + HEAD -> (gzip) or (zlib) + (gzip) -> FLAGS -> TIME -> OS -> EXLEN -> EXTRA -> NAME + NAME -> COMMENT -> HCRC -> TYPE + (zlib) -> DICTID or TYPE + DICTID -> DICT -> TYPE + Read deflate blocks: + TYPE -> STORED or TABLE or LEN or CHECK + STORED -> COPY -> TYPE + TABLE -> LENLENS -> CODELENS -> LEN + Read deflate codes: + LEN -> LENEXT or LIT or TYPE + LENEXT -> DIST -> DISTEXT -> MATCH -> LEN + LIT -> LEN + Process trailer: + CHECK -> LENGTH -> DONE + */ + +/* state maintained between inflate() calls. Approximately 7K bytes. 
*/ +struct inflate_state { + inflate_mode mode; /* current inflate mode */ + int last; /* true if processing last block */ + int wrap; /* bit 0 true for zlib, bit 1 true for gzip */ + int havedict; /* true if dictionary provided */ + int flags; /* gzip header method and flags (0 if zlib) */ + unsigned dmax; /* zlib header max distance (INFLATE_STRICT) */ + unsigned long check; /* protected copy of check value */ + unsigned long total; /* protected copy of output count */ + gz_headerp head; /* where to save gzip header information */ + /* sliding window */ + unsigned wbits; /* log base 2 of requested window size */ + unsigned wsize; /* window size or zero if not using window */ + unsigned whave; /* valid bytes in the window */ + unsigned write; /* window write index */ + unsigned char FAR *window; /* allocated sliding window, if needed */ + /* bit accumulator */ + unsigned long hold; /* input bit accumulator */ + unsigned bits; /* number of bits in "in" */ + /* for string and stored block copying */ + unsigned length; /* literal or length of data to copy */ + unsigned offset; /* distance back to copy string from */ + /* for table and code decoding */ + unsigned extra; /* extra bits needed */ + /* fixed and dynamic code tables */ + code const FAR *lencode; /* starting table for length/literal codes */ + code const FAR *distcode; /* starting table for distance codes */ + unsigned lenbits; /* index bits for lencode */ + unsigned distbits; /* index bits for distcode */ + /* dynamic table building */ + unsigned ncode; /* number of code length code lengths */ + unsigned nlen; /* number of length code lengths */ + unsigned ndist; /* number of distance code lengths */ + unsigned have; /* number of code lengths in lens[] */ + code FAR *next; /* next available space in codes[] */ + unsigned short lens[320]; /* temporary storage for code lengths */ + unsigned short work[288]; /* work area for code table building */ + code codes[ENOUGH]; /* space for code tables */ +}; diff --git a/sys/cddl/contrib/opensolaris/uts/common/zmod/inftrees.c b/sys/cddl/contrib/opensolaris/uts/common/zmod/inftrees.c new file mode 100644 index 000000000000..2d371675301c --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/zmod/inftrees.c @@ -0,0 +1,331 @@ +/* inftrees.c -- generate Huffman trees for efficient decoding + * Copyright (C) 1995-2005 Mark Adler + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include "zutil.h" +#include "inftrees.h" + +#define MAXBITS 15 + +static const char inflate_copyright[] = + " inflate 1.2.3 Copyright 1995-2005 Mark Adler "; +/* + If you use the zlib library in a product, an acknowledgment is welcome + in the documentation of your product. If for some reason you cannot + include such an acknowledgment, I would appreciate that you keep this + copyright string in the executable of your product. + */ + +/* + Build a set of tables to decode the provided canonical Huffman code. + The code lengths are lens[0..codes-1]. The result starts at *table, + whose indices are 0..2^bits-1. work is a writable array of at least + lens shorts, which is used as a work area. type is the type of code + to be generated, CODES, LENS, or DISTS. On return, zero is success, + -1 is an invalid code, and +1 means that ENOUGH isn't enough. table + on return points to the next available entry's address. bits is the + requested root table index bits, and on return it is the actual root + table index bits. 
It will differ if the request is greater than the + longest code or if it is less than the shortest code. + */ +int inflate_table(type, lens, codes, table, bits, work) +codetype type; +unsigned short FAR *lens; +unsigned codes; +code FAR * FAR *table; +unsigned FAR *bits; +unsigned short FAR *work; +{ + unsigned len; /* a code's length in bits */ + unsigned sym; /* index of code symbols */ + unsigned min, max; /* minimum and maximum code lengths */ + unsigned root; /* number of index bits for root table */ + unsigned curr; /* number of index bits for current table */ + unsigned drop; /* code bits to drop for sub-table */ + int left; /* number of prefix codes available */ + unsigned used; /* code entries in table used */ + unsigned huff; /* Huffman code */ + unsigned incr; /* for incrementing code, index */ + unsigned fill; /* index for replicating entries */ + unsigned low; /* low bits for current root entry */ + unsigned mask; /* mask for low root bits */ + code this; /* table entry for duplication */ + code FAR *next; /* next available space in table */ + const unsigned short FAR *base; /* base value table to use */ + const unsigned short FAR *extra; /* extra bits table to use */ + int end; /* use base and extra for symbol > end */ + unsigned short count[MAXBITS+1]; /* number of codes of each length */ + unsigned short offs[MAXBITS+1]; /* offsets in table for each length */ + static const unsigned short lbase[31] = { /* Length codes 257..285 base */ + 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31, + 35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258, 0, 0}; + static const unsigned short lext[31] = { /* Length codes 257..285 extra */ + 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, + 19, 19, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, 16, 201, 196}; + static const unsigned short dbase[32] = { /* Distance codes 0..29 base */ + 1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, 49, 65, 97, 129, 193, + 257, 385, 513, 769, 1025, 1537, 2049, 3073, 4097, 6145, + 8193, 12289, 16385, 24577, 0, 0}; + static const unsigned short dext[32] = { /* Distance codes 0..29 extra */ + 16, 16, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22, + 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, + 28, 28, 29, 29, 64, 64}; + + /* + Process a set of code lengths to create a canonical Huffman code. The + code lengths are lens[0..codes-1]. Each length corresponds to the + symbols 0..codes-1. The Huffman code is generated by first sorting the + symbols by length from short to long, and retaining the symbol order + for codes with equal lengths. Then the code starts with all zero bits + for the first code of the shortest length, and the codes are integer + increments for the same length, and zeros are appended as the length + increases. For the deflate format, these bits are stored backwards + from their more natural integer increment ordering, and so when the + decoding tables are built in the large loop below, the integer codes + are incremented backwards. + + This routine assumes, but does not check, that all of the entries in + lens[] are in the range 0..MAXBITS. The caller must assure this. + 1..MAXBITS is interpreted as that code length. zero means that that + symbol does not occur in this code. + + The codes are sorted by computing a count of codes for each length, + creating from that a table of starting indices for each length in the + sorted table, and then entering the symbols in order in the sorted + table. The sorted table is work[], with that space being provided by + the caller. 
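+
+   A small worked example of this canonical construction (illustrative
+   only, not from the deflate specification): lengths {2, 1, 3, 3} for
+   symbols {A, B, C, D} sort to B, A, C, D and yield the codes B = 0,
+   A = 10, C = 110, D = 111, before the backwards bit order described
+   above is applied.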
+ + The length counts are used for other purposes as well, i.e. finding + the minimum and maximum length codes, determining if there are any + codes at all, checking for a valid set of lengths, and looking ahead + at length counts to determine sub-table sizes when building the + decoding tables. + */ + + /* accumulate lengths for codes (assumes lens[] all in 0..MAXBITS) */ + for (len = 0; len <= MAXBITS; len++) + count[len] = 0; + for (sym = 0; sym < codes; sym++) + count[lens[sym]]++; + + /* bound code lengths, force root to be within code lengths */ + root = *bits; + for (max = MAXBITS; max >= 1; max--) + if (count[max] != 0) break; + if (root > max) root = max; + if (max == 0) { /* no symbols to code at all */ + this.op = (unsigned char)64; /* invalid code marker */ + this.bits = (unsigned char)1; + this.val = (unsigned short)0; + *(*table)++ = this; /* make a table to force an error */ + *(*table)++ = this; + *bits = 1; + return 0; /* no symbols, but wait for decoding to report error */ + } + for (min = 1; min <= MAXBITS; min++) + if (count[min] != 0) break; + if (root < min) root = min; + + /* check for an over-subscribed or incomplete set of lengths */ + left = 1; + for (len = 1; len <= MAXBITS; len++) { + left <<= 1; + left -= count[len]; + if (left < 0) return -1; /* over-subscribed */ + } + if (left > 0 && (type == CODES || max != 1)) + return -1; /* incomplete set */ + + /* generate offsets into symbol table for each length for sorting */ + offs[1] = 0; + for (len = 1; len < MAXBITS; len++) + offs[len + 1] = offs[len] + count[len]; + + /* sort symbols by length, by symbol order within each length */ + for (sym = 0; sym < codes; sym++) + if (lens[sym] != 0) work[offs[lens[sym]]++] = (unsigned short)sym; + + /* + Create and fill in decoding tables. In this loop, the table being + filled is at next and has curr index bits. The code being used is huff + with length len. That code is converted to an index by dropping drop + bits off of the bottom. For codes where len is less than drop + curr, + those top drop + curr - len bits are incremented through all values to + fill the table with replicated entries. + + root is the number of index bits for the root table. When len exceeds + root, sub-tables are created pointed to by the root entry with an index + of the low root bits of huff. This is saved in low to check for when a + new sub-table should be started. drop is zero when the root table is + being filled, and drop is root when sub-tables are being filled. + + When a new sub-table is needed, it is necessary to look ahead in the + code lengths to determine what size sub-table is needed. The length + counts are used for this, and so count[] is decremented as codes are + entered in the tables. + + used keeps track of how many table entries have been allocated from the + provided *table space. It is checked when a LENS table is being made + against the space in *table, ENOUGH, minus the maximum space needed by + the worst case distance code, MAXD. This should never happen, but the + sufficiency of ENOUGH has not been proven exhaustively, hence the check. + This assumes that when type == LENS, bits == 9. + + sym increments through all symbols, and the loop terminates when + all codes of length max, i.e. all codes, have been processed. This + routine permits incomplete codes, so another loop after this one fills + in the rest of the decoding tables with invalid code markers. 
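+
+   As a concrete illustration of the backwards increment used below: for
+   len == 3 it visits the codes in the order 000, 100, 010, 110, 001, 101,
+   011, 111, i.e. an ordinary binary count read from the least significant
+   bit end, matching the bit order in which deflate stores codes.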
+ */ + + /* set up for code type */ + switch (type) { + case CODES: + base = extra = work; /* dummy value--not used */ + end = 19; + break; + case LENS: + base = lbase; + base -= 257; + extra = lext; + extra -= 257; + end = 256; + break; + default: /* DISTS */ + base = dbase; + extra = dext; + end = -1; + } + + /* initialize state for loop */ + huff = 0; /* starting code */ + sym = 0; /* starting code symbol */ + len = min; /* starting code length */ + next = *table; /* current table to fill in */ + curr = root; /* current table index bits */ + drop = 0; /* current bits to drop from code for index */ + low = (unsigned)(-1); /* trigger new sub-table when len > root */ + used = 1U << root; /* use root table entries */ + mask = used - 1; /* mask for comparing low */ + + /* check available table space */ + if (type == LENS && used >= ENOUGH - MAXD) + return 1; + + /* process all codes and make table entries */ + for (;;) { + /* create table entry */ + this.bits = (unsigned char)(len - drop); + if ((int)(work[sym]) < end) { + this.op = (unsigned char)0; + this.val = work[sym]; + } + else if ((int)(work[sym]) > end) { + this.op = (unsigned char)(extra[work[sym]]); + this.val = base[work[sym]]; + } + else { + this.op = (unsigned char)(32 + 64); /* end of block */ + this.val = 0; + } + + /* replicate for those indices with low len bits equal to huff */ + incr = 1U << (len - drop); + fill = 1U << curr; + min = fill; /* save offset to next table */ + do { + fill -= incr; + next[(huff >> drop) + fill] = this; + } while (fill != 0); + + /* backwards increment the len-bit code huff */ + incr = 1U << (len - 1); + while (huff & incr) + incr >>= 1; + if (incr != 0) { + huff &= incr - 1; + huff += incr; + } + else + huff = 0; + + /* go to next symbol, update count, len */ + sym++; + if (--(count[len]) == 0) { + if (len == max) break; + len = lens[work[sym]]; + } + + /* create new sub-table if needed */ + if (len > root && (huff & mask) != low) { + /* if first time, transition to sub-tables */ + if (drop == 0) + drop = root; + + /* increment past last table */ + next += min; /* here min is 1 << curr */ + + /* determine length of next table */ + curr = len - drop; + left = (int)(1 << curr); + while (curr + drop < max) { + left -= count[curr + drop]; + if (left <= 0) break; + curr++; + left <<= 1; + } + + /* check for enough space */ + used += 1U << curr; + if (type == LENS && used >= ENOUGH - MAXD) + return 1; + + /* point entry in root table to sub-table */ + low = huff & mask; + (*table)[low].op = (unsigned char)curr; + (*table)[low].bits = (unsigned char)root; + (*table)[low].val = (unsigned short)(next - *table); + } + } + + /* + Fill in rest of table for incomplete codes. This loop is similar to the + loop above in incrementing huff for table indices. It is assumed that + len is equal to curr + drop, so there is no loop needed to increment + through high index bits. When the current sub-table is filled, the loop + drops back to the root table to fill in any remaining entries there. 
+ */ + this.op = (unsigned char)64; /* invalid code marker */ + this.bits = (unsigned char)(len - drop); + this.val = (unsigned short)0; + while (huff != 0) { + /* when done with sub-table, drop back to root table */ + if (drop != 0 && (huff & mask) != low) { + drop = 0; + len = root; + next = *table; + this.bits = (unsigned char)len; + } + + /* put invalid code marker in table */ + next[huff >> drop] = this; + + /* backwards increment the len-bit code huff */ + incr = 1U << (len - 1); + while (huff & incr) + incr >>= 1; + if (incr != 0) { + huff &= incr - 1; + huff += incr; + } + else + huff = 0; + } + + /* set return parameters */ + *table += used; + *bits = root; + return 0; +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/zmod/inftrees.h b/sys/cddl/contrib/opensolaris/uts/common/zmod/inftrees.h new file mode 100644 index 000000000000..546e8c082fdb --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/zmod/inftrees.h @@ -0,0 +1,57 @@ +/* inftrees.h -- header to use inftrees.c + * Copyright (C) 1995-2005 Mark Adler + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* WARNING: this file should *not* be used by applications. It is + part of the implementation of the compression library and is + subject to change. Applications should only use zlib.h. + */ + +/* Structure for decoding tables. Each entry provides either the + information needed to do the operation requested by the code that + indexed that table entry, or it provides a pointer to another + table that indexes more bits of the code. op indicates whether + the entry is a pointer to another table, a literal, a length or + distance, an end-of-block, or an invalid code. For a table + pointer, the low four bits of op is the number of index bits of + that table. For a length or distance, the low four bits of op + is the number of extra bits to get after the code. bits is + the number of bits in this code or part of the code to drop off + of the bit buffer. val is the actual byte to output in the case + of a literal, the base length or distance, or the offset from + the current table to the next table. Each entry is four bytes. */ +typedef struct { + unsigned char op; /* operation, extra bits, table bits */ + unsigned char bits; /* bits in this part of the code */ + unsigned short val; /* offset in table or code value */ +} code; + +/* op values as set by inflate_table(): + 00000000 - literal + 0000tttt - table link, tttt != 0 is the number of table index bits + 0001eeee - length or distance, eeee is the number of extra bits + 01100000 - end of block + 01000000 - invalid code + */ + +/* Maximum size of dynamic tree. The maximum found in a long but non- + exhaustive search was 1444 code structures (852 for length/literals + and 592 for distances, the latter actually the result of an + exhaustive search). The true maximum is not known, but the value + below is more than safe. 
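+   (inflate_table() in inftrees.c checks a length/literal table build
+   against ENOUGH - MAXD, i.e. 2048 - 592 = 1456 entries, so that the
+   distance table built immediately afterwards still has room in the
+   remainder of the same codes[] array.)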
*/ +#define ENOUGH 2048 +#define MAXD 592 + +/* Type of code to build for inftable() */ +typedef enum { + CODES, + LENS, + DISTS +} codetype; + +extern int inflate_table OF((codetype type, unsigned short FAR *lens, + unsigned codes, code FAR * FAR *table, + unsigned FAR *bits, unsigned short FAR *work)); diff --git a/sys/cddl/contrib/opensolaris/uts/common/zmod/opensolaris_crc32.c b/sys/cddl/contrib/opensolaris/uts/common/zmod/opensolaris_crc32.c new file mode 100644 index 000000000000..61ad581ef562 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/zmod/opensolaris_crc32.c @@ -0,0 +1,428 @@ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* crc32.c -- compute the CRC-32 of a data stream + * Copyright (C) 1995-2005 Mark Adler + * For conditions of distribution and use, see copyright notice in zlib.h + * + * Thanks to Rodney Brown <rbrown64@csc.com.au> for his contribution of faster + * CRC methods: exclusive-oring 32 bits of data at a time, and pre-computing + * tables for updating the shift register in one step with three exclusive-ors + * instead of four steps with four exclusive-ors. This results in about a + * factor of two increase in speed on a Power PC G4 (PPC7455) using gcc -O3. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + Note on the use of DYNAMIC_CRC_TABLE: there is no mutex or semaphore + protection on the static variables used to control the first-use generation + of the crc tables. Therefore, if you #define DYNAMIC_CRC_TABLE, you should + first call get_crc_table() to initialize the tables before allowing more than + one thread to use crc32(). + */ + +#ifdef MAKECRCH +# include <stdio.h> +# ifndef DYNAMIC_CRC_TABLE +# define DYNAMIC_CRC_TABLE +# endif /* !DYNAMIC_CRC_TABLE */ +#endif /* MAKECRCH */ + +#include "zutil.h" /* for STDC and FAR definitions */ + +#define local static + +/* Find a four-byte integer type for crc32_little() and crc32_big(). */ +#ifndef NOBYFOUR +# ifdef STDC /* need ANSI C limits.h to determine sizes */ +# include <limits.h> +# define BYFOUR +# if (UINT_MAX == 0xffffffffUL) + typedef unsigned int u4; +# else +# if (ULONG_MAX == 0xffffffffUL) + typedef unsigned long u4; +# else +# if (USHRT_MAX == 0xffffffffUL) + typedef unsigned short u4; +# else +# undef BYFOUR /* can't find a four-byte integer type! */ +# endif +# endif +# endif +# endif /* STDC */ +#endif /* !NOBYFOUR */ + +/* Definitions for doing the crc four data bytes at a time. */ +#ifdef BYFOUR +# define REV(w) (((w)>>24)+(((w)>>8)&0xff00)+ \ + (((w)&0xff00)<<8)+(((w)&0xff)<<24)) + local unsigned long crc32_little OF((unsigned long, + const unsigned char FAR *, unsigned)); + local unsigned long crc32_big OF((unsigned long, + const unsigned char FAR *, unsigned)); +# define TBLS 8 +#else +# define TBLS 1 +#endif /* BYFOUR */ + +/* Local functions for crc concatenation */ +local unsigned long gf2_matrix_times OF((unsigned long *mat, + unsigned long vec)); +local void gf2_matrix_square OF((unsigned long *square, unsigned long *mat)); + +#ifdef DYNAMIC_CRC_TABLE + +local volatile int crc_table_empty = 1; +local unsigned long FAR crc_table[TBLS][256]; +local void make_crc_table OF((void)); +#ifdef MAKECRCH + local void write_table OF((FILE *, const unsigned long FAR *)); +#endif /* MAKECRCH */ +/* + Generate tables for a byte-wise 32-bit CRC calculation on the polynomial: + x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x+1. 
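+
+  In the reflected convention described in the next paragraph, this
+  polynomial, with the x^32 term left implicit, is the constant
+  0xedb88320UL that appears in make_crc_table() and crc32_combine() below.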
+ + Polynomials over GF(2) are represented in binary, one bit per coefficient, + with the lowest powers in the most significant bit. Then adding polynomials + is just exclusive-or, and multiplying a polynomial by x is a right shift by + one. If we call the above polynomial p, and represent a byte as the + polynomial q, also with the lowest power in the most significant bit (so the + byte 0xb1 is the polynomial x^7+x^3+x+1), then the CRC is (q*x^32) mod p, + where a mod b means the remainder after dividing a by b. + + This calculation is done using the shift-register method of multiplying and + taking the remainder. The register is initialized to zero, and for each + incoming bit, x^32 is added mod p to the register if the bit is a one (where + x^32 mod p is p+x^32 = x^26+...+1), and the register is multiplied mod p by + x (which is shifting right by one and adding x^32 mod p if the bit shifted + out is a one). We start with the highest power (least significant bit) of + q and repeat for all eight bits of q. + + The first table is simply the CRC of all possible eight bit values. This is + all the information needed to generate CRCs on data a byte at a time for all + combinations of CRC register values and incoming bytes. The remaining tables + allow for word-at-a-time CRC calculation for both big-endian and little- + endian machines, where a word is four bytes. +*/ +local void make_crc_table() +{ + unsigned long c; + int n, k; + unsigned long poly; /* polynomial exclusive-or pattern */ + /* terms of polynomial defining this crc (except x^32): */ + static volatile int first = 1; /* flag to limit concurrent making */ + static const unsigned char p[] = {0,1,2,4,5,7,8,10,11,12,16,22,23,26}; + + /* See if another task is already doing this (not thread-safe, but better + than nothing -- significantly reduces duration of vulnerability in + case the advice about DYNAMIC_CRC_TABLE is ignored) */ + if (first) { + first = 0; + + /* make exclusive-or pattern from polynomial (0xedb88320UL) */ + poly = 0UL; + for (n = 0; n < sizeof(p)/sizeof(unsigned char); n++) + poly |= 1UL << (31 - p[n]); + + /* generate a crc for every 8-bit value */ + for (n = 0; n < 256; n++) { + c = (unsigned long)n; + for (k = 0; k < 8; k++) + c = c & 1 ? 
poly ^ (c >> 1) : c >> 1; + crc_table[0][n] = c; + } + +#ifdef BYFOUR + /* generate crc for each value followed by one, two, and three zeros, + and then the byte reversal of those as well as the first table */ + for (n = 0; n < 256; n++) { + c = crc_table[0][n]; + crc_table[4][n] = REV(c); + for (k = 1; k < 4; k++) { + c = crc_table[0][c & 0xff] ^ (c >> 8); + crc_table[k][n] = c; + crc_table[k + 4][n] = REV(c); + } + } +#endif /* BYFOUR */ + + crc_table_empty = 0; + } + else { /* not first */ + /* wait for the other guy to finish (not efficient, but rare) */ + while (crc_table_empty) + ; + } + +#ifdef MAKECRCH + /* write out CRC tables to crc32.h */ + { + FILE *out; + + out = fopen("crc32.h", "w"); + if (out == NULL) return; + fprintf(out, "/* crc32.h -- tables for rapid CRC calculation\n"); + fprintf(out, " * Generated automatically by crc32.c\n */\n\n"); + fprintf(out, "local const unsigned long FAR "); + fprintf(out, "crc_table[TBLS][256] =\n{\n {\n"); + write_table(out, crc_table[0]); +# ifdef BYFOUR + fprintf(out, "#ifdef BYFOUR\n"); + for (k = 1; k < 8; k++) { + fprintf(out, " },\n {\n"); + write_table(out, crc_table[k]); + } + fprintf(out, "#endif\n"); +# endif /* BYFOUR */ + fprintf(out, " }\n};\n"); + fclose(out); + } +#endif /* MAKECRCH */ +} + +#ifdef MAKECRCH +local void write_table(out, table) + FILE *out; + const unsigned long FAR *table; +{ + int n; + + for (n = 0; n < 256; n++) + fprintf(out, "%s0x%08lxUL%s", n % 5 ? "" : " ", table[n], + n == 255 ? "\n" : (n % 5 == 4 ? ",\n" : ", ")); +} +#endif /* MAKECRCH */ + +#else /* !DYNAMIC_CRC_TABLE */ +/* ======================================================================== + * Tables of CRC-32s of all single-byte values, made by make_crc_table(). + */ +#include "crc32.h" +#endif /* DYNAMIC_CRC_TABLE */ + +/* ========================================================================= + * This function can be used by asm versions of crc32() + */ +const unsigned long FAR * ZEXPORT get_crc_table() +{ +#ifdef DYNAMIC_CRC_TABLE + if (crc_table_empty) + make_crc_table(); +#endif /* DYNAMIC_CRC_TABLE */ + return (const unsigned long FAR *)crc_table; +} + +/* ========================================================================= */ +#define DO1 crc = crc_table[0][((int)crc ^ (*buf++)) & 0xff] ^ (crc >> 8) +#define DO8 DO1; DO1; DO1; DO1; DO1; DO1; DO1; DO1 + +/* ========================================================================= */ +unsigned long ZEXPORT crc32(crc, buf, len) + unsigned long crc; + const unsigned char FAR *buf; + unsigned len; +{ + if (buf == Z_NULL) return 0UL; + +#ifdef DYNAMIC_CRC_TABLE + if (crc_table_empty) + make_crc_table(); +#endif /* DYNAMIC_CRC_TABLE */ + +#ifdef BYFOUR + if (sizeof(void *) == sizeof(ptrdiff_t)) { + u4 endian; + + endian = 1; + if (*((unsigned char *)(&endian))) + return crc32_little(crc, buf, len); + else + return crc32_big(crc, buf, len); + } +#endif /* BYFOUR */ + crc = crc ^ 0xffffffffUL; + while (len >= 8) { + DO8; + len -= 8; + } + if (len) do { + DO1; + } while (--len); + return crc ^ 0xffffffffUL; +} + +#ifdef BYFOUR + +/* ========================================================================= */ +#define DOLIT4 c ^= *buf4++; \ + c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \ + crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24] +#define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4 + +/* ========================================================================= */ +local unsigned long crc32_little(crc, buf, len) + unsigned long crc; 
+ const unsigned char FAR *buf; + unsigned len; +{ + register u4 c; + register const u4 FAR *buf4; + + c = (u4)crc; + c = ~c; + while (len && ((ptrdiff_t)buf & 3)) { + c = crc_table[0][(c ^ *buf++) & 0xff] ^ (c >> 8); + len--; + } + + buf4 = (const u4 FAR *)(const void FAR *)buf; + while (len >= 32) { + DOLIT32; + len -= 32; + } + while (len >= 4) { + DOLIT4; + len -= 4; + } + buf = (const unsigned char FAR *)buf4; + + if (len) do { + c = crc_table[0][(c ^ *buf++) & 0xff] ^ (c >> 8); + } while (--len); + c = ~c; + return (unsigned long)c; +} + +/* ========================================================================= */ +#define DOBIG4 c ^= *++buf4; \ + c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \ + crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24] +#define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4 + +/* ========================================================================= */ +local unsigned long crc32_big(crc, buf, len) + unsigned long crc; + const unsigned char FAR *buf; + unsigned len; +{ + register u4 c; + register const u4 FAR *buf4; + + c = REV((u4)crc); + c = ~c; + while (len && ((ptrdiff_t)buf & 3)) { + c = crc_table[4][(c >> 24) ^ *buf++] ^ (c << 8); + len--; + } + + buf4 = (const u4 FAR *)(const void FAR *)buf; + buf4--; + while (len >= 32) { + DOBIG32; + len -= 32; + } + while (len >= 4) { + DOBIG4; + len -= 4; + } + buf4++; + buf = (const unsigned char FAR *)buf4; + + if (len) do { + c = crc_table[4][(c >> 24) ^ *buf++] ^ (c << 8); + } while (--len); + c = ~c; + return (unsigned long)(REV(c)); +} + +#endif /* BYFOUR */ + +#define GF2_DIM 32 /* dimension of GF(2) vectors (length of CRC) */ + +/* ========================================================================= */ +local unsigned long gf2_matrix_times(mat, vec) + unsigned long *mat; + unsigned long vec; +{ + unsigned long sum; + + sum = 0; + while (vec) { + if (vec & 1) + sum ^= *mat; + vec >>= 1; + mat++; + } + return sum; +} + +/* ========================================================================= */ +local void gf2_matrix_square(square, mat) + unsigned long *square; + unsigned long *mat; +{ + int n; + + for (n = 0; n < GF2_DIM; n++) + square[n] = gf2_matrix_times(mat, mat[n]); +} + +/* ========================================================================= */ +uLong ZEXPORT crc32_combine(crc1, crc2, len2) + uLong crc1; + uLong crc2; + z_off_t len2; +{ + int n; + unsigned long row; + unsigned long even[GF2_DIM]; /* even-power-of-two zeros operator */ + unsigned long odd[GF2_DIM]; /* odd-power-of-two zeros operator */ + + /* degenerate case */ + if (len2 == 0) + return crc1; + + /* put operator for one zero bit in odd */ + odd[0] = 0xedb88320UL; /* CRC-32 polynomial */ + row = 1; + for (n = 1; n < GF2_DIM; n++) { + odd[n] = row; + row <<= 1; + } + + /* put operator for two zero bits in even */ + gf2_matrix_square(even, odd); + + /* put operator for four zero bits in odd */ + gf2_matrix_square(odd, even); + + /* apply len2 zeros to crc1 (first square will put the operator for one + zero byte, eight zero bits, in even) */ + do { + /* apply zeros operator for this bit of len2 */ + gf2_matrix_square(even, odd); + if (len2 & 1) + crc1 = gf2_matrix_times(even, crc1); + len2 >>= 1; + + /* if no more bits set, then done */ + if (len2 == 0) + break; + + /* another iteration of the loop with odd and even swapped */ + gf2_matrix_square(odd, even); + if (len2 & 1) + crc1 = gf2_matrix_times(odd, crc1); + len2 >>= 1; + + /* if no more bits set, then done */ + } while 
(len2 != 0); + + /* return combined crc */ + crc1 ^= crc2; + return crc1; +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/zmod/trees.c b/sys/cddl/contrib/opensolaris/uts/common/zmod/trees.c new file mode 100644 index 000000000000..ce0cebc0304c --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/zmod/trees.c @@ -0,0 +1,1219 @@ +/* trees.c -- output deflated data using Huffman coding + * Copyright (C) 1995-2005 Jean-loup Gailly + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * ALGORITHM + * + * The "deflation" process uses several Huffman trees. The more + * common source values are represented by shorter bit sequences. + * + * Each code tree is stored in a compressed form which is itself + * a Huffman encoding of the lengths of all the code strings (in + * ascending order by source values). The actual code strings are + * reconstructed from the lengths in the inflate process, as described + * in the deflate specification. + * + * REFERENCES + * + * Deutsch, L.P.,"'Deflate' Compressed Data Format Specification". + * Available in ftp.uu.net:/pub/archiving/zip/doc/deflate-1.1.doc + * + * Storer, James A. + * Data Compression: Methods and Theory, pp. 49-50. + * Computer Science Press, 1988. ISBN 0-7167-8156-5. + * + * Sedgewick, R. + * Algorithms, p290. + * Addison-Wesley, 1983. ISBN 0-201-06672-6. + */ + +/* #define GEN_TREES_H */ + +#include "deflate.h" + +#ifdef DEBUG +# include <ctype.h> +#endif + +/* =========================================================================== + * Constants + */ + +#define MAX_BL_BITS 7 +/* Bit length codes must not exceed MAX_BL_BITS bits */ + +#define END_BLOCK 256 +/* end of block literal code */ + +#define REP_3_6 16 +/* repeat previous bit length 3-6 times (2 bits of repeat count) */ + +#define REPZ_3_10 17 +/* repeat a zero length 3-10 times (3 bits of repeat count) */ + +#define REPZ_11_138 18 +/* repeat a zero length 11-138 times (7 bits of repeat count) */ + +local const int extra_lbits[LENGTH_CODES] /* extra bits for each length code */ + = {0,0,0,0,0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,0}; + +local const int extra_dbits[D_CODES] /* extra bits for each distance code */ + = {0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13}; + +local const int extra_blbits[BL_CODES]/* extra bits for each bit length code */ + = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,3,7}; + +local const uch bl_order[BL_CODES] + = {16,17,18,0,8,7,9,6,10,5,11,4,12,3,13,2,14,1,15}; +/* The lengths of the bit length codes are sent in order of decreasing + * probability, to avoid transmitting the lengths for unused bit length codes. + */ + +#define Buf_size (8 * 2*sizeof(char)) +/* Number of bits used within bi_buf. (bi_buf might be implemented on + * more than 16 bits on some systems.) + */ + +/* =========================================================================== + * Local data. These are initialized only once. + */ + +#define DIST_CODE_LEN 512 /* see definition of array dist_code below */ + +#if defined(GEN_TREES_H) || !defined(STDC) +/* non ANSI compilers may not accept trees.h */ + +local ct_data static_ltree[L_CODES+2]; +/* The static literal tree. Since the bit lengths are imposed, there is no + * need for the L_CODES extra codes used during heap construction. However + * The codes 286 and 287 are needed to build a canonical tree (see _tr_init + * below). + */ + +local ct_data static_dtree[D_CODES]; +/* The static distance tree. 
(Actually a trivial tree since all codes use + * 5 bits.) + */ + +uch _dist_code[DIST_CODE_LEN]; +/* Distance codes. The first 256 values correspond to the distances + * 3 .. 258, the last 256 values correspond to the top 8 bits of + * the 15 bit distances. + */ + +uch _length_code[MAX_MATCH-MIN_MATCH+1]; +/* length code for each normalized match length (0 == MIN_MATCH) */ + +local int base_length[LENGTH_CODES]; +/* First normalized length for each code (0 = MIN_MATCH) */ + +local int base_dist[D_CODES]; +/* First normalized distance for each code (0 = distance of 1) */ + +#else +# include "trees.h" +#endif /* GEN_TREES_H */ + +struct static_tree_desc_s { + const ct_data *static_tree; /* static tree or NULL */ + const intf *extra_bits; /* extra bits for each code or NULL */ + int extra_base; /* base index for extra_bits */ + int elems; /* max number of elements in the tree */ + int max_length; /* max bit length for the codes */ +}; + +local static_tree_desc static_l_desc = +{static_ltree, extra_lbits, LITERALS+1, L_CODES, MAX_BITS}; + +local static_tree_desc static_d_desc = +{static_dtree, extra_dbits, 0, D_CODES, MAX_BITS}; + +local static_tree_desc static_bl_desc = +{(const ct_data *)0, extra_blbits, 0, BL_CODES, MAX_BL_BITS}; + +/* =========================================================================== + * Local (static) routines in this file. + */ + +local void tr_static_init OF((void)); +local void init_block OF((deflate_state *s)); +local void pqdownheap OF((deflate_state *s, ct_data *tree, int k)); +local void gen_bitlen OF((deflate_state *s, tree_desc *desc)); +local void gen_codes OF((ct_data *tree, int max_code, ushf *bl_count)); +local void build_tree OF((deflate_state *s, tree_desc *desc)); +local void scan_tree OF((deflate_state *s, ct_data *tree, int max_code)); +local void send_tree OF((deflate_state *s, ct_data *tree, int max_code)); +local int build_bl_tree OF((deflate_state *s)); +local void send_all_trees OF((deflate_state *s, int lcodes, int dcodes, + int blcodes)); +local void compress_block OF((deflate_state *s, ct_data *ltree, + ct_data *dtree)); +local void set_data_type OF((deflate_state *s)); +local unsigned bi_reverse OF((unsigned value, int length)); +local void bi_windup OF((deflate_state *s)); +local void bi_flush OF((deflate_state *s)); +local void copy_block OF((deflate_state *s, charf *buf, unsigned len, + int header)); + +#ifdef GEN_TREES_H +local void gen_trees_header OF((void)); +#endif + +#ifndef DEBUG +# define send_code(s, c, tree) send_bits(s, tree[c].Code, tree[c].Len) + /* Send a code of the given tree. c and tree must not have side effects */ + +#else /* DEBUG */ +# define send_code(s, c, tree) \ + { if (z_verbose>2) fprintf(stderr,"\ncd %3d ",(c)); \ + send_bits(s, tree[c].Code, tree[c].Len); } +#endif + +/* =========================================================================== + * Output a short LSB first on the stream. + * IN assertion: there is enough room in pendingBuf. + */ +#define put_short(s, w) { \ + put_byte(s, (uch)((w) & 0xff)); \ + put_byte(s, (uch)((ush)(w) >> 8)); \ +} + +/* =========================================================================== + * Send a value on a given number of bits. + * IN assertion: length <= 16 and value fits in length bits. 
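+ *
+ * As an illustrative walk-through (added here, not in the original zlib
+ * comment): with Buf_size == 16, bi_valid == 14 and a 5-bit value v, the
+ * low two bits of v complete bi_buf, the full short is flushed, and the
+ * high three bits are carried over:
+ *
+ *    s->bi_buf |= v << 14;          two bits land in the buffer
+ *    put_short(s, s->bi_buf);       sixteen completed bits written out
+ *    s->bi_buf = (ush)v >> 2;       three leftover bits kept
+ *    s->bi_valid = 14 + 5 - 16;     i.e. 3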
+ */ +#ifdef DEBUG +local void send_bits OF((deflate_state *s, int value, int length)); + +local void send_bits(s, value, length) + deflate_state *s; + int value; /* value to send */ + int length; /* number of bits */ +{ + Tracevv((stderr," l %2d v %4x ", length, value)); + Assert(length > 0 && length <= 15, "invalid length"); + s->bits_sent += (ulg)length; + + /* If not enough room in bi_buf, use (valid) bits from bi_buf and + * (16 - bi_valid) bits from value, leaving (width - (16-bi_valid)) + * unused bits in value. + */ + if (s->bi_valid > (int)Buf_size - length) { + s->bi_buf |= (value << s->bi_valid); + put_short(s, s->bi_buf); + s->bi_buf = (ush)value >> (Buf_size - s->bi_valid); + s->bi_valid += length - Buf_size; + } else { + s->bi_buf |= value << s->bi_valid; + s->bi_valid += length; + } +} +#else /* !DEBUG */ + +#define send_bits(s, value, length) \ +{ int len = length;\ + if (s->bi_valid > (int)Buf_size - len) {\ + int val = value;\ + s->bi_buf |= (val << s->bi_valid);\ + put_short(s, s->bi_buf);\ + s->bi_buf = (ush)val >> (Buf_size - s->bi_valid);\ + s->bi_valid += len - Buf_size;\ + } else {\ + s->bi_buf |= (value) << s->bi_valid;\ + s->bi_valid += len;\ + }\ +} +#endif /* DEBUG */ + + +/* the arguments must not have side effects */ + +/* =========================================================================== + * Initialize the various 'constant' tables. + */ +local void tr_static_init() +{ +#if defined(GEN_TREES_H) || !defined(STDC) + static int static_init_done = 0; + int n; /* iterates over tree elements */ + int bits; /* bit counter */ + int length; /* length value */ + int code; /* code value */ + int dist; /* distance index */ + ush bl_count[MAX_BITS+1]; + /* number of codes at each bit length for an optimal tree */ + + if (static_init_done) return; + + /* For some embedded targets, global variables are not initialized: */ + static_l_desc.static_tree = static_ltree; + static_l_desc.extra_bits = extra_lbits; + static_d_desc.static_tree = static_dtree; + static_d_desc.extra_bits = extra_dbits; + static_bl_desc.extra_bits = extra_blbits; + + /* Initialize the mapping length (0..255) -> length code (0..28) */ + length = 0; + for (code = 0; code < LENGTH_CODES-1; code++) { + base_length[code] = length; + for (n = 0; n < (1<<extra_lbits[code]); n++) { + _length_code[length++] = (uch)code; + } + } + Assert (length == 256, "tr_static_init: length != 256"); + /* Note that the length 255 (match length 258) can be represented + * in two different ways: code 284 + 5 bits or code 285, so we + * overwrite length_code[255] to use the best encoding: + */ + _length_code[length-1] = (uch)code; + + /* Initialize the mapping dist (0..32K) -> dist code (0..29) */ + dist = 0; + for (code = 0 ; code < 16; code++) { + base_dist[code] = dist; + for (n = 0; n < (1<<extra_dbits[code]); n++) { + _dist_code[dist++] = (uch)code; + } + } + Assert (dist == 256, "tr_static_init: dist != 256"); + dist >>= 7; /* from now on, all distances are divided by 128 */ + for ( ; code < D_CODES; code++) { + base_dist[code] = dist << 7; + for (n = 0; n < (1<<(extra_dbits[code]-7)); n++) { + _dist_code[256 + dist++] = (uch)code; + } + } + Assert (dist == 256, "tr_static_init: 256+dist != 512"); + + /* Construct the codes of the static literal tree */ + for (bits = 0; bits <= MAX_BITS; bits++) bl_count[bits] = 0; + n = 0; + while (n <= 143) static_ltree[n++].Len = 8, bl_count[8]++; + while (n <= 255) static_ltree[n++].Len = 9, bl_count[9]++; + while (n <= 279) static_ltree[n++].Len = 7, bl_count[7]++; + while (n 
<= 287) static_ltree[n++].Len = 8, bl_count[8]++;
+    /* Codes 286 and 287 do not exist, but we must include them in the
+     * tree construction to get a canonical Huffman tree (longest code
+     * all ones)
+     */
+    gen_codes((ct_data *)static_ltree, L_CODES+1, bl_count);
+
+    /* The static distance tree is trivial: */
+    for (n = 0; n < D_CODES; n++) {
+        static_dtree[n].Len = 5;
+        static_dtree[n].Code = bi_reverse((unsigned)n, 5);
+    }
+    static_init_done = 1;
+
+#  ifdef GEN_TREES_H
+    gen_trees_header();
+#  endif
+#endif /* defined(GEN_TREES_H) || !defined(STDC) */
+}
+
+/* ===========================================================================
+ * Generate the file trees.h describing the static trees.
+ */
+#ifdef GEN_TREES_H
+#  ifndef DEBUG
+#    include <stdio.h>
+#  endif
+
+#  define SEPARATOR(i, last, width) \
+      ((i) == (last)? "\n};\n\n" : \
+       ((i) % (width) == (width)-1 ? ",\n" : ", "))
+
+void gen_trees_header()
+{
+    FILE *header = fopen("trees.h", "w");
+    int i;
+
+    Assert (header != NULL, "Can't open trees.h");
+    fprintf(header,
+            "/* header created automatically with -DGEN_TREES_H */\n\n");
+
+    fprintf(header, "local const ct_data static_ltree[L_CODES+2] = {\n");
+    for (i = 0; i < L_CODES+2; i++) {
+        fprintf(header, "{{%3u},{%3u}}%s", static_ltree[i].Code,
+                static_ltree[i].Len, SEPARATOR(i, L_CODES+1, 5));
+    }
+
+    fprintf(header, "local const ct_data static_dtree[D_CODES] = {\n");
+    for (i = 0; i < D_CODES; i++) {
+        fprintf(header, "{{%2u},{%2u}}%s", static_dtree[i].Code,
+                static_dtree[i].Len, SEPARATOR(i, D_CODES-1, 5));
+    }
+
+    fprintf(header, "const uch _dist_code[DIST_CODE_LEN] = {\n");
+    for (i = 0; i < DIST_CODE_LEN; i++) {
+        fprintf(header, "%2u%s", _dist_code[i],
+                SEPARATOR(i, DIST_CODE_LEN-1, 20));
+    }
+
+    fprintf(header, "const uch _length_code[MAX_MATCH-MIN_MATCH+1]= {\n");
+    for (i = 0; i < MAX_MATCH-MIN_MATCH+1; i++) {
+        fprintf(header, "%2u%s", _length_code[i],
+                SEPARATOR(i, MAX_MATCH-MIN_MATCH, 20));
+    }
+
+    fprintf(header, "local const int base_length[LENGTH_CODES] = {\n");
+    for (i = 0; i < LENGTH_CODES; i++) {
+        fprintf(header, "%1u%s", base_length[i],
+                SEPARATOR(i, LENGTH_CODES-1, 20));
+    }
+
+    fprintf(header, "local const int base_dist[D_CODES] = {\n");
+    for (i = 0; i < D_CODES; i++) {
+        fprintf(header, "%5u%s", base_dist[i],
+                SEPARATOR(i, D_CODES-1, 10));
+    }
+
+    fclose(header);
+}
+#endif /* GEN_TREES_H */
+
+/* ===========================================================================
+ * Initialize the tree data structures for a new zlib stream.
+ */
+void _tr_init(s)
+    deflate_state *s;
+{
+    tr_static_init();
+
+    s->l_desc.dyn_tree = s->dyn_ltree;
+    s->l_desc.stat_desc = &static_l_desc;
+
+    s->d_desc.dyn_tree = s->dyn_dtree;
+    s->d_desc.stat_desc = &static_d_desc;
+
+    s->bl_desc.dyn_tree = s->bl_tree;
+    s->bl_desc.stat_desc = &static_bl_desc;
+
+    s->bi_buf = 0;
+    s->bi_valid = 0;
+    s->last_eob_len = 8; /* enough lookahead for inflate */
+#ifdef DEBUG
+    s->compressed_len = 0L;
+    s->bits_sent = 0L;
+#endif
+
+    /* Initialize the first block of the first file: */
+    init_block(s);
+}
+
+/* ===========================================================================
+ * Initialize a new block.
+ */
+local void init_block(s)
+    deflate_state *s;
+{
+    int n; /* iterates over tree elements */
+
+    /* Initialize the trees.
+     */
+    for (n = 0; n < L_CODES; n++) s->dyn_ltree[n].Freq = 0;
+    for (n = 0; n < D_CODES; n++) s->dyn_dtree[n].Freq = 0;
+    for (n = 0; n < BL_CODES; n++) s->bl_tree[n].Freq = 0;
+
+    s->dyn_ltree[END_BLOCK].Freq = 1;
+    s->opt_len = s->static_len = 0L;
+    s->last_lit = s->matches = 0;
+}
+
+#define SMALLEST 1
+/* Index within the heap array of least frequent node in the Huffman tree */
+
+
+/* ===========================================================================
+ * Remove the smallest element from the heap and recreate the heap with
+ * one less element. Updates heap and heap_len.
+ */
+#define pqremove(s, tree, top) \
+{\
+    top = s->heap[SMALLEST]; \
+    s->heap[SMALLEST] = s->heap[s->heap_len--]; \
+    pqdownheap(s, tree, SMALLEST); \
+}
+
+/* ===========================================================================
+ * Compares two subtrees, using the tree depth as tie breaker when
+ * the subtrees have equal frequency. This minimizes the worst case length.
+ */
+#define smaller(tree, n, m, depth) \
+   (tree[n].Freq < tree[m].Freq || \
+   (tree[n].Freq == tree[m].Freq && depth[n] <= depth[m]))
+
+/* ===========================================================================
+ * Restore the heap property by moving down the tree starting at node k,
+ * exchanging a node with the smallest of its two sons if necessary, stopping
+ * when the heap property is re-established (each father smaller than its
+ * two sons).
+ */
+local void pqdownheap(s, tree, k)
+    deflate_state *s;
+    ct_data *tree;  /* the tree to restore */
+    int k;          /* node to move down */
+{
+    int v = s->heap[k];
+    int j = k << 1;  /* left son of k */
+    while (j <= s->heap_len) {
+        /* Set j to the smallest of the two sons: */
+        if (j < s->heap_len &&
+            smaller(tree, s->heap[j+1], s->heap[j], s->depth)) {
+            j++;
+        }
+        /* Exit if v is smaller than both sons */
+        if (smaller(tree, v, s->heap[j], s->depth)) break;
+
+        /* Exchange v with the smallest son */
+        s->heap[k] = s->heap[j];  k = j;
+
+        /* And continue down the tree, setting j to the left son of k */
+        j <<= 1;
+    }
+    s->heap[k] = v;
+}
+
+/* ===========================================================================
+ * Compute the optimal bit lengths for a tree and update the total bit length
+ * for the current block.
+ * IN assertion: the fields freq and dad are set, heap[heap_max] and
+ *    above are the tree nodes sorted by increasing frequency.
+ * OUT assertions: the field len is set to the optimal bit length, the
+ *     array bl_count contains the frequencies for each bit length.
+ *     The length opt_len is updated; static_len is also updated if stree is
+ *     not null.
+ */
+local void gen_bitlen(s, desc)
+    deflate_state *s;
+    tree_desc *desc;    /* the tree descriptor */
+{
+    ct_data *tree        = desc->dyn_tree;
+    int max_code         = desc->max_code;
+    const ct_data *stree = desc->stat_desc->static_tree;
+    const intf *extra    = desc->stat_desc->extra_bits;
+    int base             = desc->stat_desc->extra_base;
+    int max_length       = desc->stat_desc->max_length;
+    int h;              /* heap index */
+    int n, m;           /* iterate over the tree elements */
+    int bits;           /* bit length */
+    int xbits;          /* extra bits */
+    ush f;              /* frequency */
+    int overflow = 0;   /* number of elements with bit length too large */
+
+    for (bits = 0; bits <= MAX_BITS; bits++) s->bl_count[bits] = 0;
+
+    /* In a first pass, compute the optimal bit lengths (which may
+     * overflow in the case of the bit length tree).
+     */
+    tree[s->heap[s->heap_max]].Len = 0; /* root of the heap */
+
+    for (h = s->heap_max+1; h < HEAP_SIZE; h++) {
+        n = s->heap[h];
+        bits = tree[tree[n].Dad].Len + 1;
+        if (bits > max_length) bits = max_length, overflow++;
+        tree[n].Len = (ush)bits;
+        /* We overwrite tree[n].Dad which is no longer needed */
+
+        if (n > max_code) continue; /* not a leaf node */
+
+        s->bl_count[bits]++;
+        xbits = 0;
+        if (n >= base) xbits = extra[n-base];
+        f = tree[n].Freq;
+        s->opt_len += (ulg)f * (bits + xbits);
+        if (stree) s->static_len += (ulg)f * (stree[n].Len + xbits);
+    }
+    if (overflow == 0) return;
+
+    Trace((stderr,"\nbit length overflow\n"));
+    /* This happens for example on obj2 and pic of the Calgary corpus */
+
+    /* Find the first bit length which could increase: */
+    do {
+        bits = max_length-1;
+        while (s->bl_count[bits] == 0) bits--;
+        s->bl_count[bits]--;      /* move one leaf down the tree */
+        s->bl_count[bits+1] += 2; /* move one overflow item as its brother */
+        s->bl_count[max_length]--;
+        /* The brother of the overflow item also moves one step up,
+         * but this does not affect bl_count[max_length]
+         */
+        overflow -= 2;
+    } while (overflow > 0);
+
+    /* Now recompute all bit lengths, scanning in increasing frequency.
+     * h is still equal to HEAP_SIZE. (It is simpler to reconstruct all
+     * lengths instead of fixing only the wrong ones. This idea is taken
+     * from 'ar' written by Haruhiko Okumura.)
+     */
+    for (bits = max_length; bits != 0; bits--) {
+        n = s->bl_count[bits];
+        while (n != 0) {
+            m = s->heap[--h];
+            if (m > max_code) continue;
+            if ((unsigned) tree[m].Len != (unsigned) bits) {
+                Trace((stderr,"code %d bits %d->%d\n", m, tree[m].Len, bits));
+                s->opt_len += ((long)bits - (long)tree[m].Len)
+                              *(long)tree[m].Freq;
+                tree[m].Len = (ush)bits;
+            }
+            n--;
+        }
+    }
+}
+
+/* ===========================================================================
+ * Generate the codes for a given tree and bit counts (which need not be
+ * optimal).
+ * IN assertion: the array bl_count contains the bit length statistics for
+ * the given tree and the field len is set for all tree elements.
+ * OUT assertion: the field code is set for all tree elements of non
+ *     zero code length.
+ */
+local void gen_codes (tree, max_code, bl_count)
+    ct_data *tree;             /* the tree to decorate */
+    int max_code;              /* largest code with non zero frequency */
+    ushf *bl_count;            /* number of codes at each bit length */
+{
+    ush next_code[MAX_BITS+1]; /* next code value for each bit length */
+    ush code = 0;              /* running code value */
+    int bits;                  /* bit index */
+    int n;                     /* code index */
+
+    /* The distribution counts are first used to generate the code values
+     * without bit reversal.
+     */
+    for (bits = 1; bits <= MAX_BITS; bits++) {
+        next_code[bits] = code = (code + bl_count[bits-1]) << 1;
+    }
+    /* Check that the bit counts in bl_count are consistent. The last code
+     * must be all ones.
+     */
+    Assert (code + bl_count[MAX_BITS]-1 == (1<<MAX_BITS)-1,
+            "inconsistent bit counts");
+    Tracev((stderr,"\ngen_codes: max_code %d ", max_code));
+
+    for (n = 0; n <= max_code; n++) {
+        int len = tree[n].Len;
+        if (len == 0) continue;
+        /* Now reverse the bits */
+        tree[n].Code = bi_reverse(next_code[len]++, len);
+
+        Tracecv(tree != static_ltree, (stderr,"\nn %3d %c l %2d c %4x (%x) ",
+             n, (isgraph(n) ? n : ' '), len, tree[n].Code, next_code[len]-1));
+    }
+}
+
+/* ===========================================================================
+ * Construct one Huffman tree and assign the code bit strings and lengths.
+ * Update the total bit length for the current block. + * IN assertion: the field freq is set for all tree elements. + * OUT assertions: the fields len and code are set to the optimal bit length + * and corresponding code. The length opt_len is updated; static_len is + * also updated if stree is not null. The field max_code is set. + */ +local void build_tree(s, desc) + deflate_state *s; + tree_desc *desc; /* the tree descriptor */ +{ + ct_data *tree = desc->dyn_tree; + const ct_data *stree = desc->stat_desc->static_tree; + int elems = desc->stat_desc->elems; + int n, m; /* iterate over heap elements */ + int max_code = -1; /* largest code with non zero frequency */ + int node; /* new node being created */ + + /* Construct the initial heap, with least frequent element in + * heap[SMALLEST]. The sons of heap[n] are heap[2*n] and heap[2*n+1]. + * heap[0] is not used. + */ + s->heap_len = 0, s->heap_max = HEAP_SIZE; + + for (n = 0; n < elems; n++) { + if (tree[n].Freq != 0) { + s->heap[++(s->heap_len)] = max_code = n; + s->depth[n] = 0; + } else { + tree[n].Len = 0; + } + } + + /* The pkzip format requires that at least one distance code exists, + * and that at least one bit should be sent even if there is only one + * possible code. So to avoid special checks later on we force at least + * two codes of non zero frequency. + */ + while (s->heap_len < 2) { + node = s->heap[++(s->heap_len)] = (max_code < 2 ? ++max_code : 0); + tree[node].Freq = 1; + s->depth[node] = 0; + s->opt_len--; if (stree) s->static_len -= stree[node].Len; + /* node is 0 or 1 so it does not have extra bits */ + } + desc->max_code = max_code; + + /* The elements heap[heap_len/2+1 .. heap_len] are leaves of the tree, + * establish sub-heaps of increasing lengths: + */ + for (n = s->heap_len/2; n >= 1; n--) pqdownheap(s, tree, n); + + /* Construct the Huffman tree by repeatedly combining the least two + * frequent nodes. + */ + node = elems; /* next internal node of the tree */ + do { + pqremove(s, tree, n); /* n = node of least frequency */ + m = s->heap[SMALLEST]; /* m = node of next least frequency */ + + s->heap[--(s->heap_max)] = n; /* keep the nodes sorted by frequency */ + s->heap[--(s->heap_max)] = m; + + /* Create a new node father of n and m */ + tree[node].Freq = tree[n].Freq + tree[m].Freq; + s->depth[node] = (uch)((s->depth[n] >= s->depth[m] ? + s->depth[n] : s->depth[m]) + 1); + tree[n].Dad = tree[m].Dad = (ush)node; +#ifdef DUMP_BL_TREE + if (tree == s->bl_tree) { + fprintf(stderr,"\nnode %d(%d), sons %d(%d) %d(%d)", + node, tree[node].Freq, n, tree[n].Freq, m, tree[m].Freq); + } +#endif + /* and insert the new node in the heap */ + s->heap[SMALLEST] = node++; + pqdownheap(s, tree, SMALLEST); + + } while (s->heap_len >= 2); + + s->heap[--(s->heap_max)] = s->heap[SMALLEST]; + + /* At this point, the fields freq and dad are set. We can now + * generate the bit lengths. + */ + gen_bitlen(s, (tree_desc *)desc); + + /* The field len is now set, we can generate the bit codes */ + gen_codes ((ct_data *)tree, max_code, s->bl_count); +} + +/* =========================================================================== + * Scan a literal or distance tree to determine the frequencies of the codes + * in the bit length tree. 
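+ *
+ * A worked illustration (added here for clarity, not in the original
+ * comment): the run of code lengths 5,5,5,5,5 is tallied as one literal 5
+ * plus one REP_3_6 covering the remaining four repeats (extra bits value
+ * 4-3 = 1), while a run of twelve zero lengths is tallied as a single
+ * REPZ_11_138 (extra bits value 12-11 = 1). scan_tree only accumulates
+ * these frequencies in bl_tree; send_tree below emits the matching codes
+ * and their repeat-count extra bits.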
+ */ +local void scan_tree (s, tree, max_code) + deflate_state *s; + ct_data *tree; /* the tree to be scanned */ + int max_code; /* and its largest code of non zero frequency */ +{ + int n; /* iterates over all tree elements */ + int prevlen = -1; /* last emitted length */ + int curlen; /* length of current code */ + int nextlen = tree[0].Len; /* length of next code */ + int count = 0; /* repeat count of the current code */ + int max_count = 7; /* max repeat count */ + int min_count = 4; /* min repeat count */ + + if (nextlen == 0) max_count = 138, min_count = 3; + tree[max_code+1].Len = (ush)0xffff; /* guard */ + + for (n = 0; n <= max_code; n++) { + curlen = nextlen; nextlen = tree[n+1].Len; + if (++count < max_count && curlen == nextlen) { + continue; + } else if (count < min_count) { + s->bl_tree[curlen].Freq += count; + } else if (curlen != 0) { + if (curlen != prevlen) s->bl_tree[curlen].Freq++; + s->bl_tree[REP_3_6].Freq++; + } else if (count <= 10) { + s->bl_tree[REPZ_3_10].Freq++; + } else { + s->bl_tree[REPZ_11_138].Freq++; + } + count = 0; prevlen = curlen; + if (nextlen == 0) { + max_count = 138, min_count = 3; + } else if (curlen == nextlen) { + max_count = 6, min_count = 3; + } else { + max_count = 7, min_count = 4; + } + } +} + +/* =========================================================================== + * Send a literal or distance tree in compressed form, using the codes in + * bl_tree. + */ +local void send_tree (s, tree, max_code) + deflate_state *s; + ct_data *tree; /* the tree to be scanned */ + int max_code; /* and its largest code of non zero frequency */ +{ + int n; /* iterates over all tree elements */ + int prevlen = -1; /* last emitted length */ + int curlen; /* length of current code */ + int nextlen = tree[0].Len; /* length of next code */ + int count = 0; /* repeat count of the current code */ + int max_count = 7; /* max repeat count */ + int min_count = 4; /* min repeat count */ + + /* tree[max_code+1].Len = -1; */ /* guard already set */ + if (nextlen == 0) max_count = 138, min_count = 3; + + for (n = 0; n <= max_code; n++) { + curlen = nextlen; nextlen = tree[n+1].Len; + if (++count < max_count && curlen == nextlen) { + continue; + } else if (count < min_count) { + do { send_code(s, curlen, s->bl_tree); } while (--count != 0); + + } else if (curlen != 0) { + if (curlen != prevlen) { + send_code(s, curlen, s->bl_tree); count--; + } + Assert(count >= 3 && count <= 6, " 3_6?"); + send_code(s, REP_3_6, s->bl_tree); send_bits(s, count-3, 2); + + } else if (count <= 10) { + send_code(s, REPZ_3_10, s->bl_tree); send_bits(s, count-3, 3); + + } else { + send_code(s, REPZ_11_138, s->bl_tree); send_bits(s, count-11, 7); + } + count = 0; prevlen = curlen; + if (nextlen == 0) { + max_count = 138, min_count = 3; + } else if (curlen == nextlen) { + max_count = 6, min_count = 3; + } else { + max_count = 7, min_count = 4; + } + } +} + +/* =========================================================================== + * Construct the Huffman tree for the bit lengths and return the index in + * bl_order of the last bit length code to send. 
+ */ +local int build_bl_tree(s) + deflate_state *s; +{ + int max_blindex; /* index of last bit length code of non zero freq */ + + /* Determine the bit length frequencies for literal and distance trees */ + scan_tree(s, (ct_data *)s->dyn_ltree, s->l_desc.max_code); + scan_tree(s, (ct_data *)s->dyn_dtree, s->d_desc.max_code); + + /* Build the bit length tree: */ + build_tree(s, (tree_desc *)(&(s->bl_desc))); + /* opt_len now includes the length of the tree representations, except + * the lengths of the bit lengths codes and the 5+5+4 bits for the counts. + */ + + /* Determine the number of bit length codes to send. The pkzip format + * requires that at least 4 bit length codes be sent. (appnote.txt says + * 3 but the actual value used is 4.) + */ + for (max_blindex = BL_CODES-1; max_blindex >= 3; max_blindex--) { + if (s->bl_tree[bl_order[max_blindex]].Len != 0) break; + } + /* Update opt_len to include the bit length tree and counts */ + s->opt_len += 3*(max_blindex+1) + 5+5+4; + Tracev((stderr, "\ndyn trees: dyn %ld, stat %ld", + s->opt_len, s->static_len)); + + return max_blindex; +} + +/* =========================================================================== + * Send the header for a block using dynamic Huffman trees: the counts, the + * lengths of the bit length codes, the literal tree and the distance tree. + * IN assertion: lcodes >= 257, dcodes >= 1, blcodes >= 4. + */ +local void send_all_trees(s, lcodes, dcodes, blcodes) + deflate_state *s; + int lcodes, dcodes, blcodes; /* number of codes for each tree */ +{ + int rank; /* index in bl_order */ + + Assert (lcodes >= 257 && dcodes >= 1 && blcodes >= 4, "not enough codes"); + Assert (lcodes <= L_CODES && dcodes <= D_CODES && blcodes <= BL_CODES, + "too many codes"); + Tracev((stderr, "\nbl counts: ")); + send_bits(s, lcodes-257, 5); /* not +255 as stated in appnote.txt */ + send_bits(s, dcodes-1, 5); + send_bits(s, blcodes-4, 4); /* not -3 as stated in appnote.txt */ + for (rank = 0; rank < blcodes; rank++) { + Tracev((stderr, "\nbl code %2d ", bl_order[rank])); + send_bits(s, s->bl_tree[bl_order[rank]].Len, 3); + } + Tracev((stderr, "\nbl tree: sent %ld", s->bits_sent)); + + send_tree(s, (ct_data *)s->dyn_ltree, lcodes-1); /* literal tree */ + Tracev((stderr, "\nlit tree: sent %ld", s->bits_sent)); + + send_tree(s, (ct_data *)s->dyn_dtree, dcodes-1); /* distance tree */ + Tracev((stderr, "\ndist tree: sent %ld", s->bits_sent)); +} + +/* =========================================================================== + * Send a stored block + */ +void _tr_stored_block(s, buf, stored_len, eof) + deflate_state *s; + charf *buf; /* input block */ + ulg stored_len; /* length of input block */ + int eof; /* true if this is the last block for a file */ +{ + send_bits(s, (STORED_BLOCK<<1)+eof, 3); /* send block type */ +#ifdef DEBUG + s->compressed_len = (s->compressed_len + 3 + 7) & (ulg)~7L; + s->compressed_len += (stored_len + 4) << 3; +#endif + copy_block(s, buf, (unsigned)stored_len, 1); /* with header */ +} + +/* =========================================================================== + * Send one empty static block to give enough lookahead for inflate. + * This takes 10 bits, of which 7 may remain in the bit buffer. + * The current inflate code requires 9 bits of lookahead. If the + * last two codes for the previous block (real code plus EOB) were coded + * on 5 bits or less, inflate may have only 5+3 bits of lookahead to decode + * the last real code. In this case we send two empty static blocks instead + * of one. 
(There are no problems if the previous block is stored or fixed.) + * To simplify the code, we assume the worst case of last real code encoded + * on one bit only. + */ +void _tr_align(s) + deflate_state *s; +{ + send_bits(s, STATIC_TREES<<1, 3); + send_code(s, END_BLOCK, static_ltree); +#ifdef DEBUG + s->compressed_len += 10L; /* 3 for block type, 7 for EOB */ +#endif + bi_flush(s); + /* Of the 10 bits for the empty block, we have already sent + * (10 - bi_valid) bits. The lookahead for the last real code (before + * the EOB of the previous block) was thus at least one plus the length + * of the EOB plus what we have just sent of the empty static block. + */ + if (1 + s->last_eob_len + 10 - s->bi_valid < 9) { + send_bits(s, STATIC_TREES<<1, 3); + send_code(s, END_BLOCK, static_ltree); +#ifdef DEBUG + s->compressed_len += 10L; +#endif + bi_flush(s); + } + s->last_eob_len = 7; +} + +/* =========================================================================== + * Determine the best encoding for the current block: dynamic trees, static + * trees or store, and output the encoded block to the zip file. + */ +void _tr_flush_block(s, buf, stored_len, eof) + deflate_state *s; + charf *buf; /* input block, or NULL if too old */ + ulg stored_len; /* length of input block */ + int eof; /* true if this is the last block for a file */ +{ + ulg opt_lenb, static_lenb; /* opt_len and static_len in bytes */ + int max_blindex = 0; /* index of last bit length code of non zero freq */ + + /* Build the Huffman trees unless a stored block is forced */ + if (s->level > 0) { + + /* Check if the file is binary or text */ + if (stored_len > 0 && s->strm->data_type == Z_UNKNOWN) + set_data_type(s); + + /* Construct the literal and distance trees */ + build_tree(s, (tree_desc *)(&(s->l_desc))); + Tracev((stderr, "\nlit data: dyn %ld, stat %ld", s->opt_len, + s->static_len)); + + build_tree(s, (tree_desc *)(&(s->d_desc))); + Tracev((stderr, "\ndist data: dyn %ld, stat %ld", s->opt_len, + s->static_len)); + /* At this point, opt_len and static_len are the total bit lengths of + * the compressed block data, excluding the tree representations. + */ + + /* Build the bit length tree for the above two trees, and get the index + * in bl_order of the last bit length code to send. + */ + max_blindex = build_bl_tree(s); + + /* Determine the best encoding. Compute the block lengths in bytes. */ + opt_lenb = (s->opt_len+3+7)>>3; + static_lenb = (s->static_len+3+7)>>3; + + Tracev((stderr, "\nopt %lu(%lu) stat %lu(%lu) stored %lu lit %u ", + opt_lenb, s->opt_len, static_lenb, s->static_len, stored_len, + s->last_lit)); + + if (static_lenb <= opt_lenb) opt_lenb = static_lenb; + + } else { + Assert(buf != (char*)0, "lost buf"); + opt_lenb = static_lenb = stored_len + 5; /* force a stored block */ + } + +#ifdef FORCE_STORED + if (buf != (char*)0) { /* force stored block */ +#else + if (stored_len+4 <= opt_lenb && buf != (char*)0) { + /* 4: two words for the lengths */ +#endif + /* The test buf != NULL is only necessary if LIT_BUFSIZE > WSIZE. + * Otherwise we can't have processed more than WSIZE input bytes since + * the last block flush, because compression would have been + * successful. If LIT_BUFSIZE <= WSIZE, it is never too late to + * transform a block into a stored block. 
+ */ + _tr_stored_block(s, buf, stored_len, eof); + +#ifdef FORCE_STATIC + } else if (static_lenb >= 0) { /* force static trees */ +#else + } else if (s->strategy == Z_FIXED || static_lenb == opt_lenb) { +#endif + send_bits(s, (STATIC_TREES<<1)+eof, 3); + compress_block(s, (ct_data *)static_ltree, (ct_data *)static_dtree); +#ifdef DEBUG + s->compressed_len += 3 + s->static_len; +#endif + } else { + send_bits(s, (DYN_TREES<<1)+eof, 3); + send_all_trees(s, s->l_desc.max_code+1, s->d_desc.max_code+1, + max_blindex+1); + compress_block(s, (ct_data *)s->dyn_ltree, (ct_data *)s->dyn_dtree); +#ifdef DEBUG + s->compressed_len += 3 + s->opt_len; +#endif + } + Assert (s->compressed_len == s->bits_sent, "bad compressed size"); + /* The above check is made mod 2^32, for files larger than 512 MB + * and uLong implemented on 32 bits. + */ + init_block(s); + + if (eof) { + bi_windup(s); +#ifdef DEBUG + s->compressed_len += 7; /* align on byte boundary */ +#endif + } + Tracev((stderr,"\ncomprlen %lu(%lu) ", s->compressed_len>>3, + s->compressed_len-7*eof)); +} + +/* =========================================================================== + * Save the match info and tally the frequency counts. Return true if + * the current block must be flushed. + */ +int _tr_tally (s, dist, lc) + deflate_state *s; + unsigned dist; /* distance of matched string */ + unsigned lc; /* match length-MIN_MATCH or unmatched char (if dist==0) */ +{ + s->d_buf[s->last_lit] = (ush)dist; + s->l_buf[s->last_lit++] = (uch)lc; + if (dist == 0) { + /* lc is the unmatched char */ + s->dyn_ltree[lc].Freq++; + } else { + s->matches++; + /* Here, lc is the match length - MIN_MATCH */ + dist--; /* dist = match distance - 1 */ + Assert((ush)dist < (ush)MAX_DIST(s) && + (ush)lc <= (ush)(MAX_MATCH-MIN_MATCH) && + (ush)d_code(dist) < (ush)D_CODES, "_tr_tally: bad match"); + + s->dyn_ltree[_length_code[lc]+LITERALS+1].Freq++; + s->dyn_dtree[d_code(dist)].Freq++; + } + +#ifdef TRUNCATE_BLOCK + /* Try to guess if it is profitable to stop the current block here */ + if ((s->last_lit & 0x1fff) == 0 && s->level > 2) { + /* Compute an upper bound for the compressed length */ + ulg out_length = (ulg)s->last_lit*8L; + ulg in_length = (ulg)((long)s->strstart - s->block_start); + int dcode; + for (dcode = 0; dcode < D_CODES; dcode++) { + out_length += (ulg)s->dyn_dtree[dcode].Freq * + (5L+extra_dbits[dcode]); + } + out_length >>= 3; + Tracev((stderr,"\nlast_lit %u, in %ld, out ~%ld(%ld%%) ", + s->last_lit, in_length, out_length, + 100L - out_length*100L/in_length)); + if (s->matches < s->last_lit/2 && out_length < in_length/2) return 1; + } +#endif + return (s->last_lit == s->lit_bufsize-1); + /* We avoid equality with lit_bufsize because of wraparound at 64K + * on 16 bit machines and because stored blocks are restricted to + * 64K-1 bytes. 
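+     * (Hedged usage note, mirroring the calls made from deflate.c:
+     * _tr_tally(s, 0, c) records the literal byte c, and
+     * _tr_tally(s, distance, match_length - MIN_MATCH) records a match;
+     * a nonzero return tells the caller to flush the current block.)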
+ */ +} + +/* =========================================================================== + * Send the block data compressed using the given Huffman trees + */ +local void compress_block(s, ltree, dtree) + deflate_state *s; + ct_data *ltree; /* literal tree */ + ct_data *dtree; /* distance tree */ +{ + unsigned dist; /* distance of matched string */ + int lc; /* match length or unmatched char (if dist == 0) */ + unsigned lx = 0; /* running index in l_buf */ + unsigned code; /* the code to send */ + int extra; /* number of extra bits to send */ + + if (s->last_lit != 0) do { + dist = s->d_buf[lx]; + lc = s->l_buf[lx++]; + if (dist == 0) { + send_code(s, lc, ltree); /* send a literal byte */ + Tracecv(isgraph(lc), (stderr," '%c' ", lc)); + } else { + /* Here, lc is the match length - MIN_MATCH */ + code = _length_code[lc]; + send_code(s, code+LITERALS+1, ltree); /* send the length code */ + extra = extra_lbits[code]; + if (extra != 0) { + lc -= base_length[code]; + send_bits(s, lc, extra); /* send the extra length bits */ + } + dist--; /* dist is now the match distance - 1 */ + code = d_code(dist); + Assert (code < D_CODES, "bad d_code"); + + send_code(s, code, dtree); /* send the distance code */ + extra = extra_dbits[code]; + if (extra != 0) { + dist -= base_dist[code]; + send_bits(s, dist, extra); /* send the extra distance bits */ + } + } /* literal or match pair ? */ + + /* Check that the overlay between pending_buf and d_buf+l_buf is ok: */ + Assert((uInt)(s->pending) < s->lit_bufsize + 2*lx, + "pendingBuf overflow"); + + } while (lx < s->last_lit); + + send_code(s, END_BLOCK, ltree); + s->last_eob_len = ltree[END_BLOCK].Len; +} + +/* =========================================================================== + * Set the data type to BINARY or TEXT, using a crude approximation: + * set it to Z_TEXT if all symbols are either printable characters (33 to 255) + * or white spaces (9 to 13, or 32); or set it to Z_BINARY otherwise. + * IN assertion: the fields Freq of dyn_ltree are set. + */ +local void set_data_type(s) + deflate_state *s; +{ + int n; + + for (n = 0; n < 9; n++) + if (s->dyn_ltree[n].Freq != 0) + break; + if (n == 9) + for (n = 14; n < 32; n++) + if (s->dyn_ltree[n].Freq != 0) + break; + s->strm->data_type = (n == 32) ? Z_TEXT : Z_BINARY; +} + +/* =========================================================================== + * Reverse the first len bits of a code, using straightforward code (a faster + * method would use a table) + * IN assertion: 1 <= len <= 15 + */ +local unsigned bi_reverse(code, len) + unsigned code; /* the value to invert */ + int len; /* its bit length */ +{ + register unsigned res = 0; + do { + res |= code & 1; + code >>= 1, res <<= 1; + } while (--len > 0); + return res >> 1; +} + +/* =========================================================================== + * Flush the bit buffer, keeping at most 7 bits in it. 
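+ * For example (illustrative): with bi_valid == 16 the whole short is
+ * written and the buffer empties; with bi_valid == 13 one byte is written
+ * and five bits stay buffered; with bi_valid < 8 nothing is written.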
+ */
+local void bi_flush(s)
+    deflate_state *s;
+{
+    if (s->bi_valid == 16) {
+        put_short(s, s->bi_buf);
+        s->bi_buf = 0;
+        s->bi_valid = 0;
+    } else if (s->bi_valid >= 8) {
+        put_byte(s, (Byte)s->bi_buf);
+        s->bi_buf >>= 8;
+        s->bi_valid -= 8;
+    }
+}
+
+/* ===========================================================================
+ * Flush the bit buffer and align the output on a byte boundary
+ */
+local void bi_windup(s)
+    deflate_state *s;
+{
+    if (s->bi_valid > 8) {
+        put_short(s, s->bi_buf);
+    } else if (s->bi_valid > 0) {
+        put_byte(s, (Byte)s->bi_buf);
+    }
+    s->bi_buf = 0;
+    s->bi_valid = 0;
+#ifdef DEBUG
+    s->bits_sent = (s->bits_sent+7) & ~7;
+#endif
+}
+
+/* ===========================================================================
+ * Copy a stored block, storing first the length and its
+ * one's complement if requested.
+ */
+local void copy_block(s, buf, len, header)
+    deflate_state *s;
+    charf    *buf;    /* the input data */
+    unsigned len;     /* its length */
+    int      header;  /* true if block header must be written */
+{
+    bi_windup(s);        /* align on byte boundary */
+    s->last_eob_len = 8; /* enough lookahead for inflate */
+
+    if (header) {
+        put_short(s, (ush)len);
+        put_short(s, (ush)~len);
+#ifdef DEBUG
+        s->bits_sent += 2*16;
+#endif
+    }
+#ifdef DEBUG
+    s->bits_sent += (ulg)len<<3;
+#endif
+    while (len--) {
+        put_byte(s, *buf++);
+    }
+} diff --git a/sys/cddl/contrib/opensolaris/uts/common/zmod/zconf.h b/sys/cddl/contrib/opensolaris/uts/common/zmod/zconf.h new file mode 100644 index 000000000000..ccce7b2742da --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/zmod/zconf.h @@ -0,0 +1,117 @@ +/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _ZCONF_H
+#define _ZCONF_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/types.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * We don't want to turn on zlib's debugging.
+ */
+#undef DEBUG
+
+/*
+ * We define our own memory allocation and deallocation routines that use kmem.
+ */
+#define MY_ZCALLOC
+
+/*
+ * We don't define HAVE_MEMCPY here, but do in zutil.c, and implement our
+ * own versions of zmemcpy(), zmemzero(), and zmemcmp().
+ */
+
+/*
+ * We have a sufficiently capable compiler as to not need zlib's compiler hack.
+ */ +#define NO_DUMMY_DECL + +#define compressBound(len) (len + (len >> 12) + (len >> 14) + 11) + +#define z_off_t off_t +#define OF(p) p +#define ZEXTERN extern +#define ZEXPORT +#define ZEXPORTVA +#define FAR + +#define deflateInit_ z_deflateInit_ +#define deflate z_deflate +#define deflateEnd z_deflateEnd +#define inflateInit_ z_inflateInit_ +#define inflate z_inflate +#define inflateEnd z_inflateEnd +#define deflateInit2_ z_deflateInit2_ +#define deflateSetDictionary z_deflateSetDictionary +#define deflateCopy z_deflateCopy +#define deflateReset z_deflateReset +#define deflateParams z_deflateParams +#define deflateBound z_deflateBound +#define deflatePrime z_deflatePrime +#define inflateInit2_ z_inflateInit2_ +#define inflateSetDictionary z_inflateSetDictionary +#define inflateSync z_inflateSync +#define inflateSyncPoint z_inflateSyncPoint +#define inflateCopy z_inflateCopy +#define inflateReset z_inflateReset +#define inflateBack z_inflateBack +#define inflateBackEnd z_inflateBackEnd +#define compress zz_compress +#define compress2 zz_compress2 +#define uncompress zz_uncompress +#define adler32 z_adler32 +#define crc32 z_crc32 +#define get_crc_table z_get_crc_table +#define zError z_zError + +#define MAX_MEM_LEVEL 9 +#define MAX_WBITS 15 + +typedef unsigned char Byte; +typedef unsigned int uInt; +typedef unsigned long uLong; +typedef Byte Bytef; +typedef char charf; +typedef int intf; +typedef uInt uIntf; +typedef uLong uLongf; +typedef void *voidpc; +typedef void *voidpf; +typedef void *voidp; + +#ifdef __cplusplus +} +#endif + +#endif /* _ZCONF_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/zmod/zlib.h b/sys/cddl/contrib/opensolaris/uts/common/zmod/zlib.h new file mode 100644 index 000000000000..9b971a0f5722 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/zmod/zlib.h @@ -0,0 +1,1359 @@ +/* zlib.h -- interface of the 'zlib' general purpose compression library + version 1.2.3, July 18th, 2005 + + Copyright (C) 1995-2005 Jean-loup Gailly and Mark Adler + + This software is provided 'as-is', without any express or implied + warranty. In no event will the authors be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; you must not + claim that you wrote the original software. If you use this software + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + 2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original software. + 3. This notice may not be removed or altered from any source distribution. + + Jean-loup Gailly Mark Adler + jloup@gzip.org madler@alumni.caltech.edu + + + The data format used by the zlib library is described by RFCs (Request for + Comments) 1950 to 1952 in the files http://www.ietf.org/rfc/rfc1950.txt + (zlib format), rfc1951.txt (deflate format) and rfc1952.txt (gzip format). +*/ + +#ifndef _ZLIB_H +#define _ZLIB_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include "zconf.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define ZLIB_VERSION "1.2.3" +#define ZLIB_VERNUM 0x1230 + +/* + The 'zlib' compression library provides in-memory compression and + decompression functions, including integrity checks of the uncompressed + data. 
This version of the library supports only one compression method + (deflation) but other algorithms will be added later and will have the same + stream interface. + + Compression can be done in a single step if the buffers are large + enough (for example if an input file is mmap'ed), or can be done by + repeated calls of the compression function. In the latter case, the + application must provide more input and/or consume the output + (providing more output space) before each call. + + The compressed data format used by default by the in-memory functions is + the zlib format, which is a zlib wrapper documented in RFC 1950, wrapped + around a deflate stream, which is itself documented in RFC 1951. + + The library also supports reading and writing files in gzip (.gz) format + with an interface similar to that of stdio using the functions that start + with "gz". The gzip format is different from the zlib format. gzip is a + gzip wrapper, documented in RFC 1952, wrapped around a deflate stream. + + This library can optionally read and write gzip streams in memory as well. + + The zlib format was designed to be compact and fast for use in memory + and on communications channels. The gzip format was designed for single- + file compression on file systems, has a larger header than zlib to maintain + directory information, and uses a different, slower check method than zlib. + + The library does not install any signal handler. The decoder checks + the consistency of the compressed data, so the library should never + crash even in case of corrupted input. +*/ + +typedef voidpf (*alloc_func) OF((voidpf opaque, uInt items, uInt size)); +typedef void (*free_func) OF((voidpf opaque, voidpf address)); + +struct internal_state; + +typedef struct z_stream_s { + Bytef *next_in; /* next input byte */ + uInt avail_in; /* number of bytes available at next_in */ + uLong total_in; /* total nb of input bytes read so far */ + + Bytef *next_out; /* next output byte should be put there */ + uInt avail_out; /* remaining free space at next_out */ + uLong total_out; /* total nb of bytes output so far */ + + char *msg; /* last error message, NULL if no error */ + struct internal_state FAR *state; /* not visible by applications */ + + alloc_func zalloc; /* used to allocate the internal state */ + free_func zfree; /* used to free the internal state */ + voidpf opaque; /* private data object passed to zalloc and zfree */ + + int data_type; /* best guess about the data type: binary or text */ + uLong adler; /* adler32 value of the uncompressed data */ + uLong reserved; /* reserved for future use */ +} z_stream; + +typedef z_stream FAR *z_streamp; + +/* + gzip header information passed to and from zlib routines. See RFC 1952 + for more details on the meanings of these fields. 
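+
+  As a hedged sketch (identifiers below are the caller's own, not zlib
+  names), a writer might zero the structure and set a few fields before
+  handing it to the deflate side (deflateSetHeader() in this zlib version):
+
+      gz_header head;
+      memset(&head, 0, sizeof(head));     zero everything first
+      head.text = 1;                      content believed to be text
+      head.os = 3;                        3 denotes Unix in RFC 1952
+      head.name = (Bytef *)"a.txt";       stored file name, if any
+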
+*/ +typedef struct gz_header_s { + int text; /* true if compressed data believed to be text */ + uLong time; /* modification time */ + int xflags; /* extra flags (not used when writing a gzip file) */ + int os; /* operating system */ + Bytef *extra; /* pointer to extra field or Z_NULL if none */ + uInt extra_len; /* extra field length (valid if extra != Z_NULL) */ + uInt extra_max; /* space at extra (only when reading header) */ + Bytef *name; /* pointer to zero-terminated file name or Z_NULL */ + uInt name_max; /* space at name (only when reading header) */ + Bytef *comment; /* pointer to zero-terminated comment or Z_NULL */ + uInt comm_max; /* space at comment (only when reading header) */ + int hcrc; /* true if there was or will be a header crc */ + int done; /* true when done reading gzip header (not used + when writing a gzip file) */ +} gz_header; + +typedef gz_header FAR *gz_headerp; + +/* + The application must update next_in and avail_in when avail_in has + dropped to zero. It must update next_out and avail_out when avail_out + has dropped to zero. The application must initialize zalloc, zfree and + opaque before calling the init function. All other fields are set by the + compression library and must not be updated by the application. + + The opaque value provided by the application will be passed as the first + parameter for calls of zalloc and zfree. This can be useful for custom + memory management. The compression library attaches no meaning to the + opaque value. + + zalloc must return Z_NULL if there is not enough memory for the object. + If zlib is used in a multi-threaded application, zalloc and zfree must be + thread safe. + + On 16-bit systems, the functions zalloc and zfree must be able to allocate + exactly 65536 bytes, but will not be required to allocate more than this + if the symbol MAXSEG_64K is defined (see zconf.h). WARNING: On MSDOS, + pointers returned by zalloc for objects of exactly 65536 bytes *must* + have their offset normalized to zero. The default allocation function + provided by this library ensures this (see zutil.c). To reduce memory + requirements and avoid any allocation of 64K objects, at the expense of + compression ratio, compile the library with -DMAX_WBITS=14 (see zconf.h). + + The fields total_in and total_out can be used for statistics or + progress reports. After compression, total_in holds the total size of + the uncompressed data and may be saved for use in the decompressor + (particularly if the decompressor wants to decompress everything in + a single step). +*/ + + /* constants */ + +#define Z_NO_FLUSH 0 +#define Z_PARTIAL_FLUSH 1 /* will be removed, use Z_SYNC_FLUSH instead */ +#define Z_SYNC_FLUSH 2 +#define Z_FULL_FLUSH 3 +#define Z_FINISH 4 +#define Z_BLOCK 5 +/* Allowed flush values; see deflate() and inflate() below for details */ + +#define Z_OK 0 +#define Z_STREAM_END 1 +#define Z_NEED_DICT 2 +#define Z_ERRNO (-1) +#define Z_STREAM_ERROR (-2) +#define Z_DATA_ERROR (-3) +#define Z_MEM_ERROR (-4) +#define Z_BUF_ERROR (-5) +#define Z_VERSION_ERROR (-6) +/* Return codes for the compression/decompression functions. Negative + * values are errors, positive values are used for special but normal events. 
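+ * A hedged usage sketch (handle_error is the caller's own helper, not a
+ * zlib routine): treat negative values as errors, remembering that
+ * Z_BUF_ERROR only means no progress was possible right now and is
+ * recoverable:
+ *
+ *   int ret = inflate(&strm, Z_NO_FLUSH);
+ *   if (ret < 0 && ret != Z_BUF_ERROR)
+ *       handle_error(strm.msg);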
+ */ + +#define Z_NO_COMPRESSION 0 +#define Z_BEST_SPEED 1 +#define Z_BEST_COMPRESSION 9 +#define Z_DEFAULT_COMPRESSION (-1) +/* compression levels */ + +#define Z_FILTERED 1 +#define Z_HUFFMAN_ONLY 2 +#define Z_RLE 3 +#define Z_FIXED 4 +#define Z_DEFAULT_STRATEGY 0 +/* compression strategy; see deflateInit2() below for details */ + +#define Z_BINARY 0 +#define Z_TEXT 1 +#define Z_ASCII Z_TEXT /* for compatibility with 1.2.2 and earlier */ +#define Z_UNKNOWN 2 +/* Possible values of the data_type field (though see inflate()) */ + +#define Z_DEFLATED 8 +/* The deflate compression method (the only one supported in this version) */ + +#define Z_NULL 0 /* for initializing zalloc, zfree, opaque */ + +#define zlib_version zlibVersion() +/* for compatibility with versions < 1.0.2 */ + + /* basic functions */ + +ZEXTERN const char * ZEXPORT zlibVersion OF((void)); +/* The application can compare zlibVersion and ZLIB_VERSION for consistency. + If the first character differs, the library code actually used is + not compatible with the zlib.h header file used by the application. + This check is automatically made by deflateInit and inflateInit. + */ + +/* +ZEXTERN int ZEXPORT deflateInit OF((z_streamp strm, int level)); + + Initializes the internal stream state for compression. The fields + zalloc, zfree and opaque must be initialized before by the caller. + If zalloc and zfree are set to Z_NULL, deflateInit updates them to + use default allocation functions. + + The compression level must be Z_DEFAULT_COMPRESSION, or between 0 and 9: + 1 gives best speed, 9 gives best compression, 0 gives no compression at + all (the input data is simply copied a block at a time). + Z_DEFAULT_COMPRESSION requests a default compromise between speed and + compression (currently equivalent to level 6). + + deflateInit returns Z_OK if success, Z_MEM_ERROR if there was not + enough memory, Z_STREAM_ERROR if level is not a valid compression level, + Z_VERSION_ERROR if the zlib library version (zlib_version) is incompatible + with the version assumed by the caller (ZLIB_VERSION). + msg is set to null if there is no error message. deflateInit does not + perform any compression: this will be done by deflate(). +*/ + + +ZEXTERN int ZEXPORT deflate OF((z_streamp strm, int flush)); +/* + deflate compresses as much data as possible, and stops when the input + buffer becomes empty or the output buffer becomes full. It may introduce some + output latency (reading input without producing any output) except when + forced to flush. + + The detailed semantics are as follows. deflate performs one or both of the + following actions: + + - Compress more input starting at next_in and update next_in and avail_in + accordingly. If not all input can be processed (because there is not + enough room in the output buffer), next_in and avail_in are updated and + processing will resume at this point for the next call of deflate(). + + - Provide more output starting at next_out and update next_out and avail_out + accordingly. This action is forced if the parameter flush is non zero. + Forcing flush frequently degrades the compression ratio, so this parameter + should be set only when necessary (in interactive applications). + Some output may be provided even if flush is not set. + + Before the call of deflate(), the application should ensure that at least + one of the actions is possible, by providing more input and/or consuming + more output, and updating avail_in or avail_out accordingly; avail_out + should never be zero before the call. 
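+
+  A hedged sketch of such a loop (in, out, have and done_reading are the
+  caller's own, not zlib names):
+
+      do {
+          strm.next_out = out;
+          strm.avail_out = sizeof(out);
+          (void) deflate(&strm, done_reading ? Z_FINISH : Z_NO_FLUSH);
+          have = sizeof(out) - strm.avail_out;
+          ...write the have bytes at out somewhere...
+      } while (strm.avail_out == 0);
+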
The application can consume the
+ compressed output when it wants, for example when the output buffer is full
+ (avail_out == 0), or after each call of deflate(). If deflate returns Z_OK
+ and with zero avail_out, it must be called again after making room in the
+ output buffer because there might be more output pending.
+
+ Normally the parameter flush is set to Z_NO_FLUSH, which allows deflate to
+ decide how much data to accumulate before producing output, in order to
+ maximize compression.
+
+ If the parameter flush is set to Z_SYNC_FLUSH, all pending output is
+ flushed to the output buffer and the output is aligned on a byte boundary, so
+ that the decompressor can get all input data available so far. (In particular
+ avail_in is zero after the call if enough output space has been provided
+ before the call.) Flushing may degrade compression for some compression
+ algorithms and so it should be used only when necessary.
+
+ If flush is set to Z_FULL_FLUSH, all output is flushed as with
+ Z_SYNC_FLUSH, and the compression state is reset so that decompression can
+ restart from this point if previous compressed data has been damaged or if
+ random access is desired. Using Z_FULL_FLUSH too often can seriously degrade
+ compression.
+
+ If deflate returns with avail_out == 0, this function must be called again
+ with the same value of the flush parameter and more output space (updated
+ avail_out), until the flush is complete (deflate returns with non-zero
+ avail_out). In the case of a Z_FULL_FLUSH or Z_SYNC_FLUSH, make sure that
+ avail_out is greater than six to avoid repeated flush markers due to
+ avail_out == 0 on return.
+
+ If the parameter flush is set to Z_FINISH, pending input is processed,
+ pending output is flushed and deflate returns with Z_STREAM_END if there
+ was enough output space; if deflate returns with Z_OK, this function must be
+ called again with Z_FINISH and more output space (updated avail_out) but no
+ more input data, until it returns with Z_STREAM_END or an error. After
+ deflate has returned Z_STREAM_END, the only possible operations on the
+ stream are deflateReset or deflateEnd.
+
+ Z_FINISH can be used immediately after deflateInit if all the compression
+ is to be done in a single step. In this case, avail_out must be at least
+ the value returned by deflateBound (see below). If deflate does not return
+ Z_STREAM_END, then it must be called again as described above.
+
+ deflate() sets strm->adler to the adler32 checksum of all input read
+ so far (that is, total_in bytes).
+
+ deflate() may update strm->data_type if it can make a good guess about
+ the input data type (Z_BINARY or Z_TEXT). In doubt, the data is considered
+ binary. This field is only for information purposes and does not affect
+ the compression algorithm in any manner.
+
+ deflate() returns Z_OK if some progress has been made (more input
+ processed or more output produced), Z_STREAM_END if all input has been
+ consumed and all output has been produced (only when flush is set to
+ Z_FINISH), Z_STREAM_ERROR if the stream state was inconsistent (for example
+ if next_in or next_out was NULL), Z_BUF_ERROR if no progress is possible
+ (for example avail_in or avail_out was zero). Note that Z_BUF_ERROR is not
+ fatal, and deflate() can be called again with more input and more output
+ space to continue compressing.
+*/
+
+
+ZEXTERN int ZEXPORT deflateEnd OF((z_streamp strm));
+/*
+ All dynamically allocated data structures for this stream are freed.
+ This function discards any unprocessed input and does not flush any + pending output. + + deflateEnd returns Z_OK if success, Z_STREAM_ERROR if the + stream state was inconsistent, Z_DATA_ERROR if the stream was freed + prematurely (some input or output was discarded). In the error case, + msg may be set but then points to a static string (which must not be + deallocated). +*/ + + +/* +ZEXTERN int ZEXPORT inflateInit OF((z_streamp strm)); + + Initializes the internal stream state for decompression. The fields + next_in, avail_in, zalloc, zfree and opaque must be initialized before by + the caller. If next_in is not Z_NULL and avail_in is large enough (the exact + value depends on the compression method), inflateInit determines the + compression method from the zlib header and allocates all data structures + accordingly; otherwise the allocation will be deferred to the first call of + inflate. If zalloc and zfree are set to Z_NULL, inflateInit updates them to + use default allocation functions. + + inflateInit returns Z_OK if success, Z_MEM_ERROR if there was not enough + memory, Z_VERSION_ERROR if the zlib library version is incompatible with the + version assumed by the caller. msg is set to null if there is no error + message. inflateInit does not perform any decompression apart from reading + the zlib header if present: this will be done by inflate(). (So next_in and + avail_in may be modified, but next_out and avail_out are unchanged.) +*/ + + +ZEXTERN int ZEXPORT inflate OF((z_streamp strm, int flush)); +/* + inflate decompresses as much data as possible, and stops when the input + buffer becomes empty or the output buffer becomes full. It may introduce + some output latency (reading input without producing any output) except when + forced to flush. + + The detailed semantics are as follows. inflate performs one or both of the + following actions: + + - Decompress more input starting at next_in and update next_in and avail_in + accordingly. If not all input can be processed (because there is not + enough room in the output buffer), next_in is updated and processing + will resume at this point for the next call of inflate(). + + - Provide more output starting at next_out and update next_out and avail_out + accordingly. inflate() provides as much output as possible, until there + is no more input data or no more space in the output buffer (see below + about the flush parameter). + + Before the call of inflate(), the application should ensure that at least + one of the actions is possible, by providing more input and/or consuming + more output, and updating the next_* and avail_* values accordingly. + The application can consume the uncompressed output when it wants, for + example when the output buffer is full (avail_out == 0), or after each + call of inflate(). If inflate returns Z_OK and with zero avail_out, it + must be called again after making room in the output buffer because there + might be more output pending. + + The flush parameter of inflate() can be Z_NO_FLUSH, Z_SYNC_FLUSH, + Z_FINISH, or Z_BLOCK. Z_SYNC_FLUSH requests that inflate() flush as much + output as possible to the output buffer. Z_BLOCK requests that inflate() stop + if and when it gets to the next deflate block boundary. When decoding the + zlib or gzip format, this will cause inflate() to return immediately after + the header and before the first block. 
When doing a raw inflate, inflate() + will go ahead and process the first block, and will return when it gets to + the end of that block, or when it runs out of data. + + The Z_BLOCK option assists in appending to or combining deflate streams. + Also to assist in this, on return inflate() will set strm->data_type to the + number of unused bits in the last byte taken from strm->next_in, plus 64 + if inflate() is currently decoding the last block in the deflate stream, + plus 128 if inflate() returned immediately after decoding an end-of-block + code or decoding the complete header up to just before the first byte of the + deflate stream. The end-of-block will not be indicated until all of the + uncompressed data from that block has been written to strm->next_out. The + number of unused bits may in general be greater than seven, except when + bit 7 of data_type is set, in which case the number of unused bits will be + less than eight. + + inflate() should normally be called until it returns Z_STREAM_END or an + error. However if all decompression is to be performed in a single step + (a single call of inflate), the parameter flush should be set to + Z_FINISH. In this case all pending input is processed and all pending + output is flushed; avail_out must be large enough to hold all the + uncompressed data. (The size of the uncompressed data may have been saved + by the compressor for this purpose.) The next operation on this stream must + be inflateEnd to deallocate the decompression state. The use of Z_FINISH + is never required, but can be used to inform inflate that a faster approach + may be used for the single inflate() call. + + In this implementation, inflate() always flushes as much output as + possible to the output buffer, and always uses the faster approach on the + first call. So the only effect of the flush parameter in this implementation + is on the return value of inflate(), as noted below, or when it returns early + because Z_BLOCK is used. + + If a preset dictionary is needed after this call (see inflateSetDictionary + below), inflate sets strm->adler to the adler32 checksum of the dictionary + chosen by the compressor and returns Z_NEED_DICT; otherwise it sets + strm->adler to the adler32 checksum of all output produced so far (that is, + total_out bytes) and returns Z_OK, Z_STREAM_END or an error code as described + below. At the end of the stream, inflate() checks that its computed adler32 + checksum is equal to that saved by the compressor and returns Z_STREAM_END + only if the checksum is correct. + + inflate() will decompress and check either zlib-wrapped or gzip-wrapped + deflate data. The header type is detected automatically. Any information + contained in the gzip header is not retained, so applications that need that + information should instead use raw inflate, see inflateInit2() below, or + inflateBack() and perform their own processing of the gzip header and + trailer. 
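+
+ As an informal sketch of the calling convention described above (inbuf,
+ outbuf and the write_out() helper are illustrative, not part of zlib):
+
+     strm.next_in = inbuf;
+     strm.avail_in = inlen;
+     do {
+         strm.next_out = outbuf;
+         strm.avail_out = sizeof(outbuf);
+         ret = inflate(&strm, Z_NO_FLUSH);
+         write_out(outbuf, sizeof(outbuf) - strm.avail_out);
+     } while (ret == Z_OK);
+     if (ret != Z_STREAM_END)
+         handle_error(strm.msg);     (hypothetical; see return codes below)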
+ + inflate() returns Z_OK if some progress has been made (more input processed + or more output produced), Z_STREAM_END if the end of the compressed data has + been reached and all uncompressed output has been produced, Z_NEED_DICT if a + preset dictionary is needed at this point, Z_DATA_ERROR if the input data was + corrupted (input stream not conforming to the zlib format or incorrect check + value), Z_STREAM_ERROR if the stream structure was inconsistent (for example + if next_in or next_out was NULL), Z_MEM_ERROR if there was not enough memory, + Z_BUF_ERROR if no progress is possible or if there was not enough room in the + output buffer when Z_FINISH is used. Note that Z_BUF_ERROR is not fatal, and + inflate() can be called again with more input and more output space to + continue decompressing. If Z_DATA_ERROR is returned, the application may then + call inflateSync() to look for a good compression block if a partial recovery + of the data is desired. +*/ + + +ZEXTERN int ZEXPORT inflateEnd OF((z_streamp strm)); +/* + All dynamically allocated data structures for this stream are freed. + This function discards any unprocessed input and does not flush any + pending output. + + inflateEnd returns Z_OK if success, Z_STREAM_ERROR if the stream state + was inconsistent. In the error case, msg may be set but then points to a + static string (which must not be deallocated). +*/ + + /* Advanced functions */ + +/* + The following functions are needed only in some special applications. +*/ + +/* +ZEXTERN int ZEXPORT deflateInit2 OF((z_streamp strm, + int level, + int method, + int windowBits, + int memLevel, + int strategy)); + + This is another version of deflateInit with more compression options. The + fields next_in, zalloc, zfree and opaque must be initialized before by + the caller. + + The method parameter is the compression method. It must be Z_DEFLATED in + this version of the library. + + The windowBits parameter is the base two logarithm of the window size + (the size of the history buffer). It should be in the range 8..15 for this + version of the library. Larger values of this parameter result in better + compression at the expense of memory usage. The default value is 15 if + deflateInit is used instead. + + windowBits can also be -8..-15 for raw deflate. In this case, -windowBits + determines the window size. deflate() will then generate raw deflate data + with no zlib header or trailer, and will not compute an adler32 check value. + + windowBits can also be greater than 15 for optional gzip encoding. Add + 16 to windowBits to write a simple gzip header and trailer around the + compressed data instead of a zlib wrapper. The gzip header will have no + file name, no extra data, no comment, no modification time (set to zero), + no header crc, and the operating system will be set to 255 (unknown). If a + gzip stream is being written, strm->adler is a crc32 instead of an adler32. + + The memLevel parameter specifies how much memory should be allocated + for the internal compression state. memLevel=1 uses minimum memory but + is slow and reduces compression ratio; memLevel=9 uses maximum memory + for optimal speed. The default value is 8. See zconf.h for total memory + usage as a function of windowBits and memLevel. + + The strategy parameter is used to tune the compression algorithm. 
Use the
+ value Z_DEFAULT_STRATEGY for normal data, Z_FILTERED for data produced by a
+ filter (or predictor), Z_HUFFMAN_ONLY to force Huffman encoding only (no
+ string match), or Z_RLE to limit match distances to one (run-length
+ encoding). Filtered data consists mostly of small values with a somewhat
+ random distribution. In this case, the compression algorithm is tuned to
+ compress them better. The effect of Z_FILTERED is to force more Huffman
+ coding and less string matching; it is somewhat intermediate between
+ Z_DEFAULT_STRATEGY and Z_HUFFMAN_ONLY. Z_RLE is designed to be almost as
+ fast as Z_HUFFMAN_ONLY, but give better compression for PNG image data. The
+ strategy parameter only affects the compression ratio but not the
+ correctness of the compressed output even if it is not set appropriately.
+ Z_FIXED prevents the use of dynamic Huffman codes, allowing for a simpler
+ decoder for special applications.
+
+ deflateInit2 returns Z_OK if success, Z_MEM_ERROR if there was not enough
+ memory, Z_STREAM_ERROR if a parameter is invalid (such as an invalid
+ method). msg is set to null if there is no error message. deflateInit2 does
+ not perform any compression: this will be done by deflate().
+*/
+
+ZEXTERN int ZEXPORT deflateSetDictionary OF((z_streamp strm,
+ const Bytef *dictionary,
+ uInt dictLength));
+/*
+ Initializes the compression dictionary from the given byte sequence
+ without producing any compressed output. This function must be called
+ immediately after deflateInit, deflateInit2 or deflateReset, before any
+ call of deflate. The compressor and decompressor must use exactly the same
+ dictionary (see inflateSetDictionary).
+
+ The dictionary should consist of strings (byte sequences) that are likely
+ to be encountered later in the data to be compressed, with the most commonly
+ used strings preferably put towards the end of the dictionary. Using a
+ dictionary is most useful when the data to be compressed is short and can be
+ predicted with good accuracy; the data can then be compressed better than
+ with the default empty dictionary.
+
+ Depending on the size of the compression data structures selected by
+ deflateInit or deflateInit2, a part of the dictionary may in effect be
+ discarded, for example if the dictionary is larger than the window size
+ provided in deflateInit or deflateInit2. Thus the strings most likely to be
+ useful should be put at the end of the dictionary, not at the front. In
+ addition, the current implementation of deflate will use at most the window
+ size minus 262 bytes of the provided dictionary.
+
+ Upon return of this function, strm->adler is set to the adler32 value
+ of the dictionary; the decompressor may later use this value to determine
+ which dictionary has been used by the compressor. (The adler32 value
+ applies to the whole dictionary even if only a subset of the dictionary is
+ actually used by the compressor.) If a raw deflate was requested, then the
+ adler32 value is not computed and strm->adler is not set.
+
+ deflateSetDictionary returns Z_OK if success, or Z_STREAM_ERROR if a
+ parameter is invalid (such as NULL dictionary) or the stream state is
+ inconsistent (for example if deflate has already been called for this stream
+ or if the compression method is bsort). deflateSetDictionary does not
+ perform any compression: this will be done by deflate().
+*/
+
+ZEXTERN int ZEXPORT deflateCopy OF((z_streamp dest,
+ z_streamp source));
+/*
+ Sets the destination stream as a complete copy of the source stream.
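+
+ For instance (an informal sketch; the names are illustrative only):
+
+     z_stream snapshot;
+     if (deflateCopy(&snapshot, &strm) != Z_OK)
+         handle_error(strm.msg);     (hypothetical error path)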
+ + This function can be useful when several compression strategies will be + tried, for example when there are several ways of pre-processing the input + data with a filter. The streams that will be discarded should then be freed + by calling deflateEnd. Note that deflateCopy duplicates the internal + compression state which can be quite large, so this strategy is slow and + can consume lots of memory. + + deflateCopy returns Z_OK if success, Z_MEM_ERROR if there was not + enough memory, Z_STREAM_ERROR if the source stream state was inconsistent + (such as zalloc being NULL). msg is left unchanged in both source and + destination. +*/ + +ZEXTERN int ZEXPORT deflateReset OF((z_streamp strm)); +/* + This function is equivalent to deflateEnd followed by deflateInit, + but does not free and reallocate all the internal compression state. + The stream will keep the same compression level and any other attributes + that may have been set by deflateInit2. + + deflateReset returns Z_OK if success, or Z_STREAM_ERROR if the source + stream state was inconsistent (such as zalloc or state being NULL). +*/ + +ZEXTERN int ZEXPORT deflateParams OF((z_streamp strm, + int level, + int strategy)); +/* + Dynamically update the compression level and compression strategy. The + interpretation of level and strategy is as in deflateInit2. This can be + used to switch between compression and straight copy of the input data, or + to switch to a different kind of input data requiring a different + strategy. If the compression level is changed, the input available so far + is compressed with the old level (and may be flushed); the new level will + take effect only at the next call of deflate(). + + Before the call of deflateParams, the stream state must be set as for + a call of deflate(), since the currently available input may have to + be compressed and flushed. In particular, strm->avail_out must be non-zero. + + deflateParams returns Z_OK if success, Z_STREAM_ERROR if the source + stream state was inconsistent or if a parameter was invalid, Z_BUF_ERROR + if strm->avail_out was zero. +*/ + +ZEXTERN int ZEXPORT deflateTune OF((z_streamp strm, + int good_length, + int max_lazy, + int nice_length, + int max_chain)); +/* + Fine tune deflate's internal compression parameters. This should only be + used by someone who understands the algorithm used by zlib's deflate for + searching for the best matching string, and even then only by the most + fanatic optimizer trying to squeeze out the last compressed bit for their + specific input data. Read the deflate.c source code for the meaning of the + max_lazy, good_length, nice_length, and max_chain parameters. + + deflateTune() can be called after deflateInit() or deflateInit2(), and + returns Z_OK on success, or Z_STREAM_ERROR for an invalid deflate stream. + */ + +ZEXTERN uLong ZEXPORT deflateBound OF((z_streamp strm, + uLong sourceLen)); +/* + deflateBound() returns an upper bound on the compressed size after + deflation of sourceLen bytes. It must be called after deflateInit() + or deflateInit2(). This would be used to allocate an output buffer + for deflation in a single pass, and so would be called before deflate(). +*/ + +ZEXTERN int ZEXPORT deflatePrime OF((z_streamp strm, + int bits, + int value)); +/* + deflatePrime() inserts bits in the deflate output stream. The intent + is that this function is used to start off the deflate output with the + bits leftover from a previous deflate stream when appending to it. 
As such,
+ this function can only be used for raw deflate, and must be used before the
+ first deflate() call after a deflateInit2() or deflateReset(). bits must be
+ less than or equal to 16, and that many of the least significant bits of
+ value will be inserted in the output.
+
+ deflatePrime returns Z_OK if success, or Z_STREAM_ERROR if the source
+ stream state was inconsistent.
+*/
+
+ZEXTERN int ZEXPORT deflateSetHeader OF((z_streamp strm,
+ gz_headerp head));
+/*
+ deflateSetHeader() provides gzip header information for when a gzip
+ stream is requested by deflateInit2(). deflateSetHeader() may be called
+ after deflateInit2() or deflateReset() and before the first call of
+ deflate(). The text, time, os, extra field, name, and comment information
+ in the provided gz_header structure are written to the gzip header (xflag is
+ ignored -- the extra flags are set according to the compression level). The
+ caller must assure that, if not Z_NULL, name and comment are terminated with
+ a zero byte, and that if extra is not Z_NULL, that extra_len bytes are
+ available there. If hcrc is true, a gzip header crc is included. Note that
+ current versions of the command-line gzip (up through version 1.3.x) do not
+ support header crc's, and will report that it is a "multi-part gzip file"
+ and give up.
+
+ If deflateSetHeader is not used, the default gzip header has text false,
+ the time set to zero, and os set to 255, with no extra, name, or comment
+ fields. The gzip header is returned to the default state by deflateReset().
+
+ deflateSetHeader returns Z_OK if success, or Z_STREAM_ERROR if the source
+ stream state was inconsistent.
+*/
+
+/*
+ZEXTERN int ZEXPORT inflateInit2 OF((z_streamp strm,
+ int windowBits));
+
+ This is another version of inflateInit with an extra parameter. The
+ fields next_in, avail_in, zalloc, zfree and opaque must be initialized
+ before by the caller.
+
+ The windowBits parameter is the base two logarithm of the maximum window
+ size (the size of the history buffer). It should be in the range 8..15 for
+ this version of the library. The default value is 15 if inflateInit is used
+ instead. windowBits must be greater than or equal to the windowBits value
+ provided to deflateInit2() while compressing, or it must be equal to 15 if
+ deflateInit2() was not used. If a compressed stream with a larger window
+ size is given as input, inflate() will return with the error code
+ Z_DATA_ERROR instead of trying to allocate a larger window.
+
+ windowBits can also be -8..-15 for raw inflate. In this case, -windowBits
+ determines the window size. inflate() will then process raw deflate data,
+ not looking for a zlib or gzip header, not generating a check value, and not
+ looking for any check values for comparison at the end of the stream. This
+ is for use with other formats that use the deflate compressed data format
+ such as zip. Those formats provide their own check values. If a custom
+ format is developed using the raw deflate format for compressed data, it is
+ recommended that a check value such as an adler32 or a crc32 be applied to
+ the uncompressed data as is done in the zlib, gzip, and zip formats. For
+ most applications, the zlib format should be used as is. Note that the
+ comments above on the use in deflateInit2() apply to the magnitude of
+ windowBits.
+
+ windowBits can also be greater than 15 for optional gzip decoding. 
Add
+ 32 to windowBits to enable zlib and gzip decoding with automatic header
+ detection, or add 16 to decode only the gzip format (the zlib format will
+ return a Z_DATA_ERROR). If a gzip stream is being decoded, strm->adler is
+ a crc32 instead of an adler32.
+
+ inflateInit2 returns Z_OK if success, Z_MEM_ERROR if there was not enough
+ memory, Z_STREAM_ERROR if a parameter is invalid (such as a null strm). msg
+ is set to null if there is no error message. inflateInit2 does not perform
+ any decompression apart from reading the zlib header if present: this will
+ be done by inflate(). (So next_in and avail_in may be modified, but next_out
+ and avail_out are unchanged.)
+*/
+
+ZEXTERN int ZEXPORT inflateSetDictionary OF((z_streamp strm,
+ const Bytef *dictionary,
+ uInt dictLength));
+/*
+ Initializes the decompression dictionary from the given uncompressed byte
+ sequence. This function must be called immediately after a call of inflate,
+ if that call returned Z_NEED_DICT. The dictionary chosen by the compressor
+ can be determined from the adler32 value returned by that call of inflate.
+ The compressor and decompressor must use exactly the same dictionary (see
+ deflateSetDictionary). For raw inflate, this function can be called
+ immediately after inflateInit2() or inflateReset() and before any call of
+ inflate() to set the dictionary. The application must ensure that the
+ dictionary that was used for compression is provided.
+
+ inflateSetDictionary returns Z_OK if success, Z_STREAM_ERROR if a
+ parameter is invalid (such as NULL dictionary) or the stream state is
+ inconsistent, Z_DATA_ERROR if the given dictionary doesn't match the
+ expected one (incorrect adler32 value). inflateSetDictionary does not
+ perform any decompression: this will be done by subsequent calls of
+ inflate().
+*/
+
+ZEXTERN int ZEXPORT inflateSync OF((z_streamp strm));
+/*
+ Skips invalid compressed data until a full flush point (see above the
+ description of deflate with Z_FULL_FLUSH) can be found, or until all
+ available input is skipped. No output is provided.
+
+ inflateSync returns Z_OK if a full flush point has been found, Z_BUF_ERROR
+ if no more input was provided, Z_DATA_ERROR if no flush point has been found,
+ or Z_STREAM_ERROR if the stream structure was inconsistent. In the success
+ case, the application may save the current value of total_in which
+ indicates where valid compressed data was found. In the error case, the
+ application may repeatedly call inflateSync, providing more input each time,
+ until success or end of the input data.
+*/
+
+ZEXTERN int ZEXPORT inflateCopy OF((z_streamp dest,
+ z_streamp source));
+/*
+ Sets the destination stream as a complete copy of the source stream.
+
+ This function can be useful when randomly accessing a large stream. The
+ first pass through the stream can periodically record the inflate state,
+ allowing restarting inflate at those points when randomly accessing the
+ stream.
+
+ inflateCopy returns Z_OK if success, Z_MEM_ERROR if there was not
+ enough memory, Z_STREAM_ERROR if the source stream state was inconsistent
+ (such as zalloc being NULL). msg is left unchanged in both source and
+ destination.
+*/
+
+ZEXTERN int ZEXPORT inflateReset OF((z_streamp strm));
+/*
+ This function is equivalent to inflateEnd followed by inflateInit,
+ but does not free and reallocate all the internal decompression state.
+ The stream will keep attributes that may have been set by inflateInit2.
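+
+ An informal example of one common use (not upstream text): after inflate()
+ reports Z_STREAM_END for one gzip member of a multi-member file, the same
+ stream may be recycled for the next member:
+
+     if (inflate(&strm, Z_NO_FLUSH) == Z_STREAM_END)
+         (void) inflateReset(&strm);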
+ + inflateReset returns Z_OK if success, or Z_STREAM_ERROR if the source + stream state was inconsistent (such as zalloc or state being NULL). +*/ + +ZEXTERN int ZEXPORT inflatePrime OF((z_streamp strm, + int bits, + int value)); +/* + This function inserts bits in the inflate input stream. The intent is + that this function is used to start inflating at a bit position in the + middle of a byte. The provided bits will be used before any bytes are used + from next_in. This function should only be used with raw inflate, and + should be used before the first inflate() call after inflateInit2() or + inflateReset(). bits must be less than or equal to 16, and that many of the + least significant bits of value will be inserted in the input. + + inflatePrime returns Z_OK if success, or Z_STREAM_ERROR if the source + stream state was inconsistent. +*/ + +ZEXTERN int ZEXPORT inflateGetHeader OF((z_streamp strm, + gz_headerp head)); +/* + inflateGetHeader() requests that gzip header information be stored in the + provided gz_header structure. inflateGetHeader() may be called after + inflateInit2() or inflateReset(), and before the first call of inflate(). + As inflate() processes the gzip stream, head->done is zero until the header + is completed, at which time head->done is set to one. If a zlib stream is + being decoded, then head->done is set to -1 to indicate that there will be + no gzip header information forthcoming. Note that Z_BLOCK can be used to + force inflate() to return immediately after header processing is complete + and before any actual data is decompressed. + + The text, time, xflags, and os fields are filled in with the gzip header + contents. hcrc is set to true if there is a header CRC. (The header CRC + was valid if done is set to one.) If extra is not Z_NULL, then extra_max + contains the maximum number of bytes to write to extra. Once done is true, + extra_len contains the actual extra field length, and extra contains the + extra field, or that field truncated if extra_max is less than extra_len. + If name is not Z_NULL, then up to name_max characters are written there, + terminated with a zero unless the length is greater than name_max. If + comment is not Z_NULL, then up to comm_max characters are written there, + terminated with a zero unless the length is greater than comm_max. When + any of extra, name, or comment are not Z_NULL and the respective field is + not present in the header, then that field is set to Z_NULL to signal its + absence. This allows the use of deflateSetHeader() with the returned + structure to duplicate the header. However if those fields are set to + allocated memory, then the application will need to save those pointers + elsewhere so that they can be eventually freed. + + If inflateGetHeader is not used, then the header information is simply + discarded. The header is always checked for validity, including the header + CRC if present. inflateReset() will reset the process to discard the header + information. The application would need to call inflateGetHeader() again to + retrieve the header from the next gzip stream. + + inflateGetHeader returns Z_OK if success, or Z_STREAM_ERROR if the source + stream state was inconsistent. +*/ + +/* +ZEXTERN int ZEXPORT inflateBackInit OF((z_streamp strm, int windowBits, + unsigned char FAR *window)); + + Initialize the internal stream state for decompression using inflateBack() + calls. The fields zalloc, zfree and opaque in strm must be initialized + before the call. 
If zalloc and zfree are Z_NULL, then the default library-
+ derived memory allocation routines are used. windowBits is the base two
+ logarithm of the window size, in the range 8..15. window is a caller-
+ supplied buffer of that size. Except for special applications where it is
+ assured that deflate was used with small window sizes, windowBits must be 15
+ and a 32K byte window must be supplied to be able to decompress general
+ deflate streams.
+
+ See inflateBack() for the usage of these routines.
+
+ inflateBackInit will return Z_OK on success, Z_STREAM_ERROR if any of
+ the parameters are invalid, Z_MEM_ERROR if the internal state could not
+ be allocated, or Z_VERSION_ERROR if the version of the library does not
+ match the version of the header file.
+*/
+
+typedef unsigned (*in_func) OF((void FAR *, unsigned char FAR * FAR *));
+typedef int (*out_func) OF((void FAR *, unsigned char FAR *, unsigned));
+
+ZEXTERN int ZEXPORT inflateBack OF((z_streamp strm,
+ in_func in, void FAR *in_desc,
+ out_func out, void FAR *out_desc));
+/*
+ inflateBack() does a raw inflate with a single call using a call-back
+ interface for input and output. This is more efficient than inflate() for
+ file i/o applications in that it avoids copying between the output and the
+ sliding window by simply making the window itself the output buffer. This
+ function trusts the application to not change the output buffer passed by
+ the output function, at least until inflateBack() returns.
+
+ inflateBackInit() must be called first to allocate the internal state
+ and to initialize the state with the user-provided window buffer.
+ inflateBack() may then be used multiple times to inflate a complete, raw
+ deflate stream with each call. inflateBackEnd() is then called to free
+ the allocated state.
+
+ A raw deflate stream is one with no zlib or gzip header or trailer.
+ This routine would normally be used in a utility that reads zip or gzip
+ files and writes out uncompressed files. The utility would decode the
+ header and process the trailer on its own, hence this routine expects
+ only the raw deflate stream to decompress. This is different from the
+ normal behavior of inflate(), which expects either a zlib or gzip header and
+ trailer around the deflate stream.
+
+ inflateBack() uses two subroutines supplied by the caller that are then
+ called by inflateBack() for input and output. inflateBack() calls those
+ routines until it reads a complete deflate stream and writes out all of the
+ uncompressed data, or until it encounters an error. The function's
+ parameters and return types are defined above in the in_func and out_func
+ typedefs. inflateBack() will call in(in_desc, &buf) which should return the
+ number of bytes of provided input, and a pointer to that input in buf. If
+ there is no input available, in() must return zero--buf is ignored in that
+ case--and inflateBack() will return a buffer error. inflateBack() will call
+ out(out_desc, buf, len) to write the uncompressed data buf[0..len-1]. out()
+ should return zero on success, or non-zero on failure. If out() returns
+ non-zero, inflateBack() will return with an error. Neither in() nor out()
+ are permitted to change the contents of the window provided to
+ inflateBackInit(), which is also the buffer that out() uses to write from.
+ The length written by out() will be at most the window size. Any non-zero
+ amount of input may be provided by in().
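+
+ An informal sketch of matching callbacks (my_in, my_out and the desc
+ argument are illustrative only, not part of zlib):
+
+     unsigned my_in(void *desc, unsigned char **buf)
+     {
+         *buf = fill_buffer(desc);       (hypothetical reader)
+         return bytes_available(desc);   (zero means no more input)
+     }
+
+     int my_out(void *desc, unsigned char *buf, unsigned len)
+     {
+         return write_all(desc, buf, len) ? 0 : 1;   (zero on success)
+     }
+
+     ret = inflateBack(&strm, my_in, &desc, my_out, &desc);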
+
+ For convenience, inflateBack() can be provided input on the first call by
+ setting strm->next_in and strm->avail_in. If that input is exhausted, then
+ in() will be called. Therefore strm->next_in must be initialized before
+ calling inflateBack(). If strm->next_in is Z_NULL, then in() will be called
+ immediately for input. If strm->next_in is not Z_NULL, then strm->avail_in
+ must also be initialized, and then if strm->avail_in is not zero, input will
+ initially be taken from strm->next_in[0 .. strm->avail_in - 1].
+
+ The in_desc and out_desc parameters of inflateBack() are passed as the
+ first parameter of in() and out() respectively when they are called. These
+ descriptors can be optionally used to pass any information that the caller-
+ supplied in() and out() functions need to do their job.
+
+ On return, inflateBack() will set strm->next_in and strm->avail_in to
+ pass back any unused input that was provided by the last in() call. The
+ return values of inflateBack() can be Z_STREAM_END on success, Z_BUF_ERROR
+ if in() or out() returned an error, Z_DATA_ERROR if there was a format
+ error in the deflate stream (in which case strm->msg is set to indicate the
+ nature of the error), or Z_STREAM_ERROR if the stream was not properly
+ initialized. In the case of Z_BUF_ERROR, an input or output error can be
+ distinguished using strm->next_in which will be Z_NULL only if in() returned
+ an error. If strm->next_in is not Z_NULL, then the Z_BUF_ERROR was due to
+ out() returning non-zero. (in() will always be called before out(), so
+ strm->next_in is assured to be defined if out() returns non-zero.) Note
+ that inflateBack() cannot return Z_OK.
+*/
+
+ZEXTERN int ZEXPORT inflateBackEnd OF((z_streamp strm));
+/*
+ All memory allocated by inflateBackInit() is freed.
+
+ inflateBackEnd() returns Z_OK on success, or Z_STREAM_ERROR if the stream
+ state was inconsistent.
+*/
+
+ZEXTERN uLong ZEXPORT zlibCompileFlags OF((void));
+/* Return flags indicating compile-time options.
+
+ Type sizes, two bits each, 00 = 16 bits, 01 = 32, 10 = 64, 11 = other:
+ 1.0: size of uInt
+ 3.2: size of uLong
+ 5.4: size of voidpf (pointer)
+ 7.6: size of z_off_t
+
+ Compiler, assembler, and debug options:
+ 8: DEBUG
+ 9: ASMV or ASMINF -- use ASM code
+ 10: ZLIB_WINAPI -- exported functions use the WINAPI calling convention
+ 11: 0 (reserved)
+
+ One-time table building (smaller code, but not thread-safe if true):
+ 12: BUILDFIXED -- build static block decoding tables when needed
+ 13: DYNAMIC_CRC_TABLE -- build CRC calculation tables when needed
+ 14,15: 0 (reserved)
+
+ Library content (indicates missing functionality):
+ 16: NO_GZCOMPRESS -- gz* functions cannot compress (to avoid linking
+ deflate code when not needed)
+ 17: NO_GZIP -- deflate can't write gzip streams, and inflate can't detect
+ and decode gzip streams (to avoid linking crc code)
+ 18-19: 0 (reserved)
+
+ Operation variations (changes in library functionality):
+ 20: PKZIP_BUG_WORKAROUND -- slightly more permissive inflate
+ 21: FASTEST -- deflate algorithm with only one, lowest compression level
+ 22,23: 0 (reserved)
+
+ The sprintf variant used by gzprintf (zero is best):
+ 24: 0 = vs*, 1 = s* -- 1 means limited to 20 arguments after the format
+ 25: 0 = *nprintf, 1 = *printf -- 1 means gzprintf() not secure!
+ 26: 0 = returns value, 1 = void -- 1 means inferred string length returned
+
+ Remainder:
+ 27-31: 0 (reserved)
+ */
+
+
+ /* utility functions */
+
+/*
+ The following utility functions are implemented on top of the
+ basic stream-oriented functions. To simplify the interface, some
+ default options are assumed (compression level and memory usage,
+ standard memory allocation functions). The source code of these
+ utility functions can easily be modified if you need special options.
+*/
+
+ZEXTERN int ZEXPORT compress OF((Bytef *dest, uLongf *destLen,
+ const Bytef *source, uLong sourceLen));
+/*
+ Compresses the source buffer into the destination buffer. sourceLen is
+ the byte length of the source buffer. Upon entry, destLen is the total
+ size of the destination buffer, which must be at least the value returned
+ by compressBound(sourceLen). Upon exit, destLen is the actual size of the
+ compressed buffer.
+ This function can be used to compress a whole file at once if the
+ input file is mmap'ed.
+ compress returns Z_OK if success, Z_MEM_ERROR if there was not
+ enough memory, Z_BUF_ERROR if there was not enough room in the output
+ buffer.
+*/
+
+ZEXTERN int ZEXPORT compress2 OF((Bytef *dest, uLongf *destLen,
+ const Bytef *source, uLong sourceLen,
+ int level));
+/*
+ Compresses the source buffer into the destination buffer. The level
+ parameter has the same meaning as in deflateInit. sourceLen is the byte
+ length of the source buffer. Upon entry, destLen is the total size of the
+ destination buffer, which must be at least the value returned by
+ compressBound(sourceLen). Upon exit, destLen is the actual size of the
+ compressed buffer.
+
+ compress2 returns Z_OK if success, Z_MEM_ERROR if there was not enough
+ memory, Z_BUF_ERROR if there was not enough room in the output buffer,
+ Z_STREAM_ERROR if the level parameter is invalid.
+*/
+
+ZEXTERN uLong ZEXPORT compressBound OF((uLong sourceLen));
+/*
+ compressBound() returns an upper bound on the compressed size after
+ compress() or compress2() on sourceLen bytes. It would be used before
+ a compress() or compress2() call to allocate the destination buffer.
+*/
+
+ZEXTERN int ZEXPORT uncompress OF((Bytef *dest, uLongf *destLen,
+ const Bytef *source, uLong sourceLen));
+/*
+ Decompresses the source buffer into the destination buffer. sourceLen is
+ the byte length of the source buffer. Upon entry, destLen is the total
+ size of the destination buffer, which must be large enough to hold the
+ entire uncompressed data. (The size of the uncompressed data must have
+ been saved previously by the compressor and transmitted to the decompressor
+ by some mechanism outside the scope of this compression library.)
+ Upon exit, destLen is the actual size of the uncompressed buffer.
+ This function can be used to decompress a whole file at once if the
+ input file is mmap'ed.
+
+ uncompress returns Z_OK if success, Z_MEM_ERROR if there was not
+ enough memory, Z_BUF_ERROR if there was not enough room in the output
+ buffer, or Z_DATA_ERROR if the input data was corrupted or incomplete.
+*/
+
+
+typedef voidp gzFile;
+
+ZEXTERN gzFile ZEXPORT gzopen OF((const char *path, const char *mode));
+/*
+ Opens a gzip (.gz) file for reading or writing. The mode parameter
+ is as in fopen ("rb" or "wb") but can also include a compression level
+ ("wb9") or a strategy: 'f' for filtered data as in "wb6f", 'h' for
+ Huffman only compression as in "wb1h", or 'R' for run-length encoding
+ as in "wb1R". 
(See the description of deflateInit2 for more information
+ about the strategy parameter.)
+
+ gzopen can be used to read a file which is not in gzip format; in this
+ case gzread will directly read from the file without decompression.
+
+ gzopen returns NULL if the file could not be opened or if there was
+ insufficient memory to allocate the (de)compression state; errno
+ can be checked to distinguish the two cases (if errno is zero, the
+ zlib error is Z_MEM_ERROR). */
+
+ZEXTERN gzFile ZEXPORT gzdopen OF((int fd, const char *mode));
+/*
+ gzdopen() associates a gzFile with the file descriptor fd. File
+ descriptors are obtained from calls like open, dup, creat, pipe or
+ fileno (if the file has previously been opened with fopen).
+ The mode parameter is as in gzopen.
+ The next call of gzclose on the returned gzFile will also close the
+ file descriptor fd, just like fclose(fdopen(fd, mode)) closes the file
+ descriptor fd. If you want to keep fd open, use gzdopen(dup(fd), mode).
+ gzdopen returns NULL if there was insufficient memory to allocate
+ the (de)compression state.
+*/
+
+ZEXTERN int ZEXPORT gzsetparams OF((gzFile file, int level, int strategy));
+/*
+ Dynamically update the compression level or strategy. See the description
+ of deflateInit2 for the meaning of these parameters.
+ gzsetparams returns Z_OK if success, or Z_STREAM_ERROR if the file was not
+ opened for writing.
+*/
+
+ZEXTERN int ZEXPORT gzread OF((gzFile file, voidp buf, unsigned len));
+/*
+ Reads the given number of uncompressed bytes from the compressed file.
+ If the input file was not in gzip format, gzread copies the given number
+ of bytes into the buffer.
+ gzread returns the number of uncompressed bytes actually read (0 for
+ end of file, -1 for error). */
+
+ZEXTERN int ZEXPORT gzwrite OF((gzFile file,
+ voidpc buf, unsigned len));
+/*
+ Writes the given number of uncompressed bytes into the compressed file.
+ gzwrite returns the number of uncompressed bytes actually written
+ (0 in case of error).
+*/
+
+ZEXTERN int ZEXPORTVA gzprintf OF((gzFile file, const char *format, ...));
+/*
+ Converts, formats, and writes the args to the compressed file under
+ control of the format string, as in fprintf. gzprintf returns the number of
+ uncompressed bytes actually written (0 in case of error). The number of
+ uncompressed bytes written is limited to 4095. The caller should assure that
+ this limit is not exceeded. If it is exceeded, then gzprintf() will return
+ an error (0) with nothing written. In this case, there may also be a
+ buffer overflow with unpredictable consequences, which is possible only if
+ zlib was compiled with the insecure functions sprintf() or vsprintf()
+ because the secure snprintf() or vsnprintf() functions were not available.
+*/
+
+ZEXTERN int ZEXPORT gzputs OF((gzFile file, const char *s));
+/*
+ Writes the given null-terminated string to the compressed file, excluding
+ the terminating null character.
+ gzputs returns the number of characters written, or -1 in case of error.
+*/
+
+ZEXTERN char * ZEXPORT gzgets OF((gzFile file, char *buf, int len));
+/*
+ Reads bytes from the compressed file until len-1 characters are read, or
+ a newline character is read and transferred to buf, or an end-of-file
+ condition is encountered. The string is then terminated with a null
+ character.
+ gzgets returns buf, or Z_NULL in case of error.
+*/
+
+ZEXTERN int ZEXPORT gzputc OF((gzFile file, int c));
+/*
+ Writes c, converted to an unsigned char, into the compressed file.
+ gzputc returns the value that was written, or -1 in case of error. +*/ + +ZEXTERN int ZEXPORT gzgetc OF((gzFile file)); +/* + Reads one byte from the compressed file. gzgetc returns this byte + or -1 in case of end of file or error. +*/ + +ZEXTERN int ZEXPORT gzungetc OF((int c, gzFile file)); +/* + Push one character back onto the stream to be read again later. + Only one character of push-back is allowed. gzungetc() returns the + character pushed, or -1 on failure. gzungetc() will fail if a + character has been pushed but not read yet, or if c is -1. The pushed + character will be discarded if the stream is repositioned with gzseek() + or gzrewind(). +*/ + +ZEXTERN int ZEXPORT gzflush OF((gzFile file, int flush)); +/* + Flushes all pending output into the compressed file. The parameter + flush is as in the deflate() function. The return value is the zlib + error number (see function gzerror below). gzflush returns Z_OK if + the flush parameter is Z_FINISH and all output could be flushed. + gzflush should be called only when strictly necessary because it can + degrade compression. +*/ + +ZEXTERN z_off_t ZEXPORT gzseek OF((gzFile file, + z_off_t offset, int whence)); +/* + Sets the starting position for the next gzread or gzwrite on the + given compressed file. The offset represents a number of bytes in the + uncompressed data stream. The whence parameter is defined as in lseek(2); + the value SEEK_END is not supported. + If the file is opened for reading, this function is emulated but can be + extremely slow. If the file is opened for writing, only forward seeks are + supported; gzseek then compresses a sequence of zeroes up to the new + starting position. + + gzseek returns the resulting offset location as measured in bytes from + the beginning of the uncompressed stream, or -1 in case of error, in + particular if the file is opened for writing and the new starting position + would be before the current position. +*/ + +ZEXTERN int ZEXPORT gzrewind OF((gzFile file)); +/* + Rewinds the given file. This function is supported only for reading. + + gzrewind(file) is equivalent to (int)gzseek(file, 0L, SEEK_SET) +*/ + +ZEXTERN z_off_t ZEXPORT gztell OF((gzFile file)); +/* + Returns the starting position for the next gzread or gzwrite on the + given compressed file. This position represents a number of bytes in the + uncompressed data stream. + + gztell(file) is equivalent to gzseek(file, 0L, SEEK_CUR) +*/ + +ZEXTERN int ZEXPORT gzeof OF((gzFile file)); +/* + Returns 1 when EOF has previously been detected reading the given + input stream, otherwise zero. +*/ + +ZEXTERN int ZEXPORT gzdirect OF((gzFile file)); +/* + Returns 1 if file is being read directly without decompression, otherwise + zero. +*/ + +ZEXTERN int ZEXPORT gzclose OF((gzFile file)); +/* + Flushes all pending output if necessary, closes the compressed file + and deallocates all the (de)compression state. The return value is the zlib + error number (see function gzerror below). +*/ + +ZEXTERN const char * ZEXPORT gzerror OF((gzFile file, int *errnum)); +/* + Returns the error message for the last error which occurred on the + given compressed file. errnum is set to zlib error number. If an + error occurred in the file system and not in the compression library, + errnum is set to Z_ERRNO and the application may consult errno + to get the exact error code. +*/ + +ZEXTERN void ZEXPORT gzclearerr OF((gzFile file)); +/* + Clears the error and end-of-file flags for file. This is analogous to the + clearerr() function in stdio. 
This is useful for continuing to read a gzip
+ file that is being written concurrently.
+*/
+
+ /* checksum functions */
+
+/*
+ These functions are not related to compression but are exported
+ anyway because they might be useful in applications using the
+ compression library.
+*/
+
+ZEXTERN uLong ZEXPORT adler32 OF((uLong adler, const Bytef *buf, uInt len));
+/*
+ Update a running Adler-32 checksum with the bytes buf[0..len-1] and
+ return the updated checksum. If buf is NULL, this function returns
+ the required initial value for the checksum.
+ An Adler-32 checksum is almost as reliable as a CRC32 but can be computed
+ much faster. Usage example:
+
+     uLong adler = adler32(0L, Z_NULL, 0);
+
+     while (read_buffer(buffer, length) != EOF) {
+       adler = adler32(adler, buffer, length);
+     }
+     if (adler != original_adler) error();
+*/
+
+ZEXTERN uLong ZEXPORT adler32_combine OF((uLong adler1, uLong adler2,
+ z_off_t len2));
+/*
+ Combine two Adler-32 checksums into one. For two sequences of bytes, seq1
+ and seq2 with lengths len1 and len2, Adler-32 checksums were calculated for
+ each, adler1 and adler2. adler32_combine() returns the Adler-32 checksum of
+ seq1 and seq2 concatenated, requiring only adler1, adler2, and len2.
+*/
+
+ZEXTERN uLong ZEXPORT crc32 OF((uLong crc, const Bytef *buf, uInt len));
+/*
+ Update a running CRC-32 with the bytes buf[0..len-1] and return the
+ updated CRC-32. If buf is NULL, this function returns the required initial
+ value for the crc. Pre- and post-conditioning (one's complement) is
+ performed within this function so it shouldn't be done by the application.
+ Usage example:
+
+     uLong crc = crc32(0L, Z_NULL, 0);
+
+     while (read_buffer(buffer, length) != EOF) {
+       crc = crc32(crc, buffer, length);
+     }
+     if (crc != original_crc) error();
+*/
+
+ZEXTERN uLong ZEXPORT crc32_combine OF((uLong crc1, uLong crc2, z_off_t len2));
+
+/*
+ Combine two CRC-32 check values into one. For two sequences of bytes,
+ seq1 and seq2 with lengths len1 and len2, CRC-32 check values were
+ calculated for each, crc1 and crc2. crc32_combine() returns the CRC-32
+ check value of seq1 and seq2 concatenated, requiring only crc1, crc2, and
+ len2.
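+
+ Usage sketch (informal; crc1, crc2 and len2 are as described above):
+
+     uLong crc = crc32_combine(crc1, crc2, len2);
+
+ crc is then the CRC-32 of the two sequences concatenated, as if they had
+ been processed by a single crc32() pass.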
+*/
+
+
+ /* various hacks, don't look :) */
+
+/* deflateInit and inflateInit are macros to allow checking the zlib version
+ * and the compiler's view of z_stream:
+ */
+ZEXTERN int ZEXPORT deflateInit_ OF((z_streamp strm, int level,
+ const char *version, int stream_size));
+ZEXTERN int ZEXPORT inflateInit_ OF((z_streamp strm,
+ const char *version, int stream_size));
+ZEXTERN int ZEXPORT deflateInit2_ OF((z_streamp strm, int level, int method,
+ int windowBits, int memLevel,
+ int strategy, const char *version,
+ int stream_size));
+ZEXTERN int ZEXPORT inflateInit2_ OF((z_streamp strm, int windowBits,
+ const char *version, int stream_size));
+ZEXTERN int ZEXPORT inflateBackInit_ OF((z_streamp strm, int windowBits,
+ unsigned char FAR *window,
+ const char *version,
+ int stream_size));
+#define deflateInit(strm, level) \
+ deflateInit_((strm), (level), ZLIB_VERSION, sizeof(z_stream))
+#define inflateInit(strm) \
+ inflateInit_((strm), ZLIB_VERSION, sizeof(z_stream))
+#define deflateInit2(strm, level, method, windowBits, memLevel, strategy) \
+ deflateInit2_((strm),(level),(method),(windowBits),(memLevel),\
+ (strategy), ZLIB_VERSION, sizeof(z_stream))
+#define inflateInit2(strm, windowBits) \
+ inflateInit2_((strm), (windowBits), ZLIB_VERSION, sizeof(z_stream))
+#define inflateBackInit(strm, windowBits, window) \
+ inflateBackInit_((strm), (windowBits), (window), \
+ ZLIB_VERSION, sizeof(z_stream))
+
+
+#if !defined(_ZUTIL_H) && !defined(NO_DUMMY_DECL)
+ struct internal_state {int dummy;}; /* hack for buggy compilers */
+#endif
+
+ZEXTERN const char * ZEXPORT zError OF((int));
+ZEXTERN int ZEXPORT inflateSyncPoint OF((z_streamp z));
+ZEXTERN const uLongf * ZEXPORT get_crc_table OF((void));
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ZLIB_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/zmod/zmod.c b/sys/cddl/contrib/opensolaris/uts/common/zmod/zmod.c
new file mode 100644
index 000000000000..a251331ac910
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/zmod/zmod.c
@@ -0,0 +1,113 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/types.h>
+#include <sys/zmod.h>
+
+#include "zlib.h"
+#include "zutil.h"
+
+/*
+ * Uncompress the buffer 'src' into the buffer 'dst'. The caller must store
+ * the expected decompressed data size externally so it can be passed in.
+ * The resulting decompressed size is then returned through dstlen. This
+ * function returns Z_OK on success, or another error code on failure.
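+ *
+ * An informal usage sketch (the buffer names and the externally saved
+ * size are illustrative):
+ *
+ *     size_t dlen = saved_uncompressed_size;
+ *     if (z_uncompress(dbuf, &dlen, cbuf, clen) != Z_OK)
+ *         return (EIO);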
+ */ +int +z_uncompress(void *dst, size_t *dstlen, const void *src, size_t srclen) +{ + z_stream zs; + int err; + + bzero(&zs, sizeof (zs)); + zs.next_in = (uchar_t *)src; + zs.avail_in = srclen; + zs.next_out = dst; + zs.avail_out = *dstlen; + + /* + * Call inflateInit2() specifying a window size of DEF_WBITS + * with the 6th bit set to indicate that the compression format + * type (zlib or gzip) should be automatically detected. + */ + if ((err = inflateInit2(&zs, DEF_WBITS | 0x20)) != Z_OK) + return (err); + + if ((err = inflate(&zs, Z_FINISH)) != Z_STREAM_END) { + (void) inflateEnd(&zs); + return (err == Z_OK ? Z_BUF_ERROR : err); + } + + *dstlen = zs.total_out; + return (inflateEnd(&zs)); +} + +int +z_compress_level(void *dst, size_t *dstlen, const void *src, size_t srclen, + int level) +{ + + z_stream zs; + int err; + + bzero(&zs, sizeof (zs)); + zs.next_in = (uchar_t *)src; + zs.avail_in = srclen; + zs.next_out = dst; + zs.avail_out = *dstlen; + + if ((err = deflateInit(&zs, level)) != Z_OK) + return (err); + + if ((err = deflate(&zs, Z_FINISH)) != Z_STREAM_END) { + (void) deflateEnd(&zs); + return (err == Z_OK ? Z_BUF_ERROR : err); + } + + *dstlen = zs.total_out; + return (deflateEnd(&zs)); +} + +int +z_compress(void *dst, size_t *dstlen, const void *src, size_t srclen) +{ + return (z_compress_level(dst, dstlen, src, srclen, + Z_DEFAULT_COMPRESSION)); +} + +/* + * Convert a zlib error code into a string error message. + */ +const char * +z_strerror(int err) +{ + int i = Z_NEED_DICT - err; + + if (i < 0 || i > Z_NEED_DICT - Z_VERSION_ERROR) + return ("unknown error"); + + return (zError(err)); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/zmod/zmod_subr.c b/sys/cddl/contrib/opensolaris/uts/common/zmod/zmod_subr.c new file mode 100644 index 000000000000..0542712c37f5 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/zmod/zmod_subr.c @@ -0,0 +1,84 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/systm.h> +#include <sys/cmn_err.h> +#include <sys/kobj.h> + +struct zchdr { + uint_t zch_magic; + uint_t zch_size; +}; + +#define ZCH_MAGIC 0x3cc13cc1 + +/*ARGSUSED*/ +void * +zcalloc(void *opaque, uint_t items, uint_t size) +{ + size_t nbytes = sizeof (struct zchdr) + items * size; + struct zchdr *z = kobj_zalloc(nbytes, KM_NOWAIT|KM_TMP); + + if (z == NULL) + return (NULL); + + z->zch_magic = ZCH_MAGIC; + z->zch_size = nbytes; + + return (z + 1); +} + +/*ARGSUSED*/ +void +zcfree(void *opaque, void *ptr) +{ + struct zchdr *z = ((struct zchdr *)ptr) - 1; + + if (z->zch_magic != ZCH_MAGIC) + panic("zcfree region corrupt: hdr=%p ptr=%p", (void *)z, ptr); + + kobj_free(z, z->zch_size); +} + +void +zmemcpy(void *dest, const void *source, uint_t len) +{ + bcopy(source, dest, len); +} + +int +zmemcmp(const void *s1, const void *s2, uint_t len) +{ + return (bcmp(s1, s2, len)); +} + +void +zmemzero(void *dest, uint_t len) +{ + bzero(dest, len); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/zmod/zutil.c b/sys/cddl/contrib/opensolaris/uts/common/zmod/zutil.c new file mode 100644 index 000000000000..7d46e30b3edf --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/zmod/zutil.c @@ -0,0 +1,324 @@ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* zutil.c -- target dependent utility functions for the compression library + * Copyright (C) 1995-2005 Jean-loup Gailly. + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include "zutil.h" + +#ifndef NO_DUMMY_DECL +struct internal_state {int dummy;}; /* for buggy compilers */ +#endif + +const char * const z_errmsg[10] = { +"need dictionary", /* Z_NEED_DICT 2 */ +"stream end", /* Z_STREAM_END 1 */ +"", /* Z_OK 0 */ +"file error", /* Z_ERRNO (-1) */ +"stream error", /* Z_STREAM_ERROR (-2) */ +"data error", /* Z_DATA_ERROR (-3) */ +"insufficient memory", /* Z_MEM_ERROR (-4) */ +"buffer error", /* Z_BUF_ERROR (-5) */ +"incompatible version",/* Z_VERSION_ERROR (-6) */ +""}; + + +const char * ZEXPORT zlibVersion() +{ + return ZLIB_VERSION; +} + +uLong ZEXPORT zlibCompileFlags() +{ + uLong flags; + + flags = 0; + switch (sizeof(uInt)) { + case 2: break; + case 4: flags += 1; break; + case 8: flags += 2; break; + default: flags += 3; + } + switch (sizeof(uLong)) { + case 2: break; + case 4: flags += 1 << 2; break; + case 8: flags += 2 << 2; break; + default: flags += 3 << 2; + } + switch (sizeof(voidpf)) { + case 2: break; + case 4: flags += 1 << 4; break; + case 8: flags += 2 << 4; break; + default: flags += 3 << 4; + } + switch (sizeof(z_off_t)) { + case 2: break; + case 4: flags += 1 << 6; break; + case 8: flags += 2 << 6; break; + default: flags += 3 << 6; + } +#ifdef DEBUG + flags += 1 << 8; +#endif +#if defined(ASMV) || defined(ASMINF) + flags += 1 << 9; +#endif +#ifdef ZLIB_WINAPI + flags += 1 << 10; +#endif +#ifdef BUILDFIXED + flags += 1 << 12; +#endif +#ifdef DYNAMIC_CRC_TABLE + flags += 1 << 13; +#endif +#ifdef NO_GZCOMPRESS + flags += 1L << 16; +#endif +#ifdef NO_GZIP + flags += 1L << 17; +#endif +#ifdef PKZIP_BUG_WORKAROUND + flags += 1L << 20; +#endif +#ifdef FASTEST + flags += 1L << 21; +#endif +#ifdef STDC +# ifdef NO_vsnprintf + flags += 1L << 25; +# ifdef HAS_vsprintf_void + flags += 1L << 26; +# endif +# else +# ifdef HAS_vsnprintf_void + flags += 1L << 26; +# endif +# endif +#else + flags += 1L << 24; +# ifdef NO_snprintf + flags += 1L << 
25; +# ifdef HAS_sprintf_void + flags += 1L << 26; +# endif +# else +# ifdef HAS_snprintf_void + flags += 1L << 26; +# endif +# endif +#endif + return flags; +} + +#ifdef DEBUG + +# ifndef verbose +# define verbose 0 +# endif +int z_verbose = verbose; + +void z_error (m) + char *m; +{ + fprintf(stderr, "%s\n", m); + exit(1); +} +#endif + +/* exported to allow conversion of error code to string for compress() and + * uncompress() + */ +const char * ZEXPORT zError(err) + int err; +{ + return ERR_MSG(err); +} + +#if defined(_WIN32_WCE) + /* The Microsoft C Run-Time Library for Windows CE doesn't have + * errno. We define it as a global variable to simplify porting. + * Its value is always 0 and should not be used. + */ + int errno = 0; +#endif + +#define HAVE_MEMCPY +#ifndef HAVE_MEMCPY + +void zmemcpy(dest, source, len) + Bytef* dest; + const Bytef* source; + uInt len; +{ + if (len == 0) return; + do { + *dest++ = *source++; /* ??? to be unrolled */ + } while (--len != 0); +} + +int zmemcmp(s1, s2, len) + const Bytef* s1; + const Bytef* s2; + uInt len; +{ + uInt j; + + for (j = 0; j < len; j++) { + if (s1[j] != s2[j]) return 2*(s1[j] > s2[j])-1; + } + return 0; +} + +void zmemzero(dest, len) + Bytef* dest; + uInt len; +{ + if (len == 0) return; + do { + *dest++ = 0; /* ??? to be unrolled */ + } while (--len != 0); +} +#endif + + +#ifdef SYS16BIT + +#ifdef __TURBOC__ +/* Turbo C in 16-bit mode */ + +# define MY_ZCALLOC + +/* Turbo C malloc() does not allow dynamic allocation of 64K bytes + * and farmalloc(64K) returns a pointer with an offset of 8, so we + * must fix the pointer. Warning: the pointer must be put back to its + * original form in order to free it, use zcfree(). + */ + +#define MAX_PTR 10 +/* 10*64K = 640K */ + +local int next_ptr = 0; + +typedef struct ptr_table_s { + voidpf org_ptr; + voidpf new_ptr; +} ptr_table; + +local ptr_table table[MAX_PTR]; +/* This table is used to remember the original form of pointers + * to large buffers (64K). Such pointers are normalized with a zero offset. + * Since MSDOS is not a preemptive multitasking OS, this table is not + * protected from concurrent access. This hack doesn't work anyway on + * a protected system like OS/2. Use Microsoft C instead. + */ + +voidpf zcalloc (voidpf opaque, unsigned items, unsigned size) +{ + voidpf buf = opaque; /* just to make some compilers happy */ + ulg bsize = (ulg)items*size; + + /* If we allocate less than 65520 bytes, we assume that farmalloc + * will return a usable pointer which doesn't have to be normalized. 
+ */ + if (bsize < 65520L) { + buf = farmalloc(bsize); + if (*(ush*)&buf != 0) return buf; + } else { + buf = farmalloc(bsize + 16L); + } + if (buf == NULL || next_ptr >= MAX_PTR) return NULL; + table[next_ptr].org_ptr = buf; + + /* Normalize the pointer to seg:0 */ + *((ush*)&buf+1) += ((ush)((uch*)buf-0) + 15) >> 4; + *(ush*)&buf = 0; + table[next_ptr++].new_ptr = buf; + return buf; +} + +void zcfree (voidpf opaque, voidpf ptr) +{ + int n; + if (*(ush*)&ptr != 0) { /* object < 64K */ + farfree(ptr); + return; + } + /* Find the original pointer */ + for (n = 0; n < next_ptr; n++) { + if (ptr != table[n].new_ptr) continue; + + farfree(table[n].org_ptr); + while (++n < next_ptr) { + table[n-1] = table[n]; + } + next_ptr--; + return; + } + ptr = opaque; /* just to make some compilers happy */ + Assert(0, "zcfree: ptr not found"); +} + +#endif /* __TURBOC__ */ + + +#ifdef M_I86 +/* Microsoft C in 16-bit mode */ + +# define MY_ZCALLOC + +#if (!defined(_MSC_VER) || (_MSC_VER <= 600)) +# define _halloc halloc +# define _hfree hfree +#endif + +voidpf zcalloc (voidpf opaque, unsigned items, unsigned size) +{ + if (opaque) opaque = 0; /* to make compiler happy */ + return _halloc((long)items, size); +} + +void zcfree (voidpf opaque, voidpf ptr) +{ + if (opaque) opaque = 0; /* to make compiler happy */ + _hfree(ptr); +} + +#endif /* M_I86 */ + +#endif /* SYS16BIT */ + + +#ifndef MY_ZCALLOC /* Any system without a special alloc function */ + +#ifndef STDC +extern voidp malloc OF((uInt size)); +extern voidp calloc OF((uInt items, uInt size)); +extern void free OF((voidpf ptr)); +#endif + +voidpf zcalloc (opaque, items, size) + voidpf opaque; + unsigned items; + unsigned size; +{ + if (opaque) items += size - size; /* make compiler happy */ + return sizeof(uInt) > 2 ? (voidpf)malloc(items * size) : + (voidpf)calloc(items, size); +} + +void zcfree (opaque, ptr) + voidpf opaque; + voidpf ptr; +{ + free(ptr); + if (opaque) return; /* make compiler happy */ +} + +#endif /* MY_ZCALLOC */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/zmod/zutil.h b/sys/cddl/contrib/opensolaris/uts/common/zmod/zutil.h new file mode 100644 index 000000000000..1d02c1d09783 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/zmod/zutil.h @@ -0,0 +1,274 @@ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* zutil.h -- internal interface and configuration of the compression library + * Copyright (C) 1995-2005 Jean-loup Gailly. + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +/* WARNING: this file should *not* be used by applications. It is + part of the implementation of the compression library and is + subject to change. Applications should only use zlib.h. + */ + +#ifndef _ZUTIL_H +#define _ZUTIL_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#define ZLIB_INTERNAL +#include "zlib.h" + +#ifdef STDC +# ifndef _WIN32_WCE +# include <stddef.h> +# endif +# include <string.h> +# include <stdlib.h> +#endif +#ifdef NO_ERRNO_H +# ifdef _WIN32_WCE + /* The Microsoft C Run-Time Library for Windows CE doesn't have + * errno. We define it as a global variable to simplify porting. + * Its value is always 0 and should not be used. We rename it to + * avoid conflict with other libraries that use the same workaround. 
+ */ +# define errno z_errno +# endif + extern int errno; +#else +# ifndef _WIN32_WCE +# include <sys/errno.h> +# endif +#endif + +#ifndef local +# define local static +#endif +/* compile with -Dlocal if your debugger can't find static symbols */ + +typedef unsigned char uch; +typedef uch FAR uchf; +typedef unsigned short ush; +typedef ush FAR ushf; +typedef unsigned long ulg; + +extern const char * const z_errmsg[10]; /* indexed by 2-zlib_error */ +/* (size given to avoid silly warnings with Visual C++) */ + +#define ERR_MSG(err) z_errmsg[Z_NEED_DICT-(err)] + +#define ERR_RETURN(strm,err) \ + return (strm->msg = (char*)ERR_MSG(err), (err)) +/* To be used only when the state is known to be valid */ + + /* common constants */ + +#ifndef DEF_WBITS +# define DEF_WBITS MAX_WBITS +#endif +/* default windowBits for decompression. MAX_WBITS is for compression only */ + +#if MAX_MEM_LEVEL >= 8 +# define DEF_MEM_LEVEL 8 +#else +# define DEF_MEM_LEVEL MAX_MEM_LEVEL +#endif +/* default memLevel */ + +#define STORED_BLOCK 0 +#define STATIC_TREES 1 +#define DYN_TREES 2 +/* The three kinds of block type */ + +#define MIN_MATCH 3 +#define MAX_MATCH 258 +/* The minimum and maximum match lengths */ + +#define PRESET_DICT 0x20 /* preset dictionary flag in zlib header */ + + /* target dependencies */ + +#if defined(MSDOS) || (defined(WINDOWS) && !defined(WIN32)) +# define OS_CODE 0x00 +# if defined(__TURBOC__) || defined(__BORLANDC__) +# if(__STDC__ == 1) && (defined(__LARGE__) || defined(__COMPACT__)) + /* Allow compilation with ANSI keywords only enabled */ + void _Cdecl farfree( void *block ); + void *_Cdecl farmalloc( unsigned long nbytes ); +# else +# include <alloc.h> +# endif +# else /* MSC or DJGPP */ +# include <malloc.h> +# endif +#endif + +#ifdef AMIGA +# define OS_CODE 0x01 +#endif + +#if defined(VAXC) || defined(VMS) +# define OS_CODE 0x02 +# define F_OPEN(name, mode) \ + fopen((name), (mode), "mbc=60", "ctx=stm", "rfm=fix", "mrs=512") +#endif + +#if defined(ATARI) || defined(atarist) +# define OS_CODE 0x05 +#endif + +#ifdef OS2 +# define OS_CODE 0x06 +# ifdef M_I86 + #include <malloc.h> +# endif +#endif + +#if defined(MACOS) || defined(TARGET_OS_MAC) +# define OS_CODE 0x07 +# if defined(__MWERKS__) && __dest_os != __be_os && __dest_os != __win32_os +# include <unix.h> /* for fdopen */ +# else +# ifndef fdopen +# define fdopen(fd,mode) NULL /* No fdopen() */ +# endif +# endif +#endif + +#ifdef TOPS20 +# define OS_CODE 0x0a +#endif + +#ifdef WIN32 +# ifndef __CYGWIN__ /* Cygwin is Unix, not Win32 */ +# define OS_CODE 0x0b +# endif +#endif + +#ifdef __50SERIES /* Prime/PRIMOS */ +# define OS_CODE 0x0f +#endif + +#if defined(_BEOS_) || defined(RISCOS) +# define fdopen(fd,mode) NULL /* No fdopen() */ +#endif + +#if (defined(_MSC_VER) && (_MSC_VER > 600)) +# if defined(_WIN32_WCE) +# define fdopen(fd,mode) NULL /* No fdopen() */ +# ifndef _PTRDIFF_T_DEFINED + typedef int ptrdiff_t; +# define _PTRDIFF_T_DEFINED +# endif +# else +# define fdopen(fd,type) _fdopen(fd,type) +# endif +#endif + + /* common defaults */ + +#ifndef OS_CODE +# define OS_CODE 0x03 /* assume Unix */ +#endif + +#ifndef F_OPEN +# define F_OPEN(name, mode) fopen((name), (mode)) +#endif + + /* functions */ + +#if defined(STDC99) || (defined(__TURBOC__) && __TURBOC__ >= 0x550) +# ifndef HAVE_VSNPRINTF +# define HAVE_VSNPRINTF +# endif +#endif +#if defined(__CYGWIN__) +# ifndef HAVE_VSNPRINTF +# define HAVE_VSNPRINTF +# endif +#endif +#ifndef HAVE_VSNPRINTF +# ifdef MSDOS + /* vsnprintf may exist on some MS-DOS compilers (DJGPP?), + 
but for now we just assume it doesn't. */ +# define NO_vsnprintf +# endif +# ifdef __TURBOC__ +# define NO_vsnprintf +# endif +# ifdef WIN32 + /* In Win32, vsnprintf is available as the "non-ANSI" _vsnprintf. */ +# if !defined(vsnprintf) && !defined(NO_vsnprintf) +# define vsnprintf _vsnprintf +# endif +# endif +# ifdef __SASC +# define NO_vsnprintf +# endif +#endif +#ifdef VMS +# define NO_vsnprintf +#endif + +#if defined(pyr) +# define NO_MEMCPY +#endif +#if defined(SMALL_MEDIUM) && !defined(_MSC_VER) && !defined(__SC__) + /* Use our own functions for small and medium model with MSC <= 5.0. + * You may have to use the same strategy for Borland C (untested). + * The __SC__ check is for Symantec. + */ +# define NO_MEMCPY +#endif +#if defined(STDC) && !defined(HAVE_MEMCPY) && !defined(NO_MEMCPY) +# define HAVE_MEMCPY +#endif +#ifdef HAVE_MEMCPY +# ifdef SMALL_MEDIUM /* MSDOS small or medium model */ +# define zmemcpy _fmemcpy +# define zmemcmp _fmemcmp +# define zmemzero(dest, len) _fmemset(dest, 0, len) +# else +# define zmemcpy memcpy +# define zmemcmp memcmp +# define zmemzero(dest, len) memset(dest, 0, len) +# endif +#else + extern void zmemcpy OF((void* dest, const void* source, uInt len)); + extern int zmemcmp OF((const void* s1, const void* s2, uInt len)); + extern void zmemzero OF((void* dest, uInt len)); +#endif + +/* Diagnostic functions */ +#ifdef DEBUG +# include <stdio.h> + extern int z_verbose; + extern void z_error OF((char *m)); +# define Assert(cond,msg) {if(!(cond)) z_error(msg);} +# define Trace(x) {if (z_verbose>=0) fprintf x ;} +# define Tracev(x) {if (z_verbose>0) fprintf x ;} +# define Tracevv(x) {if (z_verbose>1) fprintf x ;} +# define Tracec(c,x) {if (z_verbose>0 && (c)) fprintf x ;} +# define Tracecv(c,x) {if (z_verbose>1 && (c)) fprintf x ;} +#else +# define Assert(cond,msg) +# define Trace(x) +# define Tracev(x) +# define Tracevv(x) +# define Tracec(c,x) +# define Tracecv(c,x) +#endif + + +voidpf zcalloc OF((voidpf opaque, unsigned items, unsigned size)); +void zcfree OF((voidpf opaque, voidpf ptr)); + +#define ZALLOC(strm, items, size) \ + (*((strm)->zalloc))((strm)->opaque, (items), (size)) +#define ZFREE(strm, addr) (*((strm)->zfree))((strm)->opaque, (voidpf)(addr)) +#define TRY_FREE(s, p) {if (p) ZFREE(s, p);} + +#endif /* _ZUTIL_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/intel/dtrace/fasttrap_isa.c b/sys/cddl/contrib/opensolaris/uts/intel/dtrace/fasttrap_isa.c new file mode 100644 index 000000000000..f36c250167f8 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/intel/dtrace/fasttrap_isa.c @@ -0,0 +1,1841 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + * + * Portions Copyright 2010 The FreeBSD Foundation + * + * $FreeBSD$ + */ + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/fasttrap_isa.h> +#include <sys/fasttrap_impl.h> +#include <sys/dtrace.h> +#include <sys/dtrace_impl.h> +#include <sys/cmn_err.h> +#include <sys/types.h> +#include <sys/dtrace_bsd.h> +#include <sys/proc.h> +#include <sys/rmlock.h> +#include <cddl/dev/dtrace/dtrace_cddl.h> +#include <cddl/dev/dtrace/x86/regset.h> +#include <machine/segments.h> +#include <machine/reg.h> +#include <machine/pcb.h> +#include <machine/trap.h> +#include <sys/sysmacros.h> +#include <sys/ptrace.h> + +#ifdef __i386__ +#define r_rax r_eax +#define r_rbx r_ebx +#define r_rip r_eip +#define r_rflags r_eflags +#define r_rsp r_esp +#define r_rbp r_ebp +#endif + +/* + * Lossless User-Land Tracing on x86 + * --------------------------------- + * + * The execution of most instructions is not dependent on the address; for + * these instructions it is sufficient to copy them into the user process's + * address space and execute them. To effectively single-step an instruction + * in user-land, we copy out the following sequence of instructions to scratch + * space in the user thread's ulwp_t structure. + * + * We then set the program counter (%eip or %rip) to point to this scratch + * space. Once execution resumes, the original instruction is executed and + * then control flow is redirected to what was originally the subsequent + * instruction. If the kernel attempts to deliver a signal while single- + * stepping, the signal is deferred and the program counter is moved into the + * second sequence of instructions. The second sequence ends in a trap into + * the kernel where the deferred signal is then properly handled and delivered. + * + * For instructions whose execution is position dependent, we perform simple + * emulation. These instructions are limited to control transfer + * instructions in 32-bit mode, but in 64-bit mode there's the added wrinkle + * of %rip-relative addressing that means that almost any instruction can be + * position dependent. For all the details on how we emulate generic + * instructions including %rip-relative instructions, see the code in + * fasttrap_pid_probe() below where we handle instructions of type + * FASTTRAP_T_COMMON (under the header: Generic Instruction Tracing). + */ + +#define FASTTRAP_MODRM_MOD(modrm) (((modrm) >> 6) & 0x3) +#define FASTTRAP_MODRM_REG(modrm) (((modrm) >> 3) & 0x7) +#define FASTTRAP_MODRM_RM(modrm) ((modrm) & 0x7) +#define FASTTRAP_MODRM(mod, reg, rm) (((mod) << 6) | ((reg) << 3) | (rm)) + +#define FASTTRAP_SIB_SCALE(sib) (((sib) >> 6) & 0x3) +#define FASTTRAP_SIB_INDEX(sib) (((sib) >> 3) & 0x7) +#define FASTTRAP_SIB_BASE(sib) ((sib) & 0x7) + +#define FASTTRAP_REX_W(rex) (((rex) >> 3) & 1) +#define FASTTRAP_REX_R(rex) (((rex) >> 2) & 1) +#define FASTTRAP_REX_X(rex) (((rex) >> 1) & 1) +#define FASTTRAP_REX_B(rex) ((rex) & 1) +#define FASTTRAP_REX(w, r, x, b) \ + (0x40 | ((w) << 3) | ((r) << 2) | ((x) << 1) | (b)) + +/* + * Single-byte op-codes.
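+ * The conditional jumps 0x70-0x7f encode their condition in the low + * nibble, and the two-byte 0x0f 0x80-0x8f forms below share that + * ordering; fasttrap_tracepoint_init() relies on this to fold the + * two-byte forms onto these values via (instr[start + 1] & 0x0f) | + * FASTTRAP_JO.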
+ */ +#define FASTTRAP_PUSHL_EBP 0x55 + +#define FASTTRAP_JO 0x70 +#define FASTTRAP_JNO 0x71 +#define FASTTRAP_JB 0x72 +#define FASTTRAP_JAE 0x73 +#define FASTTRAP_JE 0x74 +#define FASTTRAP_JNE 0x75 +#define FASTTRAP_JBE 0x76 +#define FASTTRAP_JA 0x77 +#define FASTTRAP_JS 0x78 +#define FASTTRAP_JNS 0x79 +#define FASTTRAP_JP 0x7a +#define FASTTRAP_JNP 0x7b +#define FASTTRAP_JL 0x7c +#define FASTTRAP_JGE 0x7d +#define FASTTRAP_JLE 0x7e +#define FASTTRAP_JG 0x7f + +#define FASTTRAP_NOP 0x90 + +#define FASTTRAP_MOV_EAX 0xb8 +#define FASTTRAP_MOV_ECX 0xb9 + +#define FASTTRAP_RET16 0xc2 +#define FASTTRAP_RET 0xc3 + +#define FASTTRAP_LOOPNZ 0xe0 +#define FASTTRAP_LOOPZ 0xe1 +#define FASTTRAP_LOOP 0xe2 +#define FASTTRAP_JCXZ 0xe3 + +#define FASTTRAP_CALL 0xe8 +#define FASTTRAP_JMP32 0xe9 +#define FASTTRAP_JMP8 0xeb + +#define FASTTRAP_INT3 0xcc +#define FASTTRAP_INT 0xcd + +#define FASTTRAP_2_BYTE_OP 0x0f +#define FASTTRAP_GROUP5_OP 0xff + +/* + * Two-byte op-codes (second byte only). + */ +#define FASTTRAP_0F_JO 0x80 +#define FASTTRAP_0F_JNO 0x81 +#define FASTTRAP_0F_JB 0x82 +#define FASTTRAP_0F_JAE 0x83 +#define FASTTRAP_0F_JE 0x84 +#define FASTTRAP_0F_JNE 0x85 +#define FASTTRAP_0F_JBE 0x86 +#define FASTTRAP_0F_JA 0x87 +#define FASTTRAP_0F_JS 0x88 +#define FASTTRAP_0F_JNS 0x89 +#define FASTTRAP_0F_JP 0x8a +#define FASTTRAP_0F_JNP 0x8b +#define FASTTRAP_0F_JL 0x8c +#define FASTTRAP_0F_JGE 0x8d +#define FASTTRAP_0F_JLE 0x8e +#define FASTTRAP_0F_JG 0x8f + +#define FASTTRAP_EFLAGS_OF 0x800 +#define FASTTRAP_EFLAGS_DF 0x400 +#define FASTTRAP_EFLAGS_SF 0x080 +#define FASTTRAP_EFLAGS_ZF 0x040 +#define FASTTRAP_EFLAGS_AF 0x010 +#define FASTTRAP_EFLAGS_PF 0x004 +#define FASTTRAP_EFLAGS_CF 0x001 + +/* + * Instruction prefixes. + */ +#define FASTTRAP_PREFIX_OPERAND 0x66 +#define FASTTRAP_PREFIX_ADDRESS 0x67 +#define FASTTRAP_PREFIX_CS 0x2E +#define FASTTRAP_PREFIX_DS 0x3E +#define FASTTRAP_PREFIX_ES 0x26 +#define FASTTRAP_PREFIX_FS 0x64 +#define FASTTRAP_PREFIX_GS 0x65 +#define FASTTRAP_PREFIX_SS 0x36 +#define FASTTRAP_PREFIX_LOCK 0xF0 +#define FASTTRAP_PREFIX_REP 0xF3 +#define FASTTRAP_PREFIX_REPNE 0xF2 + +#define FASTTRAP_NOREG 0xff + +/* + * Map between instruction register encodings and the kernel constants which + * correspond to indices into struct regs. + */ +#ifdef __amd64 +static const uint8_t regmap[16] = { + REG_RAX, REG_RCX, REG_RDX, REG_RBX, REG_RSP, REG_RBP, REG_RSI, REG_RDI, + REG_R8, REG_R9, REG_R10, REG_R11, REG_R12, REG_R13, REG_R14, REG_R15, +}; +#else +static const uint8_t regmap[8] = { + EAX, ECX, EDX, EBX, UESP, EBP, ESI, EDI +}; +#endif + +static ulong_t fasttrap_getreg(struct reg *, uint_t); + +static uint64_t +fasttrap_anarg(struct reg *rp, int function_entry, int argno) +{ + uint64_t value = 0; + int shift = function_entry ? 1 : 0; + +#ifdef __amd64 + if (curproc->p_model == DATAMODEL_LP64) { + uintptr_t *stack; + + /* + * In 64-bit mode, the first six arguments are stored in + * registers.
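+ * (%rdi, %rsi, %rdx, %rcx, %r8 and %r9, per the System V AMD64 ABI); + * arguments past the sixth are fetched from the stack below, with + * "shift" skipping the return address when this is a function-entry + * probe.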
+ */ + if (argno < 6) + switch (argno) { + case 0: + return (rp->r_rdi); + case 1: + return (rp->r_rsi); + case 2: + return (rp->r_rdx); + case 3: + return (rp->r_rcx); + case 4: + return (rp->r_r8); + case 5: + return (rp->r_r9); + } + + stack = (uintptr_t *)rp->r_rsp; + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); + value = dtrace_fulword(&stack[argno - 6 + shift]); + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT | CPU_DTRACE_BADADDR); + } else { +#endif + uint32_t *stack = (uint32_t *)rp->r_rsp; + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); + value = dtrace_fuword32(&stack[argno + shift]); + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT | CPU_DTRACE_BADADDR); +#ifdef __amd64 + } +#endif + + return (value); +} + +/*ARGSUSED*/ +int +fasttrap_tracepoint_init(proc_t *p, fasttrap_tracepoint_t *tp, uintptr_t pc, + fasttrap_probe_type_t type) +{ + uint8_t instr[FASTTRAP_MAX_INSTR_SIZE + 10]; + size_t len = FASTTRAP_MAX_INSTR_SIZE; + size_t first = MIN(len, PAGESIZE - (pc & PAGEOFFSET)); + uint_t start = 0; + int rmindex, size; + uint8_t seg, rex = 0; + + /* + * Read the instruction at the given address out of the process's + * address space. We don't have to worry about a debugger + * changing this instruction before we overwrite it with our trap + * instruction since P_PR_LOCK is set. Since instructions can span + * pages, we potentially read the instruction in two parts. If the + * second part fails, we just zero out that part of the instruction. + */ + if (uread(p, &instr[0], first, pc) != 0) + return (-1); + if (len > first && + uread(p, &instr[first], len - first, pc + first) != 0) { + bzero(&instr[first], len - first); + len = first; + } + + /* + * If the disassembly fails, then we have a malformed instruction. + */ + if ((size = dtrace_instr_size_isa(instr, p->p_model, &rmindex)) <= 0) + return (-1); + + /* + * Make sure the disassembler isn't completely broken. + */ + ASSERT(-1 <= rmindex && rmindex < size); + + /* + * If the computed size is greater than the number of bytes read, + * then it was a malformed instruction possibly because it fell on a + * page boundary and the subsequent page was missing or because of + * some malicious user. + */ + if (size > len) + return (-1); + + tp->ftt_size = (uint8_t)size; + tp->ftt_segment = FASTTRAP_SEG_NONE; + + /* + * Find the start of the instruction's opcode by processing any + * legacy prefixes. + */ + for (;;) { + seg = 0; + switch (instr[start]) { + case FASTTRAP_PREFIX_SS: + seg++; + /*FALLTHRU*/ + case FASTTRAP_PREFIX_GS: + seg++; + /*FALLTHRU*/ + case FASTTRAP_PREFIX_FS: + seg++; + /*FALLTHRU*/ + case FASTTRAP_PREFIX_ES: + seg++; + /*FALLTHRU*/ + case FASTTRAP_PREFIX_DS: + seg++; + /*FALLTHRU*/ + case FASTTRAP_PREFIX_CS: + seg++; + /*FALLTHRU*/ + case FASTTRAP_PREFIX_OPERAND: + case FASTTRAP_PREFIX_ADDRESS: + case FASTTRAP_PREFIX_LOCK: + case FASTTRAP_PREFIX_REP: + case FASTTRAP_PREFIX_REPNE: + if (seg != 0) { + /* + * It's illegal for an instruction to specify + * two segment prefixes -- give up on this + * illegal instruction. + */ + if (tp->ftt_segment != FASTTRAP_SEG_NONE) + return (-1); + + tp->ftt_segment = seg; + } + start++; + continue; + } + break; + } + +#ifdef __amd64 + /* + * Identify the REX prefix on 64-bit processes. + */ + if (p->p_model == DATAMODEL_LP64 && (instr[start] & 0xf0) == 0x40) + rex = instr[start++]; +#endif + + /* + * Now that we're pretty sure that the instruction is okay, copy the + * valid part to the tracepoint. 
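+ * A worked example (illustrative only): given the 64-bit sequence + * 48 8b 05 00 00 00 00 -- movq 0x0(%rip), %rax -- the legacy-prefix + * scan stops at 0x48, the REX check above consumes it (rex = 0x48, + * start = 1), and instr[start] = 0x8b is neither 0x0f nor 0xff, so + * the tracepoint stays FASTTRAP_T_COMMON and is picked up by the + * %rip-relative handling further down.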
+ */ + bcopy(instr, tp->ftt_instr, FASTTRAP_MAX_INSTR_SIZE); + + tp->ftt_type = FASTTRAP_T_COMMON; + if (instr[start] == FASTTRAP_2_BYTE_OP) { + switch (instr[start + 1]) { + case FASTTRAP_0F_JO: + case FASTTRAP_0F_JNO: + case FASTTRAP_0F_JB: + case FASTTRAP_0F_JAE: + case FASTTRAP_0F_JE: + case FASTTRAP_0F_JNE: + case FASTTRAP_0F_JBE: + case FASTTRAP_0F_JA: + case FASTTRAP_0F_JS: + case FASTTRAP_0F_JNS: + case FASTTRAP_0F_JP: + case FASTTRAP_0F_JNP: + case FASTTRAP_0F_JL: + case FASTTRAP_0F_JGE: + case FASTTRAP_0F_JLE: + case FASTTRAP_0F_JG: + tp->ftt_type = FASTTRAP_T_JCC; + tp->ftt_code = (instr[start + 1] & 0x0f) | FASTTRAP_JO; + tp->ftt_dest = pc + tp->ftt_size + + /* LINTED - alignment */ + *(int32_t *)&instr[start + 2]; + break; + } + } else if (instr[start] == FASTTRAP_GROUP5_OP) { + uint_t mod = FASTTRAP_MODRM_MOD(instr[start + 1]); + uint_t reg = FASTTRAP_MODRM_REG(instr[start + 1]); + uint_t rm = FASTTRAP_MODRM_RM(instr[start + 1]); + + if (reg == 2 || reg == 4) { + uint_t i, sz; + + if (reg == 2) + tp->ftt_type = FASTTRAP_T_CALL; + else + tp->ftt_type = FASTTRAP_T_JMP; + + if (mod == 3) + tp->ftt_code = 2; + else + tp->ftt_code = 1; + + ASSERT(p->p_model == DATAMODEL_LP64 || rex == 0); + + /* + * See AMD x86-64 Architecture Programmer's Manual + * Volume 3, Section 1.2.7, Table 1-12, and + * Appendix A.3.1, Table A-15. + */ + if (mod != 3 && rm == 4) { + uint8_t sib = instr[start + 2]; + uint_t index = FASTTRAP_SIB_INDEX(sib); + uint_t base = FASTTRAP_SIB_BASE(sib); + + tp->ftt_scale = FASTTRAP_SIB_SCALE(sib); + + tp->ftt_index = (index == 4) ? + FASTTRAP_NOREG : + regmap[index | (FASTTRAP_REX_X(rex) << 3)]; + tp->ftt_base = (mod == 0 && base == 5) ? + FASTTRAP_NOREG : + regmap[base | (FASTTRAP_REX_B(rex) << 3)]; + + i = 3; + sz = mod == 1 ? 1 : 4; + } else { + /* + * In 64-bit mode, mod == 0 and r/m == 5 + * denotes %rip-relative addressing; in 32-bit + * mode, the base register isn't used. In both + * modes, there is a 32-bit operand. + */ + if (mod == 0 && rm == 5) { +#ifdef __amd64 + if (p->p_model == DATAMODEL_LP64) + tp->ftt_base = REG_RIP; + else +#endif + tp->ftt_base = FASTTRAP_NOREG; + sz = 4; + } else { + uint8_t base = rm | + (FASTTRAP_REX_B(rex) << 3); + + tp->ftt_base = regmap[base]; + sz = mod == 1 ? 1 : mod == 2 ? 
4 : 0; + } + tp->ftt_index = FASTTRAP_NOREG; + i = 2; + } + + if (sz == 1) { + tp->ftt_dest = *(int8_t *)&instr[start + i]; + } else if (sz == 4) { + /* LINTED - alignment */ + tp->ftt_dest = *(int32_t *)&instr[start + i]; + } else { + tp->ftt_dest = 0; + } + } + } else { + switch (instr[start]) { + case FASTTRAP_RET: + tp->ftt_type = FASTTRAP_T_RET; + break; + + case FASTTRAP_RET16: + tp->ftt_type = FASTTRAP_T_RET16; + /* LINTED - alignment */ + tp->ftt_dest = *(uint16_t *)&instr[start + 1]; + break; + + case FASTTRAP_JO: + case FASTTRAP_JNO: + case FASTTRAP_JB: + case FASTTRAP_JAE: + case FASTTRAP_JE: + case FASTTRAP_JNE: + case FASTTRAP_JBE: + case FASTTRAP_JA: + case FASTTRAP_JS: + case FASTTRAP_JNS: + case FASTTRAP_JP: + case FASTTRAP_JNP: + case FASTTRAP_JL: + case FASTTRAP_JGE: + case FASTTRAP_JLE: + case FASTTRAP_JG: + tp->ftt_type = FASTTRAP_T_JCC; + tp->ftt_code = instr[start]; + tp->ftt_dest = pc + tp->ftt_size + + (int8_t)instr[start + 1]; + break; + + case FASTTRAP_LOOPNZ: + case FASTTRAP_LOOPZ: + case FASTTRAP_LOOP: + tp->ftt_type = FASTTRAP_T_LOOP; + tp->ftt_code = instr[start]; + tp->ftt_dest = pc + tp->ftt_size + + (int8_t)instr[start + 1]; + break; + + case FASTTRAP_JCXZ: + tp->ftt_type = FASTTRAP_T_JCXZ; + tp->ftt_dest = pc + tp->ftt_size + + (int8_t)instr[start + 1]; + break; + + case FASTTRAP_CALL: + tp->ftt_type = FASTTRAP_T_CALL; + tp->ftt_dest = pc + tp->ftt_size + + /* LINTED - alignment */ + *(int32_t *)&instr[start + 1]; + tp->ftt_code = 0; + break; + + case FASTTRAP_JMP32: + tp->ftt_type = FASTTRAP_T_JMP; + tp->ftt_dest = pc + tp->ftt_size + + /* LINTED - alignment */ + *(int32_t *)&instr[start + 1]; + break; + case FASTTRAP_JMP8: + tp->ftt_type = FASTTRAP_T_JMP; + tp->ftt_dest = pc + tp->ftt_size + + (int8_t)instr[start + 1]; + break; + + case FASTTRAP_PUSHL_EBP: + if (start == 0) + tp->ftt_type = FASTTRAP_T_PUSHL_EBP; + break; + + case FASTTRAP_NOP: +#ifdef __amd64 + ASSERT(p->p_model == DATAMODEL_LP64 || rex == 0); + + /* + * On amd64 we have to be careful not to confuse a nop + * (actually xchgl %eax, %eax) with an instruction using + * the same opcode, but that does something different + * (e.g. xchgl %r8d, %eax or xchgq %r8, %rax). + */ + if (FASTTRAP_REX_B(rex) == 0) +#endif + tp->ftt_type = FASTTRAP_T_NOP; + break; + + case FASTTRAP_INT3: + /* + * The pid provider shares the int3 trap with debugger + * breakpoints so we can't instrument them. + */ + ASSERT(instr[start] == FASTTRAP_INSTR); + return (-1); + + case FASTTRAP_INT: + /* + * Interrupts seem like they could be traced with + * no negative implications, but it's possible that + * a thread could be redirected by the trap handling + * code which would eventually return to the + * instruction after the interrupt. If the interrupt + * were in our scratch space, the subsequent + * instruction might be overwritten before we return. + * Accordingly we refuse to instrument any interrupt. + */ + return (-1); + } + } + +#ifdef __amd64 + if (p->p_model == DATAMODEL_LP64 && tp->ftt_type == FASTTRAP_T_COMMON) { + /* + * If the process is 64-bit and the instruction type is still + * FASTTRAP_T_COMMON -- meaning we're going to copy it out and + * execute it -- we need to watch for %rip-relative + * addressing mode. See the portion of fasttrap_pid_probe() + * below where we handle tracepoints with type + * FASTTRAP_T_COMMON for how we emulate instructions that + * employ %rip-relative addressing.
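+ * For example (illustrative only), movq <disp32>(%rip), %rax -- + * bytes 48 8b 05 <disp32> -- has ModRM mod = 0, reg = 0, rm = 5, so + * the code below picks FASTTRAP_RIP_2 (the instruction already uses + * %rax, so %rcx becomes the scratch register) and rewrites the ModRM + * byte to FASTTRAP_MODRM(2, 0, 1), i.e. movq <disp32>(%rcx), %rax.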
+ */ + if (rmindex != -1) { + uint_t mod = FASTTRAP_MODRM_MOD(instr[rmindex]); + uint_t reg = FASTTRAP_MODRM_REG(instr[rmindex]); + uint_t rm = FASTTRAP_MODRM_RM(instr[rmindex]); + + ASSERT(rmindex > start); + + if (mod == 0 && rm == 5) { + /* + * We need to be sure to avoid other + * registers used by this instruction. While + * the reg field may determine the op code + * rather than denoting a register, assuming + * that it denotes a register is always safe. + * We leave the REX field intact and use + * whatever value's there for simplicity. + */ + if (reg != 0) { + tp->ftt_ripmode = FASTTRAP_RIP_1 | + (FASTTRAP_RIP_X * + FASTTRAP_REX_B(rex)); + rm = 0; + } else { + tp->ftt_ripmode = FASTTRAP_RIP_2 | + (FASTTRAP_RIP_X * + FASTTRAP_REX_B(rex)); + rm = 1; + } + + tp->ftt_modrm = tp->ftt_instr[rmindex]; + tp->ftt_instr[rmindex] = + FASTTRAP_MODRM(2, reg, rm); + } + } + } +#endif + + return (0); +} + +int +fasttrap_tracepoint_install(proc_t *p, fasttrap_tracepoint_t *tp) +{ + fasttrap_instr_t instr = FASTTRAP_INSTR; + + if (uwrite(p, &instr, 1, tp->ftt_pc) != 0) + return (-1); + + return (0); +} + +int +fasttrap_tracepoint_remove(proc_t *p, fasttrap_tracepoint_t *tp) +{ + uint8_t instr; + + /* + * Distinguish between read or write failures and a changed + * instruction. + */ + if (uread(p, &instr, 1, tp->ftt_pc) != 0) + return (0); + if (instr != FASTTRAP_INSTR) + return (0); + if (uwrite(p, &tp->ftt_instr[0], 1, tp->ftt_pc) != 0) + return (-1); + + return (0); +} + +#ifdef __amd64 +static uintptr_t +fasttrap_fulword_noerr(const void *uaddr) +{ + uintptr_t ret; + + if ((ret = fasttrap_fulword(uaddr)) != -1) + return (ret); + + return (0); +} +#endif + +static uint32_t +fasttrap_fuword32_noerr(const void *uaddr) +{ + uint32_t ret; + + if ((ret = fasttrap_fuword32(uaddr)) != -1) + return (ret); + + return (0); +} + +static void +fasttrap_return_common(struct reg *rp, uintptr_t pc, pid_t pid, + uintptr_t new_pc) +{ + fasttrap_tracepoint_t *tp; + fasttrap_bucket_t *bucket; + fasttrap_id_t *id; + struct rm_priotracker tracker; + + rm_rlock(&fasttrap_tp_lock, &tracker); + bucket = &fasttrap_tpoints.fth_table[FASTTRAP_TPOINTS_INDEX(pid, pc)]; + + for (tp = bucket->ftb_data; tp != NULL; tp = tp->ftt_next) { + if (pid == tp->ftt_pid && pc == tp->ftt_pc && + tp->ftt_proc->ftpc_acount != 0) + break; + } + + /* + * Don't sweat it if we can't find the tracepoint again; unlike + * when we're in fasttrap_pid_probe(), finding the tracepoint here + * is not essential to the correct execution of the process. + */ + if (tp == NULL) { + rm_runlock(&fasttrap_tp_lock, &tracker); + return; + } + + for (id = tp->ftt_retids; id != NULL; id = id->fti_next) { + /* + * If there's a branch that could act as a return site, we + * need to trace it, and check here if the program counter is + * external to the function. 
+ */ + if (tp->ftt_type != FASTTRAP_T_RET && + tp->ftt_type != FASTTRAP_T_RET16 && + new_pc - id->fti_probe->ftp_faddr < + id->fti_probe->ftp_fsize) + continue; + + dtrace_probe(id->fti_probe->ftp_id, + pc - id->fti_probe->ftp_faddr, + rp->r_rax, rp->r_rbx, 0, 0); + } + + rm_runlock(&fasttrap_tp_lock, &tracker); +} + +static void +fasttrap_sigsegv(proc_t *p, kthread_t *t, uintptr_t addr) +{ + ksiginfo_t ksi; + + ksiginfo_init(&ksi); + ksi.ksi_signo = SIGSEGV; + ksi.ksi_code = SEGV_MAPERR; + ksi.ksi_addr = (caddr_t)addr; + PROC_LOCK(p); + (void)tdksignal(t, SIGSEGV, &ksi); + PROC_UNLOCK(p); +} + +#ifdef __amd64 +static void +fasttrap_usdt_args64(fasttrap_probe_t *probe, struct reg *rp, int argc, + uintptr_t *argv) +{ + int i, x, cap = MIN(argc, probe->ftp_nargs); + uintptr_t *stack = (uintptr_t *)rp->r_rsp; + + for (i = 0; i < cap; i++) { + x = probe->ftp_argmap[i]; + + if (x < 6) + argv[i] = (&rp->r_rdi)[x]; + else + argv[i] = fasttrap_fulword_noerr(&stack[x]); + } + + for (; i < argc; i++) { + argv[i] = 0; + } +} +#endif + +static void +fasttrap_usdt_args32(fasttrap_probe_t *probe, struct reg *rp, int argc, + uint32_t *argv) +{ + int i, x, cap = MIN(argc, probe->ftp_nargs); + uint32_t *stack = (uint32_t *)rp->r_rsp; + + for (i = 0; i < cap; i++) { + x = probe->ftp_argmap[i]; + + argv[i] = fasttrap_fuword32_noerr(&stack[x]); + } + + for (; i < argc; i++) { + argv[i] = 0; + } +} + +static int +fasttrap_do_seg(fasttrap_tracepoint_t *tp, struct reg *rp, uintptr_t *addr) +{ + proc_t *p = curproc; +#ifdef __i386__ + struct segment_descriptor *desc; +#else + struct user_segment_descriptor *desc; +#endif + uint16_t sel = 0, ndx, type; + uintptr_t limit; + + switch (tp->ftt_segment) { + case FASTTRAP_SEG_CS: + sel = rp->r_cs; + break; + case FASTTRAP_SEG_DS: + sel = rp->r_ds; + break; + case FASTTRAP_SEG_ES: + sel = rp->r_es; + break; + case FASTTRAP_SEG_FS: + sel = rp->r_fs; + break; + case FASTTRAP_SEG_GS: + sel = rp->r_gs; + break; + case FASTTRAP_SEG_SS: + sel = rp->r_ss; + break; + } + + /* + * Make sure the given segment register specifies a user priority + * selector rather than a kernel selector. + */ + if (ISPL(sel) != SEL_UPL) + return (-1); + + ndx = IDXSEL(sel); + + /* + * Check the bounds and grab the descriptor out of the specified + * descriptor table. + */ + if (ISLDT(sel)) { +#ifdef __i386__ + if (ndx > p->p_md.md_ldt->ldt_len) + return (-1); + + desc = (struct segment_descriptor *) + p->p_md.md_ldt[ndx].ldt_base; +#else + if (ndx > max_ldt_segment) + return (-1); + + desc = (struct user_segment_descriptor *) + p->p_md.md_ldt[ndx].ldt_base; +#endif + + } else { + if (ndx >= NGDT) + return (-1); + +#ifdef __i386__ + desc = &gdt[ndx].sd; +#else + desc = &gdt[ndx]; +#endif + } + + /* + * The descriptor must have user privilege level and it must be + * present in memory. + */ + if (desc->sd_dpl != SEL_UPL || desc->sd_p != 1) + return (-1); + + type = desc->sd_type; + + /* + * If the S bit in the type field is not set, this descriptor can + * only be used in system context. + */ + if ((type & 0x10) != 0x10) + return (-1); + + limit = USD_GETLIMIT(desc) * (desc->sd_gran ? PAGESIZE : 1); + + if (tp->ftt_segment == FASTTRAP_SEG_CS) { + /* + * The code/data bit and readable bit must both be set. + */ + if ((type & 0xa) != 0xa) + return (-1); + + if (*addr > limit) + return (-1); + } else { + /* + * The code/data bit must be clear. + */ + if ((type & 0x8) != 0) + return (-1); + + /* + * If the expand-down bit is clear, we just check the limit as + * it would naturally be applied. 
Otherwise, we need to check + * that the address is in the range [limit + 1 .. 0xffff] or + * [limit + 1 ... 0xffffffff] depending on whether the default + * operand size bit is set. + */ + if ((type & 0x4) == 0) { + if (*addr > limit) + return (-1); + } else if (desc->sd_def32) { + if (*addr < limit + 1 || 0xffff < *addr) + return (-1); + } else { + if (*addr < limit + 1 || 0xffffffff < *addr) + return (-1); + } + } + + *addr += USD_GETBASE(desc); + + return (0); +} + +int +fasttrap_pid_probe(struct trapframe *tf) +{ + struct reg reg, *rp; + proc_t *p = curproc, *pp; + struct rm_priotracker tracker; + uint64_t gen; + uintptr_t pc; + uintptr_t new_pc = 0; + fasttrap_bucket_t *bucket; + fasttrap_tracepoint_t *tp, tp_local; + pid_t pid; + dtrace_icookie_t cookie; + uint_t is_enabled = 0; + + fill_frame_regs(tf, &reg); + rp = &reg; + + pc = rp->r_rip - 1; + + /* + * It's possible that a user (in a veritable orgy of bad planning) + * could redirect this thread's flow of control before it reached the + * return probe fasttrap. In this case we need to kill the process + * since it's in an unrecoverable state. + */ + if (curthread->t_dtrace_step) { + ASSERT(curthread->t_dtrace_on); + fasttrap_sigtrap(p, curthread, pc); + return (0); + } + + /* + * Clear all user tracing flags. + */ + curthread->t_dtrace_ft = 0; + curthread->t_dtrace_pc = 0; + curthread->t_dtrace_npc = 0; + curthread->t_dtrace_scrpc = 0; + curthread->t_dtrace_astpc = 0; +#ifdef __amd64 + curthread->t_dtrace_regv = 0; +#endif + + /* + * Treat a child created by a call to vfork(2) as if it were its + * parent. We know that there's only one thread of control in such a + * process: this one. + */ + pp = p; + sx_slock(&proctree_lock); + while (pp->p_vmspace == pp->p_pptr->p_vmspace) + pp = pp->p_pptr; + pid = pp->p_pid; + if (pp != p) { + PROC_LOCK(pp); + if ((pp->p_flag & P_WEXIT) != 0) { + /* + * This can happen if the child was created with + * rfork(2). Userspace tracing cannot work reliably in + * such a scenario, but we can at least try. + */ + PROC_UNLOCK(pp); + sx_sunlock(&proctree_lock); + return (-1); + } + _PHOLD_LITE(pp); + PROC_UNLOCK(pp); + } + sx_sunlock(&proctree_lock); + + rm_rlock(&fasttrap_tp_lock, &tracker); + + bucket = &fasttrap_tpoints.fth_table[FASTTRAP_TPOINTS_INDEX(pid, pc)]; + + /* + * Look up the tracepoint that the process just hit. + */ + for (tp = bucket->ftb_data; tp != NULL; tp = tp->ftt_next) { + if (pid == tp->ftt_pid && pc == tp->ftt_pc && + tp->ftt_proc->ftpc_acount != 0) + break; + } + + /* + * If we couldn't find a matching tracepoint, either a tracepoint has + * been inserted without using the pid<pid> ioctl interface (see + * fasttrap_ioctl), or somehow we have mislaid this tracepoint. + */ + if (tp == NULL) { + rm_runlock(&fasttrap_tp_lock, &tracker); + gen = atomic_load_acq_64(&pp->p_fasttrap_tp_gen); + if (pp != p) + PRELE(pp); + if (curthread->t_fasttrap_tp_gen != gen) { + /* + * At least one tracepoint associated with this PID has + * been removed from the table since #BP was raised. + * Speculate that we hit a tracepoint that has since + * been removed, and retry the instruction. + */ + curthread->t_fasttrap_tp_gen = gen; +#ifdef __amd64 + tf->tf_rip = pc; +#else + tf->tf_eip = pc; +#endif + return (0); + } + return (-1); + } + if (pp != p) + PRELE(pp); + + /* + * Set the program counter to the address of the traced instruction + * so that it looks right in ustack() output.
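+ * (pc was computed above as %rip - 1: the breakpoint is the one-byte + * int3, and the trap frame holds the address of the instruction that + * follows it.)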
+ */ + rp->r_rip = pc; + + if (tp->ftt_ids != NULL) { + fasttrap_id_t *id; + +#ifdef __amd64 + if (p->p_model == DATAMODEL_LP64) { + for (id = tp->ftt_ids; id != NULL; id = id->fti_next) { + fasttrap_probe_t *probe = id->fti_probe; + + if (id->fti_ptype == DTFTP_ENTRY) { + /* + * We note that this was an entry + * probe to help ustack() find the + * first caller. + */ + cookie = dtrace_interrupt_disable(); + DTRACE_CPUFLAG_SET(CPU_DTRACE_ENTRY); + dtrace_probe(probe->ftp_id, rp->r_rdi, + rp->r_rsi, rp->r_rdx, rp->r_rcx, + rp->r_r8); + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_ENTRY); + dtrace_interrupt_enable(cookie); + } else if (id->fti_ptype == DTFTP_IS_ENABLED) { + /* + * Note that in this case, we don't + * call dtrace_probe() since it's only + * an artificial probe meant to change + * the flow of control so that it + * encounters the true probe. + */ + is_enabled = 1; + } else if (probe->ftp_argmap == NULL) { + dtrace_probe(probe->ftp_id, rp->r_rdi, + rp->r_rsi, rp->r_rdx, rp->r_rcx, + rp->r_r8); + } else { + uintptr_t t[5]; + + fasttrap_usdt_args64(probe, rp, + sizeof (t) / sizeof (t[0]), t); + + dtrace_probe(probe->ftp_id, t[0], t[1], + t[2], t[3], t[4]); + } + } + } else { +#endif + uintptr_t s0, s1, s2, s3, s4, s5; + uint32_t *stack = (uint32_t *)rp->r_rsp; + + /* + * In 32-bit mode, all arguments are passed on the + * stack. If this is a function entry probe, we need + * to skip the first entry on the stack as it + * represents the return address rather than a + * parameter to the function. + */ + s0 = fasttrap_fuword32_noerr(&stack[0]); + s1 = fasttrap_fuword32_noerr(&stack[1]); + s2 = fasttrap_fuword32_noerr(&stack[2]); + s3 = fasttrap_fuword32_noerr(&stack[3]); + s4 = fasttrap_fuword32_noerr(&stack[4]); + s5 = fasttrap_fuword32_noerr(&stack[5]); + + for (id = tp->ftt_ids; id != NULL; id = id->fti_next) { + fasttrap_probe_t *probe = id->fti_probe; + + if (id->fti_ptype == DTFTP_ENTRY) { + /* + * We note that this was an entry + * probe to help ustack() find the + * first caller. + */ + cookie = dtrace_interrupt_disable(); + DTRACE_CPUFLAG_SET(CPU_DTRACE_ENTRY); + dtrace_probe(probe->ftp_id, s1, s2, + s3, s4, s5); + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_ENTRY); + dtrace_interrupt_enable(cookie); + } else if (id->fti_ptype == DTFTP_IS_ENABLED) { + /* + * Note that in this case, we don't + * call dtrace_probe() since it's only + * an artificial probe meant to change + * the flow of control so that it + * encounters the true probe. + */ + is_enabled = 1; + } else if (probe->ftp_argmap == NULL) { + dtrace_probe(probe->ftp_id, s0, s1, + s2, s3, s4); + } else { + uint32_t t[5]; + + fasttrap_usdt_args32(probe, rp, + sizeof (t) / sizeof (t[0]), t); + + dtrace_probe(probe->ftp_id, t[0], t[1], + t[2], t[3], t[4]); + } + } +#ifdef __amd64 + } +#endif + } + + /* + * We're about to do a bunch of work so we cache a local copy of + * the tracepoint to emulate the instruction, and then find the + * tracepoint again later if we need to light up any return probes. + */ + tp_local = *tp; + rm_runlock(&fasttrap_tp_lock, &tracker); + tp = &tp_local; + + /* + * Set the program counter to appear as though the traced instruction + * had completely executed. This ensures that fasttrap_getreg() will + * report the expected value for REG_RIP. + */ + rp->r_rip = pc + tp->ftt_size; + + /* + * If there's an is-enabled probe connected to this tracepoint it + * means that there was a 'xorl %eax, %eax' or 'xorq %rax, %rax' + * instruction that was placed there by DTrace when the binary was + * linked. 
As this probe is, in fact, enabled, we need to stuff 1 + * into %eax or %rax. Accordingly, we can bypass all the instruction + * emulation logic since we know the inevitable result. It's possible + * that a user could construct a scenario where the 'is-enabled' + * probe was on some other instruction, but that would be a rather + * exotic way to shoot oneself in the foot. + */ + if (is_enabled) { + rp->r_rax = 1; + new_pc = rp->r_rip; + goto done; + } + + /* + * We emulate certain types of instructions to ensure correctness + * (in the case of position dependent instructions) or optimize + * common cases. The rest we have the thread execute back in user- + * land. + */ + switch (tp->ftt_type) { + case FASTTRAP_T_RET: + case FASTTRAP_T_RET16: + { + uintptr_t dst = 0; + uintptr_t addr = 0; + int ret = 0; + + /* + * We have to emulate _every_ facet of the behavior of a ret + * instruction including what happens if the load from %esp + * fails; in that case, we send a SIGSEGV. + */ +#ifdef __amd64 + if (p->p_model == DATAMODEL_NATIVE) { + ret = dst = fasttrap_fulword((void *)rp->r_rsp); + addr = rp->r_rsp + sizeof (uintptr_t); + } else { +#endif + uint32_t dst32; + ret = dst32 = fasttrap_fuword32((void *)rp->r_rsp); + dst = dst32; + addr = rp->r_rsp + sizeof (uint32_t); +#ifdef __amd64 + } +#endif + + if (ret == -1) { + fasttrap_sigsegv(p, curthread, rp->r_rsp); + new_pc = pc; + break; + } + + if (tp->ftt_type == FASTTRAP_T_RET16) + addr += tp->ftt_dest; + + rp->r_rsp = addr; + new_pc = dst; + break; + } + + case FASTTRAP_T_JCC: + { + uint_t taken = 0; + + switch (tp->ftt_code) { + case FASTTRAP_JO: + taken = (rp->r_rflags & FASTTRAP_EFLAGS_OF) != 0; + break; + case FASTTRAP_JNO: + taken = (rp->r_rflags & FASTTRAP_EFLAGS_OF) == 0; + break; + case FASTTRAP_JB: + taken = (rp->r_rflags & FASTTRAP_EFLAGS_CF) != 0; + break; + case FASTTRAP_JAE: + taken = (rp->r_rflags & FASTTRAP_EFLAGS_CF) == 0; + break; + case FASTTRAP_JE: + taken = (rp->r_rflags & FASTTRAP_EFLAGS_ZF) != 0; + break; + case FASTTRAP_JNE: + taken = (rp->r_rflags & FASTTRAP_EFLAGS_ZF) == 0; + break; + case FASTTRAP_JBE: + taken = (rp->r_rflags & FASTTRAP_EFLAGS_CF) != 0 || + (rp->r_rflags & FASTTRAP_EFLAGS_ZF) != 0; + break; + case FASTTRAP_JA: + taken = (rp->r_rflags & FASTTRAP_EFLAGS_CF) == 0 && + (rp->r_rflags & FASTTRAP_EFLAGS_ZF) == 0; + break; + case FASTTRAP_JS: + taken = (rp->r_rflags & FASTTRAP_EFLAGS_SF) != 0; + break; + case FASTTRAP_JNS: + taken = (rp->r_rflags & FASTTRAP_EFLAGS_SF) == 0; + break; + case FASTTRAP_JP: + taken = (rp->r_rflags & FASTTRAP_EFLAGS_PF) != 0; + break; + case FASTTRAP_JNP: + taken = (rp->r_rflags & FASTTRAP_EFLAGS_PF) == 0; + break; + case FASTTRAP_JL: + taken = ((rp->r_rflags & FASTTRAP_EFLAGS_SF) == 0) != + ((rp->r_rflags & FASTTRAP_EFLAGS_OF) == 0); + break; + case FASTTRAP_JGE: + taken = ((rp->r_rflags & FASTTRAP_EFLAGS_SF) == 0) == + ((rp->r_rflags & FASTTRAP_EFLAGS_OF) == 0); + break; + case FASTTRAP_JLE: + taken = (rp->r_rflags & FASTTRAP_EFLAGS_ZF) != 0 || + ((rp->r_rflags & FASTTRAP_EFLAGS_SF) == 0) != + ((rp->r_rflags & FASTTRAP_EFLAGS_OF) == 0); + break; + case FASTTRAP_JG: + taken = (rp->r_rflags & FASTTRAP_EFLAGS_ZF) == 0 && + ((rp->r_rflags & FASTTRAP_EFLAGS_SF) == 0) == + ((rp->r_rflags & FASTTRAP_EFLAGS_OF) == 0); + break; + + } + + if (taken) + new_pc = tp->ftt_dest; + else + new_pc = pc + tp->ftt_size; + break; + } + + case FASTTRAP_T_LOOP: + { + uint_t taken = 0; +#ifdef __amd64 + greg_t cx = rp->r_rcx--; +#else + greg_t cx = rp->r_ecx--; +#endif + + switch (tp->ftt_code) { + 
case FASTTRAP_LOOPNZ: + taken = (rp->r_rflags & FASTTRAP_EFLAGS_ZF) == 0 && + cx != 0; + break; + case FASTTRAP_LOOPZ: + taken = (rp->r_rflags & FASTTRAP_EFLAGS_ZF) != 0 && + cx != 0; + break; + case FASTTRAP_LOOP: + taken = (cx != 0); + break; + } + + if (taken) + new_pc = tp->ftt_dest; + else + new_pc = pc + tp->ftt_size; + break; + } + + case FASTTRAP_T_JCXZ: + { +#ifdef __amd64 + greg_t cx = rp->r_rcx; +#else + greg_t cx = rp->r_ecx; +#endif + + if (cx == 0) + new_pc = tp->ftt_dest; + else + new_pc = pc + tp->ftt_size; + break; + } + + case FASTTRAP_T_PUSHL_EBP: + { + int ret = 0; + +#ifdef __amd64 + if (p->p_model == DATAMODEL_NATIVE) { + rp->r_rsp -= sizeof (uintptr_t); + ret = fasttrap_sulword((void *)rp->r_rsp, rp->r_rbp); + } else { +#endif + rp->r_rsp -= sizeof (uint32_t); + ret = fasttrap_suword32((void *)rp->r_rsp, rp->r_rbp); +#ifdef __amd64 + } +#endif + + if (ret == -1) { + fasttrap_sigsegv(p, curthread, rp->r_rsp); + new_pc = pc; + break; + } + + new_pc = pc + tp->ftt_size; + break; + } + + case FASTTRAP_T_NOP: + new_pc = pc + tp->ftt_size; + break; + + case FASTTRAP_T_JMP: + case FASTTRAP_T_CALL: + if (tp->ftt_code == 0) { + new_pc = tp->ftt_dest; + } else { + uintptr_t value, addr = tp->ftt_dest; + + if (tp->ftt_base != FASTTRAP_NOREG) + addr += fasttrap_getreg(rp, tp->ftt_base); + if (tp->ftt_index != FASTTRAP_NOREG) + addr += fasttrap_getreg(rp, tp->ftt_index) << + tp->ftt_scale; + + if (tp->ftt_code == 1) { + /* + * If there's a segment prefix for this + * instruction, we'll need to check permissions + * and bounds on the given selector, and adjust + * the address accordingly. + */ + if (tp->ftt_segment != FASTTRAP_SEG_NONE && + fasttrap_do_seg(tp, rp, &addr) != 0) { + fasttrap_sigsegv(p, curthread, addr); + new_pc = pc; + break; + } + +#ifdef __amd64 + if (p->p_model == DATAMODEL_NATIVE) { +#endif + if ((value = fasttrap_fulword((void *)addr)) + == -1) { + fasttrap_sigsegv(p, curthread, + addr); + new_pc = pc; + break; + } + new_pc = value; +#ifdef __amd64 + } else { + uint32_t value32; + addr = (uintptr_t)(uint32_t)addr; + if ((value32 = fasttrap_fuword32((void *)addr)) + == -1) { + fasttrap_sigsegv(p, curthread, + addr); + new_pc = pc; + break; + } + new_pc = value32; + } +#endif + } else { + new_pc = addr; + } + } + + /* + * If this is a call instruction, we need to push the return + * address onto the stack. If this fails, we send the process + * a SIGSEGV and reset the pc to emulate what would happen if + * this instruction weren't traced. + */ + if (tp->ftt_type == FASTTRAP_T_CALL) { + int ret = 0; + uintptr_t addr = 0, pcps; +#ifdef __amd64 + if (p->p_model == DATAMODEL_NATIVE) { + addr = rp->r_rsp - sizeof (uintptr_t); + pcps = pc + tp->ftt_size; + ret = fasttrap_sulword((void *)addr, pcps); + } else { +#endif + addr = rp->r_rsp - sizeof (uint32_t); + pcps = (uint32_t)(pc + tp->ftt_size); + ret = fasttrap_suword32((void *)addr, pcps); +#ifdef __amd64 + } +#endif + + if (ret == -1) { + fasttrap_sigsegv(p, curthread, addr); + new_pc = pc; + break; + } + + rp->r_rsp = addr; + } + + break; + + case FASTTRAP_T_COMMON: + { + uintptr_t addr; +#if defined(__amd64) + uint8_t scratch[2 * FASTTRAP_MAX_INSTR_SIZE + 22]; +#else + uint8_t scratch[2 * FASTTRAP_MAX_INSTR_SIZE + 7]; +#endif + uint_t i = 0; + fasttrap_scrspace_t *scrspace; + scrspace = fasttrap_scraddr(curthread, tp->ftt_proc); + if (scrspace == NULL) { + /* + * We failed to allocate scratch space for this thread. + * Try to write the original instruction back out and + * reset the pc. 
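+ * If even that write-back fails, raise SIGTRAP rather than let the + * thread re-trap on a breakpoint we can no longer service.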
+ */ + if (fasttrap_copyout(tp->ftt_instr, (void *)pc, + tp->ftt_size)) + fasttrap_sigtrap(p, curthread, pc); + new_pc = pc; + break; + } + addr = scrspace->ftss_addr; + + /* + * Generic Instruction Tracing + * --------------------------- + * + * This is the layout of the scratch space in the user-land + * thread structure for our generated instructions. + * + * 32-bit mode bytes + * ------------------------ ----- + * a: <original instruction> <= 15 + * jmp <pc + tp->ftt_size> 5 + * b: <original instruction> <= 15 + * int T_DTRACE_RET 2 + * ----- + * <= 37 + * + * 64-bit mode bytes + * ------------------------ ----- + * a: <original instruction> <= 15 + * jmp 0(%rip) 6 + * <pc + tp->ftt_size> 8 + * b: <original instruction> <= 15 + * int T_DTRACE_RET 2 + * ----- + * <= 46 + * + * The %pc is set to a, and curthread->t_dtrace_astpc is set + * to b. If we encounter a signal on the way out of the + * kernel, trap() will set %pc to curthread->t_dtrace_astpc + * so that we execute the original instruction and re-enter + * the kernel rather than redirecting to the next instruction. + * + * If there are return probes (so we know that we're going to + * need to reenter the kernel after executing the original + * instruction), the scratch space will just contain the + * original instruction followed by an interrupt -- the same + * data as at b. + * + * %rip-relative Addressing + * ------------------------ + * + * There's a further complication in 64-bit mode due to %rip- + * relative addressing. While this is clearly a beneficial + * architectural decision for position independent code, it's + * hard not to see it as a personal attack against the pid + * provider since before there was a relatively small set of + * instructions to emulate; with %rip-relative addressing, + * almost every instruction can potentially depend on the + * address at which it's executed. Rather than emulating + * the broad spectrum of instructions that can now be + * position dependent, we emulate jumps and others as in + * 32-bit mode, and take a different tack for instructions + * using %rip-relative addressing. + * + * For every instruction that uses the ModRM byte, the + * in-kernel disassembler reports its location. We use the + * ModRM byte to identify that an instruction uses + * %rip-relative addressing and to see what other registers + * the instruction uses. To emulate those instructions, + * we modify the instruction to be %rax-relative rather than + * %rip-relative (or %rcx-relative if the instruction uses + * %rax; or %r8- or %r9-relative if the REX.B is present so + * we don't have to rewrite the REX prefix). We then load + * the value that %rip would have been into the scratch + * register and generate an instruction to reset the scratch + * register back to its original value. The instruction + * sequence looks like this: + * + * 64-mode %rip-relative bytes + * ------------------------ ----- + * a: <modified instruction> <= 15 + * movq $<value>, %<scratch> 6 + * jmp 0(%rip) 6 + * <pc + tp->ftt_size> 8 + * b: <modified instruction> <= 15 + * int T_DTRACE_RET 2 + * ----- + * 52 + * + * We set curthread->t_dtrace_regv so that upon receiving + * a signal we can reset the value of the scratch register. 
+ */ + + ASSERT(tp->ftt_size <= FASTTRAP_MAX_INSTR_SIZE); + + curthread->t_dtrace_scrpc = addr; + bcopy(tp->ftt_instr, &scratch[i], tp->ftt_size); + i += tp->ftt_size; + +#ifdef __amd64 + if (tp->ftt_ripmode != 0) { + greg_t *reg = NULL; + + ASSERT(p->p_model == DATAMODEL_LP64); + ASSERT(tp->ftt_ripmode & + (FASTTRAP_RIP_1 | FASTTRAP_RIP_2)); + + /* + * If this was a %rip-relative instruction, we change + * it to be either a %rax- or %rcx-relative + * instruction (depending on whether those registers + * are used as another operand; or %r8- or %r9- + * relative depending on the value of REX.B). We then + * set that register and generate a movq instruction + * to reset the value. + */ + if (tp->ftt_ripmode & FASTTRAP_RIP_X) + scratch[i++] = FASTTRAP_REX(1, 0, 0, 1); + else + scratch[i++] = FASTTRAP_REX(1, 0, 0, 0); + + if (tp->ftt_ripmode & FASTTRAP_RIP_1) + scratch[i++] = FASTTRAP_MOV_EAX; + else + scratch[i++] = FASTTRAP_MOV_ECX; + + switch (tp->ftt_ripmode) { + case FASTTRAP_RIP_1: + reg = &rp->r_rax; + curthread->t_dtrace_reg = REG_RAX; + break; + case FASTTRAP_RIP_2: + reg = &rp->r_rcx; + curthread->t_dtrace_reg = REG_RCX; + break; + case FASTTRAP_RIP_1 | FASTTRAP_RIP_X: + reg = &rp->r_r8; + curthread->t_dtrace_reg = REG_R8; + break; + case FASTTRAP_RIP_2 | FASTTRAP_RIP_X: + reg = &rp->r_r9; + curthread->t_dtrace_reg = REG_R9; + break; + } + + /* LINTED - alignment */ + *(uint64_t *)&scratch[i] = *reg; + curthread->t_dtrace_regv = *reg; + *reg = pc + tp->ftt_size; + i += sizeof (uint64_t); + } +#endif + + /* + * Generate the branch instruction to what would have + * normally been the subsequent instruction. In 32-bit mode, + * this is just a relative branch; in 64-bit mode this is a + * %rip-relative branch that loads the 64-bit pc value + * immediately after the jmp instruction. + */ +#ifdef __amd64 + if (p->p_model == DATAMODEL_LP64) { + scratch[i++] = FASTTRAP_GROUP5_OP; + scratch[i++] = FASTTRAP_MODRM(0, 4, 5); + /* LINTED - alignment */ + *(uint32_t *)&scratch[i] = 0; + i += sizeof (uint32_t); + /* LINTED - alignment */ + *(uint64_t *)&scratch[i] = pc + tp->ftt_size; + i += sizeof (uint64_t); + } else { +#endif + /* + * Set up the jmp to the next instruction; note that + * the size of the traced instruction cancels out. + */ + scratch[i++] = FASTTRAP_JMP32; + /* LINTED - alignment */ + *(uint32_t *)&scratch[i] = pc - addr - 5; + i += sizeof (uint32_t); +#ifdef __amd64 + } +#endif + + curthread->t_dtrace_astpc = addr + i; + bcopy(tp->ftt_instr, &scratch[i], tp->ftt_size); + i += tp->ftt_size; + scratch[i++] = FASTTRAP_INT; + scratch[i++] = T_DTRACE_RET; + + ASSERT(i <= sizeof (scratch)); + + if (fasttrap_copyout(scratch, (char *)addr, i)) { + fasttrap_sigtrap(p, curthread, pc); + new_pc = pc; + break; + } + if (tp->ftt_retids != NULL) { + curthread->t_dtrace_step = 1; + curthread->t_dtrace_ret = 1; + new_pc = curthread->t_dtrace_astpc; + } else { + new_pc = curthread->t_dtrace_scrpc; + } + + curthread->t_dtrace_pc = pc; + curthread->t_dtrace_npc = pc + tp->ftt_size; + curthread->t_dtrace_on = 1; + break; + } + + default: + panic("fasttrap: mishandled an instruction"); + } + +done: + /* + * If there were no return probes when we first found the tracepoint, + * we should feel no obligation to honor any return probes that were + * subsequently enabled -- they'll just have to wait until the next + * time around. + */ + if (tp->ftt_retids != NULL) { + /* + * We need to wait until the results of the instruction are + * apparent before invoking any return probes. 
If this + * instruction was emulated we can just call + * fasttrap_return_common(); if it needs to be executed, we + * need to wait until the user thread returns to the kernel. + */ + if (tp->ftt_type != FASTTRAP_T_COMMON) { + /* + * Set the program counter to the address of the traced + * instruction so that it looks right in ustack() + * output. We had previously set it to the end of the + * instruction to simplify %rip-relative addressing. + */ + rp->r_rip = pc; + + fasttrap_return_common(rp, pc, pid, new_pc); + } else { + ASSERT(curthread->t_dtrace_ret != 0); + ASSERT(curthread->t_dtrace_pc == pc); + ASSERT(curthread->t_dtrace_scrpc != 0); + ASSERT(new_pc == curthread->t_dtrace_astpc); + } + } + + rp->r_rip = new_pc; + + PROC_LOCK(p); + proc_write_regs(curthread, rp); + PROC_UNLOCK(p); + + return (0); +} + +int +fasttrap_return_probe(struct trapframe *tf) +{ + struct reg reg, *rp; + proc_t *p = curproc; + uintptr_t pc = curthread->t_dtrace_pc; + uintptr_t npc = curthread->t_dtrace_npc; + + fill_frame_regs(tf, &reg); + rp = &reg; + + curthread->t_dtrace_pc = 0; + curthread->t_dtrace_npc = 0; + curthread->t_dtrace_scrpc = 0; + curthread->t_dtrace_astpc = 0; + +#ifdef illumos + /* + * Treat a child created by a call to vfork(2) as if it were its + * parent. We know that there's only one thread of control in such a + * process: this one. + */ + while (p->p_flag & SVFORK) { + p = p->p_parent; + } +#endif + + /* + * We set rp->r_rip to the address of the traced instruction so + * that it appears to dtrace_probe() that we're on the original + * instruction. + */ + rp->r_rip = pc; + + fasttrap_return_common(rp, pc, p->p_pid, npc); + + return (0); +} + +/*ARGSUSED*/ +uint64_t +fasttrap_pid_getarg(void *arg, dtrace_id_t id, void *parg, int argno, + int aframes) +{ + struct reg r; + + fill_regs(curthread, &r); + + return (fasttrap_anarg(&r, 1, argno)); +} + +/*ARGSUSED*/ +uint64_t +fasttrap_usdt_getarg(void *arg, dtrace_id_t id, void *parg, int argno, + int aframes) +{ + struct reg r; + + fill_regs(curthread, &r); + + return (fasttrap_anarg(&r, 0, argno)); +} + +static ulong_t +fasttrap_getreg(struct reg *rp, uint_t reg) +{ +#ifdef __amd64 + switch (reg) { + case REG_R15: return (rp->r_r15); + case REG_R14: return (rp->r_r14); + case REG_R13: return (rp->r_r13); + case REG_R12: return (rp->r_r12); + case REG_R11: return (rp->r_r11); + case REG_R10: return (rp->r_r10); + case REG_R9: return (rp->r_r9); + case REG_R8: return (rp->r_r8); + case REG_RDI: return (rp->r_rdi); + case REG_RSI: return (rp->r_rsi); + case REG_RBP: return (rp->r_rbp); + case REG_RBX: return (rp->r_rbx); + case REG_RDX: return (rp->r_rdx); + case REG_RCX: return (rp->r_rcx); + case REG_RAX: return (rp->r_rax); + case REG_TRAPNO: return (rp->r_trapno); + case REG_ERR: return (rp->r_err); + case REG_RIP: return (rp->r_rip); + case REG_CS: return (rp->r_cs); + case REG_RFL: return (rp->r_rflags); + case REG_RSP: return (rp->r_rsp); + case REG_SS: return (rp->r_ss); + case REG_FS: return (rp->r_fs); + case REG_GS: return (rp->r_gs); + case REG_DS: return (rp->r_ds); + case REG_ES: return (rp->r_es); + case REG_FSBASE: return (rdmsr(MSR_FSBASE)); + case REG_GSBASE: return (rdmsr(MSR_GSBASE)); + } + + panic("dtrace: illegal register constant"); + /*NOTREACHED*/ +#else +#define _NGREG 19 + if (reg >= _NGREG) + panic("dtrace: illegal register constant"); + + return (((greg_t *)&rp->r_gs)[reg]); +#endif +} diff --git a/sys/cddl/contrib/opensolaris/uts/intel/sys/fasttrap_isa.h b/sys/cddl/contrib/opensolaris/uts/intel/sys/fasttrap_isa.h new
file mode 100644 index 000000000000..9fee8cdb6be0 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/intel/sys/fasttrap_isa.h @@ -0,0 +1,114 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _FASTTRAP_ISA_H +#define _FASTTRAP_ISA_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define FASTTRAP_MAX_INSTR_SIZE 15 + +#define FASTTRAP_INSTR 0xcc + +#define FASTTRAP_SUNWDTRACE_SIZE 64 + +typedef uint8_t fasttrap_instr_t; + +typedef struct fasttrap_machtp { + uint8_t ftmt_instr[FASTTRAP_MAX_INSTR_SIZE]; /* orig. instr. */ + uint8_t ftmt_size; /* instruction size */ +#ifdef __amd64 + uint8_t ftmt_ripmode; /* %rip-relative handling mode */ + uint8_t ftmt_modrm; /* saved modrm byte */ +#endif + uint8_t ftmt_type; /* emulation type */ + uint8_t ftmt_code; /* branch condition */ + uint8_t ftmt_base; /* branch base */ + uint8_t ftmt_index; /* branch index */ + uint8_t ftmt_scale; /* branch scale */ + uint8_t ftmt_segment; /* segment for memory accesses */ + uintptr_t ftmt_dest; /* destination of control flow */ +} fasttrap_machtp_t; + +#define ftt_instr ftt_mtp.ftmt_instr +#ifdef __amd64 +#define ftt_ripmode ftt_mtp.ftmt_ripmode +#define ftt_modrm ftt_mtp.ftmt_modrm +#endif +#define ftt_size ftt_mtp.ftmt_size +#define ftt_type ftt_mtp.ftmt_type +#define ftt_code ftt_mtp.ftmt_code +#define ftt_base ftt_mtp.ftmt_base +#define ftt_index ftt_mtp.ftmt_index +#define ftt_scale ftt_mtp.ftmt_scale +#define ftt_segment ftt_mtp.ftmt_segment +#define ftt_dest ftt_mtp.ftmt_dest + +#define FASTTRAP_T_COMMON 0x00 /* common case -- no emulation */ +#define FASTTRAP_T_JCC 0x01 /* near and far conditional jumps */ +#define FASTTRAP_T_LOOP 0x02 /* loop instructions */ +#define FASTTRAP_T_JCXZ 0x03 /* jump if %ecx/%rcx is zero */ +#define FASTTRAP_T_JMP 0x04 /* relative jump */ +#define FASTTRAP_T_CALL 0x05 /* near call (and link) */ +#define FASTTRAP_T_RET 0x06 /* ret */ +#define FASTTRAP_T_RET16 0x07 /* ret <imm16> */ + +/* + * For performance rather than correctness. + */ +#define FASTTRAP_T_PUSHL_EBP 0x10 /* pushl %ebp (for function entry) */ +#define FASTTRAP_T_NOP 0x11 /* nop */ + +#define FASTTRAP_RIP_1 0x1 +#define FASTTRAP_RIP_2 0x2 +#define FASTTRAP_RIP_X 0x4 + +/* + * Segment values. 
+ */ +#define FASTTRAP_SEG_NONE 0 +#define FASTTRAP_SEG_CS 1 +#define FASTTRAP_SEG_DS 2 +#define FASTTRAP_SEG_ES 3 +#define FASTTRAP_SEG_FS 4 +#define FASTTRAP_SEG_GS 5 +#define FASTTRAP_SEG_SS 6 + +#define FASTTRAP_AFRAMES 3 +#define FASTTRAP_RETURN_AFRAMES 4 +#define FASTTRAP_ENTRY_AFRAMES 3 +#define FASTTRAP_OFFSET_AFRAMES 3 + +#ifdef __cplusplus +} +#endif + +#endif /* _FASTTRAP_ISA_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/mips/dtrace/fasttrap_isa.c b/sys/cddl/contrib/opensolaris/uts/mips/dtrace/fasttrap_isa.c new file mode 100644 index 000000000000..a31eac8cf0b8 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/mips/dtrace/fasttrap_isa.c @@ -0,0 +1,30 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + +/* + * XXX: Placeholder for MIPS fasttrap code + */ diff --git a/sys/cddl/contrib/opensolaris/uts/mips/sys/fasttrap_isa.h b/sys/cddl/contrib/opensolaris/uts/mips/sys/fasttrap_isa.h new file mode 100644 index 000000000000..eb99752ce415 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/mips/sys/fasttrap_isa.h @@ -0,0 +1,48 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */
+
+#ifndef	_FASTTRAP_ISA_H
+#define	_FASTTRAP_ISA_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/types.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/*
+ * XXXDTRACE: placeholder for MIPS fasttrap stuff
+ */
+
+typedef uint32_t	fasttrap_instr_t;
+#define	FASTTRAP_SUNWDTRACE_SIZE	64
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _FASTTRAP_ISA_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/powerpc/dtrace/fasttrap_isa.c b/sys/cddl/contrib/opensolaris/uts/powerpc/dtrace/fasttrap_isa.c
new file mode 100644
index 000000000000..69ba0f3c64d0
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/powerpc/dtrace/fasttrap_isa.c
@@ -0,0 +1,548 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/* Portions Copyright 2013 Justin Hibbits */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/fasttrap_isa.h>
+#include <sys/fasttrap_impl.h>
+#include <sys/dtrace.h>
+#include <sys/dtrace_impl.h>
+#include <cddl/dev/dtrace/dtrace_cddl.h>
+#include <sys/proc.h>
+#include <sys/types.h>
+#include <sys/uio.h>
+#include <sys/ptrace.h>
+#include <sys/rmlock.h>
+#include <sys/sysent.h>
+
+#define	OP(x)		((x) >> 26)
+#define	OPX(x)		(((x) >> 2) & 0x3FF)
+#define	OP_BO(x)	(((x) & 0x03E00000) >> 21)
+#define	OP_BI(x)	(((x) & 0x001F0000) >> 16)
+#define	OP_RS(x)	(((x) & 0x03E00000) >> 21)
+#define	OP_RA(x)	(((x) & 0x001F0000) >> 16)
+#define	OP_RB(x)	(((x) & 0x0000F800) >> 11)
+
+int
+fasttrap_tracepoint_install(proc_t *p, fasttrap_tracepoint_t *tp)
+{
+	fasttrap_instr_t instr = FASTTRAP_INSTR;
+
+	if (uwrite(p, &instr, 4, tp->ftt_pc) != 0)
+		return (-1);
+
+	return (0);
+}
+
+int
+fasttrap_tracepoint_remove(proc_t *p, fasttrap_tracepoint_t *tp)
+{
+	uint32_t instr;
+
+	/*
+	 * Distinguish between read or write failures and a changed
+	 * instruction.
+	 */
+	if (uread(p, &instr, 4, tp->ftt_pc) != 0)
+		return (0);
+	if (instr != FASTTRAP_INSTR)
+		return (0);
+	if (uwrite(p, &tp->ftt_instr, 4, tp->ftt_pc) != 0)
+		return (-1);
+
+	return (0);
+}
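Before the decode logic in fasttrap_tracepoint_init() below, a quick sanity check on the field extractors defined above: 0x4182000c encodes beq cr0,.+12 (primary opcode 16, BO 12, BI 2, and a displacement of 12). A stand-alone sketch, with the macros restated so it compiles on its own; the pc value is hypothetical:

#include <assert.h>
#include <stdint.h>

#define OP(x)		((x) >> 26)
#define OP_BO(x)	(((x) & 0x03E00000) >> 21)
#define OP_BI(x)	(((x) & 0x001F0000) >> 16)

int
main(void)
{
	uint32_t instr = 0x4182000c;	/* beq cr0,.+12 */
	uint32_t pc = 0x10000100;	/* hypothetical tracepoint pc */
	uint32_t dest;

	assert(OP(instr) == 16);	/* bc-form conditional branch */
	assert(OP_BO(instr) == 12);	/* branch if CR bit is set */
	assert(OP_BI(instr) == 2);	/* CR bit 2: cr0 EQ */

	/* The same target computation as the case 16 decode below. */
	dest = instr & 0x0000FFFC;
	if (instr & 0x00008000)		/* sign-extend the displacement */
		dest |= 0xFFFF0000;
	if (!(instr & 0x02))		/* AA clear: pc-relative */
		dest += pc;
	assert(dest == pc + 12);
	return (0);
}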
+
+int
+fasttrap_tracepoint_init(proc_t *p, fasttrap_tracepoint_t *tp, uintptr_t pc,
+    fasttrap_probe_type_t type)
+{
+	uint32_t instr;
+
+	/*
+	 * Read the instruction at the given address out of the process's
+	 * address space. We don't have to worry about a debugger
+	 * changing this instruction before we overwrite it with our trap
+	 * instruction since P_PR_LOCK is set.
+	 */
+	if (uread(p, &instr, 4, pc) != 0)
+		return (-1);
+
+	/*
+	 * Decode the instruction to fill in the probe flags. We can have
+	 * the process execute most instructions on its own using a pc/npc
+	 * trick, but pc-relative control transfers present a problem since
+	 * we're relocating the instruction. We emulate these instructions
+	 * in the kernel. We assume a default type and overwrite that as
+	 * needed.
+	 *
+	 * pc-relative instructions must be emulated for correctness;
+	 * other instructions (which represent a large set of commonly traced
+	 * instructions) are emulated or otherwise optimized for performance.
+	 */
+	tp->ftt_type = FASTTRAP_T_COMMON;
+	tp->ftt_instr = instr;
+
+	switch (OP(instr)) {
+	/* The following are invalid for trapping (invalid opcodes, tw/twi). */
+	case 0:
+	case 1:
+	case 2:
+	case 4:
+	case 5:
+	case 6:
+	case 30:
+	case 39:
+	case 58:
+	case 62:
+	case 3:		/* twi */
+		return (-1);
+	case 31:	/* tw */
+		if (OPX(instr) == 4)
+			return (-1);
+		else if (OPX(instr) == 444 && OP_RS(instr) == OP_RA(instr) &&
+		    OP_RS(instr) == OP_RB(instr))
+			tp->ftt_type = FASTTRAP_T_NOP;
+		break;
+	case 16:
+		tp->ftt_type = FASTTRAP_T_BC;
+		tp->ftt_dest = instr & 0x0000FFFC; /* Extract target address */
+		if (instr & 0x00008000)
+			tp->ftt_dest |= 0xFFFF0000;
+		/* Use as offset if not absolute address. */
+		if (!(instr & 0x02))
+			tp->ftt_dest += pc;
+		tp->ftt_bo = OP_BO(instr);
+		tp->ftt_bi = OP_BI(instr);
+		break;
+	case 18:
+		tp->ftt_type = FASTTRAP_T_B;
+		tp->ftt_dest = instr & 0x03FFFFFC; /* Extract target address */
+		if (instr & 0x02000000)
+			tp->ftt_dest |= 0xFC000000;
+		/* Use as offset if not absolute address. */
+		if (!(instr & 0x02))
+			tp->ftt_dest += pc;
+		break;
+	case 19:
+		switch (OPX(instr)) {
+		case 528:	/* bcctr */
+			tp->ftt_type = FASTTRAP_T_BCTR;
+			tp->ftt_bo = OP_BO(instr);
+			tp->ftt_bi = OP_BI(instr);
+			break;
+		case 16:	/* bclr */
+			tp->ftt_type = FASTTRAP_T_BLR;
+			tp->ftt_bo = OP_BO(instr);
+			tp->ftt_bi = OP_BI(instr);
+			break;
+		}
+		break;
+	case 24:
+		if (OP_RS(instr) == OP_RA(instr) &&
+		    (instr & 0x0000FFFF) == 0)
+			tp->ftt_type = FASTTRAP_T_NOP;
+		break;
+	}
+
+	/*
+	 * We don't know how this tracepoint is going to be used, but in case
+	 * it's used as part of a function return probe, we need to indicate
+	 * whether it's always a return site or only potentially a return
+	 * site. If it's part of a return probe, it's always going to be a
+	 * return from that function if it's a restore instruction or if
+	 * the previous instruction was a return. If we could reliably
+	 * distinguish jump tables from return sites, this wouldn't be
+	 * necessary.
+	 */
+#if 0
+	if (tp->ftt_type != FASTTRAP_T_RESTORE &&
+	    (uread(p, &instr, 4, pc - sizeof (instr)) != 0 ||
+	    !(OP(instr) == 2 && OP3(instr) == OP3_RETURN)))
+		tp->ftt_flags |= FASTTRAP_F_RETMAYBE;
+#endif
+
+	return (0);
+}
+
+static uint64_t
+fasttrap_anarg(struct reg *rp, int argno)
+{
+	uint64_t value;
+	proc_t *p = curproc;
+
+	/* The first 8 arguments are in registers. */
+	if (argno < 8)
+		return (rp->fixreg[argno + 3]);
+
+	/* Arguments on stack start after SP+LR (2 register slots). */
+	if (SV_PROC_FLAG(p, SV_ILP32)) {
+		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
+		value = dtrace_fuword32((void *)(rp->fixreg[1] + 8 +
+		    ((argno - 8) * sizeof (uint32_t))));
+		DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT | CPU_DTRACE_BADADDR);
+	} else {
+		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
+		value = dtrace_fuword64((void *)(rp->fixreg[1] + 48 +
+		    ((argno - 8) * sizeof (uint64_t))));
+		DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT | CPU_DTRACE_BADADDR);
+	}
+	return (value);
+}
+
+uint64_t
+fasttrap_pid_getarg(void *arg, dtrace_id_t id, void *parg, int argno,
+    int aframes)
+{
+	struct reg r;
+
+	fill_regs(curthread, &r);
+
+	return (fasttrap_anarg(&r, argno));
+}
+
+uint64_t
+fasttrap_usdt_getarg(void *arg, dtrace_id_t id, void *parg, int argno,
+    int aframes)
+{
+	struct reg r;
+
+	fill_regs(curthread, &r);
+
+	return (fasttrap_anarg(&r, argno));
+}
+
+static void
+fasttrap_usdt_args(fasttrap_probe_t *probe, struct reg *rp, int argc,
+    uintptr_t *argv)
+{
+	int i, x, cap = MIN(argc, probe->ftp_nargs);
+
+	for (i = 0; i < cap; i++) {
+		x = probe->ftp_argmap[i];
+
+		/* Same layout as fasttrap_anarg(): r3..r10, then the stack. */
+		if (x < 8)
+			argv[i] = rp->fixreg[x + 3];
+		else
+			if (SV_PROC_FLAG(curproc, SV_ILP32))
+				argv[i] = fuword32((void *)(rp->fixreg[1] +
+				    8 + ((x - 8) * sizeof (uint32_t))));
+			else
+				argv[i] = fuword64((void *)(rp->fixreg[1] +
+				    48 + ((x - 8) * sizeof (uint64_t))));
+	}
+
+	for (; i < argc; i++) {
+		argv[i] = 0;
+	}
+}
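The two argument fetchers above share one layout assumption: arguments 0 through 7 ride in r3-r10, and later arguments live in the caller's parameter save area, which the code takes to start at SP+48 under the 64-bit ABI and at SP+8 under the 32-bit one (just past the back-chain and saved-LR slots, per the "SP+LR" comment). A hedged sketch of the offset arithmetic only, with a hypothetical stack pointer and no kernel interfaces:

#include <assert.h>
#include <stdint.h>

/* Address of 0-based argument 'argno' once it has spilled to the stack. */
static uintptr_t
arg_addr_lp64(uintptr_t sp, int argno)
{
	assert(argno >= 8);		/* arguments 0..7 are in r3..r10 */
	return (sp + 48 + (argno - 8) * sizeof (uint64_t));
}

static uintptr_t
arg_addr_ilp32(uintptr_t sp, int argno)
{
	assert(argno >= 8);
	return (sp + 8 + (argno - 8) * sizeof (uint32_t));
}

int
main(void)
{
	uintptr_t sp = 0x7fff0000;	/* hypothetical user stack pointer */

	assert(arg_addr_lp64(sp, 8) == sp + 48);	/* first stack arg */
	assert(arg_addr_ilp32(sp, 9) == sp + 8 + 4);	/* second stack arg */
	return (0);
}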
+
+static void
+fasttrap_return_common(struct reg *rp, uintptr_t pc, pid_t pid,
+    uintptr_t new_pc)
+{
+	struct rm_priotracker tracker;
+	fasttrap_tracepoint_t *tp;
+	fasttrap_bucket_t *bucket;
+	fasttrap_id_t *id;
+
+	rm_rlock(&fasttrap_tp_lock, &tracker);
+	bucket = &fasttrap_tpoints.fth_table[FASTTRAP_TPOINTS_INDEX(pid, pc)];
+
+	for (tp = bucket->ftb_data; tp != NULL; tp = tp->ftt_next) {
+		if (pid == tp->ftt_pid && pc == tp->ftt_pc &&
+		    tp->ftt_proc->ftpc_acount != 0)
+			break;
+	}
+
+	/*
+	 * Don't sweat it if we can't find the tracepoint again; unlike
+	 * when we're in fasttrap_pid_probe(), finding the tracepoint here
+	 * is not essential to the correct execution of the process.
+	 */
+	if (tp == NULL) {
+		rm_runlock(&fasttrap_tp_lock, &tracker);
+		return;
+	}
+
+	for (id = tp->ftt_retids; id != NULL; id = id->fti_next) {
+		/*
+		 * Any branch could act as a return site, so we trace them
+		 * all and check here whether the new program counter landed
+		 * outside the function; only then do we fire the return
+		 * probe. Skip function-local branches.
+		 */
+		if ((new_pc - id->fti_probe->ftp_faddr) <
+		    id->fti_probe->ftp_fsize)
+			continue;
+
+		dtrace_probe(id->fti_probe->ftp_id,
+		    pc - id->fti_probe->ftp_faddr,
+		    rp->fixreg[3], rp->fixreg[4], 0, 0);
+	}
+	rm_runlock(&fasttrap_tp_lock, &tracker);
+}
+
+static int
+fasttrap_branch_taken(int bo, int bi, struct reg *regs)
+{
+	int crzero = 0;
+
+	/* Branch always? */
+	if ((bo & 0x14) == 0x14)
+		return (1);
+
+	/* Handle decrementing ctr */
+	if (!(bo & 0x04)) {
+		--regs->ctr;
+		crzero = (regs->ctr == 0);
+		if (bo & 0x10) {
+			/* Mask to the single BO selector bit. */
+			return (!(crzero ^ ((bo >> 1) & 1)));
+		}
+	}
+
+	return (crzero | ((((regs->cr >> (31 - bi)) & 1) ^
+	    ((bo >> 3) & 1)) ^ 1));
+}
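fasttrap_branch_taken() condenses the PowerPC BO encoding: bit 0x10 means the CR condition is ignored, bit 0x04 means the CTR is not decremented (so (bo & 0x14) == 0x14 is "branch always"), bit 0x02 selects bdz over bdnz, and bit 0x08 selects branch-on-true over branch-on-false. A small user-space harness with a mock register file; the function body is restated from the masked form above, so this is a sketch rather than kernel code:

#include <assert.h>

struct reg { long ctr; long cr; };	/* mock of the two fields used */

static int
fasttrap_branch_taken(int bo, int bi, struct reg *regs)
{
	int crzero = 0;

	if ((bo & 0x14) == 0x14)	/* branch always */
		return (1);
	if (!(bo & 0x04)) {		/* decrementing form */
		--regs->ctr;
		crzero = (regs->ctr == 0);
		if (bo & 0x10)		/* CR condition ignored */
			return (!(crzero ^ ((bo >> 1) & 1)));
	}
	return (crzero | ((((regs->cr >> (31 - bi)) & 1) ^
	    ((bo >> 3) & 1)) ^ 1));
}

int
main(void)
{
	struct reg r = { 0, 1L << 29 };		/* cr0 EQ set */

	assert(fasttrap_branch_taken(0x14, 0, &r));	/* "branch always" */
	assert(fasttrap_branch_taken(12, 2, &r));	/* beq: EQ set */
	r.cr = 0;
	assert(!fasttrap_branch_taken(12, 2, &r));	/* beq: EQ clear */

	r.ctr = 2;
	assert(fasttrap_branch_taken(16, 0, &r));	/* bdnz: ctr 2 -> 1 */
	assert(!fasttrap_branch_taken(16, 0, &r));	/* bdnz: ctr 1 -> 0 */
	return (0);
}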
+
+int
+fasttrap_pid_probe(struct trapframe *frame)
+{
+	struct reg reg, *rp;
+	struct rm_priotracker tracker;
+	proc_t *p = curproc;
+	uintptr_t pc;
+	uintptr_t new_pc = 0;
+	fasttrap_bucket_t *bucket;
+	fasttrap_tracepoint_t *tp, tp_local;
+	pid_t pid;
+	dtrace_icookie_t cookie;
+	uint_t is_enabled = 0;
+
+	fill_regs(curthread, &reg);
+	rp = &reg;
+	pc = rp->pc;
+
+	/*
+	 * It's possible that a user (in a veritable orgy of bad planning)
+	 * could redirect this thread's flow of control before it reached the
+	 * return probe fasttrap. In this case we need to kill the process
+	 * since it's in an unrecoverable state.
+	 */
+	if (curthread->t_dtrace_step) {
+		ASSERT(curthread->t_dtrace_on);
+		fasttrap_sigtrap(p, curthread, pc);
+		return (0);
+	}
+
+	/*
+	 * Clear all user tracing flags.
+	 */
+	curthread->t_dtrace_ft = 0;
+	curthread->t_dtrace_pc = 0;
+	curthread->t_dtrace_npc = 0;
+	curthread->t_dtrace_scrpc = 0;
+	curthread->t_dtrace_astpc = 0;
+
+	rm_rlock(&fasttrap_tp_lock, &tracker);
+	pid = p->p_pid;
+	bucket = &fasttrap_tpoints.fth_table[FASTTRAP_TPOINTS_INDEX(pid, pc)];
+
+	/*
+	 * Lookup the tracepoint that the process just hit.
+	 */
+	for (tp = bucket->ftb_data; tp != NULL; tp = tp->ftt_next) {
+		if (pid == tp->ftt_pid && pc == tp->ftt_pc &&
+		    tp->ftt_proc->ftpc_acount != 0)
+			break;
+	}
+
+	/*
+	 * If we couldn't find a matching tracepoint, either a tracepoint has
+	 * been inserted without using the pid<pid> ioctl interface (see
+	 * fasttrap_ioctl), or somehow we have mislaid this tracepoint.
+	 */
+	if (tp == NULL) {
+		rm_runlock(&fasttrap_tp_lock, &tracker);
+		return (-1);
+	}
+
+	if (tp->ftt_ids != NULL) {
+		fasttrap_id_t *id;
+
+		for (id = tp->ftt_ids; id != NULL; id = id->fti_next) {
+			fasttrap_probe_t *probe = id->fti_probe;
+
+			if (id->fti_ptype == DTFTP_ENTRY) {
+				/*
+				 * We note that this was an entry
+				 * probe to help ustack() find the
+				 * first caller.
+				 */
+				cookie = dtrace_interrupt_disable();
+				DTRACE_CPUFLAG_SET(CPU_DTRACE_ENTRY);
+				dtrace_probe(probe->ftp_id, rp->fixreg[3],
+				    rp->fixreg[4], rp->fixreg[5],
+				    rp->fixreg[6], rp->fixreg[7]);
+				DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_ENTRY);
+				dtrace_interrupt_enable(cookie);
+			} else if (id->fti_ptype == DTFTP_IS_ENABLED) {
+				/*
+				 * Note that in this case, we don't
+				 * call dtrace_probe() since it's only
+				 * an artificial probe meant to change
+				 * the flow of control so that it
+				 * encounters the true probe.
+				 */
+				is_enabled = 1;
+			} else if (probe->ftp_argmap == NULL) {
+				dtrace_probe(probe->ftp_id, rp->fixreg[3],
+				    rp->fixreg[4], rp->fixreg[5],
+				    rp->fixreg[6], rp->fixreg[7]);
+			} else {
+				uintptr_t t[5];
+
+				fasttrap_usdt_args(probe, rp,
+				    sizeof (t) / sizeof (t[0]), t);
+
+				dtrace_probe(probe->ftp_id, t[0], t[1],
+				    t[2], t[3], t[4]);
+			}
+		}
+	}
+
+	/*
+	 * We're about to do a bunch of work so we cache a local copy of
+	 * the tracepoint to emulate the instruction, and then find the
+	 * tracepoint again later if we need to light up any return probes.
+	 */
+	tp_local = *tp;
+	rm_runlock(&fasttrap_tp_lock, &tracker);
+	tp = &tp_local;
+
+	/*
+	 * If there's an is-enabled probe connected to this tracepoint it
+	 * means that there was a 'xor r3, r3, r3'
+	 * instruction that was placed there by DTrace when the binary was
+	 * linked. As this probe is, in fact, enabled, we need to stuff 1
+	 * into R3. Accordingly, we can bypass all the instruction
+	 * emulation logic since we know the inevitable result. It's possible
+	 * that a user could construct a scenario where the 'is-enabled'
+	 * probe was on some other instruction, but that would be a rather
+	 * exotic way to shoot oneself in the foot.
+	 */
+	if (is_enabled) {
+		rp->fixreg[3] = 1;
+		new_pc = rp->pc + 4;
+		goto done;
+	}
+
+	switch (tp->ftt_type) {
+	case FASTTRAP_T_NOP:
+		new_pc = rp->pc + 4;
+		break;
+	case FASTTRAP_T_BC:
+		if (!fasttrap_branch_taken(tp->ftt_bo, tp->ftt_bi, rp)) {
+			/* Not taken; execution falls to the next insn. */
+			new_pc = rp->pc + 4;
+			break;
+		}
+		/* FALLTHROUGH */
+	case FASTTRAP_T_B:
+		if (tp->ftt_instr & 0x01)
+			rp->lr = rp->pc + 4;
+		new_pc = tp->ftt_dest;
+		break;
+	case FASTTRAP_T_BLR:
+	case FASTTRAP_T_BCTR:
+		if (!fasttrap_branch_taken(tp->ftt_bo, tp->ftt_bi, rp)) {
+			new_pc = rp->pc + 4;
+			break;
+		}
+		if (tp->ftt_type == FASTTRAP_T_BCTR)
+			new_pc = rp->ctr;
+		else
+			new_pc = rp->lr;
+		if (tp->ftt_instr & 0x01)
+			rp->lr = rp->pc + 4;
+		break;
+	case FASTTRAP_T_COMMON:
+		break;
+	}
+
+done:
+	/*
+	 * If there were no return probes when we first found the tracepoint,
+	 * we should feel no obligation to honor any return probes that were
+	 * subsequently enabled -- they'll just have to wait until the next
+	 * time around.
+	 */
+	if (tp->ftt_retids != NULL) {
+		/*
+		 * We need to wait until the results of the instruction are
+		 * apparent before invoking any return probes. If this
+		 * instruction was emulated we can just call
+		 * fasttrap_return_common(); if it needs to be executed, we
+		 * need to wait until the user thread returns to the kernel.
+		 */
+		if (tp->ftt_type != FASTTRAP_T_COMMON) {
+			fasttrap_return_common(rp, pc, pid, new_pc);
+		} else {
+			ASSERT(curthread->t_dtrace_ret != 0);
+			ASSERT(curthread->t_dtrace_pc == pc);
+			ASSERT(curthread->t_dtrace_scrpc != 0);
+			ASSERT(new_pc == curthread->t_dtrace_astpc);
+		}
+	}
+
+	rp->pc = new_pc;
+	set_regs(curthread, rp);
+
+	return (0);
+}
+
+int
+fasttrap_return_probe(struct trapframe *tf)
+{
+	struct reg reg, *rp;
+	proc_t *p = curproc;
+	uintptr_t pc = curthread->t_dtrace_pc;
+	uintptr_t npc = curthread->t_dtrace_npc;
+
+	curthread->t_dtrace_pc = 0;
+	curthread->t_dtrace_npc = 0;
+	curthread->t_dtrace_scrpc = 0;
+	curthread->t_dtrace_astpc = 0;
+
+	fill_regs(curthread, &reg);
+	rp = &reg;
+
+	/*
+	 * We set rp->pc to the address of the traced instruction so
+	 * that it appears to dtrace_probe() that we're on the original
+	 * instruction.
+	 */
+	rp->pc = pc;
+
+	fasttrap_return_common(rp, pc, p->p_pid, npc);
+
+	return (0);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/powerpc/sys/fasttrap_isa.h b/sys/cddl/contrib/opensolaris/uts/powerpc/sys/fasttrap_isa.h
new file mode 100644
index 000000000000..98d5d2675662
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/powerpc/sys/fasttrap_isa.h
@@ -0,0 +1,76 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* Portions Copyright 2013 Justin Hibbits */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _FASTTRAP_ISA_H +#define _FASTTRAP_ISA_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define FASTTRAP_SUNWDTRACE_SIZE 64 +#define FASTTRAP_INSTR 0x0FFFDDDD + +typedef uint32_t fasttrap_instr_t; + +typedef struct fasttrap_machtp_t { + fasttrap_instr_t ftmt_instr; /* original instruction */ + uintptr_t ftmt_dest; /* branch target */ + uint8_t ftmt_type; /* emulation type */ + uint8_t ftmt_flags; /* emulation flags */ + uint8_t ftmt_bo; /* BO field */ + uint8_t ftmt_bi; /* BI field (CR bit) */ +} fasttrap_machtp_t; + +#define ftt_instr ftt_mtp.ftmt_instr +#define ftt_dest ftt_mtp.ftmt_dest +#define ftt_type ftt_mtp.ftmt_type +#define ftt_flags ftt_mtp.ftmt_flags +#define ftt_bo ftt_mtp.ftmt_bo +#define ftt_bi ftt_mtp.ftmt_bi + +#define FASTTRAP_T_COMMON 0x00 +#define FASTTRAP_T_B 0x01 +#define FASTTRAP_T_BC 0x02 +#define FASTTRAP_T_BLR 0x03 +#define FASTTRAP_T_BCTR 0x04 +#define FASTTRAP_T_NOP 0x05 + +#define FASTTRAP_AFRAMES 3 +#define FASTTRAP_RETURN_AFRAMES 4 +#define FASTTRAP_ENTRY_AFRAMES 3 +#define FASTTRAP_OFFSET_AFRAMES 3 + +#ifdef __cplusplus +} +#endif + +#endif /* _FASTTRAP_ISA_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/riscv/dtrace/fasttrap_isa.c b/sys/cddl/contrib/opensolaris/uts/riscv/dtrace/fasttrap_isa.c new file mode 100644 index 000000000000..17ef0f3a94a7 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/riscv/dtrace/fasttrap_isa.c @@ -0,0 +1,29 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * XXX: Placeholder for RISC-V fasttrap code + */ diff --git a/sys/cddl/contrib/opensolaris/uts/riscv/sys/fasttrap_isa.h b/sys/cddl/contrib/opensolaris/uts/riscv/sys/fasttrap_isa.h new file mode 100644 index 000000000000..9d164e4c5bd8 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/riscv/sys/fasttrap_isa.h @@ -0,0 +1,46 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef	_FASTTRAP_ISA_H
+#define	_FASTTRAP_ISA_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/types.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+typedef uint32_t	fasttrap_instr_t;
+
+/* XXX: Place for RISC-V fasttrap headers */
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _FASTTRAP_ISA_H */